| 1 | /* $NetBSD: rf_parityloggingdags.c,v 1.21 2014/03/23 09:30:59 christos Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 1995 Carnegie-Mellon University. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * Author: William V. Courtright II |
| 7 | * |
| 8 | * Permission to use, copy, modify and distribute this software and |
| 9 | * its documentation is hereby granted, provided that both the copyright |
| 10 | * notice and this permission notice appear in all copies of the |
| 11 | * software, derivative works or modified versions, and any portions |
| 12 | * thereof, and that both notices appear in supporting documentation. |
| 13 | * |
| 14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 17 | * |
| 18 | * Carnegie Mellon requests users of this software to return to |
| 19 | * |
| 20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 21 | * School of Computer Science |
| 22 | * Carnegie Mellon University |
| 23 | * Pittsburgh PA 15213-3890 |
| 24 | * |
| 25 | * any improvements or extensions that they make and grant Carnegie the |
| 26 | * rights to redistribute these changes. |
| 27 | */ |
| 28 | |
| 29 | /* |
| 30 | DAGs specific to parity logging are created here |
| 31 | */ |
| 32 | |
| 33 | #include <sys/cdefs.h> |
| 34 | __KERNEL_RCSID(0, "$NetBSD: rf_parityloggingdags.c,v 1.21 2014/03/23 09:30:59 christos Exp $" ); |
| 35 | |
| 36 | #ifdef _KERNEL_OPT |
| 37 | #include "opt_raid_diagnostic.h" |
| 38 | #endif |
| 39 | |
| 40 | #include "rf_archs.h" |
| 41 | |
| 42 | #if RF_INCLUDE_PARITYLOGGING > 0 |
| 43 | |
| 44 | #include <dev/raidframe/raidframevar.h> |
| 45 | |
| 46 | #include "rf_raid.h" |
| 47 | #include "rf_dag.h" |
| 48 | #include "rf_dagutils.h" |
| 49 | #include "rf_dagfuncs.h" |
| 50 | #include "rf_debugMem.h" |
| 51 | #include "rf_paritylog.h" |
| 52 | #include "rf_general.h" |
| 53 | |
| 54 | #include "rf_parityloggingdags.h" |
| 55 | |
| 56 | /****************************************************************************** |
| 57 | * |
| 58 | * creates a DAG to perform a large-write operation: |
| 59 | * |
| 60 | * / Rod \ / Wnd \ |
| 61 | * H -- NIL- Rod - NIL - Wnd ------ NIL - T |
| 62 | * \ Rod / \ Xor - Lpo / |
| 63 | * |
| 64 | * The writes are not done until the reads complete because if they were done in |
| 65 | * parallel, a failure on one of the reads could leave the parity in an inconsistent |
| 66 | * state, so that the retry with a new DAG would produce erroneous parity. |
| 67 | * |
| 68 | * Note: this DAG has the nasty property that none of the buffers allocated for reading |
| 69 | * old data can be freed until the XOR node fires. Need to fix this. |
| 70 | * |
| 71 | * The last two arguments are the number of faults tolerated, and function for the |
| 72 | * redundancy calculation. The undo for the redundancy calc is assumed to be null |
| 73 | * |
| 74 | *****************************************************************************/ |
| 75 | |
| 76 | void |
| 77 | rf_CommonCreateParityLoggingLargeWriteDAG( |
| 78 | RF_Raid_t * raidPtr, |
| 79 | RF_AccessStripeMap_t * asmap, |
| 80 | RF_DagHeader_t * dag_h, |
| 81 | void *bp, |
| 82 | RF_RaidAccessFlags_t flags, |
| 83 | RF_AllocListElem_t * allocList, |
| 84 | int nfaults, |
| 85 | int (*redFunc) (RF_DagNode_t *)) |
| 86 | { |
| 87 | RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode, |
| 88 | *lpoNode, *blockNode, *unblockNode, *termNode; |
| 89 | int nWndNodes, nRodNodes, i; |
| 90 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
| 91 | RF_AccessStripeMapHeader_t *new_asm_h[2]; |
| 92 | int nodeNum, asmNum; |
| 93 | RF_ReconUnitNum_t which_ru; |
| 94 | char *sosBuffer, *eosBuffer; |
| 95 | RF_PhysDiskAddr_t *pda; |
| 96 | RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru); |
| 97 | |
| 98 | if (rf_dagDebug) |
| 99 | printf("[Creating parity-logging large-write DAG]\n" ); |
| 100 | RF_ASSERT(nfaults == 1);/* this arch only single fault tolerant */ |
| 101 | dag_h->creator = "ParityLoggingLargeWriteDAG" ; |
| 102 | |
| 103 | /* alloc the Wnd nodes, the xor node, and the Lpo node */ |
| 104 | nWndNodes = asmap->numStripeUnitsAccessed; |
| 105 | RF_MallocAndAdd(nodes, (nWndNodes + 6) * sizeof(RF_DagNode_t), |
| 106 | (RF_DagNode_t *), allocList); |
| 107 | i = 0; |
| 108 | wndNodes = &nodes[i]; |
| 109 | i += nWndNodes; |
| 110 | xorNode = &nodes[i]; |
| 111 | i += 1; |
| 112 | lpoNode = &nodes[i]; |
| 113 | i += 1; |
| 114 | blockNode = &nodes[i]; |
| 115 | i += 1; |
| 116 | syncNode = &nodes[i]; |
| 117 | i += 1; |
| 118 | unblockNode = &nodes[i]; |
| 119 | i += 1; |
| 120 | termNode = &nodes[i]; |
| 121 | i += 1; |
| 122 | |
| 123 | dag_h->numCommitNodes = nWndNodes + 1; |
| 124 | dag_h->numCommits = 0; |
| 125 | dag_h->numSuccedents = 1; |
| 126 | |
| 127 | rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList); |
| 128 | if (nRodNodes > 0) |
| 129 | RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t), |
| 130 | (RF_DagNode_t *), allocList); |
| 131 | |
| 132 | /* begin node initialization */ |
| 133 | rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil" , allocList); |
| 134 | rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil" , allocList); |
| 135 | rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil" , allocList); |
| 136 | rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm" , allocList); |
| 137 | |
| 138 | /* initialize the Rod nodes */ |
| 139 | for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) { |
| 140 | if (new_asm_h[asmNum]) { |
| 141 | pda = new_asm_h[asmNum]->stripeMap->physInfo; |
| 142 | while (pda) { |
| 143 | rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod" , allocList); |
| 144 | rodNodes[nodeNum].params[0].p = pda; |
| 145 | rodNodes[nodeNum].params[1].p = pda->bufPtr; |
| 146 | rodNodes[nodeNum].params[2].v = parityStripeID; |
| 147 | rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
| 148 | nodeNum++; |
| 149 | pda = pda->next; |
| 150 | } |
| 151 | } |
| 152 | } |
| 153 | RF_ASSERT(nodeNum == nRodNodes); |
| 154 | |
| 155 | /* initialize the wnd nodes */ |
| 156 | pda = asmap->physInfo; |
| 157 | for (i = 0; i < nWndNodes; i++) { |
| 158 | rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd" , allocList); |
| 159 | RF_ASSERT(pda != NULL); |
| 160 | wndNodes[i].params[0].p = pda; |
| 161 | wndNodes[i].params[1].p = pda->bufPtr; |
| 162 | wndNodes[i].params[2].v = parityStripeID; |
| 163 | wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
| 164 | pda = pda->next; |
| 165 | } |
| 166 | |
| 167 | /* initialize the redundancy node */ |
| 168 | rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr " , allocList); |
| 169 | xorNode->flags |= RF_DAGNODE_FLAG_YIELD; |
| 170 | for (i = 0; i < nWndNodes; i++) { |
| 171 | xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */ |
| 172 | xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */ |
| 173 | } |
| 174 | for (i = 0; i < nRodNodes; i++) { |
| 175 | xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */ |
| 176 | xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */ |
| 177 | } |
| 178 | xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; /* xor node needs to get |
| 179 | * at RAID information */ |
| 180 | |
| 181 | /* look for an Rod node that reads a complete SU. If none, alloc a |
| 182 | * buffer to receive the parity info. Note that we can't use a new |
| 183 | * data buffer because it will not have gotten written when the xor |
| 184 | * occurs. */ |
| 185 | for (i = 0; i < nRodNodes; i++) |
| 186 | if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit) |
| 187 | break; |
| 188 | if (i == nRodNodes) { |
| 189 | RF_MallocAndAdd(xorNode->results[0], |
| 190 | rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList); |
| 191 | } else { |
| 192 | xorNode->results[0] = rodNodes[i].params[1].p; |
| 193 | } |
| 194 | |
| 195 | /* initialize the Lpo node */ |
| 196 | rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo" , allocList); |
| 197 | |
| 198 | lpoNode->params[0].p = asmap->parityInfo; |
| 199 | lpoNode->params[1].p = xorNode->results[0]; |
| 200 | RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must |
| 201 | * describe entire |
| 202 | * parity unit */ |
| 203 | |
| 204 | /* connect nodes to form graph */ |
| 205 | |
| 206 | /* connect dag header to block node */ |
| 207 | RF_ASSERT(dag_h->numSuccedents == 1); |
| 208 | RF_ASSERT(blockNode->numAntecedents == 0); |
| 209 | dag_h->succedents[0] = blockNode; |
| 210 | |
| 211 | /* connect the block node to the Rod nodes */ |
| 212 | RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1); |
| 213 | for (i = 0; i < nRodNodes; i++) { |
| 214 | RF_ASSERT(rodNodes[i].numAntecedents == 1); |
| 215 | blockNode->succedents[i] = &rodNodes[i]; |
| 216 | rodNodes[i].antecedents[0] = blockNode; |
| 217 | rodNodes[i].antType[0] = rf_control; |
| 218 | } |
| 219 | |
| 220 | /* connect the block node to the sync node */ |
| 221 | /* necessary if nRodNodes == 0 */ |
| 222 | RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1); |
| 223 | blockNode->succedents[nRodNodes] = syncNode; |
| 224 | syncNode->antecedents[0] = blockNode; |
| 225 | syncNode->antType[0] = rf_control; |
| 226 | |
| 227 | /* connect the Rod nodes to the syncNode */ |
| 228 | for (i = 0; i < nRodNodes; i++) { |
| 229 | rodNodes[i].succedents[0] = syncNode; |
| 230 | syncNode->antecedents[1 + i] = &rodNodes[i]; |
| 231 | syncNode->antType[1 + i] = rf_control; |
| 232 | } |
| 233 | |
| 234 | /* connect the sync node to the xor node */ |
| 235 | RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1); |
| 236 | RF_ASSERT(xorNode->numAntecedents == 1); |
| 237 | syncNode->succedents[0] = xorNode; |
| 238 | xorNode->antecedents[0] = syncNode; |
| 239 | xorNode->antType[0] = rf_trueData; /* carry forward from sync */ |
| 240 | |
| 241 | /* connect the sync node to the Wnd nodes */ |
| 242 | for (i = 0; i < nWndNodes; i++) { |
| 243 | RF_ASSERT(wndNodes->numAntecedents == 1); |
| 244 | syncNode->succedents[1 + i] = &wndNodes[i]; |
| 245 | wndNodes[i].antecedents[0] = syncNode; |
| 246 | wndNodes[i].antType[0] = rf_control; |
| 247 | } |
| 248 | |
| 249 | /* connect the xor node to the Lpo node */ |
| 250 | RF_ASSERT(xorNode->numSuccedents == 1); |
| 251 | RF_ASSERT(lpoNode->numAntecedents == 1); |
| 252 | xorNode->succedents[0] = lpoNode; |
| 253 | lpoNode->antecedents[0] = xorNode; |
| 254 | lpoNode->antType[0] = rf_trueData; |
| 255 | |
| 256 | /* connect the Wnd nodes to the unblock node */ |
| 257 | RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1); |
| 258 | for (i = 0; i < nWndNodes; i++) { |
| 259 | RF_ASSERT(wndNodes->numSuccedents == 1); |
| 260 | wndNodes[i].succedents[0] = unblockNode; |
| 261 | unblockNode->antecedents[i] = &wndNodes[i]; |
| 262 | unblockNode->antType[i] = rf_control; |
| 263 | } |
| 264 | |
| 265 | /* connect the Lpo node to the unblock node */ |
| 266 | RF_ASSERT(lpoNode->numSuccedents == 1); |
| 267 | lpoNode->succedents[0] = unblockNode; |
| 268 | unblockNode->antecedents[nWndNodes] = lpoNode; |
| 269 | unblockNode->antType[nWndNodes] = rf_control; |
| 270 | |
| 271 | /* connect unblock node to terminator */ |
| 272 | RF_ASSERT(unblockNode->numSuccedents == 1); |
| 273 | RF_ASSERT(termNode->numAntecedents == 1); |
| 274 | RF_ASSERT(termNode->numSuccedents == 0); |
| 275 | unblockNode->succedents[0] = termNode; |
| 276 | termNode->antecedents[0] = unblockNode; |
| 277 | termNode->antType[0] = rf_control; |
| 278 | } |
| 279 | |
| 280 | |
| 281 | |
| 282 | |
| 283 | /****************************************************************************** |
| 284 | * |
| 285 | * creates a DAG to perform a small-write operation (either raid 5 or pq), which is as follows: |
| 286 | * |
| 287 | * Header |
| 288 | * | |
| 289 | * Block |
| 290 | * / | ... \ \ |
| 291 | * / | \ \ |
| 292 | * Rod Rod Rod Rop |
| 293 | * | \ /| \ / | \/ | |
| 294 | * | | | /\ | |
| 295 | * Wnd Wnd Wnd X |
| 296 | * | \ / | |
| 297 | * | \ / | |
| 298 | * \ \ / Lpo |
| 299 | * \ \ / / |
| 300 | * +-> Unblock <-+ |
| 301 | * | |
| 302 | * T |
| 303 | * |
| 304 | * |
| 305 | * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity. |
| 306 | * When the access spans a stripe unit boundary and is less than one SU in size, there will |
| 307 | * be two Rop -- X -- Wnp branches. I call this the "double-XOR" case. |
| 308 | * The second output from each Rod node goes to the X node. In the double-XOR |
| 309 | * case, there are exactly 2 Rod nodes, and each sends one output to one X node. |
| 310 | * There is one Rod -- Wnd -- T branch for each stripe unit being updated. |
| 311 | * |
| 312 | * The block and unblock nodes are unused. See comment above CreateFaultFreeReadDAG. |
| 313 | * |
| 314 | * Note: this DAG ignores all the optimizations related to making the RMWs atomic. |
| 315 | * it also has the nasty property that none of the buffers allocated for reading |
| 316 | * old data & parity can be freed until the XOR node fires. Need to fix this. |
| 317 | * |
| 318 | * A null qfuncs indicates single fault tolerant |
| 319 | *****************************************************************************/ |
| 320 | |
| 321 | void |
| 322 | rf_CommonCreateParityLoggingSmallWriteDAG( |
| 323 | RF_Raid_t * raidPtr, |
| 324 | RF_AccessStripeMap_t * asmap, |
| 325 | RF_DagHeader_t * dag_h, |
| 326 | void *bp, |
| 327 | RF_RaidAccessFlags_t flags, |
| 328 | RF_AllocListElem_t * allocList, |
| 329 | const RF_RedFuncs_t * pfuncs, |
| 330 | const RF_RedFuncs_t * qfuncs) |
| 331 | { |
| 332 | RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes; |
| 333 | RF_DagNode_t *readDataNodes, *readParityNodes; |
| 334 | RF_DagNode_t *writeDataNodes, *lpuNodes; |
| 335 | RF_DagNode_t *termNode; |
| 336 | RF_PhysDiskAddr_t *pda = asmap->physInfo; |
| 337 | int numDataNodes = asmap->numStripeUnitsAccessed; |
| 338 | int numParityNodes = (asmap->parityInfo->next) ? 2 : 1; |
| 339 | int i, j, nNodes, totalNumNodes; |
| 340 | RF_ReconUnitNum_t which_ru; |
| 341 | int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node); |
| 342 | const char *name; |
| 343 | RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru); |
| 344 | long nfaults __unused = qfuncs ? 2 : 1; |
| 345 | |
| 346 | if (rf_dagDebug) |
| 347 | printf("[Creating parity-logging small-write DAG]\n" ); |
| 348 | RF_ASSERT(numDataNodes > 0); |
| 349 | RF_ASSERT(nfaults == 1); |
| 350 | dag_h->creator = "ParityLoggingSmallWriteDAG" ; |
| 351 | |
| 352 | /* DAG creation occurs in three steps: 1. count the number of nodes in |
| 353 | * the DAG 2. create the nodes 3. initialize the nodes 4. connect the |
| 354 | * nodes */ |
| 355 | |
| 356 | /* Step 1. compute number of nodes in the graph */ |
| 357 | |
| 358 | /* number of nodes: a read and write for each data unit a redundancy |
| 359 | * computation node for each parity node a read and Lpu for each |
| 360 | * parity unit a block and unblock node (2) a terminator node if |
| 361 | * atomic RMW an unlock node for each data unit, redundancy unit */ |
| 362 | totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3; |
| 363 | |
| 364 | nNodes = numDataNodes + numParityNodes; |
| 365 | |
| 366 | dag_h->numCommitNodes = numDataNodes + numParityNodes; |
| 367 | dag_h->numCommits = 0; |
| 368 | dag_h->numSuccedents = 1; |
| 369 | |
| 370 | /* Step 2. create the nodes */ |
| 371 | RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t), |
| 372 | (RF_DagNode_t *), allocList); |
| 373 | i = 0; |
| 374 | blockNode = &nodes[i]; |
| 375 | i += 1; |
| 376 | unblockNode = &nodes[i]; |
| 377 | i += 1; |
| 378 | readDataNodes = &nodes[i]; |
| 379 | i += numDataNodes; |
| 380 | readParityNodes = &nodes[i]; |
| 381 | i += numParityNodes; |
| 382 | writeDataNodes = &nodes[i]; |
| 383 | i += numDataNodes; |
| 384 | lpuNodes = &nodes[i]; |
| 385 | i += numParityNodes; |
| 386 | xorNodes = &nodes[i]; |
| 387 | i += numParityNodes; |
| 388 | termNode = &nodes[i]; |
| 389 | i += 1; |
| 390 | |
| 391 | RF_ASSERT(i == totalNumNodes); |
| 392 | |
| 393 | /* Step 3. initialize the nodes */ |
| 394 | /* initialize block node (Nil) */ |
| 395 | rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil" , allocList); |
| 396 | |
| 397 | /* initialize unblock node (Nil) */ |
| 398 | rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil" , allocList); |
| 399 | |
| 400 | /* initialize terminatory node (Trm) */ |
| 401 | rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm" , allocList); |
| 402 | |
| 403 | /* initialize nodes which read old data (Rod) */ |
| 404 | for (i = 0; i < numDataNodes; i++) { |
| 405 | rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod" , allocList); |
| 406 | RF_ASSERT(pda != NULL); |
| 407 | readDataNodes[i].params[0].p = pda; /* physical disk addr |
| 408 | * desc */ |
| 409 | readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); /* buffer to hold old data */ |
| 410 | readDataNodes[i].params[2].v = parityStripeID; |
| 411 | readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
| 412 | pda = pda->next; |
| 413 | readDataNodes[i].propList[0] = NULL; |
| 414 | readDataNodes[i].propList[1] = NULL; |
| 415 | } |
| 416 | |
| 417 | /* initialize nodes which read old parity (Rop) */ |
| 418 | pda = asmap->parityInfo; |
| 419 | i = 0; |
| 420 | for (i = 0; i < numParityNodes; i++) { |
| 421 | RF_ASSERT(pda != NULL); |
| 422 | rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop" , allocList); |
| 423 | readParityNodes[i].params[0].p = pda; |
| 424 | readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); /* buffer to hold old parity */ |
| 425 | readParityNodes[i].params[2].v = parityStripeID; |
| 426 | readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
| 427 | readParityNodes[i].propList[0] = NULL; |
| 428 | pda = pda->next; |
| 429 | } |
| 430 | |
| 431 | /* initialize nodes which write new data (Wnd) */ |
| 432 | pda = asmap->physInfo; |
| 433 | for (i = 0; i < numDataNodes; i++) { |
| 434 | RF_ASSERT(pda != NULL); |
| 435 | rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd" , allocList); |
| 436 | writeDataNodes[i].params[0].p = pda; /* physical disk addr |
| 437 | * desc */ |
| 438 | writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new |
| 439 | * data to be written */ |
| 440 | writeDataNodes[i].params[2].v = parityStripeID; |
| 441 | writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
| 442 | |
| 443 | pda = pda->next; |
| 444 | } |
| 445 | |
| 446 | |
| 447 | /* initialize nodes which compute new parity */ |
| 448 | /* we use the simple XOR func in the double-XOR case, and when we're |
| 449 | * accessing only a portion of one stripe unit. the distinction |
| 450 | * between the two is that the regular XOR func assumes that the |
| 451 | * targbuf is a full SU in size, and examines the pda associated with |
| 452 | * the buffer to decide where within the buffer to XOR the data, |
| 453 | * whereas the simple XOR func just XORs the data into the start of |
| 454 | * the buffer. */ |
| 455 | if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) { |
| 456 | func = pfuncs->simple; |
| 457 | undoFunc = rf_NullNodeUndoFunc; |
| 458 | name = pfuncs->SimpleName; |
| 459 | } else { |
| 460 | func = pfuncs->regular; |
| 461 | undoFunc = rf_NullNodeUndoFunc; |
| 462 | name = pfuncs->RegularName; |
| 463 | } |
| 464 | /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop} |
| 465 | * nodes, and raidPtr */ |
| 466 | if (numParityNodes == 2) { /* double-xor case */ |
| 467 | for (i = 0; i < numParityNodes; i++) { |
| 468 | rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for |
| 469 | * xor */ |
| 470 | xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD; |
| 471 | xorNodes[i].params[0] = readDataNodes[i].params[0]; |
| 472 | xorNodes[i].params[1] = readDataNodes[i].params[1]; |
| 473 | xorNodes[i].params[2] = readParityNodes[i].params[0]; |
| 474 | xorNodes[i].params[3] = readParityNodes[i].params[1]; |
| 475 | xorNodes[i].params[4] = writeDataNodes[i].params[0]; |
| 476 | xorNodes[i].params[5] = writeDataNodes[i].params[1]; |
| 477 | xorNodes[i].params[6].p = raidPtr; |
| 478 | xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as |
| 479 | * target buf */ |
| 480 | } |
| 481 | } else { |
| 482 | /* there is only one xor node in this case */ |
| 483 | rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList); |
| 484 | xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD; |
| 485 | for (i = 0; i < numDataNodes + 1; i++) { |
| 486 | /* set up params related to Rod and Rop nodes */ |
| 487 | xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */ |
| 488 | xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */ |
| 489 | } |
| 490 | for (i = 0; i < numDataNodes; i++) { |
| 491 | /* set up params related to Wnd and Wnp nodes */ |
| 492 | xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */ |
| 493 | xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */ |
| 494 | } |
| 495 | xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get |
| 496 | * at RAID information */ |
| 497 | xorNodes[0].results[0] = readParityNodes[0].params[1].p; |
| 498 | } |
| 499 | |
| 500 | /* initialize the log node(s) */ |
| 501 | pda = asmap->parityInfo; |
| 502 | for (i = 0; i < numParityNodes; i++) { |
| 503 | RF_ASSERT(pda); |
| 504 | rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu" , allocList); |
| 505 | lpuNodes[i].params[0].p = pda; /* PhysDiskAddr of parity */ |
| 506 | lpuNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer to |
| 507 | * parity */ |
| 508 | pda = pda->next; |
| 509 | } |
| 510 | |
| 511 | |
| 512 | /* Step 4. connect the nodes */ |
| 513 | |
| 514 | /* connect header to block node */ |
| 515 | RF_ASSERT(dag_h->numSuccedents == 1); |
| 516 | RF_ASSERT(blockNode->numAntecedents == 0); |
| 517 | dag_h->succedents[0] = blockNode; |
| 518 | |
| 519 | /* connect block node to read old data nodes */ |
| 520 | RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes)); |
| 521 | for (i = 0; i < numDataNodes; i++) { |
| 522 | blockNode->succedents[i] = &readDataNodes[i]; |
| 523 | RF_ASSERT(readDataNodes[i].numAntecedents == 1); |
| 524 | readDataNodes[i].antecedents[0] = blockNode; |
| 525 | readDataNodes[i].antType[0] = rf_control; |
| 526 | } |
| 527 | |
| 528 | /* connect block node to read old parity nodes */ |
| 529 | for (i = 0; i < numParityNodes; i++) { |
| 530 | blockNode->succedents[numDataNodes + i] = &readParityNodes[i]; |
| 531 | RF_ASSERT(readParityNodes[i].numAntecedents == 1); |
| 532 | readParityNodes[i].antecedents[0] = blockNode; |
| 533 | readParityNodes[i].antType[0] = rf_control; |
| 534 | } |
| 535 | |
| 536 | /* connect read old data nodes to write new data nodes */ |
| 537 | for (i = 0; i < numDataNodes; i++) { |
| 538 | RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes); |
| 539 | for (j = 0; j < numDataNodes; j++) { |
| 540 | RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes); |
| 541 | readDataNodes[i].succedents[j] = &writeDataNodes[j]; |
| 542 | writeDataNodes[j].antecedents[i] = &readDataNodes[i]; |
| 543 | if (i == j) |
| 544 | writeDataNodes[j].antType[i] = rf_antiData; |
| 545 | else |
| 546 | writeDataNodes[j].antType[i] = rf_control; |
| 547 | } |
| 548 | } |
| 549 | |
| 550 | /* connect read old data nodes to xor nodes */ |
| 551 | for (i = 0; i < numDataNodes; i++) |
| 552 | for (j = 0; j < numParityNodes; j++) { |
| 553 | RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes); |
| 554 | readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j]; |
| 555 | xorNodes[j].antecedents[i] = &readDataNodes[i]; |
| 556 | xorNodes[j].antType[i] = rf_trueData; |
| 557 | } |
| 558 | |
| 559 | /* connect read old parity nodes to write new data nodes */ |
| 560 | for (i = 0; i < numParityNodes; i++) { |
| 561 | RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes); |
| 562 | for (j = 0; j < numDataNodes; j++) { |
| 563 | readParityNodes[i].succedents[j] = &writeDataNodes[j]; |
| 564 | writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i]; |
| 565 | writeDataNodes[j].antType[numDataNodes + i] = rf_control; |
| 566 | } |
| 567 | } |
| 568 | |
| 569 | /* connect read old parity nodes to xor nodes */ |
| 570 | for (i = 0; i < numParityNodes; i++) |
| 571 | for (j = 0; j < numParityNodes; j++) { |
| 572 | readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j]; |
| 573 | xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i]; |
| 574 | xorNodes[j].antType[numDataNodes + i] = rf_trueData; |
| 575 | } |
| 576 | |
| 577 | /* connect xor nodes to write new parity nodes */ |
| 578 | for (i = 0; i < numParityNodes; i++) { |
| 579 | RF_ASSERT(xorNodes[i].numSuccedents == 1); |
| 580 | RF_ASSERT(lpuNodes[i].numAntecedents == 1); |
| 581 | xorNodes[i].succedents[0] = &lpuNodes[i]; |
| 582 | lpuNodes[i].antecedents[0] = &xorNodes[i]; |
| 583 | lpuNodes[i].antType[0] = rf_trueData; |
| 584 | } |
| 585 | |
| 586 | for (i = 0; i < numDataNodes; i++) { |
| 587 | /* connect write new data nodes to unblock node */ |
| 588 | RF_ASSERT(writeDataNodes[i].numSuccedents == 1); |
| 589 | RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); |
| 590 | writeDataNodes[i].succedents[0] = unblockNode; |
| 591 | unblockNode->antecedents[i] = &writeDataNodes[i]; |
| 592 | unblockNode->antType[i] = rf_control; |
| 593 | } |
| 594 | |
| 595 | /* connect write new parity nodes to unblock node */ |
| 596 | for (i = 0; i < numParityNodes; i++) { |
| 597 | RF_ASSERT(lpuNodes[i].numSuccedents == 1); |
| 598 | lpuNodes[i].succedents[0] = unblockNode; |
| 599 | unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i]; |
| 600 | unblockNode->antType[numDataNodes + i] = rf_control; |
| 601 | } |
| 602 | |
| 603 | /* connect unblock node to terminator */ |
| 604 | RF_ASSERT(unblockNode->numSuccedents == 1); |
| 605 | RF_ASSERT(termNode->numAntecedents == 1); |
| 606 | RF_ASSERT(termNode->numSuccedents == 0); |
| 607 | unblockNode->succedents[0] = termNode; |
| 608 | termNode->antecedents[0] = unblockNode; |
| 609 | termNode->antType[0] = rf_control; |
| 610 | } |
| 611 | |
| 612 | |
| 613 | void |
| 614 | rf_CreateParityLoggingSmallWriteDAG( |
| 615 | RF_Raid_t * raidPtr, |
| 616 | RF_AccessStripeMap_t * asmap, |
| 617 | RF_DagHeader_t * dag_h, |
| 618 | void *bp, |
| 619 | RF_RaidAccessFlags_t flags, |
| 620 | RF_AllocListElem_t * allocList, |
| 621 | const RF_RedFuncs_t * pfuncs, |
| 622 | const RF_RedFuncs_t * qfuncs) |
| 623 | { |
| 624 | dag_h->creator = "ParityLoggingSmallWriteDAG" ; |
| 625 | rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL); |
| 626 | } |
| 627 | |
| 628 | |
| 629 | void |
| 630 | rf_CreateParityLoggingLargeWriteDAG( |
| 631 | RF_Raid_t * raidPtr, |
| 632 | RF_AccessStripeMap_t * asmap, |
| 633 | RF_DagHeader_t * dag_h, |
| 634 | void *bp, |
| 635 | RF_RaidAccessFlags_t flags, |
| 636 | RF_AllocListElem_t * allocList, |
| 637 | int nfaults, |
| 638 | int (*redFunc) (RF_DagNode_t *)) |
| 639 | { |
| 640 | dag_h->creator = "ParityLoggingSmallWriteDAG" ; |
| 641 | rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc); |
| 642 | } |
| 643 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
| 644 | |