| 1 | /* $NetBSD: rf_dagfuncs.c,v 1.30 2009/03/23 18:38:54 oster Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 1995 Carnegie-Mellon University. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * Author: Mark Holland, William V. Courtright II |
| 7 | * |
| 8 | * Permission to use, copy, modify and distribute this software and |
| 9 | * its documentation is hereby granted, provided that both the copyright |
| 10 | * notice and this permission notice appear in all copies of the |
| 11 | * software, derivative works or modified versions, and any portions |
| 12 | * thereof, and that both notices appear in supporting documentation. |
| 13 | * |
| 14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 17 | * |
| 18 | * Carnegie Mellon requests users of this software to return to |
| 19 | * |
| 20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 21 | * School of Computer Science |
| 22 | * Carnegie Mellon University |
| 23 | * Pittsburgh PA 15213-3890 |
| 24 | * |
| 25 | * any improvements or extensions that they make and grant Carnegie the |
| 26 | * rights to redistribute these changes. |
| 27 | */ |
| 28 | |
| 29 | /* |
| 30 | * dagfuncs.c -- DAG node execution routines |
| 31 | * |
| 32 | * Rules: |
| 33 | * 1. Every DAG execution function must eventually cause node->status to |
| 34 | * get set to "good" or "bad", and "FinishNode" to be called. In the |
| 35 | * case of nodes that complete immediately (xor, NullNodeFunc, etc), |
| 36 | * the node execution function can do these two things directly. In |
| 37 | * the case of nodes that have to wait for some event (a disk read to |
| 38 | * complete, a lock to be released, etc) to occur before they can |
| 39 | * complete, this is typically achieved by having whatever module |
| 40 | * is doing the operation call GenericWakeupFunc upon completion. |
| 41 | * 2. DAG execution functions should check the status in the DAG header |
| 42 | * and NOP out their operations if the status is not "enable". However, |
| 43 | * execution functions that release resources must be sure to release |
| 44 | * them even when they NOP out the function that would use them. |
| 45 | * Functions that acquire resources should go ahead and acquire them |
| 46 | * even when they NOP, so that a downstream release node will not have |
| 47 | * to check to find out whether or not the acquire was suppressed. |
| 48 | */ |
| 49 | |
| 50 | #include <sys/cdefs.h> |
| 51 | __KERNEL_RCSID(0, "$NetBSD: rf_dagfuncs.c,v 1.30 2009/03/23 18:38:54 oster Exp $" ); |
| 52 | |
| 53 | #include <sys/param.h> |
| 54 | #include <sys/ioctl.h> |
| 55 | |
| 56 | #include "rf_archs.h" |
| 57 | #include "rf_raid.h" |
| 58 | #include "rf_dag.h" |
| 59 | #include "rf_layout.h" |
| 60 | #include "rf_etimer.h" |
| 61 | #include "rf_acctrace.h" |
| 62 | #include "rf_diskqueue.h" |
| 63 | #include "rf_dagfuncs.h" |
| 64 | #include "rf_general.h" |
| 65 | #include "rf_engine.h" |
| 66 | #include "rf_dagutils.h" |
| 67 | |
| 68 | #include "rf_kintf.h" |
| 69 | |
| 70 | #if RF_INCLUDE_PARITYLOGGING > 0 |
| 71 | #include "rf_paritylog.h" |
| 72 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
| 73 | |
| 74 | int (*rf_DiskReadFunc) (RF_DagNode_t *); |
| 75 | int (*rf_DiskWriteFunc) (RF_DagNode_t *); |
| 76 | int (*rf_DiskReadUndoFunc) (RF_DagNode_t *); |
| 77 | int (*rf_DiskWriteUndoFunc) (RF_DagNode_t *); |
| 78 | int (*rf_RegularXorUndoFunc) (RF_DagNode_t *); |
| 79 | int (*rf_SimpleXorUndoFunc) (RF_DagNode_t *); |
| 80 | int (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *); |
| 81 | |
| 82 | /***************************************************************************** |
| 83 | * main (only) configuration routine for this module |
| 84 | ****************************************************************************/ |
| 85 | int |
| 86 | rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp) |
| 87 | { |
| 88 | RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || |
| 89 | ((sizeof(long) == 4) && RF_LONGSHIFT == 2)); |
| 90 | rf_DiskReadFunc = rf_DiskReadFuncForThreads; |
| 91 | rf_DiskReadUndoFunc = rf_DiskUndoFunc; |
| 92 | rf_DiskWriteFunc = rf_DiskWriteFuncForThreads; |
| 93 | rf_DiskWriteUndoFunc = rf_DiskUndoFunc; |
| 94 | rf_RegularXorUndoFunc = rf_NullNodeUndoFunc; |
| 95 | rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc; |
| 96 | rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc; |
| 97 | return (0); |
| 98 | } |
| 99 | |
| 100 | |
| 101 | |
| 102 | /***************************************************************************** |
| 103 | * the execution function associated with a terminate node |
| 104 | ****************************************************************************/ |
| 105 | int |
| 106 | rf_TerminateFunc(RF_DagNode_t *node) |
| 107 | { |
| 108 | RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes); |
| 109 | node->status = rf_good; |
| 110 | return (rf_FinishNode(node, RF_THREAD_CONTEXT)); |
| 111 | } |
| 112 | |
| 113 | int |
| 114 | rf_TerminateUndoFunc(RF_DagNode_t *node) |
| 115 | { |
| 116 | return (0); |
| 117 | } |
| 118 | |
| 119 | |
/*****************************************************************************
 * execution functions associated with a mirror node
 *
 * parameters:
 *
 * 0 - physical disk address of data
 * 1 - buffer for holding read data
 * 2 - parity stripe ID
 * 3 - flags
 * 4 - physical disk address of mirror (parity)
 *
 ****************************************************************************/
| 132 | |
| 133 | int |
| 134 | rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node) |
| 135 | { |
| 136 | /* select the mirror copy with the shortest queue and fill in node |
| 137 | * parameters with physical disk address */ |
| 138 | |
| 139 | rf_SelectMirrorDiskIdle(node); |
| 140 | return (rf_DiskReadFunc(node)); |
| 141 | } |
| 142 | |
| 143 | #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0) |
| 144 | int |
| 145 | rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node) |
| 146 | { |
| 147 | /* select the mirror copy with the shortest queue and fill in node |
| 148 | * parameters with physical disk address */ |
| 149 | |
| 150 | rf_SelectMirrorDiskPartition(node); |
| 151 | return (rf_DiskReadFunc(node)); |
| 152 | } |
| 153 | #endif |
| 154 | |
| 155 | int |
| 156 | rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node) |
| 157 | { |
| 158 | return (0); |
| 159 | } |
| 160 | |
| 161 | |
| 162 | |
| 163 | #if RF_INCLUDE_PARITYLOGGING > 0 |
| 164 | /***************************************************************************** |
| 165 | * the execution function associated with a parity log update node |
| 166 | ****************************************************************************/ |
| 167 | int |
| 168 | rf_ParityLogUpdateFunc(RF_DagNode_t *node) |
| 169 | { |
| 170 | RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
| 171 | void *bf = (void *) node->params[1].p; |
| 172 | RF_ParityLogData_t *logData; |
| 173 | #if RF_ACC_TRACE > 0 |
| 174 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
| 175 | RF_Etimer_t timer; |
| 176 | #endif |
| 177 | |
| 178 | if (node->dagHdr->status == rf_enable) { |
| 179 | #if RF_ACC_TRACE > 0 |
| 180 | RF_ETIMER_START(timer); |
| 181 | #endif |
| 182 | logData = rf_CreateParityLogData(RF_UPDATE, pda, bf, |
| 183 | (RF_Raid_t *) (node->dagHdr->raidPtr), |
| 184 | node->wakeFunc, (void *) node, |
| 185 | node->dagHdr->tracerec, timer); |
| 186 | if (logData) |
| 187 | rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE); |
| 188 | else { |
| 189 | #if RF_ACC_TRACE > 0 |
| 190 | RF_ETIMER_STOP(timer); |
| 191 | RF_ETIMER_EVAL(timer); |
| 192 | tracerec->plog_us += RF_ETIMER_VAL_US(timer); |
| 193 | #endif |
| 194 | (node->wakeFunc) (node, ENOMEM); |
| 195 | } |
| 196 | } |
| 197 | return (0); |
| 198 | } |
| 199 | |
| 200 | |
| 201 | /***************************************************************************** |
| 202 | * the execution function associated with a parity log overwrite node |
| 203 | ****************************************************************************/ |
| 204 | int |
| 205 | rf_ParityLogOverwriteFunc(RF_DagNode_t *node) |
| 206 | { |
| 207 | RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
| 208 | void *bf = (void *) node->params[1].p; |
| 209 | RF_ParityLogData_t *logData; |
| 210 | #if RF_ACC_TRACE > 0 |
| 211 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
| 212 | RF_Etimer_t timer; |
| 213 | #endif |
| 214 | |
| 215 | if (node->dagHdr->status == rf_enable) { |
| 216 | #if RF_ACC_TRACE > 0 |
| 217 | RF_ETIMER_START(timer); |
| 218 | #endif |
| 219 | logData = rf_CreateParityLogData(RF_OVERWRITE, pda, bf, |
| 220 | (RF_Raid_t *) (node->dagHdr->raidPtr), |
| 221 | node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer); |
| 222 | if (logData) |
| 223 | rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE); |
| 224 | else { |
| 225 | #if RF_ACC_TRACE > 0 |
| 226 | RF_ETIMER_STOP(timer); |
| 227 | RF_ETIMER_EVAL(timer); |
| 228 | tracerec->plog_us += RF_ETIMER_VAL_US(timer); |
| 229 | #endif |
| 230 | (node->wakeFunc) (node, ENOMEM); |
| 231 | } |
| 232 | } |
| 233 | return (0); |
| 234 | } |
| 235 | |
| 236 | int |
| 237 | rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node) |
| 238 | { |
| 239 | return (0); |
| 240 | } |
| 241 | |
| 242 | int |
| 243 | rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node) |
| 244 | { |
| 245 | return (0); |
| 246 | } |
| 247 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
| 248 | |
| 249 | /***************************************************************************** |
| 250 | * the execution function associated with a NOP node |
| 251 | ****************************************************************************/ |
| 252 | int |
| 253 | rf_NullNodeFunc(RF_DagNode_t *node) |
| 254 | { |
| 255 | node->status = rf_good; |
| 256 | return (rf_FinishNode(node, RF_THREAD_CONTEXT)); |
| 257 | } |
| 258 | |
| 259 | int |
| 260 | rf_NullNodeUndoFunc(RF_DagNode_t *node) |
| 261 | { |
| 262 | node->status = rf_undone; |
| 263 | return (rf_FinishNode(node, RF_THREAD_CONTEXT)); |
| 264 | } |
| 265 | |
| 266 | |
| 267 | /***************************************************************************** |
| 268 | * the execution function associated with a disk-read node |
| 269 | ****************************************************************************/ |
| 270 | int |
| 271 | rf_DiskReadFuncForThreads(RF_DagNode_t *node) |
| 272 | { |
| 273 | RF_DiskQueueData_t *req; |
| 274 | RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
| 275 | void *bf = (void *) node->params[1].p; |
| 276 | RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v; |
| 277 | unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v); |
| 278 | unsigned which_ru = RF_EXTRACT_RU(node->params[3].v); |
| 279 | RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP; |
| 280 | RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; |
| 281 | void *b_proc = NULL; |
| 282 | |
| 283 | if (node->dagHdr->bp) |
| 284 | b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc; |
| 285 | |
| 286 | req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector, |
| 287 | bf, parityStripeID, which_ru, |
| 288 | (int (*) (void *, int)) node->wakeFunc, |
| 289 | node, |
| 290 | #if RF_ACC_TRACE > 0 |
| 291 | node->dagHdr->tracerec, |
| 292 | #else |
| 293 | NULL, |
| 294 | #endif |
| 295 | (void *) (node->dagHdr->raidPtr), 0, b_proc, PR_NOWAIT); |
| 296 | if (!req) { |
| 297 | (node->wakeFunc) (node, ENOMEM); |
| 298 | } else { |
| 299 | node->dagFuncData = (void *) req; |
| 300 | rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority); |
| 301 | } |
| 302 | return (0); |
| 303 | } |
| 304 | |
| 305 | |
| 306 | /***************************************************************************** |
| 307 | * the execution function associated with a disk-write node |
| 308 | ****************************************************************************/ |
| 309 | int |
| 310 | rf_DiskWriteFuncForThreads(RF_DagNode_t *node) |
| 311 | { |
| 312 | RF_DiskQueueData_t *req; |
| 313 | RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
| 314 | void *bf = (void *) node->params[1].p; |
| 315 | RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v; |
| 316 | unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v); |
| 317 | unsigned which_ru = RF_EXTRACT_RU(node->params[3].v); |
| 318 | RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP; |
| 319 | RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; |
| 320 | void *b_proc = NULL; |
| 321 | |
| 322 | if (node->dagHdr->bp) |
| 323 | b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc; |
| 324 | |
| 325 | /* normal processing (rollaway or forward recovery) begins here */ |
| 326 | req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector, |
| 327 | bf, parityStripeID, which_ru, |
| 328 | (int (*) (void *, int)) node->wakeFunc, |
| 329 | (void *) node, |
| 330 | #if RF_ACC_TRACE > 0 |
| 331 | node->dagHdr->tracerec, |
| 332 | #else |
| 333 | NULL, |
| 334 | #endif |
| 335 | (void *) (node->dagHdr->raidPtr), |
| 336 | 0, b_proc, PR_NOWAIT); |
| 337 | |
| 338 | if (!req) { |
| 339 | (node->wakeFunc) (node, ENOMEM); |
| 340 | } else { |
| 341 | node->dagFuncData = (void *) req; |
| 342 | rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority); |
| 343 | } |
| 344 | |
| 345 | return (0); |
| 346 | } |
| 347 | /***************************************************************************** |
| 348 | * the undo function for disk nodes |
| 349 | * Note: this is not a proper undo of a write node, only locks are released. |
| 350 | * old data is not restored to disk! |
| 351 | ****************************************************************************/ |
| 352 | int |
| 353 | rf_DiskUndoFunc(RF_DagNode_t *node) |
| 354 | { |
| 355 | RF_DiskQueueData_t *req; |
| 356 | RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
| 357 | RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; |
| 358 | |
| 359 | req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP, |
| 360 | 0L, 0, NULL, 0L, 0, |
| 361 | (int (*) (void *, int)) node->wakeFunc, |
| 362 | (void *) node, |
| 363 | #if RF_ACC_TRACE > 0 |
| 364 | node->dagHdr->tracerec, |
| 365 | #else |
| 366 | NULL, |
| 367 | #endif |
| 368 | (void *) (node->dagHdr->raidPtr), |
| 369 | 0, NULL, PR_NOWAIT); |
| 370 | if (!req) |
| 371 | (node->wakeFunc) (node, ENOMEM); |
| 372 | else { |
| 373 | node->dagFuncData = (void *) req; |
| 374 | rf_DiskIOEnqueue(&(dqs[pda->col]), req, RF_IO_NORMAL_PRIORITY); |
| 375 | } |
| 376 | |
| 377 | return (0); |
| 378 | } |
| 379 | |
| 380 | /***************************************************************************** |
| 381 | * Callback routine for DiskRead and DiskWrite nodes. When the disk |
| 382 | * op completes, the routine is called to set the node status and |
| 383 | * inform the execution engine that the node has fired. |
| 384 | ****************************************************************************/ |
| 385 | int |
| 386 | rf_GenericWakeupFunc(RF_DagNode_t *node, int status) |
| 387 | { |
| 388 | |
| 389 | switch (node->status) { |
| 390 | case rf_fired: |
| 391 | if (status) |
| 392 | node->status = rf_bad; |
| 393 | else |
| 394 | node->status = rf_good; |
| 395 | break; |
| 396 | case rf_recover: |
| 397 | /* probably should never reach this case */ |
| 398 | if (status) |
| 399 | node->status = rf_panic; |
| 400 | else |
| 401 | node->status = rf_undone; |
| 402 | break; |
| 403 | default: |
| 404 | printf("rf_GenericWakeupFunc:" ); |
| 405 | printf("node->status is %d," , node->status); |
| 406 | printf("status is %d \n" , status); |
| 407 | RF_PANIC(); |
| 408 | break; |
| 409 | } |
| 410 | if (node->dagFuncData) |
| 411 | rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData); |
| 412 | return (rf_FinishNode(node, RF_INTR_CONTEXT)); |
| 413 | } |
| 414 | |
| 415 | |
| 416 | /***************************************************************************** |
| 417 | * there are three distinct types of xor nodes: |
| 418 | |
| 419 | * A "regular xor" is used in the fault-free case where the access |
| 420 | * spans a complete stripe unit. It assumes that the result buffer is |
| 421 | * one full stripe unit in size, and uses the stripe-unit-offset |
| 422 | * values that it computes from the PDAs to determine where within the |
| 423 | * stripe unit to XOR each argument buffer. |
| 424 | * |
| 425 | * A "simple xor" is used in the fault-free case where the access |
| 426 | * touches only a portion of one (or two, in some cases) stripe |
| 427 | * unit(s). It assumes that all the argument buffers are of the same |
| 428 | * size and have the same stripe unit offset. |
| 429 | * |
| 430 | * A "recovery xor" is used in the degraded-mode case. It's similar |
| 431 | * to the regular xor function except that it takes the failed PDA as |
| 432 | * an additional parameter, and uses it to determine what portions of |
| 433 | * the argument buffers need to be xor'd into the result buffer, and |
| 434 | * where in the result buffer they should go. |
| 435 | ****************************************************************************/ |
| 436 | |
| 437 | /* xor the params together and store the result in the result field. |
| 438 | * assume the result field points to a buffer that is the size of one |
| 439 | * SU, and use the pda params to determine where within the buffer to |
| 440 | * XOR the input buffers. */ |
| 441 | int |
| 442 | rf_RegularXorFunc(RF_DagNode_t *node) |
| 443 | { |
| 444 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; |
| 445 | #if RF_ACC_TRACE > 0 |
| 446 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
| 447 | RF_Etimer_t timer; |
| 448 | #endif |
| 449 | int i, retcode; |
| 450 | |
| 451 | retcode = 0; |
| 452 | if (node->dagHdr->status == rf_enable) { |
| 453 | /* don't do the XOR if the input is the same as the output */ |
| 454 | #if RF_ACC_TRACE > 0 |
| 455 | RF_ETIMER_START(timer); |
| 456 | #endif |
| 457 | for (i = 0; i < node->numParams - 1; i += 2) |
| 458 | if (node->params[i + 1].p != node->results[0]) { |
| 459 | retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p, |
| 460 | (char *) node->params[i + 1].p, (char *) node->results[0]); |
| 461 | } |
| 462 | #if RF_ACC_TRACE > 0 |
| 463 | RF_ETIMER_STOP(timer); |
| 464 | RF_ETIMER_EVAL(timer); |
| 465 | tracerec->xor_us += RF_ETIMER_VAL_US(timer); |
| 466 | #endif |
| 467 | } |
| 468 | return (rf_GenericWakeupFunc(node, retcode)); /* call wake func |
| 469 | * explicitly since no |
| 470 | * I/O in this node */ |
| 471 | } |
| 472 | /* xor the inputs into the result buffer, ignoring placement issues */ |
| 473 | int |
| 474 | rf_SimpleXorFunc(RF_DagNode_t *node) |
| 475 | { |
| 476 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; |
| 477 | int i, retcode = 0; |
| 478 | #if RF_ACC_TRACE > 0 |
| 479 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
| 480 | RF_Etimer_t timer; |
| 481 | #endif |
| 482 | |
| 483 | if (node->dagHdr->status == rf_enable) { |
| 484 | #if RF_ACC_TRACE > 0 |
| 485 | RF_ETIMER_START(timer); |
| 486 | #endif |
| 487 | /* don't do the XOR if the input is the same as the output */ |
| 488 | for (i = 0; i < node->numParams - 1; i += 2) |
| 489 | if (node->params[i + 1].p != node->results[0]) { |
| 490 | retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0], |
| 491 | rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector)); |
| 492 | } |
| 493 | #if RF_ACC_TRACE > 0 |
| 494 | RF_ETIMER_STOP(timer); |
| 495 | RF_ETIMER_EVAL(timer); |
| 496 | tracerec->xor_us += RF_ETIMER_VAL_US(timer); |
| 497 | #endif |
| 498 | } |
| 499 | return (rf_GenericWakeupFunc(node, retcode)); /* call wake func |
| 500 | * explicitly since no |
| 501 | * I/O in this node */ |
| 502 | } |
| 503 | /* this xor is used by the degraded-mode dag functions to recover lost |
| 504 | * data. the second-to-last parameter is the PDA for the failed |
| 505 | * portion of the access. the code here looks at this PDA and assumes |
| 506 | * that the xor target buffer is equal in size to the number of |
| 507 | * sectors in the failed PDA. It then uses the other PDAs in the |
| 508 | * parameter list to determine where within the target buffer the |
| 509 | * corresponding data should be xored. */ |
| 510 | int |
| 511 | rf_RecoveryXorFunc(RF_DagNode_t *node) |
| 512 | { |
| 513 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; |
| 514 | RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; |
| 515 | RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; |
| 516 | int i, retcode = 0; |
| 517 | RF_PhysDiskAddr_t *pda; |
| 518 | int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); |
| 519 | char *srcbuf, *destbuf; |
| 520 | #if RF_ACC_TRACE > 0 |
| 521 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
| 522 | RF_Etimer_t timer; |
| 523 | #endif |
| 524 | |
| 525 | if (node->dagHdr->status == rf_enable) { |
| 526 | #if RF_ACC_TRACE > 0 |
| 527 | RF_ETIMER_START(timer); |
| 528 | #endif |
| 529 | for (i = 0; i < node->numParams - 2; i += 2) |
| 530 | if (node->params[i + 1].p != node->results[0]) { |
| 531 | pda = (RF_PhysDiskAddr_t *) node->params[i].p; |
| 532 | srcbuf = (char *) node->params[i + 1].p; |
| 533 | suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); |
| 534 | destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); |
| 535 | retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector)); |
| 536 | } |
| 537 | #if RF_ACC_TRACE > 0 |
| 538 | RF_ETIMER_STOP(timer); |
| 539 | RF_ETIMER_EVAL(timer); |
| 540 | tracerec->xor_us += RF_ETIMER_VAL_US(timer); |
| 541 | #endif |
| 542 | } |
| 543 | return (rf_GenericWakeupFunc(node, retcode)); |
| 544 | } |
| 545 | /***************************************************************************** |
| 546 | * The next three functions are utilities used by the above |
| 547 | * xor-execution functions. |
| 548 | ****************************************************************************/ |
| 549 | |
| 550 | |
| 551 | /* |
| 552 | * this is just a glorified buffer xor. targbuf points to a buffer |
| 553 | * that is one full stripe unit in size. srcbuf points to a buffer |
| 554 | * that may be less than 1 SU, but never more. When the access |
| 555 | * described by pda is one SU in size (which by implication means it's |
| 556 | * SU-aligned), all that happens is (targbuf) <- (srcbuf ^ targbuf). |
| 557 | * When the access is less than one SU in size the XOR occurs on only |
| 558 | * the portion of targbuf identified in the pda. */ |
| 559 | |
| 560 | int |
| 561 | rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, |
| 562 | char *srcbuf, char *targbuf) |
| 563 | { |
| 564 | char *targptr; |
| 565 | int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
| 566 | int SUOffset = pda->startSector % sectPerSU; |
| 567 | int length, retcode = 0; |
| 568 | |
| 569 | RF_ASSERT(pda->numSector <= sectPerSU); |
| 570 | |
| 571 | targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset); |
| 572 | length = rf_RaidAddressToByte(raidPtr, pda->numSector); |
| 573 | retcode = rf_bxor(srcbuf, targptr, length); |
| 574 | return (retcode); |
| 575 | } |
| 576 | /* it really should be the case that the buffer pointers (returned by |
| 577 | * malloc) are aligned to the natural word size of the machine, so |
| 578 | * this is the only case we optimize for. The length should always be |
| 579 | * a multiple of the sector size, so there should be no problem with |
| 580 | * leftover bytes at the end. */ |
| 581 | int |
| 582 | rf_bxor(char *src, char *dest, int len) |
| 583 | { |
| 584 | unsigned mask = sizeof(long) - 1, retcode = 0; |
| 585 | |
| 586 | if (!(((unsigned long) src) & mask) && |
| 587 | !(((unsigned long) dest) & mask) && !(len & mask)) { |
| 588 | retcode = rf_longword_bxor((unsigned long *) src, |
| 589 | (unsigned long *) dest, |
| 590 | len >> RF_LONGSHIFT); |
| 591 | } else { |
| 592 | RF_ASSERT(0); |
| 593 | } |
| 594 | return (retcode); |
| 595 | } |
| 596 | |
| 597 | /* When XORing in kernel mode, we need to map each user page to kernel |
| 598 | * space before we can access it. We don't want to assume anything |
| 599 | * about which input buffers are in kernel/user space, nor about their |
| 600 | * alignment, so in each loop we compute the maximum number of bytes |
| 601 | * that we can xor without crossing any page boundaries, and do only |
| 602 | * this many bytes before the next remap. |
| 603 | * |
| 604 | * len - is in longwords |
| 605 | */ |
| 606 | int |
| 607 | rf_longword_bxor(unsigned long *src, unsigned long *dest, int len) |
| 608 | { |
| 609 | unsigned long *end = src + len; |
| 610 | unsigned long d0, d1, d2, d3, s0, s1, s2, s3; /* temps */ |
| 611 | unsigned long *pg_src, *pg_dest; /* per-page source/dest pointers */ |
| 612 | int longs_this_time;/* # longwords to xor in the current iteration */ |
| 613 | |
| 614 | pg_src = src; |
| 615 | pg_dest = dest; |
| 616 | if (!pg_src || !pg_dest) |
| 617 | return (EFAULT); |
| 618 | |
| 619 | while (len >= 4) { |
| 620 | longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT); /* note len in longwords */ |
| 621 | src += longs_this_time; |
| 622 | dest += longs_this_time; |
| 623 | len -= longs_this_time; |
| 624 | while (longs_this_time >= 4) { |
| 625 | d0 = pg_dest[0]; |
| 626 | d1 = pg_dest[1]; |
| 627 | d2 = pg_dest[2]; |
| 628 | d3 = pg_dest[3]; |
| 629 | s0 = pg_src[0]; |
| 630 | s1 = pg_src[1]; |
| 631 | s2 = pg_src[2]; |
| 632 | s3 = pg_src[3]; |
| 633 | pg_dest[0] = d0 ^ s0; |
| 634 | pg_dest[1] = d1 ^ s1; |
| 635 | pg_dest[2] = d2 ^ s2; |
| 636 | pg_dest[3] = d3 ^ s3; |
| 637 | pg_src += 4; |
| 638 | pg_dest += 4; |
| 639 | longs_this_time -= 4; |
| 640 | } |
| 641 | while (longs_this_time > 0) { /* cannot cross any page |
| 642 | * boundaries here */ |
| 643 | *pg_dest++ ^= *pg_src++; |
| 644 | longs_this_time--; |
| 645 | } |
| 646 | |
| 647 | /* either we're done, or we've reached a page boundary on one |
| 648 | * (or possibly both) of the pointers */ |
| 649 | if (len) { |
| 650 | if (RF_PAGE_ALIGNED(src)) |
| 651 | pg_src = src; |
| 652 | if (RF_PAGE_ALIGNED(dest)) |
| 653 | pg_dest = dest; |
| 654 | if (!pg_src || !pg_dest) |
| 655 | return (EFAULT); |
| 656 | } |
| 657 | } |
| 658 | while (src < end) { |
| 659 | *pg_dest++ ^= *pg_src++; |
| 660 | src++; |
| 661 | dest++; |
| 662 | len--; |
| 663 | if (RF_PAGE_ALIGNED(src)) |
| 664 | pg_src = src; |
| 665 | if (RF_PAGE_ALIGNED(dest)) |
| 666 | pg_dest = dest; |
| 667 | } |
| 668 | RF_ASSERT(len == 0); |
| 669 | return (0); |
| 670 | } |
| 671 | |
| 672 | #if 0 |
| 673 | /* |
| 674 | dst = a ^ b ^ c; |
| 675 | a may equal dst |
| 676 | see comment above longword_bxor |
| 677 | len is length in longwords |
| 678 | */ |
| 679 | int |
| 680 | rf_longword_bxor3(unsigned long *dst, unsigned long *a, unsigned long *b, |
| 681 | unsigned long *c, int len, void *bp) |
| 682 | { |
| 683 | unsigned long a0, a1, a2, a3, b0, b1, b2, b3; |
| 684 | unsigned long *pg_a, *pg_b, *pg_c, *pg_dst; /* per-page source/dest |
| 685 | * pointers */ |
| 686 | int longs_this_time;/* # longs to xor in the current iteration */ |
| 687 | char dst_is_a = 0; |
| 688 | |
| 689 | pg_a = a; |
| 690 | pg_b = b; |
| 691 | pg_c = c; |
| 692 | if (a == dst) { |
| 693 | pg_dst = pg_a; |
| 694 | dst_is_a = 1; |
| 695 | } else { |
| 696 | pg_dst = dst; |
| 697 | } |
| 698 | |
| 699 | /* align dest to cache line. Can't cross a pg boundary on dst here. */ |
| 700 | while ((((unsigned long) pg_dst) & 0x1f)) { |
| 701 | *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; |
| 702 | dst++; |
| 703 | a++; |
| 704 | b++; |
| 705 | c++; |
| 706 | if (RF_PAGE_ALIGNED(a)) { |
| 707 | pg_a = a; |
| 708 | if (!pg_a) |
| 709 | return (EFAULT); |
| 710 | } |
| 711 | if (RF_PAGE_ALIGNED(b)) { |
| 712 | pg_b = a; |
| 713 | if (!pg_b) |
| 714 | return (EFAULT); |
| 715 | } |
| 716 | if (RF_PAGE_ALIGNED(c)) { |
| 717 | pg_c = a; |
| 718 | if (!pg_c) |
| 719 | return (EFAULT); |
| 720 | } |
| 721 | len--; |
| 722 | } |
| 723 | |
| 724 | while (len > 4) { |
| 725 | longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT); |
| 726 | a += longs_this_time; |
| 727 | b += longs_this_time; |
| 728 | c += longs_this_time; |
| 729 | dst += longs_this_time; |
| 730 | len -= longs_this_time; |
| 731 | while (longs_this_time >= 4) { |
| 732 | a0 = pg_a[0]; |
| 733 | longs_this_time -= 4; |
| 734 | |
| 735 | a1 = pg_a[1]; |
| 736 | a2 = pg_a[2]; |
| 737 | |
| 738 | a3 = pg_a[3]; |
| 739 | pg_a += 4; |
| 740 | |
| 741 | b0 = pg_b[0]; |
| 742 | b1 = pg_b[1]; |
| 743 | |
| 744 | b2 = pg_b[2]; |
| 745 | b3 = pg_b[3]; |
| 746 | /* start dual issue */ |
| 747 | a0 ^= b0; |
| 748 | b0 = pg_c[0]; |
| 749 | |
| 750 | pg_b += 4; |
| 751 | a1 ^= b1; |
| 752 | |
| 753 | a2 ^= b2; |
| 754 | a3 ^= b3; |
| 755 | |
| 756 | b1 = pg_c[1]; |
| 757 | a0 ^= b0; |
| 758 | |
| 759 | b2 = pg_c[2]; |
| 760 | a1 ^= b1; |
| 761 | |
| 762 | b3 = pg_c[3]; |
| 763 | a2 ^= b2; |
| 764 | |
| 765 | pg_dst[0] = a0; |
| 766 | a3 ^= b3; |
| 767 | pg_dst[1] = a1; |
| 768 | pg_c += 4; |
| 769 | pg_dst[2] = a2; |
| 770 | pg_dst[3] = a3; |
| 771 | pg_dst += 4; |
| 772 | } |
| 773 | while (longs_this_time > 0) { /* cannot cross any page |
| 774 | * boundaries here */ |
| 775 | *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; |
| 776 | longs_this_time--; |
| 777 | } |
| 778 | |
| 779 | if (len) { |
| 780 | if (RF_PAGE_ALIGNED(a)) { |
| 781 | pg_a = a; |
| 782 | if (!pg_a) |
| 783 | return (EFAULT); |
| 784 | if (dst_is_a) |
| 785 | pg_dst = pg_a; |
| 786 | } |
| 787 | if (RF_PAGE_ALIGNED(b)) { |
| 788 | pg_b = b; |
| 789 | if (!pg_b) |
| 790 | return (EFAULT); |
| 791 | } |
| 792 | if (RF_PAGE_ALIGNED(c)) { |
| 793 | pg_c = c; |
| 794 | if (!pg_c) |
| 795 | return (EFAULT); |
| 796 | } |
| 797 | if (!dst_is_a) |
| 798 | if (RF_PAGE_ALIGNED(dst)) { |
| 799 | pg_dst = dst; |
| 800 | if (!pg_dst) |
| 801 | return (EFAULT); |
| 802 | } |
| 803 | } |
| 804 | } |
| 805 | while (len) { |
| 806 | *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; |
| 807 | dst++; |
| 808 | a++; |
| 809 | b++; |
| 810 | c++; |
| 811 | if (RF_PAGE_ALIGNED(a)) { |
| 812 | pg_a = a; |
| 813 | if (!pg_a) |
| 814 | return (EFAULT); |
| 815 | if (dst_is_a) |
| 816 | pg_dst = pg_a; |
| 817 | } |
| 818 | if (RF_PAGE_ALIGNED(b)) { |
| 819 | pg_b = b; |
| 820 | if (!pg_b) |
| 821 | return (EFAULT); |
| 822 | } |
| 823 | if (RF_PAGE_ALIGNED(c)) { |
| 824 | pg_c = c; |
| 825 | if (!pg_c) |
| 826 | return (EFAULT); |
| 827 | } |
| 828 | if (!dst_is_a) |
| 829 | if (RF_PAGE_ALIGNED(dst)) { |
| 830 | pg_dst = dst; |
| 831 | if (!pg_dst) |
| 832 | return (EFAULT); |
| 833 | } |
| 834 | len--; |
| 835 | } |
| 836 | return (0); |
| 837 | } |
| 838 | |
| 839 | int |
| 840 | rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b, |
| 841 | unsigned char *c, unsigned long len, void *bp) |
| 842 | { |
| 843 | RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0); |
| 844 | |
| 845 | return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a, |
| 846 | (unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp)); |
| 847 | } |
| 848 | #endif |
| 849 | |