| 1 | /* $NetBSD: rf_copyback.c,v 1.50 2014/06/14 07:39:00 hannken Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 1995 Carnegie-Mellon University. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * Author: Mark Holland |
| 7 | * |
| 8 | * Permission to use, copy, modify and distribute this software and |
| 9 | * its documentation is hereby granted, provided that both the copyright |
| 10 | * notice and this permission notice appear in all copies of the |
| 11 | * software, derivative works or modified versions, and any portions |
| 12 | * thereof, and that both notices appear in supporting documentation. |
| 13 | * |
| 14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 17 | * |
| 18 | * Carnegie Mellon requests users of this software to return to |
| 19 | * |
| 20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 21 | * School of Computer Science |
| 22 | * Carnegie Mellon University |
| 23 | * Pittsburgh PA 15213-3890 |
| 24 | * |
| 25 | * any improvements or extensions that they make and grant Carnegie the |
| 26 | * rights to redistribute these changes. |
| 27 | */ |
| 28 | |
| 29 | /***************************************************************************** |
| 30 | * |
| 31 | * copyback.c -- code to copy reconstructed data back from spare space to |
| 32 | * the replaced disk. |
| 33 | * |
| 34 | * the code operates using callbacks on the I/Os to continue with the |
| 35 | * next unit to be copied back. We do this because a simple loop |
| 36 | * containing blocking I/Os will not work in the simulator. |
| 37 | * |
| 38 | ****************************************************************************/ |
| 39 | |
| 40 | #include <sys/cdefs.h> |
| 41 | __KERNEL_RCSID(0, "$NetBSD: rf_copyback.c,v 1.50 2014/06/14 07:39:00 hannken Exp $" ); |
| 42 | |
| 43 | #include <dev/raidframe/raidframevar.h> |
| 44 | |
| 45 | #include <sys/time.h> |
| 46 | #include <sys/buf.h> |
| 47 | #include "rf_raid.h" |
| 48 | #include "rf_mcpair.h" |
| 49 | #include "rf_acctrace.h" |
| 50 | #include "rf_etimer.h" |
| 51 | #include "rf_general.h" |
| 52 | #include "rf_utils.h" |
| 53 | #include "rf_copyback.h" |
| 54 | #include "rf_decluster.h" |
| 55 | #include "rf_driver.h" |
| 56 | #include "rf_shutdown.h" |
| 57 | #include "rf_kintf.h" |
| 58 | |
/*
 * Kind of stripe unit handed to rf_CopybackOne(): it selects whether the
 * spare location is found with the layout's MapSector (data) or
 * MapParity (parity) routine.
 */
#define RF_COPYBACK_DATA 0
#define RF_COPYBACK_PARITY 1

/*
 * Set to 1 when a copyback starts and cleared when it completes (and at
 * configuration time).  The code that sets it marks it "debug only".
 */
int rf_copyback_in_progress;

/* I/O completion callbacks for the read and the write half of one copy */
static int rf_CopybackReadDoneProc(RF_CopybackDesc_t * desc, int status);
static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t * desc, int status);
/* copy a single stripe unit from spare space back to the replaced disk */
static void rf_CopybackOne(RF_CopybackDesc_t * desc, int typ,
	RF_RaidAddr_t addr, RF_RowCol_t testCol,
	RF_SectorNum_t testOffs);
/* tear down the descriptor and resume normal I/O once copyback ends */
static void rf_CopybackComplete(RF_CopybackDesc_t * desc, int status);
| 70 | |
| 71 | int |
| 72 | rf_ConfigureCopyback(RF_ShutdownList_t **listp) |
| 73 | { |
| 74 | rf_copyback_in_progress = 0; |
| 75 | return (0); |
| 76 | } |
| 77 | |
| 78 | #include <sys/param.h> |
| 79 | #include <sys/systm.h> |
| 80 | #include <sys/proc.h> |
| 81 | #include <sys/ioctl.h> |
| 82 | #include <sys/fcntl.h> |
| 83 | #include <sys/vnode.h> |
| 84 | #include <sys/namei.h> /* for pathbuf */ |
| 85 | |
| 86 | #include <miscfs/specfs/specdev.h> /* for v_rdev */ |
| 87 | |
| 88 | /* do a complete copyback */ |
| 89 | void |
| 90 | rf_CopybackReconstructedData(RF_Raid_t *raidPtr) |
| 91 | { |
| 92 | RF_ComponentLabel_t *c_label; |
| 93 | int found, retcode; |
| 94 | RF_CopybackDesc_t *desc; |
| 95 | RF_RowCol_t fcol; |
| 96 | RF_RaidDisk_t *badDisk; |
| 97 | char *databuf; |
| 98 | |
| 99 | struct pathbuf *dev_pb; |
| 100 | struct vnode *vp; |
| 101 | |
| 102 | int ac; |
| 103 | |
| 104 | fcol = 0; |
| 105 | found = 0; |
| 106 | for (fcol = 0; fcol < raidPtr->numCol; fcol++) { |
| 107 | if (raidPtr->Disks[fcol].status == rf_ds_dist_spared |
| 108 | || raidPtr->Disks[fcol].status == rf_ds_spared) { |
| 109 | found = 1; |
| 110 | break; |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | if (!found) { |
| 115 | printf("raid%d: no disks need copyback\n" , raidPtr->raidid); |
| 116 | return; |
| 117 | } |
| 118 | |
| 119 | badDisk = &raidPtr->Disks[fcol]; |
| 120 | |
| 121 | /* This device may have been opened successfully the first time. Close |
| 122 | * it before trying to open it again.. */ |
| 123 | |
| 124 | if (raidPtr->raid_cinfo[fcol].ci_vp != NULL) { |
| 125 | printf("Closed the open device: %s\n" , |
| 126 | raidPtr->Disks[fcol].devname); |
| 127 | vp = raidPtr->raid_cinfo[fcol].ci_vp; |
| 128 | ac = raidPtr->Disks[fcol].auto_configured; |
| 129 | rf_close_component(raidPtr, vp, ac); |
| 130 | raidPtr->raid_cinfo[fcol].ci_vp = NULL; |
| 131 | |
| 132 | } |
| 133 | /* note that this disk was *not* auto_configured (any longer) */ |
| 134 | raidPtr->Disks[fcol].auto_configured = 0; |
| 135 | |
| 136 | printf("About to (re-)open the device: %s\n" , |
| 137 | raidPtr->Disks[fcol].devname); |
| 138 | |
| 139 | dev_pb = pathbuf_create(raidPtr->Disks[fcol].devname); |
| 140 | if (dev_pb == NULL) { |
| 141 | /* shouldn't happen unless maybe the system is OOMing */ |
| 142 | printf("raid%d: copyback: pathbuf_create on device: %s failed: %d!\n" , |
| 143 | raidPtr->raidid, raidPtr->Disks[fcol].devname, |
| 144 | ENOMEM); |
| 145 | return; |
| 146 | } |
| 147 | retcode = dk_lookup(dev_pb, curlwp, &vp); |
| 148 | pathbuf_destroy(dev_pb); |
| 149 | |
| 150 | if (retcode) { |
| 151 | printf("raid%d: copyback: dk_lookup on device: %s failed: %d!\n" , |
| 152 | raidPtr->raidid, raidPtr->Disks[fcol].devname, |
| 153 | retcode); |
| 154 | |
| 155 | /* XXX the component isn't responding properly... must be |
| 156 | * still dead :-( */ |
| 157 | return; |
| 158 | |
| 159 | } else { |
| 160 | |
| 161 | /* Ok, so we can at least do a lookup... How about actually |
| 162 | * getting a vp for it? */ |
| 163 | |
| 164 | retcode = rf_getdisksize(vp, &raidPtr->Disks[fcol]); |
| 165 | if (retcode) { |
| 166 | return; |
| 167 | } |
| 168 | |
| 169 | raidPtr->raid_cinfo[fcol].ci_vp = vp; |
| 170 | raidPtr->raid_cinfo[fcol].ci_dev = vp->v_rdev; |
| 171 | |
| 172 | raidPtr->Disks[fcol].dev = vp->v_rdev; /* XXX or the above? */ |
| 173 | |
| 174 | /* we allow the user to specify that only a fraction of the |
| 175 | * disks should be used this is just for debug: it speeds up |
| 176 | * the parity scan */ |
| 177 | raidPtr->Disks[fcol].numBlocks = |
| 178 | raidPtr->Disks[fcol].numBlocks * |
| 179 | rf_sizePercentage / 100; |
| 180 | } |
| 181 | |
| 182 | if (retcode) { |
| 183 | printf("raid%d: copyback: target disk failed TUR\n" , |
| 184 | raidPtr->raidid); |
| 185 | return; |
| 186 | } |
| 187 | /* get a buffer to hold one SU */ |
| 188 | RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *)); |
| 189 | |
| 190 | /* create a descriptor */ |
| 191 | RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *)); |
| 192 | desc->raidPtr = raidPtr; |
| 193 | desc->status = 0; |
| 194 | desc->fcol = fcol; |
| 195 | desc->spCol = badDisk->spareCol; |
| 196 | desc->stripeAddr = 0; |
| 197 | desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
| 198 | desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol; |
| 199 | desc->databuf = databuf; |
| 200 | desc->mcpair = rf_AllocMCPair(); |
| 201 | |
| 202 | /* quiesce the array, since we don't want to code support for user |
| 203 | * accs here */ |
| 204 | rf_SuspendNewRequestsAndWait(raidPtr); |
| 205 | |
| 206 | /* adjust state of the array and of the disks */ |
| 207 | rf_lock_mutex2(raidPtr->mutex); |
| 208 | raidPtr->Disks[desc->fcol].status = rf_ds_optimal; |
| 209 | raidPtr->status = rf_rs_optimal; |
| 210 | rf_copyback_in_progress = 1; /* debug only */ |
| 211 | rf_unlock_mutex2(raidPtr->mutex); |
| 212 | |
| 213 | RF_GETTIME(desc->starttime); |
| 214 | rf_ContinueCopyback(desc); |
| 215 | |
| 216 | /* Data has been restored. Fix up the component label. */ |
| 217 | /* Don't actually need the read here.. */ |
| 218 | |
| 219 | c_label = raidget_component_label(raidPtr, fcol); |
| 220 | raid_init_component_label(raidPtr, c_label); |
| 221 | |
| 222 | c_label->row = 0; |
| 223 | c_label->column = fcol; |
| 224 | rf_component_label_set_partitionsize(c_label, |
| 225 | raidPtr->Disks[fcol].partitionSize); |
| 226 | |
| 227 | raidflush_component_label(raidPtr, fcol); |
| 228 | |
| 229 | /* XXXjld why is this here? */ |
| 230 | rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); |
| 231 | } |
| 232 | |
| 233 | |
| 234 | /* |
| 235 | * invoked via callback after a copyback I/O has completed to |
| 236 | * continue on with the next one |
| 237 | */ |
| 238 | void |
| 239 | rf_ContinueCopyback(RF_CopybackDesc_t *desc) |
| 240 | { |
| 241 | RF_SectorNum_t testOffs, stripeAddr; |
| 242 | RF_Raid_t *raidPtr = desc->raidPtr; |
| 243 | RF_RaidAddr_t addr; |
| 244 | RF_RowCol_t testCol; |
| 245 | #if RF_DEBUG_RECON |
| 246 | int old_pctg, new_pctg; |
| 247 | struct timeval t, diff; |
| 248 | #endif |
| 249 | int done; |
| 250 | |
| 251 | #if RF_DEBUG_RECON |
| 252 | old_pctg = (-1); |
| 253 | #endif |
| 254 | while (1) { |
| 255 | stripeAddr = desc->stripeAddr; |
| 256 | desc->raidPtr->copyback_stripes_done = stripeAddr |
| 257 | / desc->sectPerStripe; |
| 258 | #if RF_DEBUG_RECON |
| 259 | if (rf_prReconSched) { |
| 260 | old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors; |
| 261 | } |
| 262 | #endif |
| 263 | desc->stripeAddr += desc->sectPerStripe; |
| 264 | #if RF_DEBUG_RECON |
| 265 | if (rf_prReconSched) { |
| 266 | new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors; |
| 267 | if (new_pctg != old_pctg) { |
| 268 | RF_GETTIME(t); |
| 269 | RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff); |
| 270 | printf("%d %d.%06d\n" , new_pctg, (int) diff.tv_sec, (int) diff.tv_usec); |
| 271 | } |
| 272 | } |
| 273 | #endif |
| 274 | if (stripeAddr >= raidPtr->totalSectors) { |
| 275 | rf_CopybackComplete(desc, 0); |
| 276 | return; |
| 277 | } |
| 278 | /* walk through the current stripe, su-by-su */ |
| 279 | for (done = 0, addr = stripeAddr; addr < stripeAddr + desc->sectPerStripe; addr += desc->sectPerSU) { |
| 280 | |
| 281 | /* map the SU, disallowing remap to spare space */ |
| 282 | (raidPtr->Layout.map->MapSector) (raidPtr, addr, &testCol, &testOffs, RF_DONT_REMAP); |
| 283 | |
| 284 | if (testCol == desc->fcol) { |
| 285 | rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testCol, testOffs); |
| 286 | done = 1; |
| 287 | break; |
| 288 | } |
| 289 | } |
| 290 | |
| 291 | if (!done) { |
| 292 | /* we didn't find the failed disk in the data part. |
| 293 | * check parity. */ |
| 294 | |
| 295 | /* map the parity for this stripe, disallowing remap |
| 296 | * to spare space */ |
| 297 | (raidPtr->Layout.map->MapParity) (raidPtr, stripeAddr, &testCol, &testOffs, RF_DONT_REMAP); |
| 298 | |
| 299 | if (testCol == desc->fcol) { |
| 300 | rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testCol, testOffs); |
| 301 | } |
| 302 | } |
| 303 | /* check to see if the last read/write pair failed */ |
| 304 | if (desc->status) { |
| 305 | rf_CopybackComplete(desc, 1); |
| 306 | return; |
| 307 | } |
| 308 | /* we didn't find any units to copy back in this stripe. |
| 309 | * Continue with the next one */ |
| 310 | } |
| 311 | } |
| 312 | |
| 313 | |
| 314 | /* copyback one unit */ |
| 315 | static void |
| 316 | rf_CopybackOne(RF_CopybackDesc_t *desc, int typ, RF_RaidAddr_t addr, |
| 317 | RF_RowCol_t testCol, RF_SectorNum_t testOffs) |
| 318 | { |
| 319 | RF_SectorCount_t sectPerSU = desc->sectPerSU; |
| 320 | RF_Raid_t *raidPtr = desc->raidPtr; |
| 321 | RF_RowCol_t spCol = desc->spCol; |
| 322 | RF_SectorNum_t spOffs; |
| 323 | |
| 324 | /* find the spare spare location for this SU */ |
| 325 | if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { |
| 326 | if (typ == RF_COPYBACK_DATA) |
| 327 | raidPtr->Layout.map->MapSector(raidPtr, addr, &spCol, &spOffs, RF_REMAP); |
| 328 | else |
| 329 | raidPtr->Layout.map->MapParity(raidPtr, addr, &spCol, &spOffs, RF_REMAP); |
| 330 | } else { |
| 331 | spOffs = testOffs; |
| 332 | } |
| 333 | |
| 334 | /* create reqs to read the old location & write the new */ |
| 335 | desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs, |
| 336 | sectPerSU, desc->databuf, 0L, 0, |
| 337 | (int (*) (void *, int)) rf_CopybackReadDoneProc, desc, |
| 338 | NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL, |
| 339 | PR_WAITOK); |
| 340 | desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs, |
| 341 | sectPerSU, desc->databuf, 0L, 0, |
| 342 | (int (*) (void *, int)) rf_CopybackWriteDoneProc, desc, |
| 343 | NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL, |
| 344 | PR_WAITOK); |
| 345 | desc->fcol = testCol; |
| 346 | |
| 347 | /* enqueue the read. the write will go out as part of the callback on |
| 348 | * the read. at user-level & in the kernel, wait for the read-write |
| 349 | * pair to complete. in the simulator, just return, since everything |
| 350 | * will happen as callbacks */ |
| 351 | |
| 352 | RF_LOCK_MCPAIR(desc->mcpair); |
| 353 | desc->mcpair->flag = 0; |
| 354 | RF_UNLOCK_MCPAIR(desc->mcpair); |
| 355 | |
| 356 | rf_DiskIOEnqueue(&raidPtr->Queues[spCol], desc->readreq, RF_IO_NORMAL_PRIORITY); |
| 357 | |
| 358 | RF_LOCK_MCPAIR(desc->mcpair); |
| 359 | while (!desc->mcpair->flag) { |
| 360 | RF_WAIT_MCPAIR(desc->mcpair); |
| 361 | } |
| 362 | RF_UNLOCK_MCPAIR(desc->mcpair); |
| 363 | rf_FreeDiskQueueData(desc->readreq); |
| 364 | rf_FreeDiskQueueData(desc->writereq); |
| 365 | |
| 366 | } |
| 367 | |
| 368 | |
| 369 | /* called at interrupt context when the read has completed. just send out the write */ |
| 370 | static int |
| 371 | rf_CopybackReadDoneProc(RF_CopybackDesc_t *desc, int status) |
| 372 | { |
| 373 | if (status) { /* invoke the callback with bad status */ |
| 374 | printf("raid%d: copyback read failed. Aborting.\n" , |
| 375 | desc->raidPtr->raidid); |
| 376 | (desc->writereq->CompleteFunc) (desc, -100); |
| 377 | } else { |
| 378 | rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->fcol]), desc->writereq, RF_IO_NORMAL_PRIORITY); |
| 379 | } |
| 380 | return (0); |
| 381 | } |
| 382 | /* called at interrupt context when the write has completed. |
| 383 | * at user level & in the kernel, wake up the copyback thread. |
| 384 | * in the simulator, invoke the next copyback directly. |
| 385 | * can't free diskqueuedata structs in the kernel b/c we're at interrupt context. |
| 386 | */ |
| 387 | static int |
| 388 | rf_CopybackWriteDoneProc(RF_CopybackDesc_t *desc, int status) |
| 389 | { |
| 390 | if (status && status != -100) { |
| 391 | printf("raid%d: copyback write failed. Aborting.\n" , |
| 392 | desc->raidPtr->raidid); |
| 393 | } |
| 394 | desc->status = status; |
| 395 | rf_MCPairWakeupFunc(desc->mcpair); |
| 396 | return (0); |
| 397 | } |
| 398 | /* invoked when the copyback has completed */ |
| 399 | static void |
| 400 | rf_CopybackComplete(RF_CopybackDesc_t *desc, int status) |
| 401 | { |
| 402 | RF_Raid_t *raidPtr = desc->raidPtr; |
| 403 | struct timeval t, diff; |
| 404 | |
| 405 | if (!status) { |
| 406 | rf_lock_mutex2(raidPtr->mutex); |
| 407 | if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { |
| 408 | RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D'); |
| 409 | rf_FreeSpareTable(raidPtr); |
| 410 | } else { |
| 411 | raidPtr->Disks[desc->spCol].status = rf_ds_spare; |
| 412 | } |
| 413 | rf_unlock_mutex2(raidPtr->mutex); |
| 414 | |
| 415 | RF_GETTIME(t); |
| 416 | RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff); |
| 417 | #if 0 |
| 418 | printf("Copyback time was %d.%06d seconds\n" , |
| 419 | (int) diff.tv_sec, (int) diff.tv_usec); |
| 420 | #endif |
| 421 | } else |
| 422 | printf("raid%d: Copyback failure. Status: %d\n" , |
| 423 | raidPtr->raidid, status); |
| 424 | |
| 425 | RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU)); |
| 426 | rf_FreeMCPair(desc->mcpair); |
| 427 | RF_Free(desc, sizeof(*desc)); |
| 428 | |
| 429 | rf_copyback_in_progress = 0; |
| 430 | rf_ResumeNewRequests(raidPtr); |
| 431 | } |
| 432 | |