| 1 | /* $NetBSD: rf_reconmap.c,v 1.34 2012/02/20 22:42:05 oster Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 1995 Carnegie-Mellon University. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * Author: Mark Holland |
| 7 | * |
| 8 | * Permission to use, copy, modify and distribute this software and |
| 9 | * its documentation is hereby granted, provided that both the copyright |
| 10 | * notice and this permission notice appear in all copies of the |
| 11 | * software, derivative works or modified versions, and any portions |
| 12 | * thereof, and that both notices appear in supporting documentation. |
| 13 | * |
| 14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 17 | * |
| 18 | * Carnegie Mellon requests users of this software to return to |
| 19 | * |
| 20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 21 | * School of Computer Science |
| 22 | * Carnegie Mellon University |
| 23 | * Pittsburgh PA 15213-3890 |
| 24 | * |
| 25 | * any improvements or extensions that they make and grant Carnegie the |
| 26 | * rights to redistribute these changes. |
| 27 | */ |
| 28 | |
| 29 | /************************************************************************* |
| 30 | * rf_reconmap.c |
| 31 | * |
| 32 | * code to maintain a map of what sectors have/have not been reconstructed |
| 33 | * |
| 34 | *************************************************************************/ |
| 35 | |
| 36 | #include <sys/cdefs.h> |
| 37 | __KERNEL_RCSID(0, "$NetBSD: rf_reconmap.c,v 1.34 2012/02/20 22:42:05 oster Exp $" ); |
| 38 | |
| 39 | #include "rf_raid.h" |
| 40 | #include <sys/time.h> |
| 41 | #include "rf_general.h" |
| 42 | #include "rf_utils.h" |
| 43 | |
| 44 | /* special pointer values indicating that a reconstruction unit |
| 45 | * has been either totally reconstructed or not at all. Both |
| 46 | * are illegal pointer values, so you have to be careful not to |
| 47 | * dereference through them. RU_NOTHING must be zero, since |
| 48 | * MakeReconMap uses memset to initialize the structure. These are used |
| 49 | * only at the head of the list. |
| 50 | */ |
| 51 | #define RU_ALL ((RF_ReconMapListElem_t *) -1) |
| 52 | #define RU_NOTHING ((RF_ReconMapListElem_t *) 0) |
| 53 | |
| 54 | /* For most reconstructs we need at most 3 RF_ReconMapListElem_t's. |
| 55 | * Bounding the number we need is quite difficult, as it depends on how |
| 56 | * badly the sectors to be reconstructed get divided up. In the current |
 * code, the reconstructed sectors appear aligned on stripe boundaries,
| 58 | * and are always presented in stripe width units, so we're probably |
| 59 | * allocating quite a bit more than we'll ever need. |
| 60 | */ |
| 61 | #define RF_NUM_RECON_POOL_ELEM 100 |
| 62 | |
| 63 | static void |
| 64 | compact_stat_entry(RF_Raid_t *, RF_ReconMap_t *, int, int); |
| 65 | static void crunch_list(RF_ReconMap_t *, RF_ReconMapListElem_t *); |
| 66 | static RF_ReconMapListElem_t * |
| 67 | MakeReconMapListElem(RF_ReconMap_t *, RF_SectorNum_t, RF_SectorNum_t, |
| 68 | RF_ReconMapListElem_t *); |
| 69 | static void |
| 70 | FreeReconMapListElem(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t * p); |
| 71 | |
| 72 | /*--------------------------------------------------------------------------- |
| 73 | * |
| 74 | * Creates and initializes new Reconstruction map |
| 75 | * |
| 76 | * ru_sectors - size of reconstruction unit in sectors |
| 77 | * disk_sectors - size of disk in sectors |
| 78 | * spareUnitsPerDisk - zero unless distributed sparing |
| 79 | *-------------------------------------------------------------------------*/ |
| 80 | |
| 81 | RF_ReconMap_t * |
| 82 | rf_MakeReconMap(RF_Raid_t *raidPtr, RF_SectorCount_t ru_sectors, |
| 83 | RF_SectorCount_t disk_sectors, |
| 84 | RF_ReconUnitCount_t spareUnitsPerDisk) |
| 85 | { |
| 86 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
| 87 | RF_ReconUnitCount_t num_rus = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerRU; |
| 88 | RF_ReconMap_t *p; |
| 89 | |
| 90 | RF_Malloc(p, sizeof(RF_ReconMap_t), (RF_ReconMap_t *)); |
| 91 | p->sectorsPerReconUnit = ru_sectors; |
| 92 | p->sectorsInDisk = disk_sectors; |
| 93 | |
| 94 | p->totalRUs = num_rus; |
| 95 | p->spareRUs = spareUnitsPerDisk; |
| 96 | p->unitsLeft = num_rus - spareUnitsPerDisk; |
| 97 | p->low_ru = 0; |
| 98 | p->status_size = RF_RECONMAP_SIZE; |
| 99 | p->high_ru = p->status_size - 1; |
| 100 | p->head = 0; |
| 101 | |
| 102 | RF_Malloc(p->status, p->status_size * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **)); |
| 103 | RF_ASSERT(p->status != NULL); |
| 104 | |
| 105 | (void) memset((char *) p->status, 0, |
| 106 | p->status_size * sizeof(RF_ReconMapListElem_t *)); |
| 107 | |
| 108 | pool_init(&p->elem_pool, sizeof(RF_ReconMapListElem_t), 0, |
| 109 | 0, 0, "raidreconpl" , NULL, IPL_BIO); |
| 110 | pool_prime(&p->elem_pool, RF_NUM_RECON_POOL_ELEM); |
| 111 | |
| 112 | rf_init_mutex2(p->mutex, IPL_VM); |
| 113 | rf_init_cond2(p->cv, "reconupdate" ); |
| 114 | |
| 115 | return (p); |
| 116 | } |
| 117 | |
| 118 | |
| 119 | /*--------------------------------------------------------------------------- |
| 120 | * |
| 121 | * marks a new set of sectors as reconstructed. All the possible |
| 122 | * mergings get complicated. To simplify matters, the approach I take |
| 123 | * is to just dump something into the list, and then clean it up |
| 124 | * (i.e. merge elements and eliminate redundant ones) in a second pass |
| 125 | * over the list (compact_stat_entry()). Not 100% efficient, since a |
| 126 | * structure can be allocated and then immediately freed, but it keeps |
| 127 | * this code from becoming (more of) a nightmare of special cases. |
| 128 | * The only thing that compact_stat_entry() assumes is that the list |
| 129 | * is sorted by startSector, and so this is the only condition I |
| 130 | * maintain here. (MCH) |
| 131 | * |
| 132 | * This code now uses a pool instead of the previous malloc/free |
| 133 | * stuff. |
| 134 | *-------------------------------------------------------------------------*/ |
| 135 | |
| 136 | void |
| 137 | rf_ReconMapUpdate(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, |
| 138 | RF_SectorNum_t startSector, RF_SectorNum_t stopSector) |
| 139 | { |
| 140 | RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; |
| 141 | RF_SectorNum_t i, first_in_RU, last_in_RU, ru; |
| 142 | RF_ReconMapListElem_t *p, *pt; |
| 143 | |
| 144 | rf_lock_mutex2(mapPtr->mutex); |
| 145 | while(mapPtr->lock) { |
| 146 | rf_wait_cond2(mapPtr->cv, mapPtr->mutex); |
| 147 | } |
| 148 | mapPtr->lock = 1; |
| 149 | rf_unlock_mutex2(mapPtr->mutex); |
| 150 | RF_ASSERT(startSector >= 0 && stopSector < mapPtr->sectorsInDisk && |
| 151 | stopSector >= startSector); |
| 152 | |
| 153 | while (startSector <= stopSector) { |
| 154 | i = startSector / mapPtr->sectorsPerReconUnit; |
| 155 | first_in_RU = i * sectorsPerReconUnit; |
| 156 | last_in_RU = first_in_RU + sectorsPerReconUnit - 1; |
| 157 | |
| 158 | /* do we need to move the queue? */ |
| 159 | while (i > mapPtr->high_ru) { |
| 160 | #if 0 |
| 161 | #ifdef DIAGNOSTIC |
| 162 | /* XXX: The check below is not valid for |
| 163 | * RAID5_RS. It is valid for RAID 1 and RAID 5. |
| 164 | * The issue is that we can easily have |
| 165 | * RU_NOTHING entries here too, and those are |
| 166 | * quite correct. |
| 167 | */ |
| 168 | if (mapPtr->status[mapPtr->head]!=RU_ALL) { |
| 169 | printf("\nraid%d: reconmap incorrect -- working on i %" PRIu64 "\n" , |
| 170 | raidPtr->raidid, i); |
| 171 | printf("raid%d: ru %" PRIu64 " not completed!!!\n" , |
| 172 | raidPtr->raidid, mapPtr->head); |
| 173 | |
| 174 | printf("raid%d: low: %" PRIu64 " high: %" PRIu64 "\n" , |
| 175 | raidPtr->raidid, mapPtr->low_ru, mapPtr->high_ru); |
| 176 | |
| 177 | panic("reconmap incorrect" ); |
| 178 | } |
| 179 | #endif |
| 180 | #endif |
| 181 | mapPtr->low_ru++; |
| 182 | mapPtr->high_ru++; |
| 183 | /* initialize "highest" RU status entry, which |
| 184 | will take over the current head postion */ |
| 185 | mapPtr->status[mapPtr->head]=RU_NOTHING; |
| 186 | |
| 187 | /* move head too */ |
| 188 | mapPtr->head++; |
| 189 | if (mapPtr->head >= mapPtr->status_size) |
| 190 | mapPtr->head = 0; |
| 191 | |
| 192 | } |
| 193 | |
| 194 | ru = i - mapPtr->low_ru + mapPtr->head; |
| 195 | if (ru >= mapPtr->status_size) |
| 196 | ru = ru - mapPtr->status_size; |
| 197 | |
| 198 | if ((ru < 0) || (ru >= mapPtr->status_size)) { |
| 199 | printf("raid%d: ru is bogus %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n" , |
| 200 | raidPtr->raidid, i, ru, mapPtr->head, mapPtr->low_ru, mapPtr->high_ru); |
| 201 | panic("bogus ru in reconmap" ); |
| 202 | } |
| 203 | |
| 204 | p = mapPtr->status[ru]; |
| 205 | if (p != RU_ALL) { |
| 206 | if (p == RU_NOTHING || p->startSector > startSector) { |
| 207 | /* insert at front of list */ |
| 208 | |
| 209 | mapPtr->status[ru] = MakeReconMapListElem(mapPtr,startSector, RF_MIN(stopSector, last_in_RU), (p == RU_NOTHING) ? NULL : p); |
| 210 | |
| 211 | } else {/* general case */ |
| 212 | do { /* search for place to insert */ |
| 213 | pt = p; |
| 214 | p = p->next; |
| 215 | } while (p && (p->startSector < startSector)); |
| 216 | pt->next = MakeReconMapListElem(mapPtr,startSector, RF_MIN(stopSector, last_in_RU), p); |
| 217 | |
| 218 | } |
| 219 | compact_stat_entry(raidPtr, mapPtr, i, ru); |
| 220 | } |
| 221 | startSector = RF_MIN(stopSector, last_in_RU) + 1; |
| 222 | } |
| 223 | rf_lock_mutex2(mapPtr->mutex); |
| 224 | mapPtr->lock = 0; |
| 225 | rf_broadcast_cond2(mapPtr->cv); |
| 226 | rf_unlock_mutex2(mapPtr->mutex); |
| 227 | } |
| 228 | |
| 229 | |
| 230 | |
| 231 | /*--------------------------------------------------------------------------- |
| 232 | * |
| 233 | * performs whatever list compactions can be done, and frees any space |
| 234 | * that is no longer necessary. Assumes only that the list is sorted |
| 235 | * by startSector. crunch_list() compacts a single list as much as |
| 236 | * possible, and the second block of code deletes the entire list if |
| 237 | * possible. crunch_list() is also called from |
| 238 | * MakeReconMapAccessList(). |
| 239 | * |
| 240 | * When a recon unit is detected to be fully reconstructed, we set the |
| 241 | * corresponding bit in the parity stripe map so that the head follow |
| 242 | * code will not select this parity stripe again. This is redundant |
| 243 | * (but harmless) when compact_stat_entry is called from the |
| 244 | * reconstruction code, but necessary when called from the user-write |
| 245 | * code. |
| 246 | * |
| 247 | *-------------------------------------------------------------------------*/ |
| 248 | |
| 249 | static void |
| 250 | compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, int i, int j) |
| 251 | { |
| 252 | RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; |
| 253 | RF_ReconMapListElem_t *p = mapPtr->status[j]; |
| 254 | |
| 255 | crunch_list(mapPtr, p); |
| 256 | |
| 257 | if ((p->startSector == i * sectorsPerReconUnit) && |
| 258 | (p->stopSector == i * sectorsPerReconUnit + |
| 259 | sectorsPerReconUnit - 1)) { |
| 260 | mapPtr->status[j] = RU_ALL; |
| 261 | mapPtr->unitsLeft--; |
| 262 | FreeReconMapListElem(mapPtr, p); |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | |
| 267 | static void |
| 268 | crunch_list(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *listPtr) |
| 269 | { |
| 270 | RF_ReconMapListElem_t *pt, *p = listPtr; |
| 271 | |
| 272 | if (!p) |
| 273 | return; |
| 274 | pt = p; |
| 275 | p = p->next; |
| 276 | while (p) { |
| 277 | if (pt->stopSector >= p->startSector - 1) { |
| 278 | pt->stopSector = RF_MAX(pt->stopSector, p->stopSector); |
| 279 | pt->next = p->next; |
| 280 | FreeReconMapListElem(mapPtr, p); |
| 281 | p = pt->next; |
| 282 | } else { |
| 283 | pt = p; |
| 284 | p = p->next; |
| 285 | } |
| 286 | } |
| 287 | } |
| 288 | /*--------------------------------------------------------------------------- |
| 289 | * |
| 290 | * Allocate and fill a new list element |
| 291 | * |
| 292 | *-------------------------------------------------------------------------*/ |
| 293 | |
| 294 | static RF_ReconMapListElem_t * |
| 295 | MakeReconMapListElem(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector, |
| 296 | RF_SectorNum_t stopSector, RF_ReconMapListElem_t *next) |
| 297 | { |
| 298 | RF_ReconMapListElem_t *p; |
| 299 | |
| 300 | p = pool_get(&mapPtr->elem_pool, PR_WAITOK); |
| 301 | p->startSector = startSector; |
| 302 | p->stopSector = stopSector; |
| 303 | p->next = next; |
| 304 | return (p); |
| 305 | } |
| 306 | /*--------------------------------------------------------------------------- |
| 307 | * |
| 308 | * Free a list element |
| 309 | * |
| 310 | *-------------------------------------------------------------------------*/ |
| 311 | |
/* Return a list element to the map's pool (counterpart of
 * MakeReconMapListElem). */
static void
FreeReconMapListElem(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *p)
{
	pool_put(&mapPtr->elem_pool, p);
}
| 317 | /*--------------------------------------------------------------------------- |
| 318 | * |
| 319 | * Free an entire status structure. Inefficient, but can be called at |
| 320 | * any time. |
| 321 | * |
| 322 | *-------------------------------------------------------------------------*/ |
| 323 | void |
| 324 | rf_FreeReconMap(RF_ReconMap_t *mapPtr) |
| 325 | { |
| 326 | RF_ReconMapListElem_t *p, *q; |
| 327 | RF_ReconUnitCount_t numRUs; |
| 328 | RF_ReconUnitNum_t i; |
| 329 | |
| 330 | numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit; |
| 331 | if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit) |
| 332 | numRUs++; |
| 333 | |
| 334 | for (i = 0; i < mapPtr->status_size; i++) { |
| 335 | p = mapPtr->status[i]; |
| 336 | while (p != RU_NOTHING && p != RU_ALL) { |
| 337 | q = p; |
| 338 | p = p->next; |
| 339 | RF_Free(q, sizeof(*q)); |
| 340 | } |
| 341 | } |
| 342 | |
| 343 | rf_destroy_mutex2(mapPtr->mutex); |
| 344 | rf_destroy_cond2(mapPtr->cv); |
| 345 | |
| 346 | pool_destroy(&mapPtr->elem_pool); |
| 347 | RF_Free(mapPtr->status, mapPtr->status_size * |
| 348 | sizeof(RF_ReconMapListElem_t *)); |
| 349 | RF_Free(mapPtr, sizeof(RF_ReconMap_t)); |
| 350 | } |
| 351 | /*--------------------------------------------------------------------------- |
| 352 | * |
| 353 | * returns nonzero if the indicated RU has been reconstructed already |
| 354 | * |
| 355 | *-------------------------------------------------------------------------*/ |
| 356 | |
| 357 | int |
| 358 | rf_CheckRUReconstructed(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector) |
| 359 | { |
| 360 | RF_ReconUnitNum_t i; |
| 361 | int rv; |
| 362 | |
| 363 | i = startSector / mapPtr->sectorsPerReconUnit; |
| 364 | |
| 365 | if (i < mapPtr->low_ru) |
| 366 | rv = 1; |
| 367 | else if (i > mapPtr->high_ru) |
| 368 | rv = 0; |
| 369 | else { |
| 370 | i = i - mapPtr->low_ru + mapPtr->head; |
| 371 | if (i >= mapPtr->status_size) |
| 372 | i = i - mapPtr->status_size; |
| 373 | if (mapPtr->status[i] == RU_ALL) |
| 374 | rv = 1; |
| 375 | else |
| 376 | rv = 0; |
| 377 | } |
| 378 | |
| 379 | return rv; |
| 380 | } |
| 381 | |
| 382 | RF_ReconUnitCount_t |
| 383 | rf_UnitsLeftToReconstruct(RF_ReconMap_t *mapPtr) |
| 384 | { |
| 385 | RF_ASSERT(mapPtr != NULL); |
| 386 | return (mapPtr->unitsLeft); |
| 387 | } |
| 388 | |
| 389 | #if RF_DEBUG_RECON |
| 390 | void |
| 391 | rf_PrintReconSchedule(RF_ReconMap_t *mapPtr, struct timeval *starttime) |
| 392 | { |
| 393 | static int old_pctg = -1; |
| 394 | struct timeval tv, diff; |
| 395 | int new_pctg; |
| 396 | |
| 397 | new_pctg = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * |
| 398 | 100 / mapPtr->totalRUs); |
| 399 | if (new_pctg != old_pctg) { |
| 400 | RF_GETTIME(tv); |
| 401 | RF_TIMEVAL_DIFF(starttime, &tv, &diff); |
| 402 | printf("%d %d.%06d\n" , (int) new_pctg, (int) diff.tv_sec, |
| 403 | (int) diff.tv_usec); |
| 404 | old_pctg = new_pctg; |
| 405 | } |
| 406 | } |
| 407 | #endif |
| 408 | |
| 409 | |