| 1 | /* $NetBSD: rf_decluster.c,v 1.24 2014/03/23 09:30:59 christos Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 1995 Carnegie-Mellon University. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * Author: Mark Holland |
| 7 | * |
| 8 | * Permission to use, copy, modify and distribute this software and |
| 9 | * its documentation is hereby granted, provided that both the copyright |
| 10 | * notice and this permission notice appear in all copies of the |
| 11 | * software, derivative works or modified versions, and any portions |
| 12 | * thereof, and that both notices appear in supporting documentation. |
| 13 | * |
| 14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 17 | * |
| 18 | * Carnegie Mellon requests users of this software to return to |
| 19 | * |
| 20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 21 | * School of Computer Science |
| 22 | * Carnegie Mellon University |
| 23 | * Pittsburgh PA 15213-3890 |
| 24 | * |
| 25 | * any improvements or extensions that they make and grant Carnegie the |
| 26 | * rights to redistribute these changes. |
| 27 | */ |
| 28 | |
| 29 | /*---------------------------------------------------------------------- |
| 30 | * |
| 31 | * rf_decluster.c -- code related to the declustered layout |
| 32 | * |
| 33 | * Created 10-21-92 (MCH) |
| 34 | * |
| 35 | * Nov 93: adding support for distributed sparing. This code is a little |
| 36 | * complex: the basic layout used is as follows: |
| 37 | * let F = (v-1)/GCD(r,v-1). The spare space for each set of |
| 38 | * F consecutive fulltables is grouped together and placed after |
| 39 | * that set of tables. |
| 40 | * +------------------------------+ |
| 41 | * | F fulltables | |
| 42 | * | Spare Space | |
| 43 | * | F fulltables | |
| 44 | * | Spare Space | |
| 45 | * | ... | |
| 46 | * +------------------------------+ |
| 47 | * |
| 48 | *--------------------------------------------------------------------*/ |
| 49 | |
| 50 | #include <sys/cdefs.h> |
| 51 | __KERNEL_RCSID(0, "$NetBSD: rf_decluster.c,v 1.24 2014/03/23 09:30:59 christos Exp $" ); |
| 52 | |
| 53 | #include <dev/raidframe/raidframevar.h> |
| 54 | |
| 55 | #include "rf_archs.h" |
| 56 | #include "rf_raid.h" |
| 57 | #include "rf_decluster.h" |
| 58 | #include "rf_debugMem.h" |
| 59 | #include "rf_utils.h" |
| 60 | #include "rf_alloclist.h" |
| 61 | #include "rf_general.h" |
| 62 | #include "rf_kintf.h" |
| 63 | #include "rf_shutdown.h" |
| 64 | #include "rf_copyback.h" |
| 65 | |
| 66 | #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) |
| 67 | |
| 68 | /* configuration code */ |
| 69 | |
/*
 * rf_ConfigureDeclustered: configure the declustered parity layout.
 *
 * Parses the block-design parameters (b, v, k, r, lambda) and the
 * layout table out of the layout-specific config buffer, computes the
 * derived table/fulltable/spare-region geometry, and builds the three
 * lookup tables (LayoutTable, OffsetTable, BlockTable) used by the
 * mapping routines.
 *
 * Returns 0 on success, ENOMEM on allocation failure, EINVAL when the
 * block design does not match the array (wrong column count, or the
 * design is too large for the disks).
 */
int
rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
			RF_Config_t *cfgPtr)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int b, v, k, r, lambda;	/* block design params */
	int i, j;
	RF_RowCol_t *first_avail_slot;
	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
	RF_DeclusteredConfigInfo_t *info;
	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk,
	    extraPUsPerDisk;
	RF_StripeCount_t totSparePUsPerDisk;
	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
	RF_SectorCount_t SpareSpaceInSUs;
	char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
	RF_StripeNum_t l, SUID;

	SUID = l = 0;
	numCompleteSpareRegionsPerDisk = 0;

	/* 1. create layout specific structure */
	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;
	info->SpareTable = NULL;

	/* 2. extract parameters from the config structure.  The buffer
	 * layout is: sparemap filename, then six ints (b, v, k, r,
	 * lambda, noRotate), then the b x k layout table bytes (read
	 * further below).  NOTE(review): the filename region is skipped
	 * even in the non-distributed-sparing case -- presumably the
	 * userland config tool always writes it; confirm against the
	 * config writer. */
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		(void)memcpy(info->sparemap_fname, cfgBuf, RF_SPAREMAP_NAME_LEN);
	}
	cfgBuf += RF_SPAREMAP_NAME_LEN;

	b = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	v = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	k = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	r = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	lambda = *((int *) cfgBuf);
	cfgBuf += sizeof(int);
	raidPtr->noRotate = *((int *) cfgBuf);
	cfgBuf += sizeof(int);

	/* the sparemaps are generated assuming that parity is rotated, so we
	 * issue a warning if both distributed sparing and no-rotate are on at
	 * the same time */
	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
		RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n" );
	}
	if (raidPtr->numCol != v) {
		RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n" , v, raidPtr->numCol);
		return (EINVAL);
	}
	/* 3. set up the values used in the mapping code */
	info->BlocksPerTable = b;
	info->Lambda = lambda;
	info->NumParityReps = info->groupSize = k;
	info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;	/* b blks, k-1 SUs each */
	info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
	info->PUsPerBlock = k - 1;
	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
	info->TableDepthInPUs = (b * k) / v;
	info->FullTableDepthInPUs = info->TableDepthInPUs * k;	/* k repetitions */

	/* used only in distributed sparing case */
	info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);	/* (v-1)/gcd fulltables */
	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU;

	/* check to make sure the block design is sufficiently small */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n" ,
			    (int) info->FullTableDepthInPUs,
			    (int) info->SpareSpaceDepthPerRegionInSUs,
			    (int) layoutPtr->stripeUnitsPerDisk);
			return (EINVAL);
		}
	} else {
		if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n" ,
			    (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \
			    (int) layoutPtr->stripeUnitsPerDisk);
			return (EINVAL);
		}
	}


	/* compute the size of each disk, and the number of tables in the last
	 * fulltable (which need not be complete) */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {

		PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
		/* a spare region = data/parity PUs plus 1/(v-1) extra for
		 * the spare space itself */
		spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
		    (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1));
		info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;

		numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

		/* assume conservatively that we need the full amount of spare
		 * space in one region in order to provide spares for the
		 * partial spare region at the end of the array. We set "i"
		 * to the number of tables in the partial spare region. This
		 * may actually include some fulltables. */
		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		if (extraPUsPerDisk <= 0)
			i = 0;
		else
			i = extraPUsPerDisk / info->TableDepthInPUs;

		complete_FT_count = (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k);
		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk = i % k;

		/* note that in the last spare region, the spare space is
		 * complete even though data/parity space is not */
		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		info->TotSparePUsPerDisk = totSparePUsPerDisk;

		/* shrink stripeUnitsPerDisk to the space actually used by
		 * data, parity, and spare units */
		layoutPtr->stripeUnitsPerDisk =
		    ((complete_FT_count) * info->FullTableDepthInPUs +	/* data & parity space */
		    info->ExtraTablesPerDisk * info->TableDepthInPUs +
		    totSparePUsPerDisk	/* spare space */
		    ) * layoutPtr->SUsPerPU;
		layoutPtr->dataStripeUnitsPerDisk =
		    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
		    * layoutPtr->SUsPerPU * (k - 1) / k;

	} else {
		/* non-dist spare case: force each disk to contain an
		 * integral number of tables */
		layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
		layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);

		/* compute the number of tables in the last fulltable, which
		 * need not be complete */
		complete_FT_count =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs);

		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
	}

	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

	/* find the disk offset of the stripe unit where the last fulltable
	 * starts */
	numCompleteFullTablesPerDisk = complete_FT_count;
	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
		    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	}
	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

	/* 4. create and initialize the lookup tables */
	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->LayoutTable == NULL)
		return (ENOMEM);
	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->OffsetTable == NULL)
		return (ENOMEM);
	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
	if (info->BlockTable == NULL)
		return (ENOMEM);

	first_avail_slot = rf_make_1d_array(v, NULL);
	if (first_avail_slot == NULL)
		return (ENOMEM);

	/* the remainder of the config buffer is the b x k layout table,
	 * one byte per entry: LayoutTable[i][j] is the disk holding unit
	 * j of block i */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++)
			info->LayoutTable[i][j] = *cfgBuf++;

	/* initialize offset table: OffsetTable[i][j] is the PU offset on
	 * disk LayoutTable[i][j] where that unit lives (slots are handed
	 * out per-disk in first-come order) */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++) {
			info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]];
			first_avail_slot[info->LayoutTable[i][j]]++;
		}

	/* initialize block table: the inverse map, from (SU offset on
	 * disk, disk) back to the stripe unit ID stored there */
	for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
		for (i = 0; i < b; i++) {
			for (j = 0; j < k; j++) {
				info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l]
				    [info->LayoutTable[i][j]] = SUID;
			}
			SUID++;
		}
	}

	rf_free_1d_array(first_avail_slot, v);

	/* 5. set up the remaining redundant-but-useful parameters */

	raidPtr->totalSectors = (k * complete_FT_count + info->ExtraTablesPerDisk) *
	    info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1);

	/* strange evaluation order below to try and minimize overflow
	 * problems */

	layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numDataCol = k - 1;
	layoutPtr->numParityCol = 1;

	return (0);
}
| 289 | /* declustering with distributed sparing */ |
| 290 | static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t); |
| 291 | static void |
| 292 | rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg) |
| 293 | { |
| 294 | RF_DeclusteredConfigInfo_t *info; |
| 295 | RF_Raid_t *raidPtr; |
| 296 | |
| 297 | raidPtr = (RF_Raid_t *) arg; |
| 298 | info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
| 299 | if (info->SpareTable) |
| 300 | rf_FreeSpareTable(raidPtr); |
| 301 | } |
| 302 | |
| 303 | int |
| 304 | rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, |
| 305 | RF_Config_t *cfgPtr) |
| 306 | { |
| 307 | int rc; |
| 308 | |
| 309 | rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr); |
| 310 | if (rc) |
| 311 | return (rc); |
| 312 | rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr); |
| 313 | |
| 314 | return (0); |
| 315 | } |
| 316 | |
/*
 * Map a RAID address (in sectors) to the column and physical disk
 * sector holding that data unit.  When "remap" is nonzero, the access
 * is retargeted at the distributed spare space instead of the normal
 * location (only valid when the target disk is reconstructing, spared,
 * or under copyback -- see the RF_ASSERT below).
 */
void
rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
			RF_RowCol_t *col,
			RF_SectorNum_t *diskSector, int remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	/* adjust for the (possibly partial) last fulltable; may rebase
	 * SUID and set base_suid nonzero */
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	FullTableID = SUID / sus_per_fulltable;	/* fulltable ID within array
						 * (across rows) */

	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		/* account for the spare space of all preceding regions */
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* drill down: fulltable -> table -> block -> offset in block */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;
	/* parity sits at position RepIndex within each block; when parity
	 * rotation is enabled, data offsets at or past it shift over by 1 */
	RepIndex = info->PUsPerBlock - TableID;
	if (!raidPtr->noRotate)
		BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
	*col = info->LayoutTable[BlockID][BlockOffset];

	/* remap to distributed spare space if indicated */
	if (remap) {
		RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared ||
		    (rf_copyback_in_progress && raidPtr->Disks[*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
	} else {
		/* accumulate the stripe-unit offset on the target disk */
		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;	/* offs to strt of FT */
		outSU += SpareSpace;	/* skip rsvd spare space */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;	/* offs to strt of tble */
		outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
	}
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	/* offs to the SU within
										 * a PU */

	/* convert SUs to sectors, and, if not aligned to SU boundary, add in
	 * offset to sector. */
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
}
| 373 | |
| 374 | |
/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
/*
 * Map a RAID address to the column and physical disk sector of the
 * PARITY unit covering it.  Mirrors rf_MapSectorDeclustered, except
 * that the unit selected within the block is the parity position
 * RepIndex rather than the data offset.
 */
void
rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
			RF_RowCol_t *col,
			RF_SectorNum_t *diskSector, int remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	/* adjust for the (possibly partial) last fulltable */
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	/* compute row & (possibly) spare space exactly as before */
	FullTableID = SUID / sus_per_fulltable;

	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* compute BlockID and RepIndex exactly as before */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	/* TableOffset = FullTableOffset % info->SUsPerTable; */
	/* BlockID = (TableOffset / info->PUsPerBlock) %
	 * info->BlocksPerTable; */
	BlockID = TableOffset / info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;

	/* the parity block is in the position indicated by RepIndex;
	 * with no rotation it is always the last unit of the block */
	RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
	*col = info->LayoutTable[BlockID][RepIndex];

	if (remap) {
		RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared ||
		    (rf_copyback_in_progress && raidPtr->Disks[*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
	} else {

		/* compute sector as before, except use RepIndex instead of
		 * BlockOffset */
		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;
		outSU += SpareSpace;	/* skip rsvd spare space */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
		outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
	}

	/* offset of the SU within its PU */
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
}
| 433 | /* returns an array of ints identifying the disks that comprise the stripe containing the indicated address. |
| 434 | * the caller must _never_ attempt to modify this array. |
| 435 | */ |
| 436 | void |
| 437 | rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, |
| 438 | RF_RowCol_t **diskids) |
| 439 | { |
| 440 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
| 441 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
| 442 | RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; |
| 443 | RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; |
| 444 | RF_StripeNum_t base_suid = 0; |
| 445 | RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr); |
| 446 | RF_StripeNum_t stripeID; |
| 447 | int tableOffset; |
| 448 | |
| 449 | rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); |
| 450 | stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset |
| 451 | * into array */ |
| 452 | tableOffset = (stripeID % info->BlocksPerTable); /* find offset into |
| 453 | * block design table */ |
| 454 | *diskids = info->LayoutTable[tableOffset]; |
| 455 | } |
| 456 | /* This returns the default head-separation limit, which is measured |
| 457 | * in "required units for reconstruction". Each time a disk fetches |
| 458 | * a unit, it bumps a counter. The head-sep code prohibits any disk |
| 459 | * from getting more than headSepLimit counter values ahead of any |
| 460 | * other. |
| 461 | * |
| 462 | * We assume here that the number of floating recon buffers is already |
| 463 | * set. There are r stripes to be reconstructed in each table, and so |
| 464 | * if we have a total of B buffers, we can have at most B/r tables |
| 465 | * under recon at any one time. In each table, lambda units are required |
| 466 | * from each disk, so given B buffers, the head sep limit has to be |
| 467 | * (lambda*B)/r units. We subtract one to avoid weird boundary cases. |
| 468 | * |
| 469 | * for example, suppose were given 50 buffers, r=19, and lambda=4 as in |
| 470 | * the 20.5 design. There are 19 stripes/table to be reconstructed, so |
| 471 | * we can have 50/19 tables concurrently under reconstruction, which means |
| 472 | * we can allow the fastest disk to get 50/19 tables ahead of the slower |
| 473 | * disk. There are lambda "required units" for each disk, so the fastest |
| 474 | * disk can get 4*50/19 = 10 counter values ahead of the slowest. |
| 475 | * |
| 476 | * If numBufsToAccumulate is not 1, we need to limit the head sep further |
| 477 | * because multiple bufs will be required for each stripe under recon. |
| 478 | */ |
| 479 | RF_HeadSepLimit_t |
| 480 | rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr) |
| 481 | { |
| 482 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
| 483 | |
| 484 | return (info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate); |
| 485 | } |
| 486 | /* returns the default number of recon buffers to use. The value |
| 487 | * is somewhat arbitrary...it's intended to be large enough to allow |
| 488 | * for a reasonably large head-sep limit, but small enough that you |
| 489 | * don't use up all your system memory with buffers. |
| 490 | */ |
| 491 | int |
| 492 | rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr) |
| 493 | { |
| 494 | return (100 * rf_numBufsToAccumulate); |
| 495 | } |
| 496 | /* sectors in the last fulltable of the array need to be handled |
| 497 | * specially since this fulltable can be incomplete. this function |
| 498 | * changes the values of certain params to handle this. |
| 499 | * |
| 500 | * the idea here is that MapSector et. al. figure out which disk the |
| 501 | * addressed unit lives on by computing the modulos of the unit number |
| 502 | * with the number of units per fulltable, table, etc. In the last |
| 503 | * fulltable, there are fewer units per fulltable, so we need to adjust |
| 504 | * the number of user data units per fulltable to reflect this. |
| 505 | * |
| 506 | * so, we (1) convert the fulltable size and depth parameters to |
| 507 | * the size of the partial fulltable at the end, (2) compute the |
| 508 | * disk sector offset where this fulltable starts, and (3) convert |
| 509 | * the users stripe unit number from an offset into the array to |
| 510 | * an offset into the last fulltable. |
| 511 | */ |
| 512 | void |
| 513 | rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, |
| 514 | RF_StripeNum_t *SUID, |
| 515 | RF_StripeCount_t *sus_per_fulltable, |
| 516 | RF_StripeCount_t *fulltable_depth, |
| 517 | RF_StripeNum_t *base_suid) |
| 518 | { |
| 519 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
| 520 | |
| 521 | if (*SUID >= info->FullTableLimitSUID) { |
| 522 | /* new full table size is size of last full table on disk */ |
| 523 | *sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable; |
| 524 | |
| 525 | /* new full table depth is corresponding depth */ |
| 526 | *fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; |
| 527 | |
| 528 | /* set up the new base offset */ |
| 529 | *base_suid = info->DiskOffsetOfLastFullTableInSUs; |
| 530 | |
| 531 | /* convert users array address to an offset into the last |
| 532 | * fulltable */ |
| 533 | *SUID -= info->FullTableLimitSUID; |
| 534 | } |
| 535 | } |
| 536 | /* |
| 537 | * map a stripe ID to a parity stripe ID. |
| 538 | * See comment above RaidAddressToParityStripeID in layout.c. |
| 539 | */ |
| 540 | void |
| 541 | rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, |
| 542 | RF_StripeNum_t stripeID, |
| 543 | RF_StripeNum_t *psID, |
| 544 | RF_ReconUnitNum_t *which_ru) |
| 545 | { |
| 546 | RF_DeclusteredConfigInfo_t *info; |
| 547 | |
| 548 | info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
| 549 | |
| 550 | *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) |
| 551 | * info->BlocksPerTable + (stripeID % info->BlocksPerTable); |
| 552 | *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) |
| 553 | / info->BlocksPerTable; |
| 554 | RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU); |
| 555 | } |
| 556 | /* |
| 557 | * Called from MapSector and MapParity to retarget an access at the spare unit. |
| 558 | * Modifies the "col" and "outSU" parameters only. |
| 559 | */ |
| 560 | void |
| 561 | rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr, |
| 562 | RF_DeclusteredConfigInfo_t *info, |
| 563 | RF_StripeNum_t FullTableID, |
| 564 | RF_StripeNum_t TableID, |
| 565 | RF_SectorNum_t BlockID, |
| 566 | RF_StripeNum_t base_suid, |
| 567 | RF_StripeNum_t SpareRegion, |
| 568 | RF_RowCol_t *outCol, |
| 569 | RF_StripeNum_t *outSU) |
| 570 | { |
| 571 | RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, |
| 572 | which_ft; |
| 573 | |
| 574 | /* |
| 575 | * note that FullTableID and hence SpareRegion may have gotten |
| 576 | * tweaked by rf_decluster_adjust_params. We detect this by |
| 577 | * noticing that base_suid is not 0. |
| 578 | */ |
| 579 | if (base_suid == 0) { |
| 580 | ftID = FullTableID; |
| 581 | } else { |
| 582 | /* |
| 583 | * There may be > 1.0 full tables in the last (i.e. partial) |
| 584 | * spare region. find out which of these we're in. |
| 585 | */ |
| 586 | lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs; |
| 587 | which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU); |
| 588 | |
| 589 | /* compute the actual full table ID */ |
| 590 | ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft; |
| 591 | SpareRegion = info->NumCompleteSRs; |
| 592 | } |
| 593 | TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion; |
| 594 | |
| 595 | *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk; |
| 596 | RF_ASSERT(*outCol != -1); |
| 597 | |
| 598 | spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ? |
| 599 | info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU : |
| 600 | (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs; |
| 601 | *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs; |
| 602 | if (*outSU >= layoutPtr->stripeUnitsPerDisk) { |
| 603 | printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n" , (long) *outSU); |
| 604 | } |
| 605 | } |
| 606 | |
| 607 | #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ |
| 608 | |
| 609 | #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) |
/*
 * Build a spare-table request describing this array's geometry and
 * hand it to the userland sparemap daemon, which computes the spare
 * table and installs it (see rf_GetSpareTableFromDaemon).
 *
 * "frow" is accepted but unused here (historic row parameter).
 */
int
rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow,
		     RF_RowCol_t fcol)
{
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_SparetWait_t *req;
	int retcode;

	/* NOTE(review): req is not checked for NULL -- presumably
	 * RF_Malloc sleeps until memory is available in-kernel; confirm
	 * against the RF_Malloc definition.  Ownership of req passes to
	 * the daemon path -- verify rf_GetSpareTableFromDaemon frees it. */
	RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
	req->C = raidPtr->numCol;
	req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
	req->fcol = fcol;
	req->SUsPerPU = raidPtr->Layout.SUsPerPU;
	req->TablesPerSpareRegion = info->TablesPerSpareRegion;
	req->BlocksPerTable = info->BlocksPerTable;
	req->TableDepthInPUs = info->TableDepthInPUs;
	req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;

	retcode = rf_GetSpareTableFromDaemon(req);
	RF_ASSERT(!retcode);	/* XXX -- fix this to recover gracefully --
				 * XXX */
	return (retcode);
}
| 633 | #endif |
| 634 | #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) |
| 635 | /* |
| 636 | * Invoked via ioctl to install a spare table in the kernel. |
| 637 | */ |
| 638 | int |
| 639 | rf_SetSpareTable(RF_Raid_t *raidPtr, void *data) |
| 640 | { |
| 641 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
| 642 | RF_SpareTableEntry_t **ptrs; |
| 643 | int i, retcode; |
| 644 | |
| 645 | /* what we need to copyin is a 2-d array, so first copyin the user |
| 646 | * pointers to the rows in the table */ |
| 647 | RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); |
| 648 | retcode = copyin((void *) data, (void *) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); |
| 649 | |
| 650 | if (retcode) |
| 651 | return (retcode); |
| 652 | |
| 653 | /* now allocate kernel space for the row pointers */ |
| 654 | RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); |
| 655 | |
| 656 | /* now allocate kernel space for each row in the table, and copy it in |
| 657 | * from user space */ |
| 658 | for (i = 0; i < info->TablesPerSpareRegion; i++) { |
| 659 | RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *)); |
| 660 | retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); |
| 661 | if (retcode) { |
| 662 | info->SpareTable = NULL; /* blow off the memory |
| 663 | * we've allocated */ |
| 664 | return (retcode); |
| 665 | } |
| 666 | } |
| 667 | |
| 668 | /* free up the temporary array we used */ |
| 669 | RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); |
| 670 | |
| 671 | return (0); |
| 672 | } |
| 673 | |
| 674 | RF_ReconUnitCount_t |
| 675 | rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr) |
| 676 | { |
| 677 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
| 678 | |
| 679 | return (((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk); |
| 680 | } |
| 681 | #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ |
| 682 | |
| 683 | void |
| 684 | rf_FreeSpareTable(RF_Raid_t *raidPtr) |
| 685 | { |
| 686 | long i; |
| 687 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
| 688 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
| 689 | RF_SpareTableEntry_t **table = info->SpareTable; |
| 690 | |
| 691 | for (i = 0; i < info->TablesPerSpareRegion; i++) { |
| 692 | RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); |
| 693 | } |
| 694 | RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); |
| 695 | info->SpareTable = NULL; |
| 696 | } |
| 697 | |