| 1 | /* $NetBSD: rf_raid5.c,v 1.19 2006/11/16 01:33:23 christos Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 1995 Carnegie-Mellon University. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * Author: Mark Holland |
| 7 | * |
| 8 | * Permission to use, copy, modify and distribute this software and |
| 9 | * its documentation is hereby granted, provided that both the copyright |
| 10 | * notice and this permission notice appear in all copies of the |
| 11 | * software, derivative works or modified versions, and any portions |
| 12 | * thereof, and that both notices appear in supporting documentation. |
| 13 | * |
| 14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 17 | * |
| 18 | * Carnegie Mellon requests users of this software to return to |
| 19 | * |
| 20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 21 | * School of Computer Science |
| 22 | * Carnegie Mellon University |
| 23 | * Pittsburgh PA 15213-3890 |
| 24 | * |
| 25 | * any improvements or extensions that they make and grant Carnegie the |
| 26 | * rights to redistribute these changes. |
| 27 | */ |
| 28 | |
| 29 | /****************************************************************************** |
| 30 | * |
| 31 | * rf_raid5.c -- implements RAID Level 5 |
| 32 | * |
| 33 | *****************************************************************************/ |
| 34 | |
| 35 | #include <sys/cdefs.h> |
| 36 | __KERNEL_RCSID(0, "$NetBSD: rf_raid5.c,v 1.19 2006/11/16 01:33:23 christos Exp $" ); |
| 37 | |
| 38 | #include <dev/raidframe/raidframevar.h> |
| 39 | |
| 40 | #include "rf_raid.h" |
| 41 | #include "rf_raid5.h" |
| 42 | #include "rf_dag.h" |
| 43 | #include "rf_dagffrd.h" |
| 44 | #include "rf_dagffwr.h" |
| 45 | #include "rf_dagdegrd.h" |
| 46 | #include "rf_dagdegwr.h" |
| 47 | #include "rf_dagutils.h" |
| 48 | #include "rf_general.h" |
| 49 | #include "rf_map.h" |
| 50 | #include "rf_utils.h" |
| 51 | |
| 52 | typedef struct RF_Raid5ConfigInfo_s { |
| 53 | RF_RowCol_t **stripeIdentifier; /* filled in at config time and used |
| 54 | * by IdentifyStripe */ |
| 55 | } RF_Raid5ConfigInfo_t; |
| 56 | |
| 57 | int |
| 58 | rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, |
| 59 | RF_Config_t *cfgPtr) |
| 60 | { |
| 61 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
| 62 | RF_Raid5ConfigInfo_t *info; |
| 63 | RF_RowCol_t i, j, startdisk; |
| 64 | |
| 65 | /* create a RAID level 5 configuration structure */ |
| 66 | RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList); |
| 67 | if (info == NULL) |
| 68 | return (ENOMEM); |
| 69 | layoutPtr->layoutSpecificInfo = (void *) info; |
| 70 | |
| 71 | /* the stripe identifier must identify the disks in each stripe, IN |
| 72 | * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ |
| 73 | info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList); |
| 74 | if (info->stripeIdentifier == NULL) |
| 75 | return (ENOMEM); |
| 76 | startdisk = 0; |
| 77 | for (i = 0; i < raidPtr->numCol; i++) { |
| 78 | for (j = 0; j < raidPtr->numCol; j++) { |
| 79 | info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol; |
| 80 | } |
| 81 | if ((--startdisk) < 0) |
| 82 | startdisk = raidPtr->numCol - 1; |
| 83 | } |
| 84 | |
| 85 | /* fill in the remaining layout parameters */ |
| 86 | layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; |
| 87 | layoutPtr->numDataCol = raidPtr->numCol - 1; |
| 88 | layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; |
| 89 | layoutPtr->numParityCol = 1; |
| 90 | layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; |
| 91 | |
| 92 | raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; |
| 93 | |
| 94 | return (0); |
| 95 | } |
| 96 | |
| 97 | int |
| 98 | rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr) |
| 99 | { |
| 100 | return (20); |
| 101 | } |
| 102 | |
| 103 | RF_HeadSepLimit_t |
| 104 | rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr) |
| 105 | { |
| 106 | return (10); |
| 107 | } |
| 108 | #if !defined(__NetBSD__) && !defined(_KERNEL) |
| 109 | /* not currently used */ |
| 110 | int |
| 111 | rf_ShutdownRAID5(RF_Raid_t *raidPtr) |
| 112 | { |
| 113 | return (0); |
| 114 | } |
| 115 | #endif |
| 116 | |
| 117 | void |
| 118 | rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, |
| 119 | RF_RowCol_t *col, RF_SectorNum_t *diskSector, |
| 120 | int remap) |
| 121 | { |
| 122 | RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; |
| 123 | *col = (SUID % raidPtr->numCol); |
| 124 | *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + |
| 125 | (raidSector % raidPtr->Layout.sectorsPerStripeUnit); |
| 126 | } |
| 127 | |
| 128 | void |
| 129 | rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, |
| 130 | RF_RowCol_t *col, RF_SectorNum_t *diskSector, |
| 131 | int remap) |
| 132 | { |
| 133 | RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; |
| 134 | |
| 135 | *col = raidPtr->Layout.numDataCol - (SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol; |
| 136 | *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + |
| 137 | (raidSector % raidPtr->Layout.sectorsPerStripeUnit); |
| 138 | } |
| 139 | |
| 140 | void |
| 141 | rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, |
| 142 | RF_RowCol_t **diskids) |
| 143 | { |
| 144 | RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); |
| 145 | RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
| 146 | |
| 147 | *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; |
| 148 | } |
| 149 | |
| 150 | void |
| 151 | rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr, |
| 152 | RF_StripeNum_t stripeID, |
| 153 | RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru) |
| 154 | { |
| 155 | *which_ru = 0; |
| 156 | *psID = stripeID; |
| 157 | } |
/*
 * Select an algorithm for performing an access to a single stripe.
 *
 * A single pointer is returned through createFunc: the routine that will
 * create the DAG for the access, or NULL when no routine is selected
 * (more than one failure, or a degraded write shape rejected below).
 *
 * If the access touches exactly one failed unit and either the array
 * status is rf_rs_reconfigured, or the status is rf_rs_reconstructing and
 * rf_CheckRUReconstructed() returns nonzero for the failed unit's start
 * sector, the failed physical disk address in asmap is rewritten in place
 * to point at spare space and both failure counts in asmap are cleared.
 * The access is then selected as if it were fault-free.
 *
 * raidPtr    - array the access is issued against
 * type       - access type; asserted to satisfy RF_IO_IS_R_OR_W()
 * asmap      - stripe map of the access; may be modified as described above
 * createFunc - out: selected DAG creation routine, or NULL
 */
void
rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
    RF_AccessStripeMap_t *asmap,
    RF_VoidFuncPtr *createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;
	RF_RowCol_t fcol;
	RF_RowStatus_t rstat;
	int prior_recon;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	/* more than one failure, in this access or in the array as a whole:
	 * select nothing and let the caller see NULL */
	if ((asmap->numDataFailed + asmap->numParityFailed > 1) ||
	    (raidPtr->numFailures > 1)){
#if RF_DEBUG_DAG
		if (rf_dagDebug)
			RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n" );
#endif
		*createFunc = NULL;
		return;
	}

	if (asmap->numDataFailed + asmap->numParityFailed == 1) {

		/* if under recon & already reconstructed, redirect
		 * the access to the spare drive and eliminate the
		 * failure indication */
		failedPDA = asmap->failedPDAs[0];
		/* remember the failed column before failedPDA->col is
		 * overwritten below */
		fcol = failedPDA->col;
		rstat = raidPtr->status;
		prior_recon = (rstat == rf_rs_reconfigured) || (
		    (rstat == rf_rs_reconstructing) ?
		    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0
		    );
		if (prior_recon) {
#if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0
			/* original location, kept only for the debug message */
			RF_RowCol_t oc = failedPDA->col;
			RF_SectorNum_t oo = failedPDA->startSector;
#endif
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
			if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
				/* redirect to dist spare space */

				if (failedPDA == asmap->parityInfo) {

					/* parity has failed */
					(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
					    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

					if (asmap->parityInfo->next) {
						/* redir 2nd component, if any */
						RF_PhysDiskAddr_t *p = asmap->parityInfo->next;
						RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
						p->col = failedPDA->col;
						/* cheating: startSector is not
						 * really a RAID address */
						p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
						    SUoffs;
					}
				} else
					if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) {
						/* should not ever happen */
						RF_ASSERT(0);
					} else {

						/* data has failed */
						(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

					}

			} else {
#endif
				/* redirect to dedicated spare space */

				failedPDA->col = raidPtr->Disks[fcol].spareCol;

				/* the parity may have two distinct
				 * components, both of which may need
				 * to be redirected */
				if (asmap->parityInfo->next) {
					if (failedPDA == asmap->parityInfo) {
						failedPDA->next->col = failedPDA->col;
					} else
						if (failedPDA == asmap->parityInfo->next) {
							/* paranoid: should never occur */
							asmap->parityInfo->col = failedPDA->col;
						}
				}
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
			}
#endif
			RF_ASSERT(failedPDA->col != -1);

#if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0
			if (rf_dagDebug || rf_mapDebug) {
				printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n" ,
				    raidPtr->raidid, type, oc,
				    (long) oo, failedPDA->col,
				    (long) failedPDA->startSector);
			}
#endif
			/* the access now targets spare space, so the selection
			 * below treats it as fault-free */
			asmap->numDataFailed = asmap->numParityFailed = 0;
		}
	}
	/* all dags begin/end with block/unblock node therefore, hdrSucc &
	 * termAnt counts should always be 1 also, these counts should not be
	 * visible outside dag creation routines - manipulating the counts
	 * here should be removed */
	/* NOTE(review): no such counts are touched in this function as it
	 * stands; the comment above looks stale -- confirm before removing */
	if (type == RF_IO_TYPE_READ) {
		/* only a failed data unit selects the degraded read; the
		 * parity failure count is not examined here */
		if (asmap->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
	} else {


		/* if mirroring, always use large writes. If the access
		 * requires two distinct parity updates, always do a small
		 * write. If the stripe contains a failure but the access
		 * does not, do a small write. The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk. */
		if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
			if (rf_suppressLocksAndLargeWrites ||
			    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
			    (asmap->parityInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
				*createFunc = (RF_VoidFuncPtr) rf_CreateSmallWriteDAG;
			} else
				*createFunc = (RF_VoidFuncPtr) rf_CreateLargeWriteDAG;
		} else {
			/* the access itself touches the failed unit */
			if (asmap->numParityFailed == 1)
				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
			else
				/* a data unit has failed: an access spanning
				 * more than one stripe unit is rejected
				 * unless the failed piece covers exactly one
				 * whole stripe unit */
				if (asmap->numStripeUnitsAccessed != 1 && (failedPDA == NULL || failedPDA->numSector != layoutPtr->sectorsPerStripeUnit))
					*createFunc = NULL;
				else
					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
		}
	}
}
| 306 | |