/*	$NetBSD: rf_diskqueue.c,v 1.53 2011/05/05 06:04:09 mrg Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e. an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
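
/*
 * Schematic of the locking/unlocking pairing contract described above.
 * This is an illustration only; "LOCK op" and "UNLOCK op" are
 * placeholder names for whatever ops the higher layers generate, not
 * real RAIDframe identifiers:
 *
 *	enqueue(LOCK op)    -> queue stops dispatching new I/Os
 *	                       ... outstanding I/Os drain to zero ...
 *	dispatch(LOCK op)   -> the RMW cycle has the disk to itself
 *	LOCK op completes   -> only now may the higher layer generate
 *	enqueue(UNLOCK op)  -> queue unlocks, dispatching resumes
 */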

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.53 2011/05/05 06:04:09 mrg Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines everything a queueing discipline
 * exports: its queue type name, plus its create, enqueue, dequeue,
 * peek, and promote routines
 *
 ****************************************************************************/
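
/*
 * For reference, a sketch of what each entry below pairs together,
 * inferred from how the members are invoked in this file; the
 * authoritative definition of RF_DiskQueueSW_t is in rf_diskqueue.h:
 *
 *	const char *queueType;			discipline name
 *	void *(*Create)(RF_SectorCount_t,	allocate + init queue header
 *	    RF_AllocListElem_t *, RF_ShutdownList_t **);
 *	void (*Enqueue)(void *,			add request at a priority
 *	    RF_DiskQueueData_t *, int);
 *	RF_DiskQueueData_t *(*Dequeue)(void *);	remove best request
 *	RF_DiskQueueData_t *(*Peek)(void *);	examine without removing
 *	int (*Promote)(void *, RF_StripeNum_t,	raise matching requests
 *	    RF_ReconUnitNum_t);			to normal priority
 */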

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))

#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64

#include <sys/buf.h>

static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

/* configures a single disk queue */
int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
	pool_destroy(&rf_pools.dqd);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{
	rf_pool_init(&rf_pools.dqd, sizeof(RF_DiskQueueData_t),
		     "rf_dqd_pl", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);

	return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n",
		    cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	RF_MallocAndAdd(diskQueues,
			(raidPtr->numCol + RF_MAXSPARE) *
			sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
			raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty  =>  dispatch it & set priority
 *                      queue not full & priority is ok  =>  dispatch it
 *                      else queue it
 */
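
/*
 * A sketch of the dispatch test those rules boil down to (illustrative
 * only; the real RF_OK_TO_DISPATCH macro used below is defined in the
 * headers and is authoritative):
 *
 *	#define RF_OK_TO_DISPATCH(q, r) \
 *		(((q)->numOutstanding < (q)->maxOutstanding) && \
 *		 ((r)->priority >= (q)->curPriority))
 */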
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}


/* get the next set of I/Os started */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue)) {
		req = (queue->qPtr->Dequeue) (queue->qHdr);
		if (req) {
			Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			queue->queueLength--;	/* decrement count of number of requests waiting in this queue */
			RF_ASSERT(queue->queueLength >= 0);
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				/* we can't dispatch it, so just re-enqueue it.
				   potential trouble here if disk queues batch reqs */
				Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
				queue->queueLength++;
				(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
				done = 1;
			}
		} else {
			Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}
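
/*
 * Example use (hypothetical caller, e.g. reconstruction code pulling
 * one parity stripe's accesses ahead of other low-priority work;
 * "col", "psid" and "which_ru" are placeholder variables):
 *
 *	if (rf_DiskIOPromote(&raidPtr->Queues[col], psid, which_ru) < 0)
 *		... this discipline has no promote routine ...
 */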

RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, void *bf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       int (*wakeF) (void *, int), void *arg,
		       RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
		       RF_DiskQueueDataFlags_t flags, void *kb_proc,
		       int waitflag)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&rf_pools.dqd, waitflag);
	if (p == NULL)
		return (NULL);

	memset(p, 0, sizeof(RF_DiskQueueData_t));
	if (waitflag == PR_WAITOK) {
		p->bp = getiobuf(NULL, true);
	} else {
		p->bp = getiobuf(NULL, false);
	}
	if (p->bp == NULL) {
		pool_put(&rf_pools.dqd, p);
		return (NULL);
	}
	SET(p->bp->b_cflags, BC_BUSY);	/* mark buffer busy */

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = bf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = NULL;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}
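
/*
 * Example allocation + enqueue (hypothetical caller sketch;
 * "myCallback", "myArg", "databuf", "startSect", "numSect", "col",
 * "psid" and "which_ru" are placeholders):
 *
 *	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, startSect, numSect,
 *	    databuf, psid, which_ru, myCallback, myArg, NULL, raidPtr,
 *	    0, curproc, PR_WAITOK);
 *	if (req != NULL)
 *		rf_DiskIOEnqueue(&raidPtr->Queues[col], req,
 *		    RF_IO_NORMAL_PRIORITY);
 *
 * once the I/O completes and the wakeup function has run, the request
 * is handed back through rf_FreeDiskQueueData() below.
 */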

void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
	int s;

	s = splbio();		/* XXX protect only pool_put, or neither? */
	putiobuf(p->bp);
	pool_put(&rf_pools.dqd, p);
	splx(s);
}