| 1 | /* $NetBSD: bufq_priocscan.c,v 1.20 2016/11/16 10:42:14 pgoyette Exp $ */ |
| 2 | |
| 3 | /*- |
| 4 | * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi, |
| 5 | * All rights reserved. |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * 1. Redistributions of source code must retain the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer. |
| 12 | * 2. Redistributions in binary form must reproduce the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer in the |
| 14 | * documentation and/or other materials provided with the distribution. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 26 | * SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.20 2016/11/16 10:42:14 pgoyette Exp $");
| 31 | |
| 32 | #include <sys/param.h> |
| 33 | #include <sys/systm.h> |
| 34 | #include <sys/buf.h> |
| 35 | #include <sys/bufq.h> |
| 36 | #include <sys/bufq_impl.h> |
| 37 | #include <sys/kmem.h> |
| 38 | #include <sys/rbtree.h> |
| 39 | #include <sys/module.h> |
| 40 | |
| 41 | #undef PRIOCSCAN_USE_GLOBAL_POSITION |
| 42 | |
| 43 | /* |
| 44 | * Cyclical scan (CSCAN) |
| 45 | */ |
| 46 | |
| 47 | struct cscan_key { |
| 48 | daddr_t k_rawblkno; |
| 49 | int k_cylinder; |
| 50 | }; |
| 51 | |
| 52 | struct cscan_queue { |
| 53 | rb_tree_t cq_buffers; /* ordered list of buffers */ |
| 54 | #if !defined(PRIOCSCAN_USE_GLOBAL_POSITION) |
| 55 | struct cscan_key cq_lastkey; /* key of last request */ |
| 56 | #endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ |
| 57 | int cq_sortby; /* BUFQ_SORT_MASK */ |
| 58 | rb_tree_ops_t cq_ops; |
| 59 | }; |
| 60 | |
| 61 | static signed int |
| 62 | buf_cmp(const struct buf *b1, const struct buf *b2, int sortby) |
| 63 | { |
| 64 | |
| 65 | if (buf_inorder(b2, b1, sortby)) { |
| 66 | return 1; /* b1 > b2 */ |
| 67 | } |
| 68 | if (buf_inorder(b1, b2, sortby)) { |
| 69 | return -1; /* b1 < b2 */ |
| 70 | } |
| 71 | return 0; |
| 72 | } |
| 73 | |
| 74 | /* return positive if n1 > n2 */ |
| 75 | static signed int |
| 76 | cscan_tree_compare_nodes(void *context, const void *n1, const void *n2) |
| 77 | { |
| 78 | const struct cscan_queue * const q = context; |
| 79 | const struct buf * const b1 = n1; |
| 80 | const struct buf * const b2 = n2; |
| 81 | const int sortby = q->cq_sortby; |
| 82 | const int diff = buf_cmp(b1, b2, sortby); |
| 83 | |
| 84 | /* |
| 85 | * XXX rawblkno/cylinder might not be unique. eg. unbuffered i/o |
| 86 | */ |
| 87 | |
| 88 | if (diff != 0) { |
| 89 | return diff; |
| 90 | } |
| 91 | |
| 92 | /* |
| 93 | * XXX rawblkno/cylinder might not be unique. eg. unbuffered i/o |
| 94 | */ |
| 95 | if (b1 > b2) { |
| 96 | return 1; |
| 97 | } |
| 98 | if (b1 < b2) { |
| 99 | return -1; |
| 100 | } |
| 101 | return 0; |
| 102 | } |
| 103 | |
| 104 | /* return positive if n1 > k2 */ |
| 105 | static signed int |
| 106 | cscan_tree_compare_key(void *context, const void *n1, const void *k2) |
| 107 | { |
| 108 | const struct cscan_queue * const q = context; |
| 109 | const struct buf * const b1 = n1; |
| 110 | const struct cscan_key * const key = k2; |
| 111 | const struct buf tmp = { |
| 112 | .b_rawblkno = key->k_rawblkno, |
| 113 | .b_cylinder = key->k_cylinder, |
| 114 | }; |
| 115 | const struct buf *b2 = &tmp; |
| 116 | const int sortby = q->cq_sortby; |
| 117 | |
| 118 | return buf_cmp(b1, b2, sortby); |
| 119 | } |
| 120 | |
| 121 | static void __unused |
| 122 | cscan_dump(struct cscan_queue *cq) |
| 123 | { |
| 124 | const int sortby = cq->cq_sortby; |
| 125 | struct buf *bp; |
| 126 | |
| 127 | RB_TREE_FOREACH(bp, &cq->cq_buffers) { |
| 128 | if (sortby == BUFQ_SORT_RAWBLOCK) { |
| 129 | printf(" %jd" , (intmax_t)bp->b_rawblkno); |
| 130 | } else { |
| 131 | printf(" %jd/%jd" , |
| 132 | (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno); |
| 133 | } |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | static inline bool |
| 138 | cscan_empty(struct cscan_queue *q) |
| 139 | { |
| 140 | |
| 141 | /* XXX this might do more work than necessary */ |
| 142 | return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL; |
| 143 | } |
| 144 | |
| 145 | static void |
| 146 | cscan_put(struct cscan_queue *q, struct buf *bp) |
| 147 | { |
| 148 | struct buf *obp __diagused; |
| 149 | |
| 150 | obp = rb_tree_insert_node(&q->cq_buffers, bp); |
| 151 | KASSERT(obp == bp); /* see cscan_tree_compare_nodes */ |
| 152 | } |
| 153 | |
| 154 | static struct buf * |
| 155 | cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key) |
| 156 | { |
| 157 | struct buf *bp; |
| 158 | |
| 159 | bp = rb_tree_find_node_geq(&q->cq_buffers, key); |
| 160 | KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0); |
| 161 | if (bp == NULL) { |
| 162 | bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT); |
| 163 | KDASSERT(cscan_tree_compare_key(q, bp, key) < 0); |
| 164 | } |
| 165 | if (bp != NULL && remove) { |
| 166 | #if defined(DEBUG) |
| 167 | struct buf *nbp; |
| 168 | #endif /* defined(DEBUG) */ |
| 169 | |
| 170 | rb_tree_remove_node(&q->cq_buffers, bp); |
| 171 | /* |
| 172 | * remember the head position. |
| 173 | */ |
| 174 | key->k_cylinder = bp->b_cylinder; |
| 175 | key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT); |
| 176 | #if defined(DEBUG) |
| 177 | nbp = rb_tree_find_node_geq(&q->cq_buffers, key); |
| 178 | if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) { |
| 179 | panic("%s: wrong order %p < %p\n" , __func__, |
| 180 | nbp, bp); |
| 181 | } |
| 182 | #endif /* defined(DEBUG) */ |
| 183 | } |
| 184 | return bp; |
| 185 | } |
| 186 | |
| 187 | static void |
| 188 | cscan_init(struct cscan_queue *q, int sortby) |
| 189 | { |
| 190 | static const rb_tree_ops_t cscan_tree_ops = { |
| 191 | .rbto_compare_nodes = cscan_tree_compare_nodes, |
| 192 | .rbto_compare_key = cscan_tree_compare_key, |
| 193 | .rbto_node_offset = offsetof(struct buf, b_u.u_rbnode), |
| 194 | .rbto_context = NULL, |
| 195 | }; |
| 196 | |
| 197 | q->cq_sortby = sortby; |
| 198 | /* XXX copy ops to workaround rbtree.h API limitation */ |
| 199 | q->cq_ops = cscan_tree_ops; |
| 200 | q->cq_ops.rbto_context = q; |
| 201 | rb_tree_init(&q->cq_buffers, &q->cq_ops); |
| 202 | } |
| 203 | |
| 204 | /* |
| 205 | * Per-prioritiy CSCAN. |
| 206 | * |
| 207 | * XXX probably we should have a way to raise |
| 208 | * priority of the on-queue requests. |
| 209 | */ |
| 210 | #define PRIOCSCAN_NQUEUE 3 |
| 211 | |
| 212 | struct priocscan_queue { |
| 213 | struct cscan_queue q_queue; |
| 214 | unsigned int q_burst; |
| 215 | }; |
| 216 | |
| 217 | struct bufq_priocscan { |
| 218 | struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE]; |
| 219 | |
| 220 | #if defined(PRIOCSCAN_USE_GLOBAL_POSITION) |
| 221 | /* |
| 222 | * XXX using "global" head position can reduce positioning time |
| 223 | * when switching between queues. |
| 224 | * although it might affect against fairness. |
| 225 | */ |
| 226 | struct cscan_key bq_lastkey; |
| 227 | #endif |
| 228 | }; |
| 229 | |
| 230 | /* |
 * how many requests to serve from a queue while requests are pending
 * on the other queues.
| 232 | * |
| 233 | * XXX tune |
| 234 | * be careful: while making these values larger likely |
| 235 | * increases the total throughput, it can also increase latencies |
| 236 | * for some workloads. |
| 237 | */ |
| 238 | const int priocscan_burst[] = { |
| 239 | 64, 16, 4 |
| 240 | }; |
| 241 | |
| 242 | static void bufq_priocscan_init(struct bufq_state *); |
| 243 | static void bufq_priocscan_put(struct bufq_state *, struct buf *); |
| 244 | static struct buf *bufq_priocscan_get(struct bufq_state *, int); |
| 245 | |
| 246 | BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init); |
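
/*
 * usage sketch (illustrative only; not part of this file): a driver
 * selects this strategy by name via bufq(9).  error handling is omitted
 * and "sc_bufq" is a hypothetical softc member.
 *
 *	struct bufq_state *sc_bufq;
 *
 *	error = bufq_alloc(&sc_bufq, "priocscan", BUFQ_SORT_RAWBLOCK);
 *	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
 *	bufq_put(sc_bufq, bp);
 *	bp = bufq_get(sc_bufq);
 *	bufq_free(sc_bufq);
 */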
| 247 | |
| 248 | static inline struct cscan_queue *bufq_priocscan_selectqueue( |
| 249 | struct bufq_priocscan *, const struct buf *); |
| 250 | |
| 251 | static inline struct cscan_queue * |
| 252 | bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp) |
| 253 | { |
| 254 | static const int priocscan_priomap[] = { |
| 255 | [BPRIO_TIMENONCRITICAL] = 2, |
| 256 | [BPRIO_TIMELIMITED] = 1, |
| 257 | [BPRIO_TIMECRITICAL] = 0 |
| 258 | }; |
| 259 | |
| 260 | return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue; |
| 261 | } |
| 262 | |
| 263 | static void |
| 264 | bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp) |
| 265 | { |
| 266 | struct bufq_priocscan *q = bufq->bq_private; |
| 267 | struct cscan_queue *cq; |
| 268 | |
| 269 | cq = bufq_priocscan_selectqueue(q, bp); |
| 270 | cscan_put(cq, bp); |
| 271 | } |
| 272 | |
| 273 | static struct buf * |
| 274 | bufq_priocscan_get(struct bufq_state *bufq, int remove) |
| 275 | { |
| 276 | struct bufq_priocscan *q = bufq->bq_private; |
| 277 | struct priocscan_queue *pq, *npq; |
| 278 | struct priocscan_queue *first; /* highest priority non-empty queue */ |
| 279 | const struct priocscan_queue *epq; |
| 280 | struct buf *bp; |
| 281 | bool single; /* true if there's only one non-empty queue */ |
| 282 | |
| 283 | /* |
| 284 | * find the highest priority non-empty queue. |
| 285 | */ |
| 286 | pq = &q->bq_queue[0]; |
| 287 | epq = pq + PRIOCSCAN_NQUEUE; |
| 288 | for (; pq < epq; pq++) { |
| 289 | if (!cscan_empty(&pq->q_queue)) { |
| 290 | break; |
| 291 | } |
| 292 | } |
| 293 | if (pq == epq) { |
| 294 | /* |
| 295 | * all our queues are empty. there's nothing to serve. |
| 296 | */ |
| 297 | return NULL; |
| 298 | } |
| 299 | first = pq; |
| 300 | |
| 301 | /* |
| 302 | * scan the rest of queues. |
| 303 | * |
| 304 | * if we have two or more non-empty queues, we serve the highest |
| 305 | * priority one with non-zero burst count. |
| 306 | */ |
| 307 | single = true; |
| 308 | for (npq = pq + 1; npq < epq; npq++) { |
| 309 | if (!cscan_empty(&npq->q_queue)) { |
| 310 | /* |
| 311 | * we found another non-empty queue. |
| 312 | * it means that a queue needs to consume its burst |
| 313 | * count to be served. |
| 314 | */ |
| 315 | single = false; |
| 316 | |
| 317 | /* |
| 318 | * check if our current candidate queue has already |
| 319 | * exhausted its burst count. |
| 320 | */ |
| 321 | if (pq->q_burst > 0) { |
| 322 | break; |
| 323 | } |
| 324 | pq = npq; |
| 325 | } |
| 326 | } |
| 327 | if (single) { |
| 328 | /* |
		 * there's only one non-empty queue.
| 330 | * just serve it without consuming its burst count. |
| 331 | */ |
| 332 | KASSERT(pq == first); |
| 333 | } else { |
| 334 | /* |
| 335 | * there are two or more non-empty queues. |
| 336 | */ |
| 337 | if (pq->q_burst == 0) { |
| 338 | /* |
			 * no queue can be served because every non-empty
			 * queue has already exhausted its burst count.
| 341 | */ |
| 342 | unsigned int i; |
| 343 | #ifdef DEBUG |
| 344 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
| 345 | pq = &q->bq_queue[i]; |
| 346 | if (!cscan_empty(&pq->q_queue) && pq->q_burst) { |
| 347 | panic("%s: inconsist" , __func__); |
| 348 | } |
| 349 | } |
| 350 | #endif /* DEBUG */ |
| 351 | /* |
| 352 | * reset burst counts. |
| 353 | */ |
| 354 | if (remove) { |
| 355 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
| 356 | pq = &q->bq_queue[i]; |
| 357 | pq->q_burst = priocscan_burst[i]; |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | /* |
| 362 | * serve the highest priority non-empty queue. |
| 363 | */ |
| 364 | pq = first; |
| 365 | } |
| 366 | /* |
| 367 | * consume the burst count. |
| 368 | * |
		 * XXX we account only by the number of requests.  is that
		 * good enough?
| 370 | */ |
| 371 | if (remove) { |
| 372 | KASSERT(pq->q_burst > 0); |
| 373 | pq->q_burst--; |
| 374 | } |
| 375 | } |
| 376 | |
| 377 | /* |
| 378 | * finally, get a request from the selected queue. |
| 379 | */ |
| 380 | KDASSERT(!cscan_empty(&pq->q_queue)); |
| 381 | bp = cscan_get(&pq->q_queue, remove, |
| 382 | #if defined(PRIOCSCAN_USE_GLOBAL_POSITION) |
| 383 | &q->bq_lastkey |
| 384 | #else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ |
| 385 | &pq->q_queue.cq_lastkey |
| 386 | #endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */ |
| 387 | ); |
| 388 | KDASSERT(bp != NULL); |
| 389 | KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp)); |
| 390 | |
| 391 | return bp; |
| 392 | } |
| 393 | |
| 394 | static struct buf * |
| 395 | bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp) |
| 396 | { |
| 397 | struct bufq_priocscan * const q = bufq->bq_private; |
| 398 | unsigned int i; |
| 399 | |
| 400 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
| 401 | struct cscan_queue * const cq = &q->bq_queue[i].q_queue; |
| 402 | struct buf *it; |
| 403 | |
| 404 | /* |
| 405 | * XXX probably could be faster but the cancel functionality |
| 406 | * is not widely used anyway. |
| 407 | */ |
| 408 | RB_TREE_FOREACH(it, &cq->cq_buffers) { |
| 409 | if (it == bp) { |
| 410 | rb_tree_remove_node(&cq->cq_buffers, bp); |
| 411 | return bp; |
| 412 | } |
| 413 | } |
| 414 | } |
| 415 | return NULL; |
| 416 | } |
| 417 | |
| 418 | static void |
| 419 | bufq_priocscan_fini(struct bufq_state *bufq) |
| 420 | { |
| 421 | |
| 422 | KASSERT(bufq->bq_private != NULL); |
| 423 | kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan)); |
| 424 | } |
| 425 | |
| 426 | static void |
| 427 | bufq_priocscan_init(struct bufq_state *bufq) |
| 428 | { |
| 429 | struct bufq_priocscan *q; |
| 430 | const int sortby = bufq->bq_flags & BUFQ_SORT_MASK; |
| 431 | unsigned int i; |
| 432 | |
| 433 | bufq->bq_get = bufq_priocscan_get; |
| 434 | bufq->bq_put = bufq_priocscan_put; |
| 435 | bufq->bq_cancel = bufq_priocscan_cancel; |
| 436 | bufq->bq_fini = bufq_priocscan_fini; |
| 437 | bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP); |
| 438 | |
| 439 | q = bufq->bq_private; |
| 440 | for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { |
| 441 | struct cscan_queue *cq = &q->bq_queue[i].q_queue; |
| 442 | |
| 443 | cscan_init(cq, sortby); |
| 444 | } |
| 445 | } |
| 446 | |
| 447 | MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL); |
| 448 | |
| 449 | static int |
| 450 | bufq_priocscan_modcmd(modcmd_t cmd, void *opaque) |
| 451 | { |
| 452 | |
| 453 | switch (cmd) { |
| 454 | case MODULE_CMD_INIT: |
| 455 | return bufq_register(&bufq_strat_priocscan); |
| 456 | case MODULE_CMD_FINI: |
| 457 | return bufq_unregister(&bufq_strat_priocscan); |
| 458 | default: |
| 459 | return ENOTTY; |
| 460 | } |
| 461 | } |
| 462 | |