/*	$NetBSD: uvm_readahead.c,v 1.8 2011/06/12 03:36:04 rmind Exp $	*/

/*-
 * Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * uvm_object read-ahead
 *
 * TODO:
 *	- tune.
 *	- handle multiple streams.
 *	- find a better way to deal with PGO_LOCKED pager requests.
 *	  (currently just ignored)
 *	- consider the amount of memory in the system.
 *	- consider the speed of the underlying device.
 *	- consider filesystem block size / block layout.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.8 2011/06/12 03:36:04 rmind Exp $");

#include <sys/param.h>
#include <sys/pool.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

#if defined(READAHEAD_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(READAHEAD_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(READAHEAD_DEBUG) */
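
/*
 * Note the double parentheses in DPRINTF() calls: the macro takes the
 * whole printf-style argument list as a single macro argument, e.g.
 *
 *	DPRINTF(("%s: off=%" PRIu64 "\n", __func__, off));
 */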

/*
 * uvm_ractx: read-ahead context.
 */

struct uvm_ractx {
	int ra_flags;
#define	RA_VALID	1
	off_t ra_winstart;	/* window start offset */
	size_t ra_winsize;	/* window size */
	off_t ra_next;		/* next offset to read-ahead */
};
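
/*
 * Illustration of the fields: the window
 * [ra_winstart, ra_winstart + ra_winsize) is where the next sequential
 * read(2) is expected to fall, while ra_next records how far read-ahead
 * i/o has already been issued, so the same pages are not requested
 * twice.  A request inside the window is a hit and advances/enlarges
 * the window; a request outside it is a miss and resets the window.
 * (See uvm_ra_request() below.)
 */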

#if defined(sun2) || defined(sun3)
/*
 * XXX: on sun2 and sun3 MAXPHYS is 0xe000, which is not a power of two;
 * the chunk arithmetic in ra_startio() requires one, so use 0x8000 here.
 */
#undef MAXPHYS
#define	MAXPHYS		0x8000	/* XXX */
#endif

#define	RA_WINSIZE_INIT		MAXPHYS		/* initial window size */
#define	RA_WINSIZE_MAX		(MAXPHYS * 8)	/* max window size */
#define	RA_WINSIZE_SEQUENTIAL	RA_WINSIZE_MAX	/* fixed window size used for
						   SEQUENTIAL hint */
#define	RA_MINSIZE		(MAXPHYS * 2)	/* min size to start i/o */
#define	RA_IOCHUNK		MAXPHYS		/* read-ahead i/o chunk size */
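
/*
 * For a concrete sense of scale (assuming MAXPHYS is 64 KiB; it is
 * machine-dependent): the window starts at 64 KiB and grows up to
 * 512 KiB, i/o is issued in 64 KiB chunks, and no read-ahead i/o is
 * started unless at least 128 KiB remains to be read ahead.
 */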

static off_t ra_startio(struct uvm_object *, off_t, size_t);
static struct uvm_ractx *ra_allocctx(void);
static void ra_freectx(struct uvm_ractx *);

static struct pool_cache ractx_cache;

/*
 * uvm_ra_init: initialize readahead module.
 */

void
uvm_ra_init(void)
{

	pool_cache_bootstrap(&ractx_cache, sizeof(struct uvm_ractx), 0, 0, 0,
	    "ractx", NULL, IPL_NONE, NULL, NULL, NULL);
}

static struct uvm_ractx *
ra_allocctx(void)
{

	return pool_cache_get(&ractx_cache, PR_NOWAIT);
}

static void
ra_freectx(struct uvm_ractx *ra)
{

	pool_cache_put(&ractx_cache, ra);
}

/*
 * ra_startio: start i/o for read-ahead.
 *
 * => start i/o for each RA_IOCHUNK sized chunk.
 * => return offset to which we started i/o.
 * => object must be unlocked by the caller; the lock is taken below and
 *    released again by each pgo_get call.
 */

static off_t
ra_startio(struct uvm_object *uobj, off_t off, size_t sz)
{
	const off_t endoff = off + sz;

	DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n",
	    __func__, uobj, off, endoff));
	off = trunc_page(off);
	while (off < endoff) {
		const size_t chunksize = RA_IOCHUNK;
		int error;
		size_t donebytes;
		int npages;
		int orignpages;
		size_t bytelen;

		KASSERT((chunksize & (chunksize - 1)) == 0);
		KASSERT((off & PAGE_MASK) == 0);
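		/*
		 * round off up to the next chunksize boundary; relies on
		 * chunksize being a power of two (asserted above).  e.g.
		 * with chunksize 0x10000 and off 0x1c000, bytelen is
		 * 0x4000, so the first (possibly partial) chunk aligns
		 * all subsequent chunks to chunksize boundaries.
		 */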
		bytelen = ((off + chunksize) & -(off_t)chunksize) - off;
		KASSERT((bytelen & PAGE_MASK) == 0);
		npages = orignpages = bytelen >> PAGE_SHIFT;
		KASSERT(npages != 0);

		/*
		 * use UVM_ADV_RANDOM to avoid recursion: with other
		 * advice values the pager may itself start read-ahead,
		 * re-entering this code.
		 */

		mutex_enter(uobj->vmobjlock);
		error = (*uobj->pgops->pgo_get)(uobj, off, NULL,
		    &npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, 0);
		DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n",
		    __func__, off, bytelen, error));
		if (error != 0 && error != EBUSY) {
			if (error != EINVAL) { /* maybe past EOF */
				DPRINTF(("%s: error=%d\n", __func__, error));
			}
			break;
		}
		KASSERT(orignpages == npages);
		donebytes = orignpages << PAGE_SHIFT;
		off += donebytes;
	}

	return off;
}

/* ------------------------------------------------------------ */

/*
 * uvm_ra_allocctx: allocate a context.
 */

struct uvm_ractx *
uvm_ra_allocctx(void)
{
	struct uvm_ractx *ra;

	ra = ra_allocctx();
	if (ra != NULL) {
		ra->ra_flags = 0;
	}

	return ra;
}

/*
 * uvm_ra_freectx: free a context.
 */

void
uvm_ra_freectx(struct uvm_ractx *ra)
{

	KASSERT(ra != NULL);
	ra_freectx(ra);
}

/*
 * uvm_ra_request: update a read-ahead context and start i/o if appropriate.
 *
 * => called when [reqoff, reqoff+reqsize) is requested.
 * => object must be locked by caller, will return locked.
 */

void
uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj,
    off_t reqoff, size_t reqsize)
{

	KASSERT(mutex_owned(uobj->vmobjlock));

	if (ra == NULL || advice == UVM_ADV_RANDOM) {
		return;
	}

	if (advice == UVM_ADV_SEQUENTIAL) {

		/*
		 * always do read-ahead with a large window.
		 */

		if ((ra->ra_flags & RA_VALID) == 0) {
			ra->ra_winstart = ra->ra_next = 0;
			ra->ra_flags |= RA_VALID;
		}
		if (reqoff < ra->ra_winstart) {
			ra->ra_next = reqoff;
		}
		ra->ra_winsize = RA_WINSIZE_SEQUENTIAL;
		goto do_readahead;
	}

	/*
	 * a request with UVM_ADV_NORMAL hint.  (i.e. no hint)
	 *
	 * we keep a sliding window in order to determine:
	 *	- if the previous read-ahead was successful or not.
	 *	- how many bytes to read-ahead.
	 */
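
	/*
	 * Hypothetical trace, assuming MAXPHYS is 64 KiB: a 64 KiB read
	 * at offset 0 merely initializes the window to [64K, 128K).  A
	 * second read at 64 KiB is a hit, but only 64 KiB of the window
	 * remains to be read ahead, which is below RA_MINSIZE, so no i/o
	 * is started; the window becomes [128K, 256K).  A third read at
	 * 128 KiB is again a hit and 128 KiB now remains, so read-ahead
	 * i/o is issued for [128K, 256K).
	 */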

	/*
	 * if it's the first request for this context,
	 * initialize context and return.
	 */

	if ((ra->ra_flags & RA_VALID) == 0) {
initialize:
		ra->ra_winstart = ra->ra_next = reqoff + reqsize;
		ra->ra_winsize = RA_WINSIZE_INIT;
		ra->ra_flags |= RA_VALID;
		goto done;
	}

	/*
	 * if it isn't in our window,
	 * initialize context and return.
	 * (read-ahead miss)
	 */

	if (reqoff < ra->ra_winstart ||
	    ra->ra_winstart + ra->ra_winsize < reqoff) {

		/*
		 * ... unless we seem to be reading the same chunk repeatedly.
		 *
		 * XXX should have some margin?
		 */

		if (reqoff + reqsize == ra->ra_winstart) {
			DPRINTF(("%s: %p: same block: off=%" PRIu64
			    ", size=%zd, winstart=%" PRIu64 "\n",
			    __func__, ra, reqoff, reqsize, ra->ra_winstart));
			goto done;
		}
		goto initialize;
	}

	/*
	 * it's in our window.  (read-ahead hit)
	 *	- start read-ahead i/o if appropriate.
	 *	- advance and enlarge window.
	 */

do_readahead:

	/*
	 * don't bother to read-ahead behind current request.
	 */

	if (reqoff > ra->ra_next) {
		ra->ra_next = reqoff;
	}

	/*
	 * try to make [reqoff, reqoff + ra_winsize) in-core.
	 * note that [reqoff, ra_next) is considered already done;
	 * since ra_next >= reqoff at this point, the i/o below starts
	 * at ra_next.
	 */

	if (reqoff + ra->ra_winsize > ra->ra_next) {
		off_t raoff = MAX(reqoff, ra->ra_next);
		size_t rasize = reqoff + ra->ra_winsize - ra->ra_next;

#if defined(DIAGNOSTIC)
		if (rasize > RA_WINSIZE_MAX) {
			printf("%s: corrupted context\n", __func__);
			rasize = RA_WINSIZE_MAX;
		}
#endif /* defined(DIAGNOSTIC) */

		/*
		 * issue read-ahead only if we can start big enough i/o.
		 * otherwise we end up with a stream of small i/o.
		 */

		if (rasize >= RA_MINSIZE) {
			off_t next;

			mutex_exit(uobj->vmobjlock);
			next = ra_startio(uobj, raoff, rasize);
			mutex_enter(uobj->vmobjlock);
			ra->ra_next = next;
		}
	}

	/*
	 * update window.
	 *
	 * enlarge window by reqsize, so that it grows in a predictable
	 * manner regardless of the size of each read(2).
	 */

	ra->ra_winstart = reqoff + reqsize;
	ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize);

done:;
}
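
/*
 * Example (hypothetical caller): a pager or file read path would
 * typically update the context while holding the object lock, before
 * fetching the requested pages itself.  "v_ractx", "advice" and "len"
 * below are placeholders, not names this file defines:
 *
 *	mutex_enter(uobj->vmobjlock);
 *	uvm_ra_request(vp->v_ractx, advice, uobj, uio->uio_offset, len);
 *	... fetch the requested pages ...
 *	mutex_exit(uobj->vmobjlock);
 */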

/*
 * uvm_readahead: read ahead [off, off + size) of the given object,
 * without using a context (e.g. for one-shot hints such as
 * POSIX_FADV_WILLNEED).
 *
 * => object must be unlocked by the caller; see ra_startio().
 */

int
uvm_readahead(struct uvm_object *uobj, off_t off, off_t size)
{

	/*
	 * don't allow too much read-ahead.
	 */
	if (size > RA_WINSIZE_MAX) {
		size = RA_WINSIZE_MAX;
	}
	ra_startio(uobj, off, size);
	return 0;
}