| 1 | /* $NetBSD: genfs_io.c,v 1.63 2016/09/29 19:08:48 christos Exp $ */ |
| 2 | |
| 3 | /* |
| 4 | * Copyright (c) 1982, 1986, 1989, 1993 |
| 5 | * The Regents of the University of California. All rights reserved. |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * 1. Redistributions of source code must retain the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer. |
| 12 | * 2. Redistributions in binary form must reproduce the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer in the |
| 14 | * documentation and/or other materials provided with the distribution. |
| 15 | * 3. Neither the name of the University nor the names of its contributors |
| 16 | * may be used to endorse or promote products derived from this software |
| 17 | * without specific prior written permission. |
| 18 | * |
| 19 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 29 | * SUCH DAMAGE. |
| 30 | * |
| 31 | */ |
| 32 | |
| 33 | #include <sys/cdefs.h> |
| 34 | __KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.63 2016/09/29 19:08:48 christos Exp $" ); |
| 35 | |
| 36 | #include <sys/param.h> |
| 37 | #include <sys/systm.h> |
| 38 | #include <sys/proc.h> |
| 39 | #include <sys/kernel.h> |
| 40 | #include <sys/mount.h> |
| 41 | #include <sys/vnode.h> |
| 42 | #include <sys/kmem.h> |
| 43 | #include <sys/kauth.h> |
| 44 | #include <sys/fstrans.h> |
| 45 | #include <sys/buf.h> |
| 46 | |
| 47 | #include <miscfs/genfs/genfs.h> |
| 48 | #include <miscfs/genfs/genfs_node.h> |
| 49 | #include <miscfs/specfs/specdev.h> |
| 50 | |
| 51 | #include <uvm/uvm.h> |
| 52 | #include <uvm/uvm_pager.h> |
| 53 | |
| 54 | static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *, |
| 55 | off_t, enum uio_rw); |
| 56 | static void genfs_dio_iodone(struct buf *); |
| 57 | |
| 58 | static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t, |
| 59 | off_t, bool, bool, bool, bool); |
| 60 | static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw, |
| 61 | void (*)(struct buf *)); |
| 62 | static void genfs_rel_pages(struct vm_page **, unsigned int); |
| 63 | static void genfs_markdirty(struct vnode *); |
| 64 | |
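| | /* |
| | * genfs_maxdio limits the number of bytes handed to genfs_do_directio() |
| | * per chunk by genfs_directio() below.  It defaults to MAXPHYS. |
| | */ |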
| 65 | int genfs_maxdio = MAXPHYS; |
| 66 | |
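| | /* |
| | * genfs_rel_pages: unbusy the pages in the given array, arranging for any |
| | * pages that were freshly allocated for this request (PG_FAKE) to be |
| | * freed by marking them PG_RELEASED first. |
| | */ |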
| 67 | static void |
| 68 | genfs_rel_pages(struct vm_page **pgs, unsigned int npages) |
| 69 | { |
| 70 | unsigned int i; |
| 71 | |
| 72 | for (i = 0; i < npages; i++) { |
| 73 | struct vm_page *pg = pgs[i]; |
| 74 | |
| 75 | if (pg == NULL || pg == PGO_DONTCARE) |
| 76 | continue; |
| 77 | KASSERT(uvm_page_locked_p(pg)); |
| 78 | if (pg->flags & PG_FAKE) { |
| 79 | pg->flags |= PG_RELEASED; |
| 80 | } |
| 81 | } |
| 82 | mutex_enter(&uvm_pageqlock); |
| 83 | uvm_page_unbusy(pgs, npages); |
| 84 | mutex_exit(&uvm_pageqlock); |
| 85 | } |
| 86 | |
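| | /* |
| | * genfs_markdirty: note that the vnode has dirty pages: bump the dirty |
| | * generation number, put the vnode on the syncer worklist if it is not |
| | * there already, and set VI_WRMAPDIRTY if the vnode is writably mapped. |
| | */ |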
| 87 | static void |
| 88 | genfs_markdirty(struct vnode *vp) |
| 89 | { |
| 90 | struct genfs_node * const gp = VTOG(vp); |
| 91 | |
| 92 | KASSERT(mutex_owned(vp->v_interlock)); |
| 93 | gp->g_dirtygen++; |
| 94 | if ((vp->v_iflag & VI_ONWORKLST) == 0) { |
| 95 | vn_syncer_add_to_worklist(vp, filedelay); |
| 96 | } |
| 97 | if ((vp->v_iflag & (VI_WRMAP|VI_WRMAPDIRTY)) == VI_WRMAP) { |
| 98 | vp->v_iflag |= VI_WRMAPDIRTY; |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | /* |
| 103 | * generic VM getpages routine. |
| 104 | * Return PG_BUSY pages for the given range, |
| 105 | * reading from backing store if necessary. |
| 106 | */ |
| 107 | |
| 108 | int |
| 109 | genfs_getpages(void *v) |
| 110 | { |
| 111 | struct vop_getpages_args /* { |
| 112 | struct vnode *a_vp; |
| 113 | voff_t a_offset; |
| 114 | struct vm_page **a_m; |
| 115 | int *a_count; |
| 116 | int a_centeridx; |
| 117 | vm_prot_t a_access_type; |
| 118 | int a_advice; |
| 119 | int a_flags; |
| 120 | } */ * const ap = v; |
| 121 | |
| 122 | off_t diskeof, memeof; |
| 123 | int i, error, npages; |
| 124 | const int flags = ap->a_flags; |
| 125 | struct vnode * const vp = ap->a_vp; |
| 126 | struct uvm_object * const uobj = &vp->v_uobj; |
| 127 | const bool async = (flags & PGO_SYNCIO) == 0; |
| 128 | const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; |
| 129 | const bool overwrite = (flags & PGO_OVERWRITE) != 0; |
| 130 | const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0; |
| 131 | const bool glocked = (flags & PGO_GLOCKHELD) != 0; |
| 132 | const bool need_wapbl = blockalloc && vp->v_mount->mnt_wapbl; |
| 133 | bool has_trans_wapbl = false; |
| 134 | UVMHIST_FUNC("genfs_getpages" ); UVMHIST_CALLED(ubchist); |
| 135 | |
| 136 | UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d" , |
| 137 | vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count); |
| 138 | |
| 139 | KASSERT(vp->v_type == VREG || vp->v_type == VDIR || |
| 140 | vp->v_type == VLNK || vp->v_type == VBLK); |
| 141 | |
| 142 | startover: |
| 143 | error = 0; |
| 144 | const voff_t origvsize = vp->v_size; |
| 145 | const off_t origoffset = ap->a_offset; |
| 146 | const int orignpages = *ap->a_count; |
| 147 | |
| 148 | GOP_SIZE(vp, origvsize, &diskeof, 0); |
| 149 | if (flags & PGO_PASTEOF) { |
| 150 | off_t newsize; |
| 151 | #if defined(DIAGNOSTIC) |
| 152 | off_t writeeof; |
| 153 | #endif /* defined(DIAGNOSTIC) */ |
| 154 | |
| 155 | newsize = MAX(origvsize, |
| 156 | origoffset + (orignpages << PAGE_SHIFT)); |
| 157 | GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM); |
| 158 | #if defined(DIAGNOSTIC) |
| 159 | GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM); |
| 160 | if (newsize > round_page(writeeof)) { |
| 161 | panic("%s: past eof: %" PRId64 " vs. %" PRId64, |
| 162 | __func__, newsize, round_page(writeeof)); |
| 163 | } |
| 164 | #endif /* defined(DIAGNOSTIC) */ |
| 165 | } else { |
| 166 | GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM); |
| 167 | } |
| 168 | KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages); |
| 169 | KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0); |
| 170 | KASSERT(orignpages > 0); |
| 171 | |
| 172 | /* |
| 173 | * Bounds-check the request. |
| 174 | */ |
| 175 | |
| 176 | if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) { |
| 177 | if ((flags & PGO_LOCKED) == 0) { |
| 178 | mutex_exit(uobj->vmobjlock); |
| 179 | } |
| 180 | UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x" , |
| 181 | origoffset, *ap->a_count, memeof,0); |
| 182 | error = EINVAL; |
| 183 | goto out_err; |
| 184 | } |
| 185 | |
| 186 | /* uobj is locked */ |
| 187 | |
| 188 | if ((flags & PGO_NOTIMESTAMP) == 0 && |
| 189 | (vp->v_type != VBLK || |
| 190 | (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) { |
| 191 | int updflags = 0; |
| 192 | |
| 193 | if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { |
| 194 | updflags = GOP_UPDATE_ACCESSED; |
| 195 | } |
| 196 | if (memwrite) { |
| 197 | updflags |= GOP_UPDATE_MODIFIED; |
| 198 | } |
| 199 | if (updflags != 0) { |
| 200 | GOP_MARKUPDATE(vp, updflags); |
| 201 | } |
| 202 | } |
| 203 | |
| 204 | /* |
| 205 | * For PGO_LOCKED requests, just return whatever's in memory. |
| 206 | */ |
| 207 | |
| 208 | if (flags & PGO_LOCKED) { |
| 209 | int nfound; |
| 210 | struct vm_page *pg; |
| 211 | |
| 212 | KASSERT(!glocked); |
| 213 | npages = *ap->a_count; |
| 214 | #if defined(DEBUG) |
| 215 | for (i = 0; i < npages; i++) { |
| 216 | pg = ap->a_m[i]; |
| 217 | KASSERT(pg == NULL || pg == PGO_DONTCARE); |
| 218 | } |
| 219 | #endif /* defined(DEBUG) */ |
| 220 | nfound = uvn_findpages(uobj, origoffset, &npages, |
| 221 | ap->a_m, UFP_NOWAIT|UFP_NOALLOC|(memwrite ? UFP_NORDONLY : 0)); |
| 222 | KASSERT(npages == *ap->a_count); |
| 223 | if (nfound == 0) { |
| 224 | error = EBUSY; |
| 225 | goto out_err; |
| 226 | } |
| 227 | if (!genfs_node_rdtrylock(vp)) { |
| 228 | genfs_rel_pages(ap->a_m, npages); |
| 229 | |
| 230 | /* |
| 231 | * restore the array. |
| 232 | */ |
| 233 | |
| 234 | for (i = 0; i < npages; i++) { |
| 235 | pg = ap->a_m[i]; |
| 236 | |
| 237 | if (pg != NULL && pg != PGO_DONTCARE) { |
| 238 | ap->a_m[i] = NULL; |
| 239 | } |
| 240 | KASSERT(ap->a_m[i] == NULL || |
| 241 | ap->a_m[i] == PGO_DONTCARE); |
| 242 | } |
| 243 | } else { |
| 244 | genfs_node_unlock(vp); |
| 245 | } |
| 246 | error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0); |
| 247 | if (error == 0 && memwrite) { |
| 248 | genfs_markdirty(vp); |
| 249 | } |
| 250 | goto out_err; |
| 251 | } |
| 252 | mutex_exit(uobj->vmobjlock); |
| 253 | |
| 254 | /* |
| 255 | * find the requested pages and make some simple checks. |
| 256 | * leave space in the page array for a whole block. |
| 257 | */ |
| 258 | |
| 259 | const int fs_bshift = (vp->v_type != VBLK) ? |
| 260 | vp->v_mount->mnt_fs_bshift : DEV_BSHIFT; |
| 261 | const int fs_bsize = 1 << fs_bshift; |
| 262 | #define blk_mask (fs_bsize - 1) |
| 263 | #define trunc_blk(x) ((x) & ~blk_mask) |
| 264 | #define round_blk(x) (((x) + blk_mask) & ~blk_mask) |
| 265 | |
| 266 | const int orignmempages = MIN(orignpages, |
| 267 | round_page(memeof - origoffset) >> PAGE_SHIFT); |
| 268 | npages = orignmempages; |
| 269 | const off_t startoffset = trunc_blk(origoffset); |
| 270 | const off_t endoffset = MIN( |
| 271 | round_page(round_blk(origoffset + (npages << PAGE_SHIFT))), |
| 272 | round_page(memeof)); |
| 273 | const int ridx = (origoffset - startoffset) >> PAGE_SHIFT; |
| 274 | |
| 275 | const int pgs_size = sizeof(struct vm_page *) * |
| 276 | ((endoffset - startoffset) >> PAGE_SHIFT); |
| 277 | struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES]; |
| 278 | |
| 279 | if (pgs_size > sizeof(pgs_onstack)) { |
| 280 | pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP); |
| 281 | if (pgs == NULL) { |
| 282 | pgs = pgs_onstack; |
| 283 | error = ENOMEM; |
| 284 | goto out_err; |
| 285 | } |
| 286 | } else { |
| 287 | pgs = pgs_onstack; |
| 288 | (void)memset(pgs, 0, pgs_size); |
| 289 | } |
| 290 | |
| 291 | UVMHIST_LOG(ubchist, "ridx %d npages %d startoff %ld endoff %ld" , |
| 292 | ridx, npages, startoffset, endoffset); |
| 293 | |
| 294 | if (!has_trans_wapbl) { |
| 295 | fstrans_start(vp->v_mount, FSTRANS_SHARED); |
| 296 | /* |
| 297 | * XXX: This assumes that we come here only via |
| 298 | * the mmio path |
| 299 | */ |
| 300 | if (need_wapbl) { |
| 301 | error = WAPBL_BEGIN(vp->v_mount); |
| 302 | if (error) { |
| 303 | fstrans_done(vp->v_mount); |
| 304 | goto out_err_free; |
| 305 | } |
| 306 | } |
| 307 | has_trans_wapbl = true; |
| 308 | } |
| 309 | |
| 310 | /* |
| 311 | * hold g_glock to prevent a race with truncate. |
| 312 | * |
| 313 | * check if our idea of v_size is still valid. |
| 314 | */ |
| 315 | |
| 316 | KASSERT(!glocked || genfs_node_wrlocked(vp)); |
| 317 | if (!glocked) { |
| 318 | if (blockalloc) { |
| 319 | genfs_node_wrlock(vp); |
| 320 | } else { |
| 321 | genfs_node_rdlock(vp); |
| 322 | } |
| 323 | } |
| 324 | mutex_enter(uobj->vmobjlock); |
| 325 | if (vp->v_size < origvsize) { |
| 326 | if (!glocked) { |
| 327 | genfs_node_unlock(vp); |
| 328 | } |
| 329 | if (pgs != pgs_onstack) |
| 330 | kmem_free(pgs, pgs_size); |
| 331 | goto startover; |
| 332 | } |
| 333 | |
| 334 | if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], |
| 335 | async ? UFP_NOWAIT : UFP_ALL) != orignmempages) { |
| 336 | if (!glocked) { |
| 337 | genfs_node_unlock(vp); |
| 338 | } |
| 339 | KASSERT(async != 0); |
| 340 | genfs_rel_pages(&pgs[ridx], orignmempages); |
| 341 | mutex_exit(uobj->vmobjlock); |
| 342 | error = EBUSY; |
| 343 | goto out_err_free; |
| 344 | } |
| 345 | |
| 346 | /* |
| 347 | * if the pages are already resident, just return them. |
| 348 | */ |
| 349 | |
| 350 | for (i = 0; i < npages; i++) { |
| 351 | struct vm_page *pg = pgs[ridx + i]; |
| 352 | |
| 353 | if ((pg->flags & PG_FAKE) || |
| 354 | (blockalloc && (pg->flags & PG_RDONLY))) { |
| 355 | break; |
| 356 | } |
| 357 | } |
| 358 | if (i == npages) { |
| 359 | if (!glocked) { |
| 360 | genfs_node_unlock(vp); |
| 361 | } |
| 362 | UVMHIST_LOG(ubchist, "returning cached pages" , 0,0,0,0); |
| 363 | npages += ridx; |
| 364 | goto out; |
| 365 | } |
| 366 | |
| 367 | /* |
| 368 | * if PGO_OVERWRITE is set, don't bother reading the pages. |
| 369 | */ |
| 370 | |
| 371 | if (overwrite) { |
| 372 | if (!glocked) { |
| 373 | genfs_node_unlock(vp); |
| 374 | } |
| 375 | UVMHIST_LOG(ubchist, "PGO_OVERWRITE" ,0,0,0,0); |
| 376 | |
| 377 | for (i = 0; i < npages; i++) { |
| 378 | struct vm_page *pg = pgs[ridx + i]; |
| 379 | |
| 380 | pg->flags &= ~(PG_RDONLY|PG_CLEAN); |
| 381 | } |
| 382 | npages += ridx; |
| 383 | goto out; |
| 384 | } |
| 385 | |
| 386 | /* |
| 387 | * the page wasn't resident and we're not overwriting, |
| 388 | * so we're going to have to do some i/o. |
| 389 | * find any additional pages needed to cover the expanded range. |
| 390 | */ |
| 391 | |
| 392 | npages = (endoffset - startoffset) >> PAGE_SHIFT; |
| 393 | if (startoffset != origoffset || npages != orignmempages) { |
| 394 | int npgs; |
| 395 | |
| 396 | /* |
| 397 | * we need to avoid deadlocks caused by locking |
| 398 | * additional pages at lower offsets than pages we |
| 399 | * already have locked. unlock them all and start over. |
| 400 | */ |
| 401 | |
| 402 | genfs_rel_pages(&pgs[ridx], orignmempages); |
| 403 | memset(pgs, 0, pgs_size); |
| 404 | |
| 405 | UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x" , |
| 406 | startoffset, endoffset, 0,0); |
| 407 | npgs = npages; |
| 408 | if (uvn_findpages(uobj, startoffset, &npgs, pgs, |
| 409 | async ? UFP_NOWAIT : UFP_ALL) != npages) { |
| 410 | if (!glocked) { |
| 411 | genfs_node_unlock(vp); |
| 412 | } |
| 413 | KASSERT(async != 0); |
| 414 | genfs_rel_pages(pgs, npages); |
| 415 | mutex_exit(uobj->vmobjlock); |
| 416 | error = EBUSY; |
| 417 | goto out_err_free; |
| 418 | } |
| 419 | } |
| 420 | |
| 421 | mutex_exit(uobj->vmobjlock); |
| 422 | error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof, |
| 423 | async, memwrite, blockalloc, glocked); |
| 424 | if (error == 0 && async) |
| 425 | goto out_err_free; |
| 426 | if (!glocked) { |
| 427 | genfs_node_unlock(vp); |
| 428 | } |
| 429 | mutex_enter(uobj->vmobjlock); |
| 430 | |
| 431 | /* |
| 432 | * we're almost done! release the pages... |
| 433 | * for errors, we free the pages. |
| 434 | * otherwise we activate them and mark them as valid and clean. |
| 435 | * also, unbusy pages that were not actually requested. |
| 436 | */ |
| 437 | |
| 438 | if (error) { |
| 439 | genfs_rel_pages(pgs, npages); |
| 440 | mutex_exit(uobj->vmobjlock); |
| 441 | UVMHIST_LOG(ubchist, "returning error %d" , error,0,0,0); |
| 442 | goto out_err_free; |
| 443 | } |
| 444 | |
| 445 | out: |
| 446 | UVMHIST_LOG(ubchist, "succeeding, npages %d" , npages,0,0,0); |
| 447 | error = 0; |
| 448 | mutex_enter(&uvm_pageqlock); |
| 449 | for (i = 0; i < npages; i++) { |
| 450 | struct vm_page *pg = pgs[i]; |
| 451 | if (pg == NULL) { |
| 452 | continue; |
| 453 | } |
| 454 | UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x" , |
| 455 | pg, pg->flags, 0,0); |
| 456 | if (pg->flags & PG_FAKE && !overwrite) { |
| 457 | pg->flags &= ~(PG_FAKE); |
| 458 | pmap_clear_modify(pgs[i]); |
| 459 | } |
| 460 | KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0); |
| 461 | if (i < ridx || i >= ridx + orignmempages || async) { |
| 462 | UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x" , |
| 463 | pg, pg->offset,0,0); |
| 464 | if (pg->flags & PG_WANTED) { |
| 465 | wakeup(pg); |
| 466 | } |
| 467 | if (pg->flags & PG_FAKE) { |
| 468 | KASSERT(overwrite); |
| 469 | uvm_pagezero(pg); |
| 470 | } |
| 471 | if (pg->flags & PG_RELEASED) { |
| 472 | uvm_pagefree(pg); |
| 473 | continue; |
| 474 | } |
| 475 | uvm_pageenqueue(pg); |
| 476 | pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE); |
| 477 | UVM_PAGE_OWN(pg, NULL); |
| 478 | } |
| 479 | } |
| 480 | mutex_exit(&uvm_pageqlock); |
| 481 | if (memwrite) { |
| 482 | genfs_markdirty(vp); |
| 483 | } |
| 484 | mutex_exit(uobj->vmobjlock); |
| 485 | if (ap->a_m != NULL) { |
| 486 | memcpy(ap->a_m, &pgs[ridx], |
| 487 | orignmempages * sizeof(struct vm_page *)); |
| 488 | } |
| 489 | |
| 490 | out_err_free: |
| 491 | if (pgs != NULL && pgs != pgs_onstack) |
| 492 | kmem_free(pgs, pgs_size); |
| 493 | out_err: |
| 494 | if (has_trans_wapbl) { |
| 495 | if (need_wapbl) |
| 496 | WAPBL_END(vp->v_mount); |
| 497 | fstrans_done(vp->v_mount); |
| 498 | } |
| 499 | return error; |
| 500 | } |
| 501 | |
| 502 | /* |
| 503 | * genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY. |
| 504 | */ |
| 505 | static int |
| 506 | genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages, |
| 507 | off_t startoffset, off_t diskeof, |
| 508 | bool async, bool memwrite, bool blockalloc, bool glocked) |
| 509 | { |
| 510 | struct uvm_object * const uobj = &vp->v_uobj; |
| 511 | const int fs_bshift = (vp->v_type != VBLK) ? |
| 512 | vp->v_mount->mnt_fs_bshift : DEV_BSHIFT; |
| 513 | const int dev_bshift = (vp->v_type != VBLK) ? |
| 514 | vp->v_mount->mnt_dev_bshift : DEV_BSHIFT; |
| 515 | kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */ |
| 516 | size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes; |
| 517 | vaddr_t kva; |
| 518 | struct buf *bp, *mbp; |
| 519 | bool sawhole = false; |
| 520 | int i; |
| 521 | int error = 0; |
| 522 | |
| 523 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
| 524 | |
| 525 | /* |
| 526 | * read the desired page(s). |
| 527 | */ |
| 528 | |
| 529 | totalbytes = npages << PAGE_SHIFT; |
| 530 | bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0)); |
| 531 | tailbytes = totalbytes - bytes; |
| 532 | skipbytes = 0; |
| 533 | |
| 534 | kva = uvm_pagermapin(pgs, npages, |
| 535 | UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK)); |
| 536 | if (kva == 0) |
| 537 | return EBUSY; |
| 538 | |
| 539 | mbp = getiobuf(vp, true); |
| 540 | mbp->b_bufsize = totalbytes; |
| 541 | mbp->b_data = (void *)kva; |
| 542 | mbp->b_resid = mbp->b_bcount = bytes; |
| 543 | mbp->b_cflags = BC_BUSY; |
| 544 | if (async) { |
| 545 | mbp->b_flags = B_READ | B_ASYNC; |
| 546 | mbp->b_iodone = uvm_aio_biodone; |
| 547 | } else { |
| 548 | mbp->b_flags = B_READ; |
| 549 | mbp->b_iodone = NULL; |
| 550 | } |
| 551 | if (async) |
| 552 | BIO_SETPRIO(mbp, BPRIO_TIMELIMITED); |
| 553 | else |
| 554 | BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL); |
| 555 | |
| 556 | /* |
| 557 | * if EOF is in the middle of the range, zero the part past EOF. |
| 558 | * skip over pages which are not PG_FAKE since in that case they have |
| 559 | * valid data that we need to preserve. |
| 560 | */ |
| 561 | |
| 562 | tailstart = bytes; |
| 563 | while (tailbytes > 0) { |
| 564 | const int len = PAGE_SIZE - (tailstart & PAGE_MASK); |
| 565 | |
| 566 | KASSERT(len <= tailbytes); |
| 567 | if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) { |
| 568 | memset((void *)(kva + tailstart), 0, len); |
| 569 | UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x" , |
| 570 | kva, tailstart, len, 0); |
| 571 | } |
| 572 | tailstart += len; |
| 573 | tailbytes -= len; |
| 574 | } |
| 575 | |
| 576 | /* |
| 577 | * now loop over the pages, reading as needed. |
| 578 | */ |
| 579 | |
| 580 | bp = NULL; |
| 581 | off_t offset; |
| 582 | for (offset = startoffset; |
| 583 | bytes > 0; |
| 584 | offset += iobytes, bytes -= iobytes) { |
| 585 | int run; |
| 586 | daddr_t lbn, blkno; |
| 587 | int pidx; |
| 588 | struct vnode *devvp; |
| 589 | |
| 590 | /* |
| 591 | * skip pages which don't need to be read. |
| 592 | */ |
| 593 | |
| 594 | pidx = (offset - startoffset) >> PAGE_SHIFT; |
| 595 | while ((pgs[pidx]->flags & PG_FAKE) == 0) { |
| 596 | size_t b; |
| 597 | |
| 598 | KASSERT((offset & (PAGE_SIZE - 1)) == 0); |
| 599 | if ((pgs[pidx]->flags & PG_RDONLY)) { |
| 600 | sawhole = true; |
| 601 | } |
| 602 | b = MIN(PAGE_SIZE, bytes); |
| 603 | offset += b; |
| 604 | bytes -= b; |
| 605 | skipbytes += b; |
| 606 | pidx++; |
| 607 | UVMHIST_LOG(ubchist, "skipping, new offset 0x%x" , |
| 608 | offset, 0,0,0); |
| 609 | if (bytes == 0) { |
| 610 | goto loopdone; |
| 611 | } |
| 612 | } |
| 613 | |
| 614 | /* |
| 615 | * bmap the file to find out the blkno to read from and |
| 616 | * how much we can read in one i/o. if bmap returns an error, |
| 617 | * skip the rest of the top-level i/o. |
| 618 | */ |
| 619 | |
| 620 | lbn = offset >> fs_bshift; |
| 621 | error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); |
| 622 | if (error) { |
| 623 | UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n" , |
| 624 | lbn,error,0,0); |
| 625 | skipbytes += bytes; |
| 626 | bytes = 0; |
| 627 | goto loopdone; |
| 628 | } |
| 629 | |
| 630 | /* |
| 631 | * see how many pages can be read with this i/o. |
| 632 | * reduce the i/o size if necessary to avoid |
| 633 | * overwriting pages with valid data. |
| 634 | */ |
| 635 | |
| 636 | iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, |
| 637 | bytes); |
| 638 | if (offset + iobytes > round_page(offset)) { |
| 639 | int pcount; |
| 640 | |
| 641 | pcount = 1; |
| 642 | while (pidx + pcount < npages && |
| 643 | pgs[pidx + pcount]->flags & PG_FAKE) { |
| 644 | pcount++; |
| 645 | } |
| 646 | iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) - |
| 647 | (offset - trunc_page(offset))); |
| 648 | } |
| 649 | |
| 650 | /* |
| 651 | * if this block isn't allocated, zero it instead of |
| 652 | * reading it. unless we are going to allocate blocks, |
| 653 | * mark the pages we zeroed PG_RDONLY. |
| 654 | */ |
| 655 | |
| 656 | if (blkno == (daddr_t)-1) { |
| 657 | int holepages = (round_page(offset + iobytes) - |
| 658 | trunc_page(offset)) >> PAGE_SHIFT; |
| 659 | UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE" , lbn,0,0,0); |
| 660 | |
| 661 | sawhole = true; |
| 662 | memset((char *)kva + (offset - startoffset), 0, |
| 663 | iobytes); |
| 664 | skipbytes += iobytes; |
| 665 | |
| 666 | mutex_enter(uobj->vmobjlock); |
| 667 | for (i = 0; i < holepages; i++) { |
| 668 | if (memwrite) { |
| 669 | pgs[pidx + i]->flags &= ~PG_CLEAN; |
| 670 | } |
| 671 | if (!blockalloc) { |
| 672 | pgs[pidx + i]->flags |= PG_RDONLY; |
| 673 | } |
| 674 | } |
| 675 | mutex_exit(uobj->vmobjlock); |
| 676 | continue; |
| 677 | } |
| 678 | |
| 679 | /* |
| 680 | * allocate a sub-buf for this piece of the i/o |
| 681 | * (or just use mbp if there's only 1 piece), |
| 682 | * and start it going. |
| 683 | */ |
| 684 | |
| 685 | if (offset == startoffset && iobytes == bytes) { |
| 686 | bp = mbp; |
| 687 | } else { |
| 688 | UVMHIST_LOG(ubchist, "vp %p bp %p num now %d" , |
| 689 | vp, bp, vp->v_numoutput, 0); |
| 690 | bp = getiobuf(vp, true); |
| 691 | nestiobuf_setup(mbp, bp, offset - startoffset, iobytes); |
| 692 | } |
| 693 | bp->b_lblkno = 0; |
| 694 | |
| 695 | /* adjust physical blkno for partial blocks */ |
| 696 | bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> |
| 697 | dev_bshift); |
| 698 | |
| 699 | UVMHIST_LOG(ubchist, |
| 700 | "bp %p offset 0x%x bcount 0x%x blkno 0x%x" , |
| 701 | bp, offset, bp->b_bcount, bp->b_blkno); |
| 702 | |
| 703 | VOP_STRATEGY(devvp, bp); |
| 704 | } |
| 705 | |
| 706 | loopdone: |
| 707 | nestiobuf_done(mbp, skipbytes, error); |
| 708 | if (async) { |
| 709 | UVMHIST_LOG(ubchist, "returning 0 (async)" ,0,0,0,0); |
| 710 | if (!glocked) { |
| 711 | genfs_node_unlock(vp); |
| 712 | } |
| 713 | return 0; |
| 714 | } |
| 715 | if (bp != NULL) { |
| 716 | error = biowait(mbp); |
| 717 | } |
| 718 | |
| 719 | /* Remove the mapping (make KVA available as soon as possible) */ |
| 720 | uvm_pagermapout(kva, npages); |
| 721 | |
| 722 | /* |
| 723 | * if we encountered a hole then we have to do a little more work. |
| 724 | * for read faults, we marked the page PG_RDONLY so that future |
| 725 | * write accesses to the page will fault again. |
| 726 | * for write faults, we must make sure that the backing store for |
| 727 | * the page is completely allocated while the pages are locked. |
| 728 | */ |
| 729 | |
| 730 | if (!error && sawhole && blockalloc) { |
| 731 | error = GOP_ALLOC(vp, startoffset, |
| 732 | npages << PAGE_SHIFT, 0, cred); |
| 733 | UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d" , |
| 734 | startoffset, npages << PAGE_SHIFT, error,0); |
| 735 | if (!error) { |
| 736 | mutex_enter(uobj->vmobjlock); |
| 737 | for (i = 0; i < npages; i++) { |
| 738 | struct vm_page *pg = pgs[i]; |
| 739 | |
| 740 | if (pg == NULL) { |
| 741 | continue; |
| 742 | } |
| 743 | pg->flags &= ~(PG_CLEAN|PG_RDONLY); |
| 744 | UVMHIST_LOG(ubchist, "mark dirty pg %p" , |
| 745 | pg,0,0,0); |
| 746 | } |
| 747 | mutex_exit(uobj->vmobjlock); |
| 748 | } |
| 749 | } |
| 750 | |
| 751 | putiobuf(mbp); |
| 752 | return error; |
| 753 | } |
| 754 | |
| 755 | /* |
| 756 | * generic VM putpages routine. |
| 757 | * Write the given range of pages to backing store. |
| 758 | * |
| 759 | * => "offhi == 0" means flush all pages at or after "offlo". |
| 760 | * => object should be locked by caller. we return with the |
| 761 | * object unlocked. |
| 762 | * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O). |
| 763 | * thus, a caller might want to unlock higher level resources |
| 764 | * (e.g. vm_map) before calling flush. |
| 765 | * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block |
| 766 | * => if PGO_ALLPAGES is set, then all pages in the object will be processed. |
| 767 | * => NOTE: we rely on the fact that the object's memq is a TAILQ and |
| 768 | * that new pages are inserted on the tail end of the list. thus, |
| 769 | * we can make a complete pass through the object in one go by starting |
| 770 | * at the head and working towards the tail (new pages are put in |
| 771 | * front of us). |
| 772 | * => NOTE: we are allowed to lock the page queues, so the caller |
| 773 | * must not be holding the page queue lock. |
| 774 | * |
| 775 | * note on "cleaning" object and PG_BUSY pages: |
| 776 | * this routine is holding the lock on the object. the only time |
| 777 | * that it can run into a PG_BUSY page that it does not own is if |
| 778 | * some other process has started I/O on the page (e.g. either |
| 779 | * a pagein, or a pageout). if the PG_BUSY page is being paged |
| 780 | * in, then it can not be dirty (!PG_CLEAN) because no one has |
| 781 | * had a chance to modify it yet. if the PG_BUSY page is being |
| 782 | * paged out then it means that someone else has already started |
| 783 | * cleaning the page for us (how nice!). in this case, if we |
| 784 | * have syncio specified, then after we make our pass through the |
| 785 | * object we need to wait for the other PG_BUSY pages to clear |
| 786 | * off (i.e. we need to do an iosync). also note that once a |
| 787 | * page is PG_BUSY it must stay in its object until it is un-busyed. |
| 788 | * |
| 789 | * note on page traversal: |
| 790 | * we can traverse the pages in an object either by going down the |
| 791 | * linked list in "uobj->memq", or we can go over the address range |
| 792 | * by page doing hash table lookups for each address. depending |
| 793 | * on how many pages are in the object it may be cheaper to do one |
| 794 | * or the other. we set "by_list" to true if we are using memq. |
| 795 | * if the cost of a hash lookup was equal to the cost of the list |
| 796 | * traversal we could compare the number of pages in the start->stop |
| 797 | * range to the total number of pages in the object. however, it |
| 798 | * seems that a hash table lookup is more expensive than the linked |
| 799 | * list traversal, so we multiply the number of pages in the |
| 800 | * range by an estimate of the relatively higher cost of the hash lookup. |
| 801 | */ |
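| | /* |
| | * For example (a sketch, not lifted from any particular file system), a |
| | * caller that wants to flush and free every cached page of a vnode |
| | * synchronously would do something like: |
| | * |
| | *	mutex_enter(vp->v_interlock); |
| | *	error = VOP_PUTPAGES(vp, 0, 0, PGO_CLEANIT | PGO_FREE | PGO_SYNCIO); |
| | * |
| | * note that the caller enters the object lock and this routine exits it. |
| | */ |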
| 802 | |
| 803 | int |
| 804 | genfs_putpages(void *v) |
| 805 | { |
| 806 | struct vop_putpages_args /* { |
| 807 | struct vnode *a_vp; |
| 808 | voff_t a_offlo; |
| 809 | voff_t a_offhi; |
| 810 | int a_flags; |
| 811 | } */ * const ap = v; |
| 812 | |
| 813 | return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi, |
| 814 | ap->a_flags, NULL); |
| 815 | } |
| 816 | |
| 817 | int |
| 818 | genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff, |
| 819 | int origflags, struct vm_page **busypg) |
| 820 | { |
| 821 | struct uvm_object * const uobj = &vp->v_uobj; |
| 822 | kmutex_t * const slock = uobj->vmobjlock; |
| 823 | off_t off; |
| 824 | int i, error, npages, nback; |
| 825 | int freeflag; |
| 826 | /* |
| 827 | * This array is larger than it needs to be so that its size is a |
| 828 | * compile-time constant.  The right size is MAXPAGES. |
| 829 | */ |
| 830 | struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE]; |
| 831 | #define MAXPAGES (MAXPHYS / PAGE_SIZE) |
| 832 | struct vm_page *pg, *nextpg, *tpg, curmp, endmp; |
| 833 | bool wasclean, by_list, needs_clean, yld; |
| 834 | bool async = (origflags & PGO_SYNCIO) == 0; |
| 835 | bool pagedaemon = curlwp == uvm.pagedaemon_lwp; |
| 836 | struct lwp * const l = curlwp ? curlwp : &lwp0; |
| 837 | struct genfs_node * const gp = VTOG(vp); |
| 838 | int flags; |
| 839 | int dirtygen; |
| 840 | bool modified; |
| 841 | bool need_wapbl; |
| 842 | bool has_trans; |
| 843 | bool cleanall; |
| 844 | bool onworklst; |
| 845 | |
| 846 | UVMHIST_FUNC("genfs_putpages" ); UVMHIST_CALLED(ubchist); |
| 847 | |
| 848 | KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)); |
| 849 | KASSERT((startoff & PAGE_MASK) == 0 && (endoff & PAGE_MASK) == 0); |
| 850 | KASSERT(startoff < endoff || endoff == 0); |
| 851 | |
| 852 | UVMHIST_LOG(ubchist, "vp %p pages %d off 0x%x len 0x%x" , |
| 853 | vp, uobj->uo_npages, startoff, endoff - startoff); |
| 854 | |
| 855 | has_trans = false; |
| 856 | need_wapbl = (!pagedaemon && vp->v_mount && vp->v_mount->mnt_wapbl && |
| 857 | (origflags & PGO_JOURNALLOCKED) == 0); |
| 858 | |
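| | /* |
| | * retry: re-examine the object from scratch.  we come back here after |
| | * dropping the object lock to start a transaction, and when PGO_RECLAIM |
| | * requires another pass to leave the vnode clean. |
| | */ |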
| 859 | retry: |
| 860 | modified = false; |
| 861 | flags = origflags; |
| 862 | KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || |
| 863 | (vp->v_iflag & VI_WRMAPDIRTY) == 0); |
| 864 | if (uobj->uo_npages == 0) { |
| 865 | if (vp->v_iflag & VI_ONWORKLST) { |
| 866 | vp->v_iflag &= ~VI_WRMAPDIRTY; |
| 867 | if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) |
| 868 | vn_syncer_remove_from_worklist(vp); |
| 869 | } |
| 870 | if (has_trans) { |
| 871 | if (need_wapbl) |
| 872 | WAPBL_END(vp->v_mount); |
| 873 | fstrans_done(vp->v_mount); |
| 874 | } |
| 875 | mutex_exit(slock); |
| 876 | return (0); |
| 877 | } |
| 878 | |
| 879 | /* |
| 880 | * the vnode has pages, set up to process the request. |
| 881 | */ |
| 882 | |
| 883 | if (!has_trans && (flags & PGO_CLEANIT) != 0) { |
| 884 | mutex_exit(slock); |
| 885 | if (pagedaemon) { |
| 886 | error = fstrans_start_nowait(vp->v_mount, FSTRANS_LAZY); |
| 887 | if (error) |
| 888 | return error; |
| 889 | } else |
| 890 | fstrans_start(vp->v_mount, FSTRANS_LAZY); |
| 891 | if (need_wapbl) { |
| 892 | error = WAPBL_BEGIN(vp->v_mount); |
| 893 | if (error) { |
| 894 | fstrans_done(vp->v_mount); |
| 895 | return error; |
| 896 | } |
| 897 | } |
| 898 | has_trans = true; |
| 899 | mutex_enter(slock); |
| 900 | goto retry; |
| 901 | } |
| 902 | |
| 903 | error = 0; |
| 904 | wasclean = (vp->v_numoutput == 0); |
| 905 | off = startoff; |
| 906 | if (endoff == 0 || flags & PGO_ALLPAGES) { |
| 907 | endoff = trunc_page(LLONG_MAX); |
| 908 | } |
| 909 | by_list = (uobj->uo_npages <= |
| 910 | ((endoff - startoff) >> PAGE_SHIFT) * UVM_PAGE_TREE_PENALTY); |
| 911 | |
| 912 | /* |
| 913 | * if this vnode is known not to have dirty pages, |
| 914 | * don't bother to clean it out. |
| 915 | */ |
| 916 | |
| 917 | if ((vp->v_iflag & VI_ONWORKLST) == 0) { |
| 918 | #if !defined(DEBUG) |
| 919 | if ((flags & (PGO_FREE|PGO_DEACTIVATE)) == 0) { |
| 920 | goto skip_scan; |
| 921 | } |
| 922 | #endif /* !defined(DEBUG) */ |
| 923 | flags &= ~PGO_CLEANIT; |
| 924 | } |
| 925 | |
| 926 | /* |
| 927 | * start the loop. when scanning by list, hold the last page |
| 928 | * in the list before we start. pages allocated after we start |
| 929 | * will be added to the end of the list, so we can stop at the |
| 930 | * current last page. |
| 931 | */ |
| 932 | |
| 933 | cleanall = (flags & PGO_CLEANIT) != 0 && wasclean && |
| 934 | startoff == 0 && endoff == trunc_page(LLONG_MAX) && |
| 935 | (vp->v_iflag & VI_ONWORKLST) != 0; |
| 936 | dirtygen = gp->g_dirtygen; |
| 937 | freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED; |
| 938 | if (by_list) { |
| 939 | curmp.flags = PG_MARKER; |
| 940 | endmp.flags = PG_MARKER; |
| 941 | pg = TAILQ_FIRST(&uobj->memq); |
| 942 | TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq.queue); |
| 943 | } else { |
| 944 | pg = uvm_pagelookup(uobj, off); |
| 945 | } |
| 946 | nextpg = NULL; |
| 947 | while (by_list || off < endoff) { |
| 948 | |
| 949 | /* |
| 950 | * if the current page is not interesting, move on to the next. |
| 951 | */ |
| 952 | |
| 953 | KASSERT(pg == NULL || pg->uobject == uobj || |
| 954 | (pg->flags & PG_MARKER) != 0); |
| 955 | KASSERT(pg == NULL || |
| 956 | (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 || |
| 957 | (pg->flags & (PG_BUSY|PG_MARKER)) != 0); |
| 958 | if (by_list) { |
| 959 | if (pg == &endmp) { |
| 960 | break; |
| 961 | } |
| 962 | if (pg->flags & PG_MARKER) { |
| 963 | pg = TAILQ_NEXT(pg, listq.queue); |
| 964 | continue; |
| 965 | } |
| 966 | if (pg->offset < startoff || pg->offset >= endoff || |
| 967 | pg->flags & (PG_RELEASED|PG_PAGEOUT)) { |
| 968 | if (pg->flags & (PG_RELEASED|PG_PAGEOUT)) { |
| 969 | wasclean = false; |
| 970 | } |
| 971 | pg = TAILQ_NEXT(pg, listq.queue); |
| 972 | continue; |
| 973 | } |
| 974 | off = pg->offset; |
| 975 | } else if (pg == NULL || pg->flags & (PG_RELEASED|PG_PAGEOUT)) { |
| 976 | if (pg != NULL) { |
| 977 | wasclean = false; |
| 978 | } |
| 979 | off += PAGE_SIZE; |
| 980 | if (off < endoff) { |
| 981 | pg = uvm_pagelookup(uobj, off); |
| 982 | } |
| 983 | continue; |
| 984 | } |
| 985 | |
| 986 | /* |
| 987 | * if the current page needs to be cleaned and it's busy, |
| 988 | * wait for it to become unbusy. |
| 989 | */ |
| 990 | |
| 991 | yld = (l->l_cpu->ci_schedstate.spc_flags & |
| 992 | SPCF_SHOULDYIELD) && !pagedaemon; |
| 993 | if (pg->flags & PG_BUSY || yld) { |
| 994 | UVMHIST_LOG(ubchist, "busy %p" , pg,0,0,0); |
| 995 | if (flags & PGO_BUSYFAIL && pg->flags & PG_BUSY) { |
| 996 | UVMHIST_LOG(ubchist, "busyfail %p" , pg, 0,0,0); |
| 997 | error = EDEADLK; |
| 998 | if (busypg != NULL) |
| 999 | *busypg = pg; |
| 1000 | break; |
| 1001 | } |
| 1002 | if (pagedaemon) { |
| 1003 | /* |
| 1004 | * someone has taken the page while we |
| 1005 | * dropped the lock for fstrans_start. |
| 1006 | */ |
| 1007 | break; |
| 1008 | } |
| 1009 | if (by_list) { |
| 1010 | TAILQ_INSERT_BEFORE(pg, &curmp, listq.queue); |
| 1011 | UVMHIST_LOG(ubchist, "curmp next %p" , |
| 1012 | TAILQ_NEXT(&curmp, listq.queue), 0,0,0); |
| 1013 | } |
| 1014 | if (yld) { |
| 1015 | mutex_exit(slock); |
| 1016 | preempt(); |
| 1017 | mutex_enter(slock); |
| 1018 | } else { |
| 1019 | pg->flags |= PG_WANTED; |
| 1020 | UVM_UNLOCK_AND_WAIT(pg, slock, 0, "genput" , 0); |
| 1021 | mutex_enter(slock); |
| 1022 | } |
| 1023 | if (by_list) { |
| 1024 | UVMHIST_LOG(ubchist, "after next %p" , |
| 1025 | TAILQ_NEXT(&curmp, listq.queue), 0,0,0); |
| 1026 | pg = TAILQ_NEXT(&curmp, listq.queue); |
| 1027 | TAILQ_REMOVE(&uobj->memq, &curmp, listq.queue); |
| 1028 | } else { |
| 1029 | pg = uvm_pagelookup(uobj, off); |
| 1030 | } |
| 1031 | continue; |
| 1032 | } |
| 1033 | |
| 1034 | /* |
| 1035 | * if we're freeing, remove all mappings of the page now. |
| 1036 | * if we're cleaning, check whether the page needs to be cleaned. |
| 1037 | */ |
| 1038 | |
| 1039 | if (flags & PGO_FREE) { |
| 1040 | pmap_page_protect(pg, VM_PROT_NONE); |
| 1041 | } else if (flags & PGO_CLEANIT) { |
| 1042 | |
| 1043 | /* |
| 1044 | * if we still have some hope of pulling this vnode off |
| 1045 | * the syncer queue, write-protect the page. |
| 1046 | */ |
| 1047 | |
| 1048 | if (cleanall && wasclean && |
| 1049 | gp->g_dirtygen == dirtygen) { |
| 1050 | |
| 1051 | /* |
| 1052 | * uobj pages get wired only by uvm_fault |
| 1053 | * where uobj is locked. |
| 1054 | */ |
| 1055 | |
| 1056 | if (pg->wire_count == 0) { |
| 1057 | pmap_page_protect(pg, |
| 1058 | VM_PROT_READ|VM_PROT_EXECUTE); |
| 1059 | } else { |
| 1060 | cleanall = false; |
| 1061 | } |
| 1062 | } |
| 1063 | } |
| 1064 | |
| 1065 | if (flags & PGO_CLEANIT) { |
| 1066 | needs_clean = pmap_clear_modify(pg) || |
| 1067 | (pg->flags & PG_CLEAN) == 0; |
| 1068 | pg->flags |= PG_CLEAN; |
| 1069 | } else { |
| 1070 | needs_clean = false; |
| 1071 | } |
| 1072 | |
| 1073 | /* |
| 1074 | * if we're cleaning, build a cluster. |
| 1075 | * the cluster will consist of pages which are currently dirty, |
| 1076 | * but they will be returned to us marked clean. |
| 1077 | * if not cleaning, just operate on the one page. |
| 1078 | */ |
| 1079 | |
| 1080 | if (needs_clean) { |
| 1081 | KDASSERT((vp->v_iflag & VI_ONWORKLST)); |
| 1082 | wasclean = false; |
| 1083 | memset(pgs, 0, sizeof(pgs)); |
| 1084 | pg->flags |= PG_BUSY; |
| 1085 | UVM_PAGE_OWN(pg, "genfs_putpages" ); |
| 1086 | |
| 1087 | /* |
| 1088 | * first look backward. |
| 1089 | */ |
| 1090 | |
| 1091 | npages = MIN(MAXPAGES >> 1, off >> PAGE_SHIFT); |
| 1092 | nback = npages; |
| 1093 | uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0], |
| 1094 | UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD); |
| 1095 | if (nback) { |
| 1096 | memmove(&pgs[0], &pgs[npages - nback], |
| 1097 | nback * sizeof(pgs[0])); |
| 1098 | if (npages - nback < nback) |
| 1099 | memset(&pgs[nback], 0, |
| 1100 | (npages - nback) * sizeof(pgs[0])); |
| 1101 | else |
| 1102 | memset(&pgs[npages - nback], 0, |
| 1103 | nback * sizeof(pgs[0])); |
| 1104 | } |
| 1105 | |
| 1106 | /* |
| 1107 | * then plug in our page of interest. |
| 1108 | */ |
| 1109 | |
| 1110 | pgs[nback] = pg; |
| 1111 | |
| 1112 | /* |
| 1113 | * then look forward to fill in the remaining space in |
| 1114 | * the array of pages. |
| 1115 | */ |
| 1116 | |
| 1117 | npages = MAXPAGES - nback - 1; |
| 1118 | uvn_findpages(uobj, off + PAGE_SIZE, &npages, |
| 1119 | &pgs[nback + 1], |
| 1120 | UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY); |
| 1121 | npages += nback + 1; |
| 1122 | } else { |
| 1123 | pgs[0] = pg; |
| 1124 | npages = 1; |
| 1125 | nback = 0; |
| 1126 | } |
| 1127 | |
| 1128 | /* |
| 1129 | * apply FREE or DEACTIVATE options if requested. |
| 1130 | */ |
| 1131 | |
| 1132 | if (flags & (PGO_DEACTIVATE|PGO_FREE)) { |
| 1133 | mutex_enter(&uvm_pageqlock); |
| 1134 | } |
| 1135 | for (i = 0; i < npages; i++) { |
| 1136 | tpg = pgs[i]; |
| 1137 | KASSERT(tpg->uobject == uobj); |
| 1138 | if (by_list && tpg == TAILQ_NEXT(pg, listq.queue)) |
| 1139 | pg = tpg; |
| 1140 | if (tpg->offset < startoff || tpg->offset >= endoff) |
| 1141 | continue; |
| 1142 | if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) { |
| 1143 | uvm_pagedeactivate(tpg); |
| 1144 | } else if (flags & PGO_FREE) { |
| 1145 | pmap_page_protect(tpg, VM_PROT_NONE); |
| 1146 | if (tpg->flags & PG_BUSY) { |
| 1147 | tpg->flags |= freeflag; |
| 1148 | if (pagedaemon) { |
| 1149 | uvm_pageout_start(1); |
| 1150 | uvm_pagedequeue(tpg); |
| 1151 | } |
| 1152 | } else { |
| 1153 | |
| 1154 | /* |
| 1155 | * ``page is not busy'' |
| 1156 | * implies that npages is 1 |
| 1157 | * and needs_clean is false. |
| 1158 | */ |
| 1159 | |
| 1160 | nextpg = TAILQ_NEXT(tpg, listq.queue); |
| 1161 | uvm_pagefree(tpg); |
| 1162 | if (pagedaemon) |
| 1163 | uvmexp.pdfreed++; |
| 1164 | } |
| 1165 | } |
| 1166 | } |
| 1167 | if (flags & (PGO_DEACTIVATE|PGO_FREE)) { |
| 1168 | mutex_exit(&uvm_pageqlock); |
| 1169 | } |
| 1170 | if (needs_clean) { |
| 1171 | modified = true; |
| 1172 | |
| 1173 | /* |
| 1174 | * start the i/o. if we're traversing by list, |
| 1175 | * keep our place in the list with a marker page. |
| 1176 | */ |
| 1177 | |
| 1178 | if (by_list) { |
| 1179 | TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp, |
| 1180 | listq.queue); |
| 1181 | } |
| 1182 | mutex_exit(slock); |
| 1183 | error = GOP_WRITE(vp, pgs, npages, flags); |
| 1184 | mutex_enter(slock); |
| 1185 | if (by_list) { |
| 1186 | pg = TAILQ_NEXT(&curmp, listq.queue); |
| 1187 | TAILQ_REMOVE(&uobj->memq, &curmp, listq.queue); |
| 1188 | } |
| 1189 | if (error) { |
| 1190 | break; |
| 1191 | } |
| 1192 | if (by_list) { |
| 1193 | continue; |
| 1194 | } |
| 1195 | } |
| 1196 | |
| 1197 | /* |
| 1198 | * find the next page and continue if there was no error. |
| 1199 | */ |
| 1200 | |
| 1201 | if (by_list) { |
| 1202 | if (nextpg) { |
| 1203 | pg = nextpg; |
| 1204 | nextpg = NULL; |
| 1205 | } else { |
| 1206 | pg = TAILQ_NEXT(pg, listq.queue); |
| 1207 | } |
| 1208 | } else { |
| 1209 | off += (npages - nback) << PAGE_SHIFT; |
| 1210 | if (off < endoff) { |
| 1211 | pg = uvm_pagelookup(uobj, off); |
| 1212 | } |
| 1213 | } |
| 1214 | } |
| 1215 | if (by_list) { |
| 1216 | TAILQ_REMOVE(&uobj->memq, &endmp, listq.queue); |
| 1217 | } |
| 1218 | |
| 1219 | if (modified && (vp->v_iflag & VI_WRMAPDIRTY) != 0 && |
| 1220 | (vp->v_type != VBLK || |
| 1221 | (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) { |
| 1222 | GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED); |
| 1223 | } |
| 1224 | |
| 1225 | /* |
| 1226 | * if we're cleaning and there was nothing to clean, |
| 1227 | * take us off the syncer list. if we started any i/o |
| 1228 | * and we're doing sync i/o, wait for all writes to finish. |
| 1229 | */ |
| 1230 | |
| 1231 | if (cleanall && wasclean && gp->g_dirtygen == dirtygen && |
| 1232 | (vp->v_iflag & VI_ONWORKLST) != 0) { |
| 1233 | #if defined(DEBUG) |
| 1234 | TAILQ_FOREACH(pg, &uobj->memq, listq.queue) { |
| 1235 | if ((pg->flags & (PG_FAKE | PG_MARKER)) != 0) { |
| 1236 | continue; |
| 1237 | } |
| 1238 | if ((pg->flags & PG_CLEAN) == 0) { |
| 1239 | printf("%s: %p: !CLEAN\n" , __func__, pg); |
| 1240 | } |
| 1241 | if (pmap_is_modified(pg)) { |
| 1242 | printf("%s: %p: modified\n" , __func__, pg); |
| 1243 | } |
| 1244 | } |
| 1245 | #endif /* defined(DEBUG) */ |
| 1246 | vp->v_iflag &= ~VI_WRMAPDIRTY; |
| 1247 | if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) |
| 1248 | vn_syncer_remove_from_worklist(vp); |
| 1249 | } |
| 1250 | |
| 1251 | #if !defined(DEBUG) |
| 1252 | skip_scan: |
| 1253 | #endif /* !defined(DEBUG) */ |
| 1254 | |
| 1255 | /* Wait for output to complete. */ |
| 1256 | if (!wasclean && !async && vp->v_numoutput != 0) { |
| 1257 | while (vp->v_numoutput != 0) |
| 1258 | cv_wait(&vp->v_cv, slock); |
| 1259 | } |
| 1260 | onworklst = (vp->v_iflag & VI_ONWORKLST) != 0; |
| 1261 | mutex_exit(slock); |
| 1262 | |
| 1263 | if ((flags & PGO_RECLAIM) != 0 && onworklst) { |
| 1264 | /* |
| 1265 | * in the case of PGO_RECLAIM, make sure the vnode ends up clean. |
| 1266 | * retrying is not a big deal because, in many cases, |
| 1267 | * uobj->uo_npages is already 0 here. |
| 1268 | */ |
| 1269 | mutex_enter(slock); |
| 1270 | goto retry; |
| 1271 | } |
| 1272 | |
| 1273 | if (has_trans) { |
| 1274 | if (need_wapbl) |
| 1275 | WAPBL_END(vp->v_mount); |
| 1276 | fstrans_done(vp->v_mount); |
| 1277 | } |
| 1278 | |
| 1279 | return (error); |
| 1280 | } |
| 1281 | |
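| | /* |
| | * genfs_gop_write: default GOP_WRITE() backend.  map the (busy) pages |
| | * into kernel memory and write them out via genfs_do_io(). |
| | */ |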
| 1282 | int |
| 1283 | genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) |
| 1284 | { |
| 1285 | off_t off; |
| 1286 | vaddr_t kva; |
| 1287 | size_t len; |
| 1288 | int error; |
| 1289 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
| 1290 | |
| 1291 | UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x" , |
| 1292 | vp, pgs, npages, flags); |
| 1293 | |
| 1294 | off = pgs[0]->offset; |
| 1295 | kva = uvm_pagermapin(pgs, npages, |
| 1296 | UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); |
| 1297 | len = npages << PAGE_SHIFT; |
| 1298 | |
| 1299 | error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, |
| 1300 | uvm_aio_biodone); |
| 1301 | |
| 1302 | return error; |
| 1303 | } |
| 1304 | |
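| | /* |
| | * genfs_gop_write_rwmap: like genfs_gop_write(), but map the pages with |
| | * UVMPAGER_MAPIN_READ, i.e. with a writable kernel mapping, presumably |
| | * for file systems that need to modify the pages while the write is in |
| | * progress. |
| | */ |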
| 1305 | int |
| 1306 | genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages, int flags) |
| 1307 | { |
| 1308 | off_t off; |
| 1309 | vaddr_t kva; |
| 1310 | size_t len; |
| 1311 | int error; |
| 1312 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
| 1313 | |
| 1314 | UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x" , |
| 1315 | vp, pgs, npages, flags); |
| 1316 | |
| 1317 | off = pgs[0]->offset; |
| 1318 | kva = uvm_pagermapin(pgs, npages, |
| 1319 | UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK); |
| 1320 | len = npages << PAGE_SHIFT; |
| 1321 | |
| 1322 | error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, |
| 1323 | uvm_aio_biodone); |
| 1324 | |
| 1325 | return error; |
| 1326 | } |
| 1327 | |
| 1328 | /* |
| 1329 | * Backend routine for doing I/O to vnode pages. Pages are already locked |
| 1330 | * and mapped into kernel memory. Here we just look up the underlying |
| 1331 | * device block addresses and call the strategy routine. |
| 1332 | */ |
| 1333 | |
| 1334 | static int |
| 1335 | genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags, |
| 1336 | enum uio_rw rw, void (*iodone)(struct buf *)) |
| 1337 | { |
| 1338 | int s, error; |
| 1339 | int fs_bshift, dev_bshift; |
| 1340 | off_t eof, offset, startoffset; |
| 1341 | size_t bytes, iobytes, skipbytes; |
| 1342 | struct buf *mbp, *bp; |
| 1343 | const bool async = (flags & PGO_SYNCIO) == 0; |
| 1344 | const bool lazy = (flags & PGO_LAZY) != 0; |
| 1345 | const bool iowrite = rw == UIO_WRITE; |
| 1346 | const int brw = iowrite ? B_WRITE : B_READ; |
| 1347 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
| 1348 | |
| 1349 | UVMHIST_LOG(ubchist, "vp %p kva %p len 0x%x flags 0x%x" , |
| 1350 | vp, kva, len, flags); |
| 1351 | |
| 1352 | KASSERT(vp->v_size <= vp->v_writesize); |
| 1353 | GOP_SIZE(vp, vp->v_writesize, &eof, 0); |
| 1354 | if (vp->v_type != VBLK) { |
| 1355 | fs_bshift = vp->v_mount->mnt_fs_bshift; |
| 1356 | dev_bshift = vp->v_mount->mnt_dev_bshift; |
| 1357 | } else { |
| 1358 | fs_bshift = DEV_BSHIFT; |
| 1359 | dev_bshift = DEV_BSHIFT; |
| 1360 | } |
| 1361 | error = 0; |
| 1362 | startoffset = off; |
| 1363 | bytes = MIN(len, eof - startoffset); |
| 1364 | skipbytes = 0; |
| 1365 | KASSERT(bytes != 0); |
| 1366 | |
| 1367 | if (iowrite) { |
| 1368 | mutex_enter(vp->v_interlock); |
| 1369 | vp->v_numoutput += 2; |
| 1370 | mutex_exit(vp->v_interlock); |
| 1371 | } |
| 1372 | mbp = getiobuf(vp, true); |
| 1373 | UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x" , |
| 1374 | vp, mbp, vp->v_numoutput, bytes); |
| 1375 | mbp->b_bufsize = len; |
| 1376 | mbp->b_data = (void *)kva; |
| 1377 | mbp->b_resid = mbp->b_bcount = bytes; |
| 1378 | mbp->b_cflags = BC_BUSY | BC_AGE; |
| 1379 | if (async) { |
| 1380 | mbp->b_flags = brw | B_ASYNC; |
| 1381 | mbp->b_iodone = iodone; |
| 1382 | } else { |
| 1383 | mbp->b_flags = brw; |
| 1384 | mbp->b_iodone = NULL; |
| 1385 | } |
| 1386 | if (curlwp == uvm.pagedaemon_lwp) |
| 1387 | BIO_SETPRIO(mbp, BPRIO_TIMELIMITED); |
| 1388 | else if (async || lazy) |
| 1389 | BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL); |
| 1390 | else |
| 1391 | BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL); |
| 1392 | |
| 1393 | bp = NULL; |
| 1394 | for (offset = startoffset; |
| 1395 | bytes > 0; |
| 1396 | offset += iobytes, bytes -= iobytes) { |
| 1397 | int run; |
| 1398 | daddr_t lbn, blkno; |
| 1399 | struct vnode *devvp; |
| 1400 | |
| 1401 | /* |
| 1402 | * bmap the file to find out the blkno of the underlying device |
| 1403 | * block and how much we can transfer in one i/o.  if bmap |
| 1404 | * returns an error, skip the rest of the top-level i/o. |
| 1405 | */ |
| 1406 | |
| 1407 | lbn = offset >> fs_bshift; |
| 1408 | error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); |
| 1409 | if (error) { |
| 1410 | UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n" , |
| 1411 | lbn,error,0,0); |
| 1412 | skipbytes += bytes; |
| 1413 | bytes = 0; |
| 1414 | goto loopdone; |
| 1415 | } |
| 1416 | |
| 1417 | /* |
| 1418 | * see how much we can transfer with this i/o. |
| 1421 | */ |
| 1422 | |
| 1423 | iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, |
| 1424 | bytes); |
| 1425 | |
| 1426 | /* |
| 1427 | * if this block isn't allocated, skip it (and for reads, |
| 1428 | * zero the corresponding part of the buffer instead of |
| 1429 | * reading it from the device). |
| 1430 | */ |
| 1431 | |
| 1432 | if (blkno == (daddr_t)-1) { |
| 1433 | if (!iowrite) { |
| 1434 | memset((char *)kva + (offset - startoffset), 0, |
| 1435 | iobytes); |
| 1436 | } |
| 1437 | skipbytes += iobytes; |
| 1438 | continue; |
| 1439 | } |
| 1440 | |
| 1441 | /* |
| 1442 | * allocate a sub-buf for this piece of the i/o |
| 1443 | * (or just use mbp if there's only 1 piece), |
| 1444 | * and start it going. |
| 1445 | */ |
| 1446 | |
| 1447 | if (offset == startoffset && iobytes == bytes) { |
| 1448 | bp = mbp; |
| 1449 | } else { |
| 1450 | UVMHIST_LOG(ubchist, "vp %p bp %p num now %d" , |
| 1451 | vp, bp, vp->v_numoutput, 0); |
| 1452 | bp = getiobuf(vp, true); |
| 1453 | nestiobuf_setup(mbp, bp, offset - startoffset, iobytes); |
| 1454 | } |
| 1455 | bp->b_lblkno = 0; |
| 1456 | |
| 1457 | /* adjust physical blkno for partial blocks */ |
| 1458 | bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> |
| 1459 | dev_bshift); |
| 1460 | |
| 1461 | UVMHIST_LOG(ubchist, |
| 1462 | "bp %p offset 0x%x bcount 0x%x blkno 0x%x" , |
| 1463 | bp, offset, bp->b_bcount, bp->b_blkno); |
| 1464 | |
| 1465 | VOP_STRATEGY(devvp, bp); |
| 1466 | } |
| 1467 | |
| 1468 | loopdone: |
| 1469 | if (skipbytes) { |
| 1470 | UVMHIST_LOG(ubchist, "skipbytes %d" , skipbytes, 0,0,0); |
| 1471 | } |
| 1472 | nestiobuf_done(mbp, skipbytes, error); |
| 1473 | if (async) { |
| 1474 | UVMHIST_LOG(ubchist, "returning 0 (async)" , 0,0,0,0); |
| 1475 | return (0); |
| 1476 | } |
| 1477 | UVMHIST_LOG(ubchist, "waiting for mbp %p" , mbp,0,0,0); |
| 1478 | error = biowait(mbp); |
| 1479 | s = splbio(); |
| 1480 | (*iodone)(mbp); |
| 1481 | splx(s); |
| 1482 | UVMHIST_LOG(ubchist, "returning, error %d" , error,0,0,0); |
| 1483 | return (error); |
| 1484 | } |
| 1485 | |
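| | /* |
| | * genfs_compat_getpages: getpages for file systems without native |
| | * getpages support: fill in the PG_FAKE pages by calling VOP_READ() |
| | * one page at a time. |
| | */ |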
| 1486 | int |
| 1487 | genfs_compat_getpages(void *v) |
| 1488 | { |
| 1489 | struct vop_getpages_args /* { |
| 1490 | struct vnode *a_vp; |
| 1491 | voff_t a_offset; |
| 1492 | struct vm_page **a_m; |
| 1493 | int *a_count; |
| 1494 | int a_centeridx; |
| 1495 | vm_prot_t a_access_type; |
| 1496 | int a_advice; |
| 1497 | int a_flags; |
| 1498 | } */ *ap = v; |
| 1499 | |
| 1500 | off_t origoffset; |
| 1501 | struct vnode *vp = ap->a_vp; |
| 1502 | struct uvm_object *uobj = &vp->v_uobj; |
| 1503 | struct vm_page *pg, **pgs; |
| 1504 | vaddr_t kva; |
| 1505 | int i, error, orignpages, npages; |
| 1506 | struct iovec iov; |
| 1507 | struct uio uio; |
| 1508 | kauth_cred_t cred = curlwp->l_cred; |
| 1509 | const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; |
| 1510 | |
| 1511 | error = 0; |
| 1512 | origoffset = ap->a_offset; |
| 1513 | orignpages = *ap->a_count; |
| 1514 | pgs = ap->a_m; |
| 1515 | |
| 1516 | if (ap->a_flags & PGO_LOCKED) { |
| 1517 | uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, |
| 1518 | UFP_NOWAIT|UFP_NOALLOC| (memwrite ? UFP_NORDONLY : 0)); |
| 1519 | |
| 1520 | error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0; |
| 1521 | if (error == 0 && memwrite) { |
| 1522 | genfs_markdirty(vp); |
| 1523 | } |
| 1524 | return error; |
| 1525 | } |
| 1526 | if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) { |
| 1527 | mutex_exit(uobj->vmobjlock); |
| 1528 | return EINVAL; |
| 1529 | } |
| 1530 | if ((ap->a_flags & PGO_SYNCIO) == 0) { |
| 1531 | mutex_exit(uobj->vmobjlock); |
| 1532 | return 0; |
| 1533 | } |
| 1534 | npages = orignpages; |
| 1535 | uvn_findpages(uobj, origoffset, &npages, pgs, UFP_ALL); |
| 1536 | mutex_exit(uobj->vmobjlock); |
| 1537 | kva = uvm_pagermapin(pgs, npages, |
| 1538 | UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK); |
| 1539 | for (i = 0; i < npages; i++) { |
| 1540 | pg = pgs[i]; |
| 1541 | if ((pg->flags & PG_FAKE) == 0) { |
| 1542 | continue; |
| 1543 | } |
| 1544 | iov.iov_base = (char *)kva + (i << PAGE_SHIFT); |
| 1545 | iov.iov_len = PAGE_SIZE; |
| 1546 | uio.uio_iov = &iov; |
| 1547 | uio.uio_iovcnt = 1; |
| 1548 | uio.uio_offset = origoffset + (i << PAGE_SHIFT); |
| 1549 | uio.uio_rw = UIO_READ; |
| 1550 | uio.uio_resid = PAGE_SIZE; |
| 1551 | UIO_SETUP_SYSSPACE(&uio); |
| 1552 | /* XXX vn_lock */ |
| 1553 | error = VOP_READ(vp, &uio, 0, cred); |
| 1554 | if (error) { |
| 1555 | break; |
| 1556 | } |
| 1557 | if (uio.uio_resid) { |
| 1558 | memset(iov.iov_base, 0, uio.uio_resid); |
| 1559 | } |
| 1560 | } |
| 1561 | uvm_pagermapout(kva, npages); |
| 1562 | mutex_enter(uobj->vmobjlock); |
| 1563 | mutex_enter(&uvm_pageqlock); |
| 1564 | for (i = 0; i < npages; i++) { |
| 1565 | pg = pgs[i]; |
| 1566 | if (error && (pg->flags & PG_FAKE) != 0) { |
| 1567 | pg->flags |= PG_RELEASED; |
| 1568 | } else { |
| 1569 | pmap_clear_modify(pg); |
| 1570 | uvm_pageactivate(pg); |
| 1571 | } |
| 1572 | } |
| 1573 | if (error) { |
| 1574 | uvm_page_unbusy(pgs, npages); |
| 1575 | } |
| 1576 | mutex_exit(&uvm_pageqlock); |
| 1577 | if (error == 0 && memwrite) { |
| 1578 | genfs_markdirty(vp); |
| 1579 | } |
| 1580 | mutex_exit(uobj->vmobjlock); |
| 1581 | return error; |
| 1582 | } |
| 1583 | |
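| | /* |
| | * genfs_compat_gop_write: GOP_WRITE() for the compat case: push the pages |
| | * out with VOP_WRITE(), then fake up a buf and hand it to |
| | * uvm_aio_aiodone() so the pages are unbusied as for a normal pageout. |
| | */ |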
| 1584 | int |
| 1585 | genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, |
| 1586 | int flags) |
| 1587 | { |
| 1588 | off_t offset; |
| 1589 | struct iovec iov; |
| 1590 | struct uio uio; |
| 1591 | kauth_cred_t cred = curlwp->l_cred; |
| 1592 | struct buf *bp; |
| 1593 | vaddr_t kva; |
| 1594 | int error; |
| 1595 | |
| 1596 | offset = pgs[0]->offset; |
| 1597 | kva = uvm_pagermapin(pgs, npages, |
| 1598 | UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); |
| 1599 | |
| 1600 | iov.iov_base = (void *)kva; |
| 1601 | iov.iov_len = npages << PAGE_SHIFT; |
| 1602 | uio.uio_iov = &iov; |
| 1603 | uio.uio_iovcnt = 1; |
| 1604 | uio.uio_offset = offset; |
| 1605 | uio.uio_rw = UIO_WRITE; |
| 1606 | uio.uio_resid = npages << PAGE_SHIFT; |
| 1607 | UIO_SETUP_SYSSPACE(&uio); |
| 1608 | /* XXX vn_lock */ |
| 1609 | error = VOP_WRITE(vp, &uio, 0, cred); |
| 1610 | |
| 1611 | mutex_enter(vp->v_interlock); |
| 1612 | vp->v_numoutput++; |
| 1613 | mutex_exit(vp->v_interlock); |
| 1614 | |
| 1615 | bp = getiobuf(vp, true); |
| 1616 | bp->b_cflags = BC_BUSY | BC_AGE; |
| 1617 | bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift; |
| 1618 | bp->b_data = (char *)kva; |
| 1619 | bp->b_bcount = npages << PAGE_SHIFT; |
| 1620 | bp->b_bufsize = npages << PAGE_SHIFT; |
| 1621 | bp->b_resid = 0; |
| 1622 | bp->b_error = error; |
| 1623 | uvm_aio_aiodone(bp); |
| 1624 | return (error); |
| 1625 | } |
| 1626 | |
| 1627 | /* |
| 1628 | * Process a uio using direct I/O. If we reach a part of the request |
| 1629 | * which cannot be processed in this fashion for some reason, just return. |
| 1630 | * The caller must handle some additional part of the request using |
| 1631 | * buffered I/O before trying direct I/O again. |
| 1632 | */ |
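| | /* |
| | * A typical caller (a sketch, not any particular file system's read or |
| | * write routine) interleaves this with buffered I/O roughly as follows: |
| | * |
| | *	if (ioflag & IO_DIRECT) |
| | *		genfs_directio(vp, uio, ioflag); |
| | *	if (uio->uio_resid == 0) |
| | *		return 0; |
| | *	... fall back to buffered I/O for whatever is left ... |
| | */ |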
| 1633 | |
| 1634 | void |
| 1635 | genfs_directio(struct vnode *vp, struct uio *uio, int ioflag) |
| 1636 | { |
| 1637 | struct vmspace *vs; |
| 1638 | struct iovec *iov; |
| 1639 | vaddr_t va; |
| 1640 | size_t len; |
| 1641 | const int mask = DEV_BSIZE - 1; |
| 1642 | int error; |
| 1643 | bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl && |
| 1644 | (ioflag & IO_JOURNALLOCKED) == 0); |
| 1645 | |
| 1646 | /* |
| 1647 | * We only support direct I/O to user space for now. |
| 1648 | */ |
| 1649 | |
| 1650 | if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) { |
| 1651 | return; |
| 1652 | } |
| 1653 | |
| 1654 | /* |
| 1655 | * If the vnode is mapped, we would need to get the getpages lock |
| 1656 | * to stabilize the bmap, but then we would get into trouble while |
| 1657 | * locking the pages if the pages belong to this same vnode (or a |
| 1658 | * multi-vnode cascade to the same effect). Just fall back to |
| 1659 | * buffered I/O if the vnode is mapped to avoid this mess. |
| 1660 | */ |
| 1661 | |
| 1662 | if (vp->v_vflag & VV_MAPPED) { |
| 1663 | return; |
| 1664 | } |
| 1665 | |
| 1666 | if (need_wapbl) { |
| 1667 | error = WAPBL_BEGIN(vp->v_mount); |
| 1668 | if (error) |
| 1669 | return; |
| 1670 | } |
| 1671 | |
| 1672 | /* |
| 1673 | * Do as much of the uio as possible with direct I/O. |
| 1674 | */ |
| 1675 | |
| 1676 | vs = uio->uio_vmspace; |
| 1677 | while (uio->uio_resid) { |
| 1678 | iov = uio->uio_iov; |
| 1679 | if (iov->iov_len == 0) { |
| 1680 | uio->uio_iov++; |
| 1681 | uio->uio_iovcnt--; |
| 1682 | continue; |
| 1683 | } |
| 1684 | va = (vaddr_t)iov->iov_base; |
| 1685 | len = MIN(iov->iov_len, genfs_maxdio); |
| 1686 | len &= ~mask; |
| 1687 | |
| 1688 | /* |
| 1689 | * If the next chunk is smaller than DEV_BSIZE or extends past |
| 1690 | * the current EOF, then fall back to buffered I/O. |
| 1691 | */ |
| 1692 | |
| 1693 | if (len == 0 || uio->uio_offset + len > vp->v_size) { |
| 1694 | break; |
| 1695 | } |
| 1696 | |
| 1697 | /* |
| 1698 | * Check alignment. The file offset must be at least |
| 1699 | * sector-aligned. The exact constraint on memory alignment |
| 1700 | * is very hardware-dependent, but requiring sector-aligned |
| 1701 | * addresses there too is safe. |
| 1702 | */ |
| 1703 | |
| 1704 | if (uio->uio_offset & mask || va & mask) { |
| 1705 | break; |
| 1706 | } |
| 1707 | error = genfs_do_directio(vs, va, len, vp, uio->uio_offset, |
| 1708 | uio->uio_rw); |
| 1709 | if (error) { |
| 1710 | break; |
| 1711 | } |
| 1712 | iov->iov_base = (char *)iov->iov_base + len; |
| 1713 | iov->iov_len -= len; |
| 1714 | uio->uio_offset += len; |
| 1715 | uio->uio_resid -= len; |
| 1716 | } |
| 1717 | |
| 1718 | if (need_wapbl) |
| 1719 | WAPBL_END(vp->v_mount); |
| 1720 | } |
| 1721 | |
| 1722 | /* |
| 1723 | * Iodone routine for direct I/O. We don't do much here since the request is |
| 1724 | * always synchronous, so the caller will do most of the work after biowait(). |
| 1725 | */ |
| 1726 | |
| 1727 | static void |
| 1728 | genfs_dio_iodone(struct buf *bp) |
| 1729 | { |
| 1730 | |
| 1731 | KASSERT((bp->b_flags & B_ASYNC) == 0); |
| 1732 | if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) { |
| 1733 | mutex_enter(bp->b_objlock); |
| 1734 | vwakeup(bp); |
| 1735 | mutex_exit(bp->b_objlock); |
| 1736 | } |
| 1737 | putiobuf(bp); |
| 1738 | } |
| 1739 | |
| 1740 | /* |
| 1741 | * Process one chunk of a direct I/O request. |
| 1742 | */ |
| 1743 | |
| 1744 | static int |
| 1745 | genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp, |
| 1746 | off_t off, enum uio_rw rw) |
| 1747 | { |
| 1748 | struct vm_map *map; |
| 1749 | struct pmap *upm, *kpm __unused; |
| 1750 | size_t klen = round_page(uva + len) - trunc_page(uva); |
| 1751 | off_t spoff, epoff; |
| 1752 | vaddr_t kva, puva; |
| 1753 | paddr_t pa; |
| 1754 | vm_prot_t prot; |
| 1755 | int error, rv __diagused, poff, koff; |
| 1756 | const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED | |
| 1757 | (rw == UIO_WRITE ? PGO_FREE : 0); |
| 1758 | |
| 1759 | /* |
| 1760 | * For writes, verify that this range of the file already has fully |
| 1761 | * allocated backing store. If there are any holes, just punt and |
| 1762 | * make the caller take the buffered write path. |
| 1763 | */ |
| 1764 | |
| 1765 | if (rw == UIO_WRITE) { |
| 1766 | daddr_t lbn, elbn, blkno; |
| 1767 | int bsize, bshift, run; |
| 1768 | |
| 1769 | bshift = vp->v_mount->mnt_fs_bshift; |
| 1770 | bsize = 1 << bshift; |
| 1771 | lbn = off >> bshift; |
| 1772 | elbn = (off + len + bsize - 1) >> bshift; |
| 1773 | while (lbn < elbn) { |
| 1774 | error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); |
| 1775 | if (error) { |
| 1776 | return error; |
| 1777 | } |
| 1778 | if (blkno == (daddr_t)-1) { |
| 1779 | return ENOSPC; |
| 1780 | } |
| 1781 | lbn += 1 + run; |
| 1782 | } |
| 1783 | } |
| 1784 | |
| 1785 | /* |
| 1786 | * Flush any cached pages for parts of the file that we're about to |
| 1787 | * access. If we're writing, invalidate pages as well. |
| 1788 | */ |
| 1789 | |
| 1790 | spoff = trunc_page(off); |
| 1791 | epoff = round_page(off + len); |
| 1792 | mutex_enter(vp->v_interlock); |
| 1793 | error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags); |
| 1794 | if (error) { |
| 1795 | return error; |
| 1796 | } |
| 1797 | |
| 1798 | /* |
| 1799 | * Wire the user pages and remap them into kernel memory. |
| 1800 | */ |
| 1801 | |
| 1802 | prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ; |
| 1803 | error = uvm_vslock(vs, (void *)uva, len, prot); |
| 1804 | if (error) { |
| 1805 | return error; |
| 1806 | } |
| 1807 | |
| 1808 | map = &vs->vm_map; |
| 1809 | upm = vm_map_pmap(map); |
| 1810 | kpm = vm_map_pmap(kernel_map); |
| 1811 | puva = trunc_page(uva); |
| 1812 | kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask, |
| 1813 | UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH); |
| 1814 | for (poff = 0; poff < klen; poff += PAGE_SIZE) { |
| 1815 | rv = pmap_extract(upm, puva + poff, &pa); |
| 1816 | KASSERT(rv); |
| 1817 | pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED); |
| 1818 | } |
| 1819 | pmap_update(kpm); |
| 1820 | |
| 1821 | /* |
| 1822 | * Do the I/O. |
| 1823 | */ |
| 1824 | |
| 1825 | koff = uva - trunc_page(uva); |
| 1826 | error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw, |
| 1827 | genfs_dio_iodone); |
| 1828 | |
| 1829 | /* |
| 1830 | * Tear down the kernel mapping. |
| 1831 | */ |
| 1832 | |
| 1833 | pmap_kremove(kva, klen); |
| 1834 | pmap_update(kpm); |
| 1835 | uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY); |
| 1836 | |
| 1837 | /* |
| 1838 | * Unwire the user pages. |
| 1839 | */ |
| 1840 | |
| 1841 | uvm_vsunlock(vs, (void *)uva, len); |
| 1842 | return error; |
| 1843 | } |
| 1844 | |