/* $NetBSD: lfs_pages.c,v 1.9 2016/10/04 16:46:20 christos Exp $ */

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1986, 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_vnops.c	8.13 (Berkeley) 6/10/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_pages.c,v 1.9 2016/10/04 16:46:20 christos Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_uvm_page_trkown.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/pool.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>

#include <miscfs/fifofs/fifo.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_bswap.h>
#include <ufs/lfs/ulfs_extern.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pmap.h>
#include <uvm/uvm_stat.h>
#include <uvm/uvm_pager.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>

extern pid_t lfs_writer_daemon;

static int check_dirty(struct lfs *, struct vnode *, off_t, off_t,
    off_t, int, int, struct vm_page **);

int
lfs_getpages(void *v)
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		struct vm_page **a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ *ap = v;

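	/*
	 * Reject any attempt to map the Ifile for writing; its
	 * contents are maintained by the filesystem itself.
	 */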
| 124 | if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM && |
| 125 | (ap->a_access_type & VM_PROT_WRITE) != 0) { |
| 126 | return EPERM; |
| 127 | } |
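	/*
	 * A writable mapping can dirty pages without any further VOP
	 * calls, so note the inode as modified up front.
	 */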
	if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
		mutex_enter(&lfs_lock);
		LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
		mutex_exit(&lfs_lock);
	}

	/*
	 * We're relying on the fact that genfs_getpages() always reads in
	 * entire filesystem blocks.
	 */
	return genfs_getpages(v);
}

/*
 * Wait for a page to become unbusy, possibly printing diagnostic messages
 * as well.
 *
 * Called with vp->v_interlock held; return with it held.
 */
static void
wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
{
	KASSERT(mutex_owned(vp->v_interlock));
	if ((pg->flags & PG_BUSY) == 0)
		return;		/* Nothing to wait for! */

#if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
	static struct vm_page *lastpg;

	if (label != NULL && pg != lastpg) {
		if (pg->owner_tag) {
			printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
			       curproc->p_pid, curlwp->l_lid, label,
			       pg, pg->owner, pg->lowner, pg->owner_tag);
		} else {
			printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
			       curproc->p_pid, curlwp->l_lid, label, pg);
		}
	}
	lastpg = pg;
#endif

	pg->flags |= PG_WANTED;
	UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0);
	mutex_enter(vp->v_interlock);
}

/*
 * This routine is called by lfs_putpages() when it can't complete the
 * write because a page is busy.  This means that either (1) someone,
 * possibly the pagedaemon, is looking at this page, and will give it up
 * presently; or (2) we ourselves are holding the page busy in the
 * process of being written (either gathered or actually on its way to
 * disk).  We don't need to give up the segment lock, but we might need
 * to call lfs_writeseg() to expedite the page's journey to disk.
 *
 * Called with vp->v_interlock held; return with it held.
 */
/* #define BUSYWAIT */
static void
write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
    int seglocked, const char *label)
{
	KASSERT(mutex_owned(vp->v_interlock));
#ifndef BUSYWAIT
	struct inode *ip = VTOI(vp);
	struct segment *sp = fs->lfs_sp;
	int count = 0;

	if (pg == NULL)
		return;

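	/*
	 * While the page stays busy and still belongs to this vnode,
	 * push any already-gathered buffers to disk with lfs_writeseg()
	 * (reopening the finfo afterward), then sleep until the page
	 * is released.
	 */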
	while (pg->flags & PG_BUSY &&
	    pg->uobject == &vp->v_uobj) {
		mutex_exit(vp->v_interlock);
		if (sp->cbpp - sp->bpp > 1) {
			/* Write gathered pages */
			lfs_updatemeta(sp);
			lfs_release_finfo(fs);
			(void) lfs_writeseg(fs, sp);

			/*
			 * Reinitialize FIP
			 */
			KASSERT(sp->vp == vp);
			lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
		}
		++count;
		mutex_enter(vp->v_interlock);
		wait_for_page(vp, pg, label);
	}
	if (label != NULL && count > 1) {
		DLOG((DLOG_PAGE, "lfs_putpages[%d]: %s: looping, n = %d\n",
		      curproc->p_pid, label, count));
	}
#else
	preempt();
#endif
	KASSERT(mutex_owned(vp->v_interlock));
}

/*
 * Make sure that for all pages in every block in the given range,
 * either all are dirty or all are clean.  If any of the pages
 * we've seen so far are dirty, put the vnode on the paging chain,
 * and mark it IN_PAGING.
 *
 * If checkfirst != 0, don't check all the pages but return at the
 * first dirty page.
 */
static int
check_dirty(struct lfs *fs, struct vnode *vp,
    off_t startoffset, off_t endoffset, off_t blkeof,
    int flags, int checkfirst, struct vm_page **pgp)
{
	int by_list;
	struct vm_page *curpg = NULL; /* XXX: gcc */
	struct vm_page *pgs[MAXBSIZE / MIN_PAGE_SIZE], *pg;
	off_t soff = 0; /* XXX: gcc */
	voff_t off;
	int i;
	int nonexistent;
	int any_dirty;	/* number of dirty pages */
	int dirty;	/* number of dirty pages in a block */
	int tdirty;
	int pages_per_block = lfs_sb_getbsize(fs) >> PAGE_SHIFT;
	int pagedaemon = (curlwp == uvm.pagedaemon_lwp);

	KASSERT(mutex_owned(vp->v_interlock));
	ASSERT_MAYBE_SEGLOCK(fs);
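	/*
	 * Walk the vnode's page list directly when it holds few enough
	 * pages that this is cheaper than looking each offset up in the
	 * page tree; otherwise scan by offset.  We restart from here
	 * whenever we have had to sleep, since the pages may have
	 * changed while the interlock was dropped.
	 */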
 top:
	by_list = (vp->v_uobj.uo_npages <=
	    ((endoffset - startoffset) >> PAGE_SHIFT) *
	    UVM_PAGE_TREE_PENALTY);
	any_dirty = 0;

	if (by_list) {
		curpg = TAILQ_FIRST(&vp->v_uobj.memq);
	} else {
		soff = startoffset;
	}
	while (by_list || soff < MIN(blkeof, endoffset)) {
		if (by_list) {
			/*
			 * Find the first page in a block.  Skip
			 * blocks outside our area of interest or beyond
			 * the end of file.
			 */
			KASSERT(curpg == NULL
			    || (curpg->flags & PG_MARKER) == 0);
			if (pages_per_block > 1) {
				while (curpg &&
				    ((curpg->offset & lfs_sb_getbmask(fs)) ||
				    curpg->offset >= vp->v_size ||
				    curpg->offset >= endoffset)) {
					curpg = TAILQ_NEXT(curpg, listq.queue);
					KASSERT(curpg == NULL ||
					    (curpg->flags & PG_MARKER) == 0);
				}
			}
			if (curpg == NULL)
				break;
			soff = curpg->offset;
		}

		/*
		 * Mark all pages in extended range busy; find out if any
		 * of them are dirty.
		 */
		nonexistent = dirty = 0;
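		/*
		 * Note the loop bound: "i == 0 || i < pages_per_block"
		 * forces at least one iteration, even when the block
		 * size is smaller than a page (pages_per_block == 0).
		 */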
		for (i = 0; i == 0 || i < pages_per_block; i++) {
			KASSERT(mutex_owned(vp->v_interlock));
			if (by_list && pages_per_block <= 1) {
				pgs[i] = pg = curpg;
			} else {
				off = soff + (i << PAGE_SHIFT);
				pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
				if (pg == NULL) {
					++nonexistent;
					continue;
				}
			}
			KASSERT(pg != NULL);

			/*
			 * If we're holding the segment lock, we can deadlock
			 * against a process that has our page and is waiting
			 * for the cleaner, while the cleaner waits for the
			 * segment lock.  Just bail in that case.
			 */
			if ((pg->flags & PG_BUSY) &&
			    (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
				if (i > 0)
					uvm_page_unbusy(pgs, i);
				DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
				if (pgp)
					*pgp = pg;
				KASSERT(mutex_owned(vp->v_interlock));
				return -1;
			}

			while (pg->flags & PG_BUSY) {
				wait_for_page(vp, pg, NULL);
				KASSERT(mutex_owned(vp->v_interlock));
				if (i > 0)
					uvm_page_unbusy(pgs, i);
				KASSERT(mutex_owned(vp->v_interlock));
				goto top;
			}
			pg->flags |= PG_BUSY;
			UVM_PAGE_OWN(pg, "lfs_putpages");

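			/*
			 * Revoke all mappings so the page cannot be
			 * modified behind our back, then collect its
			 * dirtiness from both the pmap modified bit
			 * and the PG_CLEAN flag.
			 */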
			pmap_page_protect(pg, VM_PROT_NONE);
			tdirty = (pmap_clear_modify(pg) ||
			    (pg->flags & PG_CLEAN) == 0);
			dirty += tdirty;
		}
		if (pages_per_block > 0 && nonexistent >= pages_per_block) {
			if (by_list) {
				curpg = TAILQ_NEXT(curpg, listq.queue);
			} else {
				soff += lfs_sb_getbsize(fs);
			}
			continue;
		}

		any_dirty += dirty;
		KASSERT(nonexistent == 0);
		KASSERT(mutex_owned(vp->v_interlock));

		/*
		 * If any are dirty make all dirty; unbusy them,
		 * but if we were asked to clean, wire them so that
		 * the pagedaemon doesn't bother us about them while
		 * they're on their way to disk.
		 */
		for (i = 0; i == 0 || i < pages_per_block; i++) {
			KASSERT(mutex_owned(vp->v_interlock));
			pg = pgs[i];
			KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
			KASSERT(pg->flags & PG_BUSY);
			if (dirty) {
				pg->flags &= ~PG_CLEAN;
				if (flags & PGO_FREE) {
					/*
					 * Wire the page so that
					 * pdaemon doesn't see it again.
					 */
					mutex_enter(&uvm_pageqlock);
					uvm_pagewire(pg);
					mutex_exit(&uvm_pageqlock);

					/* Suspended write flag */
					pg->flags |= PG_DELWRI;
				}
			}
			if (pg->flags & PG_WANTED)
				wakeup(pg);
			pg->flags &= ~(PG_WANTED|PG_BUSY);
			UVM_PAGE_OWN(pg, NULL);
		}

		if (checkfirst && any_dirty)
			break;

		if (by_list) {
			curpg = TAILQ_NEXT(curpg, listq.queue);
		} else {
			soff += MAX(PAGE_SIZE, lfs_sb_getbsize(fs));
		}
	}

	KASSERT(mutex_owned(vp->v_interlock));
	return any_dirty;
}

/*
 * lfs_putpages functions like genfs_putpages except that
 *
 * (1) It needs to bounds-check the incoming requests to ensure that
 *     they are block-aligned; if they are not, expand the range and
 *     do the right thing in cases where, e.g., the requested range is
 *     clean but the expanded range is dirty.
 *
 * (2) It needs to explicitly send blocks to be written when it is done.
 *     If VOP_PUTPAGES is called without the seglock held, we simply take
 *     the seglock and let lfs_segunlock wait for us.
 *     XXX There might be a bad situation if we have to flush a vnode while
 *     XXX lfs_markv is in operation.  As of this writing we panic in this
 *     XXX case.
 *
 * Assumptions:
 *
 * (1) The caller does not hold any pages in this vnode busy.  If it does,
 *     there is a danger that when we expand the page range and busy the
 *     pages we will deadlock.
 *
 * (2) We are called with vp->v_interlock held; we must return with it
 *     released.
 *
 * (3) We don't absolutely have to free pages right away, provided that
 *     the request does not have PGO_SYNCIO.  When the pagedaemon gives
 *     us a request with PGO_FREE, we take the pages out of the paging
 *     queue and wake up the writer, which will handle freeing them for us.
 *
 *     We ensure that for any filesystem block, all pages for that
 *     block are either resident or not, even if those pages are higher
 *     than EOF; that means that we will be getting requests to free
 *     "unused" pages above EOF all the time, and should ignore them.
 *
 * (4) If we are called with PGO_LOCKED, the finfo array we are to write
 *     into has been set up for us by lfs_writefile.  If not, we will
 *     have to handle allocating and/or freeing an finfo entry.
 *
 * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
 */

/* How many times to loop before we should start to worry */
#define TOOMANY 4

int
lfs_putpages(void *v)
{
	int error;
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		voff_t a_offlo;
		voff_t a_offhi;
		int a_flags;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
	off_t off, max_endoffset;
	bool seglocked, sync, pagedaemon, reclaim;
	struct vm_page *pg, *busypg;
	UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
	int oreclaim = 0;
	int donewriting = 0;
#ifdef DEBUG
	int debug_n_again, debug_n_dirtyclean;
#endif

	vp = ap->a_vp;
	ip = VTOI(vp);
	fs = ip->i_lfs;
	sync = (ap->a_flags & PGO_SYNCIO) != 0;
	reclaim = (ap->a_flags & PGO_RECLAIM) != 0;
	pagedaemon = (curlwp == uvm.pagedaemon_lwp);

	KASSERT(mutex_owned(vp->v_interlock));

	/* Putpages does nothing for metadata. */
	if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
		mutex_exit(vp->v_interlock);
		return 0;
	}

	/*
	 * If there are no pages, don't do anything.
	 */
	if (vp->v_uobj.uo_npages == 0) {
		if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
		    (vp->v_iflag & VI_ONWORKLST) &&
		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
			vp->v_iflag &= ~VI_WRMAPDIRTY;
			vn_syncer_remove_from_worklist(vp);
		}
		mutex_exit(vp->v_interlock);

		/* Remove us from paging queue, if we were on it */
		mutex_enter(&lfs_lock);
		if (ip->i_flags & IN_PAGING) {
			ip->i_flags &= ~IN_PAGING;
			TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
		}
		mutex_exit(&lfs_lock);

		KASSERT(!mutex_owned(vp->v_interlock));
		return 0;
	}

	blkeof = lfs_blkroundup(fs, ip->i_size);

	/*
	 * Ignore requests to free pages past EOF but in the same block
	 * as EOF, unless the vnode is being reclaimed or the request
	 * is synchronous.  (If the request is sync, it comes from
	 * lfs_truncate.)
	 *
	 * To avoid being flooded with this request, make these pages
	 * look "active".
	 */
	if (!sync && !reclaim &&
	    ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
		origoffset = ap->a_offlo;
		for (off = origoffset; off < blkeof; off += lfs_sb_getbsize(fs)) {
			pg = uvm_pagelookup(&vp->v_uobj, off);
			KASSERT(pg != NULL);
			while (pg->flags & PG_BUSY) {
				pg->flags |= PG_WANTED;
				UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0,
						    "lfsput2", 0);
				mutex_enter(vp->v_interlock);
			}
			mutex_enter(&uvm_pageqlock);
			uvm_pageactivate(pg);
			mutex_exit(&uvm_pageqlock);
		}
		ap->a_offlo = blkeof;
		if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
			mutex_exit(vp->v_interlock);
			return 0;
		}
	}

	/*
	 * Extend page range to start and end at block boundaries.
	 * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
	 */
	origoffset = ap->a_offlo;
	origendoffset = ap->a_offhi;
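	/*
	 * startoffset is origoffset rounded down to a block boundary;
	 * max_endoffset is the largest block-aligned offset that still
	 * fits in an off_t.
	 */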
	startoffset = origoffset & ~(lfs_sb_getbmask(fs));
	max_endoffset = (trunc_page(LLONG_MAX) >> lfs_sb_getbshift(fs))
	    << lfs_sb_getbshift(fs);

	if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
		endoffset = max_endoffset;
		origendoffset = endoffset;
	} else {
		origendoffset = round_page(ap->a_offhi);
		endoffset = round_page(lfs_blkroundup(fs, origendoffset));
	}

	KASSERT(startoffset > 0 || endoffset >= startoffset);
	if (startoffset == endoffset) {
		/* Nothing to do, why were we called? */
		mutex_exit(vp->v_interlock);
		DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
		      PRId64 "\n", startoffset));
		return 0;
	}

	ap->a_offlo = startoffset;
	ap->a_offhi = endoffset;

	/*
	 * If not cleaning, just send the pages through genfs_putpages
	 * to be returned to the pool.
	 */
	if (!(ap->a_flags & PGO_CLEANIT)) {
		DLOG((DLOG_PAGE, "lfs_putpages: no cleanit vn %p ino %d (flags %x)\n",
		      vp, (int)ip->i_number, ap->a_flags));
		int r = genfs_putpages(v);
		KASSERT(!mutex_owned(vp->v_interlock));
		return r;
	}

	/* Set PGO_BUSYFAIL to avoid deadlocks */
	ap->a_flags |= PGO_BUSYFAIL;
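	/*
	 * (With PGO_BUSYFAIL, genfs_do_putpages() returns EDEADLK on a
	 * busy page rather than sleeping on it; the loops below catch
	 * that and retry.)
	 */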

	/*
	 * Likewise, if we are asked to clean but the pages are not
	 * dirty, we can just free them using genfs_putpages.
	 */
#ifdef DEBUG
	debug_n_dirtyclean = 0;
#endif
	do {
		int r;
		KASSERT(mutex_owned(vp->v_interlock));

		/* Count the number of dirty pages */
		r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
		    ap->a_flags, 1, NULL);
		if (r < 0) {
			/* Pages are busy with another process */
			mutex_exit(vp->v_interlock);
			return EDEADLK;
		}
		if (r > 0) /* Some pages are dirty */
			break;

		/*
		 * Sometimes pages are dirtied between the time that
		 * we check and the time we try to clean them.
		 * Instruct lfs_gop_write to return EDEADLK in this case
		 * so we can write them properly.
		 */
		ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
		r = genfs_do_putpages(vp, startoffset, endoffset,
		    ap->a_flags & ~PGO_SYNCIO, &busypg);
		ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
		if (r != EDEADLK) {
			KASSERT(!mutex_owned(vp->v_interlock));
			return r;
		}

		/* One of the pages was busy.  Start over. */
		mutex_enter(vp->v_interlock);
		wait_for_page(vp, busypg, "dirtyclean");
#ifdef DEBUG
		++debug_n_dirtyclean;
#endif
	} while (1);

#ifdef DEBUG
	if (debug_n_dirtyclean > TOOMANY)
		DLOG((DLOG_PAGE, "lfs_putpages: dirtyclean: looping, n = %d\n",
		      debug_n_dirtyclean));
#endif

	/*
	 * Dirty and asked to clean.
	 *
	 * Pagedaemon can't actually write LFS pages; wake up
	 * the writer to take care of that.  The writer will
	 * notice the pager inode queue and act on that.
	 *
	 * XXX We must drop the vp->interlock before taking the lfs_lock or we
	 * get a nasty deadlock with lfs_flush_pchain().
	 */
	if (pagedaemon) {
		mutex_exit(vp->v_interlock);
		mutex_enter(&lfs_lock);
		if (!(ip->i_flags & IN_PAGING)) {
			ip->i_flags |= IN_PAGING;
			TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
		}
		wakeup(&lfs_writer_daemon);
		mutex_exit(&lfs_lock);
		preempt();
		KASSERT(!mutex_owned(vp->v_interlock));
		return EWOULDBLOCK;
	}

	/*
	 * If this is a file created in a recent dirop, we can't flush its
	 * inode until the dirop is complete.  Drain dirops, then flush the
	 * filesystem (taking care of any other pending dirops while we're
	 * at it).
	 */
	if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
	    (vp->v_uflag & VU_DIROP)) {
		DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));

		lfs_writer_enter(fs, "ppdirop");

		/* Note if we hold the vnode locked */
		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
			DLOG((DLOG_PAGE, "lfs_putpages: dirop inode already locked\n"));
		} else {
			DLOG((DLOG_PAGE, "lfs_putpages: dirop inode not locked\n"));
		}
		mutex_exit(vp->v_interlock);

		mutex_enter(&lfs_lock);
		lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
		mutex_exit(&lfs_lock);

		mutex_enter(vp->v_interlock);
		lfs_writer_leave(fs);

		/*
		 * The flush will have cleaned out this vnode as well,
		 * no need to do more to it.
		 */
	}

	/*
	 * This is it.  We are going to write some pages.  From here on
	 * down it's all just mechanics.
	 *
	 * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
	 */
	ap->a_flags &= ~PGO_SYNCIO;

	/*
	 * If we've already got the seglock, flush the node and return.
	 * The FIP has already been set up for us by lfs_writefile,
	 * and FIP cleanup and lfs_updatemeta will also be done there,
	 * unless genfs_putpages returns EDEADLK; then we must flush
	 * what we have, and correct FIP and segment header accounting.
	 */
 get_seglock:
	/*
	 * If we are not called with the segment locked, lock it.
	 * Account for a new FIP in the segment header, and set sp->vp.
	 * (This should duplicate the setup at the top of lfs_writefile().)
	 */
	seglocked = (ap->a_flags & PGO_LOCKED) != 0;
	if (!seglocked) {
		mutex_exit(vp->v_interlock);
		error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
		if (error != 0) {
			KASSERT(!mutex_owned(vp->v_interlock));
			return error;
		}
		mutex_enter(vp->v_interlock);
		lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
	}
	sp = fs->lfs_sp;
	KASSERT(sp->vp == NULL);
	sp->vp = vp;

	/* Note segments written by reclaim; only for debugging */
	if (vdead_check(vp, VDEAD_NOWAIT) != 0) {
		sp->seg_flags |= SEGM_RECLAIM;
		fs->lfs_reclino = ip->i_number;
	}

	/*
	 * Ensure that the partial segment is marked SS_DIROP if this
	 * vnode is a DIROP.
	 */
	if (!seglocked && vp->v_uflag & VU_DIROP) {
		SEGSUM *ssp = sp->segsum;

		lfs_ss_setflags(fs, ssp,
		    lfs_ss_getflags(fs, ssp) | (SS_DIROP|SS_CONT));
	}

	/*
	 * Loop over genfs_putpages until all pages are gathered.
	 * genfs_putpages() drops the interlock, so reacquire it if necessary.
	 * Whenever we lose the interlock we have to rerun check_dirty, as
	 * well, since more pages might have been dirtied in our absence.
	 */
#ifdef DEBUG
	debug_n_again = 0;
#endif
	do {
		busypg = NULL;
		KASSERT(mutex_owned(vp->v_interlock));
		if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
		    ap->a_flags, 0, &busypg) < 0) {
			mutex_exit(vp->v_interlock);
			/* XXX why? --ks */
			mutex_enter(vp->v_interlock);
			write_and_wait(fs, vp, busypg, seglocked, NULL);
			if (!seglocked) {
				mutex_exit(vp->v_interlock);
				lfs_release_finfo(fs);
				lfs_segunlock(fs);
				mutex_enter(vp->v_interlock);
			}
			sp->vp = NULL;
			goto get_seglock;
		}

		busypg = NULL;
		KASSERT(!mutex_owned(&uvm_pageqlock));
		oreclaim = (ap->a_flags & PGO_RECLAIM);
		ap->a_flags &= ~PGO_RECLAIM;
		error = genfs_do_putpages(vp, startoffset, endoffset,
		    ap->a_flags, &busypg);
		ap->a_flags |= oreclaim;

		if (error == EDEADLK || error == EAGAIN) {
			DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
			      " %d ino %d off %jx (seg %d)\n", error,
			      (int)ip->i_number, (uintmax_t)lfs_sb_getoffset(fs),
			      lfs_dtosn(fs, lfs_sb_getoffset(fs))));

			if (oreclaim) {
				mutex_enter(vp->v_interlock);
				write_and_wait(fs, vp, busypg, seglocked, "again");
				mutex_exit(vp->v_interlock);
			} else {
				if ((sp->seg_flags & SEGM_SINGLE) &&
				    lfs_sb_getcurseg(fs) != fs->lfs_startseg)
					donewriting = 1;
			}
		} else if (error) {
			DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
			      " %d ino %d off %jx (seg %d)\n", error,
			      (int)ip->i_number, (uintmax_t)lfs_sb_getoffset(fs),
			      lfs_dtosn(fs, lfs_sb_getoffset(fs))));
		}
		/* genfs_do_putpages loses the interlock */
#ifdef DEBUG
		++debug_n_again;
#endif
		if (oreclaim && error == EAGAIN) {
			DLOG((DLOG_PAGE, "vp %p ino %d vi_flags %x a_flags %x avoiding vclean panic\n",
			      vp, (int)ip->i_number, vp->v_iflag, ap->a_flags));
			mutex_enter(vp->v_interlock);
		}
		if (error == EDEADLK)
			mutex_enter(vp->v_interlock);
	} while (error == EDEADLK || (oreclaim && error == EAGAIN));
#ifdef DEBUG
	if (debug_n_again > TOOMANY)
		DLOG((DLOG_PAGE, "lfs_putpages: again: looping, n = %d\n", debug_n_again));
#endif

	KASSERT(sp != NULL && sp->vp == vp);
	if (!seglocked && !donewriting) {
		sp->vp = NULL;

		/* Write indirect blocks as well */
		lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
		lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
		lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);

		KASSERT(sp->vp == NULL);
		sp->vp = vp;
	}

	/*
	 * Blocks are now gathered into a segment waiting to be written.
	 * All that's left to do is update metadata, and write them.
	 */
	lfs_updatemeta(sp);
	KASSERT(sp->vp == vp);
	sp->vp = NULL;

	/*
	 * If we were called from lfs_writefile, we don't need to clean up
	 * the FIP or unlock the segment lock.  We're done.
	 */
	if (seglocked) {
		KASSERT(!mutex_owned(vp->v_interlock));
		return error;
	}

	/* Clean up FIP and send it to disk. */
	lfs_release_finfo(fs);
	lfs_writeseg(fs, fs->lfs_sp);

	/*
	 * Remove us from paging queue if we wrote all our pages.
	 */
	if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
		mutex_enter(&lfs_lock);
		if (ip->i_flags & IN_PAGING) {
			ip->i_flags &= ~IN_PAGING;
			TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
		}
		mutex_exit(&lfs_lock);
	}

	/*
	 * XXX - with the malloc/copy writeseg, the pages are freed by now
	 * even if we don't wait (e.g. if we hold a nested lock).  This
	 * will not be true if we stop using malloc/copy.
	 */
	KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
	lfs_segunlock(fs);

	/*
	 * Wait for v_numoutput to drop to zero.  The seglock should
	 * take care of this, but there is a slight possibility that
	 * aiodoned might not have got around to our buffers yet.
	 */
	if (sync) {
		mutex_enter(vp->v_interlock);
		while (vp->v_numoutput > 0) {
			DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
			      " num %d\n", (int)ip->i_number, vp->v_numoutput));
			cv_wait(&vp->v_cv, vp->v_interlock);
		}
		mutex_exit(vp->v_interlock);
	}
	KASSERT(!mutex_owned(vp->v_interlock));
	return error;
}