| 1 | /* $NetBSD: vfs_wapbl.c,v 1.86 2016/11/10 20:56:32 jdolecek Exp $ */ |
| 2 | |
| 3 | /*- |
| 4 | * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. |
| 5 | * All rights reserved. |
| 6 | * |
| 7 | * This code is derived from software contributed to The NetBSD Foundation |
| 8 | * by Wasabi Systems, Inc. |
| 9 | * |
| 10 | * Redistribution and use in source and binary forms, with or without |
| 11 | * modification, are permitted provided that the following conditions |
| 12 | * are met: |
| 13 | * 1. Redistributions of source code must retain the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer. |
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
| 16 | * notice, this list of conditions and the following disclaimer in the |
| 17 | * documentation and/or other materials provided with the distribution. |
| 18 | * |
| 19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
| 20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| 21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
| 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 29 | * POSSIBILITY OF SUCH DAMAGE. |
| 30 | */ |
| 31 | |
| 32 | /* |
| 33 | * This implements file system independent write ahead filesystem logging. |
| 34 | */ |
| 35 | |
| 36 | #define WAPBL_INTERNAL |
| 37 | |
| 38 | #include <sys/cdefs.h> |
| 39 | __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.86 2016/11/10 20:56:32 jdolecek Exp $" ); |
| 40 | |
| 41 | #include <sys/param.h> |
| 42 | #include <sys/bitops.h> |
| 43 | #include <sys/time.h> |
| 44 | #include <sys/wapbl.h> |
| 45 | #include <sys/wapbl_replay.h> |
| 46 | |
| 47 | #ifdef _KERNEL |
| 48 | |
| 49 | #include <sys/atomic.h> |
| 50 | #include <sys/conf.h> |
| 51 | #include <sys/file.h> |
| 52 | #include <sys/kauth.h> |
| 53 | #include <sys/kernel.h> |
| 54 | #include <sys/module.h> |
| 55 | #include <sys/mount.h> |
| 56 | #include <sys/mutex.h> |
| 57 | #include <sys/namei.h> |
| 58 | #include <sys/proc.h> |
| 59 | #include <sys/resourcevar.h> |
| 60 | #include <sys/sysctl.h> |
| 61 | #include <sys/uio.h> |
| 62 | #include <sys/vnode.h> |
| 63 | |
| 64 | #include <miscfs/specfs/specdev.h> |
| 65 | |
| 66 | #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP) |
| 67 | #define wapbl_free(a, s) kmem_free((a), (s)) |
| 68 | #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) |
| 69 | |
| 70 | static struct sysctllog *wapbl_sysctl; |
| 71 | static int wapbl_flush_disk_cache = 1; |
| 72 | static int wapbl_verbose_commit = 0; |
| 73 | |
| 74 | static inline size_t wapbl_space_free(size_t, off_t, off_t); |
| 75 | |
| 76 | #else /* !_KERNEL */ |
| 77 | |
| 78 | #include <assert.h> |
| 79 | #include <errno.h> |
| 80 | #include <stdbool.h> |
| 81 | #include <stdio.h> |
| 82 | #include <stdlib.h> |
| 83 | #include <string.h> |
| 84 | |
/*
 * Userland builds: kernel assertions become assert(3) and the log
 * allocation hooks map onto the C library allocator.  The size argument
 * of wapbl_free() is accepted for source compatibility and ignored.
 */
#define	KASSERT(x)		assert(x)
#define	KDASSERT(x)		assert(x)
#define	wapbl_calloc(n, s)	calloc((n), (s))
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
| 90 | |
| 91 | #endif /* !_KERNEL */ |
| 92 | |
| 93 | /* |
| 94 | * INTERNAL DATA STRUCTURES |
| 95 | */ |
| 96 | |
| 97 | /* |
| 98 | * This structure holds per-mount log information. |
| 99 | * |
| 100 | * Legend: a = atomic access only |
| 101 | * r = read-only after init |
| 102 | * l = rwlock held |
| 103 | * m = mutex held |
| 104 | * lm = rwlock held writing or mutex held |
| 105 | * u = unlocked access ok |
| 106 | * b = bufcache_lock held |
| 107 | */ |
| 108 | LIST_HEAD(wapbl_ino_head, wapbl_ino); |
| 109 | struct wapbl { |
| 110 | struct vnode *wl_logvp; /* r: log here */ |
| 111 | struct vnode *wl_devvp; /* r: log on this device */ |
| 112 | struct mount *wl_mount; /* r: mountpoint wl is associated with */ |
| 113 | daddr_t wl_logpbn; /* r: Physical block number of start of log */ |
| 114 | int wl_log_dev_bshift; /* r: logarithm of device block size of log |
| 115 | device */ |
| 116 | int wl_fs_dev_bshift; /* r: logarithm of device block size of |
| 117 | filesystem device */ |
| 118 | |
| 119 | unsigned wl_lock_count; /* m: Count of transactions in progress */ |
| 120 | |
| 121 | size_t wl_circ_size; /* r: Number of bytes in buffer of log */ |
| 122 | size_t wl_circ_off; /* r: Number of bytes reserved at start */ |
| 123 | |
| 124 | size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ |
| 125 | size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ |
| 126 | |
| 127 | off_t wl_head; /* l: Byte offset of log head */ |
| 128 | off_t wl_tail; /* l: Byte offset of log tail */ |
| 129 | /* |
| 130 | * WAPBL log layout, stored on wl_devvp at wl_logpbn: |
| 131 | * |
| 132 | * ___________________ wl_circ_size __________________ |
| 133 | * / \ |
| 134 | * +---------+---------+-------+--------------+--------+ |
| 135 | * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ] |
| 136 | * +---------+---------+-------+--------------+--------+ |
| 137 | * wl_circ_off --^ ^-- wl_head ^-- wl_tail |
| 138 | * |
| 139 | * commit0 and commit1 are commit headers. A commit header has |
| 140 | * a generation number, indicating which of the two headers is |
| 141 | * more recent, and an assignment of head and tail pointers. |
| 142 | * The rest is a circular queue of log records, starting at |
| 143 | * the byte offset wl_circ_off. |
| 144 | * |
| 145 | * E marks empty space for records. |
| 146 | * W marks records for block writes issued but waiting. |
| 147 | * C marks completed records. |
| 148 | * |
| 149 | * wapbl_flush writes new records to empty `E' spaces after |
| 150 | * wl_head from the current transaction in memory. |
| 151 | * |
| 152 | * wapbl_truncate advances wl_tail past any completed `C' |
| 153 | * records, freeing them up for use. |
| 154 | * |
| 155 | * head == tail == 0 means log is empty. |
| 156 | * head == tail != 0 means log is full. |
| 157 | * |
| 158 | * See assertions in wapbl_advance() for other boundary |
| 159 | * conditions. |
| 160 | * |
| 161 | * Only wapbl_flush moves the head, except when wapbl_truncate |
| 162 | * sets it to 0 to indicate that the log is empty. |
| 163 | * |
| 164 | * Only wapbl_truncate moves the tail, except when wapbl_flush |
| 165 | * sets it to wl_circ_off to indicate that the log is full. |
| 166 | */ |
| 167 | |
| 168 | struct wapbl_wc_header *; /* l */ |
| 169 | void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */ |
| 170 | |
| 171 | kmutex_t wl_mtx; /* u: short-term lock */ |
| 172 | krwlock_t wl_rwlock; /* u: File system transaction lock */ |
| 173 | |
| 174 | /* |
| 175 | * Must be held while accessing |
| 176 | * wl_count or wl_bufs or head or tail |
| 177 | */ |
| 178 | |
| 179 | /* |
| 180 | * Callback called from within the flush routine to flush any extra |
| 181 | * bits. Note that flush may be skipped without calling this if |
| 182 | * there are no outstanding buffers in the transaction. |
| 183 | */ |
| 184 | #if _KERNEL |
| 185 | wapbl_flush_fn_t wl_flush; /* r */ |
| 186 | wapbl_flush_fn_t wl_flush_abort;/* r */ |
| 187 | #endif |
| 188 | |
| 189 | size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ |
| 190 | size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ |
| 191 | size_t wl_bcount; /* m: Total bcount of wl_bufs */ |
| 192 | |
| 193 | LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ |
| 194 | |
| 195 | kcondvar_t wl_reclaimable_cv; /* m (obviously) */ |
| 196 | size_t wl_reclaimable_bytes; /* m: Amount of space available for |
| 197 | reclamation by truncate */ |
| 198 | int wl_error_count; /* m: # of wl_entries with errors */ |
| 199 | size_t wl_reserved_bytes; /* never truncate log smaller than this */ |
| 200 | |
| 201 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 202 | size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ |
| 203 | #endif |
| 204 | |
| 205 | #if _KERNEL |
| 206 | int wl_brperjblock; /* r Block records per journal block */ |
| 207 | #endif |
| 208 | |
| 209 | TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist; /* lm: list head */ |
| 210 | int wl_dealloccnt; /* lm: total count */ |
| 211 | int wl_dealloclim; /* r: max count */ |
| 212 | |
| 213 | /* hashtable of inode numbers for allocated but unlinked inodes */ |
| 214 | /* synch ??? */ |
| 215 | struct wapbl_ino_head *wl_inohash; |
| 216 | u_long wl_inohashmask; |
| 217 | int wl_inohashcnt; |
| 218 | |
| 219 | SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction |
| 220 | accounting */ |
| 221 | |
| 222 | u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ |
| 223 | daddr_t wl_buffer_dblk; /* l: buffer disk block address */ |
| 224 | size_t wl_buffer_used; /* l: buffer current use */ |
| 225 | }; |
| 226 | |
| 227 | #ifdef WAPBL_DEBUG_PRINT |
| 228 | int wapbl_debug_print = WAPBL_DEBUG_PRINT; |
| 229 | #endif |
| 230 | |
| 231 | /****************************************************************/ |
| 232 | #ifdef _KERNEL |
| 233 | |
| 234 | #ifdef WAPBL_DEBUG |
| 235 | struct wapbl *wapbl_debug_wl; |
| 236 | #endif |
| 237 | |
| 238 | static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); |
| 239 | static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); |
| 240 | static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); |
| 241 | static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); |
| 242 | #endif /* _KERNEL */ |
| 243 | |
| 244 | static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); |
| 245 | |
| 246 | static inline size_t wapbl_space_used(size_t avail, off_t head, |
| 247 | off_t tail); |
| 248 | |
| 249 | #ifdef _KERNEL |
| 250 | |
| 251 | static struct pool wapbl_entry_pool; |
| 252 | static struct pool wapbl_dealloc_pool; |
| 253 | |
| 254 | #define WAPBL_INODETRK_SIZE 83 |
| 255 | static int wapbl_ino_pool_refcount; |
| 256 | static struct pool wapbl_ino_pool; |
| 257 | struct wapbl_ino { |
| 258 | LIST_ENTRY(wapbl_ino) wi_hash; |
| 259 | ino_t wi_ino; |
| 260 | mode_t wi_mode; |
| 261 | }; |
| 262 | |
| 263 | static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); |
| 264 | static void wapbl_inodetrk_free(struct wapbl *wl); |
| 265 | static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); |
| 266 | |
| 267 | static size_t wapbl_transaction_len(struct wapbl *wl); |
| 268 | static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); |
| 269 | |
| 270 | static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *, |
| 271 | bool); |
| 272 | |
| 273 | #if 0 |
| 274 | int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); |
| 275 | #endif |
| 276 | |
| 277 | static int wapbl_replay_isopen1(struct wapbl_replay *); |
| 278 | |
| 279 | struct wapbl_ops wapbl_ops = { |
| 280 | .wo_wapbl_discard = wapbl_discard, |
| 281 | .wo_wapbl_replay_isopen = wapbl_replay_isopen1, |
| 282 | .wo_wapbl_replay_can_read = wapbl_replay_can_read, |
| 283 | .wo_wapbl_replay_read = wapbl_replay_read, |
| 284 | .wo_wapbl_add_buf = wapbl_add_buf, |
| 285 | .wo_wapbl_remove_buf = wapbl_remove_buf, |
| 286 | .wo_wapbl_resize_buf = wapbl_resize_buf, |
| 287 | .wo_wapbl_begin = wapbl_begin, |
| 288 | .wo_wapbl_end = wapbl_end, |
| 289 | .wo_wapbl_junlock_assert= wapbl_junlock_assert, |
| 290 | |
| 291 | /* XXX: the following is only used to say "this is a wapbl buf" */ |
| 292 | .wo_wapbl_biodone = wapbl_biodone, |
| 293 | }; |
| 294 | |
| 295 | static int |
| 296 | wapbl_sysctl_init(void) |
| 297 | { |
| 298 | int rv; |
| 299 | const struct sysctlnode *rnode, *cnode; |
| 300 | |
| 301 | wapbl_sysctl = NULL; |
| 302 | |
| 303 | rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode, |
| 304 | CTLFLAG_PERMANENT, |
| 305 | CTLTYPE_NODE, "wapbl" , |
| 306 | SYSCTL_DESCR("WAPBL journaling options" ), |
| 307 | NULL, 0, NULL, 0, |
| 308 | CTL_VFS, CTL_CREATE, CTL_EOL); |
| 309 | if (rv) |
| 310 | return rv; |
| 311 | |
| 312 | rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, |
| 313 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, |
| 314 | CTLTYPE_INT, "flush_disk_cache" , |
| 315 | SYSCTL_DESCR("flush disk cache" ), |
| 316 | NULL, 0, &wapbl_flush_disk_cache, 0, |
| 317 | CTL_CREATE, CTL_EOL); |
| 318 | if (rv) |
| 319 | return rv; |
| 320 | |
| 321 | rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, |
| 322 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, |
| 323 | CTLTYPE_INT, "verbose_commit" , |
| 324 | SYSCTL_DESCR("show time and size of wapbl log commits" ), |
| 325 | NULL, 0, &wapbl_verbose_commit, 0, |
| 326 | CTL_CREATE, CTL_EOL); |
| 327 | return rv; |
| 328 | } |
| 329 | |
| 330 | static void |
| 331 | wapbl_init(void) |
| 332 | { |
| 333 | |
| 334 | pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0, |
| 335 | "wapblentrypl" , &pool_allocator_kmem, IPL_VM); |
| 336 | pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0, |
| 337 | "wapbldealloc" , &pool_allocator_nointr, IPL_NONE); |
| 338 | |
| 339 | wapbl_sysctl_init(); |
| 340 | } |
| 341 | |
| 342 | static int |
| 343 | wapbl_fini(void) |
| 344 | { |
| 345 | |
| 346 | if (wapbl_sysctl != NULL) |
| 347 | sysctl_teardown(&wapbl_sysctl); |
| 348 | |
| 349 | pool_destroy(&wapbl_dealloc_pool); |
| 350 | pool_destroy(&wapbl_entry_pool); |
| 351 | |
| 352 | return 0; |
| 353 | } |
| 354 | |
| 355 | static int |
| 356 | wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) |
| 357 | { |
| 358 | int error, i; |
| 359 | |
| 360 | WAPBL_PRINTF(WAPBL_PRINT_REPLAY, |
| 361 | ("wapbl_start: reusing log with %d inodes\n" , wr->wr_inodescnt)); |
| 362 | |
| 363 | /* |
| 364 | * Its only valid to reuse the replay log if its |
| 365 | * the same as the new log we just opened. |
| 366 | */ |
| 367 | KDASSERT(!wapbl_replay_isopen(wr)); |
| 368 | KASSERT(wl->wl_devvp->v_type == VBLK); |
| 369 | KASSERT(wr->wr_devvp->v_type == VBLK); |
| 370 | KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); |
| 371 | KASSERT(wl->wl_logpbn == wr->wr_logpbn); |
| 372 | KASSERT(wl->wl_circ_size == wr->wr_circ_size); |
| 373 | KASSERT(wl->wl_circ_off == wr->wr_circ_off); |
| 374 | KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); |
| 375 | KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); |
| 376 | |
| 377 | wl->wl_wc_header->wc_generation = wr->wr_generation + 1; |
| 378 | |
| 379 | for (i = 0; i < wr->wr_inodescnt; i++) |
| 380 | wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, |
| 381 | wr->wr_inodes[i].wr_imode); |
| 382 | |
| 383 | /* Make sure new transaction won't overwrite old inodes list */ |
| 384 | KDASSERT(wapbl_transaction_len(wl) <= |
| 385 | wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, |
| 386 | wr->wr_inodestail)); |
| 387 | |
| 388 | wl->wl_head = wl->wl_tail = wr->wr_inodeshead; |
| 389 | wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = |
| 390 | wapbl_transaction_len(wl); |
| 391 | |
| 392 | error = wapbl_write_inodes(wl, &wl->wl_head); |
| 393 | if (error) |
| 394 | return error; |
| 395 | |
| 396 | KASSERT(wl->wl_head != wl->wl_tail); |
| 397 | KASSERT(wl->wl_head != 0); |
| 398 | |
| 399 | return 0; |
| 400 | } |
| 401 | |
| 402 | int |
| 403 | wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, |
| 404 | daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, |
| 405 | wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) |
| 406 | { |
| 407 | struct wapbl *wl; |
| 408 | struct vnode *devvp; |
| 409 | daddr_t logpbn; |
| 410 | int error; |
| 411 | int log_dev_bshift = ilog2(blksize); |
| 412 | int fs_dev_bshift = log_dev_bshift; |
| 413 | int run; |
| 414 | |
| 415 | WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 |
| 416 | " count=%zu blksize=%zu\n" , vp, off, count, blksize)); |
| 417 | |
| 418 | if (log_dev_bshift > fs_dev_bshift) { |
| 419 | WAPBL_PRINTF(WAPBL_PRINT_OPEN, |
| 420 | ("wapbl: log device's block size cannot be larger " |
| 421 | "than filesystem's\n" )); |
| 422 | /* |
| 423 | * Not currently implemented, although it could be if |
| 424 | * needed someday. |
| 425 | */ |
| 426 | return ENOSYS; |
| 427 | } |
| 428 | |
| 429 | if (off < 0) |
| 430 | return EINVAL; |
| 431 | |
| 432 | if (blksize < DEV_BSIZE) |
| 433 | return EINVAL; |
| 434 | if (blksize % DEV_BSIZE) |
| 435 | return EINVAL; |
| 436 | |
| 437 | /* XXXTODO: verify that the full load is writable */ |
| 438 | |
| 439 | /* |
| 440 | * XXX check for minimum log size |
| 441 | * minimum is governed by minimum amount of space |
| 442 | * to complete a transaction. (probably truncate) |
| 443 | */ |
| 444 | /* XXX for now pick something minimal */ |
| 445 | if ((count * blksize) < MAXPHYS) { |
| 446 | return ENOSPC; |
| 447 | } |
| 448 | |
| 449 | if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { |
| 450 | return error; |
| 451 | } |
| 452 | |
| 453 | wl = wapbl_calloc(1, sizeof(*wl)); |
| 454 | rw_init(&wl->wl_rwlock); |
| 455 | mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); |
| 456 | cv_init(&wl->wl_reclaimable_cv, "wapblrec" ); |
| 457 | LIST_INIT(&wl->wl_bufs); |
| 458 | SIMPLEQ_INIT(&wl->wl_entries); |
| 459 | |
| 460 | wl->wl_logvp = vp; |
| 461 | wl->wl_devvp = devvp; |
| 462 | wl->wl_mount = mp; |
| 463 | wl->wl_logpbn = logpbn; |
| 464 | wl->wl_log_dev_bshift = log_dev_bshift; |
| 465 | wl->wl_fs_dev_bshift = fs_dev_bshift; |
| 466 | |
| 467 | wl->wl_flush = flushfn; |
| 468 | wl->wl_flush_abort = flushabortfn; |
| 469 | |
| 470 | /* Reserve two log device blocks for the commit headers */ |
| 471 | wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; |
| 472 | wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); |
| 473 | /* truncate the log usage to a multiple of log_dev_bshift */ |
| 474 | wl->wl_circ_size >>= wl->wl_log_dev_bshift; |
| 475 | wl->wl_circ_size <<= wl->wl_log_dev_bshift; |
| 476 | |
| 477 | /* |
| 478 | * wl_bufbytes_max limits the size of the in memory transaction space. |
| 479 | * - Since buffers are allocated and accounted for in units of |
| 480 | * PAGE_SIZE it is required to be a multiple of PAGE_SIZE |
| 481 | * (i.e. 1<<PAGE_SHIFT) |
| 482 | * - Since the log device has to be written in units of |
| 483 | * 1<<wl_log_dev_bshift it is required to be a mulitple of |
| 484 | * 1<<wl_log_dev_bshift. |
| 485 | * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift, |
| 486 | * it is convenient to be a multiple of 1<<wl_fs_dev_bshift. |
| 487 | * Therefore it must be multiple of the least common multiple of those |
| 488 | * three quantities. Fortunately, all of those quantities are |
| 489 | * guaranteed to be a power of two, and the least common multiple of |
| 490 | * a set of numbers which are all powers of two is simply the maximum |
| 491 | * of those numbers. Finally, the maximum logarithm of a power of two |
| 492 | * is the same as the log of the maximum power of two. So we can do |
| 493 | * the following operations to size wl_bufbytes_max: |
| 494 | */ |
| 495 | |
| 496 | /* XXX fix actual number of pages reserved per filesystem. */ |
| 497 | wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); |
| 498 | |
| 499 | /* Round wl_bufbytes_max to the largest power of two constraint */ |
| 500 | wl->wl_bufbytes_max >>= PAGE_SHIFT; |
| 501 | wl->wl_bufbytes_max <<= PAGE_SHIFT; |
| 502 | wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; |
| 503 | wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; |
| 504 | wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; |
| 505 | wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; |
| 506 | |
| 507 | /* XXX maybe use filesystem fragment size instead of 1024 */ |
| 508 | /* XXX fix actual number of buffers reserved per filesystem. */ |
| 509 | wl->wl_bufcount_max = (nbuf / 2) * 1024; |
| 510 | |
| 511 | wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift) |
| 512 | - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / |
| 513 | sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); |
| 514 | KASSERT(wl->wl_brperjblock > 0); |
| 515 | |
| 516 | /* XXX tie this into resource estimation */ |
| 517 | wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2; |
| 518 | TAILQ_INIT(&wl->wl_dealloclist); |
| 519 | |
| 520 | wl->wl_buffer = wapbl_alloc(MAXPHYS); |
| 521 | wl->wl_buffer_used = 0; |
| 522 | |
| 523 | wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); |
| 524 | |
| 525 | /* Initialize the commit header */ |
| 526 | { |
| 527 | struct wapbl_wc_header *wc; |
| 528 | size_t len = 1 << wl->wl_log_dev_bshift; |
| 529 | wc = wapbl_calloc(1, len); |
| 530 | wc->wc_type = WAPBL_WC_HEADER; |
| 531 | wc->wc_len = len; |
| 532 | wc->wc_circ_off = wl->wl_circ_off; |
| 533 | wc->wc_circ_size = wl->wl_circ_size; |
| 534 | /* XXX wc->wc_fsid */ |
| 535 | wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; |
| 536 | wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; |
| 537 | wl->wl_wc_header = wc; |
| 538 | wl->wl_wc_scratch = wapbl_alloc(len); |
| 539 | } |
| 540 | |
| 541 | /* |
| 542 | * if there was an existing set of unlinked but |
| 543 | * allocated inodes, preserve it in the new |
| 544 | * log. |
| 545 | */ |
| 546 | if (wr && wr->wr_inodescnt) { |
| 547 | error = wapbl_start_flush_inodes(wl, wr); |
| 548 | if (error) |
| 549 | goto errout; |
| 550 | } |
| 551 | |
| 552 | error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); |
| 553 | if (error) { |
| 554 | goto errout; |
| 555 | } |
| 556 | |
| 557 | *wlp = wl; |
| 558 | #if defined(WAPBL_DEBUG) |
| 559 | wapbl_debug_wl = wl; |
| 560 | #endif |
| 561 | |
| 562 | return 0; |
| 563 | errout: |
| 564 | wapbl_discard(wl); |
| 565 | wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); |
| 566 | wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); |
| 567 | wapbl_free(wl->wl_buffer, MAXPHYS); |
| 568 | wapbl_inodetrk_free(wl); |
| 569 | wapbl_free(wl, sizeof(*wl)); |
| 570 | |
| 571 | return error; |
| 572 | } |
| 573 | |
| 574 | /* |
| 575 | * Like wapbl_flush, only discards the transaction |
| 576 | * completely |
| 577 | */ |
| 578 | |
| 579 | void |
| 580 | wapbl_discard(struct wapbl *wl) |
| 581 | { |
| 582 | struct wapbl_entry *we; |
| 583 | struct wapbl_dealloc *wd; |
| 584 | struct buf *bp; |
| 585 | int i; |
| 586 | |
| 587 | /* |
| 588 | * XXX we may consider using upgrade here |
| 589 | * if we want to call flush from inside a transaction |
| 590 | */ |
| 591 | rw_enter(&wl->wl_rwlock, RW_WRITER); |
| 592 | wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist)); |
| 593 | |
| 594 | #ifdef WAPBL_DEBUG_PRINT |
| 595 | { |
| 596 | pid_t pid = -1; |
| 597 | lwpid_t lid = -1; |
| 598 | if (curproc) |
| 599 | pid = curproc->p_pid; |
| 600 | if (curlwp) |
| 601 | lid = curlwp->l_lid; |
| 602 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 603 | WAPBL_PRINTF(WAPBL_PRINT_DISCARD, |
| 604 | ("wapbl_discard: thread %d.%d discarding " |
| 605 | "transaction\n" |
| 606 | "\tbufcount=%zu bufbytes=%zu bcount=%zu " |
| 607 | "deallocs=%d inodes=%d\n" |
| 608 | "\terrcnt = %u, reclaimable=%zu reserved=%zu " |
| 609 | "unsynced=%zu\n" , |
| 610 | pid, lid, wl->wl_bufcount, wl->wl_bufbytes, |
| 611 | wl->wl_bcount, wl->wl_dealloccnt, |
| 612 | wl->wl_inohashcnt, wl->wl_error_count, |
| 613 | wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, |
| 614 | wl->wl_unsynced_bufbytes)); |
| 615 | SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { |
| 616 | WAPBL_PRINTF(WAPBL_PRINT_DISCARD, |
| 617 | ("\tentry: bufcount = %zu, reclaimable = %zu, " |
| 618 | "error = %d, unsynced = %zu\n" , |
| 619 | we->we_bufcount, we->we_reclaimable_bytes, |
| 620 | we->we_error, we->we_unsynced_bufbytes)); |
| 621 | } |
| 622 | #else /* !WAPBL_DEBUG_BUFBYTES */ |
| 623 | WAPBL_PRINTF(WAPBL_PRINT_DISCARD, |
| 624 | ("wapbl_discard: thread %d.%d discarding transaction\n" |
| 625 | "\tbufcount=%zu bufbytes=%zu bcount=%zu " |
| 626 | "deallocs=%d inodes=%d\n" |
| 627 | "\terrcnt = %u, reclaimable=%zu reserved=%zu\n" , |
| 628 | pid, lid, wl->wl_bufcount, wl->wl_bufbytes, |
| 629 | wl->wl_bcount, wl->wl_dealloccnt, |
| 630 | wl->wl_inohashcnt, wl->wl_error_count, |
| 631 | wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); |
| 632 | SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { |
| 633 | WAPBL_PRINTF(WAPBL_PRINT_DISCARD, |
| 634 | ("\tentry: bufcount = %zu, reclaimable = %zu, " |
| 635 | "error = %d\n" , |
| 636 | we->we_bufcount, we->we_reclaimable_bytes, |
| 637 | we->we_error)); |
| 638 | } |
| 639 | #endif /* !WAPBL_DEBUG_BUFBYTES */ |
| 640 | } |
| 641 | #endif /* WAPBL_DEBUG_PRINT */ |
| 642 | |
| 643 | for (i = 0; i <= wl->wl_inohashmask; i++) { |
| 644 | struct wapbl_ino_head *wih; |
| 645 | struct wapbl_ino *wi; |
| 646 | |
| 647 | wih = &wl->wl_inohash[i]; |
| 648 | while ((wi = LIST_FIRST(wih)) != NULL) { |
| 649 | LIST_REMOVE(wi, wi_hash); |
| 650 | pool_put(&wapbl_ino_pool, wi); |
| 651 | KASSERT(wl->wl_inohashcnt > 0); |
| 652 | wl->wl_inohashcnt--; |
| 653 | } |
| 654 | } |
| 655 | |
| 656 | /* |
| 657 | * clean buffer list |
| 658 | */ |
| 659 | mutex_enter(&bufcache_lock); |
| 660 | mutex_enter(&wl->wl_mtx); |
| 661 | while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { |
| 662 | if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { |
| 663 | /* |
| 664 | * The buffer will be unlocked and |
| 665 | * removed from the transaction in brelse |
| 666 | */ |
| 667 | mutex_exit(&wl->wl_mtx); |
| 668 | brelsel(bp, 0); |
| 669 | mutex_enter(&wl->wl_mtx); |
| 670 | } |
| 671 | } |
| 672 | mutex_exit(&wl->wl_mtx); |
| 673 | mutex_exit(&bufcache_lock); |
| 674 | |
| 675 | /* |
| 676 | * Remove references to this wl from wl_entries, free any which |
| 677 | * no longer have buffers, others will be freed in wapbl_biodone |
| 678 | * when they no longer have any buffers. |
| 679 | */ |
| 680 | while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { |
| 681 | SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); |
| 682 | /* XXX should we be accumulating wl_error_count |
| 683 | * and increasing reclaimable bytes ? */ |
| 684 | we->we_wapbl = NULL; |
| 685 | if (we->we_bufcount == 0) { |
| 686 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 687 | KASSERT(we->we_unsynced_bufbytes == 0); |
| 688 | #endif |
| 689 | pool_put(&wapbl_entry_pool, we); |
| 690 | } |
| 691 | } |
| 692 | |
| 693 | /* Discard list of deallocs */ |
| 694 | while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) |
| 695 | wapbl_deallocation_free(wl, wd, true); |
| 696 | |
| 697 | /* XXX should we clear wl_reserved_bytes? */ |
| 698 | |
| 699 | KASSERT(wl->wl_bufbytes == 0); |
| 700 | KASSERT(wl->wl_bcount == 0); |
| 701 | KASSERT(wl->wl_bufcount == 0); |
| 702 | KASSERT(LIST_EMPTY(&wl->wl_bufs)); |
| 703 | KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); |
| 704 | KASSERT(wl->wl_inohashcnt == 0); |
| 705 | KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist)); |
| 706 | KASSERT(wl->wl_dealloccnt == 0); |
| 707 | |
| 708 | rw_exit(&wl->wl_rwlock); |
| 709 | } |
| 710 | |
| 711 | int |
| 712 | wapbl_stop(struct wapbl *wl, int force) |
| 713 | { |
| 714 | int error; |
| 715 | |
| 716 | WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n" )); |
| 717 | error = wapbl_flush(wl, 1); |
| 718 | if (error) { |
| 719 | if (force) |
| 720 | wapbl_discard(wl); |
| 721 | else |
| 722 | return error; |
| 723 | } |
| 724 | |
| 725 | /* Unlinked inodes persist after a flush */ |
| 726 | if (wl->wl_inohashcnt) { |
| 727 | if (force) { |
| 728 | wapbl_discard(wl); |
| 729 | } else { |
| 730 | return EBUSY; |
| 731 | } |
| 732 | } |
| 733 | |
| 734 | KASSERT(wl->wl_bufbytes == 0); |
| 735 | KASSERT(wl->wl_bcount == 0); |
| 736 | KASSERT(wl->wl_bufcount == 0); |
| 737 | KASSERT(LIST_EMPTY(&wl->wl_bufs)); |
| 738 | KASSERT(wl->wl_dealloccnt == 0); |
| 739 | KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); |
| 740 | KASSERT(wl->wl_inohashcnt == 0); |
| 741 | KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist)); |
| 742 | KASSERT(wl->wl_dealloccnt == 0); |
| 743 | |
| 744 | wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); |
| 745 | wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); |
| 746 | wapbl_free(wl->wl_buffer, MAXPHYS); |
| 747 | wapbl_inodetrk_free(wl); |
| 748 | |
| 749 | cv_destroy(&wl->wl_reclaimable_cv); |
| 750 | mutex_destroy(&wl->wl_mtx); |
| 751 | rw_destroy(&wl->wl_rwlock); |
| 752 | wapbl_free(wl, sizeof(*wl)); |
| 753 | |
| 754 | return 0; |
| 755 | } |
| 756 | |
| 757 | /****************************************************************/ |
| 758 | /* |
| 759 | * Unbuffered disk I/O |
| 760 | */ |
| 761 | |
| 762 | static int |
| 763 | wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) |
| 764 | { |
| 765 | struct pstats *pstats = curlwp->l_proc->p_stats; |
| 766 | struct buf *bp; |
| 767 | int error; |
| 768 | |
| 769 | KASSERT((flags & ~(B_WRITE | B_READ)) == 0); |
| 770 | KASSERT(devvp->v_type == VBLK); |
| 771 | |
| 772 | if ((flags & (B_WRITE | B_READ)) == B_WRITE) { |
| 773 | mutex_enter(devvp->v_interlock); |
| 774 | devvp->v_numoutput++; |
| 775 | mutex_exit(devvp->v_interlock); |
| 776 | pstats->p_ru.ru_oublock++; |
| 777 | } else { |
| 778 | pstats->p_ru.ru_inblock++; |
| 779 | } |
| 780 | |
| 781 | bp = getiobuf(devvp, true); |
| 782 | bp->b_flags = flags; |
| 783 | bp->b_cflags = BC_BUSY; /* silly & dubious */ |
| 784 | bp->b_dev = devvp->v_rdev; |
| 785 | bp->b_data = data; |
| 786 | bp->b_bufsize = bp->b_resid = bp->b_bcount = len; |
| 787 | bp->b_blkno = pbn; |
| 788 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
| 789 | |
| 790 | WAPBL_PRINTF(WAPBL_PRINT_IO, |
| 791 | ("wapbl_doio: %s %d bytes at block %" PRId64" on dev 0x%" PRIx64"\n" , |
| 792 | BUF_ISWRITE(bp) ? "write" : "read" , bp->b_bcount, |
| 793 | bp->b_blkno, bp->b_dev)); |
| 794 | |
| 795 | VOP_STRATEGY(devvp, bp); |
| 796 | |
| 797 | error = biowait(bp); |
| 798 | putiobuf(bp); |
| 799 | |
| 800 | if (error) { |
| 801 | WAPBL_PRINTF(WAPBL_PRINT_ERROR, |
| 802 | ("wapbl_doio: %s %zu bytes at block %" PRId64 |
| 803 | " on dev 0x%" PRIx64" failed with error %d\n" , |
| 804 | (((flags & (B_WRITE | B_READ)) == B_WRITE) ? |
| 805 | "write" : "read" ), |
| 806 | len, pbn, devvp->v_rdev, error)); |
| 807 | } |
| 808 | |
| 809 | return error; |
| 810 | } |
| 811 | |
| 812 | /* |
| 813 | * wapbl_write(data, len, devvp, pbn) |
| 814 | * |
| 815 | * Synchronously write len bytes from data to physical block pbn |
| 816 | * on devvp. |
| 817 | */ |
| 818 | int |
| 819 | wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) |
| 820 | { |
| 821 | |
| 822 | return wapbl_doio(data, len, devvp, pbn, B_WRITE); |
| 823 | } |
| 824 | |
| 825 | /* |
| 826 | * wapbl_read(data, len, devvp, pbn) |
| 827 | * |
| 828 | * Synchronously read len bytes into data from physical block pbn |
| 829 | * on devvp. |
| 830 | */ |
| 831 | int |
| 832 | wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) |
| 833 | { |
| 834 | |
| 835 | return wapbl_doio(data, len, devvp, pbn, B_READ); |
| 836 | } |
| 837 | |
| 838 | /****************************************************************/ |
| 839 | /* |
| 840 | * Buffered disk writes -- try to coalesce writes and emit |
| 841 | * MAXPHYS-aligned blocks. |
| 842 | */ |
| 843 | |
| 844 | /* |
| 845 | * wapbl_buffered_flush(wl) |
| 846 | * |
| 847 | * Flush any buffered writes from wapbl_buffered_write. |
| 848 | */ |
| 849 | static int |
| 850 | wapbl_buffered_flush(struct wapbl *wl) |
| 851 | { |
| 852 | int error; |
| 853 | |
| 854 | if (wl->wl_buffer_used == 0) |
| 855 | return 0; |
| 856 | |
| 857 | error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, |
| 858 | wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); |
| 859 | wl->wl_buffer_used = 0; |
| 860 | |
| 861 | return error; |
| 862 | } |
| 863 | |
| 864 | /* |
| 865 | * wapbl_buffered_write(data, len, wl, pbn) |
| 866 | * |
| 867 | * Write len bytes from data to physical block pbn on |
| 868 | * wl->wl_devvp. The write may not complete until |
| 869 | * wapbl_buffered_flush. |
| 870 | */ |
| 871 | static int |
| 872 | wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn) |
| 873 | { |
| 874 | int error; |
| 875 | size_t resid; |
| 876 | |
| 877 | /* |
| 878 | * If not adjacent to buffered data flush first. Disk block |
| 879 | * address is always valid for non-empty buffer. |
| 880 | */ |
| 881 | if (wl->wl_buffer_used > 0 && |
| 882 | pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) { |
| 883 | error = wapbl_buffered_flush(wl); |
| 884 | if (error) |
| 885 | return error; |
| 886 | } |
| 887 | /* |
| 888 | * If this write goes to an empty buffer we have to |
| 889 | * save the disk block address first. |
| 890 | */ |
| 891 | if (wl->wl_buffer_used == 0) |
| 892 | wl->wl_buffer_dblk = pbn; |
| 893 | /* |
| 894 | * Remaining space so this buffer ends on a MAXPHYS boundary. |
| 895 | * |
| 896 | * Cannot become less or equal zero as the buffer would have been |
| 897 | * flushed on the last call then. |
| 898 | */ |
| 899 | resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) - |
| 900 | wl->wl_buffer_used; |
| 901 | KASSERT(resid > 0); |
| 902 | KASSERT(dbtob(btodb(resid)) == resid); |
| 903 | if (len >= resid) { |
| 904 | memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid); |
| 905 | wl->wl_buffer_used += resid; |
| 906 | error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, |
| 907 | wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); |
| 908 | data = (uint8_t *)data + resid; |
| 909 | len -= resid; |
| 910 | wl->wl_buffer_dblk = pbn + btodb(resid); |
| 911 | wl->wl_buffer_used = 0; |
| 912 | if (error) |
| 913 | return error; |
| 914 | } |
| 915 | KASSERT(len < MAXPHYS); |
| 916 | if (len > 0) { |
| 917 | memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len); |
| 918 | wl->wl_buffer_used += len; |
| 919 | } |
| 920 | |
| 921 | return 0; |
| 922 | } |
| 923 | |
| 924 | /* |
| 925 | * wapbl_circ_write(wl, data, len, offp) |
| 926 | * |
| 927 | * Write len bytes from data to the circular queue of wl, starting |
| 928 | * at linear byte offset *offp, and returning the new linear byte |
| 929 | * offset in *offp. |
| 930 | * |
| 931 | * If the starting linear byte offset precedes wl->wl_circ_off, |
| 932 | * the write instead begins at wl->wl_circ_off. XXX WTF? This |
| 933 | * should be a KASSERT, not a conditional. |
| 934 | * |
| 935 | * The write is buffered in wl and must be flushed with |
| 936 | * wapbl_buffered_flush before it will be submitted to the disk. |
| 937 | */ |
| 938 | static int |
| 939 | wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) |
| 940 | { |
| 941 | size_t slen; |
| 942 | off_t off = *offp; |
| 943 | int error; |
| 944 | daddr_t pbn; |
| 945 | |
| 946 | KDASSERT(((len >> wl->wl_log_dev_bshift) << |
| 947 | wl->wl_log_dev_bshift) == len); |
| 948 | |
| 949 | if (off < wl->wl_circ_off) |
| 950 | off = wl->wl_circ_off; |
| 951 | slen = wl->wl_circ_off + wl->wl_circ_size - off; |
| 952 | if (slen < len) { |
| 953 | pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); |
| 954 | #ifdef _KERNEL |
| 955 | pbn = btodb(pbn << wl->wl_log_dev_bshift); |
| 956 | #endif |
| 957 | error = wapbl_buffered_write(data, slen, wl, pbn); |
| 958 | if (error) |
| 959 | return error; |
| 960 | data = (uint8_t *)data + slen; |
| 961 | len -= slen; |
| 962 | off = wl->wl_circ_off; |
| 963 | } |
| 964 | pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); |
| 965 | #ifdef _KERNEL |
| 966 | pbn = btodb(pbn << wl->wl_log_dev_bshift); |
| 967 | #endif |
| 968 | error = wapbl_buffered_write(data, len, wl, pbn); |
| 969 | if (error) |
| 970 | return error; |
| 971 | off += len; |
| 972 | if (off >= wl->wl_circ_off + wl->wl_circ_size) |
| 973 | off = wl->wl_circ_off; |
| 974 | *offp = off; |
| 975 | return 0; |
| 976 | } |
| 977 | |
| 978 | /****************************************************************/ |
| 979 | /* |
| 980 | * WAPBL transactions: entering, adding/removing bufs, and exiting |
| 981 | */ |
| 982 | |
| 983 | int |
| 984 | wapbl_begin(struct wapbl *wl, const char *file, int line) |
| 985 | { |
| 986 | int doflush; |
| 987 | unsigned lockcount; |
| 988 | |
| 989 | KDASSERT(wl); |
| 990 | |
| 991 | /* |
| 992 | * XXX this needs to be made much more sophisticated. |
| 993 | * perhaps each wapbl_begin could reserve a specified |
| 994 | * number of buffers and bytes. |
| 995 | */ |
| 996 | mutex_enter(&wl->wl_mtx); |
| 997 | lockcount = wl->wl_lock_count; |
| 998 | doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > |
| 999 | wl->wl_bufbytes_max / 2) || |
| 1000 | ((wl->wl_bufcount + (lockcount * 10)) > |
| 1001 | wl->wl_bufcount_max / 2) || |
| 1002 | (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || |
| 1003 | (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); |
| 1004 | mutex_exit(&wl->wl_mtx); |
| 1005 | |
| 1006 | if (doflush) { |
| 1007 | WAPBL_PRINTF(WAPBL_PRINT_FLUSH, |
| 1008 | ("force flush lockcnt=%d bufbytes=%zu " |
| 1009 | "(max=%zu) bufcount=%zu (max=%zu) " |
| 1010 | "dealloccnt %d (lim=%d)\n" , |
| 1011 | lockcount, wl->wl_bufbytes, |
| 1012 | wl->wl_bufbytes_max, wl->wl_bufcount, |
| 1013 | wl->wl_bufcount_max, |
| 1014 | wl->wl_dealloccnt, wl->wl_dealloclim)); |
| 1015 | } |
| 1016 | |
| 1017 | if (doflush) { |
| 1018 | int error = wapbl_flush(wl, 0); |
| 1019 | if (error) |
| 1020 | return error; |
| 1021 | } |
| 1022 | |
| 1023 | rw_enter(&wl->wl_rwlock, RW_READER); |
| 1024 | mutex_enter(&wl->wl_mtx); |
| 1025 | wl->wl_lock_count++; |
| 1026 | mutex_exit(&wl->wl_mtx); |
| 1027 | |
| 1028 | #if defined(WAPBL_DEBUG_PRINT) |
| 1029 | WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, |
| 1030 | ("wapbl_begin thread %d.%d with bufcount=%zu " |
| 1031 | "bufbytes=%zu bcount=%zu at %s:%d\n" , |
| 1032 | curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, |
| 1033 | wl->wl_bufbytes, wl->wl_bcount, file, line)); |
| 1034 | #endif |
| 1035 | |
| 1036 | return 0; |
| 1037 | } |
| 1038 | |
| 1039 | void |
| 1040 | wapbl_end(struct wapbl *wl) |
| 1041 | { |
| 1042 | |
| 1043 | #if defined(WAPBL_DEBUG_PRINT) |
| 1044 | WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, |
| 1045 | ("wapbl_end thread %d.%d with bufcount=%zu " |
| 1046 | "bufbytes=%zu bcount=%zu\n" , |
| 1047 | curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, |
| 1048 | wl->wl_bufbytes, wl->wl_bcount)); |
| 1049 | #endif |
| 1050 | |
| 1051 | /* |
| 1052 | * XXX this could be handled more gracefully, perhaps place |
| 1053 | * only a partial transaction in the log and allow the |
| 1054 | * remaining to flush without the protection of the journal. |
| 1055 | */ |
| 1056 | KASSERTMSG((wapbl_transaction_len(wl) <= |
| 1057 | (wl->wl_circ_size - wl->wl_reserved_bytes)), |
| 1058 | "wapbl_end: current transaction too big to flush" ); |
| 1059 | |
| 1060 | mutex_enter(&wl->wl_mtx); |
| 1061 | KASSERT(wl->wl_lock_count > 0); |
| 1062 | wl->wl_lock_count--; |
| 1063 | mutex_exit(&wl->wl_mtx); |
| 1064 | |
| 1065 | rw_exit(&wl->wl_rwlock); |
| 1066 | } |
| 1067 | |
| 1068 | void |
| 1069 | wapbl_add_buf(struct wapbl *wl, struct buf * bp) |
| 1070 | { |
| 1071 | |
| 1072 | KASSERT(bp->b_cflags & BC_BUSY); |
| 1073 | KASSERT(bp->b_vp); |
| 1074 | |
| 1075 | wapbl_jlock_assert(wl); |
| 1076 | |
| 1077 | #if 0 |
| 1078 | /* |
| 1079 | * XXX this might be an issue for swapfiles. |
| 1080 | * see uvm_swap.c:1702 |
| 1081 | * |
| 1082 | * XXX2 why require it then? leap of semantics? |
| 1083 | */ |
| 1084 | KASSERT((bp->b_cflags & BC_NOCACHE) == 0); |
| 1085 | #endif |
| 1086 | |
| 1087 | mutex_enter(&wl->wl_mtx); |
| 1088 | if (bp->b_flags & B_LOCKED) { |
| 1089 | LIST_REMOVE(bp, b_wapbllist); |
| 1090 | WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, |
| 1091 | ("wapbl_add_buf thread %d.%d re-adding buf %p " |
| 1092 | "with %d bytes %d bcount\n" , |
| 1093 | curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, |
| 1094 | bp->b_bcount)); |
| 1095 | } else { |
| 1096 | /* unlocked by dirty buffers shouldn't exist */ |
| 1097 | KASSERT(!(bp->b_oflags & BO_DELWRI)); |
| 1098 | wl->wl_bufbytes += bp->b_bufsize; |
| 1099 | wl->wl_bcount += bp->b_bcount; |
| 1100 | wl->wl_bufcount++; |
| 1101 | WAPBL_PRINTF(WAPBL_PRINT_BUFFER, |
| 1102 | ("wapbl_add_buf thread %d.%d adding buf %p " |
| 1103 | "with %d bytes %d bcount\n" , |
| 1104 | curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, |
| 1105 | bp->b_bcount)); |
| 1106 | } |
| 1107 | LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); |
| 1108 | mutex_exit(&wl->wl_mtx); |
| 1109 | |
| 1110 | bp->b_flags |= B_LOCKED; |
| 1111 | } |
| 1112 | |
| 1113 | static void |
| 1114 | wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) |
| 1115 | { |
| 1116 | |
| 1117 | KASSERT(mutex_owned(&wl->wl_mtx)); |
| 1118 | KASSERT(bp->b_cflags & BC_BUSY); |
| 1119 | wapbl_jlock_assert(wl); |
| 1120 | |
| 1121 | #if 0 |
| 1122 | /* |
| 1123 | * XXX this might be an issue for swapfiles. |
| 1124 | * see uvm_swap.c:1725 |
| 1125 | * |
| 1126 | * XXXdeux: see above |
| 1127 | */ |
| 1128 | KASSERT((bp->b_flags & BC_NOCACHE) == 0); |
| 1129 | #endif |
| 1130 | KASSERT(bp->b_flags & B_LOCKED); |
| 1131 | |
| 1132 | WAPBL_PRINTF(WAPBL_PRINT_BUFFER, |
| 1133 | ("wapbl_remove_buf thread %d.%d removing buf %p with " |
| 1134 | "%d bytes %d bcount\n" , |
| 1135 | curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); |
| 1136 | |
| 1137 | KASSERT(wl->wl_bufbytes >= bp->b_bufsize); |
| 1138 | wl->wl_bufbytes -= bp->b_bufsize; |
| 1139 | KASSERT(wl->wl_bcount >= bp->b_bcount); |
| 1140 | wl->wl_bcount -= bp->b_bcount; |
| 1141 | KASSERT(wl->wl_bufcount > 0); |
| 1142 | wl->wl_bufcount--; |
| 1143 | KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); |
| 1144 | KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); |
| 1145 | LIST_REMOVE(bp, b_wapbllist); |
| 1146 | |
| 1147 | bp->b_flags &= ~B_LOCKED; |
| 1148 | } |
| 1149 | |
| 1150 | /* called from brelsel() in vfs_bio among other places */ |
| 1151 | void |
| 1152 | wapbl_remove_buf(struct wapbl * wl, struct buf *bp) |
| 1153 | { |
| 1154 | |
| 1155 | mutex_enter(&wl->wl_mtx); |
| 1156 | wapbl_remove_buf_locked(wl, bp); |
| 1157 | mutex_exit(&wl->wl_mtx); |
| 1158 | } |
| 1159 | |
| 1160 | void |
| 1161 | wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) |
| 1162 | { |
| 1163 | |
| 1164 | KASSERT(bp->b_cflags & BC_BUSY); |
| 1165 | |
| 1166 | /* |
| 1167 | * XXX: why does this depend on B_LOCKED? otherwise the buf |
| 1168 | * is not for a transaction? if so, why is this called in the |
| 1169 | * first place? |
| 1170 | */ |
| 1171 | if (bp->b_flags & B_LOCKED) { |
| 1172 | mutex_enter(&wl->wl_mtx); |
| 1173 | wl->wl_bufbytes += bp->b_bufsize - oldsz; |
| 1174 | wl->wl_bcount += bp->b_bcount - oldcnt; |
| 1175 | mutex_exit(&wl->wl_mtx); |
| 1176 | } |
| 1177 | } |
| 1178 | |
| 1179 | #endif /* _KERNEL */ |
| 1180 | |
| 1181 | /****************************************************************/ |
| 1182 | /* Some utility inlines */ |
| 1183 | |
/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes in use, from tail up to head, in a circular
 *	queue holding avail bytes in total.  A tail of 0 denotes an
 *	empty queue (head must then be 0 as well) and yields 0;
 *	otherwise the result is in the range 1..avail.
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail != 0)
		return ((head + (avail - 1) - tail) % avail) + 1;

	KASSERT(head == 0);
	return 0;
}
| 1200 | |
| 1201 | #ifdef _KERNEL |
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Return the position delta bytes past oldoff in a circular
 *	queue of size bytes whose first byte is at offset off,
 *	wrapping around at off + size.  An oldoff of 0 is treated as
 *	"not yet positioned": advancing it by a non-zero delta starts
 *	from off.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0)) {
		/* First advance from the unpositioned state. */
		newoff = off + delta;
	} else if ((oldoff + delta) >= (size + off)) {
		/* Ran past the end: wrap around. */
		newoff = (oldoff + delta) - size;
	} else {
		newoff = oldoff + delta;
	}

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));

	return newoff;
}
| 1236 | |
/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes not in use in a circular queue holding avail
 *	bytes in total, where everything from tail to head is in use.
 *	This is avail minus wapbl_space_used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
	const size_t used = wapbl_space_used(avail, head, tail);

	return avail - used;
}
| 1249 | |
/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	Account for delta bytes appended at the head of a circular
 *	queue of size bytes starting at offset off.  *headp and *tailp
 *	hold the old head and tail on entry and the new ones on
 *	return.  If the tail was 0 and the new head is non-zero, the
 *	tail is set to off.
 *
 *	delta must not exceed the free space in the queue.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head, tail;

	head = *headp;
	tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);

	*headp = head;
	*tailp = ((tail == 0) && (head != 0)) ? (off_t)off : tail;
}
| 1272 | |
/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	Account for delta bytes removed from the tail of a circular
 *	queue of size bytes starting at offset off.  *headp and *tailp
 *	hold the old head and tail on entry and the new ones on
 *	return.  If the tail catches up with the head, both are reset
 *	to 0.
 *
 *	delta must not exceed the space in use in the queue.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t head, tail;

	head = *headp;
	tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);

	if (head == tail) {
		/* Queue drained. */
		head = 0;
		tail = 0;
	}

	*headp = head;
	*tailp = tail;
}
| 1296 | |
| 1297 | |
| 1298 | /****************************************************************/ |
| 1299 | |
| 1300 | /* |
| 1301 | * wapbl_truncate(wl, minfree) |
| 1302 | * |
| 1303 | * Wait until at least minfree bytes are available in the log. |
| 1304 | * |
| 1305 | * If it was necessary to wait for writes to complete, |
| 1306 | * advance the circular queue tail to reflect the new write |
| 1307 | * completions and issue a write commit to the log. |
| 1308 | * |
| 1309 | * => Caller must hold wl->wl_rwlock writer lock. |
| 1310 | */ |
| 1311 | static int |
| 1312 | wapbl_truncate(struct wapbl *wl, size_t minfree) |
| 1313 | { |
| 1314 | size_t delta; |
| 1315 | size_t avail; |
| 1316 | off_t head; |
| 1317 | off_t tail; |
| 1318 | int error = 0; |
| 1319 | |
| 1320 | KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); |
| 1321 | KASSERT(rw_write_held(&wl->wl_rwlock)); |
| 1322 | |
| 1323 | mutex_enter(&wl->wl_mtx); |
| 1324 | |
| 1325 | /* |
| 1326 | * First check to see if we have to do a commit |
| 1327 | * at all. |
| 1328 | */ |
| 1329 | avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); |
| 1330 | if (minfree < avail) { |
| 1331 | mutex_exit(&wl->wl_mtx); |
| 1332 | return 0; |
| 1333 | } |
| 1334 | minfree -= avail; |
| 1335 | while ((wl->wl_error_count == 0) && |
| 1336 | (wl->wl_reclaimable_bytes < minfree)) { |
| 1337 | WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, |
| 1338 | ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " |
| 1339 | "minfree=%zd\n" , |
| 1340 | &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, |
| 1341 | minfree)); |
| 1342 | |
| 1343 | cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); |
| 1344 | } |
| 1345 | if (wl->wl_reclaimable_bytes < minfree) { |
| 1346 | KASSERT(wl->wl_error_count); |
| 1347 | /* XXX maybe get actual error from buffer instead someday? */ |
| 1348 | error = EIO; |
| 1349 | } |
| 1350 | head = wl->wl_head; |
| 1351 | tail = wl->wl_tail; |
| 1352 | delta = wl->wl_reclaimable_bytes; |
| 1353 | |
| 1354 | /* If all of of the entries are flushed, then be sure to keep |
| 1355 | * the reserved bytes reserved. Watch out for discarded transactions, |
| 1356 | * which could leave more bytes reserved than are reclaimable. |
| 1357 | */ |
| 1358 | if (SIMPLEQ_EMPTY(&wl->wl_entries) && |
| 1359 | (delta >= wl->wl_reserved_bytes)) { |
| 1360 | delta -= wl->wl_reserved_bytes; |
| 1361 | } |
| 1362 | wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, |
| 1363 | &tail); |
| 1364 | KDASSERT(wl->wl_reserved_bytes <= |
| 1365 | wapbl_space_used(wl->wl_circ_size, head, tail)); |
| 1366 | mutex_exit(&wl->wl_mtx); |
| 1367 | |
| 1368 | if (error) |
| 1369 | return error; |
| 1370 | |
| 1371 | /* |
| 1372 | * This is where head, tail and delta are unprotected |
| 1373 | * from races against itself or flush. This is ok since |
| 1374 | * we only call this routine from inside flush itself. |
| 1375 | * |
| 1376 | * XXX: how can it race against itself when accessed only |
| 1377 | * from behind the write-locked rwlock? |
| 1378 | */ |
| 1379 | error = wapbl_write_commit(wl, head, tail); |
| 1380 | if (error) |
| 1381 | return error; |
| 1382 | |
| 1383 | wl->wl_head = head; |
| 1384 | wl->wl_tail = tail; |
| 1385 | |
| 1386 | mutex_enter(&wl->wl_mtx); |
| 1387 | KASSERT(wl->wl_reclaimable_bytes >= delta); |
| 1388 | wl->wl_reclaimable_bytes -= delta; |
| 1389 | mutex_exit(&wl->wl_mtx); |
| 1390 | WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, |
| 1391 | ("wapbl_truncate thread %d.%d truncating %zu bytes\n" , |
| 1392 | curproc->p_pid, curlwp->l_lid, delta)); |
| 1393 | |
| 1394 | return 0; |
| 1395 | } |
| 1396 | |
| 1397 | /****************************************************************/ |
| 1398 | |
| 1399 | void |
| 1400 | wapbl_biodone(struct buf *bp) |
| 1401 | { |
| 1402 | struct wapbl_entry *we = bp->b_private; |
| 1403 | struct wapbl *wl = we->we_wapbl; |
| 1404 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1405 | const int bufsize = bp->b_bufsize; |
| 1406 | #endif |
| 1407 | |
| 1408 | /* |
| 1409 | * Handle possible flushing of buffers after log has been |
| 1410 | * decomissioned. |
| 1411 | */ |
| 1412 | if (!wl) { |
| 1413 | KASSERT(we->we_bufcount > 0); |
| 1414 | we->we_bufcount--; |
| 1415 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1416 | KASSERT(we->we_unsynced_bufbytes >= bufsize); |
| 1417 | we->we_unsynced_bufbytes -= bufsize; |
| 1418 | #endif |
| 1419 | |
| 1420 | if (we->we_bufcount == 0) { |
| 1421 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1422 | KASSERT(we->we_unsynced_bufbytes == 0); |
| 1423 | #endif |
| 1424 | pool_put(&wapbl_entry_pool, we); |
| 1425 | } |
| 1426 | |
| 1427 | brelse(bp, 0); |
| 1428 | return; |
| 1429 | } |
| 1430 | |
| 1431 | #ifdef ohbother |
| 1432 | KDASSERT(bp->b_oflags & BO_DONE); |
| 1433 | KDASSERT(!(bp->b_oflags & BO_DELWRI)); |
| 1434 | KDASSERT(bp->b_flags & B_ASYNC); |
| 1435 | KDASSERT(bp->b_cflags & BC_BUSY); |
| 1436 | KDASSERT(!(bp->b_flags & B_LOCKED)); |
| 1437 | KDASSERT(!(bp->b_flags & B_READ)); |
| 1438 | KDASSERT(!(bp->b_cflags & BC_INVAL)); |
| 1439 | KDASSERT(!(bp->b_cflags & BC_NOCACHE)); |
| 1440 | #endif |
| 1441 | |
| 1442 | if (bp->b_error) { |
| 1443 | /* |
| 1444 | * If an error occurs, it would be nice to leave the buffer |
| 1445 | * as a delayed write on the LRU queue so that we can retry |
| 1446 | * it later. But buffercache(9) can't handle dirty buffer |
| 1447 | * reuse, so just mark the log permanently errored out. |
| 1448 | */ |
| 1449 | mutex_enter(&wl->wl_mtx); |
| 1450 | if (wl->wl_error_count == 0) { |
| 1451 | wl->wl_error_count++; |
| 1452 | cv_broadcast(&wl->wl_reclaimable_cv); |
| 1453 | } |
| 1454 | mutex_exit(&wl->wl_mtx); |
| 1455 | } |
| 1456 | |
| 1457 | /* |
| 1458 | * Release the buffer here. wapbl_flush() may wait for the |
| 1459 | * log to become empty and we better unbusy the buffer before |
| 1460 | * wapbl_flush() returns. |
| 1461 | */ |
| 1462 | brelse(bp, 0); |
| 1463 | |
| 1464 | mutex_enter(&wl->wl_mtx); |
| 1465 | |
| 1466 | KASSERT(we->we_bufcount > 0); |
| 1467 | we->we_bufcount--; |
| 1468 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1469 | KASSERT(we->we_unsynced_bufbytes >= bufsize); |
| 1470 | we->we_unsynced_bufbytes -= bufsize; |
| 1471 | KASSERT(wl->wl_unsynced_bufbytes >= bufsize); |
| 1472 | wl->wl_unsynced_bufbytes -= bufsize; |
| 1473 | #endif |
| 1474 | |
| 1475 | /* |
| 1476 | * If the current transaction can be reclaimed, start |
| 1477 | * at the beginning and reclaim any consecutive reclaimable |
| 1478 | * transactions. If we successfully reclaim anything, |
| 1479 | * then wakeup anyone waiting for the reclaim. |
| 1480 | */ |
| 1481 | if (we->we_bufcount == 0) { |
| 1482 | size_t delta = 0; |
| 1483 | int errcnt = 0; |
| 1484 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1485 | KDASSERT(we->we_unsynced_bufbytes == 0); |
| 1486 | #endif |
| 1487 | /* |
| 1488 | * clear any posted error, since the buffer it came from |
| 1489 | * has successfully flushed by now |
| 1490 | */ |
| 1491 | while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && |
| 1492 | (we->we_bufcount == 0)) { |
| 1493 | delta += we->we_reclaimable_bytes; |
| 1494 | if (we->we_error) |
| 1495 | errcnt++; |
| 1496 | SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); |
| 1497 | pool_put(&wapbl_entry_pool, we); |
| 1498 | } |
| 1499 | |
| 1500 | if (delta) { |
| 1501 | wl->wl_reclaimable_bytes += delta; |
| 1502 | KASSERT(wl->wl_error_count >= errcnt); |
| 1503 | wl->wl_error_count -= errcnt; |
| 1504 | cv_broadcast(&wl->wl_reclaimable_cv); |
| 1505 | } |
| 1506 | } |
| 1507 | |
| 1508 | mutex_exit(&wl->wl_mtx); |
| 1509 | } |
| 1510 | |
| 1511 | /* |
| 1512 | * wapbl_flush(wl, wait) |
| 1513 | * |
| 1514 | * Flush pending block writes, deallocations, and inodes from |
| 1515 | * the current transaction in memory to the log on disk: |
| 1516 | * |
| 1517 | * 1. Call the file system's wl_flush callback to flush any |
| 1518 | * per-file-system pending updates. |
| 1519 | * 2. Wait for enough space in the log for the current transaction. |
| 1520 | * 3. Synchronously write the new log records, advancing the |
| 1521 | * circular queue head. |
| 1522 | * 4. Issue the pending block writes asynchronously, now that they |
| 1523 | * are recorded in the log and can be replayed after crash. |
| 1524 | * 5. If wait is true, wait for all writes to complete and for the |
| 1525 | * log to become empty. |
| 1526 | * |
| 1527 | * On failure, call the file system's wl_flush_abort callback. |
| 1528 | */ |
| 1529 | int |
| 1530 | wapbl_flush(struct wapbl *wl, int waitfor) |
| 1531 | { |
| 1532 | struct buf *bp; |
| 1533 | struct wapbl_entry *we; |
| 1534 | off_t off; |
| 1535 | off_t head; |
| 1536 | off_t tail; |
| 1537 | size_t delta = 0; |
| 1538 | size_t flushsize; |
| 1539 | size_t reserved; |
| 1540 | int error = 0; |
| 1541 | |
| 1542 | /* |
| 1543 | * Do a quick check to see if a full flush can be skipped |
| 1544 | * This assumes that the flush callback does not need to be called |
| 1545 | * unless there are other outstanding bufs. |
| 1546 | */ |
| 1547 | if (!waitfor) { |
| 1548 | size_t nbufs; |
| 1549 | mutex_enter(&wl->wl_mtx); /* XXX need mutex here to |
| 1550 | protect the KASSERTS */ |
| 1551 | nbufs = wl->wl_bufcount; |
| 1552 | KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); |
| 1553 | KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); |
| 1554 | mutex_exit(&wl->wl_mtx); |
| 1555 | if (nbufs == 0) |
| 1556 | return 0; |
| 1557 | } |
| 1558 | |
| 1559 | /* |
| 1560 | * XXX we may consider using LK_UPGRADE here |
| 1561 | * if we want to call flush from inside a transaction |
| 1562 | */ |
| 1563 | rw_enter(&wl->wl_rwlock, RW_WRITER); |
| 1564 | wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist)); |
| 1565 | |
| 1566 | /* |
| 1567 | * Now that we are exclusively locked and the file system has |
| 1568 | * issued any deferred block writes for this transaction, check |
| 1569 | * whether there are any blocks to write to the log. If not, |
| 1570 | * skip waiting for space or writing any log entries. |
| 1571 | * |
| 1572 | * XXX Shouldn't this also check wl_dealloccnt and |
| 1573 | * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the |
| 1574 | * file system didn't produce any blocks as a consequence of |
| 1575 | * it, but the same does not seem to be so of wl_inohashcnt. |
| 1576 | */ |
| 1577 | if (wl->wl_bufcount == 0) { |
| 1578 | goto wait_out; |
| 1579 | } |
| 1580 | |
| 1581 | #if 0 |
| 1582 | WAPBL_PRINTF(WAPBL_PRINT_FLUSH, |
| 1583 | ("wapbl_flush thread %d.%d flushing entries with " |
| 1584 | "bufcount=%zu bufbytes=%zu\n" , |
| 1585 | curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, |
| 1586 | wl->wl_bufbytes)); |
| 1587 | #endif |
| 1588 | |
| 1589 | /* Calculate amount of space needed to flush */ |
| 1590 | flushsize = wapbl_transaction_len(wl); |
| 1591 | if (wapbl_verbose_commit) { |
| 1592 | struct timespec ts; |
| 1593 | getnanotime(&ts); |
| 1594 | printf("%s: %lld.%09ld this transaction = %zu bytes\n" , |
| 1595 | __func__, (long long)ts.tv_sec, |
| 1596 | (long)ts.tv_nsec, flushsize); |
| 1597 | } |
| 1598 | |
| 1599 | if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { |
| 1600 | /* |
| 1601 | * XXX this could be handled more gracefully, perhaps place |
| 1602 | * only a partial transaction in the log and allow the |
| 1603 | * remaining to flush without the protection of the journal. |
| 1604 | */ |
| 1605 | panic("wapbl_flush: current transaction too big to flush" ); |
| 1606 | } |
| 1607 | |
| 1608 | error = wapbl_truncate(wl, flushsize); |
| 1609 | if (error) |
| 1610 | goto out; |
| 1611 | |
| 1612 | off = wl->wl_head; |
| 1613 | KASSERT((off == 0) || (off >= wl->wl_circ_off)); |
| 1614 | KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size)); |
| 1615 | error = wapbl_write_blocks(wl, &off); |
| 1616 | if (error) |
| 1617 | goto out; |
| 1618 | error = wapbl_write_revocations(wl, &off); |
| 1619 | if (error) |
| 1620 | goto out; |
| 1621 | error = wapbl_write_inodes(wl, &off); |
| 1622 | if (error) |
| 1623 | goto out; |
| 1624 | |
| 1625 | reserved = 0; |
| 1626 | if (wl->wl_inohashcnt) |
| 1627 | reserved = wapbl_transaction_inodes_len(wl); |
| 1628 | |
| 1629 | head = wl->wl_head; |
| 1630 | tail = wl->wl_tail; |
| 1631 | |
| 1632 | wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, |
| 1633 | &head, &tail); |
| 1634 | |
| 1635 | KASSERTMSG(head == off, |
| 1636 | "lost head! head=%" PRIdMAX" tail=%" PRIdMAX |
| 1637 | " off=%" PRIdMAX" flush=%zu" , |
| 1638 | (intmax_t)head, (intmax_t)tail, (intmax_t)off, |
| 1639 | flushsize); |
| 1640 | |
| 1641 | /* Opportunistically move the tail forward if we can */ |
| 1642 | mutex_enter(&wl->wl_mtx); |
| 1643 | delta = wl->wl_reclaimable_bytes; |
| 1644 | mutex_exit(&wl->wl_mtx); |
| 1645 | wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, |
| 1646 | &head, &tail); |
| 1647 | |
| 1648 | error = wapbl_write_commit(wl, head, tail); |
| 1649 | if (error) |
| 1650 | goto out; |
| 1651 | |
| 1652 | we = pool_get(&wapbl_entry_pool, PR_WAITOK); |
| 1653 | |
| 1654 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1655 | WAPBL_PRINTF(WAPBL_PRINT_FLUSH, |
| 1656 | ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" |
| 1657 | " unsynced=%zu" |
| 1658 | "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " |
| 1659 | "inodes=%d\n" , |
| 1660 | curproc->p_pid, curlwp->l_lid, flushsize, delta, |
| 1661 | wapbl_space_used(wl->wl_circ_size, head, tail), |
| 1662 | wl->wl_unsynced_bufbytes, wl->wl_bufcount, |
| 1663 | wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, |
| 1664 | wl->wl_inohashcnt)); |
| 1665 | #else |
| 1666 | WAPBL_PRINTF(WAPBL_PRINT_FLUSH, |
| 1667 | ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" |
| 1668 | "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " |
| 1669 | "inodes=%d\n" , |
| 1670 | curproc->p_pid, curlwp->l_lid, flushsize, delta, |
| 1671 | wapbl_space_used(wl->wl_circ_size, head, tail), |
| 1672 | wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, |
| 1673 | wl->wl_dealloccnt, wl->wl_inohashcnt)); |
| 1674 | #endif |
| 1675 | |
| 1676 | |
| 1677 | mutex_enter(&bufcache_lock); |
| 1678 | mutex_enter(&wl->wl_mtx); |
| 1679 | |
| 1680 | wl->wl_reserved_bytes = reserved; |
| 1681 | wl->wl_head = head; |
| 1682 | wl->wl_tail = tail; |
| 1683 | KASSERT(wl->wl_reclaimable_bytes >= delta); |
| 1684 | wl->wl_reclaimable_bytes -= delta; |
| 1685 | KDASSERT(wl->wl_dealloccnt == 0); |
| 1686 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1687 | wl->wl_unsynced_bufbytes += wl->wl_bufbytes; |
| 1688 | #endif |
| 1689 | |
| 1690 | we->we_wapbl = wl; |
| 1691 | we->we_bufcount = wl->wl_bufcount; |
| 1692 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1693 | we->we_unsynced_bufbytes = wl->wl_bufbytes; |
| 1694 | #endif |
| 1695 | we->we_reclaimable_bytes = flushsize; |
| 1696 | we->we_error = 0; |
| 1697 | SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); |
| 1698 | |
| 1699 | /* |
| 1700 | * this flushes bufs in reverse order than they were queued |
| 1701 | * it shouldn't matter, but if we care we could use TAILQ instead. |
| 1702 | * XXX Note they will get put on the lru queue when they flush |
| 1703 | * so we might actually want to change this to preserve order. |
| 1704 | */ |
| 1705 | while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { |
| 1706 | if (bbusy(bp, 0, 0, &wl->wl_mtx)) { |
| 1707 | continue; |
| 1708 | } |
| 1709 | bp->b_iodone = wapbl_biodone; |
| 1710 | bp->b_private = we; |
| 1711 | bremfree(bp); |
| 1712 | wapbl_remove_buf_locked(wl, bp); |
| 1713 | mutex_exit(&wl->wl_mtx); |
| 1714 | mutex_exit(&bufcache_lock); |
| 1715 | bawrite(bp); |
| 1716 | mutex_enter(&bufcache_lock); |
| 1717 | mutex_enter(&wl->wl_mtx); |
| 1718 | } |
| 1719 | mutex_exit(&wl->wl_mtx); |
| 1720 | mutex_exit(&bufcache_lock); |
| 1721 | |
| 1722 | #if 0 |
| 1723 | WAPBL_PRINTF(WAPBL_PRINT_FLUSH, |
| 1724 | ("wapbl_flush thread %d.%d done flushing entries...\n" , |
| 1725 | curproc->p_pid, curlwp->l_lid)); |
| 1726 | #endif |
| 1727 | |
| 1728 | wait_out: |
| 1729 | |
| 1730 | /* |
| 1731 | * If the waitfor flag is set, don't return until everything is |
| 1732 | * fully flushed and the on disk log is empty. |
| 1733 | */ |
| 1734 | if (waitfor) { |
| 1735 | error = wapbl_truncate(wl, wl->wl_circ_size - |
| 1736 | wl->wl_reserved_bytes); |
| 1737 | } |
| 1738 | |
| 1739 | out: |
| 1740 | if (error) { |
| 1741 | wl->wl_flush_abort(wl->wl_mount, |
| 1742 | TAILQ_FIRST(&wl->wl_dealloclist)); |
| 1743 | } |
| 1744 | |
| 1745 | #ifdef WAPBL_DEBUG_PRINT |
| 1746 | if (error) { |
| 1747 | pid_t pid = -1; |
| 1748 | lwpid_t lid = -1; |
| 1749 | if (curproc) |
| 1750 | pid = curproc->p_pid; |
| 1751 | if (curlwp) |
| 1752 | lid = curlwp->l_lid; |
| 1753 | mutex_enter(&wl->wl_mtx); |
| 1754 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1755 | WAPBL_PRINTF(WAPBL_PRINT_ERROR, |
| 1756 | ("wapbl_flush: thread %d.%d aborted flush: " |
| 1757 | "error = %d\n" |
| 1758 | "\tbufcount=%zu bufbytes=%zu bcount=%zu " |
| 1759 | "deallocs=%d inodes=%d\n" |
| 1760 | "\terrcnt = %d, reclaimable=%zu reserved=%zu " |
| 1761 | "unsynced=%zu\n" , |
| 1762 | pid, lid, error, wl->wl_bufcount, |
| 1763 | wl->wl_bufbytes, wl->wl_bcount, |
| 1764 | wl->wl_dealloccnt, wl->wl_inohashcnt, |
| 1765 | wl->wl_error_count, wl->wl_reclaimable_bytes, |
| 1766 | wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); |
| 1767 | SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { |
| 1768 | WAPBL_PRINTF(WAPBL_PRINT_ERROR, |
| 1769 | ("\tentry: bufcount = %zu, reclaimable = %zu, " |
| 1770 | "error = %d, unsynced = %zu\n" , |
| 1771 | we->we_bufcount, we->we_reclaimable_bytes, |
| 1772 | we->we_error, we->we_unsynced_bufbytes)); |
| 1773 | } |
| 1774 | #else |
| 1775 | WAPBL_PRINTF(WAPBL_PRINT_ERROR, |
| 1776 | ("wapbl_flush: thread %d.%d aborted flush: " |
| 1777 | "error = %d\n" |
| 1778 | "\tbufcount=%zu bufbytes=%zu bcount=%zu " |
| 1779 | "deallocs=%d inodes=%d\n" |
| 1780 | "\terrcnt = %d, reclaimable=%zu reserved=%zu\n" , |
| 1781 | pid, lid, error, wl->wl_bufcount, |
| 1782 | wl->wl_bufbytes, wl->wl_bcount, |
| 1783 | wl->wl_dealloccnt, wl->wl_inohashcnt, |
| 1784 | wl->wl_error_count, wl->wl_reclaimable_bytes, |
| 1785 | wl->wl_reserved_bytes)); |
| 1786 | SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { |
| 1787 | WAPBL_PRINTF(WAPBL_PRINT_ERROR, |
| 1788 | ("\tentry: bufcount = %zu, reclaimable = %zu, " |
| 1789 | "error = %d\n" , we->we_bufcount, |
| 1790 | we->we_reclaimable_bytes, we->we_error)); |
| 1791 | } |
| 1792 | #endif |
| 1793 | mutex_exit(&wl->wl_mtx); |
| 1794 | } |
| 1795 | #endif |
| 1796 | |
| 1797 | rw_exit(&wl->wl_rwlock); |
| 1798 | return error; |
| 1799 | } |
| 1800 | |
| 1801 | /****************************************************************/ |
| 1802 | |
| 1803 | void |
| 1804 | wapbl_jlock_assert(struct wapbl *wl) |
| 1805 | { |
| 1806 | |
| 1807 | KASSERT(rw_lock_held(&wl->wl_rwlock)); |
| 1808 | } |
| 1809 | |
| 1810 | void |
| 1811 | wapbl_junlock_assert(struct wapbl *wl) |
| 1812 | { |
| 1813 | |
| 1814 | KASSERT(!rw_write_held(&wl->wl_rwlock)); |
| 1815 | } |
| 1816 | |
| 1817 | /****************************************************************/ |
| 1818 | |
| 1819 | /* locks missing */ |
| 1820 | void |
| 1821 | wapbl_print(struct wapbl *wl, |
| 1822 | int full, |
| 1823 | void (*pr)(const char *, ...)) |
| 1824 | { |
| 1825 | struct buf *bp; |
| 1826 | struct wapbl_entry *we; |
| 1827 | (*pr)("wapbl %p" , wl); |
| 1828 | (*pr)("\nlogvp = %p, devvp = %p, logpbn = %" PRId64"\n" , |
| 1829 | wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); |
| 1830 | (*pr)("circ = %zu, header = %zu, head = %" PRIdMAX" tail = %" PRIdMAX"\n" , |
| 1831 | wl->wl_circ_size, wl->wl_circ_off, |
| 1832 | (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); |
| 1833 | (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n" , |
| 1834 | wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); |
| 1835 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1836 | (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " |
| 1837 | "reserved = %zu errcnt = %d unsynced = %zu\n" , |
| 1838 | wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, |
| 1839 | wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, |
| 1840 | wl->wl_error_count, wl->wl_unsynced_bufbytes); |
| 1841 | #else |
| 1842 | (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " |
| 1843 | "reserved = %zu errcnt = %d\n" , wl->wl_bufcount, wl->wl_bufbytes, |
| 1844 | wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, |
| 1845 | wl->wl_error_count); |
| 1846 | #endif |
| 1847 | (*pr)("\tdealloccnt = %d, dealloclim = %d\n" , |
| 1848 | wl->wl_dealloccnt, wl->wl_dealloclim); |
| 1849 | (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n" , |
| 1850 | wl->wl_inohashcnt, wl->wl_inohashmask); |
| 1851 | (*pr)("entries:\n" ); |
| 1852 | SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { |
| 1853 | #ifdef WAPBL_DEBUG_BUFBYTES |
| 1854 | (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " |
| 1855 | "unsynced = %zu\n" , |
| 1856 | we->we_bufcount, we->we_reclaimable_bytes, |
| 1857 | we->we_error, we->we_unsynced_bufbytes); |
| 1858 | #else |
| 1859 | (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n" , |
| 1860 | we->we_bufcount, we->we_reclaimable_bytes, we->we_error); |
| 1861 | #endif |
| 1862 | } |
| 1863 | if (full) { |
| 1864 | int cnt = 0; |
| 1865 | (*pr)("bufs =" ); |
| 1866 | LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { |
| 1867 | if (!LIST_NEXT(bp, b_wapbllist)) { |
| 1868 | (*pr)(" %p" , bp); |
| 1869 | } else if ((++cnt % 6) == 0) { |
| 1870 | (*pr)(" %p,\n\t" , bp); |
| 1871 | } else { |
| 1872 | (*pr)(" %p," , bp); |
| 1873 | } |
| 1874 | } |
| 1875 | (*pr)("\n" ); |
| 1876 | |
| 1877 | (*pr)("dealloced blks = " ); |
| 1878 | { |
| 1879 | struct wapbl_dealloc *wd; |
| 1880 | cnt = 0; |
| 1881 | TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) { |
| 1882 | (*pr)(" %" PRId64":%d," , |
| 1883 | wd->wd_blkno, |
| 1884 | wd->wd_len); |
| 1885 | if ((++cnt % 4) == 0) { |
| 1886 | (*pr)("\n\t" ); |
| 1887 | } |
| 1888 | } |
| 1889 | } |
| 1890 | (*pr)("\n" ); |
| 1891 | |
| 1892 | (*pr)("registered inodes = " ); |
| 1893 | { |
| 1894 | int i; |
| 1895 | cnt = 0; |
| 1896 | for (i = 0; i <= wl->wl_inohashmask; i++) { |
| 1897 | struct wapbl_ino_head *wih; |
| 1898 | struct wapbl_ino *wi; |
| 1899 | |
| 1900 | wih = &wl->wl_inohash[i]; |
| 1901 | LIST_FOREACH(wi, wih, wi_hash) { |
| 1902 | if (wi->wi_ino == 0) |
| 1903 | continue; |
| 1904 | (*pr)(" %" PRIu64"/0%06" PRIo32"," , |
| 1905 | wi->wi_ino, wi->wi_mode); |
| 1906 | if ((++cnt % 4) == 0) { |
| 1907 | (*pr)("\n\t" ); |
| 1908 | } |
| 1909 | } |
| 1910 | } |
| 1911 | (*pr)("\n" ); |
| 1912 | } |
| 1913 | } |
| 1914 | } |
| 1915 | |
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * wapbl_dump(wl)
 *
 *	Print the full state of wl via printf, using wapbl_print with
 *	full = 1.  When built with WAPBL_DEBUG, a NULL wl selects
 *	wapbl_debug_wl instead.  Does nothing if no log is available.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (wl == NULL)
		wl = wapbl_debug_wl;
#endif
	if (wl != NULL)
		wapbl_print(wl, 1, printf);
}
#endif
| 1929 | |
| 1930 | /****************************************************************/ |
| 1931 | |
| 1932 | int |
| 1933 | wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force, |
| 1934 | void **cookiep) |
| 1935 | { |
| 1936 | struct wapbl_dealloc *wd; |
| 1937 | int error = 0; |
| 1938 | |
| 1939 | wapbl_jlock_assert(wl); |
| 1940 | |
| 1941 | mutex_enter(&wl->wl_mtx); |
| 1942 | |
| 1943 | if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) { |
| 1944 | if (!force) { |
| 1945 | error = EAGAIN; |
| 1946 | goto out; |
| 1947 | } |
| 1948 | |
| 1949 | /* |
| 1950 | * Forced registration can only be used when: |
| 1951 | * 1) the caller can't cope with failure |
| 1952 | * 2) the path can be triggered only bounded, small |
| 1953 | * times per transaction |
| 1954 | * If this is not fullfilled, and the path would be triggered |
| 1955 | * many times, this could overflow maximum transaction size |
| 1956 | * and panic later. |
| 1957 | */ |
| 1958 | printf("%s: forced dealloc registration over limit: %d >= %d\n" , |
| 1959 | wl->wl_mount->mnt_stat.f_mntonname, |
| 1960 | wl->wl_dealloccnt, wl->wl_dealloclim); |
| 1961 | } |
| 1962 | |
| 1963 | wl->wl_dealloccnt++; |
| 1964 | mutex_exit(&wl->wl_mtx); |
| 1965 | |
| 1966 | wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK); |
| 1967 | wd->wd_blkno = blk; |
| 1968 | wd->wd_len = len; |
| 1969 | |
| 1970 | mutex_enter(&wl->wl_mtx); |
| 1971 | TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries); |
| 1972 | |
| 1973 | if (cookiep) |
| 1974 | *cookiep = wd; |
| 1975 | |
| 1976 | out: |
| 1977 | mutex_exit(&wl->wl_mtx); |
| 1978 | |
| 1979 | WAPBL_PRINTF(WAPBL_PRINT_ALLOC, |
| 1980 | ("wapbl_register_deallocation: blk=%" PRId64" len=%d error=%d\n" , |
| 1981 | blk, len, error)); |
| 1982 | |
| 1983 | return error; |
| 1984 | } |
| 1985 | |
| 1986 | static void |
| 1987 | wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd, |
| 1988 | bool locked) |
| 1989 | { |
| 1990 | KASSERT(!locked |
| 1991 | || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx)); |
| 1992 | |
| 1993 | if (!locked) |
| 1994 | mutex_enter(&wl->wl_mtx); |
| 1995 | |
| 1996 | TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries); |
| 1997 | wl->wl_dealloccnt--; |
| 1998 | |
| 1999 | if (!locked) |
| 2000 | mutex_exit(&wl->wl_mtx); |
| 2001 | |
| 2002 | pool_put(&wapbl_dealloc_pool, wd); |
| 2003 | } |
| 2004 | |
/*
 * wapbl_unregister_deallocation(wl, cookie)
 *
 *	Undo a deallocation registration.  cookie must be non-NULL; it is
 *	handed to wapbl_deallocation_free as the entry to remove, with
 *	locked = false so that wl->wl_mtx is taken there.
 */
void
wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
{
	struct wapbl_dealloc *wd = cookie;

	KASSERT(cookie != NULL);
	wapbl_deallocation_free(wl, wd, false);
}
| 2011 | |
| 2012 | /****************************************************************/ |
| 2013 | |
| 2014 | static void |
| 2015 | wapbl_inodetrk_init(struct wapbl *wl, u_int size) |
| 2016 | { |
| 2017 | |
| 2018 | wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); |
| 2019 | if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { |
| 2020 | pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, |
| 2021 | "wapblinopl" , &pool_allocator_nointr, IPL_NONE); |
| 2022 | } |
| 2023 | } |
| 2024 | |
| 2025 | static void |
| 2026 | wapbl_inodetrk_free(struct wapbl *wl) |
| 2027 | { |
| 2028 | |
| 2029 | /* XXX this KASSERT needs locking/mutex analysis */ |
| 2030 | KASSERT(wl->wl_inohashcnt == 0); |
| 2031 | hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); |
| 2032 | if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { |
| 2033 | pool_destroy(&wapbl_ino_pool); |
| 2034 | } |
| 2035 | } |
| 2036 | |
| 2037 | static struct wapbl_ino * |
| 2038 | wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) |
| 2039 | { |
| 2040 | struct wapbl_ino_head *wih; |
| 2041 | struct wapbl_ino *wi; |
| 2042 | |
| 2043 | KASSERT(mutex_owned(&wl->wl_mtx)); |
| 2044 | |
| 2045 | wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; |
| 2046 | LIST_FOREACH(wi, wih, wi_hash) { |
| 2047 | if (ino == wi->wi_ino) |
| 2048 | return wi; |
| 2049 | } |
| 2050 | return 0; |
| 2051 | } |
| 2052 | |
| 2053 | void |
| 2054 | wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) |
| 2055 | { |
| 2056 | struct wapbl_ino_head *wih; |
| 2057 | struct wapbl_ino *wi; |
| 2058 | |
| 2059 | wi = pool_get(&wapbl_ino_pool, PR_WAITOK); |
| 2060 | |
| 2061 | mutex_enter(&wl->wl_mtx); |
| 2062 | if (wapbl_inodetrk_get(wl, ino) == NULL) { |
| 2063 | wi->wi_ino = ino; |
| 2064 | wi->wi_mode = mode; |
| 2065 | wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; |
| 2066 | LIST_INSERT_HEAD(wih, wi, wi_hash); |
| 2067 | wl->wl_inohashcnt++; |
| 2068 | WAPBL_PRINTF(WAPBL_PRINT_INODE, |
| 2069 | ("wapbl_register_inode: ino=%" PRId64"\n" , ino)); |
| 2070 | mutex_exit(&wl->wl_mtx); |
| 2071 | } else { |
| 2072 | mutex_exit(&wl->wl_mtx); |
| 2073 | pool_put(&wapbl_ino_pool, wi); |
| 2074 | } |
| 2075 | } |
| 2076 | |
| 2077 | void |
| 2078 | wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) |
| 2079 | { |
| 2080 | struct wapbl_ino *wi; |
| 2081 | |
| 2082 | mutex_enter(&wl->wl_mtx); |
| 2083 | wi = wapbl_inodetrk_get(wl, ino); |
| 2084 | if (wi) { |
| 2085 | WAPBL_PRINTF(WAPBL_PRINT_INODE, |
| 2086 | ("wapbl_unregister_inode: ino=%" PRId64"\n" , ino)); |
| 2087 | KASSERT(wl->wl_inohashcnt > 0); |
| 2088 | wl->wl_inohashcnt--; |
| 2089 | LIST_REMOVE(wi, wi_hash); |
| 2090 | mutex_exit(&wl->wl_mtx); |
| 2091 | |
| 2092 | pool_put(&wapbl_ino_pool, wi); |
| 2093 | } else { |
| 2094 | mutex_exit(&wl->wl_mtx); |
| 2095 | } |
| 2096 | } |
| 2097 | |
| 2098 | /****************************************************************/ |
| 2099 | |
| 2100 | /* |
| 2101 | * wapbl_transaction_inodes_len(wl) |
| 2102 | * |
| 2103 | * Calculate the number of bytes required for inode registration |
| 2104 | * log records in wl. |
| 2105 | */ |
| 2106 | static inline size_t |
| 2107 | wapbl_transaction_inodes_len(struct wapbl *wl) |
| 2108 | { |
| 2109 | int blocklen = 1<<wl->wl_log_dev_bshift; |
| 2110 | int iph; |
| 2111 | |
| 2112 | /* Calculate number of inodes described in a inodelist header */ |
| 2113 | iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / |
| 2114 | sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); |
| 2115 | |
| 2116 | KASSERT(iph > 0); |
| 2117 | |
| 2118 | return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen; |
| 2119 | } |
| 2120 | |
| 2121 | |
| 2122 | /* |
| 2123 | * wapbl_transaction_len(wl) |
| 2124 | * |
| 2125 | * Calculate number of bytes required for all log records in wl. |
| 2126 | */ |
| 2127 | static size_t |
| 2128 | wapbl_transaction_len(struct wapbl *wl) |
| 2129 | { |
| 2130 | int blocklen = 1<<wl->wl_log_dev_bshift; |
| 2131 | size_t len; |
| 2132 | |
| 2133 | /* Calculate number of blocks described in a blocklist header */ |
| 2134 | len = wl->wl_bcount; |
| 2135 | len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen; |
| 2136 | len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen; |
| 2137 | len += wapbl_transaction_inodes_len(wl); |
| 2138 | |
| 2139 | return len; |
| 2140 | } |
| 2141 | |
| 2142 | /* |
| 2143 | * wapbl_cache_sync(wl, msg) |
| 2144 | * |
| 2145 | * Issue DIOCCACHESYNC to wl->wl_devvp. |
| 2146 | * |
| 2147 | * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message |
| 2148 | * including msg about the duration of the cache sync. |
| 2149 | */ |
| 2150 | static int |
| 2151 | wapbl_cache_sync(struct wapbl *wl, const char *msg) |
| 2152 | { |
| 2153 | const bool verbose = wapbl_verbose_commit >= 2; |
| 2154 | struct bintime start_time; |
| 2155 | int force = 1; |
| 2156 | int error; |
| 2157 | |
| 2158 | if (!wapbl_flush_disk_cache) { |
| 2159 | return 0; |
| 2160 | } |
| 2161 | if (verbose) { |
| 2162 | bintime(&start_time); |
| 2163 | } |
| 2164 | error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, |
| 2165 | FWRITE, FSCRED); |
| 2166 | if (error) { |
| 2167 | WAPBL_PRINTF(WAPBL_PRINT_ERROR, |
| 2168 | ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx " |
| 2169 | "returned %d\n" , (uintmax_t)wl->wl_devvp->v_rdev, error)); |
| 2170 | } |
| 2171 | if (verbose) { |
| 2172 | struct bintime d; |
| 2173 | struct timespec ts; |
| 2174 | |
| 2175 | bintime(&d); |
| 2176 | bintime_sub(&d, &start_time); |
| 2177 | bintime2timespec(&d, &ts); |
| 2178 | printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n" , |
| 2179 | msg, (uintmax_t)wl->wl_devvp->v_rdev, |
| 2180 | (uintmax_t)ts.tv_sec, ts.tv_nsec); |
| 2181 | } |
| 2182 | return error; |
| 2183 | } |
| 2184 | |
| 2185 | /* |
| 2186 | * wapbl_write_commit(wl, head, tail) |
| 2187 | * |
| 2188 | * Issue a disk cache sync to wait for all pending writes to the |
| 2189 | * log to complete, and then synchronously commit the current |
| 2190 | * circular queue head and tail to the log, in the next of two |
| 2191 | * locations for commit headers on disk. |
| 2192 | * |
| 2193 | * Increment the generation number. If the generation number |
| 2194 | * rolls over to zero, then a subsequent commit would appear to |
| 2195 | * have an older generation than this one -- in that case, issue a |
| 2196 | * duplicate commit to avoid this. |
| 2197 | * |
| 2198 | * => Caller must have exclusive access to wl, either by holding |
| 2199 | * wl->wl_rwlock for writer or by being wapbl_start before anyone |
| 2200 | * else has seen wl. |
| 2201 | */ |
| 2202 | static int |
| 2203 | wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) |
| 2204 | { |
| 2205 | struct wapbl_wc_header *wc = wl->wl_wc_header; |
| 2206 | struct timespec ts; |
| 2207 | int error; |
| 2208 | daddr_t pbn; |
| 2209 | |
| 2210 | error = wapbl_buffered_flush(wl); |
| 2211 | if (error) |
| 2212 | return error; |
| 2213 | /* |
| 2214 | * flush disk cache to ensure that blocks we've written are actually |
| 2215 | * written to the stable storage before the commit header. |
| 2216 | * |
| 2217 | * XXX Calc checksum here, instead we do this for now |
| 2218 | */ |
| 2219 | wapbl_cache_sync(wl, "1" ); |
| 2220 | |
| 2221 | wc->wc_head = head; |
| 2222 | wc->wc_tail = tail; |
| 2223 | wc->wc_checksum = 0; |
| 2224 | wc->wc_version = 1; |
| 2225 | getnanotime(&ts); |
| 2226 | wc->wc_time = ts.tv_sec; |
| 2227 | wc->wc_timensec = ts.tv_nsec; |
| 2228 | |
| 2229 | WAPBL_PRINTF(WAPBL_PRINT_WRITE, |
| 2230 | ("wapbl_write_commit: head = %" PRIdMAX "tail = %" PRIdMAX"\n" , |
| 2231 | (intmax_t)head, (intmax_t)tail)); |
| 2232 | |
| 2233 | /* |
| 2234 | * write the commit header. |
| 2235 | * |
| 2236 | * XXX if generation will rollover, then first zero |
| 2237 | * over second commit header before trying to write both headers. |
| 2238 | */ |
| 2239 | |
| 2240 | pbn = wl->wl_logpbn + (wc->wc_generation % 2); |
| 2241 | #ifdef _KERNEL |
| 2242 | pbn = btodb(pbn << wc->wc_log_dev_bshift); |
| 2243 | #endif |
| 2244 | error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn); |
| 2245 | if (error) |
| 2246 | return error; |
| 2247 | error = wapbl_buffered_flush(wl); |
| 2248 | if (error) |
| 2249 | return error; |
| 2250 | |
| 2251 | /* |
| 2252 | * flush disk cache to ensure that the commit header is actually |
| 2253 | * written before meta data blocks. |
| 2254 | */ |
| 2255 | wapbl_cache_sync(wl, "2" ); |
| 2256 | |
| 2257 | /* |
| 2258 | * If the generation number was zero, write it out a second time. |
| 2259 | * This handles initialization and generation number rollover |
| 2260 | */ |
| 2261 | if (wc->wc_generation++ == 0) { |
| 2262 | error = wapbl_write_commit(wl, head, tail); |
| 2263 | /* |
| 2264 | * This panic should be able to be removed if we do the |
| 2265 | * zero'ing mentioned above, and we are certain to roll |
| 2266 | * back generation number on failure. |
| 2267 | */ |
| 2268 | if (error) |
| 2269 | panic("wapbl_write_commit: error writing duplicate " |
| 2270 | "log header: %d" , error); |
| 2271 | } |
| 2272 | return 0; |
| 2273 | } |
| 2274 | |
| 2275 | /* |
| 2276 | * wapbl_write_blocks(wl, offp) |
| 2277 | * |
| 2278 | * Write all pending physical blocks in the current transaction |
| 2279 | * from wapbl_add_buf to the log on disk, adding to the circular |
| 2280 | * queue head at byte offset *offp, and returning the new head's |
| 2281 | * byte offset in *offp. |
| 2282 | */ |
| 2283 | static int |
| 2284 | wapbl_write_blocks(struct wapbl *wl, off_t *offp) |
| 2285 | { |
| 2286 | struct wapbl_wc_blocklist *wc = |
| 2287 | (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; |
| 2288 | int blocklen = 1<<wl->wl_log_dev_bshift; |
| 2289 | struct buf *bp; |
| 2290 | off_t off = *offp; |
| 2291 | int error; |
| 2292 | size_t padding; |
| 2293 | |
| 2294 | KASSERT(rw_write_held(&wl->wl_rwlock)); |
| 2295 | |
| 2296 | bp = LIST_FIRST(&wl->wl_bufs); |
| 2297 | |
| 2298 | while (bp) { |
| 2299 | int cnt; |
| 2300 | struct buf *obp = bp; |
| 2301 | |
| 2302 | KASSERT(bp->b_flags & B_LOCKED); |
| 2303 | |
| 2304 | wc->wc_type = WAPBL_WC_BLOCKS; |
| 2305 | wc->wc_len = blocklen; |
| 2306 | wc->wc_blkcount = 0; |
| 2307 | while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) { |
| 2308 | /* |
| 2309 | * Make sure all the physical block numbers are up to |
| 2310 | * date. If this is not always true on a given |
| 2311 | * filesystem, then VOP_BMAP must be called. We |
| 2312 | * could call VOP_BMAP here, or else in the filesystem |
| 2313 | * specific flush callback, although neither of those |
| 2314 | * solutions allow us to take the vnode lock. If a |
| 2315 | * filesystem requires that we must take the vnode lock |
| 2316 | * to call VOP_BMAP, then we can probably do it in |
| 2317 | * bwrite when the vnode lock should already be held |
| 2318 | * by the invoking code. |
| 2319 | */ |
| 2320 | KASSERT((bp->b_vp->v_type == VBLK) || |
| 2321 | (bp->b_blkno != bp->b_lblkno)); |
| 2322 | KASSERT(bp->b_blkno > 0); |
| 2323 | |
| 2324 | wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; |
| 2325 | wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; |
| 2326 | wc->wc_len += bp->b_bcount; |
| 2327 | wc->wc_blkcount++; |
| 2328 | bp = LIST_NEXT(bp, b_wapbllist); |
| 2329 | } |
| 2330 | if (wc->wc_len % blocklen != 0) { |
| 2331 | padding = blocklen - wc->wc_len % blocklen; |
| 2332 | wc->wc_len += padding; |
| 2333 | } else { |
| 2334 | padding = 0; |
| 2335 | } |
| 2336 | |
| 2337 | WAPBL_PRINTF(WAPBL_PRINT_WRITE, |
| 2338 | ("wapbl_write_blocks: len = %u (padding %zu) off = %" PRIdMAX"\n" , |
| 2339 | wc->wc_len, padding, (intmax_t)off)); |
| 2340 | |
| 2341 | error = wapbl_circ_write(wl, wc, blocklen, &off); |
| 2342 | if (error) |
| 2343 | return error; |
| 2344 | bp = obp; |
| 2345 | cnt = 0; |
| 2346 | while (bp && (cnt++ < wl->wl_brperjblock)) { |
| 2347 | error = wapbl_circ_write(wl, bp->b_data, |
| 2348 | bp->b_bcount, &off); |
| 2349 | if (error) |
| 2350 | return error; |
| 2351 | bp = LIST_NEXT(bp, b_wapbllist); |
| 2352 | } |
| 2353 | if (padding) { |
| 2354 | void *zero; |
| 2355 | |
| 2356 | zero = wapbl_alloc(padding); |
| 2357 | memset(zero, 0, padding); |
| 2358 | error = wapbl_circ_write(wl, zero, padding, &off); |
| 2359 | wapbl_free(zero, padding); |
| 2360 | if (error) |
| 2361 | return error; |
| 2362 | } |
| 2363 | } |
| 2364 | *offp = off; |
| 2365 | return 0; |
| 2366 | } |
| 2367 | |
| 2368 | /* |
| 2369 | * wapbl_write_revocations(wl, offp) |
| 2370 | * |
| 2371 | * Write all pending deallocations in the current transaction from |
| 2372 | * wapbl_register_deallocation to the log on disk, adding to the |
| 2373 | * circular queue's head at byte offset *offp, and returning the |
| 2374 | * new head's byte offset in *offp. |
| 2375 | */ |
| 2376 | static int |
| 2377 | wapbl_write_revocations(struct wapbl *wl, off_t *offp) |
| 2378 | { |
| 2379 | struct wapbl_wc_blocklist *wc = |
| 2380 | (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; |
| 2381 | struct wapbl_dealloc *wd, *lwd; |
| 2382 | int blocklen = 1<<wl->wl_log_dev_bshift; |
| 2383 | off_t off = *offp; |
| 2384 | int error; |
| 2385 | |
| 2386 | if (wl->wl_dealloccnt == 0) |
| 2387 | return 0; |
| 2388 | |
| 2389 | while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) { |
| 2390 | wc->wc_type = WAPBL_WC_REVOCATIONS; |
| 2391 | wc->wc_len = blocklen; |
| 2392 | wc->wc_blkcount = 0; |
| 2393 | while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) { |
| 2394 | wc->wc_blocks[wc->wc_blkcount].wc_daddr = |
| 2395 | wd->wd_blkno; |
| 2396 | wc->wc_blocks[wc->wc_blkcount].wc_dlen = |
| 2397 | wd->wd_len; |
| 2398 | wc->wc_blkcount++; |
| 2399 | |
| 2400 | wd = TAILQ_NEXT(wd, wd_entries); |
| 2401 | } |
| 2402 | WAPBL_PRINTF(WAPBL_PRINT_WRITE, |
| 2403 | ("wapbl_write_revocations: len = %u off = %" PRIdMAX"\n" , |
| 2404 | wc->wc_len, (intmax_t)off)); |
| 2405 | error = wapbl_circ_write(wl, wc, blocklen, &off); |
| 2406 | if (error) |
| 2407 | return error; |
| 2408 | |
| 2409 | /* free all successfully written deallocs */ |
| 2410 | lwd = wd; |
| 2411 | while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) { |
| 2412 | if (wd == lwd) |
| 2413 | break; |
| 2414 | wapbl_deallocation_free(wl, wd, true); |
| 2415 | } |
| 2416 | } |
| 2417 | *offp = off; |
| 2418 | return 0; |
| 2419 | } |
| 2420 | |
| 2421 | /* |
| 2422 | * wapbl_write_inodes(wl, offp) |
| 2423 | * |
| 2424 | * Write all pending inode allocations in the current transaction |
| 2425 | * from wapbl_register_inode to the log on disk, adding to the |
| 2426 | * circular queue's head at byte offset *offp and returning the |
| 2427 | * new head's byte offset in *offp. |
| 2428 | */ |
| 2429 | static int |
| 2430 | wapbl_write_inodes(struct wapbl *wl, off_t *offp) |
| 2431 | { |
| 2432 | struct wapbl_wc_inodelist *wc = |
| 2433 | (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; |
| 2434 | int i; |
| 2435 | int blocklen = 1 << wl->wl_log_dev_bshift; |
| 2436 | off_t off = *offp; |
| 2437 | int error; |
| 2438 | |
| 2439 | struct wapbl_ino_head *wih; |
| 2440 | struct wapbl_ino *wi; |
| 2441 | int iph; |
| 2442 | |
| 2443 | iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / |
| 2444 | sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); |
| 2445 | |
| 2446 | i = 0; |
| 2447 | wih = &wl->wl_inohash[0]; |
| 2448 | wi = 0; |
| 2449 | do { |
| 2450 | wc->wc_type = WAPBL_WC_INODES; |
| 2451 | wc->wc_len = blocklen; |
| 2452 | wc->wc_inocnt = 0; |
| 2453 | wc->wc_clear = (i == 0); |
| 2454 | while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { |
| 2455 | while (!wi) { |
| 2456 | KASSERT((wih - &wl->wl_inohash[0]) |
| 2457 | <= wl->wl_inohashmask); |
| 2458 | wi = LIST_FIRST(wih++); |
| 2459 | } |
| 2460 | wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; |
| 2461 | wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; |
| 2462 | wc->wc_inocnt++; |
| 2463 | i++; |
| 2464 | wi = LIST_NEXT(wi, wi_hash); |
| 2465 | } |
| 2466 | WAPBL_PRINTF(WAPBL_PRINT_WRITE, |
| 2467 | ("wapbl_write_inodes: len = %u off = %" PRIdMAX"\n" , |
| 2468 | wc->wc_len, (intmax_t)off)); |
| 2469 | error = wapbl_circ_write(wl, wc, blocklen, &off); |
| 2470 | if (error) |
| 2471 | return error; |
| 2472 | } while (i < wl->wl_inohashcnt); |
| 2473 | |
| 2474 | *offp = off; |
| 2475 | return 0; |
| 2476 | } |
| 2477 | |
| 2478 | #endif /* _KERNEL */ |
| 2479 | |
| 2480 | /****************************************************************/ |
| 2481 | |
/*
 * One entry in the replay block hash (wr->wr_blkhash): associates a
 * block number with a log offset.  Entries are looked up by wb_blk in
 * wapbl_blkhash_get and created/updated in wapbl_blkhash_ins.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* Linkage within a hash bucket */
	daddr_t wb_blk;			/* Block number; the lookup key */
	off_t wb_off; /* Offset of this block in the log */
};
/* Smallest table size wapbl_blkhash_init will request. */
#define WAPBL_BLKPOOL_MIN 83
| 2488 | |
| 2489 | static void |
| 2490 | wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) |
| 2491 | { |
| 2492 | if (size < WAPBL_BLKPOOL_MIN) |
| 2493 | size = WAPBL_BLKPOOL_MIN; |
| 2494 | KASSERT(wr->wr_blkhash == 0); |
| 2495 | #ifdef _KERNEL |
| 2496 | wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); |
| 2497 | #else /* ! _KERNEL */ |
| 2498 | /* Manually implement hashinit */ |
| 2499 | { |
| 2500 | unsigned long i, hashsize; |
| 2501 | for (hashsize = 1; hashsize < size; hashsize <<= 1) |
| 2502 | continue; |
| 2503 | wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash)); |
| 2504 | for (i = 0; i < hashsize; i++) |
| 2505 | LIST_INIT(&wr->wr_blkhash[i]); |
| 2506 | wr->wr_blkhashmask = hashsize - 1; |
| 2507 | } |
| 2508 | #endif /* ! _KERNEL */ |
| 2509 | } |
| 2510 | |
| 2511 | static void |
| 2512 | wapbl_blkhash_free(struct wapbl_replay *wr) |
| 2513 | { |
| 2514 | KASSERT(wr->wr_blkhashcnt == 0); |
| 2515 | #ifdef _KERNEL |
| 2516 | hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); |
| 2517 | #else /* ! _KERNEL */ |
| 2518 | wapbl_free(wr->wr_blkhash, |
| 2519 | (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash)); |
| 2520 | #endif /* ! _KERNEL */ |
| 2521 | } |
| 2522 | |
| 2523 | static struct wapbl_blk * |
| 2524 | wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) |
| 2525 | { |
| 2526 | struct wapbl_blk_head *wbh; |
| 2527 | struct wapbl_blk *wb; |
| 2528 | wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; |
| 2529 | LIST_FOREACH(wb, wbh, wb_hash) { |
| 2530 | if (blk == wb->wb_blk) |
| 2531 | return wb; |
| 2532 | } |
| 2533 | return 0; |
| 2534 | } |
| 2535 | |
| 2536 | static void |
| 2537 | wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) |
| 2538 | { |
| 2539 | struct wapbl_blk_head *wbh; |
| 2540 | struct wapbl_blk *wb; |
| 2541 | wb = wapbl_blkhash_get(wr, blk); |
| 2542 | if (wb) { |
| 2543 | KASSERT(wb->wb_blk == blk); |
| 2544 | wb->wb_off = off; |
| 2545 | } else { |
| 2546 | wb = wapbl_alloc(sizeof(*wb)); |
| 2547 | wb->wb_blk = blk; |
| 2548 | wb->wb_off = off; |
| 2549 | wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; |
| 2550 | LIST_INSERT_HEAD(wbh, wb, wb_hash); |
| 2551 | wr->wr_blkhashcnt++; |
| 2552 | } |
| 2553 | } |
| 2554 | |
| 2555 | static void |
| 2556 | wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) |
| 2557 | { |
| 2558 | struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); |
| 2559 | if (wb) { |
| 2560 | KASSERT(wr->wr_blkhashcnt > 0); |
| 2561 | wr->wr_blkhashcnt--; |
| 2562 | LIST_REMOVE(wb, wb_hash); |
| 2563 | wapbl_free(wb, sizeof(*wb)); |
| 2564 | } |
| 2565 | } |
| 2566 | |
| 2567 | static void |
| 2568 | wapbl_blkhash_clear(struct wapbl_replay *wr) |
| 2569 | { |
| 2570 | unsigned long i; |
| 2571 | for (i = 0; i <= wr->wr_blkhashmask; i++) { |
| 2572 | struct wapbl_blk *wb; |
| 2573 | |
| 2574 | while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { |
| 2575 | KASSERT(wr->wr_blkhashcnt > 0); |
| 2576 | wr->wr_blkhashcnt--; |
| 2577 | LIST_REMOVE(wb, wb_hash); |
| 2578 | wapbl_free(wb, sizeof(*wb)); |
| 2579 | } |
| 2580 | } |
| 2581 | KASSERT(wr->wr_blkhashcnt == 0); |
| 2582 | } |
| 2583 | |
| 2584 | /****************************************************************/ |
| 2585 | |
| 2586 | /* |
| 2587 | * wapbl_circ_read(wr, data, len, offp) |
| 2588 | * |
| 2589 | * Read len bytes into data from the circular queue of wr, |
| 2590 | * starting at the linear byte offset *offp, and returning the new |
| 2591 | * linear byte offset in *offp. |
| 2592 | * |
| 2593 | * If the starting linear byte offset precedes wr->wr_circ_off, |
| 2594 | * the read instead begins at wr->wr_circ_off. XXX WTF? This |
| 2595 | * should be a KASSERT, not a conditional. |
| 2596 | */ |
| 2597 | static int |
| 2598 | wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) |
| 2599 | { |
| 2600 | size_t slen; |
| 2601 | off_t off = *offp; |
| 2602 | int error; |
| 2603 | daddr_t pbn; |
| 2604 | |
| 2605 | KASSERT(((len >> wr->wr_log_dev_bshift) << |
| 2606 | wr->wr_log_dev_bshift) == len); |
| 2607 | |
| 2608 | if (off < wr->wr_circ_off) |
| 2609 | off = wr->wr_circ_off; |
| 2610 | slen = wr->wr_circ_off + wr->wr_circ_size - off; |
| 2611 | if (slen < len) { |
| 2612 | pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); |
| 2613 | #ifdef _KERNEL |
| 2614 | pbn = btodb(pbn << wr->wr_log_dev_bshift); |
| 2615 | #endif |
| 2616 | error = wapbl_read(data, slen, wr->wr_devvp, pbn); |
| 2617 | if (error) |
| 2618 | return error; |
| 2619 | data = (uint8_t *)data + slen; |
| 2620 | len -= slen; |
| 2621 | off = wr->wr_circ_off; |
| 2622 | } |
| 2623 | pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); |
| 2624 | #ifdef _KERNEL |
| 2625 | pbn = btodb(pbn << wr->wr_log_dev_bshift); |
| 2626 | #endif |
| 2627 | error = wapbl_read(data, len, wr->wr_devvp, pbn); |
| 2628 | if (error) |
| 2629 | return error; |
| 2630 | off += len; |
| 2631 | if (off >= wr->wr_circ_off + wr->wr_circ_size) |
| 2632 | off = wr->wr_circ_off; |
| 2633 | *offp = off; |
| 2634 | return 0; |
| 2635 | } |
| 2636 | |
| 2637 | /* |
| 2638 | * wapbl_circ_advance(wr, len, offp) |
| 2639 | * |
| 2640 | * Compute the linear byte offset of the circular queue of wr that |
| 2641 | * is len bytes past *offp, and store it in *offp. |
| 2642 | * |
| 2643 | * This is as if wapbl_circ_read, but without actually reading |
| 2644 | * anything. |
| 2645 | * |
| 2646 | * If the starting linear byte offset precedes wr->wr_circ_off, it |
| 2647 | * is taken to be wr->wr_circ_off instead. XXX WTF? This should |
| 2648 | * be a KASSERT, not a conditional. |
| 2649 | */ |
| 2650 | static void |
| 2651 | wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) |
| 2652 | { |
| 2653 | size_t slen; |
| 2654 | off_t off = *offp; |
| 2655 | |
| 2656 | KASSERT(((len >> wr->wr_log_dev_bshift) << |
| 2657 | wr->wr_log_dev_bshift) == len); |
| 2658 | |
| 2659 | if (off < wr->wr_circ_off) |
| 2660 | off = wr->wr_circ_off; |
| 2661 | slen = wr->wr_circ_off + wr->wr_circ_size - off; |
| 2662 | if (slen < len) { |
| 2663 | len -= slen; |
| 2664 | off = wr->wr_circ_off; |
| 2665 | } |
| 2666 | off += len; |
| 2667 | if (off >= wr->wr_circ_off + wr->wr_circ_size) |
| 2668 | off = wr->wr_circ_off; |
| 2669 | *offp = off; |
| 2670 | } |
| 2671 | |
| 2672 | /****************************************************************/ |
| 2673 | |
| 2674 | int |
| 2675 | wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, |
| 2676 | daddr_t off, size_t count, size_t blksize) |
| 2677 | { |
| 2678 | struct wapbl_replay *wr; |
| 2679 | int error; |
| 2680 | struct vnode *devvp; |
| 2681 | daddr_t logpbn; |
| 2682 | uint8_t *scratch; |
| 2683 | struct wapbl_wc_header *wch; |
| 2684 | struct wapbl_wc_header *wch2; |
| 2685 | /* Use this until we read the actual log header */ |
| 2686 | int log_dev_bshift = ilog2(blksize); |
| 2687 | size_t used; |
| 2688 | daddr_t pbn; |
| 2689 | |
| 2690 | WAPBL_PRINTF(WAPBL_PRINT_REPLAY, |
| 2691 | ("wapbl_replay_start: vp=%p off=%" PRId64 " count=%zu blksize=%zu\n" , |
| 2692 | vp, off, count, blksize)); |
| 2693 | |
| 2694 | if (off < 0) |
| 2695 | return EINVAL; |
| 2696 | |
| 2697 | if (blksize < DEV_BSIZE) |
| 2698 | return EINVAL; |
| 2699 | if (blksize % DEV_BSIZE) |
| 2700 | return EINVAL; |
| 2701 | |
| 2702 | #ifdef _KERNEL |
| 2703 | #if 0 |
| 2704 | /* XXX vp->v_size isn't reliably set for VBLK devices, |
| 2705 | * especially root. However, we might still want to verify |
| 2706 | * that the full load is readable */ |
| 2707 | if ((off + count) * blksize > vp->v_size) |
| 2708 | return EINVAL; |
| 2709 | #endif |
| 2710 | if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { |
| 2711 | return error; |
| 2712 | } |
| 2713 | #else /* ! _KERNEL */ |
| 2714 | devvp = vp; |
| 2715 | logpbn = off; |
| 2716 | #endif /* ! _KERNEL */ |
| 2717 | |
| 2718 | scratch = wapbl_alloc(MAXBSIZE); |
| 2719 | |
| 2720 | pbn = logpbn; |
| 2721 | #ifdef _KERNEL |
| 2722 | pbn = btodb(pbn << log_dev_bshift); |
| 2723 | #endif |
| 2724 | error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn); |
| 2725 | if (error) |
| 2726 | goto errout; |
| 2727 | |
| 2728 | wch = (struct wapbl_wc_header *)scratch; |
| 2729 | wch2 = |
| 2730 | (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift)); |
| 2731 | /* XXX verify checksums and magic numbers */ |
| 2732 | if (wch->wc_type != WAPBL_WC_HEADER) { |
| 2733 | printf("Unrecognized wapbl magic: 0x%08x\n" , wch->wc_type); |
| 2734 | error = EFTYPE; |
| 2735 | goto errout; |
| 2736 | } |
| 2737 | |
| 2738 | if (wch2->wc_generation > wch->wc_generation) |
| 2739 | wch = wch2; |
| 2740 | |
| 2741 | wr = wapbl_calloc(1, sizeof(*wr)); |
| 2742 | |
| 2743 | wr->wr_logvp = vp; |
| 2744 | wr->wr_devvp = devvp; |
| 2745 | wr->wr_logpbn = logpbn; |
| 2746 | |
| 2747 | wr->wr_scratch = scratch; |
| 2748 | |
| 2749 | wr->wr_log_dev_bshift = wch->wc_log_dev_bshift; |
| 2750 | wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift; |
| 2751 | wr->wr_circ_off = wch->wc_circ_off; |
| 2752 | wr->wr_circ_size = wch->wc_circ_size; |
| 2753 | wr->wr_generation = wch->wc_generation; |
| 2754 | |
| 2755 | used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); |
| 2756 | |
| 2757 | WAPBL_PRINTF(WAPBL_PRINT_REPLAY, |
| 2758 | ("wapbl_replay: head=%" PRId64" tail=%" PRId64" off=%" PRId64 |
| 2759 | " len=%" PRId64" used=%zu\n" , |
| 2760 | wch->wc_head, wch->wc_tail, wch->wc_circ_off, |
| 2761 | wch->wc_circ_size, used)); |
| 2762 | |
| 2763 | wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); |
| 2764 | |
| 2765 | error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail); |
| 2766 | if (error) { |
| 2767 | wapbl_replay_stop(wr); |
| 2768 | wapbl_replay_free(wr); |
| 2769 | return error; |
| 2770 | } |
| 2771 | |
| 2772 | *wrp = wr; |
| 2773 | return 0; |
| 2774 | |
| 2775 | errout: |
| 2776 | wapbl_free(scratch, MAXBSIZE); |
| 2777 | return error; |
| 2778 | } |
| 2779 | |
| 2780 | void |
| 2781 | wapbl_replay_stop(struct wapbl_replay *wr) |
| 2782 | { |
| 2783 | |
| 2784 | if (!wapbl_replay_isopen(wr)) |
| 2785 | return; |
| 2786 | |
| 2787 | WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n" )); |
| 2788 | |
| 2789 | wapbl_free(wr->wr_scratch, MAXBSIZE); |
| 2790 | wr->wr_scratch = NULL; |
| 2791 | |
| 2792 | wr->wr_logvp = NULL; |
| 2793 | |
| 2794 | wapbl_blkhash_clear(wr); |
| 2795 | wapbl_blkhash_free(wr); |
| 2796 | } |
| 2797 | |
| 2798 | void |
| 2799 | wapbl_replay_free(struct wapbl_replay *wr) |
| 2800 | { |
| 2801 | |
| 2802 | KDASSERT(!wapbl_replay_isopen(wr)); |
| 2803 | |
| 2804 | if (wr->wr_inodes) |
| 2805 | wapbl_free(wr->wr_inodes, |
| 2806 | wr->wr_inodescnt * sizeof(wr->wr_inodes[0])); |
| 2807 | wapbl_free(wr, sizeof(*wr)); |
| 2808 | } |
| 2809 | |
| 2810 | #ifdef _KERNEL |
/*
 * wapbl_replay_isopen1(wr)
 *
 *	Function wrapper that returns the result of
 *	wapbl_replay_isopen(wr).
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{
	const int isopen = wapbl_replay_isopen(wr);

	return isopen;
}
| 2817 | #endif |
| 2818 | |
| 2819 | /* |
| 2820 | * calculate the disk address for the i'th block in the wc_blockblist |
| 2821 | * offset by j blocks of size blen. |
| 2822 | * |
| 2823 | * wc_daddr is always a kernel disk address in DEV_BSIZE units that |
| 2824 | * was written to the journal. |
| 2825 | * |
| 2826 | * The kernel needs that address plus the offset in DEV_BSIZE units. |
| 2827 | * |
| 2828 | * Userland needs that address plus the offset in blen units. |
| 2829 | * |
| 2830 | */ |
| 2831 | static daddr_t |
| 2832 | wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen) |
| 2833 | { |
| 2834 | daddr_t pbn; |
| 2835 | |
| 2836 | #ifdef _KERNEL |
| 2837 | pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen); |
| 2838 | #else |
| 2839 | pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j; |
| 2840 | #endif |
| 2841 | |
| 2842 | return pbn; |
| 2843 | } |
| 2844 | |
| 2845 | static void |
| 2846 | wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) |
| 2847 | { |
| 2848 | struct wapbl_wc_blocklist *wc = |
| 2849 | (struct wapbl_wc_blocklist *)wr->wr_scratch; |
| 2850 | int fsblklen = 1 << wr->wr_fs_dev_bshift; |
| 2851 | int i, j, n; |
| 2852 | |
| 2853 | for (i = 0; i < wc->wc_blkcount; i++) { |
| 2854 | /* |
| 2855 | * Enter each physical block into the hashtable independently. |
| 2856 | */ |
| 2857 | n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; |
| 2858 | for (j = 0; j < n; j++) { |
| 2859 | wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen), |
| 2860 | *offp); |
| 2861 | wapbl_circ_advance(wr, fsblklen, offp); |
| 2862 | } |
| 2863 | } |
| 2864 | } |
| 2865 | |
| 2866 | static void |
| 2867 | wapbl_replay_process_revocations(struct wapbl_replay *wr) |
| 2868 | { |
| 2869 | struct wapbl_wc_blocklist *wc = |
| 2870 | (struct wapbl_wc_blocklist *)wr->wr_scratch; |
| 2871 | int fsblklen = 1 << wr->wr_fs_dev_bshift; |
| 2872 | int i, j, n; |
| 2873 | |
| 2874 | for (i = 0; i < wc->wc_blkcount; i++) { |
| 2875 | /* |
| 2876 | * Remove any blocks found from the hashtable. |
| 2877 | */ |
| 2878 | n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; |
| 2879 | for (j = 0; j < n; j++) |
| 2880 | wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen)); |
| 2881 | } |
| 2882 | } |
| 2883 | |
| 2884 | static void |
| 2885 | wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) |
| 2886 | { |
| 2887 | struct wapbl_wc_inodelist *wc = |
| 2888 | (struct wapbl_wc_inodelist *)wr->wr_scratch; |
| 2889 | void *new_inodes; |
| 2890 | const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]); |
| 2891 | |
| 2892 | KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0])); |
| 2893 | |
| 2894 | /* |
| 2895 | * Keep track of where we found this so location won't be |
| 2896 | * overwritten. |
| 2897 | */ |
| 2898 | if (wc->wc_clear) { |
| 2899 | wr->wr_inodestail = oldoff; |
| 2900 | wr->wr_inodescnt = 0; |
| 2901 | if (wr->wr_inodes != NULL) { |
| 2902 | wapbl_free(wr->wr_inodes, oldsize); |
| 2903 | wr->wr_inodes = NULL; |
| 2904 | } |
| 2905 | } |
| 2906 | wr->wr_inodeshead = newoff; |
| 2907 | if (wc->wc_inocnt == 0) |
| 2908 | return; |
| 2909 | |
| 2910 | new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) * |
| 2911 | sizeof(wr->wr_inodes[0])); |
| 2912 | if (wr->wr_inodes != NULL) { |
| 2913 | memcpy(new_inodes, wr->wr_inodes, oldsize); |
| 2914 | wapbl_free(wr->wr_inodes, oldsize); |
| 2915 | } |
| 2916 | wr->wr_inodes = new_inodes; |
| 2917 | memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, |
| 2918 | wc->wc_inocnt * sizeof(wr->wr_inodes[0])); |
| 2919 | wr->wr_inodescnt += wc->wc_inocnt; |
| 2920 | } |
| 2921 | |
| 2922 | static int |
| 2923 | wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) |
| 2924 | { |
| 2925 | off_t off; |
| 2926 | int error; |
| 2927 | |
| 2928 | int logblklen = 1 << wr->wr_log_dev_bshift; |
| 2929 | |
| 2930 | wapbl_blkhash_clear(wr); |
| 2931 | |
| 2932 | off = tail; |
| 2933 | while (off != head) { |
| 2934 | struct wapbl_wc_null *wcn; |
| 2935 | off_t saveoff = off; |
| 2936 | error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); |
| 2937 | if (error) |
| 2938 | goto errout; |
| 2939 | wcn = (struct wapbl_wc_null *)wr->wr_scratch; |
| 2940 | switch (wcn->wc_type) { |
| 2941 | case WAPBL_WC_BLOCKS: |
| 2942 | wapbl_replay_process_blocks(wr, &off); |
| 2943 | break; |
| 2944 | |
| 2945 | case WAPBL_WC_REVOCATIONS: |
| 2946 | wapbl_replay_process_revocations(wr); |
| 2947 | break; |
| 2948 | |
| 2949 | case WAPBL_WC_INODES: |
| 2950 | wapbl_replay_process_inodes(wr, saveoff, off); |
| 2951 | break; |
| 2952 | |
| 2953 | default: |
| 2954 | printf("Unrecognized wapbl type: 0x%08x\n" , |
| 2955 | wcn->wc_type); |
| 2956 | error = EFTYPE; |
| 2957 | goto errout; |
| 2958 | } |
| 2959 | wapbl_circ_advance(wr, wcn->wc_len, &saveoff); |
| 2960 | if (off != saveoff) { |
| 2961 | printf("wapbl_replay: corrupted records\n" ); |
| 2962 | error = EFTYPE; |
| 2963 | goto errout; |
| 2964 | } |
| 2965 | } |
| 2966 | return 0; |
| 2967 | |
| 2968 | errout: |
| 2969 | wapbl_blkhash_clear(wr); |
| 2970 | return error; |
| 2971 | } |
| 2972 | |
#if 0
/*
 * wapbl_replay_verify(wr, fsdevvp)
 *
 *	DISABLED CODE (compiled out by the enclosing #if 0).
 *
 *	Walks the log records and, for every block whose hash entry
 *	points at the current log offset, compares the copy stored in
 *	the log with the copy read from fsdevvp.  Each mismatch is
 *	printed and counted.
 *
 *	Returns the first read error encountered, EFTYPE if no read
 *	failed but at least one block differed, and 0 otherwise.
 *
 *	NOTE(review): `wch' is used below but is not declared in this
 *	function; unless a file-scope `wch' exists this will not compile
 *	if re-enabled.  The head/tail offsets and wc_fs_dev_bshift would
 *	have to come from somewhere else -- confirm before reviving.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	/* scratch1 holds the log copy, scratch2 the on-disk copy. */
	void *scratch1 = wapbl_alloc(MAXBSIZE);
	void *scratch2 = wapbl_alloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		/* Start of this record, for the length check below. */
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block against
					 * the hashtable independently.
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						   wapbl_blkhash_get(wr,
						   wapbl_block_daddr(wc, i, j, fsblklen));
						/*
						 * Only compare when the hash
						 * entry refers to this very
						 * log offset; otherwise just
						 * step over the block.
						 */
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
								   scratch2,
								   fsblklen)) {
								printf(
		"wapbl_verify: mismatch block %" PRId64" at off %" PRIdMAX"\n",
		wb->wb_blk, (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
					 *
					 * NOTE(review): this uses saveoff,
					 * which is only declared under DEBUG
					 * above.
					 */
					if ((foundcnt != 0) &&
					    (dirtycnt == 0)) {
						off = saveoff;
						wapbl_circ_advance(wr,
						    logblklen, &off);
						for (j = 0; j < n; j++) {
							struct wapbl_blk *wb =
							   wapbl_blkhash_get(wr,
							   wapbl_block_daddr(wc, i, j, fsblklen));
							if (wb &&
							  (wb->wb_off == off)) {
								wapbl_blkhash_rem(wr, wb->wb_blk);
							}
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#endif
				}
			}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			/* Nothing to verify for these record types. */
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		/* The record's declared length must match what was consumed. */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
 out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	/* A clean walk that found differing blocks is still a failure. */
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif
| 3092 | |
| 3093 | int |
| 3094 | wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) |
| 3095 | { |
| 3096 | struct wapbl_blk *wb; |
| 3097 | size_t i; |
| 3098 | off_t off; |
| 3099 | void *scratch; |
| 3100 | int error = 0; |
| 3101 | int fsblklen = 1 << wr->wr_fs_dev_bshift; |
| 3102 | |
| 3103 | KDASSERT(wapbl_replay_isopen(wr)); |
| 3104 | |
| 3105 | scratch = wapbl_alloc(MAXBSIZE); |
| 3106 | |
| 3107 | for (i = 0; i <= wr->wr_blkhashmask; ++i) { |
| 3108 | LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { |
| 3109 | off = wb->wb_off; |
| 3110 | error = wapbl_circ_read(wr, scratch, fsblklen, &off); |
| 3111 | if (error) |
| 3112 | break; |
| 3113 | error = wapbl_write(scratch, fsblklen, fsdevvp, |
| 3114 | wb->wb_blk); |
| 3115 | if (error) |
| 3116 | break; |
| 3117 | } |
| 3118 | } |
| 3119 | |
| 3120 | wapbl_free(scratch, MAXBSIZE); |
| 3121 | return error; |
| 3122 | } |
| 3123 | |
| 3124 | int |
| 3125 | wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) |
| 3126 | { |
| 3127 | int fsblklen = 1 << wr->wr_fs_dev_bshift; |
| 3128 | |
| 3129 | KDASSERT(wapbl_replay_isopen(wr)); |
| 3130 | KASSERT((len % fsblklen) == 0); |
| 3131 | |
| 3132 | while (len != 0) { |
| 3133 | struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); |
| 3134 | if (wb) |
| 3135 | return 1; |
| 3136 | len -= fsblklen; |
| 3137 | } |
| 3138 | return 0; |
| 3139 | } |
| 3140 | |
| 3141 | int |
| 3142 | wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) |
| 3143 | { |
| 3144 | int fsblklen = 1 << wr->wr_fs_dev_bshift; |
| 3145 | |
| 3146 | KDASSERT(wapbl_replay_isopen(wr)); |
| 3147 | |
| 3148 | KASSERT((len % fsblklen) == 0); |
| 3149 | |
| 3150 | while (len != 0) { |
| 3151 | struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); |
| 3152 | if (wb) { |
| 3153 | off_t off = wb->wb_off; |
| 3154 | int error; |
| 3155 | error = wapbl_circ_read(wr, data, fsblklen, &off); |
| 3156 | if (error) |
| 3157 | return error; |
| 3158 | } |
| 3159 | data = (uint8_t *)data + fsblklen; |
| 3160 | len -= fsblklen; |
| 3161 | blk++; |
| 3162 | } |
| 3163 | return 0; |
| 3164 | } |
| 3165 | |
| 3166 | #ifdef _KERNEL |
| 3167 | |
| 3168 | MODULE(MODULE_CLASS_VFS, wapbl, NULL); |
| 3169 | |
| 3170 | static int |
| 3171 | wapbl_modcmd(modcmd_t cmd, void *arg) |
| 3172 | { |
| 3173 | |
| 3174 | switch (cmd) { |
| 3175 | case MODULE_CMD_INIT: |
| 3176 | wapbl_init(); |
| 3177 | return 0; |
| 3178 | case MODULE_CMD_FINI: |
| 3179 | return wapbl_fini(); |
| 3180 | default: |
| 3181 | return ENOTTY; |
| 3182 | } |
| 3183 | } |
| 3184 | #endif /* _KERNEL */ |
| 3185 | |