| 1 | /* $NetBSD: ffs_wapbl.c,v 1.37 2016/11/10 22:19:23 jdolecek Exp $ */ |
| 2 | |
| 3 | /*- |
| 4 | * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. |
| 5 | * All rights reserved. |
| 6 | * |
| 7 | * This code is derived from software contributed to The NetBSD Foundation |
| 8 | * by Wasabi Systems, Inc. |
| 9 | * |
| 10 | * Redistribution and use in source and binary forms, with or without |
| 11 | * modification, are permitted provided that the following conditions |
| 12 | * are met: |
| 13 | * 1. Redistributions of source code must retain the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer. |
| 15 | * 2. Redistributions in binary form must reproduce the above copyright |
| 16 | * notice, this list of conditions and the following disclaimer in the |
| 17 | * documentation and/or other materials provided with the distribution. |
| 18 | * |
| 19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
| 20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| 21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
| 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 29 | * POSSIBILITY OF SUCH DAMAGE. |
| 30 | */ |
| 31 | |
| 32 | #include <sys/cdefs.h> |
| 33 | __KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.37 2016/11/10 22:19:23 jdolecek Exp $" ); |
| 34 | |
| 35 | #define WAPBL_INTERNAL |
| 36 | |
| 37 | #if defined(_KERNEL_OPT) |
| 38 | #include "opt_ffs.h" |
| 39 | #endif |
| 40 | |
| 41 | #include <sys/param.h> |
| 42 | #include <sys/systm.h> |
| 43 | #include <sys/kernel.h> |
| 44 | #include <sys/vnode.h> |
| 45 | #include <sys/mount.h> |
| 46 | #include <sys/file.h> |
| 47 | #include <sys/disk.h> |
| 48 | #include <sys/ioctl.h> |
| 49 | #include <sys/errno.h> |
| 50 | #include <sys/kauth.h> |
| 51 | #include <sys/wapbl.h> |
| 52 | |
| 53 | #include <ufs/ufs/inode.h> |
| 54 | #include <ufs/ufs/quota.h> |
| 55 | #include <ufs/ufs/ufsmount.h> |
| 56 | #include <ufs/ufs/ufs_bswap.h> |
| 57 | #include <ufs/ufs/ufs_extern.h> |
| 58 | #include <ufs/ufs/ufs_wapbl.h> |
| 59 | |
| 60 | #include <ufs/ffs/fs.h> |
| 61 | #include <ufs/ffs/ffs_extern.h> |
| 62 | |
/*
 * Debug tracing for this file.
 *
 * WAPBL_DEBUG is explicitly undefined right here, so unless that #undef is
 * removed DPRINTF() expands to an empty statement and its arguments are
 * never evaluated.  With WAPBL_DEBUG defined, DPRINTF() prints the calling
 * function and line number followed by the formatted message, gated at run
 * time by the ffs_wapbl_debug variable.
 */
#undef WAPBL_DEBUG
#ifndef WAPBL_DEBUG
#define DPRINTF(fmt, args...)						\
	do {								\
		/* nothing */						\
	} while (/* CONSTCOND */0)
#else
int ffs_wapbl_debug = 1;
#define DPRINTF(fmt, args...)						\
	do {								\
		if (ffs_wapbl_debug)					\
			printf("%s:%d "fmt, __func__ , __LINE__, ##args); \
	} while (/* CONSTCOND */0)
#endif
| 77 | |
| 78 | static int ffs_superblock_layout(struct fs *); |
| 79 | static int wapbl_log_position(struct mount *, struct fs *, struct vnode *, |
| 80 | daddr_t *, size_t *, size_t *, uint64_t *); |
| 81 | static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *, |
| 82 | daddr_t *, size_t *, uint64_t *); |
| 83 | static void wapbl_find_log_start(struct mount *, struct vnode *, off_t, |
| 84 | daddr_t *, daddr_t *, size_t *); |
| 85 | static int wapbl_remove_log(struct mount *); |
| 86 | static int wapbl_allocate_log_file(struct mount *, struct vnode *, |
| 87 | daddr_t *, size_t *, uint64_t *); |
| 88 | |
| 89 | /* |
| 90 | * Return the super block layout format - UFS1 or UFS2. |
| 91 | * WAPBL only works with UFS2 layout (which is still available |
| 92 | * with FFSv1). |
| 93 | * |
| 94 | * XXX Should this be in ufs/ffs/fs.h? Same style of check is |
| 95 | * also used in ffs_alloc.c in a few places. |
| 96 | */ |
| 97 | static int |
| 98 | ffs_superblock_layout(struct fs *fs) |
| 99 | { |
| 100 | if ((fs->fs_magic == FS_UFS1_MAGIC) && |
| 101 | ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) |
| 102 | return 1; |
| 103 | else |
| 104 | return 2; |
| 105 | } |
| 106 | |
| 107 | /* |
| 108 | * This function is invoked after a log is replayed to |
| 109 | * disk to perform logical cleanup actions as described by |
| 110 | * the log |
| 111 | */ |
| 112 | void |
| 113 | ffs_wapbl_replay_finish(struct mount *mp) |
| 114 | { |
| 115 | struct wapbl_replay *wr = mp->mnt_wapbl_replay; |
| 116 | int i; |
| 117 | int error; |
| 118 | |
| 119 | if (!wr) |
| 120 | return; |
| 121 | |
| 122 | KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0); |
| 123 | |
| 124 | for (i = 0; i < wr->wr_inodescnt; i++) { |
| 125 | struct vnode *vp; |
| 126 | struct inode *ip; |
| 127 | error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp); |
| 128 | if (error) { |
| 129 | printf("%s: %s: unable to cleanup inode %" PRIu32 "\n" , |
| 130 | __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt, |
| 131 | wr->wr_inodes[i].wr_inumber); |
| 132 | continue; |
| 133 | } |
| 134 | ip = VTOI(vp); |
| 135 | KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number); |
| 136 | #ifdef WAPBL_DEBUG |
| 137 | printf("%s%s: %s: cleaning inode %" PRIu64 " size=%" PRIu64 |
| 138 | " mode=%o nlink=%d\n" , |
| 139 | __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt, |
| 140 | ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink); |
| 141 | #endif |
| 142 | KASSERT(ip->i_nlink == 0); |
| 143 | |
| 144 | /* |
| 145 | * The journal may have left partially allocated inodes in mode |
| 146 | * zero. This may occur if a crash occurs betweeen the node |
| 147 | * allocation in ffs_nodeallocg and when the node is properly |
| 148 | * initialized in ufs_makeinode. If so, just dallocate them. |
| 149 | */ |
| 150 | if (ip->i_mode == 0) { |
| 151 | error = UFS_WAPBL_BEGIN(mp); |
| 152 | if (error) { |
| 153 | printf("%s: %s: " |
| 154 | "unable to cleanup inode %" PRIu32 "\n" , |
| 155 | __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt, |
| 156 | wr->wr_inodes[i].wr_inumber); |
| 157 | } else { |
| 158 | ffs_vfree(vp, ip->i_number, |
| 159 | wr->wr_inodes[i].wr_imode); |
| 160 | UFS_WAPBL_END(mp); |
| 161 | } |
| 162 | } |
| 163 | vput(vp); |
| 164 | } |
| 165 | wapbl_replay_stop(wr); |
| 166 | wapbl_replay_free(wr); |
| 167 | mp->mnt_wapbl_replay = NULL; |
| 168 | } |
| 169 | |
| 170 | /* Callback for wapbl */ |
| 171 | void |
| 172 | ffs_wapbl_sync_metadata(struct mount *mp, struct wapbl_dealloc *fdealloc) |
| 173 | { |
| 174 | struct ufsmount *ump = VFSTOUFS(mp); |
| 175 | struct fs *fs = ump->um_fs; |
| 176 | int error __diagused; |
| 177 | struct wapbl_dealloc *wd; |
| 178 | |
| 179 | UFS_WAPBL_JLOCK_ASSERT(mp); |
| 180 | |
| 181 | #ifdef WAPBL_DEBUG_INODES |
| 182 | ufs_wapbl_verify_inodes(mp, __func__); |
| 183 | #endif |
| 184 | |
| 185 | for (wd = fdealloc; wd != NULL; wd = TAILQ_NEXT(wd, wd_entries)) { |
| 186 | /* |
| 187 | * blkfree errors are unreported, might silently fail |
| 188 | * if it cannot read the cylinder group block |
| 189 | */ |
| 190 | ffs_blkfree(fs, ump->um_devvp, |
| 191 | FFS_DBTOFSB(fs, wd->wd_blkno), wd->wd_len, -1); |
| 192 | } |
| 193 | |
| 194 | if (fs->fs_fmod != 0) { |
| 195 | fs->fs_fmod = 0; |
| 196 | fs->fs_time = time_second; |
| 197 | error = ffs_cgupdate(ump, 0); |
| 198 | KASSERT(error == 0); |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | void |
| 203 | ffs_wapbl_abort_sync_metadata(struct mount *mp, struct wapbl_dealloc *fdealloc) |
| 204 | { |
| 205 | struct ufsmount *ump = VFSTOUFS(mp); |
| 206 | struct fs *fs = ump->um_fs; |
| 207 | struct wapbl_dealloc *wd; |
| 208 | |
| 209 | for (wd = fdealloc; wd != NULL; wd = TAILQ_NEXT(wd, wd_entries)) { |
| 210 | /* |
| 211 | * Since the above blkfree may have failed, this blkalloc might |
| 212 | * fail as well, so don't check its error. Note that if the |
| 213 | * blkfree succeeded above, then this shouldn't fail because |
| 214 | * the buffer will be locked in the current transaction. |
| 215 | */ |
| 216 | ffs_blkalloc_ump(ump, FFS_DBTOFSB(fs, wd->wd_blkno), |
| 217 | wd->wd_len); |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | static int |
| 222 | wapbl_remove_log(struct mount *mp) |
| 223 | { |
| 224 | struct ufsmount *ump = VFSTOUFS(mp); |
| 225 | struct fs *fs = ump->um_fs; |
| 226 | struct vnode *vp; |
| 227 | struct inode *ip; |
| 228 | ino_t log_ino; |
| 229 | int error; |
| 230 | |
| 231 | /* If super block layout is too old to support WAPBL, return */ |
| 232 | if (ffs_superblock_layout(fs) < 2) |
| 233 | return 0; |
| 234 | |
| 235 | /* If all the log locators are 0, just clean up */ |
| 236 | if (fs->fs_journallocs[0] == 0 && |
| 237 | fs->fs_journallocs[1] == 0 && |
| 238 | fs->fs_journallocs[2] == 0 && |
| 239 | fs->fs_journallocs[3] == 0) { |
| 240 | DPRINTF("empty locators, just clear\n" ); |
| 241 | goto done; |
| 242 | } |
| 243 | |
| 244 | switch (fs->fs_journal_location) { |
| 245 | case UFS_WAPBL_JOURNALLOC_NONE: |
| 246 | /* nothing! */ |
| 247 | DPRINTF("no log\n" ); |
| 248 | break; |
| 249 | |
| 250 | case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: |
| 251 | log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; |
| 252 | DPRINTF("in-fs log, ino = %" PRId64 "\n" ,log_ino); |
| 253 | |
| 254 | /* if no existing log inode, just clear all fields and bail */ |
| 255 | if (log_ino == 0) |
| 256 | goto done; |
| 257 | error = VFS_VGET(mp, log_ino, &vp); |
| 258 | if (error != 0) { |
| 259 | printf("%s: %s: vget failed %d\n" , __func__, |
| 260 | fs->fs_fsmnt, error); |
| 261 | /* clear out log info on error */ |
| 262 | goto done; |
| 263 | } |
| 264 | ip = VTOI(vp); |
| 265 | KASSERT(log_ino == ip->i_number); |
| 266 | if ((ip->i_flags & SF_LOG) == 0) { |
| 267 | printf("%s: %s: try to clear non-log inode " |
| 268 | "%" PRId64 "\n" , __func__, fs->fs_fsmnt, log_ino); |
| 269 | vput(vp); |
| 270 | /* clear out log info on error */ |
| 271 | goto done; |
| 272 | } |
| 273 | |
| 274 | /* |
| 275 | * remove the log inode by setting its link count back |
| 276 | * to zero and bail. |
| 277 | */ |
| 278 | ip->i_nlink = 0; |
| 279 | DIP_ASSIGN(ip, nlink, 0); |
| 280 | vput(vp); |
| 281 | break; |
| 282 | |
| 283 | case UFS_WAPBL_JOURNALLOC_END_PARTITION: |
| 284 | DPRINTF("end-of-partition log\n" ); |
| 285 | /* no extra work required */ |
| 286 | break; |
| 287 | |
| 288 | default: |
| 289 | printf("%s: %s: unknown journal type %d\n" , __func__, |
| 290 | fs->fs_fsmnt, fs->fs_journal_location); |
| 291 | break; |
| 292 | } |
| 293 | |
| 294 | |
| 295 | done: |
| 296 | /* Clear out all previous knowledge of journal */ |
| 297 | fs->fs_journal_version = 0; |
| 298 | fs->fs_journal_location = 0; |
| 299 | fs->fs_journal_flags = 0; |
| 300 | fs->fs_journallocs[0] = 0; |
| 301 | fs->fs_journallocs[1] = 0; |
| 302 | fs->fs_journallocs[2] = 0; |
| 303 | fs->fs_journallocs[3] = 0; |
| 304 | (void) ffs_sbupdate(ump, MNT_WAIT); |
| 305 | |
| 306 | return 0; |
| 307 | } |
| 308 | |
| 309 | int |
| 310 | ffs_wapbl_start(struct mount *mp) |
| 311 | { |
| 312 | struct ufsmount *ump = VFSTOUFS(mp); |
| 313 | struct fs *fs = ump->um_fs; |
| 314 | struct vnode *devvp = ump->um_devvp; |
| 315 | daddr_t off; |
| 316 | size_t count; |
| 317 | size_t blksize; |
| 318 | uint64_t ; |
| 319 | int error; |
| 320 | |
| 321 | if (mp->mnt_wapbl == NULL) { |
| 322 | if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) { |
| 323 | /* Clear out any existing journal file */ |
| 324 | error = wapbl_remove_log(mp); |
| 325 | if (error != 0) |
| 326 | return error; |
| 327 | } |
| 328 | |
| 329 | if (mp->mnt_flag & MNT_LOG) { |
| 330 | KDASSERT(fs->fs_ronly == 0); |
| 331 | |
| 332 | /* WAPBL needs UFS2 format super block */ |
| 333 | if (ffs_superblock_layout(fs) < 2) { |
| 334 | printf("%s: %s: fs superblock in old format, " |
| 335 | "not journaling\n" , __func__, |
| 336 | VFSTOUFS(mp)->um_fs->fs_fsmnt); |
| 337 | mp->mnt_flag &= ~MNT_LOG; |
| 338 | return EINVAL; |
| 339 | } |
| 340 | |
| 341 | error = wapbl_log_position(mp, fs, devvp, &off, |
| 342 | &count, &blksize, &extradata); |
| 343 | if (error) |
| 344 | return error; |
| 345 | |
| 346 | error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off, |
| 347 | count, blksize, mp->mnt_wapbl_replay, |
| 348 | ffs_wapbl_sync_metadata, |
| 349 | ffs_wapbl_abort_sync_metadata); |
| 350 | if (error) |
| 351 | return error; |
| 352 | |
| 353 | mp->mnt_wapbl_op = &wapbl_ops; |
| 354 | |
| 355 | #ifdef WAPBL_DEBUG |
| 356 | printf("%s: %s: enabling logging\n" , __func__, |
| 357 | fs->fs_fsmnt); |
| 358 | #endif |
| 359 | |
| 360 | if ((fs->fs_flags & FS_DOWAPBL) == 0) { |
| 361 | fs->fs_flags |= FS_DOWAPBL; |
| 362 | if ((error = UFS_WAPBL_BEGIN(mp)) != 0) |
| 363 | goto out; |
| 364 | error = ffs_sbupdate(ump, MNT_WAIT); |
| 365 | if (error) { |
| 366 | UFS_WAPBL_END(mp); |
| 367 | goto out; |
| 368 | } |
| 369 | UFS_WAPBL_END(mp); |
| 370 | error = wapbl_flush(mp->mnt_wapbl, 1); |
| 371 | if (error) |
| 372 | goto out; |
| 373 | } |
| 374 | |
| 375 | /* |
| 376 | * XXX discard interferes with block deallocation |
| 377 | * registration and hence log consistency |
| 378 | */ |
| 379 | if (mp->mnt_flag & MNT_DISCARD) { |
| 380 | CLR(mp->mnt_flag, MNT_DISCARD); |
| 381 | printf("%s: %s: disabling discard to preserve log consistency\n" , __func__, |
| 382 | fs->fs_fsmnt); |
| 383 | |
| 384 | if (ump->um_discarddata != NULL) { |
| 385 | ffs_discard_finish(ump->um_discarddata, |
| 386 | 0); |
| 387 | ump->um_discarddata = NULL; |
| 388 | } |
| 389 | } |
| 390 | |
| 391 | } else if (fs->fs_flags & FS_DOWAPBL) { |
| 392 | fs->fs_fmod = 1; |
| 393 | fs->fs_flags &= ~FS_DOWAPBL; |
| 394 | } |
| 395 | } |
| 396 | |
| 397 | /* |
| 398 | * It is recommended that you finish replay with logging enabled. |
| 399 | * However, even if logging is not enabled, the remaining log |
| 400 | * replay should be safely recoverable with an fsck, so perform |
| 401 | * it anyway. |
| 402 | */ |
| 403 | if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) { |
| 404 | int saveflag = mp->mnt_flag & MNT_RDONLY; |
| 405 | /* |
| 406 | * Make sure MNT_RDONLY is not set so that the inode |
| 407 | * cleanup in ufs_inactive will actually do its work. |
| 408 | */ |
| 409 | mp->mnt_flag &= ~MNT_RDONLY; |
| 410 | ffs_wapbl_replay_finish(mp); |
| 411 | mp->mnt_flag |= saveflag; |
| 412 | KASSERT(fs->fs_ronly == 0); |
| 413 | } |
| 414 | |
| 415 | return 0; |
| 416 | out: |
| 417 | ffs_wapbl_stop(mp, MNT_FORCE); |
| 418 | return error; |
| 419 | } |
| 420 | |
| 421 | int |
| 422 | ffs_wapbl_stop(struct mount *mp, int force) |
| 423 | { |
| 424 | struct ufsmount *ump = VFSTOUFS(mp); |
| 425 | struct fs *fs = ump->um_fs; |
| 426 | int error; |
| 427 | |
| 428 | if (mp->mnt_wapbl) { |
| 429 | KDASSERT(fs->fs_ronly == 0); |
| 430 | |
| 431 | /* |
| 432 | * Make sure turning off FS_DOWAPBL is only removed |
| 433 | * as the only change in the final flush since otherwise |
| 434 | * a transaction may reorder writes. |
| 435 | */ |
| 436 | error = wapbl_flush(mp->mnt_wapbl, 1); |
| 437 | if (error && !force) |
| 438 | return error; |
| 439 | if (error && force) |
| 440 | goto forceout; |
| 441 | error = UFS_WAPBL_BEGIN(mp); |
| 442 | if (error && !force) |
| 443 | return error; |
| 444 | if (error && force) |
| 445 | goto forceout; |
| 446 | KASSERT(fs->fs_flags & FS_DOWAPBL); |
| 447 | |
| 448 | fs->fs_flags &= ~FS_DOWAPBL; |
| 449 | error = ffs_sbupdate(ump, MNT_WAIT); |
| 450 | KASSERT(error == 0); /* XXX a bit drastic! */ |
| 451 | UFS_WAPBL_END(mp); |
| 452 | forceout: |
| 453 | error = wapbl_stop(mp->mnt_wapbl, force); |
| 454 | if (error) { |
| 455 | KASSERT(!force); |
| 456 | fs->fs_flags |= FS_DOWAPBL; |
| 457 | return error; |
| 458 | } |
| 459 | fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */ |
| 460 | mp->mnt_wapbl = NULL; |
| 461 | |
| 462 | #ifdef WAPBL_DEBUG |
| 463 | printf("%s: %s: disabled logging\n" , __func__, fs->fs_fsmnt); |
| 464 | #endif |
| 465 | } |
| 466 | |
| 467 | return 0; |
| 468 | } |
| 469 | |
| 470 | int |
| 471 | ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp) |
| 472 | { |
| 473 | int error; |
| 474 | daddr_t off; |
| 475 | size_t count; |
| 476 | size_t blksize; |
| 477 | uint64_t ; |
| 478 | |
| 479 | /* |
| 480 | * WAPBL needs UFS2 format super block, if we got here with a |
| 481 | * UFS1 format super block something is amiss... |
| 482 | */ |
| 483 | if (ffs_superblock_layout(fs) < 2) |
| 484 | return EINVAL; |
| 485 | |
| 486 | error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize, |
| 487 | &extradata); |
| 488 | |
| 489 | if (error) |
| 490 | return error; |
| 491 | |
| 492 | error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off, |
| 493 | count, blksize); |
| 494 | if (error) |
| 495 | return error; |
| 496 | |
| 497 | mp->mnt_wapbl_op = &wapbl_ops; |
| 498 | |
| 499 | return 0; |
| 500 | } |
| 501 | |
| 502 | /* |
| 503 | * If the superblock doesn't already have a recorded journal location |
| 504 | * then we allocate the journal in one of two positions: |
| 505 | * |
| 506 | * - At the end of the partition after the filesystem if there's |
| 507 | * enough space. "Enough space" is defined as >= 1MB of journal |
| 508 | * per 1GB of filesystem or 64MB, whichever is smaller. |
| 509 | * |
| 510 | * - Inside the filesystem. We try to allocate a contiguous journal |
| 511 | * based on the total filesystem size - the target is 1MB of journal |
| 512 | * per 1GB of filesystem, up to a maximum journal size of 64MB. As |
| 513 | * a worst case allowing for fragmentation, we'll allocate a journal |
| 514 | * 1/4 of the desired size but never smaller than 1MB. |
| 515 | * |
| 516 | * XXX In the future if we allow for non-contiguous journal files we |
| 517 | * can tighten the above restrictions. |
| 518 | * |
| 519 | * XXX |
| 520 | * These seems like a lot of duplication both here and in some of |
| 521 | * the userland tools (fsck_ffs, dumpfs, tunefs) with similar |
| 522 | * "switch (fs_journal_location)" constructs. Can we centralise |
| 523 | * this sort of code somehow/somewhere? |
| 524 | */ |
| 525 | static int |
| 526 | wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp, |
| 527 | daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *) |
| 528 | { |
| 529 | struct ufsmount *ump = VFSTOUFS(mp); |
| 530 | daddr_t logstart, logend, desired_logsize; |
| 531 | uint64_t numsecs; |
| 532 | unsigned secsize; |
| 533 | int error, location; |
| 534 | |
| 535 | if (fs->fs_journal_version == UFS_WAPBL_VERSION) { |
| 536 | switch (fs->fs_journal_location) { |
| 537 | case UFS_WAPBL_JOURNALLOC_END_PARTITION: |
| 538 | DPRINTF("found existing end-of-partition log\n" ); |
| 539 | *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR]; |
| 540 | *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; |
| 541 | *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; |
| 542 | DPRINTF(" start = %" PRId64 ", size = %zu, " |
| 543 | "blksize = %zu\n" , *startp, *countp, *blksizep); |
| 544 | return 0; |
| 545 | |
| 546 | case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: |
| 547 | DPRINTF("found existing in-filesystem log\n" ); |
| 548 | *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; |
| 549 | *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; |
| 550 | *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; |
| 551 | DPRINTF(" start = %" PRId64 ", size = %zu, " |
| 552 | "blksize = %zu\n" , *startp, *countp, *blksizep); |
| 553 | return 0; |
| 554 | |
| 555 | default: |
| 556 | printf("%s: %s: unknown journal type %d\n" , __func__, |
| 557 | fs->fs_fsmnt, fs->fs_journal_location); |
| 558 | return EINVAL; |
| 559 | } |
| 560 | } |
| 561 | |
| 562 | desired_logsize = |
| 563 | ffs_lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE; |
| 564 | DPRINTF("desired log size = %" PRId64 " kB\n" , desired_logsize / 1024); |
| 565 | desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); |
| 566 | desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); |
| 567 | DPRINTF("adjusted desired log size = %" PRId64 " kB\n" , |
| 568 | desired_logsize / 1024); |
| 569 | |
| 570 | /* Is there space after after filesystem on partition for log? */ |
| 571 | logstart = FFS_FSBTODB(fs, fs->fs_size); |
| 572 | error = getdisksize(devvp, &numsecs, &secsize); |
| 573 | if (error) |
| 574 | return error; |
| 575 | KDASSERT(secsize != 0); |
| 576 | logend = btodb(numsecs * secsize); |
| 577 | |
| 578 | if (dbtob(logend - logstart) >= desired_logsize) { |
| 579 | DPRINTF("enough space, use end-of-partition log\n" ); |
| 580 | |
| 581 | location = UFS_WAPBL_JOURNALLOC_END_PARTITION; |
| 582 | *blksizep = secsize; |
| 583 | |
| 584 | *startp = logstart; |
| 585 | *countp = (logend - logstart); |
| 586 | *extradatap = 0; |
| 587 | |
| 588 | /* convert to physical block numbers */ |
| 589 | *startp = dbtob(*startp) / secsize; |
| 590 | *countp = dbtob(*countp) / secsize; |
| 591 | |
| 592 | fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp; |
| 593 | fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp; |
| 594 | fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep; |
| 595 | fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap; |
| 596 | } else { |
| 597 | DPRINTF("end-of-partition has only %" PRId64 " free\n" , |
| 598 | logend - logstart); |
| 599 | |
| 600 | location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; |
| 601 | *blksizep = secsize; |
| 602 | |
| 603 | error = wapbl_create_infs_log(mp, fs, devvp, |
| 604 | startp, countp, extradatap); |
| 605 | ffs_sync(mp, MNT_WAIT, FSCRED); |
| 606 | |
| 607 | /* convert to physical block numbers */ |
| 608 | *startp = dbtob(*startp) / secsize; |
| 609 | *countp = dbtob(*countp) / secsize; |
| 610 | |
| 611 | fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp; |
| 612 | fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp; |
| 613 | fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep; |
| 614 | fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap; |
| 615 | } |
| 616 | |
| 617 | if (error == 0) { |
| 618 | /* update superblock with log location */ |
| 619 | fs->fs_journal_version = UFS_WAPBL_VERSION; |
| 620 | fs->fs_journal_location = location; |
| 621 | fs->fs_journal_flags = 0; |
| 622 | |
| 623 | error = ffs_sbupdate(ump, MNT_WAIT); |
| 624 | } |
| 625 | |
| 626 | return error; |
| 627 | } |
| 628 | |
| 629 | /* |
| 630 | * Try to create a journal log inside the filesystem. |
| 631 | */ |
| 632 | static int |
| 633 | wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp, |
| 634 | daddr_t *startp, size_t *countp, uint64_t *) |
| 635 | { |
| 636 | struct vnode *vp, *rvp; |
| 637 | struct vattr va; |
| 638 | struct inode *ip; |
| 639 | int error; |
| 640 | |
| 641 | if ((error = VFS_ROOT(mp, &rvp)) != 0) |
| 642 | return error; |
| 643 | |
| 644 | vattr_null(&va); |
| 645 | va.va_type = VREG; |
| 646 | va.va_mode = 0; |
| 647 | |
| 648 | error = vcache_new(mp, rvp, &va, NOCRED, &vp); |
| 649 | vput(rvp); |
| 650 | if (error) |
| 651 | return error; |
| 652 | |
| 653 | error = vn_lock(vp, LK_EXCLUSIVE); |
| 654 | if (error) { |
| 655 | vrele(vp); |
| 656 | return error; |
| 657 | } |
| 658 | |
| 659 | ip = VTOI(vp); |
| 660 | ip->i_flags = SF_LOG; |
| 661 | DIP_ASSIGN(ip, flags, ip->i_flags); |
| 662 | ip->i_nlink = 1; |
| 663 | DIP_ASSIGN(ip, nlink, 1); |
| 664 | ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; |
| 665 | ffs_update(vp, NULL, NULL, UPDATE_WAIT); |
| 666 | |
| 667 | if ((error = wapbl_allocate_log_file(mp, vp, |
| 668 | startp, countp, extradatap)) != 0) { |
| 669 | /* |
| 670 | * If we couldn't allocate the space for the log file, |
| 671 | * remove the inode by setting its link count back to |
| 672 | * zero and bail. |
| 673 | */ |
| 674 | ip->i_nlink = 0; |
| 675 | DIP_ASSIGN(ip, nlink, 0); |
| 676 | VOP_UNLOCK(vp); |
| 677 | vgone(vp); |
| 678 | |
| 679 | return error; |
| 680 | } |
| 681 | |
| 682 | /* |
| 683 | * Now that we have the place-holder inode for the journal, |
| 684 | * we don't need the vnode ever again. |
| 685 | */ |
| 686 | VOP_UNLOCK(vp); |
| 687 | vgone(vp); |
| 688 | |
| 689 | return 0; |
| 690 | } |
| 691 | |
| 692 | int |
| 693 | wapbl_allocate_log_file(struct mount *mp, struct vnode *vp, |
| 694 | daddr_t *startp, size_t *countp, uint64_t *) |
| 695 | { |
| 696 | struct ufsmount *ump = VFSTOUFS(mp); |
| 697 | struct fs *fs = ump->um_fs; |
| 698 | daddr_t addr, indir_addr; |
| 699 | off_t logsize; |
| 700 | size_t size; |
| 701 | int error; |
| 702 | |
| 703 | logsize = 0; |
| 704 | /* check if there's a suggested log size */ |
| 705 | if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG && |
| 706 | fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) |
| 707 | logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; |
| 708 | |
| 709 | if (vp->v_size > 0) { |
| 710 | printf("%s: %s: file size (%" PRId64 ") non zero\n" , __func__, |
| 711 | fs->fs_fsmnt, vp->v_size); |
| 712 | return EEXIST; |
| 713 | } |
| 714 | wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size); |
| 715 | if (addr == 0) { |
| 716 | printf("%s: %s: log not allocated, largest extent is " |
| 717 | "%" PRId64 "MB\n" , __func__, fs->fs_fsmnt, |
| 718 | ffs_lblktosize(fs, size) / (1024 * 1024)); |
| 719 | return ENOSPC; |
| 720 | } |
| 721 | |
| 722 | logsize = ffs_lblktosize(fs, size); /* final log size */ |
| 723 | |
| 724 | VTOI(vp)->i_ffs_first_data_blk = addr; |
| 725 | VTOI(vp)->i_ffs_first_indir_blk = indir_addr; |
| 726 | |
| 727 | error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED); |
| 728 | if (error) { |
| 729 | printf("%s: %s: GOP_ALLOC error %d\n" , __func__, fs->fs_fsmnt, |
| 730 | error); |
| 731 | return error; |
| 732 | } |
| 733 | |
| 734 | *startp = FFS_FSBTODB(fs, addr); |
| 735 | *countp = btodb(logsize); |
| 736 | *extradatap = VTOI(vp)->i_number; |
| 737 | |
| 738 | return 0; |
| 739 | } |
| 740 | |
| 741 | /* |
| 742 | * Find a suitable location for the journal in the filesystem. |
| 743 | * |
| 744 | * Our strategy here is to look for a contiguous block of free space |
| 745 | * at least "logfile" MB in size (plus room for any indirect blocks). |
| 746 | * We start at the middle of the filesystem and check each cylinder |
| 747 | * group working outwards. If "logfile" MB is not available as a |
| 748 | * single contigous chunk, then return the address and size of the |
| 749 | * largest chunk found. |
| 750 | * |
| 751 | * XXX |
| 752 | * At what stage does the search fail? Is if the largest space we could |
| 753 | * find is less than a quarter the requested space reasonable? If the |
| 754 | * search fails entirely, return a block address if "0" it indicate this. |
| 755 | */ |
| 756 | static void |
| 757 | wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize, |
| 758 | daddr_t *addr, daddr_t *indir_addr, size_t *size) |
| 759 | { |
| 760 | struct ufsmount *ump = VFSTOUFS(mp); |
| 761 | struct fs *fs = ump->um_fs; |
| 762 | struct vnode *devvp = ump->um_devvp; |
| 763 | struct cg *cgp; |
| 764 | struct buf *bp; |
| 765 | uint8_t *blksfree; |
| 766 | daddr_t blkno, best_addr, start_addr; |
| 767 | daddr_t desired_blks, min_desired_blks; |
| 768 | daddr_t freeblks, best_blks; |
| 769 | int bpcg, cg, error, fixedsize, indir_blks, n, s; |
| 770 | const int needswap = UFS_FSNEEDSWAP(fs); |
| 771 | |
| 772 | if (logsize == 0) { |
| 773 | fixedsize = 0; /* We can adjust the size if tight */ |
| 774 | logsize = ffs_lfragtosize(fs, fs->fs_dsize) / |
| 775 | UFS_WAPBL_JOURNAL_SCALE; |
| 776 | DPRINTF("suggested log size = %" PRId64 "\n" , logsize); |
| 777 | logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); |
| 778 | logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); |
| 779 | DPRINTF("adjusted log size = %" PRId64 "\n" , logsize); |
| 780 | } else { |
| 781 | fixedsize = 1; |
| 782 | DPRINTF("fixed log size = %" PRId64 "\n" , logsize); |
| 783 | } |
| 784 | |
| 785 | desired_blks = logsize / fs->fs_bsize; |
| 786 | DPRINTF("desired blocks = %" PRId64 "\n" , desired_blks); |
| 787 | |
| 788 | /* add in number of indirect blocks needed */ |
| 789 | indir_blks = 0; |
| 790 | if (desired_blks >= UFS_NDADDR) { |
| 791 | struct indir indirs[UFS_NIADDR + 2]; |
| 792 | int num; |
| 793 | |
| 794 | error = ufs_getlbns(vp, desired_blks, indirs, &num); |
| 795 | if (error) { |
| 796 | printf("%s: %s: ufs_getlbns failed, error %d!\n" , |
| 797 | __func__, fs->fs_fsmnt, error); |
| 798 | goto bad; |
| 799 | } |
| 800 | |
| 801 | switch (num) { |
| 802 | case 2: |
| 803 | indir_blks = 1; /* 1st level indirect */ |
| 804 | break; |
| 805 | case 3: |
| 806 | indir_blks = 1 + /* 1st level indirect */ |
| 807 | 1 + /* 2nd level indirect */ |
| 808 | indirs[1].in_off + 1; /* extra 1st level indirect */ |
| 809 | break; |
| 810 | default: |
| 811 | printf("%s: %s: unexpected numlevels %d from " |
| 812 | "ufs_getlbns\n" , __func__, fs->fs_fsmnt, num); |
| 813 | *size = 0; |
| 814 | goto bad; |
| 815 | } |
| 816 | desired_blks += indir_blks; |
| 817 | } |
| 818 | DPRINTF("desired blocks = %" PRId64 " (including indirect)\n" , |
| 819 | desired_blks); |
| 820 | |
| 821 | /* |
| 822 | * If a specific size wasn't requested, allow for a smaller log |
| 823 | * if we're really tight for space... |
| 824 | */ |
| 825 | min_desired_blks = desired_blks; |
| 826 | if (!fixedsize) |
| 827 | min_desired_blks = desired_blks / 4; |
| 828 | |
| 829 | /* Look at number of blocks per CG. If it's too small, bail early. */ |
| 830 | bpcg = ffs_fragstoblks(fs, fs->fs_fpg); |
| 831 | if (min_desired_blks > bpcg) { |
| 832 | printf("%s: %s: cylinder group size of %" PRId64 " MB " |
| 833 | " is not big enough for journal\n" , __func__, fs->fs_fsmnt, |
| 834 | ffs_lblktosize(fs, bpcg) / (1024 * 1024)); |
| 835 | goto bad; |
| 836 | } |
| 837 | |
| 838 | /* |
| 839 | * Start with the middle cylinder group, and search outwards in |
| 840 | * both directions until we either find the requested log size |
| 841 | * or reach the start/end of the file system. If we reach the |
| 842 | * start/end without finding enough space for the full requested |
| 843 | * log size, use the largest extent found if it is large enough |
| 844 | * to satisfy the our minimum size. |
| 845 | * |
| 846 | * XXX |
| 847 | * Can we just use the cluster contigsum stuff (esp on UFS2) |
| 848 | * here to simplify this search code? |
| 849 | */ |
| 850 | best_addr = 0; |
| 851 | best_blks = 0; |
| 852 | for (cg = fs->fs_ncg / 2, s = 0, n = 1; |
| 853 | best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg; |
| 854 | s++, n = -n, cg += n * s) { |
| 855 | DPRINTF("check cg %d of %d\n" , cg, fs->fs_ncg); |
| 856 | error = bread(devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), |
| 857 | fs->fs_cgsize, 0, &bp); |
| 858 | if (error) { |
| 859 | continue; |
| 860 | } |
| 861 | cgp = (struct cg *)bp->b_data; |
| 862 | if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { |
| 863 | brelse(bp, 0); |
| 864 | continue; |
| 865 | } |
| 866 | |
| 867 | blksfree = cg_blksfree(cgp, needswap); |
| 868 | |
| 869 | for (blkno = 0; blkno < bpcg;) { |
| 870 | /* look for next free block */ |
| 871 | /* XXX use scanc() and fragtbl[] here? */ |
| 872 | for (; blkno < bpcg - min_desired_blks; blkno++) |
| 873 | if (ffs_isblock(fs, blksfree, blkno)) |
| 874 | break; |
| 875 | |
| 876 | /* past end of search space in this CG? */ |
| 877 | if (blkno >= bpcg - min_desired_blks) |
| 878 | break; |
| 879 | |
| 880 | /* count how many free blocks in this extent */ |
| 881 | start_addr = blkno; |
| 882 | for (freeblks = 0; blkno < bpcg; blkno++, freeblks++) |
| 883 | if (!ffs_isblock(fs, blksfree, blkno)) |
| 884 | break; |
| 885 | |
| 886 | if (freeblks > best_blks) { |
| 887 | best_blks = freeblks; |
| 888 | best_addr = ffs_blkstofrags(fs, start_addr) + |
| 889 | cgbase(fs, cg); |
| 890 | |
| 891 | if (freeblks >= desired_blks) { |
| 892 | DPRINTF("found len %" PRId64 |
| 893 | " at offset %" PRId64 " in gc\n" , |
| 894 | freeblks, start_addr); |
| 895 | break; |
| 896 | } |
| 897 | } |
| 898 | } |
| 899 | brelse(bp, 0); |
| 900 | } |
| 901 | DPRINTF("best found len = %" PRId64 ", wanted %" PRId64 |
| 902 | " at addr %" PRId64 "\n" , best_blks, desired_blks, best_addr); |
| 903 | |
| 904 | if (best_blks < min_desired_blks) { |
| 905 | *addr = 0; |
| 906 | *indir_addr = 0; |
| 907 | } else { |
| 908 | /* put indirect blocks at start, and data blocks after */ |
| 909 | *addr = best_addr + ffs_blkstofrags(fs, indir_blks); |
| 910 | *indir_addr = best_addr; |
| 911 | } |
| 912 | *size = min(desired_blks, best_blks) - indir_blks; |
| 913 | return; |
| 914 | |
| 915 | bad: |
| 916 | *addr = 0; |
| 917 | *indir_addr = 0; |
| 918 | *size = 0; |
| 919 | return; |
| 920 | } |
| 921 | |