/*	$NetBSD: vfs_wapbl.c,v 1.86 2016/11/10 20:56:32 jdolecek Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file-system-independent write-ahead logging (WAPBL).
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.86 2016/11/10 20:56:32 jdolecek Exp $");

#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */

	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
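	 *
	 * The callback runs with wl_rwlock held as writer and is passed
	 * the mount point and the head of the pending deallocation list
	 * (which may be NULL); see wapbl_flush and wapbl_discard.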
183 */ 184 #if _KERNEL 185 wapbl_flush_fn_t wl_flush; /* r */ 186 wapbl_flush_fn_t wl_flush_abort;/* r */ 187 #endif 188 189 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ 190 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ 191 size_t wl_bcount; /* m: Total bcount of wl_bufs */ 192 193 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ 194 195 kcondvar_t wl_reclaimable_cv; /* m (obviously) */ 196 size_t wl_reclaimable_bytes; /* m: Amount of space available for 197 reclamation by truncate */ 198 int wl_error_count; /* m: # of wl_entries with errors */ 199 size_t wl_reserved_bytes; /* never truncate log smaller than this */ 200 201 #ifdef WAPBL_DEBUG_BUFBYTES 202 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ 203 #endif 204 205 #if _KERNEL 206 int wl_brperjblock; /* r Block records per journal block */ 207 #endif 208 209 TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist; /* lm: list head */ 210 int wl_dealloccnt; /* lm: total count */ 211 int wl_dealloclim; /* r: max count */ 212 213 /* hashtable of inode numbers for allocated but unlinked inodes */ 214 /* synch ??? */ 215 struct wapbl_ino_head *wl_inohash; 216 u_long wl_inohashmask; 217 int wl_inohashcnt; 218 219 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 220 accounting */ 221 222 u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ 223 daddr_t wl_buffer_dblk; /* l: buffer disk block address */ 224 size_t wl_buffer_used; /* l: buffer current use */ 225 }; 226 227 #ifdef WAPBL_DEBUG_PRINT 228 int wapbl_debug_print = WAPBL_DEBUG_PRINT; 229 #endif 230 231 /****************************************************************/ 232 #ifdef _KERNEL 233 234 #ifdef WAPBL_DEBUG 235 struct wapbl *wapbl_debug_wl; 236 #endif 237 238 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 239 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 240 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 241 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 242 #endif /* _KERNEL */ 243 244 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 245 246 static inline size_t wapbl_space_used(size_t avail, off_t head, 247 off_t tail); 248 249 #ifdef _KERNEL 250 251 static struct pool wapbl_entry_pool; 252 static struct pool wapbl_dealloc_pool; 253 254 #define WAPBL_INODETRK_SIZE 83 255 static int wapbl_ino_pool_refcount; 256 static struct pool wapbl_ino_pool; 257 struct wapbl_ino { 258 LIST_ENTRY(wapbl_ino) wi_hash; 259 ino_t wi_ino; 260 mode_t wi_mode; 261 }; 262 263 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 264 static void wapbl_inodetrk_free(struct wapbl *wl); 265 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 266 267 static size_t wapbl_transaction_len(struct wapbl *wl); 268 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 269 270 static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *, 271 bool); 272 273 #if 0 274 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 275 #endif 276 277 static int wapbl_replay_isopen1(struct wapbl_replay *); 278 279 struct wapbl_ops wapbl_ops = { 280 .wo_wapbl_discard = wapbl_discard, 281 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 282 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 283 .wo_wapbl_replay_read = wapbl_replay_read, 284 .wo_wapbl_add_buf = wapbl_add_buf, 285 .wo_wapbl_remove_buf = wapbl_remove_buf, 286 .wo_wapbl_resize_buf = wapbl_resize_buf, 287 .wo_wapbl_begin = 
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);

	wapbl_sysctl_init();
}

static int
wapbl_fini(void)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
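	 * "Same" here means same device, same location, and same
	 * geometry; the assertions below verify this field by field.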
366 */ 367 KDASSERT(!wapbl_replay_isopen(wr)); 368 KASSERT(wl->wl_devvp->v_type == VBLK); 369 KASSERT(wr->wr_devvp->v_type == VBLK); 370 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 371 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 372 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 373 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 374 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 375 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 376 377 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 378 379 for (i = 0; i < wr->wr_inodescnt; i++) 380 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 381 wr->wr_inodes[i].wr_imode); 382 383 /* Make sure new transaction won't overwrite old inodes list */ 384 KDASSERT(wapbl_transaction_len(wl) <= 385 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 386 wr->wr_inodestail)); 387 388 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 389 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 390 wapbl_transaction_len(wl); 391 392 error = wapbl_write_inodes(wl, &wl->wl_head); 393 if (error) 394 return error; 395 396 KASSERT(wl->wl_head != wl->wl_tail); 397 KASSERT(wl->wl_head != 0); 398 399 return 0; 400 } 401 402 int 403 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 404 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 405 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 406 { 407 struct wapbl *wl; 408 struct vnode *devvp; 409 daddr_t logpbn; 410 int error; 411 int log_dev_bshift = ilog2(blksize); 412 int fs_dev_bshift = log_dev_bshift; 413 int run; 414 415 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 416 " count=%zu blksize=%zu\n", vp, off, count, blksize)); 417 418 if (log_dev_bshift > fs_dev_bshift) { 419 WAPBL_PRINTF(WAPBL_PRINT_OPEN, 420 ("wapbl: log device's block size cannot be larger " 421 "than filesystem's\n")); 422 /* 423 * Not currently implemented, although it could be if 424 * needed someday. 425 */ 426 return ENOSYS; 427 } 428 429 if (off < 0) 430 return EINVAL; 431 432 if (blksize < DEV_BSIZE) 433 return EINVAL; 434 if (blksize % DEV_BSIZE) 435 return EINVAL; 436 437 /* XXXTODO: verify that the full load is writable */ 438 439 /* 440 * XXX check for minimum log size 441 * minimum is governed by minimum amount of space 442 * to complete a transaction. (probably truncate) 443 */ 444 /* XXX for now pick something minimal */ 445 if ((count * blksize) < MAXPHYS) { 446 return ENOSPC; 447 } 448 449 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { 450 return error; 451 } 452 453 wl = wapbl_calloc(1, sizeof(*wl)); 454 rw_init(&wl->wl_rwlock); 455 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); 456 cv_init(&wl->wl_reclaimable_cv, "wapblrec"); 457 LIST_INIT(&wl->wl_bufs); 458 SIMPLEQ_INIT(&wl->wl_entries); 459 460 wl->wl_logvp = vp; 461 wl->wl_devvp = devvp; 462 wl->wl_mount = mp; 463 wl->wl_logpbn = logpbn; 464 wl->wl_log_dev_bshift = log_dev_bshift; 465 wl->wl_fs_dev_bshift = fs_dev_bshift; 466 467 wl->wl_flush = flushfn; 468 wl->wl_flush_abort = flushabortfn; 469 470 /* Reserve two log device blocks for the commit headers */ 471 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; 472 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); 473 /* truncate the log usage to a multiple of log_dev_bshift */ 474 wl->wl_circ_size >>= wl->wl_log_dev_bshift; 475 wl->wl_circ_size <<= wl->wl_log_dev_bshift; 476 477 /* 478 * wl_bufbytes_max limits the size of the in memory transaction space. 
479 * - Since buffers are allocated and accounted for in units of 480 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE 481 * (i.e. 1<<PAGE_SHIFT) 482 * - Since the log device has to be written in units of 483 * 1<<wl_log_dev_bshift it is required to be a mulitple of 484 * 1<<wl_log_dev_bshift. 485 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift, 486 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift. 487 * Therefore it must be multiple of the least common multiple of those 488 * three quantities. Fortunately, all of those quantities are 489 * guaranteed to be a power of two, and the least common multiple of 490 * a set of numbers which are all powers of two is simply the maximum 491 * of those numbers. Finally, the maximum logarithm of a power of two 492 * is the same as the log of the maximum power of two. So we can do 493 * the following operations to size wl_bufbytes_max: 494 */ 495 496 /* XXX fix actual number of pages reserved per filesystem. */ 497 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); 498 499 /* Round wl_bufbytes_max to the largest power of two constraint */ 500 wl->wl_bufbytes_max >>= PAGE_SHIFT; 501 wl->wl_bufbytes_max <<= PAGE_SHIFT; 502 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; 503 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; 504 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; 505 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; 506 507 /* XXX maybe use filesystem fragment size instead of 1024 */ 508 /* XXX fix actual number of buffers reserved per filesystem. */ 509 wl->wl_bufcount_max = (nbuf / 2) * 1024; 510 511 wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift) 512 - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 513 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 514 KASSERT(wl->wl_brperjblock > 0); 515 516 /* XXX tie this into resource estimation */ 517 wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2; 518 TAILQ_INIT(&wl->wl_dealloclist); 519 520 wl->wl_buffer = wapbl_alloc(MAXPHYS); 521 wl->wl_buffer_used = 0; 522 523 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); 524 525 /* Initialize the commit header */ 526 { 527 struct wapbl_wc_header *wc; 528 size_t len = 1 << wl->wl_log_dev_bshift; 529 wc = wapbl_calloc(1, len); 530 wc->wc_type = WAPBL_WC_HEADER; 531 wc->wc_len = len; 532 wc->wc_circ_off = wl->wl_circ_off; 533 wc->wc_circ_size = wl->wl_circ_size; 534 /* XXX wc->wc_fsid */ 535 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; 536 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; 537 wl->wl_wc_header = wc; 538 wl->wl_wc_scratch = wapbl_alloc(len); 539 } 540 541 /* 542 * if there was an existing set of unlinked but 543 * allocated inodes, preserve it in the new 544 * log. 
545 */ 546 if (wr && wr->wr_inodescnt) { 547 error = wapbl_start_flush_inodes(wl, wr); 548 if (error) 549 goto errout; 550 } 551 552 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 553 if (error) { 554 goto errout; 555 } 556 557 *wlp = wl; 558 #if defined(WAPBL_DEBUG) 559 wapbl_debug_wl = wl; 560 #endif 561 562 return 0; 563 errout: 564 wapbl_discard(wl); 565 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 566 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 567 wapbl_free(wl->wl_buffer, MAXPHYS); 568 wapbl_inodetrk_free(wl); 569 wapbl_free(wl, sizeof(*wl)); 570 571 return error; 572 } 573 574 /* 575 * Like wapbl_flush, only discards the transaction 576 * completely 577 */ 578 579 void 580 wapbl_discard(struct wapbl *wl) 581 { 582 struct wapbl_entry *we; 583 struct wapbl_dealloc *wd; 584 struct buf *bp; 585 int i; 586 587 /* 588 * XXX we may consider using upgrade here 589 * if we want to call flush from inside a transaction 590 */ 591 rw_enter(&wl->wl_rwlock, RW_WRITER); 592 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist)); 593 594 #ifdef WAPBL_DEBUG_PRINT 595 { 596 pid_t pid = -1; 597 lwpid_t lid = -1; 598 if (curproc) 599 pid = curproc->p_pid; 600 if (curlwp) 601 lid = curlwp->l_lid; 602 #ifdef WAPBL_DEBUG_BUFBYTES 603 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 604 ("wapbl_discard: thread %d.%d discarding " 605 "transaction\n" 606 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 607 "deallocs=%d inodes=%d\n" 608 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 609 "unsynced=%zu\n", 610 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 611 wl->wl_bcount, wl->wl_dealloccnt, 612 wl->wl_inohashcnt, wl->wl_error_count, 613 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 614 wl->wl_unsynced_bufbytes)); 615 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 616 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 617 ("\tentry: bufcount = %zu, reclaimable = %zu, " 618 "error = %d, unsynced = %zu\n", 619 we->we_bufcount, we->we_reclaimable_bytes, 620 we->we_error, we->we_unsynced_bufbytes)); 621 } 622 #else /* !WAPBL_DEBUG_BUFBYTES */ 623 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 624 ("wapbl_discard: thread %d.%d discarding transaction\n" 625 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 626 "deallocs=%d inodes=%d\n" 627 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 628 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 629 wl->wl_bcount, wl->wl_dealloccnt, 630 wl->wl_inohashcnt, wl->wl_error_count, 631 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 632 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 633 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 634 ("\tentry: bufcount = %zu, reclaimable = %zu, " 635 "error = %d\n", 636 we->we_bufcount, we->we_reclaimable_bytes, 637 we->we_error)); 638 } 639 #endif /* !WAPBL_DEBUG_BUFBYTES */ 640 } 641 #endif /* WAPBL_DEBUG_PRINT */ 642 643 for (i = 0; i <= wl->wl_inohashmask; i++) { 644 struct wapbl_ino_head *wih; 645 struct wapbl_ino *wi; 646 647 wih = &wl->wl_inohash[i]; 648 while ((wi = LIST_FIRST(wih)) != NULL) { 649 LIST_REMOVE(wi, wi_hash); 650 pool_put(&wapbl_ino_pool, wi); 651 KASSERT(wl->wl_inohashcnt > 0); 652 wl->wl_inohashcnt--; 653 } 654 } 655 656 /* 657 * clean buffer list 658 */ 659 mutex_enter(&bufcache_lock); 660 mutex_enter(&wl->wl_mtx); 661 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 662 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 663 /* 664 * The buffer will be unlocked and 665 * removed from the transaction in brelse 666 */ 667 mutex_exit(&wl->wl_mtx); 668 brelsel(bp, 0); 669 mutex_enter(&wl->wl_mtx); 670 } 671 } 672 mutex_exit(&wl->wl_mtx); 673 
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
		wapbl_deallocation_free(wl, wd, true);

	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/****************************************************************/
/*
 * Unbuffered disk I/O
 */

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY;	/* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));
"write" : "read", bp->b_bcount, 793 bp->b_blkno, bp->b_dev)); 794 795 VOP_STRATEGY(devvp, bp); 796 797 error = biowait(bp); 798 putiobuf(bp); 799 800 if (error) { 801 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 802 ("wapbl_doio: %s %zu bytes at block %" PRId64 803 " on dev 0x%"PRIx64" failed with error %d\n", 804 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 805 "write" : "read"), 806 len, pbn, devvp->v_rdev, error)); 807 } 808 809 return error; 810 } 811 812 /* 813 * wapbl_write(data, len, devvp, pbn) 814 * 815 * Synchronously write len bytes from data to physical block pbn 816 * on devvp. 817 */ 818 int 819 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 820 { 821 822 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 823 } 824 825 /* 826 * wapbl_read(data, len, devvp, pbn) 827 * 828 * Synchronously read len bytes into data from physical block pbn 829 * on devvp. 830 */ 831 int 832 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 833 { 834 835 return wapbl_doio(data, len, devvp, pbn, B_READ); 836 } 837 838 /****************************************************************/ 839 /* 840 * Buffered disk writes -- try to coalesce writes and emit 841 * MAXPHYS-aligned blocks. 842 */ 843 844 /* 845 * wapbl_buffered_flush(wl) 846 * 847 * Flush any buffered writes from wapbl_buffered_write. 848 */ 849 static int 850 wapbl_buffered_flush(struct wapbl *wl) 851 { 852 int error; 853 854 if (wl->wl_buffer_used == 0) 855 return 0; 856 857 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, 858 wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); 859 wl->wl_buffer_used = 0; 860 861 return error; 862 } 863 864 /* 865 * wapbl_buffered_write(data, len, wl, pbn) 866 * 867 * Write len bytes from data to physical block pbn on 868 * wl->wl_devvp. The write may not complete until 869 * wapbl_buffered_flush. 870 */ 871 static int 872 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn) 873 { 874 int error; 875 size_t resid; 876 877 /* 878 * If not adjacent to buffered data flush first. Disk block 879 * address is always valid for non-empty buffer. 880 */ 881 if (wl->wl_buffer_used > 0 && 882 pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) { 883 error = wapbl_buffered_flush(wl); 884 if (error) 885 return error; 886 } 887 /* 888 * If this write goes to an empty buffer we have to 889 * save the disk block address first. 890 */ 891 if (wl->wl_buffer_used == 0) 892 wl->wl_buffer_dblk = pbn; 893 /* 894 * Remaining space so this buffer ends on a MAXPHYS boundary. 895 * 896 * Cannot become less or equal zero as the buffer would have been 897 * flushed on the last call then. 
898 */ 899 resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) - 900 wl->wl_buffer_used; 901 KASSERT(resid > 0); 902 KASSERT(dbtob(btodb(resid)) == resid); 903 if (len >= resid) { 904 memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid); 905 wl->wl_buffer_used += resid; 906 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, 907 wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); 908 data = (uint8_t *)data + resid; 909 len -= resid; 910 wl->wl_buffer_dblk = pbn + btodb(resid); 911 wl->wl_buffer_used = 0; 912 if (error) 913 return error; 914 } 915 KASSERT(len < MAXPHYS); 916 if (len > 0) { 917 memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len); 918 wl->wl_buffer_used += len; 919 } 920 921 return 0; 922 } 923 924 /* 925 * wapbl_circ_write(wl, data, len, offp) 926 * 927 * Write len bytes from data to the circular queue of wl, starting 928 * at linear byte offset *offp, and returning the new linear byte 929 * offset in *offp. 930 * 931 * If the starting linear byte offset precedes wl->wl_circ_off, 932 * the write instead begins at wl->wl_circ_off. XXX WTF? This 933 * should be a KASSERT, not a conditional. 934 * 935 * The write is buffered in wl and must be flushed with 936 * wapbl_buffered_flush before it will be submitted to the disk. 937 */ 938 static int 939 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) 940 { 941 size_t slen; 942 off_t off = *offp; 943 int error; 944 daddr_t pbn; 945 946 KDASSERT(((len >> wl->wl_log_dev_bshift) << 947 wl->wl_log_dev_bshift) == len); 948 949 if (off < wl->wl_circ_off) 950 off = wl->wl_circ_off; 951 slen = wl->wl_circ_off + wl->wl_circ_size - off; 952 if (slen < len) { 953 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 954 #ifdef _KERNEL 955 pbn = btodb(pbn << wl->wl_log_dev_bshift); 956 #endif 957 error = wapbl_buffered_write(data, slen, wl, pbn); 958 if (error) 959 return error; 960 data = (uint8_t *)data + slen; 961 len -= slen; 962 off = wl->wl_circ_off; 963 } 964 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 965 #ifdef _KERNEL 966 pbn = btodb(pbn << wl->wl_log_dev_bshift); 967 #endif 968 error = wapbl_buffered_write(data, len, wl, pbn); 969 if (error) 970 return error; 971 off += len; 972 if (off >= wl->wl_circ_off + wl->wl_circ_size) 973 off = wl->wl_circ_off; 974 *offp = off; 975 return 0; 976 } 977 978 /****************************************************************/ 979 /* 980 * WAPBL transactions: entering, adding/removing bufs, and exiting 981 */ 982 983 int 984 wapbl_begin(struct wapbl *wl, const char *file, int line) 985 { 986 int doflush; 987 unsigned lockcount; 988 989 KDASSERT(wl); 990 991 /* 992 * XXX this needs to be made much more sophisticated. 993 * perhaps each wapbl_begin could reserve a specified 994 * number of buffers and bytes. 
995 */ 996 mutex_enter(&wl->wl_mtx); 997 lockcount = wl->wl_lock_count; 998 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 999 wl->wl_bufbytes_max / 2) || 1000 ((wl->wl_bufcount + (lockcount * 10)) > 1001 wl->wl_bufcount_max / 2) || 1002 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 1003 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); 1004 mutex_exit(&wl->wl_mtx); 1005 1006 if (doflush) { 1007 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1008 ("force flush lockcnt=%d bufbytes=%zu " 1009 "(max=%zu) bufcount=%zu (max=%zu) " 1010 "dealloccnt %d (lim=%d)\n", 1011 lockcount, wl->wl_bufbytes, 1012 wl->wl_bufbytes_max, wl->wl_bufcount, 1013 wl->wl_bufcount_max, 1014 wl->wl_dealloccnt, wl->wl_dealloclim)); 1015 } 1016 1017 if (doflush) { 1018 int error = wapbl_flush(wl, 0); 1019 if (error) 1020 return error; 1021 } 1022 1023 rw_enter(&wl->wl_rwlock, RW_READER); 1024 mutex_enter(&wl->wl_mtx); 1025 wl->wl_lock_count++; 1026 mutex_exit(&wl->wl_mtx); 1027 1028 #if defined(WAPBL_DEBUG_PRINT) 1029 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 1030 ("wapbl_begin thread %d.%d with bufcount=%zu " 1031 "bufbytes=%zu bcount=%zu at %s:%d\n", 1032 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1033 wl->wl_bufbytes, wl->wl_bcount, file, line)); 1034 #endif 1035 1036 return 0; 1037 } 1038 1039 void 1040 wapbl_end(struct wapbl *wl) 1041 { 1042 1043 #if defined(WAPBL_DEBUG_PRINT) 1044 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 1045 ("wapbl_end thread %d.%d with bufcount=%zu " 1046 "bufbytes=%zu bcount=%zu\n", 1047 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1048 wl->wl_bufbytes, wl->wl_bcount)); 1049 #endif 1050 1051 /* 1052 * XXX this could be handled more gracefully, perhaps place 1053 * only a partial transaction in the log and allow the 1054 * remaining to flush without the protection of the journal. 1055 */ 1056 KASSERTMSG((wapbl_transaction_len(wl) <= 1057 (wl->wl_circ_size - wl->wl_reserved_bytes)), 1058 "wapbl_end: current transaction too big to flush"); 1059 1060 mutex_enter(&wl->wl_mtx); 1061 KASSERT(wl->wl_lock_count > 0); 1062 wl->wl_lock_count--; 1063 mutex_exit(&wl->wl_mtx); 1064 1065 rw_exit(&wl->wl_rwlock); 1066 } 1067 1068 void 1069 wapbl_add_buf(struct wapbl *wl, struct buf * bp) 1070 { 1071 1072 KASSERT(bp->b_cflags & BC_BUSY); 1073 KASSERT(bp->b_vp); 1074 1075 wapbl_jlock_assert(wl); 1076 1077 #if 0 1078 /* 1079 * XXX this might be an issue for swapfiles. 1080 * see uvm_swap.c:1702 1081 * 1082 * XXX2 why require it then? leap of semantics? 
1083 */ 1084 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 1085 #endif 1086 1087 mutex_enter(&wl->wl_mtx); 1088 if (bp->b_flags & B_LOCKED) { 1089 LIST_REMOVE(bp, b_wapbllist); 1090 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 1091 ("wapbl_add_buf thread %d.%d re-adding buf %p " 1092 "with %d bytes %d bcount\n", 1093 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 1094 bp->b_bcount)); 1095 } else { 1096 /* unlocked by dirty buffers shouldn't exist */ 1097 KASSERT(!(bp->b_oflags & BO_DELWRI)); 1098 wl->wl_bufbytes += bp->b_bufsize; 1099 wl->wl_bcount += bp->b_bcount; 1100 wl->wl_bufcount++; 1101 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 1102 ("wapbl_add_buf thread %d.%d adding buf %p " 1103 "with %d bytes %d bcount\n", 1104 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 1105 bp->b_bcount)); 1106 } 1107 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 1108 mutex_exit(&wl->wl_mtx); 1109 1110 bp->b_flags |= B_LOCKED; 1111 } 1112 1113 static void 1114 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 1115 { 1116 1117 KASSERT(mutex_owned(&wl->wl_mtx)); 1118 KASSERT(bp->b_cflags & BC_BUSY); 1119 wapbl_jlock_assert(wl); 1120 1121 #if 0 1122 /* 1123 * XXX this might be an issue for swapfiles. 1124 * see uvm_swap.c:1725 1125 * 1126 * XXXdeux: see above 1127 */ 1128 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 1129 #endif 1130 KASSERT(bp->b_flags & B_LOCKED); 1131 1132 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 1133 ("wapbl_remove_buf thread %d.%d removing buf %p with " 1134 "%d bytes %d bcount\n", 1135 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 1136 1137 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 1138 wl->wl_bufbytes -= bp->b_bufsize; 1139 KASSERT(wl->wl_bcount >= bp->b_bcount); 1140 wl->wl_bcount -= bp->b_bcount; 1141 KASSERT(wl->wl_bufcount > 0); 1142 wl->wl_bufcount--; 1143 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1144 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1145 LIST_REMOVE(bp, b_wapbllist); 1146 1147 bp->b_flags &= ~B_LOCKED; 1148 } 1149 1150 /* called from brelsel() in vfs_bio among other places */ 1151 void 1152 wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 1153 { 1154 1155 mutex_enter(&wl->wl_mtx); 1156 wapbl_remove_buf_locked(wl, bp); 1157 mutex_exit(&wl->wl_mtx); 1158 } 1159 1160 void 1161 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 1162 { 1163 1164 KASSERT(bp->b_cflags & BC_BUSY); 1165 1166 /* 1167 * XXX: why does this depend on B_LOCKED? otherwise the buf 1168 * is not for a transaction? if so, why is this called in the 1169 * first place? 1170 */ 1171 if (bp->b_flags & B_LOCKED) { 1172 mutex_enter(&wl->wl_mtx); 1173 wl->wl_bufbytes += bp->b_bufsize - oldsz; 1174 wl->wl_bcount += bp->b_bcount - oldcnt; 1175 mutex_exit(&wl->wl_mtx); 1176 } 1177 } 1178 1179 #endif /* _KERNEL */ 1180 1181 /****************************************************************/ 1182 /* Some utility inlines */ 1183 1184 /* 1185 * wapbl_space_used(avail, head, tail) 1186 * 1187 * Number of bytes used in a circular queue of avail total bytes, 1188 * from tail to head. 
1189 */ 1190 static inline size_t 1191 wapbl_space_used(size_t avail, off_t head, off_t tail) 1192 { 1193 1194 if (tail == 0) { 1195 KASSERT(head == 0); 1196 return 0; 1197 } 1198 return ((head + (avail - 1) - tail) % avail) + 1; 1199 } 1200 1201 #ifdef _KERNEL 1202 /* 1203 * wapbl_advance(size, off, oldoff, delta) 1204 * 1205 * Given a byte offset oldoff into a circular queue of size bytes 1206 * starting at off, return a new byte offset oldoff + delta into 1207 * the circular queue. 1208 */ 1209 static inline off_t 1210 wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta) 1211 { 1212 off_t newoff; 1213 1214 /* Define acceptable ranges for inputs. */ 1215 KASSERT(delta <= (size_t)size); 1216 KASSERT((oldoff == 0) || ((size_t)oldoff >= off)); 1217 KASSERT(oldoff < (off_t)(size + off)); 1218 1219 if ((oldoff == 0) && (delta != 0)) 1220 newoff = off + delta; 1221 else if ((oldoff + delta) < (size + off)) 1222 newoff = oldoff + delta; 1223 else 1224 newoff = (oldoff + delta) - size; 1225 1226 /* Note some interesting axioms */ 1227 KASSERT((delta != 0) || (newoff == oldoff)); 1228 KASSERT((delta == 0) || (newoff != 0)); 1229 KASSERT((delta != (size)) || (newoff == oldoff)); 1230 1231 /* Define acceptable ranges for output. */ 1232 KASSERT((newoff == 0) || ((size_t)newoff >= off)); 1233 KASSERT((size_t)newoff < (size + off)); 1234 return newoff; 1235 } 1236 1237 /* 1238 * wapbl_space_free(avail, head, tail) 1239 * 1240 * Number of bytes free in a circular queue of avail total bytes, 1241 * in which everything from tail to head is used. 1242 */ 1243 static inline size_t 1244 wapbl_space_free(size_t avail, off_t head, off_t tail) 1245 { 1246 1247 return avail - wapbl_space_used(avail, head, tail); 1248 } 1249 1250 /* 1251 * wapbl_advance_head(size, off, delta, headp, tailp) 1252 * 1253 * In a circular queue of size bytes starting at off, given the 1254 * old head and tail offsets *headp and *tailp, store the new head 1255 * and tail offsets in *headp and *tailp resulting from adding 1256 * delta bytes of data to the head. 1257 */ 1258 static inline void 1259 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, 1260 off_t *tailp) 1261 { 1262 off_t head = *headp; 1263 off_t tail = *tailp; 1264 1265 KASSERT(delta <= wapbl_space_free(size, head, tail)); 1266 head = wapbl_advance(size, off, head, delta); 1267 if ((tail == 0) && (head != 0)) 1268 tail = off; 1269 *headp = head; 1270 *tailp = tail; 1271 } 1272 1273 /* 1274 * wapbl_advance_tail(size, off, delta, headp, tailp) 1275 * 1276 * In a circular queue of size bytes starting at off, given the 1277 * old head and tail offsets *headp and *tailp, store the new head 1278 * and tail offsets in *headp and *tailp resulting from removing 1279 * delta bytes of data from the tail. 1280 */ 1281 static inline void 1282 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, 1283 off_t *tailp) 1284 { 1285 off_t head = *headp; 1286 off_t tail = *tailp; 1287 1288 KASSERT(delta <= wapbl_space_used(size, head, tail)); 1289 tail = wapbl_advance(size, off, tail, delta); 1290 if (head == tail) { 1291 head = tail = 0; 1292 } 1293 *headp = head; 1294 *tailp = tail; 1295 } 1296 1297 1298 /****************************************************************/ 1299 1300 /* 1301 * wapbl_truncate(wl, minfree) 1302 * 1303 * Wait until at least minfree bytes are available in the log. 
1304 * 1305 * If it was necessary to wait for writes to complete, 1306 * advance the circular queue tail to reflect the new write 1307 * completions and issue a write commit to the log. 1308 * 1309 * => Caller must hold wl->wl_rwlock writer lock. 1310 */ 1311 static int 1312 wapbl_truncate(struct wapbl *wl, size_t minfree) 1313 { 1314 size_t delta; 1315 size_t avail; 1316 off_t head; 1317 off_t tail; 1318 int error = 0; 1319 1320 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1321 KASSERT(rw_write_held(&wl->wl_rwlock)); 1322 1323 mutex_enter(&wl->wl_mtx); 1324 1325 /* 1326 * First check to see if we have to do a commit 1327 * at all. 1328 */ 1329 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1330 if (minfree < avail) { 1331 mutex_exit(&wl->wl_mtx); 1332 return 0; 1333 } 1334 minfree -= avail; 1335 while ((wl->wl_error_count == 0) && 1336 (wl->wl_reclaimable_bytes < minfree)) { 1337 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1338 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1339 "minfree=%zd\n", 1340 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1341 minfree)); 1342 1343 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1344 } 1345 if (wl->wl_reclaimable_bytes < minfree) { 1346 KASSERT(wl->wl_error_count); 1347 /* XXX maybe get actual error from buffer instead someday? */ 1348 error = EIO; 1349 } 1350 head = wl->wl_head; 1351 tail = wl->wl_tail; 1352 delta = wl->wl_reclaimable_bytes; 1353 1354 /* If all of of the entries are flushed, then be sure to keep 1355 * the reserved bytes reserved. Watch out for discarded transactions, 1356 * which could leave more bytes reserved than are reclaimable. 1357 */ 1358 if (SIMPLEQ_EMPTY(&wl->wl_entries) && 1359 (delta >= wl->wl_reserved_bytes)) { 1360 delta -= wl->wl_reserved_bytes; 1361 } 1362 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1363 &tail); 1364 KDASSERT(wl->wl_reserved_bytes <= 1365 wapbl_space_used(wl->wl_circ_size, head, tail)); 1366 mutex_exit(&wl->wl_mtx); 1367 1368 if (error) 1369 return error; 1370 1371 /* 1372 * This is where head, tail and delta are unprotected 1373 * from races against itself or flush. This is ok since 1374 * we only call this routine from inside flush itself. 1375 * 1376 * XXX: how can it race against itself when accessed only 1377 * from behind the write-locked rwlock? 1378 */ 1379 error = wapbl_write_commit(wl, head, tail); 1380 if (error) 1381 return error; 1382 1383 wl->wl_head = head; 1384 wl->wl_tail = tail; 1385 1386 mutex_enter(&wl->wl_mtx); 1387 KASSERT(wl->wl_reclaimable_bytes >= delta); 1388 wl->wl_reclaimable_bytes -= delta; 1389 mutex_exit(&wl->wl_mtx); 1390 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1391 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1392 curproc->p_pid, curlwp->l_lid, delta)); 1393 1394 return 0; 1395 } 1396 1397 /****************************************************************/ 1398 1399 void 1400 wapbl_biodone(struct buf *bp) 1401 { 1402 struct wapbl_entry *we = bp->b_private; 1403 struct wapbl *wl = we->we_wapbl; 1404 #ifdef WAPBL_DEBUG_BUFBYTES 1405 const int bufsize = bp->b_bufsize; 1406 #endif 1407 1408 /* 1409 * Handle possible flushing of buffers after log has been 1410 * decomissioned. 
1411 */ 1412 if (!wl) { 1413 KASSERT(we->we_bufcount > 0); 1414 we->we_bufcount--; 1415 #ifdef WAPBL_DEBUG_BUFBYTES 1416 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1417 we->we_unsynced_bufbytes -= bufsize; 1418 #endif 1419 1420 if (we->we_bufcount == 0) { 1421 #ifdef WAPBL_DEBUG_BUFBYTES 1422 KASSERT(we->we_unsynced_bufbytes == 0); 1423 #endif 1424 pool_put(&wapbl_entry_pool, we); 1425 } 1426 1427 brelse(bp, 0); 1428 return; 1429 } 1430 1431 #ifdef ohbother 1432 KDASSERT(bp->b_oflags & BO_DONE); 1433 KDASSERT(!(bp->b_oflags & BO_DELWRI)); 1434 KDASSERT(bp->b_flags & B_ASYNC); 1435 KDASSERT(bp->b_cflags & BC_BUSY); 1436 KDASSERT(!(bp->b_flags & B_LOCKED)); 1437 KDASSERT(!(bp->b_flags & B_READ)); 1438 KDASSERT(!(bp->b_cflags & BC_INVAL)); 1439 KDASSERT(!(bp->b_cflags & BC_NOCACHE)); 1440 #endif 1441 1442 if (bp->b_error) { 1443 /* 1444 * If an error occurs, it would be nice to leave the buffer 1445 * as a delayed write on the LRU queue so that we can retry 1446 * it later. But buffercache(9) can't handle dirty buffer 1447 * reuse, so just mark the log permanently errored out. 1448 */ 1449 mutex_enter(&wl->wl_mtx); 1450 if (wl->wl_error_count == 0) { 1451 wl->wl_error_count++; 1452 cv_broadcast(&wl->wl_reclaimable_cv); 1453 } 1454 mutex_exit(&wl->wl_mtx); 1455 } 1456 1457 /* 1458 * Release the buffer here. wapbl_flush() may wait for the 1459 * log to become empty and we better unbusy the buffer before 1460 * wapbl_flush() returns. 1461 */ 1462 brelse(bp, 0); 1463 1464 mutex_enter(&wl->wl_mtx); 1465 1466 KASSERT(we->we_bufcount > 0); 1467 we->we_bufcount--; 1468 #ifdef WAPBL_DEBUG_BUFBYTES 1469 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1470 we->we_unsynced_bufbytes -= bufsize; 1471 KASSERT(wl->wl_unsynced_bufbytes >= bufsize); 1472 wl->wl_unsynced_bufbytes -= bufsize; 1473 #endif 1474 1475 /* 1476 * If the current transaction can be reclaimed, start 1477 * at the beginning and reclaim any consecutive reclaimable 1478 * transactions. If we successfully reclaim anything, 1479 * then wakeup anyone waiting for the reclaim. 1480 */ 1481 if (we->we_bufcount == 0) { 1482 size_t delta = 0; 1483 int errcnt = 0; 1484 #ifdef WAPBL_DEBUG_BUFBYTES 1485 KDASSERT(we->we_unsynced_bufbytes == 0); 1486 #endif 1487 /* 1488 * clear any posted error, since the buffer it came from 1489 * has successfully flushed by now 1490 */ 1491 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1492 (we->we_bufcount == 0)) { 1493 delta += we->we_reclaimable_bytes; 1494 if (we->we_error) 1495 errcnt++; 1496 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1497 pool_put(&wapbl_entry_pool, we); 1498 } 1499 1500 if (delta) { 1501 wl->wl_reclaimable_bytes += delta; 1502 KASSERT(wl->wl_error_count >= errcnt); 1503 wl->wl_error_count -= errcnt; 1504 cv_broadcast(&wl->wl_reclaimable_cv); 1505 } 1506 } 1507 1508 mutex_exit(&wl->wl_mtx); 1509 } 1510 1511 /* 1512 * wapbl_flush(wl, wait) 1513 * 1514 * Flush pending block writes, deallocations, and inodes from 1515 * the current transaction in memory to the log on disk: 1516 * 1517 * 1. Call the file system's wl_flush callback to flush any 1518 * per-file-system pending updates. 1519 * 2. Wait for enough space in the log for the current transaction. 1520 * 3. Synchronously write the new log records, advancing the 1521 * circular queue head. 1522 * 4. Issue the pending block writes asynchronously, now that they 1523 * are recorded in the log and can be replayed after crash. 1524 * 5. If wait is true, wait for all writes to complete and for the 1525 * log to become empty. 
1526 * 1527 * On failure, call the file system's wl_flush_abort callback. 1528 */ 1529 int 1530 wapbl_flush(struct wapbl *wl, int waitfor) 1531 { 1532 struct buf *bp; 1533 struct wapbl_entry *we; 1534 off_t off; 1535 off_t head; 1536 off_t tail; 1537 size_t delta = 0; 1538 size_t flushsize; 1539 size_t reserved; 1540 int error = 0; 1541 1542 /* 1543 * Do a quick check to see if a full flush can be skipped 1544 * This assumes that the flush callback does not need to be called 1545 * unless there are other outstanding bufs. 1546 */ 1547 if (!waitfor) { 1548 size_t nbufs; 1549 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1550 protect the KASSERTS */ 1551 nbufs = wl->wl_bufcount; 1552 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1553 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1554 mutex_exit(&wl->wl_mtx); 1555 if (nbufs == 0) 1556 return 0; 1557 } 1558 1559 /* 1560 * XXX we may consider using LK_UPGRADE here 1561 * if we want to call flush from inside a transaction 1562 */ 1563 rw_enter(&wl->wl_rwlock, RW_WRITER); 1564 wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist)); 1565 1566 /* 1567 * Now that we are exclusively locked and the file system has 1568 * issued any deferred block writes for this transaction, check 1569 * whether there are any blocks to write to the log. If not, 1570 * skip waiting for space or writing any log entries. 1571 * 1572 * XXX Shouldn't this also check wl_dealloccnt and 1573 * wl_inohashcnt? Perhaps wl_dealloccnt doesn't matter if the 1574 * file system didn't produce any blocks as a consequence of 1575 * it, but the same does not seem to be so of wl_inohashcnt. 1576 */ 1577 if (wl->wl_bufcount == 0) { 1578 goto wait_out; 1579 } 1580 1581 #if 0 1582 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1583 ("wapbl_flush thread %d.%d flushing entries with " 1584 "bufcount=%zu bufbytes=%zu\n", 1585 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1586 wl->wl_bufbytes)); 1587 #endif 1588 1589 /* Calculate amount of space needed to flush */ 1590 flushsize = wapbl_transaction_len(wl); 1591 if (wapbl_verbose_commit) { 1592 struct timespec ts; 1593 getnanotime(&ts); 1594 printf("%s: %lld.%09ld this transaction = %zu bytes\n", 1595 __func__, (long long)ts.tv_sec, 1596 (long)ts.tv_nsec, flushsize); 1597 } 1598 1599 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1600 /* 1601 * XXX this could be handled more gracefully, perhaps place 1602 * only a partial transaction in the log and allow the 1603 * remaining to flush without the protection of the journal. 1604 */ 1605 panic("wapbl_flush: current transaction too big to flush"); 1606 } 1607 1608 error = wapbl_truncate(wl, flushsize); 1609 if (error) 1610 goto out; 1611 1612 off = wl->wl_head; 1613 KASSERT((off == 0) || (off >= wl->wl_circ_off)); 1614 KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size)); 1615 error = wapbl_write_blocks(wl, &off); 1616 if (error) 1617 goto out; 1618 error = wapbl_write_revocations(wl, &off); 1619 if (error) 1620 goto out; 1621 error = wapbl_write_inodes(wl, &off); 1622 if (error) 1623 goto out; 1624 1625 reserved = 0; 1626 if (wl->wl_inohashcnt) 1627 reserved = wapbl_transaction_inodes_len(wl); 1628 1629 head = wl->wl_head; 1630 tail = wl->wl_tail; 1631 1632 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1633 &head, &tail); 1634 1635 KASSERTMSG(head == off, 1636 "lost head! 
	/* Opportunistically move the tail forward if we can */
	mutex_enter(&wl->wl_mtx);
	delta = wl->wl_reclaimable_bytes;
	mutex_exit(&wl->wl_mtx);
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
	    &head, &tail);

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	KDASSERT(wl->wl_dealloccnt == 0);
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in reverse order from how they were queued;
	 * it shouldn't matter, but if we cared we could use a TAILQ instead.
	 * XXX Note they will get put on the lru queue when they flush
	 * so we might actually want to change this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

wait_out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on-disk log is empty.
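	 * This is done by asking wapbl_truncate for all of the log's
	 * space except the permanently reserved bytes.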
1733 */ 1734 if (waitfor) { 1735 error = wapbl_truncate(wl, wl->wl_circ_size - 1736 wl->wl_reserved_bytes); 1737 } 1738 1739 out: 1740 if (error) { 1741 wl->wl_flush_abort(wl->wl_mount, 1742 TAILQ_FIRST(&wl->wl_dealloclist)); 1743 } 1744 1745 #ifdef WAPBL_DEBUG_PRINT 1746 if (error) { 1747 pid_t pid = -1; 1748 lwpid_t lid = -1; 1749 if (curproc) 1750 pid = curproc->p_pid; 1751 if (curlwp) 1752 lid = curlwp->l_lid; 1753 mutex_enter(&wl->wl_mtx); 1754 #ifdef WAPBL_DEBUG_BUFBYTES 1755 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1756 ("wapbl_flush: thread %d.%d aborted flush: " 1757 "error = %d\n" 1758 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1759 "deallocs=%d inodes=%d\n" 1760 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1761 "unsynced=%zu\n", 1762 pid, lid, error, wl->wl_bufcount, 1763 wl->wl_bufbytes, wl->wl_bcount, 1764 wl->wl_dealloccnt, wl->wl_inohashcnt, 1765 wl->wl_error_count, wl->wl_reclaimable_bytes, 1766 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1767 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1768 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1769 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1770 "error = %d, unsynced = %zu\n", 1771 we->we_bufcount, we->we_reclaimable_bytes, 1772 we->we_error, we->we_unsynced_bufbytes)); 1773 } 1774 #else 1775 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1776 ("wapbl_flush: thread %d.%d aborted flush: " 1777 "error = %d\n" 1778 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1779 "deallocs=%d inodes=%d\n" 1780 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1781 pid, lid, error, wl->wl_bufcount, 1782 wl->wl_bufbytes, wl->wl_bcount, 1783 wl->wl_dealloccnt, wl->wl_inohashcnt, 1784 wl->wl_error_count, wl->wl_reclaimable_bytes, 1785 wl->wl_reserved_bytes)); 1786 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1787 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1788 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1789 "error = %d\n", we->we_bufcount, 1790 we->we_reclaimable_bytes, we->we_error)); 1791 } 1792 #endif 1793 mutex_exit(&wl->wl_mtx); 1794 } 1795 #endif 1796 1797 rw_exit(&wl->wl_rwlock); 1798 return error; 1799 } 1800 1801 /****************************************************************/ 1802 1803 void 1804 wapbl_jlock_assert(struct wapbl *wl) 1805 { 1806 1807 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1808 } 1809 1810 void 1811 wapbl_junlock_assert(struct wapbl *wl) 1812 { 1813 1814 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1815 } 1816 1817 /****************************************************************/ 1818 1819 /* locks missing */ 1820 void 1821 wapbl_print(struct wapbl *wl, 1822 int full, 1823 void (*pr)(const char *, ...)) 1824 { 1825 struct buf *bp; 1826 struct wapbl_entry *we; 1827 (*pr)("wapbl %p", wl); 1828 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1829 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1830 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1831 wl->wl_circ_size, wl->wl_circ_off, 1832 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1833 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1834 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1835 #ifdef WAPBL_DEBUG_BUFBYTES 1836 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1837 "reserved = %zu errcnt = %d unsynced = %zu\n", 1838 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1839 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1840 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1841 #else 1842 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1843 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1844 
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	    wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	    wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		    "unsynced = %zu\n",
		    we->we_bufcount, we->we_reclaimable_bytes,
		    we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		    we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			struct wapbl_dealloc *wd;
			cnt = 0;
			TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
				(*pr)(" %"PRId64":%d,",
				    wd->wd_blkno,
				    wd->wd_len);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRIu64"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

int
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
    void **cookiep)
{
	struct wapbl_dealloc *wd;
	int error = 0;

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);

	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
		if (!force) {
			error = EAGAIN;
			goto out;
		}

		/*
		 * Forced registration can only be used when:
		 * 1) the caller can't cope with failure
		 * 2) the path can be triggered only a small, bounded
		 *    number of times per transaction
		 * If this is not fulfilled, and the path would be triggered
		 * many times, this could overflow the maximum transaction
		 * size and panic later.
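		 *
		 * (A hypothetical example of a safe forced path: a
		 * truncation that frees at most one extra block per
		 * transaction.)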
1957 */ 1958 printf("%s: forced dealloc registration over limit: %d >= %d\n", 1959 wl->wl_mount->mnt_stat.f_mntonname, 1960 wl->wl_dealloccnt, wl->wl_dealloclim); 1961 } 1962 1963 wl->wl_dealloccnt++; 1964 mutex_exit(&wl->wl_mtx); 1965 1966 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK); 1967 wd->wd_blkno = blk; 1968 wd->wd_len = len; 1969 1970 mutex_enter(&wl->wl_mtx); 1971 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries); 1972 1973 if (cookiep) 1974 *cookiep = wd; 1975 1976 out: 1977 mutex_exit(&wl->wl_mtx); 1978 1979 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1980 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n", 1981 blk, len, error)); 1982 1983 return error; 1984 } 1985 1986 static void 1987 wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd, 1988 bool locked) 1989 { 1990 KASSERT(!locked 1991 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx)); 1992 1993 if (!locked) 1994 mutex_enter(&wl->wl_mtx); 1995 1996 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries); 1997 wl->wl_dealloccnt--; 1998 1999 if (!locked) 2000 mutex_exit(&wl->wl_mtx); 2001 2002 pool_put(&wapbl_dealloc_pool, wd); 2003 } 2004 2005 void 2006 wapbl_unregister_deallocation(struct wapbl *wl, void *cookie) 2007 { 2008 KASSERT(cookie != NULL); 2009 wapbl_deallocation_free(wl, cookie, false); 2010 } 2011 2012 /****************************************************************/ 2013 2014 static void 2015 wapbl_inodetrk_init(struct wapbl *wl, u_int size) 2016 { 2017 2018 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 2019 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 2020 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 2021 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 2022 } 2023 } 2024 2025 static void 2026 wapbl_inodetrk_free(struct wapbl *wl) 2027 { 2028 2029 /* XXX this KASSERT needs locking/mutex analysis */ 2030 KASSERT(wl->wl_inohashcnt == 0); 2031 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); 2032 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 2033 pool_destroy(&wapbl_ino_pool); 2034 } 2035 } 2036 2037 static struct wapbl_ino * 2038 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 2039 { 2040 struct wapbl_ino_head *wih; 2041 struct wapbl_ino *wi; 2042 2043 KASSERT(mutex_owned(&wl->wl_mtx)); 2044 2045 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 2046 LIST_FOREACH(wi, wih, wi_hash) { 2047 if (ino == wi->wi_ino) 2048 return wi; 2049 } 2050 return 0; 2051 } 2052 2053 void 2054 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 2055 { 2056 struct wapbl_ino_head *wih; 2057 struct wapbl_ino *wi; 2058 2059 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 2060 2061 mutex_enter(&wl->wl_mtx); 2062 if (wapbl_inodetrk_get(wl, ino) == NULL) { 2063 wi->wi_ino = ino; 2064 wi->wi_mode = mode; 2065 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 2066 LIST_INSERT_HEAD(wih, wi, wi_hash); 2067 wl->wl_inohashcnt++; 2068 WAPBL_PRINTF(WAPBL_PRINT_INODE, 2069 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 2070 mutex_exit(&wl->wl_mtx); 2071 } else { 2072 mutex_exit(&wl->wl_mtx); 2073 pool_put(&wapbl_ino_pool, wi); 2074 } 2075 } 2076 2077 void 2078 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 2079 { 2080 struct wapbl_ino *wi; 2081 2082 mutex_enter(&wl->wl_mtx); 2083 wi = wapbl_inodetrk_get(wl, ino); 2084 if (wi) { 2085 WAPBL_PRINTF(WAPBL_PRINT_INODE, 2086 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 2087 KASSERT(wl->wl_inohashcnt > 0); 2088 wl->wl_inohashcnt--; 2089 LIST_REMOVE(wi, wi_hash); 2090 
		mutex_exit(&wl->wl_mtx);
2091 
2092 		pool_put(&wapbl_ino_pool, wi);
2093 	} else {
2094 		mutex_exit(&wl->wl_mtx);
2095 	}
2096 }
2097 
2098 /****************************************************************/
2099 
2100 /*
2101  * wapbl_transaction_inodes_len(wl)
2102  *
2103  *	Calculate the number of bytes required for inode registration
2104  *	log records in wl.
2105  */
2106 static inline size_t
2107 wapbl_transaction_inodes_len(struct wapbl *wl)
2108 {
2109 	int blocklen = 1<<wl->wl_log_dev_bshift;
2110 	int iph;
2111 
2112 	/* Calculate number of inodes described in an inodelist header */
2113 	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2114 	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2115 
2116 	KASSERT(iph > 0);
2117 
2118 	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2119 }
2120 
2121 
2122 /*
2123  * wapbl_transaction_len(wl)
2124  *
2125  *	Calculate the number of bytes required for all log records in wl.
2126  */
2127 static size_t
2128 wapbl_transaction_len(struct wapbl *wl)
2129 {
2130 	int blocklen = 1<<wl->wl_log_dev_bshift;
2131 	size_t len;
2132 
2133 	/* Buffer data, plus one blocklist header per wl_brperjblock entries */
2134 	len = wl->wl_bcount;
2135 	len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2136 	len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2137 	len += wapbl_transaction_inodes_len(wl);
2138 
2139 	return len;
2140 }
2141 
2142 /*
2143  * wapbl_cache_sync(wl, msg)
2144  *
2145  *	Issue DIOCCACHESYNC to wl->wl_devvp.
2146  *
2147  *	If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2148  *	including msg about the duration of the cache sync.
2149  */
2150 static int
2151 wapbl_cache_sync(struct wapbl *wl, const char *msg)
2152 {
2153 	const bool verbose = wapbl_verbose_commit >= 2;
2154 	struct bintime start_time;
2155 	int force = 1;
2156 	int error;
2157 
2158 	if (!wapbl_flush_disk_cache) {
2159 		return 0;
2160 	}
2161 	if (verbose) {
2162 		bintime(&start_time);
2163 	}
2164 	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2165 	    FWRITE, FSCRED);
2166 	if (error) {
2167 		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2168 		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2169 		    "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2170 	}
2171 	if (verbose) {
2172 		struct bintime d;
2173 		struct timespec ts;
2174 
2175 		bintime(&d);
2176 		bintime_sub(&d, &start_time);
2177 		bintime2timespec(&d, &ts);
2178 		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2179 		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
2180 		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
2181 	}
2182 	return error;
2183 }
2184 
2185 /*
2186  * wapbl_write_commit(wl, head, tail)
2187  *
2188  *	Issue a disk cache sync to wait for all pending writes to the
2189  *	log to complete, and then synchronously commit the current
2190  *	circular queue head and tail to the log, in the next of two
2191  *	locations for commit headers on disk.
2192  *
2193  *	Increment the generation number.  If the generation number
2194  *	rolls over to zero, then a subsequent commit would appear to
2195  *	have an older generation than this one -- in that case, issue a
2196  *	duplicate commit to avoid this.
2197  *
2198  *	=> Caller must have exclusive access to wl, either by holding
2199  *	wl->wl_rwlock for writer or by being in wapbl_start before anyone
2200  *	else has seen wl.
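 *
 *	Layout note: the header for generation g is written at physical
 *	block wl_logpbn + (g % 2), so successive commits alternate
 *	between the two header slots, and replay picks whichever slot
 *	holds the larger generation number.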
2201  */
2202 static int
2203 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2204 {
2205 	struct wapbl_wc_header *wc = wl->wl_wc_header;
2206 	struct timespec ts;
2207 	int error;
2208 	daddr_t pbn;
2209 
2210 	error = wapbl_buffered_flush(wl);
2211 	if (error)
2212 		return error;
2213 	/*
2214 	 * Flush the disk cache to ensure that the blocks we've written
2215 	 * actually reach stable storage before the commit header does.
2216 	 * XXX Ideally we would compute a checksum over them here;
2217 	 * for now we rely on this cache flush instead.
2218 	 */
2219 	wapbl_cache_sync(wl, "1");
2220 
2221 	wc->wc_head = head;
2222 	wc->wc_tail = tail;
2223 	wc->wc_checksum = 0;
2224 	wc->wc_version = 1;
2225 	getnanotime(&ts);
2226 	wc->wc_time = ts.tv_sec;
2227 	wc->wc_timensec = ts.tv_nsec;
2228 
2229 	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2230 	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
2231 	    (intmax_t)head, (intmax_t)tail));
2232 
2233 	/*
2234 	 * Write the commit header.
2235 	 *
2236 	 * XXX If the generation number will roll over, first zero the
2237 	 * second commit header before trying to write both headers.
2238 	 */
2239 
2240 	pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2241 #ifdef _KERNEL
2242 	pbn = btodb(pbn << wc->wc_log_dev_bshift);
2243 #endif
2244 	error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2245 	if (error)
2246 		return error;
2247 	error = wapbl_buffered_flush(wl);
2248 	if (error)
2249 		return error;
2250 
2251 	/*
2252 	 * Flush the disk cache to ensure that the commit header is
2253 	 * actually written before the metadata blocks are.
2254 	 */
2255 	wapbl_cache_sync(wl, "2");
2256 
2257 	/*
2258 	 * If the generation number was zero, write it out a second time.
2259 	 * This handles initialization and generation number rollover.
2260 	 */
2261 	if (wc->wc_generation++ == 0) {
2262 		error = wapbl_write_commit(wl, head, tail);
2263 		/*
2264 		 * This panic could be removed if we did the zeroing
2265 		 * mentioned above and were certain to roll back the
2266 		 * generation number on failure.
2267 		 */
2268 		if (error)
2269 			panic("wapbl_write_commit: error writing duplicate "
2270 			    "log header: %d", error);
2271 	}
2272 	return 0;
2273 }
2274 
2275 /*
2276  * wapbl_write_blocks(wl, offp)
2277  *
2278  *	Write all pending physical blocks in the current transaction
2279  *	from wapbl_add_buf to the log on disk, adding to the circular
2280  *	queue head at byte offset *offp, and returning the new head's
2281  *	byte offset in *offp.
2282  */
2283 static int
2284 wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2285 {
2286 	struct wapbl_wc_blocklist *wc =
2287 	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2288 	int blocklen = 1<<wl->wl_log_dev_bshift;
2289 	struct buf *bp;
2290 	off_t off = *offp;
2291 	int error;
2292 	size_t padding;
2293 
2294 	KASSERT(rw_write_held(&wl->wl_rwlock));
2295 
2296 	bp = LIST_FIRST(&wl->wl_bufs);
2297 
2298 	while (bp) {
2299 		int cnt;
2300 		struct buf *obp = bp;
2301 
2302 		KASSERT(bp->b_flags & B_LOCKED);
2303 
2304 		wc->wc_type = WAPBL_WC_BLOCKS;
2305 		wc->wc_len = blocklen;
2306 		wc->wc_blkcount = 0;
2307 		while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2308 			/*
2309 			 * Make sure all the physical block numbers are up to
2310 			 * date.  If this is not always true on a given
2311 			 * filesystem, then VOP_BMAP must be called.  We
2312 			 * could call VOP_BMAP here, or else in the filesystem-
2313 			 * specific flush callback, although neither of those
2314 			 * solutions allows us to take the vnode lock.
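			 * (Hypothetical illustration: such a call would look
			 * like VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL,
			 * &bp->b_blkno, NULL), which fills in the physical
			 * block number for the buffer.)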
2315 			 * If a filesystem requires the vnode lock to call
2316 			 * VOP_BMAP, then we can probably do it in bwrite,
2317 			 * where the vnode lock should already be held by
2318 			 * the invoking code.
2319 			 */
2320 			KASSERT((bp->b_vp->v_type == VBLK) ||
2321 			    (bp->b_blkno != bp->b_lblkno));
2322 			KASSERT(bp->b_blkno > 0);
2323 
2324 			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2325 			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2326 			wc->wc_len += bp->b_bcount;
2327 			wc->wc_blkcount++;
2328 			bp = LIST_NEXT(bp, b_wapbllist);
2329 		}
2330 		if (wc->wc_len % blocklen != 0) {
2331 			padding = blocklen - wc->wc_len % blocklen;
2332 			wc->wc_len += padding;
2333 		} else {
2334 			padding = 0;
2335 		}
2336 
2337 		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2338 		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2339 		    wc->wc_len, padding, (intmax_t)off));
2340 
2341 		error = wapbl_circ_write(wl, wc, blocklen, &off);
2342 		if (error)
2343 			return error;
2344 		bp = obp;
2345 		cnt = 0;
2346 		while (bp && (cnt++ < wl->wl_brperjblock)) {
2347 			error = wapbl_circ_write(wl, bp->b_data,
2348 			    bp->b_bcount, &off);
2349 			if (error)
2350 				return error;
2351 			bp = LIST_NEXT(bp, b_wapbllist);
2352 		}
2353 		if (padding) {
2354 			void *zero;
2355 
2356 			zero = wapbl_alloc(padding);
2357 			memset(zero, 0, padding);
2358 			error = wapbl_circ_write(wl, zero, padding, &off);
2359 			wapbl_free(zero, padding);
2360 			if (error)
2361 				return error;
2362 		}
2363 	}
2364 	*offp = off;
2365 	return 0;
2366 }
2367 
2368 /*
2369  * wapbl_write_revocations(wl, offp)
2370  *
2371  *	Write all pending deallocations in the current transaction from
2372  *	wapbl_register_deallocation to the log on disk, adding to the
2373  *	circular queue's head at byte offset *offp, and returning the
2374  *	new head's byte offset in *offp.
2375  */
2376 static int
2377 wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2378 {
2379 	struct wapbl_wc_blocklist *wc =
2380 	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2381 	struct wapbl_dealloc *wd, *lwd;
2382 	int blocklen = 1<<wl->wl_log_dev_bshift;
2383 	off_t off = *offp;
2384 	int error;
2385 
2386 	if (wl->wl_dealloccnt == 0)
2387 		return 0;
2388 
2389 	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2390 		wc->wc_type = WAPBL_WC_REVOCATIONS;
2391 		wc->wc_len = blocklen;
2392 		wc->wc_blkcount = 0;
2393 		while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2394 			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2395 			    wd->wd_blkno;
2396 			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2397 			    wd->wd_len;
2398 			wc->wc_blkcount++;
2399 
2400 			wd = TAILQ_NEXT(wd, wd_entries);
2401 		}
2402 		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2403 		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2404 		    wc->wc_len, (intmax_t)off));
2405 		error = wapbl_circ_write(wl, wc, blocklen, &off);
2406 		if (error)
2407 			return error;
2408 
2409 		/* free all successfully written deallocs */
2410 		lwd = wd;
2411 		while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2412 			if (wd == lwd)
2413 				break;
2414 			wapbl_deallocation_free(wl, wd, true);
2415 		}
2416 	}
2417 	*offp = off;
2418 	return 0;
2419 }
2420 
2421 /*
2422  * wapbl_write_inodes(wl, offp)
2423  *
2424  *	Write all pending inode allocations in the current transaction
2425  *	from wapbl_register_inode to the log on disk, adding to the
2426  *	circular queue's head at byte offset *offp and returning the
2427  *	new head's byte offset in *offp.
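 *
 *	The first inodelist record written in each flush has wc_clear
 *	set; on replay, wapbl_replay_process_inodes() uses that flag to
 *	discard any inode list accumulated from earlier records before
 *	applying this one.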
2428 */ 2429 static int 2430 wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2431 { 2432 struct wapbl_wc_inodelist *wc = 2433 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2434 int i; 2435 int blocklen = 1 << wl->wl_log_dev_bshift; 2436 off_t off = *offp; 2437 int error; 2438 2439 struct wapbl_ino_head *wih; 2440 struct wapbl_ino *wi; 2441 int iph; 2442 2443 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2444 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2445 2446 i = 0; 2447 wih = &wl->wl_inohash[0]; 2448 wi = 0; 2449 do { 2450 wc->wc_type = WAPBL_WC_INODES; 2451 wc->wc_len = blocklen; 2452 wc->wc_inocnt = 0; 2453 wc->wc_clear = (i == 0); 2454 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2455 while (!wi) { 2456 KASSERT((wih - &wl->wl_inohash[0]) 2457 <= wl->wl_inohashmask); 2458 wi = LIST_FIRST(wih++); 2459 } 2460 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2461 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2462 wc->wc_inocnt++; 2463 i++; 2464 wi = LIST_NEXT(wi, wi_hash); 2465 } 2466 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2467 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", 2468 wc->wc_len, (intmax_t)off)); 2469 error = wapbl_circ_write(wl, wc, blocklen, &off); 2470 if (error) 2471 return error; 2472 } while (i < wl->wl_inohashcnt); 2473 2474 *offp = off; 2475 return 0; 2476 } 2477 2478 #endif /* _KERNEL */ 2479 2480 /****************************************************************/ 2481 2482 struct wapbl_blk { 2483 LIST_ENTRY(wapbl_blk) wb_hash; 2484 daddr_t wb_blk; 2485 off_t wb_off; /* Offset of this block in the log */ 2486 }; 2487 #define WAPBL_BLKPOOL_MIN 83 2488 2489 static void 2490 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) 2491 { 2492 if (size < WAPBL_BLKPOOL_MIN) 2493 size = WAPBL_BLKPOOL_MIN; 2494 KASSERT(wr->wr_blkhash == 0); 2495 #ifdef _KERNEL 2496 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); 2497 #else /* ! _KERNEL */ 2498 /* Manually implement hashinit */ 2499 { 2500 unsigned long i, hashsize; 2501 for (hashsize = 1; hashsize < size; hashsize <<= 1) 2502 continue; 2503 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash)); 2504 for (i = 0; i < hashsize; i++) 2505 LIST_INIT(&wr->wr_blkhash[i]); 2506 wr->wr_blkhashmask = hashsize - 1; 2507 } 2508 #endif /* ! _KERNEL */ 2509 } 2510 2511 static void 2512 wapbl_blkhash_free(struct wapbl_replay *wr) 2513 { 2514 KASSERT(wr->wr_blkhashcnt == 0); 2515 #ifdef _KERNEL 2516 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); 2517 #else /* ! _KERNEL */ 2518 wapbl_free(wr->wr_blkhash, 2519 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash)); 2520 #endif /* ! 
_KERNEL */ 2521 } 2522 2523 static struct wapbl_blk * 2524 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) 2525 { 2526 struct wapbl_blk_head *wbh; 2527 struct wapbl_blk *wb; 2528 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2529 LIST_FOREACH(wb, wbh, wb_hash) { 2530 if (blk == wb->wb_blk) 2531 return wb; 2532 } 2533 return 0; 2534 } 2535 2536 static void 2537 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) 2538 { 2539 struct wapbl_blk_head *wbh; 2540 struct wapbl_blk *wb; 2541 wb = wapbl_blkhash_get(wr, blk); 2542 if (wb) { 2543 KASSERT(wb->wb_blk == blk); 2544 wb->wb_off = off; 2545 } else { 2546 wb = wapbl_alloc(sizeof(*wb)); 2547 wb->wb_blk = blk; 2548 wb->wb_off = off; 2549 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2550 LIST_INSERT_HEAD(wbh, wb, wb_hash); 2551 wr->wr_blkhashcnt++; 2552 } 2553 } 2554 2555 static void 2556 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) 2557 { 2558 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2559 if (wb) { 2560 KASSERT(wr->wr_blkhashcnt > 0); 2561 wr->wr_blkhashcnt--; 2562 LIST_REMOVE(wb, wb_hash); 2563 wapbl_free(wb, sizeof(*wb)); 2564 } 2565 } 2566 2567 static void 2568 wapbl_blkhash_clear(struct wapbl_replay *wr) 2569 { 2570 unsigned long i; 2571 for (i = 0; i <= wr->wr_blkhashmask; i++) { 2572 struct wapbl_blk *wb; 2573 2574 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { 2575 KASSERT(wr->wr_blkhashcnt > 0); 2576 wr->wr_blkhashcnt--; 2577 LIST_REMOVE(wb, wb_hash); 2578 wapbl_free(wb, sizeof(*wb)); 2579 } 2580 } 2581 KASSERT(wr->wr_blkhashcnt == 0); 2582 } 2583 2584 /****************************************************************/ 2585 2586 /* 2587 * wapbl_circ_read(wr, data, len, offp) 2588 * 2589 * Read len bytes into data from the circular queue of wr, 2590 * starting at the linear byte offset *offp, and returning the new 2591 * linear byte offset in *offp. 2592 * 2593 * If the starting linear byte offset precedes wr->wr_circ_off, 2594 * the read instead begins at wr->wr_circ_off. XXX WTF? This 2595 * should be a KASSERT, not a conditional. 2596 */ 2597 static int 2598 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) 2599 { 2600 size_t slen; 2601 off_t off = *offp; 2602 int error; 2603 daddr_t pbn; 2604 2605 KASSERT(((len >> wr->wr_log_dev_bshift) << 2606 wr->wr_log_dev_bshift) == len); 2607 2608 if (off < wr->wr_circ_off) 2609 off = wr->wr_circ_off; 2610 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2611 if (slen < len) { 2612 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2613 #ifdef _KERNEL 2614 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2615 #endif 2616 error = wapbl_read(data, slen, wr->wr_devvp, pbn); 2617 if (error) 2618 return error; 2619 data = (uint8_t *)data + slen; 2620 len -= slen; 2621 off = wr->wr_circ_off; 2622 } 2623 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2624 #ifdef _KERNEL 2625 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2626 #endif 2627 error = wapbl_read(data, len, wr->wr_devvp, pbn); 2628 if (error) 2629 return error; 2630 off += len; 2631 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2632 off = wr->wr_circ_off; 2633 *offp = off; 2634 return 0; 2635 } 2636 2637 /* 2638 * wapbl_circ_advance(wr, len, offp) 2639 * 2640 * Compute the linear byte offset of the circular queue of wr that 2641 * is len bytes past *offp, and store it in *offp. 2642 * 2643 * This is as if wapbl_circ_read, but without actually reading 2644 * anything. 
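 *
 *	Worked example (illustrative numbers): with wr_circ_off = 1024
 *	and wr_circ_size = 8192, the queue occupies [1024, 9216); then
 *	advancing len = 512 from *offp = 8704 wraps around and yields
 *	*offp = 1024.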
2645 * 2646 * If the starting linear byte offset precedes wr->wr_circ_off, it 2647 * is taken to be wr->wr_circ_off instead. XXX WTF? This should 2648 * be a KASSERT, not a conditional. 2649 */ 2650 static void 2651 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) 2652 { 2653 size_t slen; 2654 off_t off = *offp; 2655 2656 KASSERT(((len >> wr->wr_log_dev_bshift) << 2657 wr->wr_log_dev_bshift) == len); 2658 2659 if (off < wr->wr_circ_off) 2660 off = wr->wr_circ_off; 2661 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2662 if (slen < len) { 2663 len -= slen; 2664 off = wr->wr_circ_off; 2665 } 2666 off += len; 2667 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2668 off = wr->wr_circ_off; 2669 *offp = off; 2670 } 2671 2672 /****************************************************************/ 2673 2674 int 2675 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, 2676 daddr_t off, size_t count, size_t blksize) 2677 { 2678 struct wapbl_replay *wr; 2679 int error; 2680 struct vnode *devvp; 2681 daddr_t logpbn; 2682 uint8_t *scratch; 2683 struct wapbl_wc_header *wch; 2684 struct wapbl_wc_header *wch2; 2685 /* Use this until we read the actual log header */ 2686 int log_dev_bshift = ilog2(blksize); 2687 size_t used; 2688 daddr_t pbn; 2689 2690 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2691 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", 2692 vp, off, count, blksize)); 2693 2694 if (off < 0) 2695 return EINVAL; 2696 2697 if (blksize < DEV_BSIZE) 2698 return EINVAL; 2699 if (blksize % DEV_BSIZE) 2700 return EINVAL; 2701 2702 #ifdef _KERNEL 2703 #if 0 2704 /* XXX vp->v_size isn't reliably set for VBLK devices, 2705 * especially root. However, we might still want to verify 2706 * that the full load is readable */ 2707 if ((off + count) * blksize > vp->v_size) 2708 return EINVAL; 2709 #endif 2710 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { 2711 return error; 2712 } 2713 #else /* ! _KERNEL */ 2714 devvp = vp; 2715 logpbn = off; 2716 #endif /* ! 
_KERNEL */
2717 
2718 	scratch = wapbl_alloc(MAXBSIZE);
2719 
2720 	pbn = logpbn;
2721 #ifdef _KERNEL
2722 	pbn = btodb(pbn << log_dev_bshift);
2723 #endif
2724 	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2725 	if (error)
2726 		goto errout;
2727 
2728 	wch = (struct wapbl_wc_header *)scratch;
2729 	wch2 =
2730 	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2731 	/* XXX verify checksums and magic numbers */
2732 	if (wch->wc_type != WAPBL_WC_HEADER) {
2733 		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2734 		error = EFTYPE;
2735 		goto errout;
2736 	}
2737 
2738 	if (wch2->wc_generation > wch->wc_generation)
2739 		wch = wch2;
2740 
2741 	wr = wapbl_calloc(1, sizeof(*wr));
2742 
2743 	wr->wr_logvp = vp;
2744 	wr->wr_devvp = devvp;
2745 	wr->wr_logpbn = logpbn;
2746 
2747 	wr->wr_scratch = scratch;
2748 
2749 	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2750 	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2751 	wr->wr_circ_off = wch->wc_circ_off;
2752 	wr->wr_circ_size = wch->wc_circ_size;
2753 	wr->wr_generation = wch->wc_generation;
2754 
2755 	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2756 
2757 	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2758 	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2759 	    " len=%"PRId64" used=%zu\n",
2760 	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2761 	    wch->wc_circ_size, used));
2762 
2763 	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2764 
2765 	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2766 	if (error) {
2767 		wapbl_replay_stop(wr);
2768 		wapbl_replay_free(wr);
2769 		return error;
2770 	}
2771 
2772 	*wrp = wr;
2773 	return 0;
2774 
2775  errout:
2776 	wapbl_free(scratch, MAXBSIZE);
2777 	return error;
2778 }
2779 
2780 void
2781 wapbl_replay_stop(struct wapbl_replay *wr)
2782 {
2783 
2784 	if (!wapbl_replay_isopen(wr))
2785 		return;
2786 
2787 	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2788 
2789 	wapbl_free(wr->wr_scratch, MAXBSIZE);
2790 	wr->wr_scratch = NULL;
2791 
2792 	wr->wr_logvp = NULL;
2793 
2794 	wapbl_blkhash_clear(wr);
2795 	wapbl_blkhash_free(wr);
2796 }
2797 
2798 void
2799 wapbl_replay_free(struct wapbl_replay *wr)
2800 {
2801 
2802 	KDASSERT(!wapbl_replay_isopen(wr));
2803 
2804 	if (wr->wr_inodes)
2805 		wapbl_free(wr->wr_inodes,
2806 		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2807 	wapbl_free(wr, sizeof(*wr));
2808 }
2809 
2810 #ifdef _KERNEL
2811 int
2812 wapbl_replay_isopen1(struct wapbl_replay *wr)
2813 {
2814 
2815 	return wapbl_replay_isopen(wr);
2816 }
2817 #endif
2818 
2819 /*
2820  * Calculate the disk address for the i'th block in the wc_blocklist,
2821  * offset by j blocks of size blen.
2822  *
2823  * wc_daddr is always a kernel disk address in DEV_BSIZE units that
2824  * was written to the journal.
2825  *
2826  * The kernel needs that address plus the offset in DEV_BSIZE units.
2827  *
2828  * Userland needs that address plus the offset in blen units.
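 *
 * Worked example (illustrative, DEV_BSIZE = 512, blen = 2048): for
 * wc_daddr = 1000 and j = 3, the kernel computes 1000 + btodb(3 * 2048)
 * = 1012, while userland computes dbtob(1000) / 2048 + 3 = 253.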
2829 * 2830 */ 2831 static daddr_t 2832 wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen) 2833 { 2834 daddr_t pbn; 2835 2836 #ifdef _KERNEL 2837 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen); 2838 #else 2839 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j; 2840 #endif 2841 2842 return pbn; 2843 } 2844 2845 static void 2846 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) 2847 { 2848 struct wapbl_wc_blocklist *wc = 2849 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2850 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2851 int i, j, n; 2852 2853 for (i = 0; i < wc->wc_blkcount; i++) { 2854 /* 2855 * Enter each physical block into the hashtable independently. 2856 */ 2857 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2858 for (j = 0; j < n; j++) { 2859 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen), 2860 *offp); 2861 wapbl_circ_advance(wr, fsblklen, offp); 2862 } 2863 } 2864 } 2865 2866 static void 2867 wapbl_replay_process_revocations(struct wapbl_replay *wr) 2868 { 2869 struct wapbl_wc_blocklist *wc = 2870 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2871 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2872 int i, j, n; 2873 2874 for (i = 0; i < wc->wc_blkcount; i++) { 2875 /* 2876 * Remove any blocks found from the hashtable. 2877 */ 2878 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2879 for (j = 0; j < n; j++) 2880 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen)); 2881 } 2882 } 2883 2884 static void 2885 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) 2886 { 2887 struct wapbl_wc_inodelist *wc = 2888 (struct wapbl_wc_inodelist *)wr->wr_scratch; 2889 void *new_inodes; 2890 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]); 2891 2892 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0])); 2893 2894 /* 2895 * Keep track of where we found this so location won't be 2896 * overwritten. 
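	 *
	 * wr_inodestail is set to the offset of the record that began
	 * the current accumulation (the one with wc_clear set), and
	 * wr_inodeshead to the offset just past the most recently seen
	 * record.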
2897 */ 2898 if (wc->wc_clear) { 2899 wr->wr_inodestail = oldoff; 2900 wr->wr_inodescnt = 0; 2901 if (wr->wr_inodes != NULL) { 2902 wapbl_free(wr->wr_inodes, oldsize); 2903 wr->wr_inodes = NULL; 2904 } 2905 } 2906 wr->wr_inodeshead = newoff; 2907 if (wc->wc_inocnt == 0) 2908 return; 2909 2910 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) * 2911 sizeof(wr->wr_inodes[0])); 2912 if (wr->wr_inodes != NULL) { 2913 memcpy(new_inodes, wr->wr_inodes, oldsize); 2914 wapbl_free(wr->wr_inodes, oldsize); 2915 } 2916 wr->wr_inodes = new_inodes; 2917 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, 2918 wc->wc_inocnt * sizeof(wr->wr_inodes[0])); 2919 wr->wr_inodescnt += wc->wc_inocnt; 2920 } 2921 2922 static int 2923 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) 2924 { 2925 off_t off; 2926 int error; 2927 2928 int logblklen = 1 << wr->wr_log_dev_bshift; 2929 2930 wapbl_blkhash_clear(wr); 2931 2932 off = tail; 2933 while (off != head) { 2934 struct wapbl_wc_null *wcn; 2935 off_t saveoff = off; 2936 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2937 if (error) 2938 goto errout; 2939 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2940 switch (wcn->wc_type) { 2941 case WAPBL_WC_BLOCKS: 2942 wapbl_replay_process_blocks(wr, &off); 2943 break; 2944 2945 case WAPBL_WC_REVOCATIONS: 2946 wapbl_replay_process_revocations(wr); 2947 break; 2948 2949 case WAPBL_WC_INODES: 2950 wapbl_replay_process_inodes(wr, saveoff, off); 2951 break; 2952 2953 default: 2954 printf("Unrecognized wapbl type: 0x%08x\n", 2955 wcn->wc_type); 2956 error = EFTYPE; 2957 goto errout; 2958 } 2959 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2960 if (off != saveoff) { 2961 printf("wapbl_replay: corrupted records\n"); 2962 error = EFTYPE; 2963 goto errout; 2964 } 2965 } 2966 return 0; 2967 2968 errout: 2969 wapbl_blkhash_clear(wr); 2970 return error; 2971 } 2972 2973 #if 0 2974 int 2975 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) 2976 { 2977 off_t off; 2978 int mismatchcnt = 0; 2979 int logblklen = 1 << wr->wr_log_dev_bshift; 2980 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2981 void *scratch1 = wapbl_alloc(MAXBSIZE); 2982 void *scratch2 = wapbl_alloc(MAXBSIZE); 2983 int error = 0; 2984 2985 KDASSERT(wapbl_replay_isopen(wr)); 2986 2987 off = wch->wc_tail; 2988 while (off != wch->wc_head) { 2989 struct wapbl_wc_null *wcn; 2990 #ifdef DEBUG 2991 off_t saveoff = off; 2992 #endif 2993 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2994 if (error) 2995 goto out; 2996 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2997 switch (wcn->wc_type) { 2998 case WAPBL_WC_BLOCKS: 2999 { 3000 struct wapbl_wc_blocklist *wc = 3001 (struct wapbl_wc_blocklist *)wr->wr_scratch; 3002 int i; 3003 for (i = 0; i < wc->wc_blkcount; i++) { 3004 int foundcnt = 0; 3005 int dirtycnt = 0; 3006 int j, n; 3007 /* 3008 * Check each physical block into the 3009 * hashtable independently 3010 */ 3011 n = wc->wc_blocks[i].wc_dlen >> 3012 wch->wc_fs_dev_bshift; 3013 for (j = 0; j < n; j++) { 3014 struct wapbl_blk *wb = 3015 wapbl_blkhash_get(wr, 3016 wapbl_block_daddr(wc, i, j, fsblklen)); 3017 if (wb && (wb->wb_off == off)) { 3018 foundcnt++; 3019 error = 3020 wapbl_circ_read(wr, 3021 scratch1, fsblklen, 3022 &off); 3023 if (error) 3024 goto out; 3025 error = 3026 wapbl_read(scratch2, 3027 fsblklen, fsdevvp, 3028 wb->wb_blk); 3029 if (error) 3030 goto out; 3031 if (memcmp(scratch1, 3032 scratch2, 3033 fsblklen)) { 3034 printf( 3035 "wapbl_verify: mismatch block %"PRId64" at off 
%"PRIdMAX"\n",
3036 						    wb->wb_blk, (intmax_t)off);
3037 						dirtycnt++;
3038 						mismatchcnt++;
3039 					}
3040 				} else {
3041 					wapbl_circ_advance(wr,
3042 					    fsblklen, &off);
3043 				}
3044 			}
3045 #if 0
3046 			/*
3047 			 * If all of the blocks in an entry
3048 			 * are clean, then remove all of its
3049 			 * blocks from the hashtable since they
3050 			 * never will need replay.
3051 			 */
3052 			if ((foundcnt != 0) &&
3053 			    (dirtycnt == 0)) {
3054 				off = saveoff;
3055 				wapbl_circ_advance(wr,
3056 				    logblklen, &off);
3057 				for (j = 0; j < n; j++) {
3058 					struct wapbl_blk *wb =
3059 					    wapbl_blkhash_get(wr,
3060 					    wapbl_block_daddr(wc, i, j, fsblklen));
3061 					if (wb &&
3062 					    (wb->wb_off == off)) {
3063 						wapbl_blkhash_rem(wr, wb->wb_blk);
3064 					}
3065 					wapbl_circ_advance(wr,
3066 					    fsblklen, &off);
3067 				}
3068 			}
3069 #endif
3070 		}
3071 	}
3072 		break;
3073 	case WAPBL_WC_REVOCATIONS:
3074 	case WAPBL_WC_INODES:
3075 		break;
3076 	default:
3077 		KASSERT(0);
3078 	}
3079 #ifdef DEBUG
3080 	wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3081 	KASSERT(off == saveoff);
3082 #endif
3083 	}
3084  out:
3085 	wapbl_free(scratch1, MAXBSIZE);
3086 	wapbl_free(scratch2, MAXBSIZE);
3087 	if (!error && mismatchcnt)
3088 		error = EFTYPE;
3089 	return error;
3090 }
3091 #endif
3092 
3093 int
3094 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3095 {
3096 	struct wapbl_blk *wb;
3097 	size_t i;
3098 	off_t off;
3099 	void *scratch;
3100 	int error = 0;
3101 	int fsblklen = 1 << wr->wr_fs_dev_bshift;
3102 
3103 	KDASSERT(wapbl_replay_isopen(wr));
3104 
3105 	scratch = wapbl_alloc(MAXBSIZE);
3106 
3107 	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3108 		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3109 			off = wb->wb_off;
3110 			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3111 			if (error)
3112 				break;
3113 			error = wapbl_write(scratch, fsblklen, fsdevvp,
3114 			    wb->wb_blk);
3115 			if (error)
3116 				break;
3117 		}
		/* Stop here so a later success cannot mask this error. */
		if (error)
			break;
3118 	}
3119 
3120 	wapbl_free(scratch, MAXBSIZE);
3121 	return error;
3122 }
3123 
3124 int
3125 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3126 {
3127 	int fsblklen = 1 << wr->wr_fs_dev_bshift;
3128 
3129 	KDASSERT(wapbl_replay_isopen(wr));
3130 	KASSERT((len % fsblklen) == 0);
3131 
3132 	while (len != 0) {
3133 		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3134 		if (wb)
3135 			return 1;
3136 		len -= fsblklen;
		blk++;	/* advance to the next block, as wapbl_replay_read() does */
3137 	}
3138 	return 0;
3139 }
3140 
3141 int
3142 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3143 {
3144 	int fsblklen = 1 << wr->wr_fs_dev_bshift;
3145 
3146 	KDASSERT(wapbl_replay_isopen(wr));
3147 
3148 	KASSERT((len % fsblklen) == 0);
3149 
3150 	while (len != 0) {
3151 		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3152 		if (wb) {
3153 			off_t off = wb->wb_off;
3154 			int error;
3155 			error = wapbl_circ_read(wr, data, fsblklen, &off);
3156 			if (error)
3157 				return error;
3158 		}
3159 		data = (uint8_t *)data + fsblklen;
3160 		len -= fsblklen;
3161 		blk++;
3162 	}
3163 	return 0;
3164 }
3165 
3166 #ifdef _KERNEL
3167 
3168 MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3169 
3170 static int
3171 wapbl_modcmd(modcmd_t cmd, void *arg)
3172 {
3173 
3174 	switch (cmd) {
3175 	case MODULE_CMD_INIT:
3176 		wapbl_init();
3177 		return 0;
3178 	case MODULE_CMD_FINI:
3179 		return wapbl_fini();
3180 	default:
3181 		return ENOTTY;
3182 	}
3183 }
3184 #endif /* _KERNEL */
3185 
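#if 0
/*
 * Illustrative sketch, not part of this file: a filesystem would
 * typically drive the replay entry points above at mount time roughly
 * as follows.  Error handling is abbreviated, and "logblk", "logcount"
 * and "blksize" are hypothetical values taken from the filesystem's
 * superblock.
 */
static int
example_mount_replay(struct vnode *devvp, daddr_t logblk, size_t logcount,
	size_t blksize)
{
	struct wapbl_replay *wr;
	int error;

	/* Read both commit headers and build the in-memory block hash. */
	error = wapbl_replay_start(&wr, devvp, logblk, logcount, blksize);
	if (error)
		return error;

	/* Copy every journalled block to its final location on disk. */
	error = wapbl_replay_write(wr, devvp);

	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif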