1 /* $NetBSD: vfs_wapbl.c,v 1.47 2011/09/01 09:03:43 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * This implements file system independent write ahead filesystem logging. 34 */ 35 36 #define WAPBL_INTERNAL 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.47 2011/09/01 09:03:43 christos Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/bitops.h> 43 44 #ifdef _KERNEL 45 #include <sys/param.h> 46 #include <sys/namei.h> 47 #include <sys/proc.h> 48 #include <sys/sysctl.h> 49 #include <sys/uio.h> 50 #include <sys/vnode.h> 51 #include <sys/file.h> 52 #include <sys/malloc.h> 53 #include <sys/module.h> 54 #include <sys/resourcevar.h> 55 #include <sys/conf.h> 56 #include <sys/mount.h> 57 #include <sys/kernel.h> 58 #include <sys/kauth.h> 59 #include <sys/mutex.h> 60 #include <sys/atomic.h> 61 #include <sys/wapbl.h> 62 #include <sys/wapbl_replay.h> 63 64 #include <miscfs/specfs/specdev.h> 65 66 #if 0 /* notyet */ 67 #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP) 68 #define wapbl_free(a, s) kmem_free((a), (s)) 69 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 70 #else 71 MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); 72 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) 73 #define wapbl_free(a, s) free((a), M_WAPBL) 74 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) 75 #endif 76 77 static struct sysctllog *wapbl_sysctl; 78 static int wapbl_flush_disk_cache = 1; 79 static int wapbl_verbose_commit = 0; 80 81 #else /* !_KERNEL */ 82 #include <assert.h> 83 #include <errno.h> 84 #include <stdio.h> 85 #include <stdbool.h> 86 #include <stdlib.h> 87 #include <string.h> 88 89 #include <sys/time.h> 90 #include <sys/wapbl.h> 91 #include <sys/wapbl_replay.h> 92 93 #define KDASSERT(x) assert(x) 94 #define KASSERT(x) assert(x) 95 #define wapbl_malloc(s) malloc(s) 96 #define wapbl_free(a, s) free(a) 97 #define wapbl_calloc(n, s) calloc((n), (s)) 98 99 #endif /* !_KERNEL */ 100 101 /* 102 * INTERNAL DATA STRUCTURES 103 */ 104 105 /* 106 * This structure holds per-mount log information. 107 * 108 * Legend: a = atomic access only 109 * r = read-only after init 110 * l = rwlock held 111 * m = mutex held 112 * lm = rwlock held writing or mutex held 113 * u = unlocked access ok 114 * b = bufcache_lock held 115 */ 116 struct wapbl { 117 struct vnode *wl_logvp; /* r: log here */ 118 struct vnode *wl_devvp; /* r: log on this device */ 119 struct mount *wl_mount; /* r: mountpoint wl is associated with */ 120 daddr_t wl_logpbn; /* r: Physical block number of start of log */ 121 int wl_log_dev_bshift; /* r: logarithm of device block size of log 122 device */ 123 int wl_fs_dev_bshift; /* r: logarithm of device block size of 124 filesystem device */ 125 126 unsigned wl_lock_count; /* m: Count of transactions in progress */ 127 128 size_t wl_circ_size; /* r: Number of bytes in buffer of log */ 129 size_t wl_circ_off; /* r: Number of bytes reserved at start */ 130 131 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ 132 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ 133 134 off_t wl_head; /* l: Byte offset of log head */ 135 off_t wl_tail; /* l: Byte offset of log tail */ 136 /* 137 * head == tail == 0 means log is empty 138 * head == tail != 0 means log is full 139 * see assertions in wapbl_advance() for other boundary conditions. 140 * only truncate moves the tail, except when flush sets it to 141 * wl_header_size only flush moves the head, except when truncate 142 * sets it to 0. 143 */ 144 145 struct wapbl_wc_header *wl_wc_header; /* l */ 146 void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */ 147 148 kmutex_t wl_mtx; /* u: short-term lock */ 149 krwlock_t wl_rwlock; /* u: File system transaction lock */ 150 151 /* 152 * Must be held while accessing 153 * wl_count or wl_bufs or head or tail 154 */ 155 156 /* 157 * Callback called from within the flush routine to flush any extra 158 * bits. Note that flush may be skipped without calling this if 159 * there are no outstanding buffers in the transaction. 160 */ 161 #if _KERNEL 162 wapbl_flush_fn_t wl_flush; /* r */ 163 wapbl_flush_fn_t wl_flush_abort;/* r */ 164 #endif 165 166 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ 167 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ 168 size_t wl_bcount; /* m: Total bcount of wl_bufs */ 169 170 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ 171 172 kcondvar_t wl_reclaimable_cv; /* m (obviously) */ 173 size_t wl_reclaimable_bytes; /* m: Amount of space available for 174 reclamation by truncate */ 175 int wl_error_count; /* m: # of wl_entries with errors */ 176 size_t wl_reserved_bytes; /* never truncate log smaller than this */ 177 178 #ifdef WAPBL_DEBUG_BUFBYTES 179 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ 180 #endif 181 182 daddr_t *wl_deallocblks;/* lm: address of block */ 183 int *wl_dealloclens; /* lm: size of block */ 184 int wl_dealloccnt; /* lm: total count */ 185 int wl_dealloclim; /* l: max count */ 186 187 /* hashtable of inode numbers for allocated but unlinked inodes */ 188 /* synch ??? */ 189 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; 190 u_long wl_inohashmask; 191 int wl_inohashcnt; 192 193 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 194 accounting */ 195 }; 196 197 #ifdef WAPBL_DEBUG_PRINT 198 int wapbl_debug_print = WAPBL_DEBUG_PRINT; 199 #endif 200 201 /****************************************************************/ 202 #ifdef _KERNEL 203 204 #ifdef WAPBL_DEBUG 205 struct wapbl *wapbl_debug_wl; 206 #endif 207 208 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 209 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 210 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 211 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 212 #endif /* _KERNEL */ 213 214 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 215 216 static inline size_t wapbl_space_free(size_t avail, off_t head, 217 off_t tail); 218 static inline size_t wapbl_space_used(size_t avail, off_t head, 219 off_t tail); 220 221 #ifdef _KERNEL 222 223 #define WAPBL_INODETRK_SIZE 83 224 static int wapbl_ino_pool_refcount; 225 static struct pool wapbl_ino_pool; 226 struct wapbl_ino { 227 LIST_ENTRY(wapbl_ino) wi_hash; 228 ino_t wi_ino; 229 mode_t wi_mode; 230 }; 231 232 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 233 static void wapbl_inodetrk_free(struct wapbl *wl); 234 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 235 236 static size_t wapbl_transaction_len(struct wapbl *wl); 237 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 238 239 #if 0 240 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 241 #endif 242 243 static int wapbl_replay_isopen1(struct wapbl_replay *); 244 245 /* 246 * This is useful for debugging. If set, the log will 247 * only be truncated when necessary. 248 */ 249 int wapbl_lazy_truncate = 0; 250 251 struct wapbl_ops wapbl_ops = { 252 .wo_wapbl_discard = wapbl_discard, 253 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 254 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 255 .wo_wapbl_replay_read = wapbl_replay_read, 256 .wo_wapbl_add_buf = wapbl_add_buf, 257 .wo_wapbl_remove_buf = wapbl_remove_buf, 258 .wo_wapbl_resize_buf = wapbl_resize_buf, 259 .wo_wapbl_begin = wapbl_begin, 260 .wo_wapbl_end = wapbl_end, 261 .wo_wapbl_junlock_assert= wapbl_junlock_assert, 262 263 /* XXX: the following is only used to say "this is a wapbl buf" */ 264 .wo_wapbl_biodone = wapbl_biodone, 265 }; 266 267 static int 268 wapbl_sysctl_init(void) 269 { 270 int rv; 271 const struct sysctlnode *rnode, *cnode; 272 273 wapbl_sysctl = NULL; 274 275 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode, 276 CTLFLAG_PERMANENT, 277 CTLTYPE_NODE, "vfs", NULL, 278 NULL, 0, NULL, 0, 279 CTL_VFS, CTL_EOL); 280 if (rv) 281 return rv; 282 283 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode, 284 CTLFLAG_PERMANENT, 285 CTLTYPE_NODE, "wapbl", 286 SYSCTL_DESCR("WAPBL journaling options"), 287 NULL, 0, NULL, 0, 288 CTL_CREATE, CTL_EOL); 289 if (rv) 290 return rv; 291 292 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 293 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 294 CTLTYPE_INT, "flush_disk_cache", 295 SYSCTL_DESCR("flush disk cache"), 296 NULL, 0, &wapbl_flush_disk_cache, 0, 297 CTL_CREATE, CTL_EOL); 298 if (rv) 299 return rv; 300 301 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 302 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 303 CTLTYPE_INT, "verbose_commit", 304 SYSCTL_DESCR("show time and size of wapbl log commits"), 305 NULL, 0, &wapbl_verbose_commit, 0, 306 CTL_CREATE, CTL_EOL); 307 return rv; 308 } 309 310 static void 311 wapbl_init(void) 312 { 313 malloc_type_attach(M_WAPBL); 314 wapbl_sysctl_init(); 315 } 316 317 #ifdef notyet 318 static int 319 wapbl_fini(bool interface) 320 { 321 if (aio_sysctl != NULL) 322 sysctl_teardown(&aio_sysctl); 323 return 0; 324 } 325 #endif 326 327 static int 328 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) 329 { 330 int error, i; 331 332 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 333 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); 334 335 /* 336 * Its only valid to reuse the replay log if its 337 * the same as the new log we just opened. 338 */ 339 KDASSERT(!wapbl_replay_isopen(wr)); 340 KASSERT(wl->wl_devvp->v_type == VBLK); 341 KASSERT(wr->wr_devvp->v_type == VBLK); 342 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 343 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 344 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 345 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 346 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 347 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 348 349 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 350 351 for (i = 0; i < wr->wr_inodescnt; i++) 352 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 353 wr->wr_inodes[i].wr_imode); 354 355 /* Make sure new transaction won't overwrite old inodes list */ 356 KDASSERT(wapbl_transaction_len(wl) <= 357 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 358 wr->wr_inodestail)); 359 360 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 361 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 362 wapbl_transaction_len(wl); 363 364 error = wapbl_write_inodes(wl, &wl->wl_head); 365 if (error) 366 return error; 367 368 KASSERT(wl->wl_head != wl->wl_tail); 369 KASSERT(wl->wl_head != 0); 370 371 return 0; 372 } 373 374 int 375 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 376 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 377 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 378 { 379 struct wapbl *wl; 380 struct vnode *devvp; 381 daddr_t logpbn; 382 int error; 383 int log_dev_bshift = ilog2(blksize); 384 int fs_dev_bshift = log_dev_bshift; 385 int run; 386 387 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 388 " count=%zu blksize=%zu\n", vp, off, count, blksize)); 389 390 if (log_dev_bshift > fs_dev_bshift) { 391 WAPBL_PRINTF(WAPBL_PRINT_OPEN, 392 ("wapbl: log device's block size cannot be larger " 393 "than filesystem's\n")); 394 /* 395 * Not currently implemented, although it could be if 396 * needed someday. 397 */ 398 return ENOSYS; 399 } 400 401 if (off < 0) 402 return EINVAL; 403 404 if (blksize < DEV_BSIZE) 405 return EINVAL; 406 if (blksize % DEV_BSIZE) 407 return EINVAL; 408 409 /* XXXTODO: verify that the full load is writable */ 410 411 /* 412 * XXX check for minimum log size 413 * minimum is governed by minimum amount of space 414 * to complete a transaction. (probably truncate) 415 */ 416 /* XXX for now pick something minimal */ 417 if ((count * blksize) < MAXPHYS) { 418 return ENOSPC; 419 } 420 421 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { 422 return error; 423 } 424 425 wl = wapbl_calloc(1, sizeof(*wl)); 426 rw_init(&wl->wl_rwlock); 427 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); 428 cv_init(&wl->wl_reclaimable_cv, "wapblrec"); 429 LIST_INIT(&wl->wl_bufs); 430 SIMPLEQ_INIT(&wl->wl_entries); 431 432 wl->wl_logvp = vp; 433 wl->wl_devvp = devvp; 434 wl->wl_mount = mp; 435 wl->wl_logpbn = logpbn; 436 wl->wl_log_dev_bshift = log_dev_bshift; 437 wl->wl_fs_dev_bshift = fs_dev_bshift; 438 439 wl->wl_flush = flushfn; 440 wl->wl_flush_abort = flushabortfn; 441 442 /* Reserve two log device blocks for the commit headers */ 443 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; 444 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); 445 /* truncate the log usage to a multiple of log_dev_bshift */ 446 wl->wl_circ_size >>= wl->wl_log_dev_bshift; 447 wl->wl_circ_size <<= wl->wl_log_dev_bshift; 448 449 /* 450 * wl_bufbytes_max limits the size of the in memory transaction space. 451 * - Since buffers are allocated and accounted for in units of 452 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE 453 * (i.e. 1<<PAGE_SHIFT) 454 * - Since the log device has to be written in units of 455 * 1<<wl_log_dev_bshift it is required to be a mulitple of 456 * 1<<wl_log_dev_bshift. 457 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift, 458 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift. 459 * Therefore it must be multiple of the least common multiple of those 460 * three quantities. Fortunately, all of those quantities are 461 * guaranteed to be a power of two, and the least common multiple of 462 * a set of numbers which are all powers of two is simply the maximum 463 * of those numbers. Finally, the maximum logarithm of a power of two 464 * is the same as the log of the maximum power of two. So we can do 465 * the following operations to size wl_bufbytes_max: 466 */ 467 468 /* XXX fix actual number of pages reserved per filesystem. */ 469 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); 470 471 /* Round wl_bufbytes_max to the largest power of two constraint */ 472 wl->wl_bufbytes_max >>= PAGE_SHIFT; 473 wl->wl_bufbytes_max <<= PAGE_SHIFT; 474 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; 475 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; 476 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; 477 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; 478 479 /* XXX maybe use filesystem fragment size instead of 1024 */ 480 /* XXX fix actual number of buffers reserved per filesystem. */ 481 wl->wl_bufcount_max = (nbuf / 2) * 1024; 482 483 /* XXX tie this into resource estimation */ 484 wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2; 485 486 wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) * 487 wl->wl_dealloclim); 488 wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) * 489 wl->wl_dealloclim); 490 491 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); 492 493 /* Initialize the commit header */ 494 { 495 struct wapbl_wc_header *wc; 496 size_t len = 1 << wl->wl_log_dev_bshift; 497 wc = wapbl_calloc(1, len); 498 wc->wc_type = WAPBL_WC_HEADER; 499 wc->wc_len = len; 500 wc->wc_circ_off = wl->wl_circ_off; 501 wc->wc_circ_size = wl->wl_circ_size; 502 /* XXX wc->wc_fsid */ 503 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; 504 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; 505 wl->wl_wc_header = wc; 506 wl->wl_wc_scratch = wapbl_malloc(len); 507 } 508 509 /* 510 * if there was an existing set of unlinked but 511 * allocated inodes, preserve it in the new 512 * log. 513 */ 514 if (wr && wr->wr_inodescnt) { 515 error = wapbl_start_flush_inodes(wl, wr); 516 if (error) 517 goto errout; 518 } 519 520 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 521 if (error) { 522 goto errout; 523 } 524 525 *wlp = wl; 526 #if defined(WAPBL_DEBUG) 527 wapbl_debug_wl = wl; 528 #endif 529 530 return 0; 531 errout: 532 wapbl_discard(wl); 533 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 534 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 535 wapbl_free(wl->wl_deallocblks, 536 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 537 wapbl_free(wl->wl_dealloclens, 538 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 539 wapbl_inodetrk_free(wl); 540 wapbl_free(wl, sizeof(*wl)); 541 542 return error; 543 } 544 545 /* 546 * Like wapbl_flush, only discards the transaction 547 * completely 548 */ 549 550 void 551 wapbl_discard(struct wapbl *wl) 552 { 553 struct wapbl_entry *we; 554 struct buf *bp; 555 int i; 556 557 /* 558 * XXX we may consider using upgrade here 559 * if we want to call flush from inside a transaction 560 */ 561 rw_enter(&wl->wl_rwlock, RW_WRITER); 562 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 563 wl->wl_dealloccnt); 564 565 #ifdef WAPBL_DEBUG_PRINT 566 { 567 pid_t pid = -1; 568 lwpid_t lid = -1; 569 if (curproc) 570 pid = curproc->p_pid; 571 if (curlwp) 572 lid = curlwp->l_lid; 573 #ifdef WAPBL_DEBUG_BUFBYTES 574 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 575 ("wapbl_discard: thread %d.%d discarding " 576 "transaction\n" 577 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 578 "deallocs=%d inodes=%d\n" 579 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 580 "unsynced=%zu\n", 581 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 582 wl->wl_bcount, wl->wl_dealloccnt, 583 wl->wl_inohashcnt, wl->wl_error_count, 584 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 585 wl->wl_unsynced_bufbytes)); 586 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 587 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 588 ("\tentry: bufcount = %zu, reclaimable = %zu, " 589 "error = %d, unsynced = %zu\n", 590 we->we_bufcount, we->we_reclaimable_bytes, 591 we->we_error, we->we_unsynced_bufbytes)); 592 } 593 #else /* !WAPBL_DEBUG_BUFBYTES */ 594 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 595 ("wapbl_discard: thread %d.%d discarding transaction\n" 596 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 597 "deallocs=%d inodes=%d\n" 598 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 599 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 600 wl->wl_bcount, wl->wl_dealloccnt, 601 wl->wl_inohashcnt, wl->wl_error_count, 602 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 603 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 604 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 605 ("\tentry: bufcount = %zu, reclaimable = %zu, " 606 "error = %d\n", 607 we->we_bufcount, we->we_reclaimable_bytes, 608 we->we_error)); 609 } 610 #endif /* !WAPBL_DEBUG_BUFBYTES */ 611 } 612 #endif /* WAPBL_DEBUG_PRINT */ 613 614 for (i = 0; i <= wl->wl_inohashmask; i++) { 615 struct wapbl_ino_head *wih; 616 struct wapbl_ino *wi; 617 618 wih = &wl->wl_inohash[i]; 619 while ((wi = LIST_FIRST(wih)) != NULL) { 620 LIST_REMOVE(wi, wi_hash); 621 pool_put(&wapbl_ino_pool, wi); 622 KASSERT(wl->wl_inohashcnt > 0); 623 wl->wl_inohashcnt--; 624 } 625 } 626 627 /* 628 * clean buffer list 629 */ 630 mutex_enter(&bufcache_lock); 631 mutex_enter(&wl->wl_mtx); 632 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 633 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 634 /* 635 * The buffer will be unlocked and 636 * removed from the transaction in brelse 637 */ 638 mutex_exit(&wl->wl_mtx); 639 brelsel(bp, 0); 640 mutex_enter(&wl->wl_mtx); 641 } 642 } 643 mutex_exit(&wl->wl_mtx); 644 mutex_exit(&bufcache_lock); 645 646 /* 647 * Remove references to this wl from wl_entries, free any which 648 * no longer have buffers, others will be freed in wapbl_biodone 649 * when they no longer have any buffers. 650 */ 651 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 652 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 653 /* XXX should we be accumulating wl_error_count 654 * and increasing reclaimable bytes ? */ 655 we->we_wapbl = NULL; 656 if (we->we_bufcount == 0) { 657 #ifdef WAPBL_DEBUG_BUFBYTES 658 KASSERT(we->we_unsynced_bufbytes == 0); 659 #endif 660 wapbl_free(we, sizeof(*we)); 661 } 662 } 663 664 /* Discard list of deallocs */ 665 wl->wl_dealloccnt = 0; 666 /* XXX should we clear wl_reserved_bytes? */ 667 668 KASSERT(wl->wl_bufbytes == 0); 669 KASSERT(wl->wl_bcount == 0); 670 KASSERT(wl->wl_bufcount == 0); 671 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 672 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 673 KASSERT(wl->wl_inohashcnt == 0); 674 675 rw_exit(&wl->wl_rwlock); 676 } 677 678 int 679 wapbl_stop(struct wapbl *wl, int force) 680 { 681 struct vnode *vp; 682 int error; 683 684 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 685 error = wapbl_flush(wl, 1); 686 if (error) { 687 if (force) 688 wapbl_discard(wl); 689 else 690 return error; 691 } 692 693 /* Unlinked inodes persist after a flush */ 694 if (wl->wl_inohashcnt) { 695 if (force) { 696 wapbl_discard(wl); 697 } else { 698 return EBUSY; 699 } 700 } 701 702 KASSERT(wl->wl_bufbytes == 0); 703 KASSERT(wl->wl_bcount == 0); 704 KASSERT(wl->wl_bufcount == 0); 705 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 706 KASSERT(wl->wl_dealloccnt == 0); 707 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 708 KASSERT(wl->wl_inohashcnt == 0); 709 710 vp = wl->wl_logvp; 711 712 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 713 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 714 wapbl_free(wl->wl_deallocblks, 715 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 716 wapbl_free(wl->wl_dealloclens, 717 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 718 wapbl_inodetrk_free(wl); 719 720 cv_destroy(&wl->wl_reclaimable_cv); 721 mutex_destroy(&wl->wl_mtx); 722 rw_destroy(&wl->wl_rwlock); 723 wapbl_free(wl, sizeof(*wl)); 724 725 return 0; 726 } 727 728 static int 729 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) 730 { 731 struct pstats *pstats = curlwp->l_proc->p_stats; 732 struct buf *bp; 733 int error; 734 735 KASSERT((flags & ~(B_WRITE | B_READ)) == 0); 736 KASSERT(devvp->v_type == VBLK); 737 738 if ((flags & (B_WRITE | B_READ)) == B_WRITE) { 739 mutex_enter(devvp->v_interlock); 740 devvp->v_numoutput++; 741 mutex_exit(devvp->v_interlock); 742 pstats->p_ru.ru_oublock++; 743 } else { 744 pstats->p_ru.ru_inblock++; 745 } 746 747 bp = getiobuf(devvp, true); 748 bp->b_flags = flags; 749 bp->b_cflags = BC_BUSY; /* silly & dubious */ 750 bp->b_dev = devvp->v_rdev; 751 bp->b_data = data; 752 bp->b_bufsize = bp->b_resid = bp->b_bcount = len; 753 bp->b_blkno = pbn; 754 755 WAPBL_PRINTF(WAPBL_PRINT_IO, 756 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n", 757 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, 758 bp->b_blkno, bp->b_dev)); 759 760 VOP_STRATEGY(devvp, bp); 761 762 error = biowait(bp); 763 putiobuf(bp); 764 765 if (error) { 766 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 767 ("wapbl_doio: %s %zu bytes at block %" PRId64 768 " on dev 0x%"PRIx64" failed with error %d\n", 769 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 770 "write" : "read"), 771 len, pbn, devvp->v_rdev, error)); 772 } 773 774 return error; 775 } 776 777 int 778 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 779 { 780 781 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 782 } 783 784 int 785 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 786 { 787 788 return wapbl_doio(data, len, devvp, pbn, B_READ); 789 } 790 791 /* 792 * Off is byte offset returns new offset for next write 793 * handles log wraparound 794 */ 795 static int 796 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) 797 { 798 size_t slen; 799 off_t off = *offp; 800 int error; 801 daddr_t pbn; 802 803 KDASSERT(((len >> wl->wl_log_dev_bshift) << 804 wl->wl_log_dev_bshift) == len); 805 806 if (off < wl->wl_circ_off) 807 off = wl->wl_circ_off; 808 slen = wl->wl_circ_off + wl->wl_circ_size - off; 809 if (slen < len) { 810 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 811 #ifdef _KERNEL 812 pbn = btodb(pbn << wl->wl_log_dev_bshift); 813 #endif 814 error = wapbl_write(data, slen, wl->wl_devvp, pbn); 815 if (error) 816 return error; 817 data = (uint8_t *)data + slen; 818 len -= slen; 819 off = wl->wl_circ_off; 820 } 821 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 822 #ifdef _KERNEL 823 pbn = btodb(pbn << wl->wl_log_dev_bshift); 824 #endif 825 error = wapbl_write(data, len, wl->wl_devvp, pbn); 826 if (error) 827 return error; 828 off += len; 829 if (off >= wl->wl_circ_off + wl->wl_circ_size) 830 off = wl->wl_circ_off; 831 *offp = off; 832 return 0; 833 } 834 835 /****************************************************************/ 836 837 int 838 wapbl_begin(struct wapbl *wl, const char *file, int line) 839 { 840 int doflush; 841 unsigned lockcount; 842 843 KDASSERT(wl); 844 845 /* 846 * XXX this needs to be made much more sophisticated. 847 * perhaps each wapbl_begin could reserve a specified 848 * number of buffers and bytes. 849 */ 850 mutex_enter(&wl->wl_mtx); 851 lockcount = wl->wl_lock_count; 852 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 853 wl->wl_bufbytes_max / 2) || 854 ((wl->wl_bufcount + (lockcount * 10)) > 855 wl->wl_bufcount_max / 2) || 856 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 857 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); 858 mutex_exit(&wl->wl_mtx); 859 860 if (doflush) { 861 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 862 ("force flush lockcnt=%d bufbytes=%zu " 863 "(max=%zu) bufcount=%zu (max=%zu) " 864 "dealloccnt %d (lim=%d)\n", 865 lockcount, wl->wl_bufbytes, 866 wl->wl_bufbytes_max, wl->wl_bufcount, 867 wl->wl_bufcount_max, 868 wl->wl_dealloccnt, wl->wl_dealloclim)); 869 } 870 871 if (doflush) { 872 int error = wapbl_flush(wl, 0); 873 if (error) 874 return error; 875 } 876 877 rw_enter(&wl->wl_rwlock, RW_READER); 878 mutex_enter(&wl->wl_mtx); 879 wl->wl_lock_count++; 880 mutex_exit(&wl->wl_mtx); 881 882 #if defined(WAPBL_DEBUG_PRINT) 883 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 884 ("wapbl_begin thread %d.%d with bufcount=%zu " 885 "bufbytes=%zu bcount=%zu at %s:%d\n", 886 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 887 wl->wl_bufbytes, wl->wl_bcount, file, line)); 888 #endif 889 890 return 0; 891 } 892 893 void 894 wapbl_end(struct wapbl *wl) 895 { 896 897 #if defined(WAPBL_DEBUG_PRINT) 898 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 899 ("wapbl_end thread %d.%d with bufcount=%zu " 900 "bufbytes=%zu bcount=%zu\n", 901 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 902 wl->wl_bufbytes, wl->wl_bcount)); 903 #endif 904 905 #ifdef DIAGNOSTIC 906 size_t flushsize = wapbl_transaction_len(wl); 907 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 908 /* 909 * XXX this could be handled more gracefully, perhaps place 910 * only a partial transaction in the log and allow the 911 * remaining to flush without the protection of the journal. 912 */ 913 panic("wapbl_end: current transaction too big to flush\n"); 914 } 915 #endif 916 917 mutex_enter(&wl->wl_mtx); 918 KASSERT(wl->wl_lock_count > 0); 919 wl->wl_lock_count--; 920 mutex_exit(&wl->wl_mtx); 921 922 rw_exit(&wl->wl_rwlock); 923 } 924 925 void 926 wapbl_add_buf(struct wapbl *wl, struct buf * bp) 927 { 928 929 KASSERT(bp->b_cflags & BC_BUSY); 930 KASSERT(bp->b_vp); 931 932 wapbl_jlock_assert(wl); 933 934 #if 0 935 /* 936 * XXX this might be an issue for swapfiles. 937 * see uvm_swap.c:1702 938 * 939 * XXX2 why require it then? leap of semantics? 940 */ 941 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 942 #endif 943 944 mutex_enter(&wl->wl_mtx); 945 if (bp->b_flags & B_LOCKED) { 946 LIST_REMOVE(bp, b_wapbllist); 947 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 948 ("wapbl_add_buf thread %d.%d re-adding buf %p " 949 "with %d bytes %d bcount\n", 950 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 951 bp->b_bcount)); 952 } else { 953 /* unlocked by dirty buffers shouldn't exist */ 954 KASSERT(!(bp->b_oflags & BO_DELWRI)); 955 wl->wl_bufbytes += bp->b_bufsize; 956 wl->wl_bcount += bp->b_bcount; 957 wl->wl_bufcount++; 958 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 959 ("wapbl_add_buf thread %d.%d adding buf %p " 960 "with %d bytes %d bcount\n", 961 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 962 bp->b_bcount)); 963 } 964 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 965 mutex_exit(&wl->wl_mtx); 966 967 bp->b_flags |= B_LOCKED; 968 } 969 970 static void 971 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 972 { 973 974 KASSERT(mutex_owned(&wl->wl_mtx)); 975 KASSERT(bp->b_cflags & BC_BUSY); 976 wapbl_jlock_assert(wl); 977 978 #if 0 979 /* 980 * XXX this might be an issue for swapfiles. 981 * see uvm_swap.c:1725 982 * 983 * XXXdeux: see above 984 */ 985 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 986 #endif 987 KASSERT(bp->b_flags & B_LOCKED); 988 989 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 990 ("wapbl_remove_buf thread %d.%d removing buf %p with " 991 "%d bytes %d bcount\n", 992 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 993 994 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 995 wl->wl_bufbytes -= bp->b_bufsize; 996 KASSERT(wl->wl_bcount >= bp->b_bcount); 997 wl->wl_bcount -= bp->b_bcount; 998 KASSERT(wl->wl_bufcount > 0); 999 wl->wl_bufcount--; 1000 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1001 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1002 LIST_REMOVE(bp, b_wapbllist); 1003 1004 bp->b_flags &= ~B_LOCKED; 1005 } 1006 1007 /* called from brelsel() in vfs_bio among other places */ 1008 void 1009 wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 1010 { 1011 1012 mutex_enter(&wl->wl_mtx); 1013 wapbl_remove_buf_locked(wl, bp); 1014 mutex_exit(&wl->wl_mtx); 1015 } 1016 1017 void 1018 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 1019 { 1020 1021 KASSERT(bp->b_cflags & BC_BUSY); 1022 1023 /* 1024 * XXX: why does this depend on B_LOCKED? otherwise the buf 1025 * is not for a transaction? if so, why is this called in the 1026 * first place? 1027 */ 1028 if (bp->b_flags & B_LOCKED) { 1029 mutex_enter(&wl->wl_mtx); 1030 wl->wl_bufbytes += bp->b_bufsize - oldsz; 1031 wl->wl_bcount += bp->b_bcount - oldcnt; 1032 mutex_exit(&wl->wl_mtx); 1033 } 1034 } 1035 1036 #endif /* _KERNEL */ 1037 1038 /****************************************************************/ 1039 /* Some utility inlines */ 1040 1041 /* This is used to advance the pointer at old to new value at old+delta */ 1042 static inline off_t 1043 wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 1044 { 1045 off_t new; 1046 1047 /* Define acceptable ranges for inputs. */ 1048 KASSERT(delta <= (size_t)size); 1049 KASSERT((old == 0) || ((size_t)old >= off)); 1050 KASSERT(old < (off_t)(size + off)); 1051 1052 if ((old == 0) && (delta != 0)) 1053 new = off + delta; 1054 else if ((old + delta) < (size + off)) 1055 new = old + delta; 1056 else 1057 new = (old + delta) - size; 1058 1059 /* Note some interesting axioms */ 1060 KASSERT((delta != 0) || (new == old)); 1061 KASSERT((delta == 0) || (new != 0)); 1062 KASSERT((delta != (size)) || (new == old)); 1063 1064 /* Define acceptable ranges for output. */ 1065 KASSERT((new == 0) || ((size_t)new >= off)); 1066 KASSERT((size_t)new < (size + off)); 1067 return new; 1068 } 1069 1070 static inline size_t 1071 wapbl_space_used(size_t avail, off_t head, off_t tail) 1072 { 1073 1074 if (tail == 0) { 1075 KASSERT(head == 0); 1076 return 0; 1077 } 1078 return ((head + (avail - 1) - tail) % avail) + 1; 1079 } 1080 1081 static inline size_t 1082 wapbl_space_free(size_t avail, off_t head, off_t tail) 1083 { 1084 1085 return avail - wapbl_space_used(avail, head, tail); 1086 } 1087 1088 static inline void 1089 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, 1090 off_t *tailp) 1091 { 1092 off_t head = *headp; 1093 off_t tail = *tailp; 1094 1095 KASSERT(delta <= wapbl_space_free(size, head, tail)); 1096 head = wapbl_advance(size, off, head, delta); 1097 if ((tail == 0) && (head != 0)) 1098 tail = off; 1099 *headp = head; 1100 *tailp = tail; 1101 } 1102 1103 static inline void 1104 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, 1105 off_t *tailp) 1106 { 1107 off_t head = *headp; 1108 off_t tail = *tailp; 1109 1110 KASSERT(delta <= wapbl_space_used(size, head, tail)); 1111 tail = wapbl_advance(size, off, tail, delta); 1112 if (head == tail) { 1113 head = tail = 0; 1114 } 1115 *headp = head; 1116 *tailp = tail; 1117 } 1118 1119 #ifdef _KERNEL 1120 1121 /****************************************************************/ 1122 1123 /* 1124 * Remove transactions whose buffers are completely flushed to disk. 1125 * Will block until at least minfree space is available. 1126 * only intended to be called from inside wapbl_flush and therefore 1127 * does not protect against commit races with itself or with flush. 1128 */ 1129 static int 1130 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) 1131 { 1132 size_t delta; 1133 size_t avail; 1134 off_t head; 1135 off_t tail; 1136 int error = 0; 1137 1138 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1139 KASSERT(rw_write_held(&wl->wl_rwlock)); 1140 1141 mutex_enter(&wl->wl_mtx); 1142 1143 /* 1144 * First check to see if we have to do a commit 1145 * at all. 1146 */ 1147 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1148 if (minfree < avail) { 1149 mutex_exit(&wl->wl_mtx); 1150 return 0; 1151 } 1152 minfree -= avail; 1153 while ((wl->wl_error_count == 0) && 1154 (wl->wl_reclaimable_bytes < minfree)) { 1155 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1156 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1157 "minfree=%zd\n", 1158 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1159 minfree)); 1160 1161 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1162 } 1163 if (wl->wl_reclaimable_bytes < minfree) { 1164 KASSERT(wl->wl_error_count); 1165 /* XXX maybe get actual error from buffer instead someday? */ 1166 error = EIO; 1167 } 1168 head = wl->wl_head; 1169 tail = wl->wl_tail; 1170 delta = wl->wl_reclaimable_bytes; 1171 1172 /* If all of of the entries are flushed, then be sure to keep 1173 * the reserved bytes reserved. Watch out for discarded transactions, 1174 * which could leave more bytes reserved than are reclaimable. 1175 */ 1176 if (SIMPLEQ_EMPTY(&wl->wl_entries) && 1177 (delta >= wl->wl_reserved_bytes)) { 1178 delta -= wl->wl_reserved_bytes; 1179 } 1180 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1181 &tail); 1182 KDASSERT(wl->wl_reserved_bytes <= 1183 wapbl_space_used(wl->wl_circ_size, head, tail)); 1184 mutex_exit(&wl->wl_mtx); 1185 1186 if (error) 1187 return error; 1188 1189 if (waitonly) 1190 return 0; 1191 1192 /* 1193 * This is where head, tail and delta are unprotected 1194 * from races against itself or flush. This is ok since 1195 * we only call this routine from inside flush itself. 1196 * 1197 * XXX: how can it race against itself when accessed only 1198 * from behind the write-locked rwlock? 1199 */ 1200 error = wapbl_write_commit(wl, head, tail); 1201 if (error) 1202 return error; 1203 1204 wl->wl_head = head; 1205 wl->wl_tail = tail; 1206 1207 mutex_enter(&wl->wl_mtx); 1208 KASSERT(wl->wl_reclaimable_bytes >= delta); 1209 wl->wl_reclaimable_bytes -= delta; 1210 mutex_exit(&wl->wl_mtx); 1211 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1212 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1213 curproc->p_pid, curlwp->l_lid, delta)); 1214 1215 return 0; 1216 } 1217 1218 /****************************************************************/ 1219 1220 void 1221 wapbl_biodone(struct buf *bp) 1222 { 1223 struct wapbl_entry *we = bp->b_private; 1224 struct wapbl *wl = we->we_wapbl; 1225 1226 /* 1227 * Handle possible flushing of buffers after log has been 1228 * decomissioned. 1229 */ 1230 if (!wl) { 1231 KASSERT(we->we_bufcount > 0); 1232 we->we_bufcount--; 1233 #ifdef WAPBL_DEBUG_BUFBYTES 1234 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1235 we->we_unsynced_bufbytes -= bp->b_bufsize; 1236 #endif 1237 1238 if (we->we_bufcount == 0) { 1239 #ifdef WAPBL_DEBUG_BUFBYTES 1240 KASSERT(we->we_unsynced_bufbytes == 0); 1241 #endif 1242 wapbl_free(we, sizeof(*we)); 1243 } 1244 1245 brelse(bp, 0); 1246 return; 1247 } 1248 1249 #ifdef ohbother 1250 KDASSERT(bp->b_oflags & BO_DONE); 1251 KDASSERT(!(bp->b_oflags & BO_DELWRI)); 1252 KDASSERT(bp->b_flags & B_ASYNC); 1253 KDASSERT(bp->b_cflags & BC_BUSY); 1254 KDASSERT(!(bp->b_flags & B_LOCKED)); 1255 KDASSERT(!(bp->b_flags & B_READ)); 1256 KDASSERT(!(bp->b_cflags & BC_INVAL)); 1257 KDASSERT(!(bp->b_cflags & BC_NOCACHE)); 1258 #endif 1259 1260 if (bp->b_error) { 1261 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */ 1262 /* 1263 * XXXpooka: interfaces not fully updated 1264 * Note: this was not enabled in the original patch 1265 * against netbsd4 either. I don't know if comment 1266 * above is true or not. 1267 */ 1268 1269 /* 1270 * If an error occurs, report the error and leave the 1271 * buffer as a delayed write on the LRU queue. 1272 * restarting the write would likely result in 1273 * an error spinloop, so let it be done harmlessly 1274 * by the syncer. 1275 */ 1276 bp->b_flags &= ~(B_DONE); 1277 simple_unlock(&bp->b_interlock); 1278 1279 if (we->we_error == 0) { 1280 mutex_enter(&wl->wl_mtx); 1281 wl->wl_error_count++; 1282 mutex_exit(&wl->wl_mtx); 1283 cv_broadcast(&wl->wl_reclaimable_cv); 1284 } 1285 we->we_error = bp->b_error; 1286 bp->b_error = 0; 1287 brelse(bp); 1288 return; 1289 #else 1290 /* For now, just mark the log permanently errored out */ 1291 1292 mutex_enter(&wl->wl_mtx); 1293 if (wl->wl_error_count == 0) { 1294 wl->wl_error_count++; 1295 cv_broadcast(&wl->wl_reclaimable_cv); 1296 } 1297 mutex_exit(&wl->wl_mtx); 1298 #endif 1299 } 1300 1301 mutex_enter(&wl->wl_mtx); 1302 1303 KASSERT(we->we_bufcount > 0); 1304 we->we_bufcount--; 1305 #ifdef WAPBL_DEBUG_BUFBYTES 1306 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1307 we->we_unsynced_bufbytes -= bp->b_bufsize; 1308 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); 1309 wl->wl_unsynced_bufbytes -= bp->b_bufsize; 1310 #endif 1311 1312 /* 1313 * If the current transaction can be reclaimed, start 1314 * at the beginning and reclaim any consecutive reclaimable 1315 * transactions. If we successfully reclaim anything, 1316 * then wakeup anyone waiting for the reclaim. 1317 */ 1318 if (we->we_bufcount == 0) { 1319 size_t delta = 0; 1320 int errcnt = 0; 1321 #ifdef WAPBL_DEBUG_BUFBYTES 1322 KDASSERT(we->we_unsynced_bufbytes == 0); 1323 #endif 1324 /* 1325 * clear any posted error, since the buffer it came from 1326 * has successfully flushed by now 1327 */ 1328 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1329 (we->we_bufcount == 0)) { 1330 delta += we->we_reclaimable_bytes; 1331 if (we->we_error) 1332 errcnt++; 1333 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1334 wapbl_free(we, sizeof(*we)); 1335 } 1336 1337 if (delta) { 1338 wl->wl_reclaimable_bytes += delta; 1339 KASSERT(wl->wl_error_count >= errcnt); 1340 wl->wl_error_count -= errcnt; 1341 cv_broadcast(&wl->wl_reclaimable_cv); 1342 } 1343 } 1344 1345 mutex_exit(&wl->wl_mtx); 1346 brelse(bp, 0); 1347 } 1348 1349 /* 1350 * Write transactions to disk + start I/O for contents 1351 */ 1352 int 1353 wapbl_flush(struct wapbl *wl, int waitfor) 1354 { 1355 struct buf *bp; 1356 struct wapbl_entry *we; 1357 off_t off; 1358 off_t head; 1359 off_t tail; 1360 size_t delta = 0; 1361 size_t flushsize; 1362 size_t reserved; 1363 int error = 0; 1364 1365 /* 1366 * Do a quick check to see if a full flush can be skipped 1367 * This assumes that the flush callback does not need to be called 1368 * unless there are other outstanding bufs. 1369 */ 1370 if (!waitfor) { 1371 size_t nbufs; 1372 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1373 protect the KASSERTS */ 1374 nbufs = wl->wl_bufcount; 1375 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1376 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1377 mutex_exit(&wl->wl_mtx); 1378 if (nbufs == 0) 1379 return 0; 1380 } 1381 1382 /* 1383 * XXX we may consider using LK_UPGRADE here 1384 * if we want to call flush from inside a transaction 1385 */ 1386 rw_enter(&wl->wl_rwlock, RW_WRITER); 1387 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1388 wl->wl_dealloccnt); 1389 1390 /* 1391 * Now that we are fully locked and flushed, 1392 * do another check for nothing to do. 1393 */ 1394 if (wl->wl_bufcount == 0) { 1395 goto out; 1396 } 1397 1398 #if 0 1399 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1400 ("wapbl_flush thread %d.%d flushing entries with " 1401 "bufcount=%zu bufbytes=%zu\n", 1402 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1403 wl->wl_bufbytes)); 1404 #endif 1405 1406 /* Calculate amount of space needed to flush */ 1407 flushsize = wapbl_transaction_len(wl); 1408 if (wapbl_verbose_commit) { 1409 struct timespec ts; 1410 getnanotime(&ts); 1411 printf("%s: %lld.%09ld this transaction = %zu bytes\n", 1412 __func__, (long long)ts.tv_sec, 1413 (long)ts.tv_nsec, flushsize); 1414 } 1415 1416 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1417 /* 1418 * XXX this could be handled more gracefully, perhaps place 1419 * only a partial transaction in the log and allow the 1420 * remaining to flush without the protection of the journal. 1421 */ 1422 panic("wapbl_flush: current transaction too big to flush\n"); 1423 } 1424 1425 error = wapbl_truncate(wl, flushsize, 0); 1426 if (error) 1427 goto out2; 1428 1429 off = wl->wl_head; 1430 KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 1431 (off < wl->wl_circ_off + wl->wl_circ_size))); 1432 error = wapbl_write_blocks(wl, &off); 1433 if (error) 1434 goto out2; 1435 error = wapbl_write_revocations(wl, &off); 1436 if (error) 1437 goto out2; 1438 error = wapbl_write_inodes(wl, &off); 1439 if (error) 1440 goto out2; 1441 1442 reserved = 0; 1443 if (wl->wl_inohashcnt) 1444 reserved = wapbl_transaction_inodes_len(wl); 1445 1446 head = wl->wl_head; 1447 tail = wl->wl_tail; 1448 1449 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1450 &head, &tail); 1451 #ifdef WAPBL_DEBUG 1452 if (head != off) { 1453 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1454 " off=%"PRIdMAX" flush=%zu\n", 1455 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1456 flushsize); 1457 } 1458 #else 1459 KASSERT(head == off); 1460 #endif 1461 1462 /* Opportunistically move the tail forward if we can */ 1463 if (!wapbl_lazy_truncate) { 1464 mutex_enter(&wl->wl_mtx); 1465 delta = wl->wl_reclaimable_bytes; 1466 mutex_exit(&wl->wl_mtx); 1467 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1468 &head, &tail); 1469 } 1470 1471 error = wapbl_write_commit(wl, head, tail); 1472 if (error) 1473 goto out2; 1474 1475 we = wapbl_calloc(1, sizeof(*we)); 1476 1477 #ifdef WAPBL_DEBUG_BUFBYTES 1478 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1479 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1480 " unsynced=%zu" 1481 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1482 "inodes=%d\n", 1483 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1484 wapbl_space_used(wl->wl_circ_size, head, tail), 1485 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1486 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1487 wl->wl_inohashcnt)); 1488 #else 1489 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1490 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1491 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1492 "inodes=%d\n", 1493 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1494 wapbl_space_used(wl->wl_circ_size, head, tail), 1495 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1496 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1497 #endif 1498 1499 1500 mutex_enter(&bufcache_lock); 1501 mutex_enter(&wl->wl_mtx); 1502 1503 wl->wl_reserved_bytes = reserved; 1504 wl->wl_head = head; 1505 wl->wl_tail = tail; 1506 KASSERT(wl->wl_reclaimable_bytes >= delta); 1507 wl->wl_reclaimable_bytes -= delta; 1508 wl->wl_dealloccnt = 0; 1509 #ifdef WAPBL_DEBUG_BUFBYTES 1510 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1511 #endif 1512 1513 we->we_wapbl = wl; 1514 we->we_bufcount = wl->wl_bufcount; 1515 #ifdef WAPBL_DEBUG_BUFBYTES 1516 we->we_unsynced_bufbytes = wl->wl_bufbytes; 1517 #endif 1518 we->we_reclaimable_bytes = flushsize; 1519 we->we_error = 0; 1520 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); 1521 1522 /* 1523 * this flushes bufs in reverse order than they were queued 1524 * it shouldn't matter, but if we care we could use TAILQ instead. 1525 * XXX Note they will get put on the lru queue when they flush 1526 * so we might actually want to change this to preserve order. 1527 */ 1528 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1529 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1530 continue; 1531 } 1532 bp->b_iodone = wapbl_biodone; 1533 bp->b_private = we; 1534 bremfree(bp); 1535 wapbl_remove_buf_locked(wl, bp); 1536 mutex_exit(&wl->wl_mtx); 1537 mutex_exit(&bufcache_lock); 1538 bawrite(bp); 1539 mutex_enter(&bufcache_lock); 1540 mutex_enter(&wl->wl_mtx); 1541 } 1542 mutex_exit(&wl->wl_mtx); 1543 mutex_exit(&bufcache_lock); 1544 1545 #if 0 1546 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1547 ("wapbl_flush thread %d.%d done flushing entries...\n", 1548 curproc->p_pid, curlwp->l_lid)); 1549 #endif 1550 1551 out: 1552 1553 /* 1554 * If the waitfor flag is set, don't return until everything is 1555 * fully flushed and the on disk log is empty. 1556 */ 1557 if (waitfor) { 1558 error = wapbl_truncate(wl, wl->wl_circ_size - 1559 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1560 } 1561 1562 out2: 1563 if (error) { 1564 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1565 wl->wl_dealloclens, wl->wl_dealloccnt); 1566 } 1567 1568 #ifdef WAPBL_DEBUG_PRINT 1569 if (error) { 1570 pid_t pid = -1; 1571 lwpid_t lid = -1; 1572 if (curproc) 1573 pid = curproc->p_pid; 1574 if (curlwp) 1575 lid = curlwp->l_lid; 1576 mutex_enter(&wl->wl_mtx); 1577 #ifdef WAPBL_DEBUG_BUFBYTES 1578 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1579 ("wapbl_flush: thread %d.%d aborted flush: " 1580 "error = %d\n" 1581 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1582 "deallocs=%d inodes=%d\n" 1583 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1584 "unsynced=%zu\n", 1585 pid, lid, error, wl->wl_bufcount, 1586 wl->wl_bufbytes, wl->wl_bcount, 1587 wl->wl_dealloccnt, wl->wl_inohashcnt, 1588 wl->wl_error_count, wl->wl_reclaimable_bytes, 1589 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1590 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1591 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1592 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1593 "error = %d, unsynced = %zu\n", 1594 we->we_bufcount, we->we_reclaimable_bytes, 1595 we->we_error, we->we_unsynced_bufbytes)); 1596 } 1597 #else 1598 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1599 ("wapbl_flush: thread %d.%d aborted flush: " 1600 "error = %d\n" 1601 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1602 "deallocs=%d inodes=%d\n" 1603 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1604 pid, lid, error, wl->wl_bufcount, 1605 wl->wl_bufbytes, wl->wl_bcount, 1606 wl->wl_dealloccnt, wl->wl_inohashcnt, 1607 wl->wl_error_count, wl->wl_reclaimable_bytes, 1608 wl->wl_reserved_bytes)); 1609 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1610 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1611 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1612 "error = %d\n", we->we_bufcount, 1613 we->we_reclaimable_bytes, we->we_error)); 1614 } 1615 #endif 1616 mutex_exit(&wl->wl_mtx); 1617 } 1618 #endif 1619 1620 rw_exit(&wl->wl_rwlock); 1621 return error; 1622 } 1623 1624 /****************************************************************/ 1625 1626 void 1627 wapbl_jlock_assert(struct wapbl *wl) 1628 { 1629 1630 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1631 } 1632 1633 void 1634 wapbl_junlock_assert(struct wapbl *wl) 1635 { 1636 1637 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1638 } 1639 1640 /****************************************************************/ 1641 1642 /* locks missing */ 1643 void 1644 wapbl_print(struct wapbl *wl, 1645 int full, 1646 void (*pr)(const char *, ...)) 1647 { 1648 struct buf *bp; 1649 struct wapbl_entry *we; 1650 (*pr)("wapbl %p", wl); 1651 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1652 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1653 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1654 wl->wl_circ_size, wl->wl_circ_off, 1655 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1656 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1657 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1658 #ifdef WAPBL_DEBUG_BUFBYTES 1659 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1660 "reserved = %zu errcnt = %d unsynced = %zu\n", 1661 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1662 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1663 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1664 #else 1665 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1666 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1667 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1668 wl->wl_error_count); 1669 #endif 1670 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1671 wl->wl_dealloccnt, wl->wl_dealloclim); 1672 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1673 wl->wl_inohashcnt, wl->wl_inohashmask); 1674 (*pr)("entries:\n"); 1675 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1676 #ifdef WAPBL_DEBUG_BUFBYTES 1677 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1678 "unsynced = %zu\n", 1679 we->we_bufcount, we->we_reclaimable_bytes, 1680 we->we_error, we->we_unsynced_bufbytes); 1681 #else 1682 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1683 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1684 #endif 1685 } 1686 if (full) { 1687 int cnt = 0; 1688 (*pr)("bufs ="); 1689 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1690 if (!LIST_NEXT(bp, b_wapbllist)) { 1691 (*pr)(" %p", bp); 1692 } else if ((++cnt % 6) == 0) { 1693 (*pr)(" %p,\n\t", bp); 1694 } else { 1695 (*pr)(" %p,", bp); 1696 } 1697 } 1698 (*pr)("\n"); 1699 1700 (*pr)("dealloced blks = "); 1701 { 1702 int i; 1703 cnt = 0; 1704 for (i = 0; i < wl->wl_dealloccnt; i++) { 1705 (*pr)(" %"PRId64":%d,", 1706 wl->wl_deallocblks[i], 1707 wl->wl_dealloclens[i]); 1708 if ((++cnt % 4) == 0) { 1709 (*pr)("\n\t"); 1710 } 1711 } 1712 } 1713 (*pr)("\n"); 1714 1715 (*pr)("registered inodes = "); 1716 { 1717 int i; 1718 cnt = 0; 1719 for (i = 0; i <= wl->wl_inohashmask; i++) { 1720 struct wapbl_ino_head *wih; 1721 struct wapbl_ino *wi; 1722 1723 wih = &wl->wl_inohash[i]; 1724 LIST_FOREACH(wi, wih, wi_hash) { 1725 if (wi->wi_ino == 0) 1726 continue; 1727 (*pr)(" %"PRId32"/0%06"PRIo32",", 1728 wi->wi_ino, wi->wi_mode); 1729 if ((++cnt % 4) == 0) { 1730 (*pr)("\n\t"); 1731 } 1732 } 1733 } 1734 (*pr)("\n"); 1735 } 1736 } 1737 } 1738 1739 #if defined(WAPBL_DEBUG) || defined(DDB) 1740 void 1741 wapbl_dump(struct wapbl *wl) 1742 { 1743 #if defined(WAPBL_DEBUG) 1744 if (!wl) 1745 wl = wapbl_debug_wl; 1746 #endif 1747 if (!wl) 1748 return; 1749 wapbl_print(wl, 1, printf); 1750 } 1751 #endif 1752 1753 /****************************************************************/ 1754 1755 void 1756 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) 1757 { 1758 1759 wapbl_jlock_assert(wl); 1760 1761 mutex_enter(&wl->wl_mtx); 1762 /* XXX should eventually instead tie this into resource estimation */ 1763 /* 1764 * XXX this panic needs locking/mutex analysis and the 1765 * ability to cope with the failure. 1766 */ 1767 /* XXX this XXX doesn't have enough XXX */ 1768 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) 1769 panic("wapbl_register_deallocation: out of resources"); 1770 1771 wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 1772 wl->wl_dealloclens[wl->wl_dealloccnt] = len; 1773 wl->wl_dealloccnt++; 1774 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1775 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); 1776 mutex_exit(&wl->wl_mtx); 1777 } 1778 1779 /****************************************************************/ 1780 1781 static void 1782 wapbl_inodetrk_init(struct wapbl *wl, u_int size) 1783 { 1784 1785 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 1786 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 1787 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 1788 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 1789 } 1790 } 1791 1792 static void 1793 wapbl_inodetrk_free(struct wapbl *wl) 1794 { 1795 1796 /* XXX this KASSERT needs locking/mutex analysis */ 1797 KASSERT(wl->wl_inohashcnt == 0); 1798 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); 1799 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 1800 pool_destroy(&wapbl_ino_pool); 1801 } 1802 } 1803 1804 static struct wapbl_ino * 1805 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 1806 { 1807 struct wapbl_ino_head *wih; 1808 struct wapbl_ino *wi; 1809 1810 KASSERT(mutex_owned(&wl->wl_mtx)); 1811 1812 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1813 LIST_FOREACH(wi, wih, wi_hash) { 1814 if (ino == wi->wi_ino) 1815 return wi; 1816 } 1817 return 0; 1818 } 1819 1820 void 1821 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1822 { 1823 struct wapbl_ino_head *wih; 1824 struct wapbl_ino *wi; 1825 1826 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 1827 1828 mutex_enter(&wl->wl_mtx); 1829 if (wapbl_inodetrk_get(wl, ino) == NULL) { 1830 wi->wi_ino = ino; 1831 wi->wi_mode = mode; 1832 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1833 LIST_INSERT_HEAD(wih, wi, wi_hash); 1834 wl->wl_inohashcnt++; 1835 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1836 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 1837 mutex_exit(&wl->wl_mtx); 1838 } else { 1839 mutex_exit(&wl->wl_mtx); 1840 pool_put(&wapbl_ino_pool, wi); 1841 } 1842 } 1843 1844 void 1845 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1846 { 1847 struct wapbl_ino *wi; 1848 1849 mutex_enter(&wl->wl_mtx); 1850 wi = wapbl_inodetrk_get(wl, ino); 1851 if (wi) { 1852 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1853 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 1854 KASSERT(wl->wl_inohashcnt > 0); 1855 wl->wl_inohashcnt--; 1856 LIST_REMOVE(wi, wi_hash); 1857 mutex_exit(&wl->wl_mtx); 1858 1859 pool_put(&wapbl_ino_pool, wi); 1860 } else { 1861 mutex_exit(&wl->wl_mtx); 1862 } 1863 } 1864 1865 /****************************************************************/ 1866 1867 static inline size_t 1868 wapbl_transaction_inodes_len(struct wapbl *wl) 1869 { 1870 int blocklen = 1<<wl->wl_log_dev_bshift; 1871 int iph; 1872 1873 /* Calculate number of inodes described in a inodelist header */ 1874 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 1875 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 1876 1877 KASSERT(iph > 0); 1878 1879 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen; 1880 } 1881 1882 1883 /* Calculate amount of space a transaction will take on disk */ 1884 static size_t 1885 wapbl_transaction_len(struct wapbl *wl) 1886 { 1887 int blocklen = 1<<wl->wl_log_dev_bshift; 1888 size_t len; 1889 int bph; 1890 1891 /* Calculate number of blocks described in a blocklist header */ 1892 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1893 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1894 1895 KASSERT(bph > 0); 1896 1897 len = wl->wl_bcount; 1898 len += howmany(wl->wl_bufcount, bph) * blocklen; 1899 len += howmany(wl->wl_dealloccnt, bph) * blocklen; 1900 len += wapbl_transaction_inodes_len(wl); 1901 1902 return len; 1903 } 1904 1905 /* 1906 * Perform commit operation 1907 * 1908 * Note that generation number incrementation needs to 1909 * be protected against racing with other invocations 1910 * of wapbl_commit. This is ok since this routine 1911 * is only invoked from wapbl_flush 1912 */ 1913 static int 1914 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) 1915 { 1916 struct wapbl_wc_header *wc = wl->wl_wc_header; 1917 struct timespec ts; 1918 int error; 1919 int force = 1; 1920 daddr_t pbn; 1921 1922 if (wapbl_flush_disk_cache) { 1923 /* XXX Calc checksum here, instead we do this for now */ 1924 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 1925 FWRITE, FSCRED); 1926 if (error) { 1927 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1928 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1929 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1930 } 1931 } 1932 1933 wc->wc_head = head; 1934 wc->wc_tail = tail; 1935 wc->wc_checksum = 0; 1936 wc->wc_version = 1; 1937 getnanotime(&ts); 1938 wc->wc_time = ts.tv_sec; 1939 wc->wc_timensec = ts.tv_nsec; 1940 1941 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 1942 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", 1943 (intmax_t)head, (intmax_t)tail)); 1944 1945 /* 1946 * XXX if generation will rollover, then first zero 1947 * over second commit header before trying to write both headers. 1948 */ 1949 1950 pbn = wl->wl_logpbn + (wc->wc_generation % 2); 1951 #ifdef _KERNEL 1952 pbn = btodb(pbn << wc->wc_log_dev_bshift); 1953 #endif 1954 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn); 1955 if (error) 1956 return error; 1957 1958 if (wapbl_flush_disk_cache) { 1959 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 1960 FWRITE, FSCRED); 1961 if (error) { 1962 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1963 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1964 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1965 } 1966 } 1967 1968 /* 1969 * If the generation number was zero, write it out a second time. 1970 * This handles initialization and generation number rollover 1971 */ 1972 if (wc->wc_generation++ == 0) { 1973 error = wapbl_write_commit(wl, head, tail); 1974 /* 1975 * This panic should be able to be removed if we do the 1976 * zero'ing mentioned above, and we are certain to roll 1977 * back generation number on failure. 1978 */ 1979 if (error) 1980 panic("wapbl_write_commit: error writing duplicate " 1981 "log header: %d\n", error); 1982 } 1983 return 0; 1984 } 1985 1986 /* Returns new offset value */ 1987 static int 1988 wapbl_write_blocks(struct wapbl *wl, off_t *offp) 1989 { 1990 struct wapbl_wc_blocklist *wc = 1991 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 1992 int blocklen = 1<<wl->wl_log_dev_bshift; 1993 int bph; 1994 struct buf *bp; 1995 off_t off = *offp; 1996 int error; 1997 size_t padding; 1998 1999 KASSERT(rw_write_held(&wl->wl_rwlock)); 2000 2001 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2002 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2003 2004 bp = LIST_FIRST(&wl->wl_bufs); 2005 2006 while (bp) { 2007 int cnt; 2008 struct buf *obp = bp; 2009 2010 KASSERT(bp->b_flags & B_LOCKED); 2011 2012 wc->wc_type = WAPBL_WC_BLOCKS; 2013 wc->wc_len = blocklen; 2014 wc->wc_blkcount = 0; 2015 while (bp && (wc->wc_blkcount < bph)) { 2016 /* 2017 * Make sure all the physical block numbers are up to 2018 * date. If this is not always true on a given 2019 * filesystem, then VOP_BMAP must be called. We 2020 * could call VOP_BMAP here, or else in the filesystem 2021 * specific flush callback, although neither of those 2022 * solutions allow us to take the vnode lock. If a 2023 * filesystem requires that we must take the vnode lock 2024 * to call VOP_BMAP, then we can probably do it in 2025 * bwrite when the vnode lock should already be held 2026 * by the invoking code. 2027 */ 2028 KASSERT((bp->b_vp->v_type == VBLK) || 2029 (bp->b_blkno != bp->b_lblkno)); 2030 KASSERT(bp->b_blkno > 0); 2031 2032 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 2033 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 2034 wc->wc_len += bp->b_bcount; 2035 wc->wc_blkcount++; 2036 bp = LIST_NEXT(bp, b_wapbllist); 2037 } 2038 if (wc->wc_len % blocklen != 0) { 2039 padding = blocklen - wc->wc_len % blocklen; 2040 wc->wc_len += padding; 2041 } else { 2042 padding = 0; 2043 } 2044 2045 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2046 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 2047 wc->wc_len, padding, (intmax_t)off)); 2048 2049 error = wapbl_circ_write(wl, wc, blocklen, &off); 2050 if (error) 2051 return error; 2052 bp = obp; 2053 cnt = 0; 2054 while (bp && (cnt++ < bph)) { 2055 error = wapbl_circ_write(wl, bp->b_data, 2056 bp->b_bcount, &off); 2057 if (error) 2058 return error; 2059 bp = LIST_NEXT(bp, b_wapbllist); 2060 } 2061 if (padding) { 2062 void *zero; 2063 2064 zero = wapbl_malloc(padding); 2065 memset(zero, 0, padding); 2066 error = wapbl_circ_write(wl, zero, padding, &off); 2067 wapbl_free(zero, padding); 2068 if (error) 2069 return error; 2070 } 2071 } 2072 *offp = off; 2073 return 0; 2074 } 2075 2076 static int 2077 wapbl_write_revocations(struct wapbl *wl, off_t *offp) 2078 { 2079 struct wapbl_wc_blocklist *wc = 2080 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2081 int i; 2082 int blocklen = 1<<wl->wl_log_dev_bshift; 2083 int bph; 2084 off_t off = *offp; 2085 int error; 2086 2087 if (wl->wl_dealloccnt == 0) 2088 return 0; 2089 2090 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2091 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2092 2093 i = 0; 2094 while (i < wl->wl_dealloccnt) { 2095 wc->wc_type = WAPBL_WC_REVOCATIONS; 2096 wc->wc_len = blocklen; 2097 wc->wc_blkcount = 0; 2098 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 2099 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 2100 wl->wl_deallocblks[i]; 2101 wc->wc_blocks[wc->wc_blkcount].wc_dlen = 2102 wl->wl_dealloclens[i]; 2103 wc->wc_blkcount++; 2104 i++; 2105 } 2106 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2107 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", 2108 wc->wc_len, (intmax_t)off)); 2109 error = wapbl_circ_write(wl, wc, blocklen, &off); 2110 if (error) 2111 return error; 2112 } 2113 *offp = off; 2114 return 0; 2115 } 2116 2117 static int 2118 wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2119 { 2120 struct wapbl_wc_inodelist *wc = 2121 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2122 int i; 2123 int blocklen = 1 << wl->wl_log_dev_bshift; 2124 off_t off = *offp; 2125 int error; 2126 2127 struct wapbl_ino_head *wih; 2128 struct wapbl_ino *wi; 2129 int iph; 2130 2131 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2132 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2133 2134 i = 0; 2135 wih = &wl->wl_inohash[0]; 2136 wi = 0; 2137 do { 2138 wc->wc_type = WAPBL_WC_INODES; 2139 wc->wc_len = blocklen; 2140 wc->wc_inocnt = 0; 2141 wc->wc_clear = (i == 0); 2142 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2143 while (!wi) { 2144 KASSERT((wih - &wl->wl_inohash[0]) 2145 <= wl->wl_inohashmask); 2146 wi = LIST_FIRST(wih++); 2147 } 2148 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2149 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2150 wc->wc_inocnt++; 2151 i++; 2152 wi = LIST_NEXT(wi, wi_hash); 2153 } 2154 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2155 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", 2156 wc->wc_len, (intmax_t)off)); 2157 error = wapbl_circ_write(wl, wc, blocklen, &off); 2158 if (error) 2159 return error; 2160 } while (i < wl->wl_inohashcnt); 2161 2162 *offp = off; 2163 return 0; 2164 } 2165 2166 #endif /* _KERNEL */ 2167 2168 /****************************************************************/ 2169 2170 struct wapbl_blk { 2171 LIST_ENTRY(wapbl_blk) wb_hash; 2172 daddr_t wb_blk; 2173 off_t wb_off; /* Offset of this block in the log */ 2174 }; 2175 #define WAPBL_BLKPOOL_MIN 83 2176 2177 static void 2178 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) 2179 { 2180 if (size < WAPBL_BLKPOOL_MIN) 2181 size = WAPBL_BLKPOOL_MIN; 2182 KASSERT(wr->wr_blkhash == 0); 2183 #ifdef _KERNEL 2184 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); 2185 #else /* ! _KERNEL */ 2186 /* Manually implement hashinit */ 2187 { 2188 unsigned long i, hashsize; 2189 for (hashsize = 1; hashsize < size; hashsize <<= 1) 2190 continue; 2191 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); 2192 for (i = 0; i < hashsize; i++) 2193 LIST_INIT(&wr->wr_blkhash[i]); 2194 wr->wr_blkhashmask = hashsize - 1; 2195 } 2196 #endif /* ! _KERNEL */ 2197 } 2198 2199 static void 2200 wapbl_blkhash_free(struct wapbl_replay *wr) 2201 { 2202 KASSERT(wr->wr_blkhashcnt == 0); 2203 #ifdef _KERNEL 2204 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); 2205 #else /* ! _KERNEL */ 2206 wapbl_free(wr->wr_blkhash, 2207 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash)); 2208 #endif /* ! _KERNEL */ 2209 } 2210 2211 static struct wapbl_blk * 2212 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) 2213 { 2214 struct wapbl_blk_head *wbh; 2215 struct wapbl_blk *wb; 2216 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2217 LIST_FOREACH(wb, wbh, wb_hash) { 2218 if (blk == wb->wb_blk) 2219 return wb; 2220 } 2221 return 0; 2222 } 2223 2224 static void 2225 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) 2226 { 2227 struct wapbl_blk_head *wbh; 2228 struct wapbl_blk *wb; 2229 wb = wapbl_blkhash_get(wr, blk); 2230 if (wb) { 2231 KASSERT(wb->wb_blk == blk); 2232 wb->wb_off = off; 2233 } else { 2234 wb = wapbl_malloc(sizeof(*wb)); 2235 wb->wb_blk = blk; 2236 wb->wb_off = off; 2237 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2238 LIST_INSERT_HEAD(wbh, wb, wb_hash); 2239 wr->wr_blkhashcnt++; 2240 } 2241 } 2242 2243 static void 2244 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) 2245 { 2246 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2247 if (wb) { 2248 KASSERT(wr->wr_blkhashcnt > 0); 2249 wr->wr_blkhashcnt--; 2250 LIST_REMOVE(wb, wb_hash); 2251 wapbl_free(wb, sizeof(*wb)); 2252 } 2253 } 2254 2255 static void 2256 wapbl_blkhash_clear(struct wapbl_replay *wr) 2257 { 2258 unsigned long i; 2259 for (i = 0; i <= wr->wr_blkhashmask; i++) { 2260 struct wapbl_blk *wb; 2261 2262 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { 2263 KASSERT(wr->wr_blkhashcnt > 0); 2264 wr->wr_blkhashcnt--; 2265 LIST_REMOVE(wb, wb_hash); 2266 wapbl_free(wb, sizeof(*wb)); 2267 } 2268 } 2269 KASSERT(wr->wr_blkhashcnt == 0); 2270 } 2271 2272 /****************************************************************/ 2273 2274 static int 2275 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) 2276 { 2277 size_t slen; 2278 off_t off = *offp; 2279 int error; 2280 daddr_t pbn; 2281 2282 KASSERT(((len >> wr->wr_log_dev_bshift) << 2283 wr->wr_log_dev_bshift) == len); 2284 2285 if (off < wr->wr_circ_off) 2286 off = wr->wr_circ_off; 2287 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2288 if (slen < len) { 2289 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2290 #ifdef _KERNEL 2291 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2292 #endif 2293 error = wapbl_read(data, slen, wr->wr_devvp, pbn); 2294 if (error) 2295 return error; 2296 data = (uint8_t *)data + slen; 2297 len -= slen; 2298 off = wr->wr_circ_off; 2299 } 2300 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2301 #ifdef _KERNEL 2302 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2303 #endif 2304 error = wapbl_read(data, len, wr->wr_devvp, pbn); 2305 if (error) 2306 return error; 2307 off += len; 2308 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2309 off = wr->wr_circ_off; 2310 *offp = off; 2311 return 0; 2312 } 2313 2314 static void 2315 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) 2316 { 2317 size_t slen; 2318 off_t off = *offp; 2319 2320 KASSERT(((len >> wr->wr_log_dev_bshift) << 2321 wr->wr_log_dev_bshift) == len); 2322 2323 if (off < wr->wr_circ_off) 2324 off = wr->wr_circ_off; 2325 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2326 if (slen < len) { 2327 len -= slen; 2328 off = wr->wr_circ_off; 2329 } 2330 off += len; 2331 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2332 off = wr->wr_circ_off; 2333 *offp = off; 2334 } 2335 2336 /****************************************************************/ 2337 2338 int 2339 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, 2340 daddr_t off, size_t count, size_t blksize) 2341 { 2342 struct wapbl_replay *wr; 2343 int error; 2344 struct vnode *devvp; 2345 daddr_t logpbn; 2346 uint8_t *scratch; 2347 struct wapbl_wc_header *wch; 2348 struct wapbl_wc_header *wch2; 2349 /* Use this until we read the actual log header */ 2350 int log_dev_bshift = ilog2(blksize); 2351 size_t used; 2352 daddr_t pbn; 2353 2354 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2355 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", 2356 vp, off, count, blksize)); 2357 2358 if (off < 0) 2359 return EINVAL; 2360 2361 if (blksize < DEV_BSIZE) 2362 return EINVAL; 2363 if (blksize % DEV_BSIZE) 2364 return EINVAL; 2365 2366 #ifdef _KERNEL 2367 #if 0 2368 /* XXX vp->v_size isn't reliably set for VBLK devices, 2369 * especially root. However, we might still want to verify 2370 * that the full load is readable */ 2371 if ((off + count) * blksize > vp->v_size) 2372 return EINVAL; 2373 #endif 2374 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { 2375 return error; 2376 } 2377 #else /* ! _KERNEL */ 2378 devvp = vp; 2379 logpbn = off; 2380 #endif /* ! _KERNEL */ 2381 2382 scratch = wapbl_malloc(MAXBSIZE); 2383 2384 pbn = logpbn; 2385 #ifdef _KERNEL 2386 pbn = btodb(pbn << log_dev_bshift); 2387 #endif 2388 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn); 2389 if (error) 2390 goto errout; 2391 2392 wch = (struct wapbl_wc_header *)scratch; 2393 wch2 = 2394 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift)); 2395 /* XXX verify checksums and magic numbers */ 2396 if (wch->wc_type != WAPBL_WC_HEADER) { 2397 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); 2398 error = EFTYPE; 2399 goto errout; 2400 } 2401 2402 if (wch2->wc_generation > wch->wc_generation) 2403 wch = wch2; 2404 2405 wr = wapbl_calloc(1, sizeof(*wr)); 2406 2407 wr->wr_logvp = vp; 2408 wr->wr_devvp = devvp; 2409 wr->wr_logpbn = logpbn; 2410 2411 wr->wr_scratch = scratch; 2412 2413 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift; 2414 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift; 2415 wr->wr_circ_off = wch->wc_circ_off; 2416 wr->wr_circ_size = wch->wc_circ_size; 2417 wr->wr_generation = wch->wc_generation; 2418 2419 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); 2420 2421 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2422 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 2423 " len=%"PRId64" used=%zu\n", 2424 wch->wc_head, wch->wc_tail, wch->wc_circ_off, 2425 wch->wc_circ_size, used)); 2426 2427 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); 2428 2429 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail); 2430 if (error) { 2431 wapbl_replay_stop(wr); 2432 wapbl_replay_free(wr); 2433 return error; 2434 } 2435 2436 *wrp = wr; 2437 return 0; 2438 2439 errout: 2440 wapbl_free(scratch, MAXBSIZE); 2441 return error; 2442 } 2443 2444 void 2445 wapbl_replay_stop(struct wapbl_replay *wr) 2446 { 2447 2448 if (!wapbl_replay_isopen(wr)) 2449 return; 2450 2451 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); 2452 2453 wapbl_free(wr->wr_scratch, MAXBSIZE); 2454 wr->wr_scratch = NULL; 2455 2456 wr->wr_logvp = NULL; 2457 2458 wapbl_blkhash_clear(wr); 2459 wapbl_blkhash_free(wr); 2460 } 2461 2462 void 2463 wapbl_replay_free(struct wapbl_replay *wr) 2464 { 2465 2466 KDASSERT(!wapbl_replay_isopen(wr)); 2467 2468 if (wr->wr_inodes) 2469 wapbl_free(wr->wr_inodes, 2470 wr->wr_inodescnt * sizeof(wr->wr_inodes[0])); 2471 wapbl_free(wr, sizeof(*wr)); 2472 } 2473 2474 #ifdef _KERNEL 2475 int 2476 wapbl_replay_isopen1(struct wapbl_replay *wr) 2477 { 2478 2479 return wapbl_replay_isopen(wr); 2480 } 2481 #endif 2482 2483 static void 2484 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) 2485 { 2486 struct wapbl_wc_blocklist *wc = 2487 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2488 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2489 int i, j, n; 2490 2491 for (i = 0; i < wc->wc_blkcount; i++) { 2492 /* 2493 * Enter each physical block into the hashtable independently. 2494 */ 2495 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2496 for (j = 0; j < n; j++) { 2497 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen), 2498 *offp); 2499 wapbl_circ_advance(wr, fsblklen, offp); 2500 } 2501 } 2502 } 2503 2504 static void 2505 wapbl_replay_process_revocations(struct wapbl_replay *wr) 2506 { 2507 struct wapbl_wc_blocklist *wc = 2508 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2509 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2510 int i, j, n; 2511 2512 for (i = 0; i < wc->wc_blkcount; i++) { 2513 /* 2514 * Remove any blocks found from the hashtable. 2515 */ 2516 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2517 for (j = 0; j < n; j++) 2518 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2519 } 2520 } 2521 2522 static void 2523 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) 2524 { 2525 struct wapbl_wc_inodelist *wc = 2526 (struct wapbl_wc_inodelist *)wr->wr_scratch; 2527 void *new_inodes; 2528 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]); 2529 2530 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0])); 2531 2532 /* 2533 * Keep track of where we found this so location won't be 2534 * overwritten. 2535 */ 2536 if (wc->wc_clear) { 2537 wr->wr_inodestail = oldoff; 2538 wr->wr_inodescnt = 0; 2539 if (wr->wr_inodes != NULL) { 2540 wapbl_free(wr->wr_inodes, oldsize); 2541 wr->wr_inodes = NULL; 2542 } 2543 } 2544 wr->wr_inodeshead = newoff; 2545 if (wc->wc_inocnt == 0) 2546 return; 2547 2548 new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) * 2549 sizeof(wr->wr_inodes[0])); 2550 if (wr->wr_inodes != NULL) { 2551 memcpy(new_inodes, wr->wr_inodes, oldsize); 2552 wapbl_free(wr->wr_inodes, oldsize); 2553 } 2554 wr->wr_inodes = new_inodes; 2555 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, 2556 wc->wc_inocnt * sizeof(wr->wr_inodes[0])); 2557 wr->wr_inodescnt += wc->wc_inocnt; 2558 } 2559 2560 static int 2561 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) 2562 { 2563 off_t off; 2564 int error; 2565 2566 int logblklen = 1 << wr->wr_log_dev_bshift; 2567 2568 wapbl_blkhash_clear(wr); 2569 2570 off = tail; 2571 while (off != head) { 2572 struct wapbl_wc_null *wcn; 2573 off_t saveoff = off; 2574 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2575 if (error) 2576 goto errout; 2577 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2578 switch (wcn->wc_type) { 2579 case WAPBL_WC_BLOCKS: 2580 wapbl_replay_process_blocks(wr, &off); 2581 break; 2582 2583 case WAPBL_WC_REVOCATIONS: 2584 wapbl_replay_process_revocations(wr); 2585 break; 2586 2587 case WAPBL_WC_INODES: 2588 wapbl_replay_process_inodes(wr, saveoff, off); 2589 break; 2590 2591 default: 2592 printf("Unrecognized wapbl type: 0x%08x\n", 2593 wcn->wc_type); 2594 error = EFTYPE; 2595 goto errout; 2596 } 2597 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2598 if (off != saveoff) { 2599 printf("wapbl_replay: corrupted records\n"); 2600 error = EFTYPE; 2601 goto errout; 2602 } 2603 } 2604 return 0; 2605 2606 errout: 2607 wapbl_blkhash_clear(wr); 2608 return error; 2609 } 2610 2611 #if 0 2612 int 2613 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) 2614 { 2615 off_t off; 2616 int mismatchcnt = 0; 2617 int logblklen = 1 << wr->wr_log_dev_bshift; 2618 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2619 void *scratch1 = wapbl_malloc(MAXBSIZE); 2620 void *scratch2 = wapbl_malloc(MAXBSIZE); 2621 int error = 0; 2622 2623 KDASSERT(wapbl_replay_isopen(wr)); 2624 2625 off = wch->wc_tail; 2626 while (off != wch->wc_head) { 2627 struct wapbl_wc_null *wcn; 2628 #ifdef DEBUG 2629 off_t saveoff = off; 2630 #endif 2631 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2632 if (error) 2633 goto out; 2634 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2635 switch (wcn->wc_type) { 2636 case WAPBL_WC_BLOCKS: 2637 { 2638 struct wapbl_wc_blocklist *wc = 2639 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2640 int i; 2641 for (i = 0; i < wc->wc_blkcount; i++) { 2642 int foundcnt = 0; 2643 int dirtycnt = 0; 2644 int j, n; 2645 /* 2646 * Check each physical block into the 2647 * hashtable independently 2648 */ 2649 n = wc->wc_blocks[i].wc_dlen >> 2650 wch->wc_fs_dev_bshift; 2651 for (j = 0; j < n; j++) { 2652 struct wapbl_blk *wb = 2653 wapbl_blkhash_get(wr, 2654 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2655 if (wb && (wb->wb_off == off)) { 2656 foundcnt++; 2657 error = 2658 wapbl_circ_read(wr, 2659 scratch1, fsblklen, 2660 &off); 2661 if (error) 2662 goto out; 2663 error = 2664 wapbl_read(scratch2, 2665 fsblklen, fsdevvp, 2666 wb->wb_blk); 2667 if (error) 2668 goto out; 2669 if (memcmp(scratch1, 2670 scratch2, 2671 fsblklen)) { 2672 printf( 2673 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", 2674 wb->wb_blk, (intmax_t)off); 2675 dirtycnt++; 2676 mismatchcnt++; 2677 } 2678 } else { 2679 wapbl_circ_advance(wr, 2680 fsblklen, &off); 2681 } 2682 } 2683 #if 0 2684 /* 2685 * If all of the blocks in an entry 2686 * are clean, then remove all of its 2687 * blocks from the hashtable since they 2688 * never will need replay. 2689 */ 2690 if ((foundcnt != 0) && 2691 (dirtycnt == 0)) { 2692 off = saveoff; 2693 wapbl_circ_advance(wr, 2694 logblklen, &off); 2695 for (j = 0; j < n; j++) { 2696 struct wapbl_blk *wb = 2697 wapbl_blkhash_get(wr, 2698 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2699 if (wb && 2700 (wb->wb_off == off)) { 2701 wapbl_blkhash_rem(wr, wb->wb_blk); 2702 } 2703 wapbl_circ_advance(wr, 2704 fsblklen, &off); 2705 } 2706 } 2707 #endif 2708 } 2709 } 2710 break; 2711 case WAPBL_WC_REVOCATIONS: 2712 case WAPBL_WC_INODES: 2713 break; 2714 default: 2715 KASSERT(0); 2716 } 2717 #ifdef DEBUG 2718 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2719 KASSERT(off == saveoff); 2720 #endif 2721 } 2722 out: 2723 wapbl_free(scratch1, MAXBSIZE); 2724 wapbl_free(scratch2, MAXBSIZE); 2725 if (!error && mismatchcnt) 2726 error = EFTYPE; 2727 return error; 2728 } 2729 #endif 2730 2731 int 2732 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) 2733 { 2734 struct wapbl_blk *wb; 2735 size_t i; 2736 off_t off; 2737 void *scratch; 2738 int error = 0; 2739 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2740 2741 KDASSERT(wapbl_replay_isopen(wr)); 2742 2743 scratch = wapbl_malloc(MAXBSIZE); 2744 2745 for (i = 0; i <= wr->wr_blkhashmask; ++i) { 2746 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { 2747 off = wb->wb_off; 2748 error = wapbl_circ_read(wr, scratch, fsblklen, &off); 2749 if (error) 2750 break; 2751 error = wapbl_write(scratch, fsblklen, fsdevvp, 2752 wb->wb_blk); 2753 if (error) 2754 break; 2755 } 2756 } 2757 2758 wapbl_free(scratch, MAXBSIZE); 2759 return error; 2760 } 2761 2762 int 2763 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) 2764 { 2765 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2766 2767 KDASSERT(wapbl_replay_isopen(wr)); 2768 KASSERT((len % fsblklen) == 0); 2769 2770 while (len != 0) { 2771 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2772 if (wb) 2773 return 1; 2774 len -= fsblklen; 2775 } 2776 return 0; 2777 } 2778 2779 int 2780 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) 2781 { 2782 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2783 2784 KDASSERT(wapbl_replay_isopen(wr)); 2785 2786 KASSERT((len % fsblklen) == 0); 2787 2788 while (len != 0) { 2789 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2790 if (wb) { 2791 off_t off = wb->wb_off; 2792 int error; 2793 error = wapbl_circ_read(wr, data, fsblklen, &off); 2794 if (error) 2795 return error; 2796 } 2797 data = (uint8_t *)data + fsblklen; 2798 len -= fsblklen; 2799 blk++; 2800 } 2801 return 0; 2802 } 2803 2804 #ifdef _KERNEL 2805 /* 2806 * This is not really a module now, but maybe on it's way to 2807 * being one some day. 2808 */ 2809 MODULE(MODULE_CLASS_VFS, wapbl, NULL); 2810 2811 static int 2812 wapbl_modcmd(modcmd_t cmd, void *arg) 2813 { 2814 2815 switch (cmd) { 2816 case MODULE_CMD_INIT: 2817 wapbl_init(); 2818 return 0; 2819 case MODULE_CMD_FINI: 2820 #ifdef notyet 2821 return wapbl_fini(true); 2822 #endif 2823 return EOPNOTSUPP; 2824 default: 2825 return ENOTTY; 2826 } 2827 } 2828 #endif /* _KERNEL */ 2829