1 /* $NetBSD: vfs_wapbl.c,v 1.64 2015/11/15 03:09:39 pgoyette Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * This implements file system independent write ahead filesystem logging. 34 */ 35 36 #define WAPBL_INTERNAL 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.64 2015/11/15 03:09:39 pgoyette Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/bitops.h> 43 44 #ifdef _KERNEL 45 #include <sys/param.h> 46 #include <sys/namei.h> 47 #include <sys/proc.h> 48 #include <sys/sysctl.h> 49 #include <sys/uio.h> 50 #include <sys/vnode.h> 51 #include <sys/file.h> 52 #include <sys/module.h> 53 #include <sys/resourcevar.h> 54 #include <sys/conf.h> 55 #include <sys/mount.h> 56 #include <sys/kernel.h> 57 #include <sys/kauth.h> 58 #include <sys/mutex.h> 59 #include <sys/atomic.h> 60 #include <sys/wapbl.h> 61 #include <sys/wapbl_replay.h> 62 63 #include <miscfs/specfs/specdev.h> 64 65 #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP) 66 #define wapbl_free(a, s) kmem_free((a), (s)) 67 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 68 69 static struct sysctllog *wapbl_sysctl; 70 static int wapbl_flush_disk_cache = 1; 71 static int wapbl_verbose_commit = 0; 72 73 static inline size_t wapbl_space_free(size_t, off_t, off_t); 74 75 #else /* !_KERNEL */ 76 #include <assert.h> 77 #include <errno.h> 78 #include <stdio.h> 79 #include <stdbool.h> 80 #include <stdlib.h> 81 #include <string.h> 82 83 #include <sys/time.h> 84 #include <sys/wapbl.h> 85 #include <sys/wapbl_replay.h> 86 87 #define KDASSERT(x) assert(x) 88 #define KASSERT(x) assert(x) 89 #define wapbl_alloc(s) malloc(s) 90 #define wapbl_free(a, s) free(a) 91 #define wapbl_calloc(n, s) calloc((n), (s)) 92 93 #endif /* !_KERNEL */ 94 95 /* 96 * INTERNAL DATA STRUCTURES 97 */ 98 99 /* 100 * This structure holds per-mount log information. 
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
				   device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
				   filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 * See assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l */
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why is this needed?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * wl_mtx must be held while accessing
	 * wl_bufcount, wl_bufbytes, wl_bufs, wl_head or wl_tail.
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r */
	wapbl_flush_fn_t wl_flush_abort;/* r */
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* lm:	address of block */
	int *wl_dealloclens;	/* lm:	size of block */
	int wl_dealloccnt;	/* lm:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ???
*/ 184 struct wapbl_ino_head *wl_inohash; 185 u_long wl_inohashmask; 186 int wl_inohashcnt; 187 188 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 189 accounting */ 190 191 u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ 192 daddr_t wl_buffer_dblk; /* l: buffer disk block address */ 193 size_t wl_buffer_used; /* l: buffer current use */ 194 }; 195 196 #ifdef WAPBL_DEBUG_PRINT 197 int wapbl_debug_print = WAPBL_DEBUG_PRINT; 198 #endif 199 200 /****************************************************************/ 201 #ifdef _KERNEL 202 203 #ifdef WAPBL_DEBUG 204 struct wapbl *wapbl_debug_wl; 205 #endif 206 207 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 208 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 209 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 210 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 211 #endif /* _KERNEL */ 212 213 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 214 215 static inline size_t wapbl_space_used(size_t avail, off_t head, 216 off_t tail); 217 218 #ifdef _KERNEL 219 220 static struct pool wapbl_entry_pool; 221 222 #define WAPBL_INODETRK_SIZE 83 223 static int wapbl_ino_pool_refcount; 224 static struct pool wapbl_ino_pool; 225 struct wapbl_ino { 226 LIST_ENTRY(wapbl_ino) wi_hash; 227 ino_t wi_ino; 228 mode_t wi_mode; 229 }; 230 231 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 232 static void wapbl_inodetrk_free(struct wapbl *wl); 233 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 234 235 static size_t wapbl_transaction_len(struct wapbl *wl); 236 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 237 238 #if 0 239 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 240 #endif 241 242 static int wapbl_replay_isopen1(struct wapbl_replay *); 243 244 /* 245 * This is useful for debugging. If set, the log will 246 * only be truncated when necessary. 
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);

	wapbl_sysctl_init();
}

static int
wapbl_fini(bool interface)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl **wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		     "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of the log device block size */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of two
	 * is the same as the log of the maximum power of two.  So we can do
	 * the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max down to the largest power-of-two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
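	/*
	 * Worked example (illustrative values, not from the original
	 * source): with PAGE_SHIFT = 12 and both bshifts = 9, the
	 * largest constraint is PAGE_SIZE (4096).  A starting value of
	 * 1234567 bytes becomes 1232896 (= 301 * 4096) after the
	 * PAGE_SHIFT round-trip, and the later shift pairs leave it
	 * unchanged, since any multiple of 4096 is already a multiple
	 * of 512.
	 */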

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;

	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * If there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
512 */ 513 if (wr && wr->wr_inodescnt) { 514 error = wapbl_start_flush_inodes(wl, wr); 515 if (error) 516 goto errout; 517 } 518 519 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 520 if (error) { 521 goto errout; 522 } 523 524 *wlp = wl; 525 #if defined(WAPBL_DEBUG) 526 wapbl_debug_wl = wl; 527 #endif 528 529 return 0; 530 errout: 531 wapbl_discard(wl); 532 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 533 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 534 wapbl_free(wl->wl_deallocblks, 535 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 536 wapbl_free(wl->wl_dealloclens, 537 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 538 wapbl_free(wl->wl_buffer, MAXPHYS); 539 wapbl_inodetrk_free(wl); 540 wapbl_free(wl, sizeof(*wl)); 541 542 return error; 543 } 544 545 /* 546 * Like wapbl_flush, only discards the transaction 547 * completely 548 */ 549 550 void 551 wapbl_discard(struct wapbl *wl) 552 { 553 struct wapbl_entry *we; 554 struct buf *bp; 555 int i; 556 557 /* 558 * XXX we may consider using upgrade here 559 * if we want to call flush from inside a transaction 560 */ 561 rw_enter(&wl->wl_rwlock, RW_WRITER); 562 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 563 wl->wl_dealloccnt); 564 565 #ifdef WAPBL_DEBUG_PRINT 566 { 567 pid_t pid = -1; 568 lwpid_t lid = -1; 569 if (curproc) 570 pid = curproc->p_pid; 571 if (curlwp) 572 lid = curlwp->l_lid; 573 #ifdef WAPBL_DEBUG_BUFBYTES 574 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 575 ("wapbl_discard: thread %d.%d discarding " 576 "transaction\n" 577 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 578 "deallocs=%d inodes=%d\n" 579 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 580 "unsynced=%zu\n", 581 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 582 wl->wl_bcount, wl->wl_dealloccnt, 583 wl->wl_inohashcnt, wl->wl_error_count, 584 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 585 wl->wl_unsynced_bufbytes)); 586 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 587 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 588 ("\tentry: bufcount = %zu, reclaimable = %zu, " 589 "error = %d, unsynced = %zu\n", 590 we->we_bufcount, we->we_reclaimable_bytes, 591 we->we_error, we->we_unsynced_bufbytes)); 592 } 593 #else /* !WAPBL_DEBUG_BUFBYTES */ 594 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 595 ("wapbl_discard: thread %d.%d discarding transaction\n" 596 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 597 "deallocs=%d inodes=%d\n" 598 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 599 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 600 wl->wl_bcount, wl->wl_dealloccnt, 601 wl->wl_inohashcnt, wl->wl_error_count, 602 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 603 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 604 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 605 ("\tentry: bufcount = %zu, reclaimable = %zu, " 606 "error = %d\n", 607 we->we_bufcount, we->we_reclaimable_bytes, 608 we->we_error)); 609 } 610 #endif /* !WAPBL_DEBUG_BUFBYTES */ 611 } 612 #endif /* WAPBL_DEBUG_PRINT */ 613 614 for (i = 0; i <= wl->wl_inohashmask; i++) { 615 struct wapbl_ino_head *wih; 616 struct wapbl_ino *wi; 617 618 wih = &wl->wl_inohash[i]; 619 while ((wi = LIST_FIRST(wih)) != NULL) { 620 LIST_REMOVE(wi, wi_hash); 621 pool_put(&wapbl_ino_pool, wi); 622 KASSERT(wl->wl_inohashcnt > 0); 623 wl->wl_inohashcnt--; 624 } 625 } 626 627 /* 628 * clean buffer list 629 */ 630 mutex_enter(&bufcache_lock); 631 mutex_enter(&wl->wl_mtx); 632 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 633 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 634 /* 635 * The buffer will be unlocked and 636 * 
removed from the transaction in brelse 637 */ 638 mutex_exit(&wl->wl_mtx); 639 brelsel(bp, 0); 640 mutex_enter(&wl->wl_mtx); 641 } 642 } 643 mutex_exit(&wl->wl_mtx); 644 mutex_exit(&bufcache_lock); 645 646 /* 647 * Remove references to this wl from wl_entries, free any which 648 * no longer have buffers, others will be freed in wapbl_biodone 649 * when they no longer have any buffers. 650 */ 651 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 652 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 653 /* XXX should we be accumulating wl_error_count 654 * and increasing reclaimable bytes ? */ 655 we->we_wapbl = NULL; 656 if (we->we_bufcount == 0) { 657 #ifdef WAPBL_DEBUG_BUFBYTES 658 KASSERT(we->we_unsynced_bufbytes == 0); 659 #endif 660 pool_put(&wapbl_entry_pool, we); 661 } 662 } 663 664 /* Discard list of deallocs */ 665 wl->wl_dealloccnt = 0; 666 /* XXX should we clear wl_reserved_bytes? */ 667 668 KASSERT(wl->wl_bufbytes == 0); 669 KASSERT(wl->wl_bcount == 0); 670 KASSERT(wl->wl_bufcount == 0); 671 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 672 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 673 KASSERT(wl->wl_inohashcnt == 0); 674 675 rw_exit(&wl->wl_rwlock); 676 } 677 678 int 679 wapbl_stop(struct wapbl *wl, int force) 680 { 681 int error; 682 683 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 684 error = wapbl_flush(wl, 1); 685 if (error) { 686 if (force) 687 wapbl_discard(wl); 688 else 689 return error; 690 } 691 692 /* Unlinked inodes persist after a flush */ 693 if (wl->wl_inohashcnt) { 694 if (force) { 695 wapbl_discard(wl); 696 } else { 697 return EBUSY; 698 } 699 } 700 701 KASSERT(wl->wl_bufbytes == 0); 702 KASSERT(wl->wl_bcount == 0); 703 KASSERT(wl->wl_bufcount == 0); 704 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 705 KASSERT(wl->wl_dealloccnt == 0); 706 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 707 KASSERT(wl->wl_inohashcnt == 0); 708 709 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 710 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 711 wapbl_free(wl->wl_deallocblks, 712 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 713 wapbl_free(wl->wl_dealloclens, 714 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 715 wapbl_free(wl->wl_buffer, MAXPHYS); 716 wapbl_inodetrk_free(wl); 717 718 cv_destroy(&wl->wl_reclaimable_cv); 719 mutex_destroy(&wl->wl_mtx); 720 rw_destroy(&wl->wl_rwlock); 721 wapbl_free(wl, sizeof(*wl)); 722 723 return 0; 724 } 725 726 static int 727 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) 728 { 729 struct pstats *pstats = curlwp->l_proc->p_stats; 730 struct buf *bp; 731 int error; 732 733 KASSERT((flags & ~(B_WRITE | B_READ)) == 0); 734 KASSERT(devvp->v_type == VBLK); 735 736 if ((flags & (B_WRITE | B_READ)) == B_WRITE) { 737 mutex_enter(devvp->v_interlock); 738 devvp->v_numoutput++; 739 mutex_exit(devvp->v_interlock); 740 pstats->p_ru.ru_oublock++; 741 } else { 742 pstats->p_ru.ru_inblock++; 743 } 744 745 bp = getiobuf(devvp, true); 746 bp->b_flags = flags; 747 bp->b_cflags = BC_BUSY; /* silly & dubious */ 748 bp->b_dev = devvp->v_rdev; 749 bp->b_data = data; 750 bp->b_bufsize = bp->b_resid = bp->b_bcount = len; 751 bp->b_blkno = pbn; 752 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 753 754 WAPBL_PRINTF(WAPBL_PRINT_IO, 755 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n", 756 BUF_ISWRITE(bp) ? 
"write" : "read", bp->b_bcount, 757 bp->b_blkno, bp->b_dev)); 758 759 VOP_STRATEGY(devvp, bp); 760 761 error = biowait(bp); 762 putiobuf(bp); 763 764 if (error) { 765 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 766 ("wapbl_doio: %s %zu bytes at block %" PRId64 767 " on dev 0x%"PRIx64" failed with error %d\n", 768 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 769 "write" : "read"), 770 len, pbn, devvp->v_rdev, error)); 771 } 772 773 return error; 774 } 775 776 int 777 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 778 { 779 780 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 781 } 782 783 int 784 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 785 { 786 787 return wapbl_doio(data, len, devvp, pbn, B_READ); 788 } 789 790 /* 791 * Flush buffered data if any. 792 */ 793 static int 794 wapbl_buffered_flush(struct wapbl *wl) 795 { 796 int error; 797 798 if (wl->wl_buffer_used == 0) 799 return 0; 800 801 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, 802 wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); 803 wl->wl_buffer_used = 0; 804 805 return error; 806 } 807 808 /* 809 * Write data to the log. 810 * Try to coalesce writes and emit MAXPHYS aligned blocks. 811 */ 812 static int 813 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn) 814 { 815 int error; 816 size_t resid; 817 818 /* 819 * If not adjacent to buffered data flush first. Disk block 820 * address is always valid for non-empty buffer. 821 */ 822 if (wl->wl_buffer_used > 0 && 823 pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) { 824 error = wapbl_buffered_flush(wl); 825 if (error) 826 return error; 827 } 828 /* 829 * If this write goes to an empty buffer we have to 830 * save the disk block address first. 831 */ 832 if (wl->wl_buffer_used == 0) 833 wl->wl_buffer_dblk = pbn; 834 /* 835 * Remaining space so this buffer ends on a MAXPHYS boundary. 836 * 837 * Cannot become less or equal zero as the buffer would have been 838 * flushed on the last call then. 

/*
 * off is a byte offset; returns the new offset for the next write.
 * Handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * Perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
923 */ 924 mutex_enter(&wl->wl_mtx); 925 lockcount = wl->wl_lock_count; 926 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 927 wl->wl_bufbytes_max / 2) || 928 ((wl->wl_bufcount + (lockcount * 10)) > 929 wl->wl_bufcount_max / 2) || 930 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 931 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); 932 mutex_exit(&wl->wl_mtx); 933 934 if (doflush) { 935 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 936 ("force flush lockcnt=%d bufbytes=%zu " 937 "(max=%zu) bufcount=%zu (max=%zu) " 938 "dealloccnt %d (lim=%d)\n", 939 lockcount, wl->wl_bufbytes, 940 wl->wl_bufbytes_max, wl->wl_bufcount, 941 wl->wl_bufcount_max, 942 wl->wl_dealloccnt, wl->wl_dealloclim)); 943 } 944 945 if (doflush) { 946 int error = wapbl_flush(wl, 0); 947 if (error) 948 return error; 949 } 950 951 rw_enter(&wl->wl_rwlock, RW_READER); 952 mutex_enter(&wl->wl_mtx); 953 wl->wl_lock_count++; 954 mutex_exit(&wl->wl_mtx); 955 956 #if defined(WAPBL_DEBUG_PRINT) 957 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 958 ("wapbl_begin thread %d.%d with bufcount=%zu " 959 "bufbytes=%zu bcount=%zu at %s:%d\n", 960 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 961 wl->wl_bufbytes, wl->wl_bcount, file, line)); 962 #endif 963 964 return 0; 965 } 966 967 void 968 wapbl_end(struct wapbl *wl) 969 { 970 971 #if defined(WAPBL_DEBUG_PRINT) 972 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 973 ("wapbl_end thread %d.%d with bufcount=%zu " 974 "bufbytes=%zu bcount=%zu\n", 975 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 976 wl->wl_bufbytes, wl->wl_bcount)); 977 #endif 978 979 #ifdef DIAGNOSTIC 980 size_t flushsize = wapbl_transaction_len(wl); 981 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 982 /* 983 * XXX this could be handled more gracefully, perhaps place 984 * only a partial transaction in the log and allow the 985 * remaining to flush without the protection of the journal. 986 */ 987 panic("wapbl_end: current transaction too big to flush\n"); 988 } 989 #endif 990 991 mutex_enter(&wl->wl_mtx); 992 KASSERT(wl->wl_lock_count > 0); 993 wl->wl_lock_count--; 994 mutex_exit(&wl->wl_mtx); 995 996 rw_exit(&wl->wl_rwlock); 997 } 998 999 void 1000 wapbl_add_buf(struct wapbl *wl, struct buf * bp) 1001 { 1002 1003 KASSERT(bp->b_cflags & BC_BUSY); 1004 KASSERT(bp->b_vp); 1005 1006 wapbl_jlock_assert(wl); 1007 1008 #if 0 1009 /* 1010 * XXX this might be an issue for swapfiles. 1011 * see uvm_swap.c:1702 1012 * 1013 * XXX2 why require it then? leap of semantics? 
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl *wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl *wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}
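/*
 * Worked example (illustrative): with avail = 100, head = 20 and
 * tail = 80 the used region wraps past the end of the circular area:
 * ((20 + 99 - 80) % 100) + 1 = 40 bytes used.  Without wraparound,
 * head = 80 and tail = 20 give ((80 + 99 - 20) % 100) + 1 = 60.
 * A full log (head == tail != 0) correctly yields avail itself.
 */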

#ifdef _KERNEL
/*
 * Advance the offset oldoff by delta within the circular log area
 * [off, off + size), wrapping around at the end.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0))
		newoff = off + delta;
	else if ((oldoff + delta) < (size + off))
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != size) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));
	return newoff;
}

static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}


/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/*
	 * If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded
	 * transactions, which could leave more bytes reserved than
	 * are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * Restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
1352 */ 1353 bp->b_flags &= ~(B_DONE); 1354 simple_unlock(&bp->b_interlock); 1355 1356 if (we->we_error == 0) { 1357 mutex_enter(&wl->wl_mtx); 1358 wl->wl_error_count++; 1359 mutex_exit(&wl->wl_mtx); 1360 cv_broadcast(&wl->wl_reclaimable_cv); 1361 } 1362 we->we_error = bp->b_error; 1363 bp->b_error = 0; 1364 brelse(bp); 1365 return; 1366 #else 1367 /* For now, just mark the log permanently errored out */ 1368 1369 mutex_enter(&wl->wl_mtx); 1370 if (wl->wl_error_count == 0) { 1371 wl->wl_error_count++; 1372 cv_broadcast(&wl->wl_reclaimable_cv); 1373 } 1374 mutex_exit(&wl->wl_mtx); 1375 #endif 1376 } 1377 1378 /* 1379 * Release the buffer here. wapbl_flush() may wait for the 1380 * log to become empty and we better unbusy the buffer before 1381 * wapbl_flush() returns. 1382 */ 1383 brelse(bp, 0); 1384 1385 mutex_enter(&wl->wl_mtx); 1386 1387 KASSERT(we->we_bufcount > 0); 1388 we->we_bufcount--; 1389 #ifdef WAPBL_DEBUG_BUFBYTES 1390 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1391 we->we_unsynced_bufbytes -= bufsize; 1392 KASSERT(wl->wl_unsynced_bufbytes >= bufsize); 1393 wl->wl_unsynced_bufbytes -= bufsize; 1394 #endif 1395 1396 /* 1397 * If the current transaction can be reclaimed, start 1398 * at the beginning and reclaim any consecutive reclaimable 1399 * transactions. If we successfully reclaim anything, 1400 * then wakeup anyone waiting for the reclaim. 1401 */ 1402 if (we->we_bufcount == 0) { 1403 size_t delta = 0; 1404 int errcnt = 0; 1405 #ifdef WAPBL_DEBUG_BUFBYTES 1406 KDASSERT(we->we_unsynced_bufbytes == 0); 1407 #endif 1408 /* 1409 * clear any posted error, since the buffer it came from 1410 * has successfully flushed by now 1411 */ 1412 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1413 (we->we_bufcount == 0)) { 1414 delta += we->we_reclaimable_bytes; 1415 if (we->we_error) 1416 errcnt++; 1417 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1418 pool_put(&wapbl_entry_pool, we); 1419 } 1420 1421 if (delta) { 1422 wl->wl_reclaimable_bytes += delta; 1423 KASSERT(wl->wl_error_count >= errcnt); 1424 wl->wl_error_count -= errcnt; 1425 cv_broadcast(&wl->wl_reclaimable_cv); 1426 } 1427 } 1428 1429 mutex_exit(&wl->wl_mtx); 1430 } 1431 1432 /* 1433 * Write transactions to disk + start I/O for contents 1434 */ 1435 int 1436 wapbl_flush(struct wapbl *wl, int waitfor) 1437 { 1438 struct buf *bp; 1439 struct wapbl_entry *we; 1440 off_t off; 1441 off_t head; 1442 off_t tail; 1443 size_t delta = 0; 1444 size_t flushsize; 1445 size_t reserved; 1446 int error = 0; 1447 1448 /* 1449 * Do a quick check to see if a full flush can be skipped 1450 * This assumes that the flush callback does not need to be called 1451 * unless there are other outstanding bufs. 1452 */ 1453 if (!waitfor) { 1454 size_t nbufs; 1455 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1456 protect the KASSERTS */ 1457 nbufs = wl->wl_bufcount; 1458 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1459 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1460 mutex_exit(&wl->wl_mtx); 1461 if (nbufs == 0) 1462 return 0; 1463 } 1464 1465 /* 1466 * XXX we may consider using LK_UPGRADE here 1467 * if we want to call flush from inside a transaction 1468 */ 1469 rw_enter(&wl->wl_rwlock, RW_WRITER); 1470 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1471 wl->wl_dealloccnt); 1472 1473 /* 1474 * Now that we are fully locked and flushed, 1475 * do another check for nothing to do. 
1476 */ 1477 if (wl->wl_bufcount == 0) { 1478 goto out; 1479 } 1480 1481 #if 0 1482 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1483 ("wapbl_flush thread %d.%d flushing entries with " 1484 "bufcount=%zu bufbytes=%zu\n", 1485 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1486 wl->wl_bufbytes)); 1487 #endif 1488 1489 /* Calculate amount of space needed to flush */ 1490 flushsize = wapbl_transaction_len(wl); 1491 if (wapbl_verbose_commit) { 1492 struct timespec ts; 1493 getnanotime(&ts); 1494 printf("%s: %lld.%09ld this transaction = %zu bytes\n", 1495 __func__, (long long)ts.tv_sec, 1496 (long)ts.tv_nsec, flushsize); 1497 } 1498 1499 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1500 /* 1501 * XXX this could be handled more gracefully, perhaps place 1502 * only a partial transaction in the log and allow the 1503 * remaining to flush without the protection of the journal. 1504 */ 1505 panic("wapbl_flush: current transaction too big to flush\n"); 1506 } 1507 1508 error = wapbl_truncate(wl, flushsize, 0); 1509 if (error) 1510 goto out2; 1511 1512 off = wl->wl_head; 1513 KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 1514 (off < wl->wl_circ_off + wl->wl_circ_size))); 1515 error = wapbl_write_blocks(wl, &off); 1516 if (error) 1517 goto out2; 1518 error = wapbl_write_revocations(wl, &off); 1519 if (error) 1520 goto out2; 1521 error = wapbl_write_inodes(wl, &off); 1522 if (error) 1523 goto out2; 1524 1525 reserved = 0; 1526 if (wl->wl_inohashcnt) 1527 reserved = wapbl_transaction_inodes_len(wl); 1528 1529 head = wl->wl_head; 1530 tail = wl->wl_tail; 1531 1532 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1533 &head, &tail); 1534 #ifdef WAPBL_DEBUG 1535 if (head != off) { 1536 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1537 " off=%"PRIdMAX" flush=%zu\n", 1538 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1539 flushsize); 1540 } 1541 #else 1542 KASSERT(head == off); 1543 #endif 1544 1545 /* Opportunistically move the tail forward if we can */ 1546 if (!wapbl_lazy_truncate) { 1547 mutex_enter(&wl->wl_mtx); 1548 delta = wl->wl_reclaimable_bytes; 1549 mutex_exit(&wl->wl_mtx); 1550 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1551 &head, &tail); 1552 } 1553 1554 error = wapbl_write_commit(wl, head, tail); 1555 if (error) 1556 goto out2; 1557 1558 we = pool_get(&wapbl_entry_pool, PR_WAITOK); 1559 1560 #ifdef WAPBL_DEBUG_BUFBYTES 1561 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1562 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1563 " unsynced=%zu" 1564 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1565 "inodes=%d\n", 1566 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1567 wapbl_space_used(wl->wl_circ_size, head, tail), 1568 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1569 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1570 wl->wl_inohashcnt)); 1571 #else 1572 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1573 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1574 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1575 "inodes=%d\n", 1576 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1577 wapbl_space_used(wl->wl_circ_size, head, tail), 1578 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1579 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1580 #endif 1581 1582 1583 mutex_enter(&bufcache_lock); 1584 mutex_enter(&wl->wl_mtx); 1585 1586 wl->wl_reserved_bytes = reserved; 1587 wl->wl_head = head; 1588 wl->wl_tail = tail; 1589 KASSERT(wl->wl_reclaimable_bytes >= delta); 1590 wl->wl_reclaimable_bytes -= delta; 1591 wl->wl_dealloccnt = 0; 1592 #ifdef 
WAPBL_DEBUG_BUFBYTES 1593 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1594 #endif 1595 1596 we->we_wapbl = wl; 1597 we->we_bufcount = wl->wl_bufcount; 1598 #ifdef WAPBL_DEBUG_BUFBYTES 1599 we->we_unsynced_bufbytes = wl->wl_bufbytes; 1600 #endif 1601 we->we_reclaimable_bytes = flushsize; 1602 we->we_error = 0; 1603 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); 1604 1605 /* 1606 * this flushes bufs in reverse order than they were queued 1607 * it shouldn't matter, but if we care we could use TAILQ instead. 1608 * XXX Note they will get put on the lru queue when they flush 1609 * so we might actually want to change this to preserve order. 1610 */ 1611 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1612 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1613 continue; 1614 } 1615 bp->b_iodone = wapbl_biodone; 1616 bp->b_private = we; 1617 bremfree(bp); 1618 wapbl_remove_buf_locked(wl, bp); 1619 mutex_exit(&wl->wl_mtx); 1620 mutex_exit(&bufcache_lock); 1621 bawrite(bp); 1622 mutex_enter(&bufcache_lock); 1623 mutex_enter(&wl->wl_mtx); 1624 } 1625 mutex_exit(&wl->wl_mtx); 1626 mutex_exit(&bufcache_lock); 1627 1628 #if 0 1629 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1630 ("wapbl_flush thread %d.%d done flushing entries...\n", 1631 curproc->p_pid, curlwp->l_lid)); 1632 #endif 1633 1634 out: 1635 1636 /* 1637 * If the waitfor flag is set, don't return until everything is 1638 * fully flushed and the on disk log is empty. 1639 */ 1640 if (waitfor) { 1641 error = wapbl_truncate(wl, wl->wl_circ_size - 1642 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1643 } 1644 1645 out2: 1646 if (error) { 1647 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1648 wl->wl_dealloclens, wl->wl_dealloccnt); 1649 } 1650 1651 #ifdef WAPBL_DEBUG_PRINT 1652 if (error) { 1653 pid_t pid = -1; 1654 lwpid_t lid = -1; 1655 if (curproc) 1656 pid = curproc->p_pid; 1657 if (curlwp) 1658 lid = curlwp->l_lid; 1659 mutex_enter(&wl->wl_mtx); 1660 #ifdef WAPBL_DEBUG_BUFBYTES 1661 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1662 ("wapbl_flush: thread %d.%d aborted flush: " 1663 "error = %d\n" 1664 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1665 "deallocs=%d inodes=%d\n" 1666 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1667 "unsynced=%zu\n", 1668 pid, lid, error, wl->wl_bufcount, 1669 wl->wl_bufbytes, wl->wl_bcount, 1670 wl->wl_dealloccnt, wl->wl_inohashcnt, 1671 wl->wl_error_count, wl->wl_reclaimable_bytes, 1672 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1673 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1674 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1675 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1676 "error = %d, unsynced = %zu\n", 1677 we->we_bufcount, we->we_reclaimable_bytes, 1678 we->we_error, we->we_unsynced_bufbytes)); 1679 } 1680 #else 1681 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1682 ("wapbl_flush: thread %d.%d aborted flush: " 1683 "error = %d\n" 1684 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1685 "deallocs=%d inodes=%d\n" 1686 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1687 pid, lid, error, wl->wl_bufcount, 1688 wl->wl_bufbytes, wl->wl_bcount, 1689 wl->wl_dealloccnt, wl->wl_inohashcnt, 1690 wl->wl_error_count, wl->wl_reclaimable_bytes, 1691 wl->wl_reserved_bytes)); 1692 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1693 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1694 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1695 "error = %d\n", we->we_bufcount, 1696 we->we_reclaimable_bytes, we->we_error)); 1697 } 1698 #endif 1699 mutex_exit(&wl->wl_mtx); 1700 } 1701 #endif 1702 1703 rw_exit(&wl->wl_rwlock); 1704 return error; 1705 } 1706 1707 
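/*
 * Usage sketch (illustrative only, not part of the original source):
 * a file system brackets each metadata update in a transaction,
 * roughly:
 *
 *	error = wapbl_begin(wl, __FILE__, __LINE__);
 *	if (error)
 *		return error;
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	...modify metadata in bp...
 *	bdwrite(bp);		(the vfs_bio hooks call wapbl_add_buf())
 *	wapbl_end(wl);
 *
 * In-tree file systems reach these functions through the
 * UFS_WAPBL_BEGIN()/UFS_WAPBL_END() macros and struct wapbl_ops
 * rather than by calling them directly.
 */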
/****************************************************************/ 1708 1709 void 1710 wapbl_jlock_assert(struct wapbl *wl) 1711 { 1712 1713 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1714 } 1715 1716 void 1717 wapbl_junlock_assert(struct wapbl *wl) 1718 { 1719 1720 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1721 } 1722 1723 /****************************************************************/ 1724 1725 /* locks missing */ 1726 void 1727 wapbl_print(struct wapbl *wl, 1728 int full, 1729 void (*pr)(const char *, ...)) 1730 { 1731 struct buf *bp; 1732 struct wapbl_entry *we; 1733 (*pr)("wapbl %p", wl); 1734 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1735 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1736 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1737 wl->wl_circ_size, wl->wl_circ_off, 1738 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1739 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1740 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1741 #ifdef WAPBL_DEBUG_BUFBYTES 1742 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1743 "reserved = %zu errcnt = %d unsynced = %zu\n", 1744 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1745 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1746 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1747 #else 1748 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1749 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1750 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1751 wl->wl_error_count); 1752 #endif 1753 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1754 wl->wl_dealloccnt, wl->wl_dealloclim); 1755 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1756 wl->wl_inohashcnt, wl->wl_inohashmask); 1757 (*pr)("entries:\n"); 1758 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1759 #ifdef WAPBL_DEBUG_BUFBYTES 1760 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1761 "unsynced = %zu\n", 1762 we->we_bufcount, we->we_reclaimable_bytes, 1763 we->we_error, we->we_unsynced_bufbytes); 1764 #else 1765 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1766 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1767 #endif 1768 } 1769 if (full) { 1770 int cnt = 0; 1771 (*pr)("bufs ="); 1772 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1773 if (!LIST_NEXT(bp, b_wapbllist)) { 1774 (*pr)(" %p", bp); 1775 } else if ((++cnt % 6) == 0) { 1776 (*pr)(" %p,\n\t", bp); 1777 } else { 1778 (*pr)(" %p,", bp); 1779 } 1780 } 1781 (*pr)("\n"); 1782 1783 (*pr)("dealloced blks = "); 1784 { 1785 int i; 1786 cnt = 0; 1787 for (i = 0; i < wl->wl_dealloccnt; i++) { 1788 (*pr)(" %"PRId64":%d,", 1789 wl->wl_deallocblks[i], 1790 wl->wl_dealloclens[i]); 1791 if ((++cnt % 4) == 0) { 1792 (*pr)("\n\t"); 1793 } 1794 } 1795 } 1796 (*pr)("\n"); 1797 1798 (*pr)("registered inodes = "); 1799 { 1800 int i; 1801 cnt = 0; 1802 for (i = 0; i <= wl->wl_inohashmask; i++) { 1803 struct wapbl_ino_head *wih; 1804 struct wapbl_ino *wi; 1805 1806 wih = &wl->wl_inohash[i]; 1807 LIST_FOREACH(wi, wih, wi_hash) { 1808 if (wi->wi_ino == 0) 1809 continue; 1810 (*pr)(" %"PRIu64"/0%06"PRIo32",", 1811 wi->wi_ino, wi->wi_mode); 1812 if ((++cnt % 4) == 0) { 1813 (*pr)("\n\t"); 1814 } 1815 } 1816 } 1817 (*pr)("\n"); 1818 } 1819 } 1820 } 1821 1822 #if defined(WAPBL_DEBUG) || defined(DDB) 1823 void 1824 wapbl_dump(struct wapbl *wl) 1825 { 1826 #if defined(WAPBL_DEBUG) 1827 if (!wl) 1828 wl = wapbl_debug_wl; 1829 #endif 1830 if (!wl) 1831 return; 1832 wapbl_print(wl, 1, printf); 1833 } 1834 
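/*
 * Note (derived from the code above): with WAPBL_DEBUG compiled in,
 * wapbl_dump(NULL) falls back to wapbl_debug_wl, which wapbl_start()
 * points at the most recently opened log, so the log state can be
 * inspected from the kernel debugger without knowing the address of
 * the struct wapbl.
 */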
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);
	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
	mutex_exit(&wl->wl_mtx);
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return NULL;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate the number of inodes described in an inodelist header */
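	/*
	 * Illustrative arithmetic (assumed sizes, for exposition only):
	 * with a 512-byte log block, a 16-byte wapbl_wc_inodelist header
	 * and 16-byte wc_inodes[] entries, iph = (512 - 16) / 16 = 31
	 * inodes per block, so 100 tracked inodes would need
	 * howmany(100, 31) = 4 blocks.
	 */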
/****************************************************************/

static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}


/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph) * blocklen;
	len += howmany(wl->wl_dealloccnt, bph) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
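/*
 * Illustrative restatement of the estimate above (not compiled in):
 * the on-disk size of a transaction is the raw buffer payload plus
 * one log-block-sized header per bph buffers, per bph deallocations,
 * and per iph registered inodes (minimum one inodelist block).
 */
#if 0
static size_t
example_transaction_len(size_t bcount, size_t bufcount, size_t dealloccnt,
	size_t inocnt, size_t blocklen, size_t bph, size_t iph)
{
	size_t len = bcount;				/* buffer payload */

	len += howmany(bufcount, bph) * blocklen;	/* blocklist headers */
	len += howmany(dealloccnt, bph) * blocklen;	/* revocation records */
	len += MAX(1, howmany(inocnt, iph)) * blocklen;	/* inodelist records */
	return len;
}
#endif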
/*
 * wapbl_cache_sync: issue DIOCCACHESYNC
 */
static int
wapbl_cache_sync(struct wapbl *wl, const char *msg)
{
	const bool verbose = wapbl_verbose_commit >= 2;
	struct bintime start_time;
	int force = 1;
	int error;

	if (!wapbl_flush_disk_cache) {
		return 0;
	}
	if (verbose) {
		bintime(&start_time);
	}
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
	    FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}
	if (verbose) {
		struct bintime d;
		struct timespec ts;

		bintime(&d);
		bintime_sub(&d, &start_time);
		bintime2timespec(&d, &ts);
		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
	}
	return error;
}

/*
 * Perform the commit operation.
 *
 * Note that the generation number increment needs to be protected
 * against racing with other invocations of wapbl_write_commit.  This
 * is ok since this routine is only invoked from wapbl_flush.
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	daddr_t pbn;

	error = wapbl_buffered_flush(wl);
	if (error)
		return error;
	/*
	 * Flush the disk cache to ensure that the blocks we've written
	 * have actually reached stable storage before the commit header.
	 *
	 * XXX Calculate a checksum here instead; for now the cache
	 * flush is all we do.
	 */
	wapbl_cache_sync(wl, "1");

	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * Write the commit header.
	 *
	 * XXX If the generation is about to roll over, we should first
	 * zero the second commit header before trying to write both
	 * headers.
	 */

	pbn = wl->wl_logpbn + (wc->wc_generation % 2);
#ifdef _KERNEL
	pbn = btodb(pbn << wc->wc_log_dev_bshift);
#endif
	error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
	if (error)
		return error;
	error = wapbl_buffered_flush(wl);
	if (error)
		return error;

	/*
	 * Flush the disk cache to ensure that the commit header is
	 * actually written before the metadata blocks.
	 */
	wapbl_cache_sync(wl, "2");

	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
	 */
	if (wc->wc_generation++ == 0) {
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zeroing mentioned above, and we are certain to roll
		 * back the generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			    "log header: %d\n", error);
	}
	return 0;
}
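/*
 * Sketch of the commit header ping-pong used above (not compiled in):
 * generation N is written to header slot N % 2, so a torn write can
 * only damage the newer header, and replay can still fall back to the
 * older, intact one.
 */
#if 0
static daddr_t
example_commit_slot(struct wapbl *wl, uint32_t generation)
{

	/* The two header slots occupy the first blocks of the log. */
	return wl->wl_logpbn + (generation % 2);
}
#endif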
2148 */ 2149 KASSERT((bp->b_vp->v_type == VBLK) || 2150 (bp->b_blkno != bp->b_lblkno)); 2151 KASSERT(bp->b_blkno > 0); 2152 2153 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 2154 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 2155 wc->wc_len += bp->b_bcount; 2156 wc->wc_blkcount++; 2157 bp = LIST_NEXT(bp, b_wapbllist); 2158 } 2159 if (wc->wc_len % blocklen != 0) { 2160 padding = blocklen - wc->wc_len % blocklen; 2161 wc->wc_len += padding; 2162 } else { 2163 padding = 0; 2164 } 2165 2166 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2167 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 2168 wc->wc_len, padding, (intmax_t)off)); 2169 2170 error = wapbl_circ_write(wl, wc, blocklen, &off); 2171 if (error) 2172 return error; 2173 bp = obp; 2174 cnt = 0; 2175 while (bp && (cnt++ < bph)) { 2176 error = wapbl_circ_write(wl, bp->b_data, 2177 bp->b_bcount, &off); 2178 if (error) 2179 return error; 2180 bp = LIST_NEXT(bp, b_wapbllist); 2181 } 2182 if (padding) { 2183 void *zero; 2184 2185 zero = wapbl_alloc(padding); 2186 memset(zero, 0, padding); 2187 error = wapbl_circ_write(wl, zero, padding, &off); 2188 wapbl_free(zero, padding); 2189 if (error) 2190 return error; 2191 } 2192 } 2193 *offp = off; 2194 return 0; 2195 } 2196 2197 static int 2198 wapbl_write_revocations(struct wapbl *wl, off_t *offp) 2199 { 2200 struct wapbl_wc_blocklist *wc = 2201 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2202 int i; 2203 int blocklen = 1<<wl->wl_log_dev_bshift; 2204 int bph; 2205 off_t off = *offp; 2206 int error; 2207 2208 if (wl->wl_dealloccnt == 0) 2209 return 0; 2210 2211 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2212 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2213 2214 i = 0; 2215 while (i < wl->wl_dealloccnt) { 2216 wc->wc_type = WAPBL_WC_REVOCATIONS; 2217 wc->wc_len = blocklen; 2218 wc->wc_blkcount = 0; 2219 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 2220 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 2221 wl->wl_deallocblks[i]; 2222 wc->wc_blocks[wc->wc_blkcount].wc_dlen = 2223 wl->wl_dealloclens[i]; 2224 wc->wc_blkcount++; 2225 i++; 2226 } 2227 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2228 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", 2229 wc->wc_len, (intmax_t)off)); 2230 error = wapbl_circ_write(wl, wc, blocklen, &off); 2231 if (error) 2232 return error; 2233 } 2234 *offp = off; 2235 return 0; 2236 } 2237 2238 static int 2239 wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2240 { 2241 struct wapbl_wc_inodelist *wc = 2242 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2243 int i; 2244 int blocklen = 1 << wl->wl_log_dev_bshift; 2245 off_t off = *offp; 2246 int error; 2247 2248 struct wapbl_ino_head *wih; 2249 struct wapbl_ino *wi; 2250 int iph; 2251 2252 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2253 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2254 2255 i = 0; 2256 wih = &wl->wl_inohash[0]; 2257 wi = 0; 2258 do { 2259 wc->wc_type = WAPBL_WC_INODES; 2260 wc->wc_len = blocklen; 2261 wc->wc_inocnt = 0; 2262 wc->wc_clear = (i == 0); 2263 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2264 while (!wi) { 2265 KASSERT((wih - &wl->wl_inohash[0]) 2266 <= wl->wl_inohashmask); 2267 wi = LIST_FIRST(wih++); 2268 } 2269 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2270 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2271 wc->wc_inocnt++; 2272 i++; 2273 wi = LIST_NEXT(wi, wi_hash); 2274 } 2275 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2276 ("wapbl_write_inodes: len = %u off = 
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}

#endif /* _KERNEL */

/****************************************************************/

struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;
	daddr_t wb_blk;
	off_t wb_off;		/* Offset of this block in the log */
};
#define	WAPBL_BLKPOOL_MIN 83

static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit */
	{
		unsigned long i, hashsize;
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}

static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}

static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
	LIST_FOREACH(wb, wbh, wb_hash) {
		if (blk == wb->wb_blk)
			return wb;
	}
	return 0;
}

static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wb->wb_blk == blk);
		wb->wb_off = off;
	} else {
		wb = wapbl_alloc(sizeof(*wb));
		wb->wb_blk = blk;
		wb->wb_off = off;
		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
		LIST_INSERT_HEAD(wbh, wb, wb_hash);
		wr->wr_blkhashcnt++;
	}
}

static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wr->wr_blkhashcnt > 0);
		wr->wr_blkhashcnt--;
		LIST_REMOVE(wb, wb_hash);
		wapbl_free(wb, sizeof(*wb));
	}
}

static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
	unsigned long i;
	for (i = 0; i <= wr->wr_blkhashmask; i++) {
		struct wapbl_blk *wb;

		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
			KASSERT(wr->wr_blkhashcnt > 0);
			wr->wr_blkhashcnt--;
			LIST_REMOVE(wb, wb_hash);
			wapbl_free(wb, sizeof(*wb));
		}
	}
	KASSERT(wr->wr_blkhashcnt == 0);
}
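/*
 * Sketch of the replay hash semantics (not compiled in): inserting the
 * same block twice keeps only the newest log offset, so replay writes
 * each disk block at most once, with its last logged contents.  The
 * block number and offsets below are made up.
 */
#if 0
static void
example_blkhash_last_write_wins(struct wapbl_replay *wr)
{

	wapbl_blkhash_ins(wr, 100, 4096);	/* older copy of blk 100 */
	wapbl_blkhash_ins(wr, 100, 8192);	/* newer copy replaces it */
	KASSERT(wapbl_blkhash_get(wr, 100)->wb_off == 8192);
	wapbl_blkhash_rem(wr, 100);		/* e.g. a later revocation */
	KASSERT(wapbl_blkhash_get(wr, 100) == NULL);
}
#endif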
/****************************************************************/

static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}

static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		len -= slen;
		off = wr->wr_circ_off;
	}
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
}
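/*
 * Worked example of the wrap rule above (illustrative numbers): with
 * wr_circ_off = 1024 and wr_circ_size = 8192, valid offsets live in
 * [1024, 9216).  Advancing 2048 bytes from off = 8192 consumes the
 * 1024 bytes up to 9216, wraps back to 1024, and ends at 2048; a read
 * of the same span is split into two wapbl_read() calls the same way.
 */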
/****************************************************************/

int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}

void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}

void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}

#ifdef _KERNEL
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif
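/*
 * Sketch of the replay life cycle as a file system would drive it at
 * mount time (not compiled in; error handling abbreviated).  The
 * function name is hypothetical, "fsdevvp" is the file system device,
 * and wapbl_replay_write() is defined later in this file.
 */
#if 0
static int
example_mount_replay(struct vnode *logvp, daddr_t off, size_t count,
	size_t blksize, struct vnode *fsdevvp)
{
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, logvp, off, count, blksize);
	if (error)
		return error;
	error = wapbl_replay_write(wr, fsdevvp);	/* redo logged blocks */
	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif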
/*
 * Calculate the disk address for the i'th block in the wc_blocklist
 * offset by j blocks of size blen.
 *
 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
 * was written to the journal.
 *
 * The kernel needs that address plus the offset in DEV_BSIZE units.
 *
 * Userland needs that address plus the offset in blen units.
 */
static daddr_t
wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
{
	daddr_t pbn;

#ifdef _KERNEL
	pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
#else
	pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
#endif

	return pbn;
}
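/*
 * Worked example of the conversion above (illustrative numbers): with
 * blen = 2048 and DEV_BSIZE = 512, an entry with wc_daddr = 1024
 * yields kernel addresses 1024 + 4*j (btodb(j * 2048) == 4*j), while
 * userland computes dbtob(1024) / 2048 + j == 256 + j in 2k units.
 */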
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr,
			    wapbl_block_daddr(wc, i, j, fsblklen), *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}

static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr,
			    wapbl_block_daddr(wc, i, j, fsblklen));
	}
}

static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so the location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}

static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

errout:
	wapbl_blkhash_clear(wr);
	return error;
}
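/*
 * Note on the cross-check above: each record processor advances "off"
 * by exactly the payload it consumes, while wapbl_circ_advance() moves
 * "saveoff" by the record's self-declared wc_len.  If the two disagree,
 * the record occupies a different amount of log than it claims, and
 * the log is treated as corrupt (EFTYPE).
 *
 * The disabled wapbl_replay_verify() below has bit-rotted: it
 * references a "wch" header pointer that is never declared in the
 * function, so it would not compile if re-enabled.
 */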
#if 0
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_alloc(MAXBSIZE);
	void *scratch2 = wapbl_alloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
		{
			struct wapbl_wc_blocklist *wc =
			    (struct wapbl_wc_blocklist *)wr->wr_scratch;
			int i;
			for (i = 0; i < wc->wc_blkcount; i++) {
				int foundcnt = 0;
				int dirtycnt = 0;
				int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
				n = wc->wc_blocks[i].wc_dlen >>
				    wch->wc_fs_dev_bshift;
				for (j = 0; j < n; j++) {
					struct wapbl_blk *wb =
					    wapbl_blkhash_get(wr,
					    wapbl_block_daddr(wc, i, j, fsblklen));
					if (wb && (wb->wb_off == off)) {
						foundcnt++;
						error =
						    wapbl_circ_read(wr,
						    scratch1, fsblklen,
						    &off);
						if (error)
							goto out;
						error =
						    wapbl_read(scratch2,
						    fsblklen, fsdevvp,
						    wb->wb_blk);
						if (error)
							goto out;
						if (memcmp(scratch1,
						    scratch2,
						    fsblklen)) {
							printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
							    wb->wb_blk,
							    (intmax_t)off);
							dirtycnt++;
							mismatchcnt++;
						}
					} else {
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#if 0
				/*
				 * If all of the blocks in an entry
				 * are clean, then remove all of its
				 * blocks from the hashtable since they
				 * never will need replay.
				 */
				if ((foundcnt != 0) &&
				    (dirtycnt == 0)) {
					off = saveoff;
					wapbl_circ_advance(wr,
					    logblklen, &off);
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wapbl_block_daddr(wc, i, j, fsblklen));
						if (wb &&
						    (wb->wb_off == off)) {
							wapbl_blkhash_rem(wr, wb->wb_blk);
						}
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#endif
			}
		}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif

int
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	struct wapbl_blk *wb;
	size_t i;
	off_t off;
	void *scratch;
	int error = 0;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	scratch = wapbl_alloc(MAXBSIZE);

	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
			off = wb->wb_off;
			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
			if (error)
				break;
			error = wapbl_write(scratch, fsblklen, fsdevvp,
			    wb->wb_blk);
			if (error)
				break;
		}
	}

	wapbl_free(scratch, MAXBSIZE);
	return error;
}

int
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));
	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;		/* advance to the next fs block, as in
				   wapbl_replay_read() below */
	}
	return 0;
}

int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}

#ifdef _KERNEL

MODULE(MODULE_CLASS_VFS, wapbl, NULL);

static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		wapbl_init();
		return 0;
	case MODULE_CMD_FINI:
		return wapbl_fini(true);
	default:
		return ENOTTY;
	}
}
#endif /* _KERNEL */