1 /* $NetBSD: vfs_wapbl.c,v 1.59 2014/02/25 18:30:11 pooka Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * This implements file system independent write ahead filesystem logging. 34 */ 35 36 #define WAPBL_INTERNAL 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.59 2014/02/25 18:30:11 pooka Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/bitops.h> 43 44 #ifdef _KERNEL 45 #include <sys/param.h> 46 #include <sys/namei.h> 47 #include <sys/proc.h> 48 #include <sys/sysctl.h> 49 #include <sys/uio.h> 50 #include <sys/vnode.h> 51 #include <sys/file.h> 52 #include <sys/module.h> 53 #include <sys/resourcevar.h> 54 #include <sys/conf.h> 55 #include <sys/mount.h> 56 #include <sys/kernel.h> 57 #include <sys/kauth.h> 58 #include <sys/mutex.h> 59 #include <sys/atomic.h> 60 #include <sys/wapbl.h> 61 #include <sys/wapbl_replay.h> 62 63 #include <miscfs/specfs/specdev.h> 64 65 #define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP) 66 #define wapbl_free(a, s) kmem_free((a), (s)) 67 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 68 69 static struct sysctllog *wapbl_sysctl; 70 static int wapbl_flush_disk_cache = 1; 71 static int wapbl_verbose_commit = 0; 72 73 static inline size_t wapbl_space_free(size_t, off_t, off_t); 74 75 #else /* !_KERNEL */ 76 #include <assert.h> 77 #include <errno.h> 78 #include <stdio.h> 79 #include <stdbool.h> 80 #include <stdlib.h> 81 #include <string.h> 82 83 #include <sys/time.h> 84 #include <sys/wapbl.h> 85 #include <sys/wapbl_replay.h> 86 87 #define KDASSERT(x) assert(x) 88 #define KASSERT(x) assert(x) 89 #define wapbl_alloc(s) malloc(s) 90 #define wapbl_free(a, s) free(a) 91 #define wapbl_calloc(n, s) calloc((n), (s)) 92 93 #endif /* !_KERNEL */ 94 95 /* 96 * INTERNAL DATA STRUCTURES 97 */ 98 99 /* 100 * This structure holds per-mount log information. 101 * 102 * Legend: a = atomic access only 103 * r = read-only after init 104 * l = rwlock held 105 * m = mutex held 106 * lm = rwlock held writing or mutex held 107 * u = unlocked access ok 108 * b = bufcache_lock held 109 */ 110 struct wapbl { 111 struct vnode *wl_logvp; /* r: log here */ 112 struct vnode *wl_devvp; /* r: log on this device */ 113 struct mount *wl_mount; /* r: mountpoint wl is associated with */ 114 daddr_t wl_logpbn; /* r: Physical block number of start of log */ 115 int wl_log_dev_bshift; /* r: logarithm of device block size of log 116 device */ 117 int wl_fs_dev_bshift; /* r: logarithm of device block size of 118 filesystem device */ 119 120 unsigned wl_lock_count; /* m: Count of transactions in progress */ 121 122 size_t wl_circ_size; /* r: Number of bytes in buffer of log */ 123 size_t wl_circ_off; /* r: Number of bytes reserved at start */ 124 125 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ 126 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ 127 128 off_t wl_head; /* l: Byte offset of log head */ 129 off_t wl_tail; /* l: Byte offset of log tail */ 130 /* 131 * head == tail == 0 means log is empty 132 * head == tail != 0 means log is full 133 * see assertions in wapbl_advance() for other boundary conditions. 134 * only truncate moves the tail, except when flush sets it to 135 * wl_header_size only flush moves the head, except when truncate 136 * sets it to 0. 137 */ 138 139 struct wapbl_wc_header *wl_wc_header; /* l */ 140 void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */ 141 142 kmutex_t wl_mtx; /* u: short-term lock */ 143 krwlock_t wl_rwlock; /* u: File system transaction lock */ 144 145 /* 146 * Must be held while accessing 147 * wl_count or wl_bufs or head or tail 148 */ 149 150 /* 151 * Callback called from within the flush routine to flush any extra 152 * bits. Note that flush may be skipped without calling this if 153 * there are no outstanding buffers in the transaction. 154 */ 155 #if _KERNEL 156 wapbl_flush_fn_t wl_flush; /* r */ 157 wapbl_flush_fn_t wl_flush_abort;/* r */ 158 #endif 159 160 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ 161 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ 162 size_t wl_bcount; /* m: Total bcount of wl_bufs */ 163 164 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ 165 166 kcondvar_t wl_reclaimable_cv; /* m (obviously) */ 167 size_t wl_reclaimable_bytes; /* m: Amount of space available for 168 reclamation by truncate */ 169 int wl_error_count; /* m: # of wl_entries with errors */ 170 size_t wl_reserved_bytes; /* never truncate log smaller than this */ 171 172 #ifdef WAPBL_DEBUG_BUFBYTES 173 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ 174 #endif 175 176 daddr_t *wl_deallocblks;/* lm: address of block */ 177 int *wl_dealloclens; /* lm: size of block */ 178 int wl_dealloccnt; /* lm: total count */ 179 int wl_dealloclim; /* l: max count */ 180 181 /* hashtable of inode numbers for allocated but unlinked inodes */ 182 /* synch ??? */ 183 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; 184 u_long wl_inohashmask; 185 int wl_inohashcnt; 186 187 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 188 accounting */ 189 190 u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ 191 daddr_t wl_buffer_dblk; /* l: buffer disk block address */ 192 size_t wl_buffer_used; /* l: buffer current use */ 193 }; 194 195 #ifdef WAPBL_DEBUG_PRINT 196 int wapbl_debug_print = WAPBL_DEBUG_PRINT; 197 #endif 198 199 /****************************************************************/ 200 #ifdef _KERNEL 201 202 #ifdef WAPBL_DEBUG 203 struct wapbl *wapbl_debug_wl; 204 #endif 205 206 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 207 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 208 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 209 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 210 #endif /* _KERNEL */ 211 212 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 213 214 static inline size_t wapbl_space_used(size_t avail, off_t head, 215 off_t tail); 216 217 #ifdef _KERNEL 218 219 static struct pool wapbl_entry_pool; 220 221 #define WAPBL_INODETRK_SIZE 83 222 static int wapbl_ino_pool_refcount; 223 static struct pool wapbl_ino_pool; 224 struct wapbl_ino { 225 LIST_ENTRY(wapbl_ino) wi_hash; 226 ino_t wi_ino; 227 mode_t wi_mode; 228 }; 229 230 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 231 static void wapbl_inodetrk_free(struct wapbl *wl); 232 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 233 234 static size_t wapbl_transaction_len(struct wapbl *wl); 235 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 236 237 #if 0 238 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 239 #endif 240 241 static int wapbl_replay_isopen1(struct wapbl_replay *); 242 243 /* 244 * This is useful for debugging. If set, the log will 245 * only be truncated when necessary. 246 */ 247 int wapbl_lazy_truncate = 0; 248 249 struct wapbl_ops wapbl_ops = { 250 .wo_wapbl_discard = wapbl_discard, 251 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 252 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 253 .wo_wapbl_replay_read = wapbl_replay_read, 254 .wo_wapbl_add_buf = wapbl_add_buf, 255 .wo_wapbl_remove_buf = wapbl_remove_buf, 256 .wo_wapbl_resize_buf = wapbl_resize_buf, 257 .wo_wapbl_begin = wapbl_begin, 258 .wo_wapbl_end = wapbl_end, 259 .wo_wapbl_junlock_assert= wapbl_junlock_assert, 260 261 /* XXX: the following is only used to say "this is a wapbl buf" */ 262 .wo_wapbl_biodone = wapbl_biodone, 263 }; 264 265 static int 266 wapbl_sysctl_init(void) 267 { 268 int rv; 269 const struct sysctlnode *rnode, *cnode; 270 271 wapbl_sysctl = NULL; 272 273 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode, 274 CTLFLAG_PERMANENT, 275 CTLTYPE_NODE, "wapbl", 276 SYSCTL_DESCR("WAPBL journaling options"), 277 NULL, 0, NULL, 0, 278 CTL_VFS, CTL_CREATE, CTL_EOL); 279 if (rv) 280 return rv; 281 282 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 283 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 284 CTLTYPE_INT, "flush_disk_cache", 285 SYSCTL_DESCR("flush disk cache"), 286 NULL, 0, &wapbl_flush_disk_cache, 0, 287 CTL_CREATE, CTL_EOL); 288 if (rv) 289 return rv; 290 291 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 292 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 293 CTLTYPE_INT, "verbose_commit", 294 SYSCTL_DESCR("show time and size of wapbl log commits"), 295 NULL, 0, &wapbl_verbose_commit, 0, 296 CTL_CREATE, CTL_EOL); 297 return rv; 298 } 299 300 static void 301 wapbl_init(void) 302 { 303 304 pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0, 305 "wapblentrypl", &pool_allocator_kmem, IPL_VM); 306 307 wapbl_sysctl_init(); 308 } 309 310 #ifdef notyet 311 static int 312 wapbl_fini(bool interface) 313 { 314 315 if (aio_sysctl != NULL) 316 sysctl_teardown(&aio_sysctl); 317 318 pool_destroy(&wapbl_entry_pool); 319 320 return 0; 321 } 322 #endif 323 324 static int 325 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) 326 { 327 int error, i; 328 329 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 330 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); 331 332 /* 333 * Its only valid to reuse the replay log if its 334 * the same as the new log we just opened. 335 */ 336 KDASSERT(!wapbl_replay_isopen(wr)); 337 KASSERT(wl->wl_devvp->v_type == VBLK); 338 KASSERT(wr->wr_devvp->v_type == VBLK); 339 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 340 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 341 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 342 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 343 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 344 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 345 346 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 347 348 for (i = 0; i < wr->wr_inodescnt; i++) 349 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 350 wr->wr_inodes[i].wr_imode); 351 352 /* Make sure new transaction won't overwrite old inodes list */ 353 KDASSERT(wapbl_transaction_len(wl) <= 354 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 355 wr->wr_inodestail)); 356 357 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 358 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 359 wapbl_transaction_len(wl); 360 361 error = wapbl_write_inodes(wl, &wl->wl_head); 362 if (error) 363 return error; 364 365 KASSERT(wl->wl_head != wl->wl_tail); 366 KASSERT(wl->wl_head != 0); 367 368 return 0; 369 } 370 371 int 372 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 373 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 374 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 375 { 376 struct wapbl *wl; 377 struct vnode *devvp; 378 daddr_t logpbn; 379 int error; 380 int log_dev_bshift = ilog2(blksize); 381 int fs_dev_bshift = log_dev_bshift; 382 int run; 383 384 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 385 " count=%zu blksize=%zu\n", vp, off, count, blksize)); 386 387 if (log_dev_bshift > fs_dev_bshift) { 388 WAPBL_PRINTF(WAPBL_PRINT_OPEN, 389 ("wapbl: log device's block size cannot be larger " 390 "than filesystem's\n")); 391 /* 392 * Not currently implemented, although it could be if 393 * needed someday. 394 */ 395 return ENOSYS; 396 } 397 398 if (off < 0) 399 return EINVAL; 400 401 if (blksize < DEV_BSIZE) 402 return EINVAL; 403 if (blksize % DEV_BSIZE) 404 return EINVAL; 405 406 /* XXXTODO: verify that the full load is writable */ 407 408 /* 409 * XXX check for minimum log size 410 * minimum is governed by minimum amount of space 411 * to complete a transaction. (probably truncate) 412 */ 413 /* XXX for now pick something minimal */ 414 if ((count * blksize) < MAXPHYS) { 415 return ENOSPC; 416 } 417 418 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { 419 return error; 420 } 421 422 wl = wapbl_calloc(1, sizeof(*wl)); 423 rw_init(&wl->wl_rwlock); 424 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); 425 cv_init(&wl->wl_reclaimable_cv, "wapblrec"); 426 LIST_INIT(&wl->wl_bufs); 427 SIMPLEQ_INIT(&wl->wl_entries); 428 429 wl->wl_logvp = vp; 430 wl->wl_devvp = devvp; 431 wl->wl_mount = mp; 432 wl->wl_logpbn = logpbn; 433 wl->wl_log_dev_bshift = log_dev_bshift; 434 wl->wl_fs_dev_bshift = fs_dev_bshift; 435 436 wl->wl_flush = flushfn; 437 wl->wl_flush_abort = flushabortfn; 438 439 /* Reserve two log device blocks for the commit headers */ 440 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; 441 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); 442 /* truncate the log usage to a multiple of log_dev_bshift */ 443 wl->wl_circ_size >>= wl->wl_log_dev_bshift; 444 wl->wl_circ_size <<= wl->wl_log_dev_bshift; 445 446 /* 447 * wl_bufbytes_max limits the size of the in memory transaction space. 448 * - Since buffers are allocated and accounted for in units of 449 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE 450 * (i.e. 1<<PAGE_SHIFT) 451 * - Since the log device has to be written in units of 452 * 1<<wl_log_dev_bshift it is required to be a mulitple of 453 * 1<<wl_log_dev_bshift. 454 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift, 455 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift. 456 * Therefore it must be multiple of the least common multiple of those 457 * three quantities. Fortunately, all of those quantities are 458 * guaranteed to be a power of two, and the least common multiple of 459 * a set of numbers which are all powers of two is simply the maximum 460 * of those numbers. Finally, the maximum logarithm of a power of two 461 * is the same as the log of the maximum power of two. So we can do 462 * the following operations to size wl_bufbytes_max: 463 */ 464 465 /* XXX fix actual number of pages reserved per filesystem. */ 466 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); 467 468 /* Round wl_bufbytes_max to the largest power of two constraint */ 469 wl->wl_bufbytes_max >>= PAGE_SHIFT; 470 wl->wl_bufbytes_max <<= PAGE_SHIFT; 471 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; 472 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; 473 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; 474 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; 475 476 /* XXX maybe use filesystem fragment size instead of 1024 */ 477 /* XXX fix actual number of buffers reserved per filesystem. */ 478 wl->wl_bufcount_max = (nbuf / 2) * 1024; 479 480 /* XXX tie this into resource estimation */ 481 wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2; 482 483 wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) * 484 wl->wl_dealloclim); 485 wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) * 486 wl->wl_dealloclim); 487 488 wl->wl_buffer = wapbl_alloc(MAXPHYS); 489 wl->wl_buffer_used = 0; 490 491 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); 492 493 /* Initialize the commit header */ 494 { 495 struct wapbl_wc_header *wc; 496 size_t len = 1 << wl->wl_log_dev_bshift; 497 wc = wapbl_calloc(1, len); 498 wc->wc_type = WAPBL_WC_HEADER; 499 wc->wc_len = len; 500 wc->wc_circ_off = wl->wl_circ_off; 501 wc->wc_circ_size = wl->wl_circ_size; 502 /* XXX wc->wc_fsid */ 503 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; 504 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; 505 wl->wl_wc_header = wc; 506 wl->wl_wc_scratch = wapbl_alloc(len); 507 } 508 509 /* 510 * if there was an existing set of unlinked but 511 * allocated inodes, preserve it in the new 512 * log. 513 */ 514 if (wr && wr->wr_inodescnt) { 515 error = wapbl_start_flush_inodes(wl, wr); 516 if (error) 517 goto errout; 518 } 519 520 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 521 if (error) { 522 goto errout; 523 } 524 525 *wlp = wl; 526 #if defined(WAPBL_DEBUG) 527 wapbl_debug_wl = wl; 528 #endif 529 530 return 0; 531 errout: 532 wapbl_discard(wl); 533 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 534 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 535 wapbl_free(wl->wl_deallocblks, 536 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 537 wapbl_free(wl->wl_dealloclens, 538 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 539 wapbl_free(wl->wl_buffer, MAXPHYS); 540 wapbl_inodetrk_free(wl); 541 wapbl_free(wl, sizeof(*wl)); 542 543 return error; 544 } 545 546 /* 547 * Like wapbl_flush, only discards the transaction 548 * completely 549 */ 550 551 void 552 wapbl_discard(struct wapbl *wl) 553 { 554 struct wapbl_entry *we; 555 struct buf *bp; 556 int i; 557 558 /* 559 * XXX we may consider using upgrade here 560 * if we want to call flush from inside a transaction 561 */ 562 rw_enter(&wl->wl_rwlock, RW_WRITER); 563 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 564 wl->wl_dealloccnt); 565 566 #ifdef WAPBL_DEBUG_PRINT 567 { 568 pid_t pid = -1; 569 lwpid_t lid = -1; 570 if (curproc) 571 pid = curproc->p_pid; 572 if (curlwp) 573 lid = curlwp->l_lid; 574 #ifdef WAPBL_DEBUG_BUFBYTES 575 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 576 ("wapbl_discard: thread %d.%d discarding " 577 "transaction\n" 578 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 579 "deallocs=%d inodes=%d\n" 580 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 581 "unsynced=%zu\n", 582 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 583 wl->wl_bcount, wl->wl_dealloccnt, 584 wl->wl_inohashcnt, wl->wl_error_count, 585 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 586 wl->wl_unsynced_bufbytes)); 587 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 588 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 589 ("\tentry: bufcount = %zu, reclaimable = %zu, " 590 "error = %d, unsynced = %zu\n", 591 we->we_bufcount, we->we_reclaimable_bytes, 592 we->we_error, we->we_unsynced_bufbytes)); 593 } 594 #else /* !WAPBL_DEBUG_BUFBYTES */ 595 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 596 ("wapbl_discard: thread %d.%d discarding transaction\n" 597 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 598 "deallocs=%d inodes=%d\n" 599 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 600 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 601 wl->wl_bcount, wl->wl_dealloccnt, 602 wl->wl_inohashcnt, wl->wl_error_count, 603 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 604 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 605 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 606 ("\tentry: bufcount = %zu, reclaimable = %zu, " 607 "error = %d\n", 608 we->we_bufcount, we->we_reclaimable_bytes, 609 we->we_error)); 610 } 611 #endif /* !WAPBL_DEBUG_BUFBYTES */ 612 } 613 #endif /* WAPBL_DEBUG_PRINT */ 614 615 for (i = 0; i <= wl->wl_inohashmask; i++) { 616 struct wapbl_ino_head *wih; 617 struct wapbl_ino *wi; 618 619 wih = &wl->wl_inohash[i]; 620 while ((wi = LIST_FIRST(wih)) != NULL) { 621 LIST_REMOVE(wi, wi_hash); 622 pool_put(&wapbl_ino_pool, wi); 623 KASSERT(wl->wl_inohashcnt > 0); 624 wl->wl_inohashcnt--; 625 } 626 } 627 628 /* 629 * clean buffer list 630 */ 631 mutex_enter(&bufcache_lock); 632 mutex_enter(&wl->wl_mtx); 633 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 634 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 635 /* 636 * The buffer will be unlocked and 637 * removed from the transaction in brelse 638 */ 639 mutex_exit(&wl->wl_mtx); 640 brelsel(bp, 0); 641 mutex_enter(&wl->wl_mtx); 642 } 643 } 644 mutex_exit(&wl->wl_mtx); 645 mutex_exit(&bufcache_lock); 646 647 /* 648 * Remove references to this wl from wl_entries, free any which 649 * no longer have buffers, others will be freed in wapbl_biodone 650 * when they no longer have any buffers. 651 */ 652 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 653 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 654 /* XXX should we be accumulating wl_error_count 655 * and increasing reclaimable bytes ? */ 656 we->we_wapbl = NULL; 657 if (we->we_bufcount == 0) { 658 #ifdef WAPBL_DEBUG_BUFBYTES 659 KASSERT(we->we_unsynced_bufbytes == 0); 660 #endif 661 pool_put(&wapbl_entry_pool, we); 662 } 663 } 664 665 /* Discard list of deallocs */ 666 wl->wl_dealloccnt = 0; 667 /* XXX should we clear wl_reserved_bytes? */ 668 669 KASSERT(wl->wl_bufbytes == 0); 670 KASSERT(wl->wl_bcount == 0); 671 KASSERT(wl->wl_bufcount == 0); 672 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 673 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 674 KASSERT(wl->wl_inohashcnt == 0); 675 676 rw_exit(&wl->wl_rwlock); 677 } 678 679 int 680 wapbl_stop(struct wapbl *wl, int force) 681 { 682 int error; 683 684 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 685 error = wapbl_flush(wl, 1); 686 if (error) { 687 if (force) 688 wapbl_discard(wl); 689 else 690 return error; 691 } 692 693 /* Unlinked inodes persist after a flush */ 694 if (wl->wl_inohashcnt) { 695 if (force) { 696 wapbl_discard(wl); 697 } else { 698 return EBUSY; 699 } 700 } 701 702 KASSERT(wl->wl_bufbytes == 0); 703 KASSERT(wl->wl_bcount == 0); 704 KASSERT(wl->wl_bufcount == 0); 705 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 706 KASSERT(wl->wl_dealloccnt == 0); 707 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 708 KASSERT(wl->wl_inohashcnt == 0); 709 710 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 711 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 712 wapbl_free(wl->wl_deallocblks, 713 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 714 wapbl_free(wl->wl_dealloclens, 715 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 716 wapbl_free(wl->wl_buffer, MAXPHYS); 717 wapbl_inodetrk_free(wl); 718 719 cv_destroy(&wl->wl_reclaimable_cv); 720 mutex_destroy(&wl->wl_mtx); 721 rw_destroy(&wl->wl_rwlock); 722 wapbl_free(wl, sizeof(*wl)); 723 724 return 0; 725 } 726 727 static int 728 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) 729 { 730 struct pstats *pstats = curlwp->l_proc->p_stats; 731 struct buf *bp; 732 int error; 733 734 KASSERT((flags & ~(B_WRITE | B_READ)) == 0); 735 KASSERT(devvp->v_type == VBLK); 736 737 if ((flags & (B_WRITE | B_READ)) == B_WRITE) { 738 mutex_enter(devvp->v_interlock); 739 devvp->v_numoutput++; 740 mutex_exit(devvp->v_interlock); 741 pstats->p_ru.ru_oublock++; 742 } else { 743 pstats->p_ru.ru_inblock++; 744 } 745 746 bp = getiobuf(devvp, true); 747 bp->b_flags = flags; 748 bp->b_cflags = BC_BUSY; /* silly & dubious */ 749 bp->b_dev = devvp->v_rdev; 750 bp->b_data = data; 751 bp->b_bufsize = bp->b_resid = bp->b_bcount = len; 752 bp->b_blkno = pbn; 753 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 754 755 WAPBL_PRINTF(WAPBL_PRINT_IO, 756 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n", 757 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, 758 bp->b_blkno, bp->b_dev)); 759 760 VOP_STRATEGY(devvp, bp); 761 762 error = biowait(bp); 763 putiobuf(bp); 764 765 if (error) { 766 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 767 ("wapbl_doio: %s %zu bytes at block %" PRId64 768 " on dev 0x%"PRIx64" failed with error %d\n", 769 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 770 "write" : "read"), 771 len, pbn, devvp->v_rdev, error)); 772 } 773 774 return error; 775 } 776 777 int 778 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 779 { 780 781 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 782 } 783 784 int 785 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 786 { 787 788 return wapbl_doio(data, len, devvp, pbn, B_READ); 789 } 790 791 /* 792 * Flush buffered data if any. 793 */ 794 static int 795 wapbl_buffered_flush(struct wapbl *wl) 796 { 797 int error; 798 799 if (wl->wl_buffer_used == 0) 800 return 0; 801 802 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, 803 wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); 804 wl->wl_buffer_used = 0; 805 806 return error; 807 } 808 809 /* 810 * Write data to the log. 811 * Try to coalesce writes and emit MAXPHYS aligned blocks. 812 */ 813 static int 814 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn) 815 { 816 int error; 817 size_t resid; 818 819 /* 820 * If not adjacent to buffered data flush first. Disk block 821 * address is always valid for non-empty buffer. 822 */ 823 if (wl->wl_buffer_used > 0 && 824 pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) { 825 error = wapbl_buffered_flush(wl); 826 if (error) 827 return error; 828 } 829 /* 830 * If this write goes to an empty buffer we have to 831 * save the disk block address first. 832 */ 833 if (wl->wl_buffer_used == 0) 834 wl->wl_buffer_dblk = pbn; 835 /* 836 * Remaining space so this buffer ends on a MAXPHYS boundary. 837 * 838 * Cannot become less or equal zero as the buffer would have been 839 * flushed on the last call then. 840 */ 841 resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) - 842 wl->wl_buffer_used; 843 KASSERT(resid > 0); 844 KASSERT(dbtob(btodb(resid)) == resid); 845 if (len >= resid) { 846 memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid); 847 wl->wl_buffer_used += resid; 848 error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, 849 wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); 850 data = (uint8_t *)data + resid; 851 len -= resid; 852 wl->wl_buffer_dblk = pbn + btodb(resid); 853 wl->wl_buffer_used = 0; 854 if (error) 855 return error; 856 } 857 KASSERT(len < MAXPHYS); 858 if (len > 0) { 859 memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len); 860 wl->wl_buffer_used += len; 861 } 862 863 return 0; 864 } 865 866 /* 867 * Off is byte offset returns new offset for next write 868 * handles log wraparound 869 */ 870 static int 871 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) 872 { 873 size_t slen; 874 off_t off = *offp; 875 int error; 876 daddr_t pbn; 877 878 KDASSERT(((len >> wl->wl_log_dev_bshift) << 879 wl->wl_log_dev_bshift) == len); 880 881 if (off < wl->wl_circ_off) 882 off = wl->wl_circ_off; 883 slen = wl->wl_circ_off + wl->wl_circ_size - off; 884 if (slen < len) { 885 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 886 #ifdef _KERNEL 887 pbn = btodb(pbn << wl->wl_log_dev_bshift); 888 #endif 889 error = wapbl_buffered_write(data, slen, wl, pbn); 890 if (error) 891 return error; 892 data = (uint8_t *)data + slen; 893 len -= slen; 894 off = wl->wl_circ_off; 895 } 896 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 897 #ifdef _KERNEL 898 pbn = btodb(pbn << wl->wl_log_dev_bshift); 899 #endif 900 error = wapbl_buffered_write(data, len, wl, pbn); 901 if (error) 902 return error; 903 off += len; 904 if (off >= wl->wl_circ_off + wl->wl_circ_size) 905 off = wl->wl_circ_off; 906 *offp = off; 907 return 0; 908 } 909 910 /****************************************************************/ 911 912 int 913 wapbl_begin(struct wapbl *wl, const char *file, int line) 914 { 915 int doflush; 916 unsigned lockcount; 917 918 KDASSERT(wl); 919 920 /* 921 * XXX this needs to be made much more sophisticated. 922 * perhaps each wapbl_begin could reserve a specified 923 * number of buffers and bytes. 924 */ 925 mutex_enter(&wl->wl_mtx); 926 lockcount = wl->wl_lock_count; 927 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 928 wl->wl_bufbytes_max / 2) || 929 ((wl->wl_bufcount + (lockcount * 10)) > 930 wl->wl_bufcount_max / 2) || 931 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 932 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); 933 mutex_exit(&wl->wl_mtx); 934 935 if (doflush) { 936 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 937 ("force flush lockcnt=%d bufbytes=%zu " 938 "(max=%zu) bufcount=%zu (max=%zu) " 939 "dealloccnt %d (lim=%d)\n", 940 lockcount, wl->wl_bufbytes, 941 wl->wl_bufbytes_max, wl->wl_bufcount, 942 wl->wl_bufcount_max, 943 wl->wl_dealloccnt, wl->wl_dealloclim)); 944 } 945 946 if (doflush) { 947 int error = wapbl_flush(wl, 0); 948 if (error) 949 return error; 950 } 951 952 rw_enter(&wl->wl_rwlock, RW_READER); 953 mutex_enter(&wl->wl_mtx); 954 wl->wl_lock_count++; 955 mutex_exit(&wl->wl_mtx); 956 957 #if defined(WAPBL_DEBUG_PRINT) 958 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 959 ("wapbl_begin thread %d.%d with bufcount=%zu " 960 "bufbytes=%zu bcount=%zu at %s:%d\n", 961 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 962 wl->wl_bufbytes, wl->wl_bcount, file, line)); 963 #endif 964 965 return 0; 966 } 967 968 void 969 wapbl_end(struct wapbl *wl) 970 { 971 972 #if defined(WAPBL_DEBUG_PRINT) 973 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 974 ("wapbl_end thread %d.%d with bufcount=%zu " 975 "bufbytes=%zu bcount=%zu\n", 976 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 977 wl->wl_bufbytes, wl->wl_bcount)); 978 #endif 979 980 #ifdef DIAGNOSTIC 981 size_t flushsize = wapbl_transaction_len(wl); 982 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 983 /* 984 * XXX this could be handled more gracefully, perhaps place 985 * only a partial transaction in the log and allow the 986 * remaining to flush without the protection of the journal. 987 */ 988 panic("wapbl_end: current transaction too big to flush\n"); 989 } 990 #endif 991 992 mutex_enter(&wl->wl_mtx); 993 KASSERT(wl->wl_lock_count > 0); 994 wl->wl_lock_count--; 995 mutex_exit(&wl->wl_mtx); 996 997 rw_exit(&wl->wl_rwlock); 998 } 999 1000 void 1001 wapbl_add_buf(struct wapbl *wl, struct buf * bp) 1002 { 1003 1004 KASSERT(bp->b_cflags & BC_BUSY); 1005 KASSERT(bp->b_vp); 1006 1007 wapbl_jlock_assert(wl); 1008 1009 #if 0 1010 /* 1011 * XXX this might be an issue for swapfiles. 1012 * see uvm_swap.c:1702 1013 * 1014 * XXX2 why require it then? leap of semantics? 1015 */ 1016 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 1017 #endif 1018 1019 mutex_enter(&wl->wl_mtx); 1020 if (bp->b_flags & B_LOCKED) { 1021 LIST_REMOVE(bp, b_wapbllist); 1022 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 1023 ("wapbl_add_buf thread %d.%d re-adding buf %p " 1024 "with %d bytes %d bcount\n", 1025 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 1026 bp->b_bcount)); 1027 } else { 1028 /* unlocked by dirty buffers shouldn't exist */ 1029 KASSERT(!(bp->b_oflags & BO_DELWRI)); 1030 wl->wl_bufbytes += bp->b_bufsize; 1031 wl->wl_bcount += bp->b_bcount; 1032 wl->wl_bufcount++; 1033 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 1034 ("wapbl_add_buf thread %d.%d adding buf %p " 1035 "with %d bytes %d bcount\n", 1036 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 1037 bp->b_bcount)); 1038 } 1039 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 1040 mutex_exit(&wl->wl_mtx); 1041 1042 bp->b_flags |= B_LOCKED; 1043 } 1044 1045 static void 1046 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 1047 { 1048 1049 KASSERT(mutex_owned(&wl->wl_mtx)); 1050 KASSERT(bp->b_cflags & BC_BUSY); 1051 wapbl_jlock_assert(wl); 1052 1053 #if 0 1054 /* 1055 * XXX this might be an issue for swapfiles. 1056 * see uvm_swap.c:1725 1057 * 1058 * XXXdeux: see above 1059 */ 1060 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 1061 #endif 1062 KASSERT(bp->b_flags & B_LOCKED); 1063 1064 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 1065 ("wapbl_remove_buf thread %d.%d removing buf %p with " 1066 "%d bytes %d bcount\n", 1067 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 1068 1069 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 1070 wl->wl_bufbytes -= bp->b_bufsize; 1071 KASSERT(wl->wl_bcount >= bp->b_bcount); 1072 wl->wl_bcount -= bp->b_bcount; 1073 KASSERT(wl->wl_bufcount > 0); 1074 wl->wl_bufcount--; 1075 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1076 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1077 LIST_REMOVE(bp, b_wapbllist); 1078 1079 bp->b_flags &= ~B_LOCKED; 1080 } 1081 1082 /* called from brelsel() in vfs_bio among other places */ 1083 void 1084 wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 1085 { 1086 1087 mutex_enter(&wl->wl_mtx); 1088 wapbl_remove_buf_locked(wl, bp); 1089 mutex_exit(&wl->wl_mtx); 1090 } 1091 1092 void 1093 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 1094 { 1095 1096 KASSERT(bp->b_cflags & BC_BUSY); 1097 1098 /* 1099 * XXX: why does this depend on B_LOCKED? otherwise the buf 1100 * is not for a transaction? if so, why is this called in the 1101 * first place? 1102 */ 1103 if (bp->b_flags & B_LOCKED) { 1104 mutex_enter(&wl->wl_mtx); 1105 wl->wl_bufbytes += bp->b_bufsize - oldsz; 1106 wl->wl_bcount += bp->b_bcount - oldcnt; 1107 mutex_exit(&wl->wl_mtx); 1108 } 1109 } 1110 1111 #endif /* _KERNEL */ 1112 1113 /****************************************************************/ 1114 /* Some utility inlines */ 1115 1116 static inline size_t 1117 wapbl_space_used(size_t avail, off_t head, off_t tail) 1118 { 1119 1120 if (tail == 0) { 1121 KASSERT(head == 0); 1122 return 0; 1123 } 1124 return ((head + (avail - 1) - tail) % avail) + 1; 1125 } 1126 1127 #ifdef _KERNEL 1128 /* This is used to advance the pointer at old to new value at old+delta */ 1129 static inline off_t 1130 wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 1131 { 1132 off_t new; 1133 1134 /* Define acceptable ranges for inputs. */ 1135 KASSERT(delta <= (size_t)size); 1136 KASSERT((old == 0) || ((size_t)old >= off)); 1137 KASSERT(old < (off_t)(size + off)); 1138 1139 if ((old == 0) && (delta != 0)) 1140 new = off + delta; 1141 else if ((old + delta) < (size + off)) 1142 new = old + delta; 1143 else 1144 new = (old + delta) - size; 1145 1146 /* Note some interesting axioms */ 1147 KASSERT((delta != 0) || (new == old)); 1148 KASSERT((delta == 0) || (new != 0)); 1149 KASSERT((delta != (size)) || (new == old)); 1150 1151 /* Define acceptable ranges for output. */ 1152 KASSERT((new == 0) || ((size_t)new >= off)); 1153 KASSERT((size_t)new < (size + off)); 1154 return new; 1155 } 1156 1157 static inline size_t 1158 wapbl_space_free(size_t avail, off_t head, off_t tail) 1159 { 1160 1161 return avail - wapbl_space_used(avail, head, tail); 1162 } 1163 1164 static inline void 1165 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, 1166 off_t *tailp) 1167 { 1168 off_t head = *headp; 1169 off_t tail = *tailp; 1170 1171 KASSERT(delta <= wapbl_space_free(size, head, tail)); 1172 head = wapbl_advance(size, off, head, delta); 1173 if ((tail == 0) && (head != 0)) 1174 tail = off; 1175 *headp = head; 1176 *tailp = tail; 1177 } 1178 1179 static inline void 1180 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, 1181 off_t *tailp) 1182 { 1183 off_t head = *headp; 1184 off_t tail = *tailp; 1185 1186 KASSERT(delta <= wapbl_space_used(size, head, tail)); 1187 tail = wapbl_advance(size, off, tail, delta); 1188 if (head == tail) { 1189 head = tail = 0; 1190 } 1191 *headp = head; 1192 *tailp = tail; 1193 } 1194 1195 1196 /****************************************************************/ 1197 1198 /* 1199 * Remove transactions whose buffers are completely flushed to disk. 1200 * Will block until at least minfree space is available. 1201 * only intended to be called from inside wapbl_flush and therefore 1202 * does not protect against commit races with itself or with flush. 1203 */ 1204 static int 1205 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) 1206 { 1207 size_t delta; 1208 size_t avail; 1209 off_t head; 1210 off_t tail; 1211 int error = 0; 1212 1213 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1214 KASSERT(rw_write_held(&wl->wl_rwlock)); 1215 1216 mutex_enter(&wl->wl_mtx); 1217 1218 /* 1219 * First check to see if we have to do a commit 1220 * at all. 1221 */ 1222 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1223 if (minfree < avail) { 1224 mutex_exit(&wl->wl_mtx); 1225 return 0; 1226 } 1227 minfree -= avail; 1228 while ((wl->wl_error_count == 0) && 1229 (wl->wl_reclaimable_bytes < minfree)) { 1230 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1231 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1232 "minfree=%zd\n", 1233 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1234 minfree)); 1235 1236 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1237 } 1238 if (wl->wl_reclaimable_bytes < minfree) { 1239 KASSERT(wl->wl_error_count); 1240 /* XXX maybe get actual error from buffer instead someday? */ 1241 error = EIO; 1242 } 1243 head = wl->wl_head; 1244 tail = wl->wl_tail; 1245 delta = wl->wl_reclaimable_bytes; 1246 1247 /* If all of of the entries are flushed, then be sure to keep 1248 * the reserved bytes reserved. Watch out for discarded transactions, 1249 * which could leave more bytes reserved than are reclaimable. 1250 */ 1251 if (SIMPLEQ_EMPTY(&wl->wl_entries) && 1252 (delta >= wl->wl_reserved_bytes)) { 1253 delta -= wl->wl_reserved_bytes; 1254 } 1255 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1256 &tail); 1257 KDASSERT(wl->wl_reserved_bytes <= 1258 wapbl_space_used(wl->wl_circ_size, head, tail)); 1259 mutex_exit(&wl->wl_mtx); 1260 1261 if (error) 1262 return error; 1263 1264 if (waitonly) 1265 return 0; 1266 1267 /* 1268 * This is where head, tail and delta are unprotected 1269 * from races against itself or flush. This is ok since 1270 * we only call this routine from inside flush itself. 1271 * 1272 * XXX: how can it race against itself when accessed only 1273 * from behind the write-locked rwlock? 1274 */ 1275 error = wapbl_write_commit(wl, head, tail); 1276 if (error) 1277 return error; 1278 1279 wl->wl_head = head; 1280 wl->wl_tail = tail; 1281 1282 mutex_enter(&wl->wl_mtx); 1283 KASSERT(wl->wl_reclaimable_bytes >= delta); 1284 wl->wl_reclaimable_bytes -= delta; 1285 mutex_exit(&wl->wl_mtx); 1286 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1287 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1288 curproc->p_pid, curlwp->l_lid, delta)); 1289 1290 return 0; 1291 } 1292 1293 /****************************************************************/ 1294 1295 void 1296 wapbl_biodone(struct buf *bp) 1297 { 1298 struct wapbl_entry *we = bp->b_private; 1299 struct wapbl *wl = we->we_wapbl; 1300 #ifdef WAPBL_DEBUG_BUFBYTES 1301 const int bufsize = bp->b_bufsize; 1302 #endif 1303 1304 /* 1305 * Handle possible flushing of buffers after log has been 1306 * decomissioned. 1307 */ 1308 if (!wl) { 1309 KASSERT(we->we_bufcount > 0); 1310 we->we_bufcount--; 1311 #ifdef WAPBL_DEBUG_BUFBYTES 1312 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1313 we->we_unsynced_bufbytes -= bufsize; 1314 #endif 1315 1316 if (we->we_bufcount == 0) { 1317 #ifdef WAPBL_DEBUG_BUFBYTES 1318 KASSERT(we->we_unsynced_bufbytes == 0); 1319 #endif 1320 pool_put(&wapbl_entry_pool, we); 1321 } 1322 1323 brelse(bp, 0); 1324 return; 1325 } 1326 1327 #ifdef ohbother 1328 KDASSERT(bp->b_oflags & BO_DONE); 1329 KDASSERT(!(bp->b_oflags & BO_DELWRI)); 1330 KDASSERT(bp->b_flags & B_ASYNC); 1331 KDASSERT(bp->b_cflags & BC_BUSY); 1332 KDASSERT(!(bp->b_flags & B_LOCKED)); 1333 KDASSERT(!(bp->b_flags & B_READ)); 1334 KDASSERT(!(bp->b_cflags & BC_INVAL)); 1335 KDASSERT(!(bp->b_cflags & BC_NOCACHE)); 1336 #endif 1337 1338 if (bp->b_error) { 1339 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */ 1340 /* 1341 * XXXpooka: interfaces not fully updated 1342 * Note: this was not enabled in the original patch 1343 * against netbsd4 either. I don't know if comment 1344 * above is true or not. 1345 */ 1346 1347 /* 1348 * If an error occurs, report the error and leave the 1349 * buffer as a delayed write on the LRU queue. 1350 * restarting the write would likely result in 1351 * an error spinloop, so let it be done harmlessly 1352 * by the syncer. 1353 */ 1354 bp->b_flags &= ~(B_DONE); 1355 simple_unlock(&bp->b_interlock); 1356 1357 if (we->we_error == 0) { 1358 mutex_enter(&wl->wl_mtx); 1359 wl->wl_error_count++; 1360 mutex_exit(&wl->wl_mtx); 1361 cv_broadcast(&wl->wl_reclaimable_cv); 1362 } 1363 we->we_error = bp->b_error; 1364 bp->b_error = 0; 1365 brelse(bp); 1366 return; 1367 #else 1368 /* For now, just mark the log permanently errored out */ 1369 1370 mutex_enter(&wl->wl_mtx); 1371 if (wl->wl_error_count == 0) { 1372 wl->wl_error_count++; 1373 cv_broadcast(&wl->wl_reclaimable_cv); 1374 } 1375 mutex_exit(&wl->wl_mtx); 1376 #endif 1377 } 1378 1379 /* 1380 * Release the buffer here. wapbl_flush() may wait for the 1381 * log to become empty and we better unbusy the buffer before 1382 * wapbl_flush() returns. 1383 */ 1384 brelse(bp, 0); 1385 1386 mutex_enter(&wl->wl_mtx); 1387 1388 KASSERT(we->we_bufcount > 0); 1389 we->we_bufcount--; 1390 #ifdef WAPBL_DEBUG_BUFBYTES 1391 KASSERT(we->we_unsynced_bufbytes >= bufsize); 1392 we->we_unsynced_bufbytes -= bufsize; 1393 KASSERT(wl->wl_unsynced_bufbytes >= bufsize); 1394 wl->wl_unsynced_bufbytes -= bufsize; 1395 #endif 1396 1397 /* 1398 * If the current transaction can be reclaimed, start 1399 * at the beginning and reclaim any consecutive reclaimable 1400 * transactions. If we successfully reclaim anything, 1401 * then wakeup anyone waiting for the reclaim. 1402 */ 1403 if (we->we_bufcount == 0) { 1404 size_t delta = 0; 1405 int errcnt = 0; 1406 #ifdef WAPBL_DEBUG_BUFBYTES 1407 KDASSERT(we->we_unsynced_bufbytes == 0); 1408 #endif 1409 /* 1410 * clear any posted error, since the buffer it came from 1411 * has successfully flushed by now 1412 */ 1413 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1414 (we->we_bufcount == 0)) { 1415 delta += we->we_reclaimable_bytes; 1416 if (we->we_error) 1417 errcnt++; 1418 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1419 pool_put(&wapbl_entry_pool, we); 1420 } 1421 1422 if (delta) { 1423 wl->wl_reclaimable_bytes += delta; 1424 KASSERT(wl->wl_error_count >= errcnt); 1425 wl->wl_error_count -= errcnt; 1426 cv_broadcast(&wl->wl_reclaimable_cv); 1427 } 1428 } 1429 1430 mutex_exit(&wl->wl_mtx); 1431 } 1432 1433 /* 1434 * Write transactions to disk + start I/O for contents 1435 */ 1436 int 1437 wapbl_flush(struct wapbl *wl, int waitfor) 1438 { 1439 struct buf *bp; 1440 struct wapbl_entry *we; 1441 off_t off; 1442 off_t head; 1443 off_t tail; 1444 size_t delta = 0; 1445 size_t flushsize; 1446 size_t reserved; 1447 int error = 0; 1448 1449 /* 1450 * Do a quick check to see if a full flush can be skipped 1451 * This assumes that the flush callback does not need to be called 1452 * unless there are other outstanding bufs. 1453 */ 1454 if (!waitfor) { 1455 size_t nbufs; 1456 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1457 protect the KASSERTS */ 1458 nbufs = wl->wl_bufcount; 1459 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1460 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1461 mutex_exit(&wl->wl_mtx); 1462 if (nbufs == 0) 1463 return 0; 1464 } 1465 1466 /* 1467 * XXX we may consider using LK_UPGRADE here 1468 * if we want to call flush from inside a transaction 1469 */ 1470 rw_enter(&wl->wl_rwlock, RW_WRITER); 1471 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1472 wl->wl_dealloccnt); 1473 1474 /* 1475 * Now that we are fully locked and flushed, 1476 * do another check for nothing to do. 1477 */ 1478 if (wl->wl_bufcount == 0) { 1479 goto out; 1480 } 1481 1482 #if 0 1483 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1484 ("wapbl_flush thread %d.%d flushing entries with " 1485 "bufcount=%zu bufbytes=%zu\n", 1486 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1487 wl->wl_bufbytes)); 1488 #endif 1489 1490 /* Calculate amount of space needed to flush */ 1491 flushsize = wapbl_transaction_len(wl); 1492 if (wapbl_verbose_commit) { 1493 struct timespec ts; 1494 getnanotime(&ts); 1495 printf("%s: %lld.%09ld this transaction = %zu bytes\n", 1496 __func__, (long long)ts.tv_sec, 1497 (long)ts.tv_nsec, flushsize); 1498 } 1499 1500 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1501 /* 1502 * XXX this could be handled more gracefully, perhaps place 1503 * only a partial transaction in the log and allow the 1504 * remaining to flush without the protection of the journal. 1505 */ 1506 panic("wapbl_flush: current transaction too big to flush\n"); 1507 } 1508 1509 error = wapbl_truncate(wl, flushsize, 0); 1510 if (error) 1511 goto out2; 1512 1513 off = wl->wl_head; 1514 KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 1515 (off < wl->wl_circ_off + wl->wl_circ_size))); 1516 error = wapbl_write_blocks(wl, &off); 1517 if (error) 1518 goto out2; 1519 error = wapbl_write_revocations(wl, &off); 1520 if (error) 1521 goto out2; 1522 error = wapbl_write_inodes(wl, &off); 1523 if (error) 1524 goto out2; 1525 1526 reserved = 0; 1527 if (wl->wl_inohashcnt) 1528 reserved = wapbl_transaction_inodes_len(wl); 1529 1530 head = wl->wl_head; 1531 tail = wl->wl_tail; 1532 1533 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1534 &head, &tail); 1535 #ifdef WAPBL_DEBUG 1536 if (head != off) { 1537 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1538 " off=%"PRIdMAX" flush=%zu\n", 1539 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1540 flushsize); 1541 } 1542 #else 1543 KASSERT(head == off); 1544 #endif 1545 1546 /* Opportunistically move the tail forward if we can */ 1547 if (!wapbl_lazy_truncate) { 1548 mutex_enter(&wl->wl_mtx); 1549 delta = wl->wl_reclaimable_bytes; 1550 mutex_exit(&wl->wl_mtx); 1551 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1552 &head, &tail); 1553 } 1554 1555 error = wapbl_write_commit(wl, head, tail); 1556 if (error) 1557 goto out2; 1558 1559 we = pool_get(&wapbl_entry_pool, PR_WAITOK); 1560 1561 #ifdef WAPBL_DEBUG_BUFBYTES 1562 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1563 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1564 " unsynced=%zu" 1565 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1566 "inodes=%d\n", 1567 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1568 wapbl_space_used(wl->wl_circ_size, head, tail), 1569 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1570 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1571 wl->wl_inohashcnt)); 1572 #else 1573 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1574 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1575 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1576 "inodes=%d\n", 1577 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1578 wapbl_space_used(wl->wl_circ_size, head, tail), 1579 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1580 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1581 #endif 1582 1583 1584 mutex_enter(&bufcache_lock); 1585 mutex_enter(&wl->wl_mtx); 1586 1587 wl->wl_reserved_bytes = reserved; 1588 wl->wl_head = head; 1589 wl->wl_tail = tail; 1590 KASSERT(wl->wl_reclaimable_bytes >= delta); 1591 wl->wl_reclaimable_bytes -= delta; 1592 wl->wl_dealloccnt = 0; 1593 #ifdef WAPBL_DEBUG_BUFBYTES 1594 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1595 #endif 1596 1597 we->we_wapbl = wl; 1598 we->we_bufcount = wl->wl_bufcount; 1599 #ifdef WAPBL_DEBUG_BUFBYTES 1600 we->we_unsynced_bufbytes = wl->wl_bufbytes; 1601 #endif 1602 we->we_reclaimable_bytes = flushsize; 1603 we->we_error = 0; 1604 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); 1605 1606 /* 1607 * this flushes bufs in reverse order than they were queued 1608 * it shouldn't matter, but if we care we could use TAILQ instead. 1609 * XXX Note they will get put on the lru queue when they flush 1610 * so we might actually want to change this to preserve order. 1611 */ 1612 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1613 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1614 continue; 1615 } 1616 bp->b_iodone = wapbl_biodone; 1617 bp->b_private = we; 1618 bremfree(bp); 1619 wapbl_remove_buf_locked(wl, bp); 1620 mutex_exit(&wl->wl_mtx); 1621 mutex_exit(&bufcache_lock); 1622 bawrite(bp); 1623 mutex_enter(&bufcache_lock); 1624 mutex_enter(&wl->wl_mtx); 1625 } 1626 mutex_exit(&wl->wl_mtx); 1627 mutex_exit(&bufcache_lock); 1628 1629 #if 0 1630 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1631 ("wapbl_flush thread %d.%d done flushing entries...\n", 1632 curproc->p_pid, curlwp->l_lid)); 1633 #endif 1634 1635 out: 1636 1637 /* 1638 * If the waitfor flag is set, don't return until everything is 1639 * fully flushed and the on disk log is empty. 1640 */ 1641 if (waitfor) { 1642 error = wapbl_truncate(wl, wl->wl_circ_size - 1643 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1644 } 1645 1646 out2: 1647 if (error) { 1648 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1649 wl->wl_dealloclens, wl->wl_dealloccnt); 1650 } 1651 1652 #ifdef WAPBL_DEBUG_PRINT 1653 if (error) { 1654 pid_t pid = -1; 1655 lwpid_t lid = -1; 1656 if (curproc) 1657 pid = curproc->p_pid; 1658 if (curlwp) 1659 lid = curlwp->l_lid; 1660 mutex_enter(&wl->wl_mtx); 1661 #ifdef WAPBL_DEBUG_BUFBYTES 1662 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1663 ("wapbl_flush: thread %d.%d aborted flush: " 1664 "error = %d\n" 1665 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1666 "deallocs=%d inodes=%d\n" 1667 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1668 "unsynced=%zu\n", 1669 pid, lid, error, wl->wl_bufcount, 1670 wl->wl_bufbytes, wl->wl_bcount, 1671 wl->wl_dealloccnt, wl->wl_inohashcnt, 1672 wl->wl_error_count, wl->wl_reclaimable_bytes, 1673 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1674 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1675 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1676 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1677 "error = %d, unsynced = %zu\n", 1678 we->we_bufcount, we->we_reclaimable_bytes, 1679 we->we_error, we->we_unsynced_bufbytes)); 1680 } 1681 #else 1682 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1683 ("wapbl_flush: thread %d.%d aborted flush: " 1684 "error = %d\n" 1685 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1686 "deallocs=%d inodes=%d\n" 1687 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1688 pid, lid, error, wl->wl_bufcount, 1689 wl->wl_bufbytes, wl->wl_bcount, 1690 wl->wl_dealloccnt, wl->wl_inohashcnt, 1691 wl->wl_error_count, wl->wl_reclaimable_bytes, 1692 wl->wl_reserved_bytes)); 1693 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1694 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1695 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1696 "error = %d\n", we->we_bufcount, 1697 we->we_reclaimable_bytes, we->we_error)); 1698 } 1699 #endif 1700 mutex_exit(&wl->wl_mtx); 1701 } 1702 #endif 1703 1704 rw_exit(&wl->wl_rwlock); 1705 return error; 1706 } 1707 1708 /****************************************************************/ 1709 1710 void 1711 wapbl_jlock_assert(struct wapbl *wl) 1712 { 1713 1714 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1715 } 1716 1717 void 1718 wapbl_junlock_assert(struct wapbl *wl) 1719 { 1720 1721 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1722 } 1723 1724 /****************************************************************/ 1725 1726 /* locks missing */ 1727 void 1728 wapbl_print(struct wapbl *wl, 1729 int full, 1730 void (*pr)(const char *, ...)) 1731 { 1732 struct buf *bp; 1733 struct wapbl_entry *we; 1734 (*pr)("wapbl %p", wl); 1735 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1736 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1737 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1738 wl->wl_circ_size, wl->wl_circ_off, 1739 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1740 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1741 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1742 #ifdef WAPBL_DEBUG_BUFBYTES 1743 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1744 "reserved = %zu errcnt = %d unsynced = %zu\n", 1745 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1746 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1747 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1748 #else 1749 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1750 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1751 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1752 wl->wl_error_count); 1753 #endif 1754 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1755 wl->wl_dealloccnt, wl->wl_dealloclim); 1756 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1757 wl->wl_inohashcnt, wl->wl_inohashmask); 1758 (*pr)("entries:\n"); 1759 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1760 #ifdef WAPBL_DEBUG_BUFBYTES 1761 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1762 "unsynced = %zu\n", 1763 we->we_bufcount, we->we_reclaimable_bytes, 1764 we->we_error, we->we_unsynced_bufbytes); 1765 #else 1766 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1767 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1768 #endif 1769 } 1770 if (full) { 1771 int cnt = 0; 1772 (*pr)("bufs ="); 1773 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1774 if (!LIST_NEXT(bp, b_wapbllist)) { 1775 (*pr)(" %p", bp); 1776 } else if ((++cnt % 6) == 0) { 1777 (*pr)(" %p,\n\t", bp); 1778 } else { 1779 (*pr)(" %p,", bp); 1780 } 1781 } 1782 (*pr)("\n"); 1783 1784 (*pr)("dealloced blks = "); 1785 { 1786 int i; 1787 cnt = 0; 1788 for (i = 0; i < wl->wl_dealloccnt; i++) { 1789 (*pr)(" %"PRId64":%d,", 1790 wl->wl_deallocblks[i], 1791 wl->wl_dealloclens[i]); 1792 if ((++cnt % 4) == 0) { 1793 (*pr)("\n\t"); 1794 } 1795 } 1796 } 1797 (*pr)("\n"); 1798 1799 (*pr)("registered inodes = "); 1800 { 1801 int i; 1802 cnt = 0; 1803 for (i = 0; i <= wl->wl_inohashmask; i++) { 1804 struct wapbl_ino_head *wih; 1805 struct wapbl_ino *wi; 1806 1807 wih = &wl->wl_inohash[i]; 1808 LIST_FOREACH(wi, wih, wi_hash) { 1809 if (wi->wi_ino == 0) 1810 continue; 1811 (*pr)(" %"PRIu64"/0%06"PRIo32",", 1812 wi->wi_ino, wi->wi_mode); 1813 if ((++cnt % 4) == 0) { 1814 (*pr)("\n\t"); 1815 } 1816 } 1817 } 1818 (*pr)("\n"); 1819 } 1820 } 1821 } 1822 1823 #if defined(WAPBL_DEBUG) || defined(DDB) 1824 void 1825 wapbl_dump(struct wapbl *wl) 1826 { 1827 #if defined(WAPBL_DEBUG) 1828 if (!wl) 1829 wl = wapbl_debug_wl; 1830 #endif 1831 if (!wl) 1832 return; 1833 wapbl_print(wl, 1, printf); 1834 } 1835 #endif 1836 1837 /****************************************************************/ 1838 1839 void 1840 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) 1841 { 1842 1843 wapbl_jlock_assert(wl); 1844 1845 mutex_enter(&wl->wl_mtx); 1846 /* XXX should eventually instead tie this into resource estimation */ 1847 /* 1848 * XXX this panic needs locking/mutex analysis and the 1849 * ability to cope with the failure. 1850 */ 1851 /* XXX this XXX doesn't have enough XXX */ 1852 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) 1853 panic("wapbl_register_deallocation: out of resources"); 1854 1855 wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 1856 wl->wl_dealloclens[wl->wl_dealloccnt] = len; 1857 wl->wl_dealloccnt++; 1858 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1859 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); 1860 mutex_exit(&wl->wl_mtx); 1861 } 1862 1863 /****************************************************************/ 1864 1865 static void 1866 wapbl_inodetrk_init(struct wapbl *wl, u_int size) 1867 { 1868 1869 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 1870 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 1871 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 1872 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 1873 } 1874 } 1875 1876 static void 1877 wapbl_inodetrk_free(struct wapbl *wl) 1878 { 1879 1880 /* XXX this KASSERT needs locking/mutex analysis */ 1881 KASSERT(wl->wl_inohashcnt == 0); 1882 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); 1883 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 1884 pool_destroy(&wapbl_ino_pool); 1885 } 1886 } 1887 1888 static struct wapbl_ino * 1889 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 1890 { 1891 struct wapbl_ino_head *wih; 1892 struct wapbl_ino *wi; 1893 1894 KASSERT(mutex_owned(&wl->wl_mtx)); 1895 1896 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1897 LIST_FOREACH(wi, wih, wi_hash) { 1898 if (ino == wi->wi_ino) 1899 return wi; 1900 } 1901 return 0; 1902 } 1903 1904 void 1905 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1906 { 1907 struct wapbl_ino_head *wih; 1908 struct wapbl_ino *wi; 1909 1910 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 1911 1912 mutex_enter(&wl->wl_mtx); 1913 if (wapbl_inodetrk_get(wl, ino) == NULL) { 1914 wi->wi_ino = ino; 1915 wi->wi_mode = mode; 1916 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1917 LIST_INSERT_HEAD(wih, wi, wi_hash); 1918 wl->wl_inohashcnt++; 1919 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1920 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 1921 mutex_exit(&wl->wl_mtx); 1922 } else { 1923 mutex_exit(&wl->wl_mtx); 1924 pool_put(&wapbl_ino_pool, wi); 1925 } 1926 } 1927 1928 void 1929 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1930 { 1931 struct wapbl_ino *wi; 1932 1933 mutex_enter(&wl->wl_mtx); 1934 wi = wapbl_inodetrk_get(wl, ino); 1935 if (wi) { 1936 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1937 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 1938 KASSERT(wl->wl_inohashcnt > 0); 1939 wl->wl_inohashcnt--; 1940 LIST_REMOVE(wi, wi_hash); 1941 mutex_exit(&wl->wl_mtx); 1942 1943 pool_put(&wapbl_ino_pool, wi); 1944 } else { 1945 mutex_exit(&wl->wl_mtx); 1946 } 1947 } 1948 1949 /****************************************************************/ 1950 1951 static inline size_t 1952 wapbl_transaction_inodes_len(struct wapbl *wl) 1953 { 1954 int blocklen = 1<<wl->wl_log_dev_bshift; 1955 int iph; 1956 1957 /* Calculate number of inodes described in a inodelist header */ 1958 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 1959 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 1960 1961 KASSERT(iph > 0); 1962 1963 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen; 1964 } 1965 1966 1967 /* Calculate amount of space a transaction will take on disk */ 1968 static size_t 1969 wapbl_transaction_len(struct wapbl *wl) 1970 { 1971 int blocklen = 1<<wl->wl_log_dev_bshift; 1972 size_t len; 1973 int bph; 1974 1975 /* Calculate number of blocks described in a blocklist header */ 1976 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1977 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1978 1979 KASSERT(bph > 0); 1980 1981 len = wl->wl_bcount; 1982 len += howmany(wl->wl_bufcount, bph) * blocklen; 1983 len += howmany(wl->wl_dealloccnt, bph) * blocklen; 1984 len += wapbl_transaction_inodes_len(wl); 1985 1986 return len; 1987 } 1988 1989 /* 1990 * wapbl_cache_sync: issue DIOCCACHESYNC 1991 */ 1992 static int 1993 wapbl_cache_sync(struct wapbl *wl, const char *msg) 1994 { 1995 const bool verbose = wapbl_verbose_commit >= 2; 1996 struct bintime start_time; 1997 int force = 1; 1998 int error; 1999 2000 if (!wapbl_flush_disk_cache) { 2001 return 0; 2002 } 2003 if (verbose) { 2004 bintime(&start_time); 2005 } 2006 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 2007 FWRITE, FSCRED); 2008 if (error) { 2009 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 2010 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%x " 2011 "returned %d\n", wl->wl_devvp->v_rdev, error)); 2012 } 2013 if (verbose) { 2014 struct bintime d; 2015 struct timespec ts; 2016 2017 bintime(&d); 2018 bintime_sub(&d, &start_time); 2019 bintime2timespec(&d, &ts); 2020 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n", 2021 msg, (uintmax_t)wl->wl_devvp->v_rdev, 2022 (uintmax_t)ts.tv_sec, ts.tv_nsec); 2023 } 2024 return error; 2025 } 2026 2027 /* 2028 * Perform commit operation 2029 * 2030 * Note that generation number incrementation needs to 2031 * be protected against racing with other invocations 2032 * of wapbl_write_commit. This is ok since this routine 2033 * is only invoked from wapbl_flush 2034 */ 2035 static int 2036 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) 2037 { 2038 struct wapbl_wc_header *wc = wl->wl_wc_header; 2039 struct timespec ts; 2040 int error; 2041 daddr_t pbn; 2042 2043 error = wapbl_buffered_flush(wl); 2044 if (error) 2045 return error; 2046 /* 2047 * flush disk cache to ensure that blocks we've written are actually 2048 * written to the stable storage before the commit header. 2049 * 2050 * XXX Calc checksum here, instead we do this for now 2051 */ 2052 wapbl_cache_sync(wl, "1"); 2053 2054 wc->wc_head = head; 2055 wc->wc_tail = tail; 2056 wc->wc_checksum = 0; 2057 wc->wc_version = 1; 2058 getnanotime(&ts); 2059 wc->wc_time = ts.tv_sec; 2060 wc->wc_timensec = ts.tv_nsec; 2061 2062 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2063 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", 2064 (intmax_t)head, (intmax_t)tail)); 2065 2066 /* 2067 * write the commit header. 2068 * 2069 * XXX if generation will rollover, then first zero 2070 * over second commit header before trying to write both headers. 2071 */ 2072 2073 pbn = wl->wl_logpbn + (wc->wc_generation % 2); 2074 #ifdef _KERNEL 2075 pbn = btodb(pbn << wc->wc_log_dev_bshift); 2076 #endif 2077 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn); 2078 if (error) 2079 return error; 2080 error = wapbl_buffered_flush(wl); 2081 if (error) 2082 return error; 2083 2084 /* 2085 * flush disk cache to ensure that the commit header is actually 2086 * written before meta data blocks. 2087 */ 2088 wapbl_cache_sync(wl, "2"); 2089 2090 /* 2091 * If the generation number was zero, write it out a second time. 2092 * This handles initialization and generation number rollover 2093 */ 2094 if (wc->wc_generation++ == 0) { 2095 error = wapbl_write_commit(wl, head, tail); 2096 /* 2097 * This panic should be able to be removed if we do the 2098 * zero'ing mentioned above, and we are certain to roll 2099 * back generation number on failure. 2100 */ 2101 if (error) 2102 panic("wapbl_write_commit: error writing duplicate " 2103 "log header: %d\n", error); 2104 } 2105 return 0; 2106 } 2107 2108 /* Returns new offset value */ 2109 static int 2110 wapbl_write_blocks(struct wapbl *wl, off_t *offp) 2111 { 2112 struct wapbl_wc_blocklist *wc = 2113 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2114 int blocklen = 1<<wl->wl_log_dev_bshift; 2115 int bph; 2116 struct buf *bp; 2117 off_t off = *offp; 2118 int error; 2119 size_t padding; 2120 2121 KASSERT(rw_write_held(&wl->wl_rwlock)); 2122 2123 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2124 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2125 2126 bp = LIST_FIRST(&wl->wl_bufs); 2127 2128 while (bp) { 2129 int cnt; 2130 struct buf *obp = bp; 2131 2132 KASSERT(bp->b_flags & B_LOCKED); 2133 2134 wc->wc_type = WAPBL_WC_BLOCKS; 2135 wc->wc_len = blocklen; 2136 wc->wc_blkcount = 0; 2137 while (bp && (wc->wc_blkcount < bph)) { 2138 /* 2139 * Make sure all the physical block numbers are up to 2140 * date. If this is not always true on a given 2141 * filesystem, then VOP_BMAP must be called. We 2142 * could call VOP_BMAP here, or else in the filesystem 2143 * specific flush callback, although neither of those 2144 * solutions allow us to take the vnode lock. If a 2145 * filesystem requires that we must take the vnode lock 2146 * to call VOP_BMAP, then we can probably do it in 2147 * bwrite when the vnode lock should already be held 2148 * by the invoking code. 2149 */ 2150 KASSERT((bp->b_vp->v_type == VBLK) || 2151 (bp->b_blkno != bp->b_lblkno)); 2152 KASSERT(bp->b_blkno > 0); 2153 2154 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 2155 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 2156 wc->wc_len += bp->b_bcount; 2157 wc->wc_blkcount++; 2158 bp = LIST_NEXT(bp, b_wapbllist); 2159 } 2160 if (wc->wc_len % blocklen != 0) { 2161 padding = blocklen - wc->wc_len % blocklen; 2162 wc->wc_len += padding; 2163 } else { 2164 padding = 0; 2165 } 2166 2167 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2168 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 2169 wc->wc_len, padding, (intmax_t)off)); 2170 2171 error = wapbl_circ_write(wl, wc, blocklen, &off); 2172 if (error) 2173 return error; 2174 bp = obp; 2175 cnt = 0; 2176 while (bp && (cnt++ < bph)) { 2177 error = wapbl_circ_write(wl, bp->b_data, 2178 bp->b_bcount, &off); 2179 if (error) 2180 return error; 2181 bp = LIST_NEXT(bp, b_wapbllist); 2182 } 2183 if (padding) { 2184 void *zero; 2185 2186 zero = wapbl_alloc(padding); 2187 memset(zero, 0, padding); 2188 error = wapbl_circ_write(wl, zero, padding, &off); 2189 wapbl_free(zero, padding); 2190 if (error) 2191 return error; 2192 } 2193 } 2194 *offp = off; 2195 return 0; 2196 } 2197 2198 static int 2199 wapbl_write_revocations(struct wapbl *wl, off_t *offp) 2200 { 2201 struct wapbl_wc_blocklist *wc = 2202 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2203 int i; 2204 int blocklen = 1<<wl->wl_log_dev_bshift; 2205 int bph; 2206 off_t off = *offp; 2207 int error; 2208 2209 if (wl->wl_dealloccnt == 0) 2210 return 0; 2211 2212 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2213 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2214 2215 i = 0; 2216 while (i < wl->wl_dealloccnt) { 2217 wc->wc_type = WAPBL_WC_REVOCATIONS; 2218 wc->wc_len = blocklen; 2219 wc->wc_blkcount = 0; 2220 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 2221 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 2222 wl->wl_deallocblks[i]; 2223 wc->wc_blocks[wc->wc_blkcount].wc_dlen = 2224 wl->wl_dealloclens[i]; 2225 wc->wc_blkcount++; 2226 i++; 2227 } 2228 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2229 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", 2230 wc->wc_len, (intmax_t)off)); 2231 error = wapbl_circ_write(wl, wc, blocklen, &off); 2232 if (error) 2233 return error; 2234 } 2235 *offp = off; 2236 return 0; 2237 } 2238 2239 static int 2240 wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2241 { 2242 struct wapbl_wc_inodelist *wc = 2243 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2244 int i; 2245 int blocklen = 1 << wl->wl_log_dev_bshift; 2246 off_t off = *offp; 2247 int error; 2248 2249 struct wapbl_ino_head *wih; 2250 struct wapbl_ino *wi; 2251 int iph; 2252 2253 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2254 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2255 2256 i = 0; 2257 wih = &wl->wl_inohash[0]; 2258 wi = 0; 2259 do { 2260 wc->wc_type = WAPBL_WC_INODES; 2261 wc->wc_len = blocklen; 2262 wc->wc_inocnt = 0; 2263 wc->wc_clear = (i == 0); 2264 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2265 while (!wi) { 2266 KASSERT((wih - &wl->wl_inohash[0]) 2267 <= wl->wl_inohashmask); 2268 wi = LIST_FIRST(wih++); 2269 } 2270 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2271 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2272 wc->wc_inocnt++; 2273 i++; 2274 wi = LIST_NEXT(wi, wi_hash); 2275 } 2276 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2277 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", 2278 wc->wc_len, (intmax_t)off)); 2279 error = wapbl_circ_write(wl, wc, blocklen, &off); 2280 if (error) 2281 return error; 2282 } while (i < wl->wl_inohashcnt); 2283 2284 *offp = off; 2285 return 0; 2286 } 2287 2288 #endif /* _KERNEL */ 2289 2290 /****************************************************************/ 2291 2292 struct wapbl_blk { 2293 LIST_ENTRY(wapbl_blk) wb_hash; 2294 daddr_t wb_blk; 2295 off_t wb_off; /* Offset of this block in the log */ 2296 }; 2297 #define WAPBL_BLKPOOL_MIN 83 2298 2299 static void 2300 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) 2301 { 2302 if (size < WAPBL_BLKPOOL_MIN) 2303 size = WAPBL_BLKPOOL_MIN; 2304 KASSERT(wr->wr_blkhash == 0); 2305 #ifdef _KERNEL 2306 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); 2307 #else /* ! _KERNEL */ 2308 /* Manually implement hashinit */ 2309 { 2310 unsigned long i, hashsize; 2311 for (hashsize = 1; hashsize < size; hashsize <<= 1) 2312 continue; 2313 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash)); 2314 for (i = 0; i < hashsize; i++) 2315 LIST_INIT(&wr->wr_blkhash[i]); 2316 wr->wr_blkhashmask = hashsize - 1; 2317 } 2318 #endif /* ! _KERNEL */ 2319 } 2320 2321 static void 2322 wapbl_blkhash_free(struct wapbl_replay *wr) 2323 { 2324 KASSERT(wr->wr_blkhashcnt == 0); 2325 #ifdef _KERNEL 2326 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); 2327 #else /* ! _KERNEL */ 2328 wapbl_free(wr->wr_blkhash, 2329 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash)); 2330 #endif /* ! _KERNEL */ 2331 } 2332 2333 static struct wapbl_blk * 2334 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) 2335 { 2336 struct wapbl_blk_head *wbh; 2337 struct wapbl_blk *wb; 2338 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2339 LIST_FOREACH(wb, wbh, wb_hash) { 2340 if (blk == wb->wb_blk) 2341 return wb; 2342 } 2343 return 0; 2344 } 2345 2346 static void 2347 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) 2348 { 2349 struct wapbl_blk_head *wbh; 2350 struct wapbl_blk *wb; 2351 wb = wapbl_blkhash_get(wr, blk); 2352 if (wb) { 2353 KASSERT(wb->wb_blk == blk); 2354 wb->wb_off = off; 2355 } else { 2356 wb = wapbl_alloc(sizeof(*wb)); 2357 wb->wb_blk = blk; 2358 wb->wb_off = off; 2359 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2360 LIST_INSERT_HEAD(wbh, wb, wb_hash); 2361 wr->wr_blkhashcnt++; 2362 } 2363 } 2364 2365 static void 2366 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) 2367 { 2368 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2369 if (wb) { 2370 KASSERT(wr->wr_blkhashcnt > 0); 2371 wr->wr_blkhashcnt--; 2372 LIST_REMOVE(wb, wb_hash); 2373 wapbl_free(wb, sizeof(*wb)); 2374 } 2375 } 2376 2377 static void 2378 wapbl_blkhash_clear(struct wapbl_replay *wr) 2379 { 2380 unsigned long i; 2381 for (i = 0; i <= wr->wr_blkhashmask; i++) { 2382 struct wapbl_blk *wb; 2383 2384 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { 2385 KASSERT(wr->wr_blkhashcnt > 0); 2386 wr->wr_blkhashcnt--; 2387 LIST_REMOVE(wb, wb_hash); 2388 wapbl_free(wb, sizeof(*wb)); 2389 } 2390 } 2391 KASSERT(wr->wr_blkhashcnt == 0); 2392 } 2393 2394 /****************************************************************/ 2395 2396 static int 2397 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) 2398 { 2399 size_t slen; 2400 off_t off = *offp; 2401 int error; 2402 daddr_t pbn; 2403 2404 KASSERT(((len >> wr->wr_log_dev_bshift) << 2405 wr->wr_log_dev_bshift) == len); 2406 2407 if (off < wr->wr_circ_off) 2408 off = wr->wr_circ_off; 2409 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2410 if (slen < len) { 2411 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2412 #ifdef _KERNEL 2413 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2414 #endif 2415 error = wapbl_read(data, slen, wr->wr_devvp, pbn); 2416 if (error) 2417 return error; 2418 data = (uint8_t *)data + slen; 2419 len -= slen; 2420 off = wr->wr_circ_off; 2421 } 2422 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2423 #ifdef _KERNEL 2424 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2425 #endif 2426 error = wapbl_read(data, len, wr->wr_devvp, pbn); 2427 if (error) 2428 return error; 2429 off += len; 2430 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2431 off = wr->wr_circ_off; 2432 *offp = off; 2433 return 0; 2434 } 2435 2436 static void 2437 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) 2438 { 2439 size_t slen; 2440 off_t off = *offp; 2441 2442 KASSERT(((len >> wr->wr_log_dev_bshift) << 2443 wr->wr_log_dev_bshift) == len); 2444 2445 if (off < wr->wr_circ_off) 2446 off = wr->wr_circ_off; 2447 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2448 if (slen < len) { 2449 len -= slen; 2450 off = wr->wr_circ_off; 2451 } 2452 off += len; 2453 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2454 off = wr->wr_circ_off; 2455 *offp = off; 2456 } 2457 2458 /****************************************************************/ 2459 2460 int 2461 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, 2462 daddr_t off, size_t count, size_t blksize) 2463 { 2464 struct wapbl_replay *wr; 2465 int error; 2466 struct vnode *devvp; 2467 daddr_t logpbn; 2468 uint8_t *scratch; 2469 struct wapbl_wc_header *wch; 2470 struct wapbl_wc_header *wch2; 2471 /* Use this until we read the actual log header */ 2472 int log_dev_bshift = ilog2(blksize); 2473 size_t used; 2474 daddr_t pbn; 2475 2476 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2477 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", 2478 vp, off, count, blksize)); 2479 2480 if (off < 0) 2481 return EINVAL; 2482 2483 if (blksize < DEV_BSIZE) 2484 return EINVAL; 2485 if (blksize % DEV_BSIZE) 2486 return EINVAL; 2487 2488 #ifdef _KERNEL 2489 #if 0 2490 /* XXX vp->v_size isn't reliably set for VBLK devices, 2491 * especially root. However, we might still want to verify 2492 * that the full load is readable */ 2493 if ((off + count) * blksize > vp->v_size) 2494 return EINVAL; 2495 #endif 2496 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { 2497 return error; 2498 } 2499 #else /* ! _KERNEL */ 2500 devvp = vp; 2501 logpbn = off; 2502 #endif /* ! _KERNEL */ 2503 2504 scratch = wapbl_alloc(MAXBSIZE); 2505 2506 pbn = logpbn; 2507 #ifdef _KERNEL 2508 pbn = btodb(pbn << log_dev_bshift); 2509 #endif 2510 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn); 2511 if (error) 2512 goto errout; 2513 2514 wch = (struct wapbl_wc_header *)scratch; 2515 wch2 = 2516 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift)); 2517 /* XXX verify checksums and magic numbers */ 2518 if (wch->wc_type != WAPBL_WC_HEADER) { 2519 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); 2520 error = EFTYPE; 2521 goto errout; 2522 } 2523 2524 if (wch2->wc_generation > wch->wc_generation) 2525 wch = wch2; 2526 2527 wr = wapbl_calloc(1, sizeof(*wr)); 2528 2529 wr->wr_logvp = vp; 2530 wr->wr_devvp = devvp; 2531 wr->wr_logpbn = logpbn; 2532 2533 wr->wr_scratch = scratch; 2534 2535 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift; 2536 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift; 2537 wr->wr_circ_off = wch->wc_circ_off; 2538 wr->wr_circ_size = wch->wc_circ_size; 2539 wr->wr_generation = wch->wc_generation; 2540 2541 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); 2542 2543 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2544 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 2545 " len=%"PRId64" used=%zu\n", 2546 wch->wc_head, wch->wc_tail, wch->wc_circ_off, 2547 wch->wc_circ_size, used)); 2548 2549 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); 2550 2551 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail); 2552 if (error) { 2553 wapbl_replay_stop(wr); 2554 wapbl_replay_free(wr); 2555 return error; 2556 } 2557 2558 *wrp = wr; 2559 return 0; 2560 2561 errout: 2562 wapbl_free(scratch, MAXBSIZE); 2563 return error; 2564 } 2565 2566 void 2567 wapbl_replay_stop(struct wapbl_replay *wr) 2568 { 2569 2570 if (!wapbl_replay_isopen(wr)) 2571 return; 2572 2573 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); 2574 2575 wapbl_free(wr->wr_scratch, MAXBSIZE); 2576 wr->wr_scratch = NULL; 2577 2578 wr->wr_logvp = NULL; 2579 2580 wapbl_blkhash_clear(wr); 2581 wapbl_blkhash_free(wr); 2582 } 2583 2584 void 2585 wapbl_replay_free(struct wapbl_replay *wr) 2586 { 2587 2588 KDASSERT(!wapbl_replay_isopen(wr)); 2589 2590 if (wr->wr_inodes) 2591 wapbl_free(wr->wr_inodes, 2592 wr->wr_inodescnt * sizeof(wr->wr_inodes[0])); 2593 wapbl_free(wr, sizeof(*wr)); 2594 } 2595 2596 #ifdef _KERNEL 2597 int 2598 wapbl_replay_isopen1(struct wapbl_replay *wr) 2599 { 2600 2601 return wapbl_replay_isopen(wr); 2602 } 2603 #endif 2604 2605 static void 2606 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) 2607 { 2608 struct wapbl_wc_blocklist *wc = 2609 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2610 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2611 int i, j, n; 2612 2613 for (i = 0; i < wc->wc_blkcount; i++) { 2614 /* 2615 * Enter each physical block into the hashtable independently. 2616 */ 2617 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2618 for (j = 0; j < n; j++) { 2619 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen), 2620 *offp); 2621 wapbl_circ_advance(wr, fsblklen, offp); 2622 } 2623 } 2624 } 2625 2626 static void 2627 wapbl_replay_process_revocations(struct wapbl_replay *wr) 2628 { 2629 struct wapbl_wc_blocklist *wc = 2630 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2631 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2632 int i, j, n; 2633 2634 for (i = 0; i < wc->wc_blkcount; i++) { 2635 /* 2636 * Remove any blocks found from the hashtable. 2637 */ 2638 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2639 for (j = 0; j < n; j++) 2640 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2641 } 2642 } 2643 2644 static void 2645 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) 2646 { 2647 struct wapbl_wc_inodelist *wc = 2648 (struct wapbl_wc_inodelist *)wr->wr_scratch; 2649 void *new_inodes; 2650 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]); 2651 2652 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0])); 2653 2654 /* 2655 * Keep track of where we found this so location won't be 2656 * overwritten. 2657 */ 2658 if (wc->wc_clear) { 2659 wr->wr_inodestail = oldoff; 2660 wr->wr_inodescnt = 0; 2661 if (wr->wr_inodes != NULL) { 2662 wapbl_free(wr->wr_inodes, oldsize); 2663 wr->wr_inodes = NULL; 2664 } 2665 } 2666 wr->wr_inodeshead = newoff; 2667 if (wc->wc_inocnt == 0) 2668 return; 2669 2670 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) * 2671 sizeof(wr->wr_inodes[0])); 2672 if (wr->wr_inodes != NULL) { 2673 memcpy(new_inodes, wr->wr_inodes, oldsize); 2674 wapbl_free(wr->wr_inodes, oldsize); 2675 } 2676 wr->wr_inodes = new_inodes; 2677 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, 2678 wc->wc_inocnt * sizeof(wr->wr_inodes[0])); 2679 wr->wr_inodescnt += wc->wc_inocnt; 2680 } 2681 2682 static int 2683 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) 2684 { 2685 off_t off; 2686 int error; 2687 2688 int logblklen = 1 << wr->wr_log_dev_bshift; 2689 2690 wapbl_blkhash_clear(wr); 2691 2692 off = tail; 2693 while (off != head) { 2694 struct wapbl_wc_null *wcn; 2695 off_t saveoff = off; 2696 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2697 if (error) 2698 goto errout; 2699 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2700 switch (wcn->wc_type) { 2701 case WAPBL_WC_BLOCKS: 2702 wapbl_replay_process_blocks(wr, &off); 2703 break; 2704 2705 case WAPBL_WC_REVOCATIONS: 2706 wapbl_replay_process_revocations(wr); 2707 break; 2708 2709 case WAPBL_WC_INODES: 2710 wapbl_replay_process_inodes(wr, saveoff, off); 2711 break; 2712 2713 default: 2714 printf("Unrecognized wapbl type: 0x%08x\n", 2715 wcn->wc_type); 2716 error = EFTYPE; 2717 goto errout; 2718 } 2719 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2720 if (off != saveoff) { 2721 printf("wapbl_replay: corrupted records\n"); 2722 error = EFTYPE; 2723 goto errout; 2724 } 2725 } 2726 return 0; 2727 2728 errout: 2729 wapbl_blkhash_clear(wr); 2730 return error; 2731 } 2732 2733 #if 0 2734 int 2735 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) 2736 { 2737 off_t off; 2738 int mismatchcnt = 0; 2739 int logblklen = 1 << wr->wr_log_dev_bshift; 2740 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2741 void *scratch1 = wapbl_alloc(MAXBSIZE); 2742 void *scratch2 = wapbl_alloc(MAXBSIZE); 2743 int error = 0; 2744 2745 KDASSERT(wapbl_replay_isopen(wr)); 2746 2747 off = wch->wc_tail; 2748 while (off != wch->wc_head) { 2749 struct wapbl_wc_null *wcn; 2750 #ifdef DEBUG 2751 off_t saveoff = off; 2752 #endif 2753 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2754 if (error) 2755 goto out; 2756 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2757 switch (wcn->wc_type) { 2758 case WAPBL_WC_BLOCKS: 2759 { 2760 struct wapbl_wc_blocklist *wc = 2761 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2762 int i; 2763 for (i = 0; i < wc->wc_blkcount; i++) { 2764 int foundcnt = 0; 2765 int dirtycnt = 0; 2766 int j, n; 2767 /* 2768 * Check each physical block into the 2769 * hashtable independently 2770 */ 2771 n = wc->wc_blocks[i].wc_dlen >> 2772 wch->wc_fs_dev_bshift; 2773 for (j = 0; j < n; j++) { 2774 struct wapbl_blk *wb = 2775 wapbl_blkhash_get(wr, 2776 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2777 if (wb && (wb->wb_off == off)) { 2778 foundcnt++; 2779 error = 2780 wapbl_circ_read(wr, 2781 scratch1, fsblklen, 2782 &off); 2783 if (error) 2784 goto out; 2785 error = 2786 wapbl_read(scratch2, 2787 fsblklen, fsdevvp, 2788 wb->wb_blk); 2789 if (error) 2790 goto out; 2791 if (memcmp(scratch1, 2792 scratch2, 2793 fsblklen)) { 2794 printf( 2795 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", 2796 wb->wb_blk, (intmax_t)off); 2797 dirtycnt++; 2798 mismatchcnt++; 2799 } 2800 } else { 2801 wapbl_circ_advance(wr, 2802 fsblklen, &off); 2803 } 2804 } 2805 #if 0 2806 /* 2807 * If all of the blocks in an entry 2808 * are clean, then remove all of its 2809 * blocks from the hashtable since they 2810 * never will need replay. 2811 */ 2812 if ((foundcnt != 0) && 2813 (dirtycnt == 0)) { 2814 off = saveoff; 2815 wapbl_circ_advance(wr, 2816 logblklen, &off); 2817 for (j = 0; j < n; j++) { 2818 struct wapbl_blk *wb = 2819 wapbl_blkhash_get(wr, 2820 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2821 if (wb && 2822 (wb->wb_off == off)) { 2823 wapbl_blkhash_rem(wr, wb->wb_blk); 2824 } 2825 wapbl_circ_advance(wr, 2826 fsblklen, &off); 2827 } 2828 } 2829 #endif 2830 } 2831 } 2832 break; 2833 case WAPBL_WC_REVOCATIONS: 2834 case WAPBL_WC_INODES: 2835 break; 2836 default: 2837 KASSERT(0); 2838 } 2839 #ifdef DEBUG 2840 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2841 KASSERT(off == saveoff); 2842 #endif 2843 } 2844 out: 2845 wapbl_free(scratch1, MAXBSIZE); 2846 wapbl_free(scratch2, MAXBSIZE); 2847 if (!error && mismatchcnt) 2848 error = EFTYPE; 2849 return error; 2850 } 2851 #endif 2852 2853 int 2854 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) 2855 { 2856 struct wapbl_blk *wb; 2857 size_t i; 2858 off_t off; 2859 void *scratch; 2860 int error = 0; 2861 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2862 2863 KDASSERT(wapbl_replay_isopen(wr)); 2864 2865 scratch = wapbl_alloc(MAXBSIZE); 2866 2867 for (i = 0; i <= wr->wr_blkhashmask; ++i) { 2868 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { 2869 off = wb->wb_off; 2870 error = wapbl_circ_read(wr, scratch, fsblklen, &off); 2871 if (error) 2872 break; 2873 error = wapbl_write(scratch, fsblklen, fsdevvp, 2874 wb->wb_blk); 2875 if (error) 2876 break; 2877 } 2878 } 2879 2880 wapbl_free(scratch, MAXBSIZE); 2881 return error; 2882 } 2883 2884 int 2885 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) 2886 { 2887 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2888 2889 KDASSERT(wapbl_replay_isopen(wr)); 2890 KASSERT((len % fsblklen) == 0); 2891 2892 while (len != 0) { 2893 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2894 if (wb) 2895 return 1; 2896 len -= fsblklen; 2897 } 2898 return 0; 2899 } 2900 2901 int 2902 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) 2903 { 2904 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2905 2906 KDASSERT(wapbl_replay_isopen(wr)); 2907 2908 KASSERT((len % fsblklen) == 0); 2909 2910 while (len != 0) { 2911 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2912 if (wb) { 2913 off_t off = wb->wb_off; 2914 int error; 2915 error = wapbl_circ_read(wr, data, fsblklen, &off); 2916 if (error) 2917 return error; 2918 } 2919 data = (uint8_t *)data + fsblklen; 2920 len -= fsblklen; 2921 blk++; 2922 } 2923 return 0; 2924 } 2925 2926 #ifdef _KERNEL 2927 /* 2928 * This is not really a module now, but maybe on it's way to 2929 * being one some day. 2930 */ 2931 MODULE(MODULE_CLASS_VFS, wapbl, NULL); 2932 2933 static int 2934 wapbl_modcmd(modcmd_t cmd, void *arg) 2935 { 2936 2937 switch (cmd) { 2938 case MODULE_CMD_INIT: 2939 wapbl_init(); 2940 return 0; 2941 case MODULE_CMD_FINI: 2942 #ifdef notyet 2943 return wapbl_fini(true); 2944 #endif 2945 return EOPNOTSUPP; 2946 default: 2947 return ENOTTY; 2948 } 2949 } 2950 #endif /* _KERNEL */ 2951