1 /* $NetBSD: vfs_wapbl.c,v 1.43 2011/02/20 11:21:34 nakayama Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * This implements file system independent write ahead filesystem logging. 34 */ 35 36 #define WAPBL_INTERNAL 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.43 2011/02/20 11:21:34 nakayama Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/bitops.h> 43 44 #ifdef _KERNEL 45 #include <sys/param.h> 46 #include <sys/namei.h> 47 #include <sys/proc.h> 48 #include <sys/sysctl.h> 49 #include <sys/uio.h> 50 #include <sys/vnode.h> 51 #include <sys/file.h> 52 #include <sys/malloc.h> 53 #include <sys/module.h> 54 #include <sys/resourcevar.h> 55 #include <sys/conf.h> 56 #include <sys/mount.h> 57 #include <sys/kernel.h> 58 #include <sys/kauth.h> 59 #include <sys/mutex.h> 60 #include <sys/atomic.h> 61 #include <sys/wapbl.h> 62 #include <sys/wapbl_replay.h> 63 64 #include <miscfs/specfs/specdev.h> 65 66 #if 0 /* notyet */ 67 #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP) 68 #define wapbl_free(a, s) kmem_free((a), (s)) 69 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 70 #else 71 MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); 72 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) 73 #define wapbl_free(a, s) free((a), M_WAPBL) 74 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) 75 #endif 76 77 static struct sysctllog *wapbl_sysctl; 78 static int wapbl_flush_disk_cache = 1; 79 static int wapbl_verbose_commit = 0; 80 81 #else /* !_KERNEL */ 82 #include <assert.h> 83 #include <errno.h> 84 #include <stdio.h> 85 #include <stdbool.h> 86 #include <stdlib.h> 87 #include <string.h> 88 89 #include <sys/time.h> 90 #include <sys/wapbl.h> 91 #include <sys/wapbl_replay.h> 92 93 #define KDASSERT(x) assert(x) 94 #define KASSERT(x) assert(x) 95 #define wapbl_malloc(s) malloc(s) 96 #define wapbl_free(a, s) free(a) 97 #define wapbl_calloc(n, s) calloc((n), (s)) 98 99 #endif /* !_KERNEL */ 100 101 /* 102 * INTERNAL DATA STRUCTURES 103 */ 104 105 /* 106 * This structure holds per-mount log information. 107 * 108 * Legend: a = atomic access only 109 * r = read-only after init 110 * l = rwlock held 111 * m = mutex held 112 * lm = rwlock held writing or mutex held 113 * u = unlocked access ok 114 * b = bufcache_lock held 115 */ 116 struct wapbl { 117 struct vnode *wl_logvp; /* r: log here */ 118 struct vnode *wl_devvp; /* r: log on this device */ 119 struct mount *wl_mount; /* r: mountpoint wl is associated with */ 120 daddr_t wl_logpbn; /* r: Physical block number of start of log */ 121 int wl_log_dev_bshift; /* r: logarithm of device block size of log 122 device */ 123 int wl_fs_dev_bshift; /* r: logarithm of device block size of 124 filesystem device */ 125 126 unsigned wl_lock_count; /* m: Count of transactions in progress */ 127 128 size_t wl_circ_size; /* r: Number of bytes in buffer of log */ 129 size_t wl_circ_off; /* r: Number of bytes reserved at start */ 130 131 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ 132 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ 133 134 off_t wl_head; /* l: Byte offset of log head */ 135 off_t wl_tail; /* l: Byte offset of log tail */ 136 /* 137 * head == tail == 0 means log is empty 138 * head == tail != 0 means log is full 139 * see assertions in wapbl_advance() for other boundary conditions. 140 * only truncate moves the tail, except when flush sets it to 141 * wl_header_size only flush moves the head, except when truncate 142 * sets it to 0. 143 */ 144 145 struct wapbl_wc_header *wl_wc_header; /* l */ 146 void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */ 147 148 kmutex_t wl_mtx; /* u: short-term lock */ 149 krwlock_t wl_rwlock; /* u: File system transaction lock */ 150 151 /* 152 * Must be held while accessing 153 * wl_count or wl_bufs or head or tail 154 */ 155 156 /* 157 * Callback called from within the flush routine to flush any extra 158 * bits. Note that flush may be skipped without calling this if 159 * there are no outstanding buffers in the transaction. 160 */ 161 #if _KERNEL 162 wapbl_flush_fn_t wl_flush; /* r */ 163 wapbl_flush_fn_t wl_flush_abort;/* r */ 164 #endif 165 166 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ 167 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ 168 size_t wl_bcount; /* m: Total bcount of wl_bufs */ 169 170 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ 171 172 kcondvar_t wl_reclaimable_cv; /* m (obviously) */ 173 size_t wl_reclaimable_bytes; /* m: Amount of space available for 174 reclamation by truncate */ 175 int wl_error_count; /* m: # of wl_entries with errors */ 176 size_t wl_reserved_bytes; /* never truncate log smaller than this */ 177 178 #ifdef WAPBL_DEBUG_BUFBYTES 179 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ 180 #endif 181 182 daddr_t *wl_deallocblks;/* lm: address of block */ 183 int *wl_dealloclens; /* lm: size of block */ 184 int wl_dealloccnt; /* lm: total count */ 185 int wl_dealloclim; /* l: max count */ 186 187 /* hashtable of inode numbers for allocated but unlinked inodes */ 188 /* synch ??? */ 189 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; 190 u_long wl_inohashmask; 191 int wl_inohashcnt; 192 193 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction 194 accounting */ 195 }; 196 197 #ifdef WAPBL_DEBUG_PRINT 198 int wapbl_debug_print = WAPBL_DEBUG_PRINT; 199 #endif 200 201 /****************************************************************/ 202 #ifdef _KERNEL 203 204 #ifdef WAPBL_DEBUG 205 struct wapbl *wapbl_debug_wl; 206 #endif 207 208 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); 209 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); 210 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); 211 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); 212 #endif /* _KERNEL */ 213 214 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t); 215 216 static inline size_t wapbl_space_free(size_t avail, off_t head, 217 off_t tail); 218 static inline size_t wapbl_space_used(size_t avail, off_t head, 219 off_t tail); 220 221 #ifdef _KERNEL 222 223 #define WAPBL_INODETRK_SIZE 83 224 static int wapbl_ino_pool_refcount; 225 static struct pool wapbl_ino_pool; 226 struct wapbl_ino { 227 LIST_ENTRY(wapbl_ino) wi_hash; 228 ino_t wi_ino; 229 mode_t wi_mode; 230 }; 231 232 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); 233 static void wapbl_inodetrk_free(struct wapbl *wl); 234 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); 235 236 static size_t wapbl_transaction_len(struct wapbl *wl); 237 static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); 238 239 #if 0 240 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); 241 #endif 242 243 static int wapbl_replay_isopen1(struct wapbl_replay *); 244 245 /* 246 * This is useful for debugging. If set, the log will 247 * only be truncated when necessary. 248 */ 249 int wapbl_lazy_truncate = 0; 250 251 struct wapbl_ops wapbl_ops = { 252 .wo_wapbl_discard = wapbl_discard, 253 .wo_wapbl_replay_isopen = wapbl_replay_isopen1, 254 .wo_wapbl_replay_can_read = wapbl_replay_can_read, 255 .wo_wapbl_replay_read = wapbl_replay_read, 256 .wo_wapbl_add_buf = wapbl_add_buf, 257 .wo_wapbl_remove_buf = wapbl_remove_buf, 258 .wo_wapbl_resize_buf = wapbl_resize_buf, 259 .wo_wapbl_begin = wapbl_begin, 260 .wo_wapbl_end = wapbl_end, 261 .wo_wapbl_junlock_assert= wapbl_junlock_assert, 262 263 /* XXX: the following is only used to say "this is a wapbl buf" */ 264 .wo_wapbl_biodone = wapbl_biodone, 265 }; 266 267 static int 268 wapbl_sysctl_init(void) 269 { 270 int rv; 271 const struct sysctlnode *rnode, *cnode; 272 273 wapbl_sysctl = NULL; 274 275 rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode, 276 CTLFLAG_PERMANENT, 277 CTLTYPE_NODE, "vfs", NULL, 278 NULL, 0, NULL, 0, 279 CTL_VFS, CTL_EOL); 280 if (rv) 281 return rv; 282 283 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &rnode, 284 CTLFLAG_PERMANENT, 285 CTLTYPE_NODE, "wapbl", 286 SYSCTL_DESCR("WAPBL journaling options"), 287 NULL, 0, NULL, 0, 288 CTL_CREATE, CTL_EOL); 289 if (rv) 290 return rv; 291 292 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 293 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 294 CTLTYPE_INT, "flush_disk_cache", 295 SYSCTL_DESCR("flush disk cache"), 296 NULL, 0, &wapbl_flush_disk_cache, 0, 297 CTL_CREATE, CTL_EOL); 298 if (rv) 299 return rv; 300 301 rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, 302 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 303 CTLTYPE_INT, "verbose_commit", 304 SYSCTL_DESCR("show time and size of wapbl log commits"), 305 NULL, 0, &wapbl_verbose_commit, 0, 306 CTL_CREATE, CTL_EOL); 307 return rv; 308 } 309 310 static void 311 wapbl_init(void) 312 { 313 malloc_type_attach(M_WAPBL); 314 wapbl_sysctl_init(); 315 } 316 317 #ifdef notyet 318 static int 319 wapbl_fini(bool interface) 320 { 321 if (aio_sysctl != NULL) 322 sysctl_teardown(&aio_sysctl); 323 return 0; 324 } 325 #endif 326 327 static int 328 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) 329 { 330 int error, i; 331 332 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 333 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); 334 335 /* 336 * Its only valid to reuse the replay log if its 337 * the same as the new log we just opened. 338 */ 339 KDASSERT(!wapbl_replay_isopen(wr)); 340 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev); 341 KASSERT(wl->wl_logpbn == wr->wr_logpbn); 342 KASSERT(wl->wl_circ_size == wr->wr_circ_size); 343 KASSERT(wl->wl_circ_off == wr->wr_circ_off); 344 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift); 345 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift); 346 347 wl->wl_wc_header->wc_generation = wr->wr_generation + 1; 348 349 for (i = 0; i < wr->wr_inodescnt; i++) 350 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, 351 wr->wr_inodes[i].wr_imode); 352 353 /* Make sure new transaction won't overwrite old inodes list */ 354 KDASSERT(wapbl_transaction_len(wl) <= 355 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, 356 wr->wr_inodestail)); 357 358 wl->wl_head = wl->wl_tail = wr->wr_inodeshead; 359 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = 360 wapbl_transaction_len(wl); 361 362 error = wapbl_write_inodes(wl, &wl->wl_head); 363 if (error) 364 return error; 365 366 KASSERT(wl->wl_head != wl->wl_tail); 367 KASSERT(wl->wl_head != 0); 368 369 return 0; 370 } 371 372 int 373 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, 374 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, 375 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) 376 { 377 struct wapbl *wl; 378 struct vnode *devvp; 379 daddr_t logpbn; 380 int error; 381 int log_dev_bshift = ilog2(blksize); 382 int fs_dev_bshift = log_dev_bshift; 383 int run; 384 385 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 386 " count=%zu blksize=%zu\n", vp, off, count, blksize)); 387 388 if (log_dev_bshift > fs_dev_bshift) { 389 WAPBL_PRINTF(WAPBL_PRINT_OPEN, 390 ("wapbl: log device's block size cannot be larger " 391 "than filesystem's\n")); 392 /* 393 * Not currently implemented, although it could be if 394 * needed someday. 395 */ 396 return ENOSYS; 397 } 398 399 if (off < 0) 400 return EINVAL; 401 402 if (blksize < DEV_BSIZE) 403 return EINVAL; 404 if (blksize % DEV_BSIZE) 405 return EINVAL; 406 407 /* XXXTODO: verify that the full load is writable */ 408 409 /* 410 * XXX check for minimum log size 411 * minimum is governed by minimum amount of space 412 * to complete a transaction. (probably truncate) 413 */ 414 /* XXX for now pick something minimal */ 415 if ((count * blksize) < MAXPHYS) { 416 return ENOSPC; 417 } 418 419 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { 420 return error; 421 } 422 423 wl = wapbl_calloc(1, sizeof(*wl)); 424 rw_init(&wl->wl_rwlock); 425 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); 426 cv_init(&wl->wl_reclaimable_cv, "wapblrec"); 427 LIST_INIT(&wl->wl_bufs); 428 SIMPLEQ_INIT(&wl->wl_entries); 429 430 wl->wl_logvp = vp; 431 wl->wl_devvp = devvp; 432 wl->wl_mount = mp; 433 wl->wl_logpbn = logpbn; 434 wl->wl_log_dev_bshift = log_dev_bshift; 435 wl->wl_fs_dev_bshift = fs_dev_bshift; 436 437 wl->wl_flush = flushfn; 438 wl->wl_flush_abort = flushabortfn; 439 440 /* Reserve two log device blocks for the commit headers */ 441 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; 442 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); 443 /* truncate the log usage to a multiple of log_dev_bshift */ 444 wl->wl_circ_size >>= wl->wl_log_dev_bshift; 445 wl->wl_circ_size <<= wl->wl_log_dev_bshift; 446 447 /* 448 * wl_bufbytes_max limits the size of the in memory transaction space. 449 * - Since buffers are allocated and accounted for in units of 450 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE 451 * (i.e. 1<<PAGE_SHIFT) 452 * - Since the log device has to be written in units of 453 * 1<<wl_log_dev_bshift it is required to be a mulitple of 454 * 1<<wl_log_dev_bshift. 455 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift, 456 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift. 457 * Therefore it must be multiple of the least common multiple of those 458 * three quantities. Fortunately, all of those quantities are 459 * guaranteed to be a power of two, and the least common multiple of 460 * a set of numbers which are all powers of two is simply the maximum 461 * of those numbers. Finally, the maximum logarithm of a power of two 462 * is the same as the log of the maximum power of two. So we can do 463 * the following operations to size wl_bufbytes_max: 464 */ 465 466 /* XXX fix actual number of pages reserved per filesystem. */ 467 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); 468 469 /* Round wl_bufbytes_max to the largest power of two constraint */ 470 wl->wl_bufbytes_max >>= PAGE_SHIFT; 471 wl->wl_bufbytes_max <<= PAGE_SHIFT; 472 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; 473 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; 474 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; 475 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; 476 477 /* XXX maybe use filesystem fragment size instead of 1024 */ 478 /* XXX fix actual number of buffers reserved per filesystem. */ 479 wl->wl_bufcount_max = (nbuf / 2) * 1024; 480 481 /* XXX tie this into resource estimation */ 482 wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2; 483 484 wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) * 485 wl->wl_dealloclim); 486 wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) * 487 wl->wl_dealloclim); 488 489 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); 490 491 /* Initialize the commit header */ 492 { 493 struct wapbl_wc_header *wc; 494 size_t len = 1 << wl->wl_log_dev_bshift; 495 wc = wapbl_calloc(1, len); 496 wc->wc_type = WAPBL_WC_HEADER; 497 wc->wc_len = len; 498 wc->wc_circ_off = wl->wl_circ_off; 499 wc->wc_circ_size = wl->wl_circ_size; 500 /* XXX wc->wc_fsid */ 501 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; 502 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; 503 wl->wl_wc_header = wc; 504 wl->wl_wc_scratch = wapbl_malloc(len); 505 } 506 507 /* 508 * if there was an existing set of unlinked but 509 * allocated inodes, preserve it in the new 510 * log. 511 */ 512 if (wr && wr->wr_inodescnt) { 513 error = wapbl_start_flush_inodes(wl, wr); 514 if (error) 515 goto errout; 516 } 517 518 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 519 if (error) { 520 goto errout; 521 } 522 523 *wlp = wl; 524 #if defined(WAPBL_DEBUG) 525 wapbl_debug_wl = wl; 526 #endif 527 528 return 0; 529 errout: 530 wapbl_discard(wl); 531 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 532 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 533 wapbl_free(wl->wl_deallocblks, 534 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 535 wapbl_free(wl->wl_dealloclens, 536 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 537 wapbl_inodetrk_free(wl); 538 wapbl_free(wl, sizeof(*wl)); 539 540 return error; 541 } 542 543 /* 544 * Like wapbl_flush, only discards the transaction 545 * completely 546 */ 547 548 void 549 wapbl_discard(struct wapbl *wl) 550 { 551 struct wapbl_entry *we; 552 struct buf *bp; 553 int i; 554 555 /* 556 * XXX we may consider using upgrade here 557 * if we want to call flush from inside a transaction 558 */ 559 rw_enter(&wl->wl_rwlock, RW_WRITER); 560 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 561 wl->wl_dealloccnt); 562 563 #ifdef WAPBL_DEBUG_PRINT 564 { 565 pid_t pid = -1; 566 lwpid_t lid = -1; 567 if (curproc) 568 pid = curproc->p_pid; 569 if (curlwp) 570 lid = curlwp->l_lid; 571 #ifdef WAPBL_DEBUG_BUFBYTES 572 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 573 ("wapbl_discard: thread %d.%d discarding " 574 "transaction\n" 575 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 576 "deallocs=%d inodes=%d\n" 577 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 578 "unsynced=%zu\n", 579 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 580 wl->wl_bcount, wl->wl_dealloccnt, 581 wl->wl_inohashcnt, wl->wl_error_count, 582 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 583 wl->wl_unsynced_bufbytes)); 584 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 585 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 586 ("\tentry: bufcount = %zu, reclaimable = %zu, " 587 "error = %d, unsynced = %zu\n", 588 we->we_bufcount, we->we_reclaimable_bytes, 589 we->we_error, we->we_unsynced_bufbytes)); 590 } 591 #else /* !WAPBL_DEBUG_BUFBYTES */ 592 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 593 ("wapbl_discard: thread %d.%d discarding transaction\n" 594 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 595 "deallocs=%d inodes=%d\n" 596 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 597 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 598 wl->wl_bcount, wl->wl_dealloccnt, 599 wl->wl_inohashcnt, wl->wl_error_count, 600 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 601 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 602 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 603 ("\tentry: bufcount = %zu, reclaimable = %zu, " 604 "error = %d\n", 605 we->we_bufcount, we->we_reclaimable_bytes, 606 we->we_error)); 607 } 608 #endif /* !WAPBL_DEBUG_BUFBYTES */ 609 } 610 #endif /* WAPBL_DEBUG_PRINT */ 611 612 for (i = 0; i <= wl->wl_inohashmask; i++) { 613 struct wapbl_ino_head *wih; 614 struct wapbl_ino *wi; 615 616 wih = &wl->wl_inohash[i]; 617 while ((wi = LIST_FIRST(wih)) != NULL) { 618 LIST_REMOVE(wi, wi_hash); 619 pool_put(&wapbl_ino_pool, wi); 620 KASSERT(wl->wl_inohashcnt > 0); 621 wl->wl_inohashcnt--; 622 } 623 } 624 625 /* 626 * clean buffer list 627 */ 628 mutex_enter(&bufcache_lock); 629 mutex_enter(&wl->wl_mtx); 630 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 631 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 632 /* 633 * The buffer will be unlocked and 634 * removed from the transaction in brelse 635 */ 636 mutex_exit(&wl->wl_mtx); 637 brelsel(bp, 0); 638 mutex_enter(&wl->wl_mtx); 639 } 640 } 641 mutex_exit(&wl->wl_mtx); 642 mutex_exit(&bufcache_lock); 643 644 /* 645 * Remove references to this wl from wl_entries, free any which 646 * no longer have buffers, others will be freed in wapbl_biodone 647 * when they no longer have any buffers. 648 */ 649 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 650 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 651 /* XXX should we be accumulating wl_error_count 652 * and increasing reclaimable bytes ? */ 653 we->we_wapbl = NULL; 654 if (we->we_bufcount == 0) { 655 #ifdef WAPBL_DEBUG_BUFBYTES 656 KASSERT(we->we_unsynced_bufbytes == 0); 657 #endif 658 wapbl_free(we, sizeof(*we)); 659 } 660 } 661 662 /* Discard list of deallocs */ 663 wl->wl_dealloccnt = 0; 664 /* XXX should we clear wl_reserved_bytes? */ 665 666 KASSERT(wl->wl_bufbytes == 0); 667 KASSERT(wl->wl_bcount == 0); 668 KASSERT(wl->wl_bufcount == 0); 669 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 670 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 671 KASSERT(wl->wl_inohashcnt == 0); 672 673 rw_exit(&wl->wl_rwlock); 674 } 675 676 int 677 wapbl_stop(struct wapbl *wl, int force) 678 { 679 struct vnode *vp; 680 int error; 681 682 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 683 error = wapbl_flush(wl, 1); 684 if (error) { 685 if (force) 686 wapbl_discard(wl); 687 else 688 return error; 689 } 690 691 /* Unlinked inodes persist after a flush */ 692 if (wl->wl_inohashcnt) { 693 if (force) { 694 wapbl_discard(wl); 695 } else { 696 return EBUSY; 697 } 698 } 699 700 KASSERT(wl->wl_bufbytes == 0); 701 KASSERT(wl->wl_bcount == 0); 702 KASSERT(wl->wl_bufcount == 0); 703 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 704 KASSERT(wl->wl_dealloccnt == 0); 705 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 706 KASSERT(wl->wl_inohashcnt == 0); 707 708 vp = wl->wl_logvp; 709 710 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 711 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 712 wapbl_free(wl->wl_deallocblks, 713 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 714 wapbl_free(wl->wl_dealloclens, 715 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 716 wapbl_inodetrk_free(wl); 717 718 cv_destroy(&wl->wl_reclaimable_cv); 719 mutex_destroy(&wl->wl_mtx); 720 rw_destroy(&wl->wl_rwlock); 721 wapbl_free(wl, sizeof(*wl)); 722 723 return 0; 724 } 725 726 static int 727 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) 728 { 729 struct pstats *pstats = curlwp->l_proc->p_stats; 730 struct buf *bp; 731 int error; 732 733 KASSERT((flags & ~(B_WRITE | B_READ)) == 0); 734 KASSERT(devvp->v_type == VBLK); 735 736 if ((flags & (B_WRITE | B_READ)) == B_WRITE) { 737 mutex_enter(&devvp->v_interlock); 738 devvp->v_numoutput++; 739 mutex_exit(&devvp->v_interlock); 740 pstats->p_ru.ru_oublock++; 741 } else { 742 pstats->p_ru.ru_inblock++; 743 } 744 745 bp = getiobuf(devvp, true); 746 bp->b_flags = flags; 747 bp->b_cflags = BC_BUSY; /* silly & dubious */ 748 bp->b_dev = devvp->v_rdev; 749 bp->b_data = data; 750 bp->b_bufsize = bp->b_resid = bp->b_bcount = len; 751 bp->b_blkno = pbn; 752 753 WAPBL_PRINTF(WAPBL_PRINT_IO, 754 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n", 755 BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, 756 bp->b_blkno, bp->b_dev)); 757 758 VOP_STRATEGY(devvp, bp); 759 760 error = biowait(bp); 761 putiobuf(bp); 762 763 if (error) { 764 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 765 ("wapbl_doio: %s %zu bytes at block %" PRId64 766 " on dev 0x%"PRIx64" failed with error %d\n", 767 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 768 "write" : "read"), 769 len, pbn, devvp->v_rdev, error)); 770 } 771 772 return error; 773 } 774 775 int 776 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 777 { 778 779 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 780 } 781 782 int 783 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 784 { 785 786 return wapbl_doio(data, len, devvp, pbn, B_READ); 787 } 788 789 /* 790 * Off is byte offset returns new offset for next write 791 * handles log wraparound 792 */ 793 static int 794 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) 795 { 796 size_t slen; 797 off_t off = *offp; 798 int error; 799 daddr_t pbn; 800 801 KDASSERT(((len >> wl->wl_log_dev_bshift) << 802 wl->wl_log_dev_bshift) == len); 803 804 if (off < wl->wl_circ_off) 805 off = wl->wl_circ_off; 806 slen = wl->wl_circ_off + wl->wl_circ_size - off; 807 if (slen < len) { 808 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 809 #ifdef _KERNEL 810 pbn = btodb(pbn << wl->wl_log_dev_bshift); 811 #endif 812 error = wapbl_write(data, slen, wl->wl_devvp, pbn); 813 if (error) 814 return error; 815 data = (uint8_t *)data + slen; 816 len -= slen; 817 off = wl->wl_circ_off; 818 } 819 pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift); 820 #ifdef _KERNEL 821 pbn = btodb(pbn << wl->wl_log_dev_bshift); 822 #endif 823 error = wapbl_write(data, len, wl->wl_devvp, pbn); 824 if (error) 825 return error; 826 off += len; 827 if (off >= wl->wl_circ_off + wl->wl_circ_size) 828 off = wl->wl_circ_off; 829 *offp = off; 830 return 0; 831 } 832 833 /****************************************************************/ 834 835 int 836 wapbl_begin(struct wapbl *wl, const char *file, int line) 837 { 838 int doflush; 839 unsigned lockcount; 840 841 KDASSERT(wl); 842 843 /* 844 * XXX this needs to be made much more sophisticated. 845 * perhaps each wapbl_begin could reserve a specified 846 * number of buffers and bytes. 847 */ 848 mutex_enter(&wl->wl_mtx); 849 lockcount = wl->wl_lock_count; 850 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > 851 wl->wl_bufbytes_max / 2) || 852 ((wl->wl_bufcount + (lockcount * 10)) > 853 wl->wl_bufcount_max / 2) || 854 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) || 855 (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2)); 856 mutex_exit(&wl->wl_mtx); 857 858 if (doflush) { 859 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 860 ("force flush lockcnt=%d bufbytes=%zu " 861 "(max=%zu) bufcount=%zu (max=%zu) " 862 "dealloccnt %d (lim=%d)\n", 863 lockcount, wl->wl_bufbytes, 864 wl->wl_bufbytes_max, wl->wl_bufcount, 865 wl->wl_bufcount_max, 866 wl->wl_dealloccnt, wl->wl_dealloclim)); 867 } 868 869 if (doflush) { 870 int error = wapbl_flush(wl, 0); 871 if (error) 872 return error; 873 } 874 875 rw_enter(&wl->wl_rwlock, RW_READER); 876 mutex_enter(&wl->wl_mtx); 877 wl->wl_lock_count++; 878 mutex_exit(&wl->wl_mtx); 879 880 #if defined(WAPBL_DEBUG_PRINT) 881 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 882 ("wapbl_begin thread %d.%d with bufcount=%zu " 883 "bufbytes=%zu bcount=%zu at %s:%d\n", 884 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 885 wl->wl_bufbytes, wl->wl_bcount, file, line)); 886 #endif 887 888 return 0; 889 } 890 891 void 892 wapbl_end(struct wapbl *wl) 893 { 894 895 #if defined(WAPBL_DEBUG_PRINT) 896 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, 897 ("wapbl_end thread %d.%d with bufcount=%zu " 898 "bufbytes=%zu bcount=%zu\n", 899 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 900 wl->wl_bufbytes, wl->wl_bcount)); 901 #endif 902 903 #ifdef DIAGNOSTIC 904 size_t flushsize = wapbl_transaction_len(wl); 905 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 906 /* 907 * XXX this could be handled more gracefully, perhaps place 908 * only a partial transaction in the log and allow the 909 * remaining to flush without the protection of the journal. 910 */ 911 panic("wapbl_end: current transaction too big to flush\n"); 912 } 913 #endif 914 915 mutex_enter(&wl->wl_mtx); 916 KASSERT(wl->wl_lock_count > 0); 917 wl->wl_lock_count--; 918 mutex_exit(&wl->wl_mtx); 919 920 rw_exit(&wl->wl_rwlock); 921 } 922 923 void 924 wapbl_add_buf(struct wapbl *wl, struct buf * bp) 925 { 926 927 KASSERT(bp->b_cflags & BC_BUSY); 928 KASSERT(bp->b_vp); 929 930 wapbl_jlock_assert(wl); 931 932 #if 0 933 /* 934 * XXX this might be an issue for swapfiles. 935 * see uvm_swap.c:1702 936 * 937 * XXX2 why require it then? leap of semantics? 938 */ 939 KASSERT((bp->b_cflags & BC_NOCACHE) == 0); 940 #endif 941 942 mutex_enter(&wl->wl_mtx); 943 if (bp->b_flags & B_LOCKED) { 944 LIST_REMOVE(bp, b_wapbllist); 945 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, 946 ("wapbl_add_buf thread %d.%d re-adding buf %p " 947 "with %d bytes %d bcount\n", 948 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 949 bp->b_bcount)); 950 } else { 951 /* unlocked by dirty buffers shouldn't exist */ 952 KASSERT(!(bp->b_oflags & BO_DELWRI)); 953 wl->wl_bufbytes += bp->b_bufsize; 954 wl->wl_bcount += bp->b_bcount; 955 wl->wl_bufcount++; 956 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 957 ("wapbl_add_buf thread %d.%d adding buf %p " 958 "with %d bytes %d bcount\n", 959 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, 960 bp->b_bcount)); 961 } 962 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); 963 mutex_exit(&wl->wl_mtx); 964 965 bp->b_flags |= B_LOCKED; 966 } 967 968 static void 969 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) 970 { 971 972 KASSERT(mutex_owned(&wl->wl_mtx)); 973 KASSERT(bp->b_cflags & BC_BUSY); 974 wapbl_jlock_assert(wl); 975 976 #if 0 977 /* 978 * XXX this might be an issue for swapfiles. 979 * see uvm_swap.c:1725 980 * 981 * XXXdeux: see above 982 */ 983 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 984 #endif 985 KASSERT(bp->b_flags & B_LOCKED); 986 987 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 988 ("wapbl_remove_buf thread %d.%d removing buf %p with " 989 "%d bytes %d bcount\n", 990 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 991 992 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 993 wl->wl_bufbytes -= bp->b_bufsize; 994 KASSERT(wl->wl_bcount >= bp->b_bcount); 995 wl->wl_bcount -= bp->b_bcount; 996 KASSERT(wl->wl_bufcount > 0); 997 wl->wl_bufcount--; 998 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 999 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1000 LIST_REMOVE(bp, b_wapbllist); 1001 1002 bp->b_flags &= ~B_LOCKED; 1003 } 1004 1005 /* called from brelsel() in vfs_bio among other places */ 1006 void 1007 wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 1008 { 1009 1010 mutex_enter(&wl->wl_mtx); 1011 wapbl_remove_buf_locked(wl, bp); 1012 mutex_exit(&wl->wl_mtx); 1013 } 1014 1015 void 1016 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 1017 { 1018 1019 KASSERT(bp->b_cflags & BC_BUSY); 1020 1021 /* 1022 * XXX: why does this depend on B_LOCKED? otherwise the buf 1023 * is not for a transaction? if so, why is this called in the 1024 * first place? 1025 */ 1026 if (bp->b_flags & B_LOCKED) { 1027 mutex_enter(&wl->wl_mtx); 1028 wl->wl_bufbytes += bp->b_bufsize - oldsz; 1029 wl->wl_bcount += bp->b_bcount - oldcnt; 1030 mutex_exit(&wl->wl_mtx); 1031 } 1032 } 1033 1034 #endif /* _KERNEL */ 1035 1036 /****************************************************************/ 1037 /* Some utility inlines */ 1038 1039 /* This is used to advance the pointer at old to new value at old+delta */ 1040 static inline off_t 1041 wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 1042 { 1043 off_t new; 1044 1045 /* Define acceptable ranges for inputs. */ 1046 KASSERT(delta <= size); 1047 KASSERT((old == 0) || (old >= off)); 1048 KASSERT(old < (size + off)); 1049 1050 if ((old == 0) && (delta != 0)) 1051 new = off + delta; 1052 else if ((old + delta) < (size + off)) 1053 new = old + delta; 1054 else 1055 new = (old + delta) - size; 1056 1057 /* Note some interesting axioms */ 1058 KASSERT((delta != 0) || (new == old)); 1059 KASSERT((delta == 0) || (new != 0)); 1060 KASSERT((delta != (size)) || (new == old)); 1061 1062 /* Define acceptable ranges for output. */ 1063 KASSERT((new == 0) || (new >= off)); 1064 KASSERT(new < (size + off)); 1065 return new; 1066 } 1067 1068 static inline size_t 1069 wapbl_space_used(size_t avail, off_t head, off_t tail) 1070 { 1071 1072 if (tail == 0) { 1073 KASSERT(head == 0); 1074 return 0; 1075 } 1076 return ((head + (avail - 1) - tail) % avail) + 1; 1077 } 1078 1079 static inline size_t 1080 wapbl_space_free(size_t avail, off_t head, off_t tail) 1081 { 1082 1083 return avail - wapbl_space_used(avail, head, tail); 1084 } 1085 1086 static inline void 1087 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, 1088 off_t *tailp) 1089 { 1090 off_t head = *headp; 1091 off_t tail = *tailp; 1092 1093 KASSERT(delta <= wapbl_space_free(size, head, tail)); 1094 head = wapbl_advance(size, off, head, delta); 1095 if ((tail == 0) && (head != 0)) 1096 tail = off; 1097 *headp = head; 1098 *tailp = tail; 1099 } 1100 1101 static inline void 1102 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, 1103 off_t *tailp) 1104 { 1105 off_t head = *headp; 1106 off_t tail = *tailp; 1107 1108 KASSERT(delta <= wapbl_space_used(size, head, tail)); 1109 tail = wapbl_advance(size, off, tail, delta); 1110 if (head == tail) { 1111 head = tail = 0; 1112 } 1113 *headp = head; 1114 *tailp = tail; 1115 } 1116 1117 #ifdef _KERNEL 1118 1119 /****************************************************************/ 1120 1121 /* 1122 * Remove transactions whose buffers are completely flushed to disk. 1123 * Will block until at least minfree space is available. 1124 * only intended to be called from inside wapbl_flush and therefore 1125 * does not protect against commit races with itself or with flush. 1126 */ 1127 static int 1128 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) 1129 { 1130 size_t delta; 1131 size_t avail; 1132 off_t head; 1133 off_t tail; 1134 int error = 0; 1135 1136 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); 1137 KASSERT(rw_write_held(&wl->wl_rwlock)); 1138 1139 mutex_enter(&wl->wl_mtx); 1140 1141 /* 1142 * First check to see if we have to do a commit 1143 * at all. 1144 */ 1145 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); 1146 if (minfree < avail) { 1147 mutex_exit(&wl->wl_mtx); 1148 return 0; 1149 } 1150 minfree -= avail; 1151 while ((wl->wl_error_count == 0) && 1152 (wl->wl_reclaimable_bytes < minfree)) { 1153 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1154 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " 1155 "minfree=%zd\n", 1156 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, 1157 minfree)); 1158 1159 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); 1160 } 1161 if (wl->wl_reclaimable_bytes < minfree) { 1162 KASSERT(wl->wl_error_count); 1163 /* XXX maybe get actual error from buffer instead someday? */ 1164 error = EIO; 1165 } 1166 head = wl->wl_head; 1167 tail = wl->wl_tail; 1168 delta = wl->wl_reclaimable_bytes; 1169 1170 /* If all of of the entries are flushed, then be sure to keep 1171 * the reserved bytes reserved. Watch out for discarded transactions, 1172 * which could leave more bytes reserved than are reclaimable. 1173 */ 1174 if (SIMPLEQ_EMPTY(&wl->wl_entries) && 1175 (delta >= wl->wl_reserved_bytes)) { 1176 delta -= wl->wl_reserved_bytes; 1177 } 1178 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, 1179 &tail); 1180 KDASSERT(wl->wl_reserved_bytes <= 1181 wapbl_space_used(wl->wl_circ_size, head, tail)); 1182 mutex_exit(&wl->wl_mtx); 1183 1184 if (error) 1185 return error; 1186 1187 if (waitonly) 1188 return 0; 1189 1190 /* 1191 * This is where head, tail and delta are unprotected 1192 * from races against itself or flush. This is ok since 1193 * we only call this routine from inside flush itself. 1194 * 1195 * XXX: how can it race against itself when accessed only 1196 * from behind the write-locked rwlock? 1197 */ 1198 error = wapbl_write_commit(wl, head, tail); 1199 if (error) 1200 return error; 1201 1202 wl->wl_head = head; 1203 wl->wl_tail = tail; 1204 1205 mutex_enter(&wl->wl_mtx); 1206 KASSERT(wl->wl_reclaimable_bytes >= delta); 1207 wl->wl_reclaimable_bytes -= delta; 1208 mutex_exit(&wl->wl_mtx); 1209 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, 1210 ("wapbl_truncate thread %d.%d truncating %zu bytes\n", 1211 curproc->p_pid, curlwp->l_lid, delta)); 1212 1213 return 0; 1214 } 1215 1216 /****************************************************************/ 1217 1218 void 1219 wapbl_biodone(struct buf *bp) 1220 { 1221 struct wapbl_entry *we = bp->b_private; 1222 struct wapbl *wl = we->we_wapbl; 1223 1224 /* 1225 * Handle possible flushing of buffers after log has been 1226 * decomissioned. 1227 */ 1228 if (!wl) { 1229 KASSERT(we->we_bufcount > 0); 1230 we->we_bufcount--; 1231 #ifdef WAPBL_DEBUG_BUFBYTES 1232 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1233 we->we_unsynced_bufbytes -= bp->b_bufsize; 1234 #endif 1235 1236 if (we->we_bufcount == 0) { 1237 #ifdef WAPBL_DEBUG_BUFBYTES 1238 KASSERT(we->we_unsynced_bufbytes == 0); 1239 #endif 1240 wapbl_free(we, sizeof(*we)); 1241 } 1242 1243 brelse(bp, 0); 1244 return; 1245 } 1246 1247 #ifdef ohbother 1248 KDASSERT(bp->b_flags & B_DONE); 1249 KDASSERT(!(bp->b_flags & B_DELWRI)); 1250 KDASSERT(bp->b_flags & B_ASYNC); 1251 KDASSERT(bp->b_flags & B_BUSY); 1252 KDASSERT(!(bp->b_flags & B_LOCKED)); 1253 KDASSERT(!(bp->b_flags & B_READ)); 1254 KDASSERT(!(bp->b_flags & B_INVAL)); 1255 KDASSERT(!(bp->b_flags & B_NOCACHE)); 1256 #endif 1257 1258 if (bp->b_error) { 1259 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */ 1260 /* 1261 * XXXpooka: interfaces not fully updated 1262 * Note: this was not enabled in the original patch 1263 * against netbsd4 either. I don't know if comment 1264 * above is true or not. 1265 */ 1266 1267 /* 1268 * If an error occurs, report the error and leave the 1269 * buffer as a delayed write on the LRU queue. 1270 * restarting the write would likely result in 1271 * an error spinloop, so let it be done harmlessly 1272 * by the syncer. 1273 */ 1274 bp->b_flags &= ~(B_DONE); 1275 simple_unlock(&bp->b_interlock); 1276 1277 if (we->we_error == 0) { 1278 mutex_enter(&wl->wl_mtx); 1279 wl->wl_error_count++; 1280 mutex_exit(&wl->wl_mtx); 1281 cv_broadcast(&wl->wl_reclaimable_cv); 1282 } 1283 we->we_error = bp->b_error; 1284 bp->b_error = 0; 1285 brelse(bp); 1286 return; 1287 #else 1288 /* For now, just mark the log permanently errored out */ 1289 1290 mutex_enter(&wl->wl_mtx); 1291 if (wl->wl_error_count == 0) { 1292 wl->wl_error_count++; 1293 cv_broadcast(&wl->wl_reclaimable_cv); 1294 } 1295 mutex_exit(&wl->wl_mtx); 1296 #endif 1297 } 1298 1299 mutex_enter(&wl->wl_mtx); 1300 1301 KASSERT(we->we_bufcount > 0); 1302 we->we_bufcount--; 1303 #ifdef WAPBL_DEBUG_BUFBYTES 1304 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1305 we->we_unsynced_bufbytes -= bp->b_bufsize; 1306 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); 1307 wl->wl_unsynced_bufbytes -= bp->b_bufsize; 1308 #endif 1309 1310 /* 1311 * If the current transaction can be reclaimed, start 1312 * at the beginning and reclaim any consecutive reclaimable 1313 * transactions. If we successfully reclaim anything, 1314 * then wakeup anyone waiting for the reclaim. 1315 */ 1316 if (we->we_bufcount == 0) { 1317 size_t delta = 0; 1318 int errcnt = 0; 1319 #ifdef WAPBL_DEBUG_BUFBYTES 1320 KDASSERT(we->we_unsynced_bufbytes == 0); 1321 #endif 1322 /* 1323 * clear any posted error, since the buffer it came from 1324 * has successfully flushed by now 1325 */ 1326 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1327 (we->we_bufcount == 0)) { 1328 delta += we->we_reclaimable_bytes; 1329 if (we->we_error) 1330 errcnt++; 1331 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1332 wapbl_free(we, sizeof(*we)); 1333 } 1334 1335 if (delta) { 1336 wl->wl_reclaimable_bytes += delta; 1337 KASSERT(wl->wl_error_count >= errcnt); 1338 wl->wl_error_count -= errcnt; 1339 cv_broadcast(&wl->wl_reclaimable_cv); 1340 } 1341 } 1342 1343 mutex_exit(&wl->wl_mtx); 1344 brelse(bp, 0); 1345 } 1346 1347 /* 1348 * Write transactions to disk + start I/O for contents 1349 */ 1350 int 1351 wapbl_flush(struct wapbl *wl, int waitfor) 1352 { 1353 struct buf *bp; 1354 struct wapbl_entry *we; 1355 off_t off; 1356 off_t head; 1357 off_t tail; 1358 size_t delta = 0; 1359 size_t flushsize; 1360 size_t reserved; 1361 int error = 0; 1362 1363 /* 1364 * Do a quick check to see if a full flush can be skipped 1365 * This assumes that the flush callback does not need to be called 1366 * unless there are other outstanding bufs. 1367 */ 1368 if (!waitfor) { 1369 size_t nbufs; 1370 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1371 protect the KASSERTS */ 1372 nbufs = wl->wl_bufcount; 1373 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1374 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1375 mutex_exit(&wl->wl_mtx); 1376 if (nbufs == 0) 1377 return 0; 1378 } 1379 1380 /* 1381 * XXX we may consider using LK_UPGRADE here 1382 * if we want to call flush from inside a transaction 1383 */ 1384 rw_enter(&wl->wl_rwlock, RW_WRITER); 1385 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1386 wl->wl_dealloccnt); 1387 1388 /* 1389 * Now that we are fully locked and flushed, 1390 * do another check for nothing to do. 1391 */ 1392 if (wl->wl_bufcount == 0) { 1393 goto out; 1394 } 1395 1396 #if 0 1397 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1398 ("wapbl_flush thread %d.%d flushing entries with " 1399 "bufcount=%zu bufbytes=%zu\n", 1400 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1401 wl->wl_bufbytes)); 1402 #endif 1403 1404 /* Calculate amount of space needed to flush */ 1405 flushsize = wapbl_transaction_len(wl); 1406 if (wapbl_verbose_commit) { 1407 struct timespec ts; 1408 getnanotime(&ts); 1409 printf("%s: %lld.%09ld this transaction = %zu bytes\n", 1410 __func__, (long long)ts.tv_sec, 1411 (long)ts.tv_nsec, flushsize); 1412 } 1413 1414 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1415 /* 1416 * XXX this could be handled more gracefully, perhaps place 1417 * only a partial transaction in the log and allow the 1418 * remaining to flush without the protection of the journal. 1419 */ 1420 panic("wapbl_flush: current transaction too big to flush\n"); 1421 } 1422 1423 error = wapbl_truncate(wl, flushsize, 0); 1424 if (error) 1425 goto out2; 1426 1427 off = wl->wl_head; 1428 KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 1429 (off < wl->wl_circ_off + wl->wl_circ_size))); 1430 error = wapbl_write_blocks(wl, &off); 1431 if (error) 1432 goto out2; 1433 error = wapbl_write_revocations(wl, &off); 1434 if (error) 1435 goto out2; 1436 error = wapbl_write_inodes(wl, &off); 1437 if (error) 1438 goto out2; 1439 1440 reserved = 0; 1441 if (wl->wl_inohashcnt) 1442 reserved = wapbl_transaction_inodes_len(wl); 1443 1444 head = wl->wl_head; 1445 tail = wl->wl_tail; 1446 1447 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1448 &head, &tail); 1449 #ifdef WAPBL_DEBUG 1450 if (head != off) { 1451 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1452 " off=%"PRIdMAX" flush=%zu\n", 1453 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1454 flushsize); 1455 } 1456 #else 1457 KASSERT(head == off); 1458 #endif 1459 1460 /* Opportunistically move the tail forward if we can */ 1461 if (!wapbl_lazy_truncate) { 1462 mutex_enter(&wl->wl_mtx); 1463 delta = wl->wl_reclaimable_bytes; 1464 mutex_exit(&wl->wl_mtx); 1465 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1466 &head, &tail); 1467 } 1468 1469 error = wapbl_write_commit(wl, head, tail); 1470 if (error) 1471 goto out2; 1472 1473 we = wapbl_calloc(1, sizeof(*we)); 1474 1475 #ifdef WAPBL_DEBUG_BUFBYTES 1476 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1477 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1478 " unsynced=%zu" 1479 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1480 "inodes=%d\n", 1481 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1482 wapbl_space_used(wl->wl_circ_size, head, tail), 1483 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1484 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1485 wl->wl_inohashcnt)); 1486 #else 1487 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1488 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1489 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1490 "inodes=%d\n", 1491 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1492 wapbl_space_used(wl->wl_circ_size, head, tail), 1493 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1494 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1495 #endif 1496 1497 1498 mutex_enter(&bufcache_lock); 1499 mutex_enter(&wl->wl_mtx); 1500 1501 wl->wl_reserved_bytes = reserved; 1502 wl->wl_head = head; 1503 wl->wl_tail = tail; 1504 KASSERT(wl->wl_reclaimable_bytes >= delta); 1505 wl->wl_reclaimable_bytes -= delta; 1506 wl->wl_dealloccnt = 0; 1507 #ifdef WAPBL_DEBUG_BUFBYTES 1508 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1509 #endif 1510 1511 we->we_wapbl = wl; 1512 we->we_bufcount = wl->wl_bufcount; 1513 #ifdef WAPBL_DEBUG_BUFBYTES 1514 we->we_unsynced_bufbytes = wl->wl_bufbytes; 1515 #endif 1516 we->we_reclaimable_bytes = flushsize; 1517 we->we_error = 0; 1518 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); 1519 1520 /* 1521 * this flushes bufs in reverse order than they were queued 1522 * it shouldn't matter, but if we care we could use TAILQ instead. 1523 * XXX Note they will get put on the lru queue when they flush 1524 * so we might actually want to change this to preserve order. 1525 */ 1526 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 1527 if (bbusy(bp, 0, 0, &wl->wl_mtx)) { 1528 continue; 1529 } 1530 bp->b_iodone = wapbl_biodone; 1531 bp->b_private = we; 1532 bremfree(bp); 1533 wapbl_remove_buf_locked(wl, bp); 1534 mutex_exit(&wl->wl_mtx); 1535 mutex_exit(&bufcache_lock); 1536 bawrite(bp); 1537 mutex_enter(&bufcache_lock); 1538 mutex_enter(&wl->wl_mtx); 1539 } 1540 mutex_exit(&wl->wl_mtx); 1541 mutex_exit(&bufcache_lock); 1542 1543 #if 0 1544 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1545 ("wapbl_flush thread %d.%d done flushing entries...\n", 1546 curproc->p_pid, curlwp->l_lid)); 1547 #endif 1548 1549 out: 1550 1551 /* 1552 * If the waitfor flag is set, don't return until everything is 1553 * fully flushed and the on disk log is empty. 1554 */ 1555 if (waitfor) { 1556 error = wapbl_truncate(wl, wl->wl_circ_size - 1557 wl->wl_reserved_bytes, wapbl_lazy_truncate); 1558 } 1559 1560 out2: 1561 if (error) { 1562 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, 1563 wl->wl_dealloclens, wl->wl_dealloccnt); 1564 } 1565 1566 #ifdef WAPBL_DEBUG_PRINT 1567 if (error) { 1568 pid_t pid = -1; 1569 lwpid_t lid = -1; 1570 if (curproc) 1571 pid = curproc->p_pid; 1572 if (curlwp) 1573 lid = curlwp->l_lid; 1574 mutex_enter(&wl->wl_mtx); 1575 #ifdef WAPBL_DEBUG_BUFBYTES 1576 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1577 ("wapbl_flush: thread %d.%d aborted flush: " 1578 "error = %d\n" 1579 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1580 "deallocs=%d inodes=%d\n" 1581 "\terrcnt = %d, reclaimable=%zu reserved=%zu " 1582 "unsynced=%zu\n", 1583 pid, lid, error, wl->wl_bufcount, 1584 wl->wl_bufbytes, wl->wl_bcount, 1585 wl->wl_dealloccnt, wl->wl_inohashcnt, 1586 wl->wl_error_count, wl->wl_reclaimable_bytes, 1587 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); 1588 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1589 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1590 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1591 "error = %d, unsynced = %zu\n", 1592 we->we_bufcount, we->we_reclaimable_bytes, 1593 we->we_error, we->we_unsynced_bufbytes)); 1594 } 1595 #else 1596 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1597 ("wapbl_flush: thread %d.%d aborted flush: " 1598 "error = %d\n" 1599 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 1600 "deallocs=%d inodes=%d\n" 1601 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", 1602 pid, lid, error, wl->wl_bufcount, 1603 wl->wl_bufbytes, wl->wl_bcount, 1604 wl->wl_dealloccnt, wl->wl_inohashcnt, 1605 wl->wl_error_count, wl->wl_reclaimable_bytes, 1606 wl->wl_reserved_bytes)); 1607 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1608 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1609 ("\tentry: bufcount = %zu, reclaimable = %zu, " 1610 "error = %d\n", we->we_bufcount, 1611 we->we_reclaimable_bytes, we->we_error)); 1612 } 1613 #endif 1614 mutex_exit(&wl->wl_mtx); 1615 } 1616 #endif 1617 1618 rw_exit(&wl->wl_rwlock); 1619 return error; 1620 } 1621 1622 /****************************************************************/ 1623 1624 void 1625 wapbl_jlock_assert(struct wapbl *wl) 1626 { 1627 1628 KASSERT(rw_lock_held(&wl->wl_rwlock)); 1629 } 1630 1631 void 1632 wapbl_junlock_assert(struct wapbl *wl) 1633 { 1634 1635 KASSERT(!rw_write_held(&wl->wl_rwlock)); 1636 } 1637 1638 /****************************************************************/ 1639 1640 /* locks missing */ 1641 void 1642 wapbl_print(struct wapbl *wl, 1643 int full, 1644 void (*pr)(const char *, ...)) 1645 { 1646 struct buf *bp; 1647 struct wapbl_entry *we; 1648 (*pr)("wapbl %p", wl); 1649 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", 1650 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); 1651 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", 1652 wl->wl_circ_size, wl->wl_circ_off, 1653 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); 1654 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", 1655 wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); 1656 #ifdef WAPBL_DEBUG_BUFBYTES 1657 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1658 "reserved = %zu errcnt = %d unsynced = %zu\n", 1659 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1660 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1661 wl->wl_error_count, wl->wl_unsynced_bufbytes); 1662 #else 1663 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " 1664 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, 1665 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 1666 wl->wl_error_count); 1667 #endif 1668 (*pr)("\tdealloccnt = %d, dealloclim = %d\n", 1669 wl->wl_dealloccnt, wl->wl_dealloclim); 1670 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", 1671 wl->wl_inohashcnt, wl->wl_inohashmask); 1672 (*pr)("entries:\n"); 1673 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 1674 #ifdef WAPBL_DEBUG_BUFBYTES 1675 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " 1676 "unsynced = %zu\n", 1677 we->we_bufcount, we->we_reclaimable_bytes, 1678 we->we_error, we->we_unsynced_bufbytes); 1679 #else 1680 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", 1681 we->we_bufcount, we->we_reclaimable_bytes, we->we_error); 1682 #endif 1683 } 1684 if (full) { 1685 int cnt = 0; 1686 (*pr)("bufs ="); 1687 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { 1688 if (!LIST_NEXT(bp, b_wapbllist)) { 1689 (*pr)(" %p", bp); 1690 } else if ((++cnt % 6) == 0) { 1691 (*pr)(" %p,\n\t", bp); 1692 } else { 1693 (*pr)(" %p,", bp); 1694 } 1695 } 1696 (*pr)("\n"); 1697 1698 (*pr)("dealloced blks = "); 1699 { 1700 int i; 1701 cnt = 0; 1702 for (i = 0; i < wl->wl_dealloccnt; i++) { 1703 (*pr)(" %"PRId64":%d,", 1704 wl->wl_deallocblks[i], 1705 wl->wl_dealloclens[i]); 1706 if ((++cnt % 4) == 0) { 1707 (*pr)("\n\t"); 1708 } 1709 } 1710 } 1711 (*pr)("\n"); 1712 1713 (*pr)("registered inodes = "); 1714 { 1715 int i; 1716 cnt = 0; 1717 for (i = 0; i <= wl->wl_inohashmask; i++) { 1718 struct wapbl_ino_head *wih; 1719 struct wapbl_ino *wi; 1720 1721 wih = &wl->wl_inohash[i]; 1722 LIST_FOREACH(wi, wih, wi_hash) { 1723 if (wi->wi_ino == 0) 1724 continue; 1725 (*pr)(" %"PRId32"/0%06"PRIo32",", 1726 wi->wi_ino, wi->wi_mode); 1727 if ((++cnt % 4) == 0) { 1728 (*pr)("\n\t"); 1729 } 1730 } 1731 } 1732 (*pr)("\n"); 1733 } 1734 } 1735 } 1736 1737 #if defined(WAPBL_DEBUG) || defined(DDB) 1738 void 1739 wapbl_dump(struct wapbl *wl) 1740 { 1741 #if defined(WAPBL_DEBUG) 1742 if (!wl) 1743 wl = wapbl_debug_wl; 1744 #endif 1745 if (!wl) 1746 return; 1747 wapbl_print(wl, 1, printf); 1748 } 1749 #endif 1750 1751 /****************************************************************/ 1752 1753 void 1754 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) 1755 { 1756 1757 wapbl_jlock_assert(wl); 1758 1759 mutex_enter(&wl->wl_mtx); 1760 /* XXX should eventually instead tie this into resource estimation */ 1761 /* 1762 * XXX this panic needs locking/mutex analysis and the 1763 * ability to cope with the failure. 1764 */ 1765 /* XXX this XXX doesn't have enough XXX */ 1766 if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) 1767 panic("wapbl_register_deallocation: out of resources"); 1768 1769 wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 1770 wl->wl_dealloclens[wl->wl_dealloccnt] = len; 1771 wl->wl_dealloccnt++; 1772 WAPBL_PRINTF(WAPBL_PRINT_ALLOC, 1773 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); 1774 mutex_exit(&wl->wl_mtx); 1775 } 1776 1777 /****************************************************************/ 1778 1779 static void 1780 wapbl_inodetrk_init(struct wapbl *wl, u_int size) 1781 { 1782 1783 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); 1784 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { 1785 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, 1786 "wapblinopl", &pool_allocator_nointr, IPL_NONE); 1787 } 1788 } 1789 1790 static void 1791 wapbl_inodetrk_free(struct wapbl *wl) 1792 { 1793 1794 /* XXX this KASSERT needs locking/mutex analysis */ 1795 KASSERT(wl->wl_inohashcnt == 0); 1796 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); 1797 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { 1798 pool_destroy(&wapbl_ino_pool); 1799 } 1800 } 1801 1802 static struct wapbl_ino * 1803 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) 1804 { 1805 struct wapbl_ino_head *wih; 1806 struct wapbl_ino *wi; 1807 1808 KASSERT(mutex_owned(&wl->wl_mtx)); 1809 1810 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1811 LIST_FOREACH(wi, wih, wi_hash) { 1812 if (ino == wi->wi_ino) 1813 return wi; 1814 } 1815 return 0; 1816 } 1817 1818 void 1819 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1820 { 1821 struct wapbl_ino_head *wih; 1822 struct wapbl_ino *wi; 1823 1824 wi = pool_get(&wapbl_ino_pool, PR_WAITOK); 1825 1826 mutex_enter(&wl->wl_mtx); 1827 if (wapbl_inodetrk_get(wl, ino) == NULL) { 1828 wi->wi_ino = ino; 1829 wi->wi_mode = mode; 1830 wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; 1831 LIST_INSERT_HEAD(wih, wi, wi_hash); 1832 wl->wl_inohashcnt++; 1833 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1834 ("wapbl_register_inode: ino=%"PRId64"\n", ino)); 1835 mutex_exit(&wl->wl_mtx); 1836 } else { 1837 mutex_exit(&wl->wl_mtx); 1838 pool_put(&wapbl_ino_pool, wi); 1839 } 1840 } 1841 1842 void 1843 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) 1844 { 1845 struct wapbl_ino *wi; 1846 1847 mutex_enter(&wl->wl_mtx); 1848 wi = wapbl_inodetrk_get(wl, ino); 1849 if (wi) { 1850 WAPBL_PRINTF(WAPBL_PRINT_INODE, 1851 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); 1852 KASSERT(wl->wl_inohashcnt > 0); 1853 wl->wl_inohashcnt--; 1854 LIST_REMOVE(wi, wi_hash); 1855 mutex_exit(&wl->wl_mtx); 1856 1857 pool_put(&wapbl_ino_pool, wi); 1858 } else { 1859 mutex_exit(&wl->wl_mtx); 1860 } 1861 } 1862 1863 /****************************************************************/ 1864 1865 static inline size_t 1866 wapbl_transaction_inodes_len(struct wapbl *wl) 1867 { 1868 int blocklen = 1<<wl->wl_log_dev_bshift; 1869 int iph; 1870 1871 /* Calculate number of inodes described in a inodelist header */ 1872 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 1873 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 1874 1875 KASSERT(iph > 0); 1876 1877 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen; 1878 } 1879 1880 1881 /* Calculate amount of space a transaction will take on disk */ 1882 static size_t 1883 wapbl_transaction_len(struct wapbl *wl) 1884 { 1885 int blocklen = 1<<wl->wl_log_dev_bshift; 1886 size_t len; 1887 int bph; 1888 1889 /* Calculate number of blocks described in a blocklist header */ 1890 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1891 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1892 1893 KASSERT(bph > 0); 1894 1895 len = wl->wl_bcount; 1896 len += howmany(wl->wl_bufcount, bph) * blocklen; 1897 len += howmany(wl->wl_dealloccnt, bph) * blocklen; 1898 len += wapbl_transaction_inodes_len(wl); 1899 1900 return len; 1901 } 1902 1903 /* 1904 * Perform commit operation 1905 * 1906 * Note that generation number incrementation needs to 1907 * be protected against racing with other invocations 1908 * of wapbl_commit. This is ok since this routine 1909 * is only invoked from wapbl_flush 1910 */ 1911 static int 1912 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) 1913 { 1914 struct wapbl_wc_header *wc = wl->wl_wc_header; 1915 struct timespec ts; 1916 int error; 1917 int force = 1; 1918 daddr_t pbn; 1919 1920 if (wapbl_flush_disk_cache) { 1921 /* XXX Calc checksum here, instead we do this for now */ 1922 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 1923 FWRITE, FSCRED); 1924 if (error) { 1925 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1926 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1927 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1928 } 1929 } 1930 1931 wc->wc_head = head; 1932 wc->wc_tail = tail; 1933 wc->wc_checksum = 0; 1934 wc->wc_version = 1; 1935 getnanotime(&ts); 1936 wc->wc_time = ts.tv_sec; 1937 wc->wc_timensec = ts.tv_nsec; 1938 1939 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 1940 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", 1941 (intmax_t)head, (intmax_t)tail)); 1942 1943 /* 1944 * XXX if generation will rollover, then first zero 1945 * over second commit header before trying to write both headers. 1946 */ 1947 1948 pbn = wl->wl_logpbn + (wc->wc_generation % 2); 1949 #ifdef _KERNEL 1950 pbn = btodb(pbn << wc->wc_log_dev_bshift); 1951 #endif 1952 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn); 1953 if (error) 1954 return error; 1955 1956 if (wapbl_flush_disk_cache) { 1957 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, 1958 FWRITE, FSCRED); 1959 if (error) { 1960 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 1961 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " 1962 "returned %d\n", wl->wl_devvp->v_rdev, error)); 1963 } 1964 } 1965 1966 /* 1967 * If the generation number was zero, write it out a second time. 1968 * This handles initialization and generation number rollover 1969 */ 1970 if (wc->wc_generation++ == 0) { 1971 error = wapbl_write_commit(wl, head, tail); 1972 /* 1973 * This panic should be able to be removed if we do the 1974 * zero'ing mentioned above, and we are certain to roll 1975 * back generation number on failure. 1976 */ 1977 if (error) 1978 panic("wapbl_write_commit: error writing duplicate " 1979 "log header: %d\n", error); 1980 } 1981 return 0; 1982 } 1983 1984 /* Returns new offset value */ 1985 static int 1986 wapbl_write_blocks(struct wapbl *wl, off_t *offp) 1987 { 1988 struct wapbl_wc_blocklist *wc = 1989 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 1990 int blocklen = 1<<wl->wl_log_dev_bshift; 1991 int bph; 1992 struct buf *bp; 1993 off_t off = *offp; 1994 int error; 1995 size_t padding; 1996 1997 KASSERT(rw_write_held(&wl->wl_rwlock)); 1998 1999 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2000 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2001 2002 bp = LIST_FIRST(&wl->wl_bufs); 2003 2004 while (bp) { 2005 int cnt; 2006 struct buf *obp = bp; 2007 2008 KASSERT(bp->b_flags & B_LOCKED); 2009 2010 wc->wc_type = WAPBL_WC_BLOCKS; 2011 wc->wc_len = blocklen; 2012 wc->wc_blkcount = 0; 2013 while (bp && (wc->wc_blkcount < bph)) { 2014 /* 2015 * Make sure all the physical block numbers are up to 2016 * date. If this is not always true on a given 2017 * filesystem, then VOP_BMAP must be called. We 2018 * could call VOP_BMAP here, or else in the filesystem 2019 * specific flush callback, although neither of those 2020 * solutions allow us to take the vnode lock. If a 2021 * filesystem requires that we must take the vnode lock 2022 * to call VOP_BMAP, then we can probably do it in 2023 * bwrite when the vnode lock should already be held 2024 * by the invoking code. 2025 */ 2026 KASSERT((bp->b_vp->v_type == VBLK) || 2027 (bp->b_blkno != bp->b_lblkno)); 2028 KASSERT(bp->b_blkno > 0); 2029 2030 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 2031 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 2032 wc->wc_len += bp->b_bcount; 2033 wc->wc_blkcount++; 2034 bp = LIST_NEXT(bp, b_wapbllist); 2035 } 2036 if (wc->wc_len % blocklen != 0) { 2037 padding = blocklen - wc->wc_len % blocklen; 2038 wc->wc_len += padding; 2039 } else { 2040 padding = 0; 2041 } 2042 2043 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2044 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 2045 wc->wc_len, padding, (intmax_t)off)); 2046 2047 error = wapbl_circ_write(wl, wc, blocklen, &off); 2048 if (error) 2049 return error; 2050 bp = obp; 2051 cnt = 0; 2052 while (bp && (cnt++ < bph)) { 2053 error = wapbl_circ_write(wl, bp->b_data, 2054 bp->b_bcount, &off); 2055 if (error) 2056 return error; 2057 bp = LIST_NEXT(bp, b_wapbllist); 2058 } 2059 if (padding) { 2060 void *zero; 2061 2062 zero = wapbl_malloc(padding); 2063 memset(zero, 0, padding); 2064 error = wapbl_circ_write(wl, zero, padding, &off); 2065 wapbl_free(zero, padding); 2066 if (error) 2067 return error; 2068 } 2069 } 2070 *offp = off; 2071 return 0; 2072 } 2073 2074 static int 2075 wapbl_write_revocations(struct wapbl *wl, off_t *offp) 2076 { 2077 struct wapbl_wc_blocklist *wc = 2078 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 2079 int i; 2080 int blocklen = 1<<wl->wl_log_dev_bshift; 2081 int bph; 2082 off_t off = *offp; 2083 int error; 2084 2085 if (wl->wl_dealloccnt == 0) 2086 return 0; 2087 2088 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 2089 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 2090 2091 i = 0; 2092 while (i < wl->wl_dealloccnt) { 2093 wc->wc_type = WAPBL_WC_REVOCATIONS; 2094 wc->wc_len = blocklen; 2095 wc->wc_blkcount = 0; 2096 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 2097 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 2098 wl->wl_deallocblks[i]; 2099 wc->wc_blocks[wc->wc_blkcount].wc_dlen = 2100 wl->wl_dealloclens[i]; 2101 wc->wc_blkcount++; 2102 i++; 2103 } 2104 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2105 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", 2106 wc->wc_len, (intmax_t)off)); 2107 error = wapbl_circ_write(wl, wc, blocklen, &off); 2108 if (error) 2109 return error; 2110 } 2111 *offp = off; 2112 return 0; 2113 } 2114 2115 static int 2116 wapbl_write_inodes(struct wapbl *wl, off_t *offp) 2117 { 2118 struct wapbl_wc_inodelist *wc = 2119 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; 2120 int i; 2121 int blocklen = 1 << wl->wl_log_dev_bshift; 2122 off_t off = *offp; 2123 int error; 2124 2125 struct wapbl_ino_head *wih; 2126 struct wapbl_ino *wi; 2127 int iph; 2128 2129 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / 2130 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); 2131 2132 i = 0; 2133 wih = &wl->wl_inohash[0]; 2134 wi = 0; 2135 do { 2136 wc->wc_type = WAPBL_WC_INODES; 2137 wc->wc_len = blocklen; 2138 wc->wc_inocnt = 0; 2139 wc->wc_clear = (i == 0); 2140 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { 2141 while (!wi) { 2142 KASSERT((wih - &wl->wl_inohash[0]) 2143 <= wl->wl_inohashmask); 2144 wi = LIST_FIRST(wih++); 2145 } 2146 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; 2147 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; 2148 wc->wc_inocnt++; 2149 i++; 2150 wi = LIST_NEXT(wi, wi_hash); 2151 } 2152 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 2153 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", 2154 wc->wc_len, (intmax_t)off)); 2155 error = wapbl_circ_write(wl, wc, blocklen, &off); 2156 if (error) 2157 return error; 2158 } while (i < wl->wl_inohashcnt); 2159 2160 *offp = off; 2161 return 0; 2162 } 2163 2164 #endif /* _KERNEL */ 2165 2166 /****************************************************************/ 2167 2168 struct wapbl_blk { 2169 LIST_ENTRY(wapbl_blk) wb_hash; 2170 daddr_t wb_blk; 2171 off_t wb_off; /* Offset of this block in the log */ 2172 }; 2173 #define WAPBL_BLKPOOL_MIN 83 2174 2175 static void 2176 wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) 2177 { 2178 if (size < WAPBL_BLKPOOL_MIN) 2179 size = WAPBL_BLKPOOL_MIN; 2180 KASSERT(wr->wr_blkhash == 0); 2181 #ifdef _KERNEL 2182 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); 2183 #else /* ! _KERNEL */ 2184 /* Manually implement hashinit */ 2185 { 2186 unsigned long i, hashsize; 2187 for (hashsize = 1; hashsize < size; hashsize <<= 1) 2188 continue; 2189 wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); 2190 for (i = 0; i < hashsize; i++) 2191 LIST_INIT(&wr->wr_blkhash[i]); 2192 wr->wr_blkhashmask = hashsize - 1; 2193 } 2194 #endif /* ! _KERNEL */ 2195 } 2196 2197 static void 2198 wapbl_blkhash_free(struct wapbl_replay *wr) 2199 { 2200 KASSERT(wr->wr_blkhashcnt == 0); 2201 #ifdef _KERNEL 2202 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); 2203 #else /* ! _KERNEL */ 2204 wapbl_free(wr->wr_blkhash, 2205 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash)); 2206 #endif /* ! _KERNEL */ 2207 } 2208 2209 static struct wapbl_blk * 2210 wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) 2211 { 2212 struct wapbl_blk_head *wbh; 2213 struct wapbl_blk *wb; 2214 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2215 LIST_FOREACH(wb, wbh, wb_hash) { 2216 if (blk == wb->wb_blk) 2217 return wb; 2218 } 2219 return 0; 2220 } 2221 2222 static void 2223 wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) 2224 { 2225 struct wapbl_blk_head *wbh; 2226 struct wapbl_blk *wb; 2227 wb = wapbl_blkhash_get(wr, blk); 2228 if (wb) { 2229 KASSERT(wb->wb_blk == blk); 2230 wb->wb_off = off; 2231 } else { 2232 wb = wapbl_malloc(sizeof(*wb)); 2233 wb->wb_blk = blk; 2234 wb->wb_off = off; 2235 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; 2236 LIST_INSERT_HEAD(wbh, wb, wb_hash); 2237 wr->wr_blkhashcnt++; 2238 } 2239 } 2240 2241 static void 2242 wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) 2243 { 2244 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2245 if (wb) { 2246 KASSERT(wr->wr_blkhashcnt > 0); 2247 wr->wr_blkhashcnt--; 2248 LIST_REMOVE(wb, wb_hash); 2249 wapbl_free(wb, sizeof(*wb)); 2250 } 2251 } 2252 2253 static void 2254 wapbl_blkhash_clear(struct wapbl_replay *wr) 2255 { 2256 unsigned long i; 2257 for (i = 0; i <= wr->wr_blkhashmask; i++) { 2258 struct wapbl_blk *wb; 2259 2260 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { 2261 KASSERT(wr->wr_blkhashcnt > 0); 2262 wr->wr_blkhashcnt--; 2263 LIST_REMOVE(wb, wb_hash); 2264 wapbl_free(wb, sizeof(*wb)); 2265 } 2266 } 2267 KASSERT(wr->wr_blkhashcnt == 0); 2268 } 2269 2270 /****************************************************************/ 2271 2272 static int 2273 wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) 2274 { 2275 size_t slen; 2276 off_t off = *offp; 2277 int error; 2278 daddr_t pbn; 2279 2280 KASSERT(((len >> wr->wr_log_dev_bshift) << 2281 wr->wr_log_dev_bshift) == len); 2282 2283 if (off < wr->wr_circ_off) 2284 off = wr->wr_circ_off; 2285 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2286 if (slen < len) { 2287 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2288 #ifdef _KERNEL 2289 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2290 #endif 2291 error = wapbl_read(data, slen, wr->wr_devvp, pbn); 2292 if (error) 2293 return error; 2294 data = (uint8_t *)data + slen; 2295 len -= slen; 2296 off = wr->wr_circ_off; 2297 } 2298 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift); 2299 #ifdef _KERNEL 2300 pbn = btodb(pbn << wr->wr_log_dev_bshift); 2301 #endif 2302 error = wapbl_read(data, len, wr->wr_devvp, pbn); 2303 if (error) 2304 return error; 2305 off += len; 2306 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2307 off = wr->wr_circ_off; 2308 *offp = off; 2309 return 0; 2310 } 2311 2312 static void 2313 wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) 2314 { 2315 size_t slen; 2316 off_t off = *offp; 2317 2318 KASSERT(((len >> wr->wr_log_dev_bshift) << 2319 wr->wr_log_dev_bshift) == len); 2320 2321 if (off < wr->wr_circ_off) 2322 off = wr->wr_circ_off; 2323 slen = wr->wr_circ_off + wr->wr_circ_size - off; 2324 if (slen < len) { 2325 len -= slen; 2326 off = wr->wr_circ_off; 2327 } 2328 off += len; 2329 if (off >= wr->wr_circ_off + wr->wr_circ_size) 2330 off = wr->wr_circ_off; 2331 *offp = off; 2332 } 2333 2334 /****************************************************************/ 2335 2336 int 2337 wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, 2338 daddr_t off, size_t count, size_t blksize) 2339 { 2340 struct wapbl_replay *wr; 2341 int error; 2342 struct vnode *devvp; 2343 daddr_t logpbn; 2344 uint8_t *scratch; 2345 struct wapbl_wc_header *wch; 2346 struct wapbl_wc_header *wch2; 2347 /* Use this until we read the actual log header */ 2348 int log_dev_bshift = ilog2(blksize); 2349 size_t used; 2350 daddr_t pbn; 2351 2352 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2353 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", 2354 vp, off, count, blksize)); 2355 2356 if (off < 0) 2357 return EINVAL; 2358 2359 if (blksize < DEV_BSIZE) 2360 return EINVAL; 2361 if (blksize % DEV_BSIZE) 2362 return EINVAL; 2363 2364 #ifdef _KERNEL 2365 #if 0 2366 /* XXX vp->v_size isn't reliably set for VBLK devices, 2367 * especially root. However, we might still want to verify 2368 * that the full load is readable */ 2369 if ((off + count) * blksize > vp->v_size) 2370 return EINVAL; 2371 #endif 2372 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { 2373 return error; 2374 } 2375 #else /* ! _KERNEL */ 2376 devvp = vp; 2377 logpbn = off; 2378 #endif /* ! _KERNEL */ 2379 2380 scratch = wapbl_malloc(MAXBSIZE); 2381 2382 pbn = logpbn; 2383 #ifdef _KERNEL 2384 pbn = btodb(pbn << log_dev_bshift); 2385 #endif 2386 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn); 2387 if (error) 2388 goto errout; 2389 2390 wch = (struct wapbl_wc_header *)scratch; 2391 wch2 = 2392 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift)); 2393 /* XXX verify checksums and magic numbers */ 2394 if (wch->wc_type != WAPBL_WC_HEADER) { 2395 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); 2396 error = EFTYPE; 2397 goto errout; 2398 } 2399 2400 if (wch2->wc_generation > wch->wc_generation) 2401 wch = wch2; 2402 2403 wr = wapbl_calloc(1, sizeof(*wr)); 2404 2405 wr->wr_logvp = vp; 2406 wr->wr_devvp = devvp; 2407 wr->wr_logpbn = logpbn; 2408 2409 wr->wr_scratch = scratch; 2410 2411 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift; 2412 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift; 2413 wr->wr_circ_off = wch->wc_circ_off; 2414 wr->wr_circ_size = wch->wc_circ_size; 2415 wr->wr_generation = wch->wc_generation; 2416 2417 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); 2418 2419 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, 2420 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 2421 " len=%"PRId64" used=%zu\n", 2422 wch->wc_head, wch->wc_tail, wch->wc_circ_off, 2423 wch->wc_circ_size, used)); 2424 2425 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); 2426 2427 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail); 2428 if (error) { 2429 wapbl_replay_stop(wr); 2430 wapbl_replay_free(wr); 2431 return error; 2432 } 2433 2434 *wrp = wr; 2435 return 0; 2436 2437 errout: 2438 wapbl_free(scratch, MAXBSIZE); 2439 return error; 2440 } 2441 2442 void 2443 wapbl_replay_stop(struct wapbl_replay *wr) 2444 { 2445 2446 if (!wapbl_replay_isopen(wr)) 2447 return; 2448 2449 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); 2450 2451 wapbl_free(wr->wr_scratch, MAXBSIZE); 2452 wr->wr_scratch = NULL; 2453 2454 wr->wr_logvp = NULL; 2455 2456 wapbl_blkhash_clear(wr); 2457 wapbl_blkhash_free(wr); 2458 } 2459 2460 void 2461 wapbl_replay_free(struct wapbl_replay *wr) 2462 { 2463 2464 KDASSERT(!wapbl_replay_isopen(wr)); 2465 2466 if (wr->wr_inodes) 2467 wapbl_free(wr->wr_inodes, 2468 wr->wr_inodescnt * sizeof(wr->wr_inodes[0])); 2469 wapbl_free(wr, sizeof(*wr)); 2470 } 2471 2472 #ifdef _KERNEL 2473 int 2474 wapbl_replay_isopen1(struct wapbl_replay *wr) 2475 { 2476 2477 return wapbl_replay_isopen(wr); 2478 } 2479 #endif 2480 2481 static void 2482 wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp) 2483 { 2484 struct wapbl_wc_blocklist *wc = 2485 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2486 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2487 int i, j, n; 2488 2489 for (i = 0; i < wc->wc_blkcount; i++) { 2490 /* 2491 * Enter each physical block into the hashtable independently. 2492 */ 2493 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2494 for (j = 0; j < n; j++) { 2495 wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen), 2496 *offp); 2497 wapbl_circ_advance(wr, fsblklen, offp); 2498 } 2499 } 2500 } 2501 2502 static void 2503 wapbl_replay_process_revocations(struct wapbl_replay *wr) 2504 { 2505 struct wapbl_wc_blocklist *wc = 2506 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2507 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2508 int i, j, n; 2509 2510 for (i = 0; i < wc->wc_blkcount; i++) { 2511 /* 2512 * Remove any blocks found from the hashtable. 2513 */ 2514 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift; 2515 for (j = 0; j < n; j++) 2516 wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2517 } 2518 } 2519 2520 static void 2521 wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff) 2522 { 2523 struct wapbl_wc_inodelist *wc = 2524 (struct wapbl_wc_inodelist *)wr->wr_scratch; 2525 void *new_inodes; 2526 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]); 2527 2528 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0])); 2529 2530 /* 2531 * Keep track of where we found this so location won't be 2532 * overwritten. 2533 */ 2534 if (wc->wc_clear) { 2535 wr->wr_inodestail = oldoff; 2536 wr->wr_inodescnt = 0; 2537 if (wr->wr_inodes != NULL) { 2538 wapbl_free(wr->wr_inodes, oldsize); 2539 wr->wr_inodes = NULL; 2540 } 2541 } 2542 wr->wr_inodeshead = newoff; 2543 if (wc->wc_inocnt == 0) 2544 return; 2545 2546 new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) * 2547 sizeof(wr->wr_inodes[0])); 2548 if (wr->wr_inodes != NULL) { 2549 memcpy(new_inodes, wr->wr_inodes, oldsize); 2550 wapbl_free(wr->wr_inodes, oldsize); 2551 } 2552 wr->wr_inodes = new_inodes; 2553 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes, 2554 wc->wc_inocnt * sizeof(wr->wr_inodes[0])); 2555 wr->wr_inodescnt += wc->wc_inocnt; 2556 } 2557 2558 static int 2559 wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail) 2560 { 2561 off_t off; 2562 int error; 2563 2564 int logblklen = 1 << wr->wr_log_dev_bshift; 2565 2566 wapbl_blkhash_clear(wr); 2567 2568 off = tail; 2569 while (off != head) { 2570 struct wapbl_wc_null *wcn; 2571 off_t saveoff = off; 2572 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2573 if (error) 2574 goto errout; 2575 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2576 switch (wcn->wc_type) { 2577 case WAPBL_WC_BLOCKS: 2578 wapbl_replay_process_blocks(wr, &off); 2579 break; 2580 2581 case WAPBL_WC_REVOCATIONS: 2582 wapbl_replay_process_revocations(wr); 2583 break; 2584 2585 case WAPBL_WC_INODES: 2586 wapbl_replay_process_inodes(wr, saveoff, off); 2587 break; 2588 2589 default: 2590 printf("Unrecognized wapbl type: 0x%08x\n", 2591 wcn->wc_type); 2592 error = EFTYPE; 2593 goto errout; 2594 } 2595 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2596 if (off != saveoff) { 2597 printf("wapbl_replay: corrupted records\n"); 2598 error = EFTYPE; 2599 goto errout; 2600 } 2601 } 2602 return 0; 2603 2604 errout: 2605 wapbl_blkhash_clear(wr); 2606 return error; 2607 } 2608 2609 #if 0 2610 int 2611 wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) 2612 { 2613 off_t off; 2614 int mismatchcnt = 0; 2615 int logblklen = 1 << wr->wr_log_dev_bshift; 2616 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2617 void *scratch1 = wapbl_malloc(MAXBSIZE); 2618 void *scratch2 = wapbl_malloc(MAXBSIZE); 2619 int error = 0; 2620 2621 KDASSERT(wapbl_replay_isopen(wr)); 2622 2623 off = wch->wc_tail; 2624 while (off != wch->wc_head) { 2625 struct wapbl_wc_null *wcn; 2626 #ifdef DEBUG 2627 off_t saveoff = off; 2628 #endif 2629 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); 2630 if (error) 2631 goto out; 2632 wcn = (struct wapbl_wc_null *)wr->wr_scratch; 2633 switch (wcn->wc_type) { 2634 case WAPBL_WC_BLOCKS: 2635 { 2636 struct wapbl_wc_blocklist *wc = 2637 (struct wapbl_wc_blocklist *)wr->wr_scratch; 2638 int i; 2639 for (i = 0; i < wc->wc_blkcount; i++) { 2640 int foundcnt = 0; 2641 int dirtycnt = 0; 2642 int j, n; 2643 /* 2644 * Check each physical block into the 2645 * hashtable independently 2646 */ 2647 n = wc->wc_blocks[i].wc_dlen >> 2648 wch->wc_fs_dev_bshift; 2649 for (j = 0; j < n; j++) { 2650 struct wapbl_blk *wb = 2651 wapbl_blkhash_get(wr, 2652 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2653 if (wb && (wb->wb_off == off)) { 2654 foundcnt++; 2655 error = 2656 wapbl_circ_read(wr, 2657 scratch1, fsblklen, 2658 &off); 2659 if (error) 2660 goto out; 2661 error = 2662 wapbl_read(scratch2, 2663 fsblklen, fsdevvp, 2664 wb->wb_blk); 2665 if (error) 2666 goto out; 2667 if (memcmp(scratch1, 2668 scratch2, 2669 fsblklen)) { 2670 printf( 2671 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", 2672 wb->wb_blk, (intmax_t)off); 2673 dirtycnt++; 2674 mismatchcnt++; 2675 } 2676 } else { 2677 wapbl_circ_advance(wr, 2678 fsblklen, &off); 2679 } 2680 } 2681 #if 0 2682 /* 2683 * If all of the blocks in an entry 2684 * are clean, then remove all of its 2685 * blocks from the hashtable since they 2686 * never will need replay. 2687 */ 2688 if ((foundcnt != 0) && 2689 (dirtycnt == 0)) { 2690 off = saveoff; 2691 wapbl_circ_advance(wr, 2692 logblklen, &off); 2693 for (j = 0; j < n; j++) { 2694 struct wapbl_blk *wb = 2695 wapbl_blkhash_get(wr, 2696 wc->wc_blocks[i].wc_daddr + btodb(j * fsblklen)); 2697 if (wb && 2698 (wb->wb_off == off)) { 2699 wapbl_blkhash_rem(wr, wb->wb_blk); 2700 } 2701 wapbl_circ_advance(wr, 2702 fsblklen, &off); 2703 } 2704 } 2705 #endif 2706 } 2707 } 2708 break; 2709 case WAPBL_WC_REVOCATIONS: 2710 case WAPBL_WC_INODES: 2711 break; 2712 default: 2713 KASSERT(0); 2714 } 2715 #ifdef DEBUG 2716 wapbl_circ_advance(wr, wcn->wc_len, &saveoff); 2717 KASSERT(off == saveoff); 2718 #endif 2719 } 2720 out: 2721 wapbl_free(scratch1, MAXBSIZE); 2722 wapbl_free(scratch2, MAXBSIZE); 2723 if (!error && mismatchcnt) 2724 error = EFTYPE; 2725 return error; 2726 } 2727 #endif 2728 2729 int 2730 wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) 2731 { 2732 struct wapbl_blk *wb; 2733 size_t i; 2734 off_t off; 2735 void *scratch; 2736 int error = 0; 2737 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2738 2739 KDASSERT(wapbl_replay_isopen(wr)); 2740 2741 scratch = wapbl_malloc(MAXBSIZE); 2742 2743 for (i = 0; i <= wr->wr_blkhashmask; ++i) { 2744 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) { 2745 off = wb->wb_off; 2746 error = wapbl_circ_read(wr, scratch, fsblklen, &off); 2747 if (error) 2748 break; 2749 error = wapbl_write(scratch, fsblklen, fsdevvp, 2750 wb->wb_blk); 2751 if (error) 2752 break; 2753 } 2754 } 2755 2756 wapbl_free(scratch, MAXBSIZE); 2757 return error; 2758 } 2759 2760 int 2761 wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) 2762 { 2763 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2764 2765 KDASSERT(wapbl_replay_isopen(wr)); 2766 KASSERT((len % fsblklen) == 0); 2767 2768 while (len != 0) { 2769 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2770 if (wb) 2771 return 1; 2772 len -= fsblklen; 2773 } 2774 return 0; 2775 } 2776 2777 int 2778 wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) 2779 { 2780 int fsblklen = 1 << wr->wr_fs_dev_bshift; 2781 2782 KDASSERT(wapbl_replay_isopen(wr)); 2783 2784 KASSERT((len % fsblklen) == 0); 2785 2786 while (len != 0) { 2787 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); 2788 if (wb) { 2789 off_t off = wb->wb_off; 2790 int error; 2791 error = wapbl_circ_read(wr, data, fsblklen, &off); 2792 if (error) 2793 return error; 2794 } 2795 data = (uint8_t *)data + fsblklen; 2796 len -= fsblklen; 2797 blk++; 2798 } 2799 return 0; 2800 } 2801 2802 #ifdef _KERNEL 2803 /* 2804 * This is not really a module now, but maybe on it's way to 2805 * being one some day. 2806 */ 2807 MODULE(MODULE_CLASS_VFS, wapbl, NULL); 2808 2809 static int 2810 wapbl_modcmd(modcmd_t cmd, void *arg) 2811 { 2812 2813 switch (cmd) { 2814 case MODULE_CMD_INIT: 2815 wapbl_init(); 2816 return 0; 2817 case MODULE_CMD_FINI: 2818 #ifdef notyet 2819 return wapbl_fini(true); 2820 #endif 2821 return EOPNOTSUPP; 2822 default: 2823 return ENOTTY; 2824 } 2825 } 2826 #endif /* _KERNEL */ 2827