1 /* $NetBSD: vfs_wapbl.c,v 1.25 2009/04/05 11:48:02 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Wasabi Systems, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * This implements file system independent write ahead filesystem logging. 34 */ 35 36 #define WAPBL_INTERNAL 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.25 2009/04/05 11:48:02 lukem Exp $"); 40 41 #include <sys/param.h> 42 43 #ifdef _KERNEL 44 #include <sys/param.h> 45 #include <sys/namei.h> 46 #include <sys/proc.h> 47 #include <sys/uio.h> 48 #include <sys/vnode.h> 49 #include <sys/file.h> 50 #include <sys/malloc.h> 51 #include <sys/resourcevar.h> 52 #include <sys/conf.h> 53 #include <sys/mount.h> 54 #include <sys/kernel.h> 55 #include <sys/kauth.h> 56 #include <sys/mutex.h> 57 #include <sys/atomic.h> 58 #include <sys/wapbl.h> 59 #include <sys/wapbl_replay.h> 60 61 #include <miscfs/specfs/specdev.h> 62 63 #if 0 /* notyet */ 64 #define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP) 65 #define wapbl_free(a, s) kmem_free((a), (s)) 66 #define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP) 67 #else 68 MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); 69 #define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) 70 #define wapbl_free(a, s) free((a), M_WAPBL) 71 #define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) 72 #endif 73 74 #else /* !_KERNEL */ 75 #include <assert.h> 76 #include <errno.h> 77 #include <stdio.h> 78 #include <stdbool.h> 79 #include <stdlib.h> 80 #include <string.h> 81 82 #include <sys/time.h> 83 #include <sys/wapbl.h> 84 #include <sys/wapbl_replay.h> 85 86 #define KDASSERT(x) assert(x) 87 #define KASSERT(x) assert(x) 88 #define wapbl_malloc(s) malloc(s) 89 #define wapbl_free(a, s) free(a) 90 #define wapbl_calloc(n, s) calloc((n), (s)) 91 92 #endif /* !_KERNEL */ 93 94 /* 95 * INTERNAL DATA STRUCTURES 96 */ 97 98 /* 99 * This structure holds per-mount log information. 
100 *
101 * Legend: a = atomic access only
102 * r = read-only after init
103 * l = rwlock held
104 * m = mutex held
105 * u = unlocked access ok
106 * b = bufcache_lock held
107 */
108 struct wapbl {
109 struct vnode *wl_logvp; /* r: log here */
110 struct vnode *wl_devvp; /* r: log on this device */
111 struct mount *wl_mount; /* r: mountpoint wl is associated with */
112 daddr_t wl_logpbn; /* r: Physical block number of start of log */
113 int wl_log_dev_bshift; /* r: logarithm of device block size of log
114 device */
115 int wl_fs_dev_bshift; /* r: logarithm of device block size of
116 filesystem device */
117
118 unsigned wl_lock_count; /* m: Count of transactions in progress */
119
120 size_t wl_circ_size; /* r: Number of bytes in buffer of log */
121 size_t wl_circ_off; /* r: Number of bytes reserved at start */
122
123 size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
124 size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
125
126 off_t wl_head; /* l: Byte offset of log head */
127 off_t wl_tail; /* l: Byte offset of log tail */
128 /*
129 * head == tail == 0 means log is empty
130 * head == tail != 0 means log is full
131 * see assertions in wapbl_advance() for other boundary conditions.
132 * Only truncate moves the tail, except when flush sets it to
133 * wl_header_size.  Only flush moves the head, except when truncate
134 * sets it to 0.
135 */
136
137 struct wapbl_wc_header *wl_wc_header; /* l */
138 void *wl_wc_scratch; /* l: scratch space (XXX: why?!?) */
139
140 kmutex_t wl_mtx; /* u: short-term lock */
141 krwlock_t wl_rwlock; /* u: File system transaction lock */
142
143 /*
144 * wl_mtx must be held while accessing
145 * wl_bufcount, wl_bufs, or head and tail
146 */
147
148 /*
149 * Callback called from within the flush routine to flush any extra
150 * bits. Note that flush may be skipped without calling this if
151 * there are no outstanding buffers in the transaction.
152 */
153 #ifdef _KERNEL
154 wapbl_flush_fn_t wl_flush; /* r */
155 wapbl_flush_fn_t wl_flush_abort;/* r */
156 #endif
157
158 size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
159 size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
160 size_t wl_bcount; /* m: Total bcount of wl_bufs */
161
162 LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
163
164 kcondvar_t wl_reclaimable_cv; /* m (obviously) */
165 size_t wl_reclaimable_bytes; /* m: Amount of space available for
166 reclamation by truncate */
167 int wl_error_count; /* m: # of wl_entries with errors */
168 size_t wl_reserved_bytes; /* never truncate log smaller than this */
169
170 #ifdef WAPBL_DEBUG_BUFBYTES
171 size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
172 #endif
173
174 daddr_t *wl_deallocblks;/* l: address of block */
175 int *wl_dealloclens; /* l: size of block */
176 int wl_dealloccnt; /* l: total count */
177 int wl_dealloclim; /* l: max count */
178
179 /* hashtable of inode numbers for allocated but unlinked inodes */
180 /* synch ??? */
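/*
 * (XXX: judging from the accessors, the answer appears to be wl_mtx --
 * wapbl_register_inode(), wapbl_unregister_inode() and
 * wapbl_inodetrk_get() all run with it held -- but this has not been
 * verified against every caller.)
 */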
181 LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
182 u_long wl_inohashmask;
183 int wl_inohashcnt;
184
185 SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
186 accounting */
187 };
188
189 #ifdef WAPBL_DEBUG_PRINT
190 int wapbl_debug_print = WAPBL_DEBUG_PRINT;
191 #endif
192
193 /****************************************************************/
194 #ifdef _KERNEL
195
196 #ifdef WAPBL_DEBUG
197 struct wapbl *wapbl_debug_wl;
198 #endif
199
200 static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
201 static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
202 static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
203 static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
204 #endif /* _KERNEL */
205
206 static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
207
208 static __inline size_t wapbl_space_free(size_t avail, off_t head,
209 off_t tail);
210 static __inline size_t wapbl_space_used(size_t avail, off_t head,
211 off_t tail);
212
213 #ifdef _KERNEL
214
215 #define WAPBL_INODETRK_SIZE 83
216 static int wapbl_ino_pool_refcount;
217 static struct pool wapbl_ino_pool;
218 struct wapbl_ino {
219 LIST_ENTRY(wapbl_ino) wi_hash;
220 ino_t wi_ino;
221 mode_t wi_mode;
222 };
223
224 static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
225 static void wapbl_inodetrk_free(struct wapbl *wl);
226 static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
227
228 static size_t wapbl_transaction_len(struct wapbl *wl);
229 static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
230
231 #if 0
232 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
233 #endif
234
235 static int wapbl_replay_isopen1(struct wapbl_replay *);
236
237 /*
238 * This is useful for debugging. If set, the log will
239 * only be truncated when necessary.
240 */
241 int wapbl_lazy_truncate = 0;
242
243 struct wapbl_ops wapbl_ops = {
244 .wo_wapbl_discard = wapbl_discard,
245 .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
246 .wo_wapbl_replay_can_read = wapbl_replay_can_read,
247 .wo_wapbl_replay_read = wapbl_replay_read,
248 .wo_wapbl_add_buf = wapbl_add_buf,
249 .wo_wapbl_remove_buf = wapbl_remove_buf,
250 .wo_wapbl_resize_buf = wapbl_resize_buf,
251 .wo_wapbl_begin = wapbl_begin,
252 .wo_wapbl_end = wapbl_end,
253 .wo_wapbl_junlock_assert= wapbl_junlock_assert,
254
255 /* XXX: the following is only used to say "this is a wapbl buf" */
256 .wo_wapbl_biodone = wapbl_biodone,
257 };
258
259 void
260 wapbl_init(void)
261 {
262
263 malloc_type_attach(M_WAPBL);
264 }
265
266 static int
267 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
268 {
269 int error, i;
270
271 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
272 ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
273
274 /*
275 * It's only valid to reuse the replay log if it's
276 * the same as the new log we just opened.
277 */
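/*
 * "The same" here means same device, same physical start block, and
 * same circular-buffer geometry (size, offset, and block shifts);
 * that is exactly what the assertions below check.
 */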
278 KDASSERT(!wapbl_replay_isopen(wr));
279 KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
280 KASSERT(wl->wl_logpbn == wr->wr_logpbn);
281 KASSERT(wl->wl_circ_size == wr->wr_circ_size);
282 KASSERT(wl->wl_circ_off == wr->wr_circ_off);
283 KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
284 KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
285
286 wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
287
288 for (i = 0; i < wr->wr_inodescnt; i++)
289 wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
290 wr->wr_inodes[i].wr_imode);
291
292 /* Make sure new transaction won't overwrite old inodes list */
293 KDASSERT(wapbl_transaction_len(wl) <=
294 wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
295 wr->wr_inodestail));
296
297 wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
298 wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
299 wapbl_transaction_len(wl);
300
301 error = wapbl_write_inodes(wl, &wl->wl_head);
302 if (error)
303 return error;
304
305 KASSERT(wl->wl_head != wl->wl_tail);
306 KASSERT(wl->wl_head != 0);
307
308 return 0;
309 }
310
311 int
312 wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
313 daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
314 wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
315 {
316 struct wapbl *wl;
317 struct vnode *devvp;
318 daddr_t logpbn;
319 int error;
320 int log_dev_bshift = DEV_BSHIFT;
321 int fs_dev_bshift = DEV_BSHIFT;
322 int run;
323
324 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
325 " count=%zu blksize=%zu\n", vp, off, count, blksize));
326
327 if (log_dev_bshift > fs_dev_bshift) {
328 WAPBL_PRINTF(WAPBL_PRINT_OPEN,
329 ("wapbl: log device's block size cannot be larger "
330 "than filesystem's\n"));
331 /*
332 * Not currently implemented, although it could be if
333 * needed someday.
334 */
335 return ENOSYS;
336 }
337
338 if (off < 0)
339 return EINVAL;
340
341 if (blksize < DEV_BSIZE)
342 return EINVAL;
343 if (blksize % DEV_BSIZE)
344 return EINVAL;
345
346 /* XXXTODO: verify that the full load is writable */
347
348 /*
349 * XXX check for minimum log size:
350 * the minimum is governed by the minimum amount of space
351 * needed to complete a transaction. (probably by truncate)
352 */
353 /* XXX for now pick something minimal */
354 if ((count * blksize) < MAXPHYS) {
355 return ENOSPC;
356 }
357
358 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
359 return error;
360 }
361
362 wl = wapbl_calloc(1, sizeof(*wl));
363 rw_init(&wl->wl_rwlock);
364 mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
365 cv_init(&wl->wl_reclaimable_cv, "wapblrec");
366 LIST_INIT(&wl->wl_bufs);
367 SIMPLEQ_INIT(&wl->wl_entries);
368
369 wl->wl_logvp = vp;
370 wl->wl_devvp = devvp;
371 wl->wl_mount = mp;
372 wl->wl_logpbn = logpbn;
373 wl->wl_log_dev_bshift = log_dev_bshift;
374 wl->wl_fs_dev_bshift = fs_dev_bshift;
375
376 wl->wl_flush = flushfn;
377 wl->wl_flush_abort = flushabortfn;
378
379 /* Reserve two log device blocks for the commit headers */
380 wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
381 wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
382 /* truncate the log usage to a multiple of the log device block size */
383 wl->wl_circ_size >>= wl->wl_log_dev_bshift;
384 wl->wl_circ_size <<= wl->wl_log_dev_bshift;
385
386 /*
387 * wl_bufbytes_max limits the size of the in-memory transaction space.
388 * - Since buffers are allocated and accounted for in units of
389 * PAGE_SIZE it is required to be a multiple of PAGE_SIZE
390 * (i.e.
1<<PAGE_SHIFT)
391 * - Since the log device has to be written in units of
392 * 1<<wl_log_dev_bshift it is required to be a multiple of
393 * 1<<wl_log_dev_bshift.
394 * - Since the filesystem will provide data in units of 1<<wl_fs_dev_bshift,
395 * it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
396 * Therefore it must be a multiple of the least common multiple of those
397 * three quantities. Fortunately, all of those quantities are
398 * guaranteed to be a power of two, and the least common multiple of
399 * a set of numbers which are all powers of two is simply the maximum
400 * of those numbers. Finally, the maximum of the logarithms is the
401 * same as the logarithm of the maximum. So we can do
402 * the following operations to size wl_bufbytes_max:
403 */
404
405 /* XXX fix actual number of pages reserved per filesystem. */
406 wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
407
408 /* Round wl_bufbytes_max down to satisfy each power-of-two constraint */
409 wl->wl_bufbytes_max >>= PAGE_SHIFT;
410 wl->wl_bufbytes_max <<= PAGE_SHIFT;
411 wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
412 wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
413 wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
414 wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
415
416 /* XXX maybe use filesystem fragment size instead of 1024 */
417 /* XXX fix actual number of buffers reserved per filesystem. */
418 wl->wl_bufcount_max = (nbuf / 2) * 1024;
419
420 /* XXX tie this into resource estimation */
421 wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
422
423 wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
424 wl->wl_dealloclim);
425 wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
426 wl->wl_dealloclim);
427
428 wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
429
430 /* Initialize the commit header */
431 {
432 struct wapbl_wc_header *wc;
433 size_t len = 1 << wl->wl_log_dev_bshift;
434 wc = wapbl_calloc(1, len);
435 wc->wc_type = WAPBL_WC_HEADER;
436 wc->wc_len = len;
437 wc->wc_circ_off = wl->wl_circ_off;
438 wc->wc_circ_size = wl->wl_circ_size;
439 /* XXX wc->wc_fsid */
440 wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
441 wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
442 wl->wl_wc_header = wc;
443 wl->wl_wc_scratch = wapbl_malloc(len);
444 }
445
446 /*
447 * if there was an existing set of unlinked but
448 * allocated inodes, preserve it in the new
449 * log.
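 * (wapbl_start_flush_inodes() re-registers each inode recovered by
 * replay and immediately rewrites the list into the new log, seeding
 * wl_head/wl_tail and the reserved byte count in the process.)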
450 */ 451 if (wr && wr->wr_inodescnt) { 452 error = wapbl_start_flush_inodes(wl, wr); 453 if (error) 454 goto errout; 455 } 456 457 error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); 458 if (error) { 459 goto errout; 460 } 461 462 *wlp = wl; 463 #if defined(WAPBL_DEBUG) 464 wapbl_debug_wl = wl; 465 #endif 466 467 return 0; 468 errout: 469 wapbl_discard(wl); 470 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 471 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 472 wapbl_free(wl->wl_deallocblks, 473 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 474 wapbl_free(wl->wl_dealloclens, 475 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 476 wapbl_inodetrk_free(wl); 477 wapbl_free(wl, sizeof(*wl)); 478 479 return error; 480 } 481 482 /* 483 * Like wapbl_flush, only discards the transaction 484 * completely 485 */ 486 487 void 488 wapbl_discard(struct wapbl *wl) 489 { 490 struct wapbl_entry *we; 491 struct buf *bp; 492 int i; 493 494 /* 495 * XXX we may consider using upgrade here 496 * if we want to call flush from inside a transaction 497 */ 498 rw_enter(&wl->wl_rwlock, RW_WRITER); 499 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 500 wl->wl_dealloccnt); 501 502 #ifdef WAPBL_DEBUG_PRINT 503 { 504 struct wapbl_entry *we; 505 pid_t pid = -1; 506 lwpid_t lid = -1; 507 if (curproc) 508 pid = curproc->p_pid; 509 if (curlwp) 510 lid = curlwp->l_lid; 511 #ifdef WAPBL_DEBUG_BUFBYTES 512 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 513 ("wapbl_discard: thread %d.%d discarding " 514 "transaction\n" 515 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 516 "deallocs=%d inodes=%d\n" 517 "\terrcnt = %u, reclaimable=%zu reserved=%zu " 518 "unsynced=%zu\n", 519 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 520 wl->wl_bcount, wl->wl_dealloccnt, 521 wl->wl_inohashcnt, wl->wl_error_count, 522 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 523 wl->wl_unsynced_bufbytes)); 524 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 525 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 526 ("\tentry: bufcount = %zu, reclaimable = %zu, " 527 "error = %d, unsynced = %zu\n", 528 we->we_bufcount, we->we_reclaimable_bytes, 529 we->we_error, we->we_unsynced_bufbytes)); 530 } 531 #else /* !WAPBL_DEBUG_BUFBYTES */ 532 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 533 ("wapbl_discard: thread %d.%d discarding transaction\n" 534 "\tbufcount=%zu bufbytes=%zu bcount=%zu " 535 "deallocs=%d inodes=%d\n" 536 "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", 537 pid, lid, wl->wl_bufcount, wl->wl_bufbytes, 538 wl->wl_bcount, wl->wl_dealloccnt, 539 wl->wl_inohashcnt, wl->wl_error_count, 540 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); 541 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { 542 WAPBL_PRINTF(WAPBL_PRINT_DISCARD, 543 ("\tentry: bufcount = %zu, reclaimable = %zu, " 544 "error = %d\n", 545 we->we_bufcount, we->we_reclaimable_bytes, 546 we->we_error)); 547 } 548 #endif /* !WAPBL_DEBUG_BUFBYTES */ 549 } 550 #endif /* WAPBL_DEBUG_PRINT */ 551 552 for (i = 0; i <= wl->wl_inohashmask; i++) { 553 struct wapbl_ino_head *wih; 554 struct wapbl_ino *wi; 555 556 wih = &wl->wl_inohash[i]; 557 while ((wi = LIST_FIRST(wih)) != NULL) { 558 LIST_REMOVE(wi, wi_hash); 559 pool_put(&wapbl_ino_pool, wi); 560 KASSERT(wl->wl_inohashcnt > 0); 561 wl->wl_inohashcnt--; 562 } 563 } 564 565 /* 566 * clean buffer list 567 */ 568 mutex_enter(&bufcache_lock); 569 mutex_enter(&wl->wl_mtx); 570 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { 571 if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { 572 /* 573 * The buffer will be unlocked and 574 * removed from the 
transaction in brelse 575 */ 576 mutex_exit(&wl->wl_mtx); 577 brelsel(bp, 0); 578 mutex_enter(&wl->wl_mtx); 579 } 580 } 581 mutex_exit(&wl->wl_mtx); 582 mutex_exit(&bufcache_lock); 583 584 /* 585 * Remove references to this wl from wl_entries, free any which 586 * no longer have buffers, others will be freed in wapbl_biodone 587 * when they no longer have any buffers. 588 */ 589 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { 590 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 591 /* XXX should we be accumulating wl_error_count 592 * and increasing reclaimable bytes ? */ 593 we->we_wapbl = NULL; 594 if (we->we_bufcount == 0) { 595 #ifdef WAPBL_DEBUG_BUFBYTES 596 KASSERT(we->we_unsynced_bufbytes == 0); 597 #endif 598 wapbl_free(we, sizeof(*we)); 599 } 600 } 601 602 /* Discard list of deallocs */ 603 wl->wl_dealloccnt = 0; 604 /* XXX should we clear wl_reserved_bytes? */ 605 606 KASSERT(wl->wl_bufbytes == 0); 607 KASSERT(wl->wl_bcount == 0); 608 KASSERT(wl->wl_bufcount == 0); 609 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 610 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 611 KASSERT(wl->wl_inohashcnt == 0); 612 613 rw_exit(&wl->wl_rwlock); 614 } 615 616 int 617 wapbl_stop(struct wapbl *wl, int force) 618 { 619 struct vnode *vp; 620 int error; 621 622 WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); 623 error = wapbl_flush(wl, 1); 624 if (error) { 625 if (force) 626 wapbl_discard(wl); 627 else 628 return error; 629 } 630 631 /* Unlinked inodes persist after a flush */ 632 if (wl->wl_inohashcnt) { 633 if (force) { 634 wapbl_discard(wl); 635 } else { 636 return EBUSY; 637 } 638 } 639 640 KASSERT(wl->wl_bufbytes == 0); 641 KASSERT(wl->wl_bcount == 0); 642 KASSERT(wl->wl_bufcount == 0); 643 KASSERT(LIST_EMPTY(&wl->wl_bufs)); 644 KASSERT(wl->wl_dealloccnt == 0); 645 KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); 646 KASSERT(wl->wl_inohashcnt == 0); 647 648 vp = wl->wl_logvp; 649 650 wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len); 651 wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len); 652 wapbl_free(wl->wl_deallocblks, 653 sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); 654 wapbl_free(wl->wl_dealloclens, 655 sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); 656 wapbl_inodetrk_free(wl); 657 658 cv_destroy(&wl->wl_reclaimable_cv); 659 mutex_destroy(&wl->wl_mtx); 660 rw_destroy(&wl->wl_rwlock); 661 wapbl_free(wl, sizeof(*wl)); 662 663 return 0; 664 } 665 666 static int 667 wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) 668 { 669 struct pstats *pstats = curlwp->l_proc->p_stats; 670 struct buf *bp; 671 int error; 672 673 KASSERT((flags & ~(B_WRITE | B_READ)) == 0); 674 KASSERT(devvp->v_type == VBLK); 675 676 if ((flags & (B_WRITE | B_READ)) == B_WRITE) { 677 mutex_enter(&devvp->v_interlock); 678 devvp->v_numoutput++; 679 mutex_exit(&devvp->v_interlock); 680 pstats->p_ru.ru_oublock++; 681 } else { 682 pstats->p_ru.ru_inblock++; 683 } 684 685 bp = getiobuf(devvp, true); 686 bp->b_flags = flags; 687 bp->b_cflags = BC_BUSY; /* silly & dubious */ 688 bp->b_dev = devvp->v_rdev; 689 bp->b_data = data; 690 bp->b_bufsize = bp->b_resid = bp->b_bcount = len; 691 bp->b_blkno = pbn; 692 693 WAPBL_PRINTF(WAPBL_PRINT_IO, 694 ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n", 695 BUF_ISWRITE(bp) ? 
"write" : "read", bp->b_bcount, 696 bp->b_blkno, bp->b_dev)); 697 698 VOP_STRATEGY(devvp, bp); 699 700 error = biowait(bp); 701 putiobuf(bp); 702 703 if (error) { 704 WAPBL_PRINTF(WAPBL_PRINT_ERROR, 705 ("wapbl_doio: %s %zu bytes at block %" PRId64 706 " on dev 0x%x failed with error %d\n", 707 (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 708 "write" : "read"), 709 len, pbn, devvp->v_rdev, error)); 710 } 711 712 return error; 713 } 714 715 int 716 wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 717 { 718 719 return wapbl_doio(data, len, devvp, pbn, B_WRITE); 720 } 721 722 int 723 wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) 724 { 725 726 return wapbl_doio(data, len, devvp, pbn, B_READ); 727 } 728 729 /* 730 * Off is byte offset returns new offset for next write 731 * handles log wraparound 732 */ 733 static int 734 wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) 735 { 736 size_t slen; 737 off_t off = *offp; 738 int error; 739 740 KDASSERT(((len >> wl->wl_log_dev_bshift) << 741 wl->wl_log_dev_bshift) == len); 742 743 if (off < wl->wl_circ_off) 744 off = wl->wl_circ_off; 745 slen = wl->wl_circ_off + wl->wl_circ_size - off; 746 if (slen < len) { 747 error = wapbl_write(data, slen, wl->wl_devvp, 748 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); 749 if (error) 750 return error; 751 data = (uint8_t *)data + slen; 752 len -= slen; 753 off = wl->wl_circ_off; 754 } 755 error = wapbl_write(data, len, wl->wl_devvp, 756 wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); 757 if (error) 758 return error; 759 off += len; 760 if (off >= wl->wl_circ_off + wl->wl_circ_size) 761 off = wl->wl_circ_off; 762 *offp = off; 763 return 0; 764 } 765 766 /****************************************************************/ 767 768 int 769 wapbl_begin(struct wapbl *wl, const char *file, int line) 770 { 771 int doflush; 772 unsigned lockcount; 773 774 KDASSERT(wl); 775 776 /* 777 * XXX this needs to be made much more sophisticated. 778 * perhaps each wapbl_begin could reserve a specified 779 * number of buffers and bytes. 
781 mutex_enter(&wl->wl_mtx);
782 lockcount = wl->wl_lock_count;
783 doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
784 wl->wl_bufbytes_max / 2) ||
785 ((wl->wl_bufcount + (lockcount * 10)) >
786 wl->wl_bufcount_max / 2) ||
787 (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
788 mutex_exit(&wl->wl_mtx);
789
790 if (doflush) {
791 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
792 ("force flush lockcnt=%d bufbytes=%zu "
793 "(max=%zu) bufcount=%zu (max=%zu)\n",
794 lockcount, wl->wl_bufbytes,
795 wl->wl_bufbytes_max, wl->wl_bufcount,
796 wl->wl_bufcount_max));
797 }
798
799 if (doflush) {
800 int error = wapbl_flush(wl, 0);
801 if (error)
802 return error;
803 }
804
805 rw_enter(&wl->wl_rwlock, RW_READER);
806 mutex_enter(&wl->wl_mtx);
807 wl->wl_lock_count++;
808 mutex_exit(&wl->wl_mtx);
809
810 #if defined(WAPBL_DEBUG_PRINT)
811 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
812 ("wapbl_begin thread %d.%d with bufcount=%zu "
813 "bufbytes=%zu bcount=%zu at %s:%d\n",
814 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
815 wl->wl_bufbytes, wl->wl_bcount, file, line));
816 #endif
817
818 return 0;
819 }
820
821 void
822 wapbl_end(struct wapbl *wl)
823 {
824
825 #if defined(WAPBL_DEBUG_PRINT)
826 WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
827 ("wapbl_end thread %d.%d with bufcount=%zu "
828 "bufbytes=%zu bcount=%zu\n",
829 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
830 wl->wl_bufbytes, wl->wl_bcount));
831 #endif
832
833 mutex_enter(&wl->wl_mtx);
834 KASSERT(wl->wl_lock_count > 0);
835 wl->wl_lock_count--;
836 mutex_exit(&wl->wl_mtx);
837
838 rw_exit(&wl->wl_rwlock);
839 }
840
841 void
842 wapbl_add_buf(struct wapbl *wl, struct buf * bp)
843 {
844
845 KASSERT(bp->b_cflags & BC_BUSY);
846 KASSERT(bp->b_vp);
847
848 wapbl_jlock_assert(wl);
849
850 #if 0
851 /*
852 * XXX this might be an issue for swapfiles.
853 * see uvm_swap.c:1702
854 *
855 * XXX2 why require it then? leap of semantics?
856 */
857 KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
858 #endif
859
860 mutex_enter(&wl->wl_mtx);
861 if (bp->b_flags & B_LOCKED) {
862 LIST_REMOVE(bp, b_wapbllist);
863 WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
864 ("wapbl_add_buf thread %d.%d re-adding buf %p "
865 "with %d bytes %d bcount\n",
866 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
867 bp->b_bcount));
868 } else {
869 /* unlocked but dirty buffers shouldn't exist */
870 KASSERT(!(bp->b_oflags & BO_DELWRI));
871 wl->wl_bufbytes += bp->b_bufsize;
872 wl->wl_bcount += bp->b_bcount;
873 wl->wl_bufcount++;
874 WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
875 ("wapbl_add_buf thread %d.%d adding buf %p "
876 "with %d bytes %d bcount\n",
877 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
878 bp->b_bcount));
879 }
880 LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
881 mutex_exit(&wl->wl_mtx);
882
883 bp->b_flags |= B_LOCKED;
884 }
885
886 static void
887 wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
888 {
889
890 KASSERT(mutex_owned(&wl->wl_mtx));
891 KASSERT(bp->b_cflags & BC_BUSY);
892 wapbl_jlock_assert(wl);
893
894 #if 0
895 /*
896 * XXX this might be an issue for swapfiles.
897 * see uvm_swap.c:1725 898 * 899 * XXXdeux: see above 900 */ 901 KASSERT((bp->b_flags & BC_NOCACHE) == 0); 902 #endif 903 KASSERT(bp->b_flags & B_LOCKED); 904 905 WAPBL_PRINTF(WAPBL_PRINT_BUFFER, 906 ("wapbl_remove_buf thread %d.%d removing buf %p with " 907 "%d bytes %d bcount\n", 908 curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); 909 910 KASSERT(wl->wl_bufbytes >= bp->b_bufsize); 911 wl->wl_bufbytes -= bp->b_bufsize; 912 KASSERT(wl->wl_bcount >= bp->b_bcount); 913 wl->wl_bcount -= bp->b_bcount; 914 KASSERT(wl->wl_bufcount > 0); 915 wl->wl_bufcount--; 916 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 917 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 918 LIST_REMOVE(bp, b_wapbllist); 919 920 bp->b_flags &= ~B_LOCKED; 921 } 922 923 /* called from brelsel() in vfs_bio among other places */ 924 void 925 wapbl_remove_buf(struct wapbl * wl, struct buf *bp) 926 { 927 928 mutex_enter(&wl->wl_mtx); 929 wapbl_remove_buf_locked(wl, bp); 930 mutex_exit(&wl->wl_mtx); 931 } 932 933 void 934 wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) 935 { 936 937 KASSERT(bp->b_cflags & BC_BUSY); 938 939 /* 940 * XXX: why does this depend on B_LOCKED? otherwise the buf 941 * is not for a transaction? if so, why is this called in the 942 * first place? 943 */ 944 if (bp->b_flags & B_LOCKED) { 945 mutex_enter(&wl->wl_mtx); 946 wl->wl_bufbytes += bp->b_bufsize - oldsz; 947 wl->wl_bcount += bp->b_bcount - oldcnt; 948 mutex_exit(&wl->wl_mtx); 949 } 950 } 951 952 #endif /* _KERNEL */ 953 954 /****************************************************************/ 955 /* Some utility inlines */ 956 957 /* This is used to advance the pointer at old to new value at old+delta */ 958 static __inline off_t 959 wapbl_advance(size_t size, size_t off, off_t old, size_t delta) 960 { 961 off_t new; 962 963 /* Define acceptable ranges for inputs. */ 964 KASSERT(delta <= size); 965 KASSERT((old == 0) || (old >= off)); 966 KASSERT(old < (size + off)); 967 968 if ((old == 0) && (delta != 0)) 969 new = off + delta; 970 else if ((old + delta) < (size + off)) 971 new = old + delta; 972 else 973 new = (old + delta) - size; 974 975 /* Note some interesting axioms */ 976 KASSERT((delta != 0) || (new == old)); 977 KASSERT((delta == 0) || (new != 0)); 978 KASSERT((delta != (size)) || (new == old)); 979 980 /* Define acceptable ranges for output. 
*/
981 KASSERT((new == 0) || (new >= off));
982 KASSERT(new < (size + off));
983 return new;
984 }
985
986 static __inline size_t
987 wapbl_space_used(size_t avail, off_t head, off_t tail)
988 {
989
990 if (tail == 0) {
991 KASSERT(head == 0);
992 return 0;
993 }
994 return ((head + (avail - 1) - tail) % avail) + 1;
995 }
996
997 static __inline size_t
998 wapbl_space_free(size_t avail, off_t head, off_t tail)
999 {
1000
1001 return avail - wapbl_space_used(avail, head, tail);
1002 }
1003
1004 static __inline void
1005 wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
1006 off_t *tailp)
1007 {
1008 off_t head = *headp;
1009 off_t tail = *tailp;
1010
1011 KASSERT(delta <= wapbl_space_free(size, head, tail));
1012 head = wapbl_advance(size, off, head, delta);
1013 if ((tail == 0) && (head != 0))
1014 tail = off;
1015 *headp = head;
1016 *tailp = tail;
1017 }
1018
1019 static __inline void
1020 wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
1021 off_t *tailp)
1022 {
1023 off_t head = *headp;
1024 off_t tail = *tailp;
1025
1026 KASSERT(delta <= wapbl_space_used(size, head, tail));
1027 tail = wapbl_advance(size, off, tail, delta);
1028 if (head == tail) {
1029 head = tail = 0;
1030 }
1031 *headp = head;
1032 *tailp = tail;
1033 }
1034
1035 #ifdef _KERNEL
1036
1037 /****************************************************************/
1038
1039 /*
1040 * Remove transactions whose buffers are completely flushed to disk.
1041 * Will block until at least minfree space is available.
1042 * Only intended to be called from inside wapbl_flush and therefore
1043 * does not protect against commit races with itself or with flush.
1044 */
1045 static int
1046 wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
1047 {
1048 size_t delta;
1049 size_t avail;
1050 off_t head;
1051 off_t tail;
1052 int error = 0;
1053
1054 KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
1055 KASSERT(rw_write_held(&wl->wl_rwlock));
1056
1057 mutex_enter(&wl->wl_mtx);
1058
1059 /*
1060 * First check to see if we have to do a commit
1061 * at all.
1062 */
1063 avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
1064 if (minfree < avail) {
1065 mutex_exit(&wl->wl_mtx);
1066 return 0;
1067 }
1068 minfree -= avail;
1069 while ((wl->wl_error_count == 0) &&
1070 (wl->wl_reclaimable_bytes < minfree)) {
1071 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1072 ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
1073 "minfree=%zd\n",
1074 &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
1075 minfree));
1076
1077 cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
1078 }
1079 if (wl->wl_reclaimable_bytes < minfree) {
1080 KASSERT(wl->wl_error_count);
1081 /* XXX maybe get actual error from buffer instead someday? */
1082 error = EIO;
1083 }
1084 head = wl->wl_head;
1085 tail = wl->wl_tail;
1086 delta = wl->wl_reclaimable_bytes;
1087
1088 /* If all of the entries are flushed, then be sure to keep
1089 * the reserved bytes reserved. Watch out for discarded transactions,
1090 * which could leave more bytes reserved than are reclaimable.
1091 */
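/*
 * E.g. (illustrative numbers): with reclaimable = 5000 and
 * reserved = 2000 and an empty entry queue, we advance the tail by
 * only delta = 3000, keeping the reserved space intact.
 */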
1092 if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
1093 (delta >= wl->wl_reserved_bytes)) {
1094 delta -= wl->wl_reserved_bytes;
1095 }
1096 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
1097 &tail);
1098 KDASSERT(wl->wl_reserved_bytes <=
1099 wapbl_space_used(wl->wl_circ_size, head, tail));
1100 mutex_exit(&wl->wl_mtx);
1101
1102 if (error)
1103 return error;
1104
1105 if (waitonly)
1106 return 0;
1107
1108 /*
1109 * This is where head, tail and delta are unprotected
1110 * from races against itself or flush. This is ok since
1111 * we only call this routine from inside flush itself.
1112 *
1113 * XXX: how can it race against itself when accessed only
1114 * from behind the write-locked rwlock?
1115 */
1116 error = wapbl_write_commit(wl, head, tail);
1117 if (error)
1118 return error;
1119
1120 wl->wl_head = head;
1121 wl->wl_tail = tail;
1122
1123 mutex_enter(&wl->wl_mtx);
1124 KASSERT(wl->wl_reclaimable_bytes >= delta);
1125 wl->wl_reclaimable_bytes -= delta;
1126 mutex_exit(&wl->wl_mtx);
1127 WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
1128 ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
1129 curproc->p_pid, curlwp->l_lid, delta));
1130
1131 return 0;
1132 }
1133
1134 /****************************************************************/
1135
1136 void
1137 wapbl_biodone(struct buf *bp)
1138 {
1139 struct wapbl_entry *we = bp->b_private;
1140 struct wapbl *wl = we->we_wapbl;
1141
1142 /*
1143 * Handle possible flushing of buffers after log has been
1144 * decommissioned.
1145 */
1146 if (!wl) {
1147 KASSERT(we->we_bufcount > 0);
1148 we->we_bufcount--;
1149 #ifdef WAPBL_DEBUG_BUFBYTES
1150 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
1151 we->we_unsynced_bufbytes -= bp->b_bufsize;
1152 #endif
1153
1154 if (we->we_bufcount == 0) {
1155 #ifdef WAPBL_DEBUG_BUFBYTES
1156 KASSERT(we->we_unsynced_bufbytes == 0);
1157 #endif
1158 wapbl_free(we, sizeof(*we));
1159 }
1160
1161 brelse(bp, 0);
1162 return;
1163 }
1164
1165 #ifdef ohbother
1166 KDASSERT(bp->b_flags & B_DONE);
1167 KDASSERT(!(bp->b_flags & B_DELWRI));
1168 KDASSERT(bp->b_flags & B_ASYNC);
1169 KDASSERT(bp->b_flags & B_BUSY);
1170 KDASSERT(!(bp->b_flags & B_LOCKED));
1171 KDASSERT(!(bp->b_flags & B_READ));
1172 KDASSERT(!(bp->b_flags & B_INVAL));
1173 KDASSERT(!(bp->b_flags & B_NOCACHE));
1174 #endif
1175
1176 if (bp->b_error) {
1177 #ifdef notyet /* Can't currently handle possible dirty buffer reuse */
1178 XXXpooka: interfaces not fully updated
1179 Note: this was not enabled in the original patch
1180 against netbsd4 either. I don't know if comment
1181 above is true or not.
1182
1183 /*
1184 * If an error occurs, report the error and leave the
1185 * buffer as a delayed write on the LRU queue.
1186 * Restarting the write would likely result in
1187 * an error spinloop, so let it be done harmlessly
1188 * by the syncer.
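 * (The live code in the #else branch below instead latches
 * wl_error_count, so that wapbl_truncate()'s wait loop gives up with
 * EIO rather than sleeping forever on space that will never become
 * reclaimable.)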
1189 */ 1190 bp->b_flags &= ~(B_DONE); 1191 simple_unlock(&bp->b_interlock); 1192 1193 if (we->we_error == 0) { 1194 mutex_enter(&wl->wl_mtx); 1195 wl->wl_error_count++; 1196 mutex_exit(&wl->wl_mtx); 1197 cv_broadcast(&wl->wl_reclaimable_cv); 1198 } 1199 we->we_error = bp->b_error; 1200 bp->b_error = 0; 1201 brelse(bp); 1202 return; 1203 #else 1204 /* For now, just mark the log permanently errored out */ 1205 1206 mutex_enter(&wl->wl_mtx); 1207 if (wl->wl_error_count == 0) { 1208 wl->wl_error_count++; 1209 cv_broadcast(&wl->wl_reclaimable_cv); 1210 } 1211 mutex_exit(&wl->wl_mtx); 1212 #endif 1213 } 1214 1215 mutex_enter(&wl->wl_mtx); 1216 1217 KASSERT(we->we_bufcount > 0); 1218 we->we_bufcount--; 1219 #ifdef WAPBL_DEBUG_BUFBYTES 1220 KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); 1221 we->we_unsynced_bufbytes -= bp->b_bufsize; 1222 KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); 1223 wl->wl_unsynced_bufbytes -= bp->b_bufsize; 1224 #endif 1225 1226 /* 1227 * If the current transaction can be reclaimed, start 1228 * at the beginning and reclaim any consecutive reclaimable 1229 * transactions. If we successfully reclaim anything, 1230 * then wakeup anyone waiting for the reclaim. 1231 */ 1232 if (we->we_bufcount == 0) { 1233 size_t delta = 0; 1234 int errcnt = 0; 1235 #ifdef WAPBL_DEBUG_BUFBYTES 1236 KDASSERT(we->we_unsynced_bufbytes == 0); 1237 #endif 1238 /* 1239 * clear any posted error, since the buffer it came from 1240 * has successfully flushed by now 1241 */ 1242 while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && 1243 (we->we_bufcount == 0)) { 1244 delta += we->we_reclaimable_bytes; 1245 if (we->we_error) 1246 errcnt++; 1247 SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); 1248 wapbl_free(we, sizeof(*we)); 1249 } 1250 1251 if (delta) { 1252 wl->wl_reclaimable_bytes += delta; 1253 KASSERT(wl->wl_error_count >= errcnt); 1254 wl->wl_error_count -= errcnt; 1255 cv_broadcast(&wl->wl_reclaimable_cv); 1256 } 1257 } 1258 1259 mutex_exit(&wl->wl_mtx); 1260 brelse(bp, 0); 1261 } 1262 1263 /* 1264 * Write transactions to disk + start I/O for contents 1265 */ 1266 int 1267 wapbl_flush(struct wapbl *wl, int waitfor) 1268 { 1269 struct buf *bp; 1270 struct wapbl_entry *we; 1271 off_t off; 1272 off_t head; 1273 off_t tail; 1274 size_t delta = 0; 1275 size_t flushsize; 1276 size_t reserved; 1277 int error = 0; 1278 1279 /* 1280 * Do a quick check to see if a full flush can be skipped 1281 * This assumes that the flush callback does not need to be called 1282 * unless there are other outstanding bufs. 1283 */ 1284 if (!waitfor) { 1285 size_t nbufs; 1286 mutex_enter(&wl->wl_mtx); /* XXX need mutex here to 1287 protect the KASSERTS */ 1288 nbufs = wl->wl_bufcount; 1289 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); 1290 KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); 1291 mutex_exit(&wl->wl_mtx); 1292 if (nbufs == 0) 1293 return 0; 1294 } 1295 1296 /* 1297 * XXX we may consider using LK_UPGRADE here 1298 * if we want to call flush from inside a transaction 1299 */ 1300 rw_enter(&wl->wl_rwlock, RW_WRITER); 1301 wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, 1302 wl->wl_dealloccnt); 1303 1304 /* 1305 * Now that we are fully locked and flushed, 1306 * do another check for nothing to do. 
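 * The on-disk transaction written below is laid out as: the journaled
 * block data (wapbl_write_blocks), then revocation records
 * (wapbl_write_revocations), then the unlinked-inode list
 * (wapbl_write_inodes), and finally the commit header
 * (wapbl_write_commit) that makes the whole thing visible to replay.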
1307 */ 1308 if (wl->wl_bufcount == 0) { 1309 goto out; 1310 } 1311 1312 #if 0 1313 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1314 ("wapbl_flush thread %d.%d flushing entries with " 1315 "bufcount=%zu bufbytes=%zu\n", 1316 curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, 1317 wl->wl_bufbytes)); 1318 #endif 1319 1320 /* Calculate amount of space needed to flush */ 1321 flushsize = wapbl_transaction_len(wl); 1322 1323 if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { 1324 /* 1325 * XXX this could be handled more gracefully, perhaps place 1326 * only a partial transaction in the log and allow the 1327 * remaining to flush without the protection of the journal. 1328 */ 1329 panic("wapbl_flush: current transaction too big to flush\n"); 1330 } 1331 1332 error = wapbl_truncate(wl, flushsize, 0); 1333 if (error) 1334 goto out2; 1335 1336 off = wl->wl_head; 1337 KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 1338 (off < wl->wl_circ_off + wl->wl_circ_size))); 1339 error = wapbl_write_blocks(wl, &off); 1340 if (error) 1341 goto out2; 1342 error = wapbl_write_revocations(wl, &off); 1343 if (error) 1344 goto out2; 1345 error = wapbl_write_inodes(wl, &off); 1346 if (error) 1347 goto out2; 1348 1349 reserved = 0; 1350 if (wl->wl_inohashcnt) 1351 reserved = wapbl_transaction_inodes_len(wl); 1352 1353 head = wl->wl_head; 1354 tail = wl->wl_tail; 1355 1356 wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, 1357 &head, &tail); 1358 #ifdef WAPBL_DEBUG 1359 if (head != off) { 1360 panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX 1361 " off=%"PRIdMAX" flush=%zu\n", 1362 (intmax_t)head, (intmax_t)tail, (intmax_t)off, 1363 flushsize); 1364 } 1365 #else 1366 KASSERT(head == off); 1367 #endif 1368 1369 /* Opportunistically move the tail forward if we can */ 1370 if (!wapbl_lazy_truncate) { 1371 mutex_enter(&wl->wl_mtx); 1372 delta = wl->wl_reclaimable_bytes; 1373 mutex_exit(&wl->wl_mtx); 1374 wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, 1375 &head, &tail); 1376 } 1377 1378 error = wapbl_write_commit(wl, head, tail); 1379 if (error) 1380 goto out2; 1381 1382 we = wapbl_calloc(1, sizeof(*we)); 1383 1384 #ifdef WAPBL_DEBUG_BUFBYTES 1385 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1386 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1387 " unsynced=%zu" 1388 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1389 "inodes=%d\n", 1390 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1391 wapbl_space_used(wl->wl_circ_size, head, tail), 1392 wl->wl_unsynced_bufbytes, wl->wl_bufcount, 1393 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, 1394 wl->wl_inohashcnt)); 1395 #else 1396 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, 1397 ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" 1398 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " 1399 "inodes=%d\n", 1400 curproc->p_pid, curlwp->l_lid, flushsize, delta, 1401 wapbl_space_used(wl->wl_circ_size, head, tail), 1402 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, 1403 wl->wl_dealloccnt, wl->wl_inohashcnt)); 1404 #endif 1405 1406 1407 mutex_enter(&bufcache_lock); 1408 mutex_enter(&wl->wl_mtx); 1409 1410 wl->wl_reserved_bytes = reserved; 1411 wl->wl_head = head; 1412 wl->wl_tail = tail; 1413 KASSERT(wl->wl_reclaimable_bytes >= delta); 1414 wl->wl_reclaimable_bytes -= delta; 1415 wl->wl_dealloccnt = 0; 1416 #ifdef WAPBL_DEBUG_BUFBYTES 1417 wl->wl_unsynced_bufbytes += wl->wl_bufbytes; 1418 #endif 1419 1420 we->we_wapbl = wl; 1421 we->we_bufcount = wl->wl_bufcount; 1422 #ifdef WAPBL_DEBUG_BUFBYTES 1423 we->we_unsynced_bufbytes = 
wl->wl_bufbytes;
1424 #endif
1425 we->we_reclaimable_bytes = flushsize;
1426 we->we_error = 0;
1427 SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
1428
1429 /*
1430 * This flushes bufs in the reverse of the order in which they were
1431 * queued; it shouldn't matter, but if we care we could use a TAILQ instead.
1432 * XXX Note they will get put on the lru queue when they flush
1433 * so we might actually want to change this to preserve order.
1434 */
1435 while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
1436 if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
1437 continue;
1438 }
1439 bp->b_iodone = wapbl_biodone;
1440 bp->b_private = we;
1441 bremfree(bp);
1442 wapbl_remove_buf_locked(wl, bp);
1443 mutex_exit(&wl->wl_mtx);
1444 mutex_exit(&bufcache_lock);
1445 bawrite(bp);
1446 mutex_enter(&bufcache_lock);
1447 mutex_enter(&wl->wl_mtx);
1448 }
1449 mutex_exit(&wl->wl_mtx);
1450 mutex_exit(&bufcache_lock);
1451
1452 #if 0
1453 WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
1454 ("wapbl_flush thread %d.%d done flushing entries...\n",
1455 curproc->p_pid, curlwp->l_lid));
1456 #endif
1457
1458 out:
1459
1460 /*
1461 * If the waitfor flag is set, don't return until everything is
1462 * fully flushed and the on disk log is empty.
1463 */
1464 if (waitfor) {
1465 error = wapbl_truncate(wl, wl->wl_circ_size -
1466 wl->wl_reserved_bytes, wapbl_lazy_truncate);
1467 }
1468
1469 out2:
1470 if (error) {
1471 wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
1472 wl->wl_dealloclens, wl->wl_dealloccnt);
1473 }
1474
1475 #ifdef WAPBL_DEBUG_PRINT
1476 if (error) {
1477 pid_t pid = -1;
1478 lwpid_t lid = -1;
1479 if (curproc)
1480 pid = curproc->p_pid;
1481 if (curlwp)
1482 lid = curlwp->l_lid;
1483 mutex_enter(&wl->wl_mtx);
1484 #ifdef WAPBL_DEBUG_BUFBYTES
1485 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1486 ("wapbl_flush: thread %d.%d aborted flush: "
1487 "error = %d\n"
1488 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1489 "deallocs=%d inodes=%d\n"
1490 "\terrcnt = %d, reclaimable=%zu reserved=%zu "
1491 "unsynced=%zu\n",
1492 pid, lid, error, wl->wl_bufcount,
1493 wl->wl_bufbytes, wl->wl_bcount,
1494 wl->wl_dealloccnt, wl->wl_inohashcnt,
1495 wl->wl_error_count, wl->wl_reclaimable_bytes,
1496 wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
1497 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1498 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1499 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1500 "error = %d, unsynced = %zu\n",
1501 we->we_bufcount, we->we_reclaimable_bytes,
1502 we->we_error, we->we_unsynced_bufbytes));
1503 }
1504 #else
1505 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1506 ("wapbl_flush: thread %d.%d aborted flush: "
1507 "error = %d\n"
1508 "\tbufcount=%zu bufbytes=%zu bcount=%zu "
1509 "deallocs=%d inodes=%d\n"
1510 "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
1511 pid, lid, error, wl->wl_bufcount,
1512 wl->wl_bufbytes, wl->wl_bcount,
1513 wl->wl_dealloccnt, wl->wl_inohashcnt,
1514 wl->wl_error_count, wl->wl_reclaimable_bytes,
1515 wl->wl_reserved_bytes));
1516 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1517 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1518 ("\tentry: bufcount = %zu, reclaimable = %zu, "
1519 "error = %d\n", we->we_bufcount,
1520 we->we_reclaimable_bytes, we->we_error));
1521 }
1522 #endif
1523 mutex_exit(&wl->wl_mtx);
1524 }
1525 #endif
1526
1527 rw_exit(&wl->wl_rwlock);
1528 return error;
1529 }
1530
1531 /****************************************************************/
1532
1533 void
1534 wapbl_jlock_assert(struct wapbl *wl)
1535 {
1536
1537 KASSERT(rw_lock_held(&wl->wl_rwlock));
1538 }
1539
1540 void
1541 wapbl_junlock_assert(struct wapbl *wl)
1542 {
1543
1544 KASSERT(!rw_write_held(&wl->wl_rwlock));
1545 }
1546
1547 /****************************************************************/
1548
1549 /* locks missing */
1550 void
1551 wapbl_print(struct wapbl *wl,
1552 int full,
1553 void (*pr)(const char *, ...))
1554 {
1555 struct buf *bp;
1556 struct wapbl_entry *we;
1557 (*pr)("wapbl %p", wl);
1558 (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
1559 wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
1560 (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1561 wl->wl_circ_size, wl->wl_circ_off,
1562 (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
1563 (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
1564 wl->wl_fs_dev_bshift, wl->wl_log_dev_bshift);
1565 #ifdef WAPBL_DEBUG_BUFBYTES
1566 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1567 "reserved = %zu errcnt = %d unsynced = %zu\n",
1568 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
1569 wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1570 wl->wl_error_count, wl->wl_unsynced_bufbytes);
1571 #else
1572 (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
1573 "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
1574 wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
1575 wl->wl_error_count);
1576 #endif
1577 (*pr)("\tdealloccnt = %d, dealloclim = %d\n",
1578 wl->wl_dealloccnt, wl->wl_dealloclim);
1579 (*pr)("\tinohashcnt = %d, inohashmask = 0x%08lx\n",
1580 wl->wl_inohashcnt, wl->wl_inohashmask);
1581 (*pr)("entries:\n");
1582 SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
1583 #ifdef WAPBL_DEBUG_BUFBYTES
1584 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
1585 "unsynced = %zu\n",
1586 we->we_bufcount, we->we_reclaimable_bytes,
1587 we->we_error, we->we_unsynced_bufbytes);
1588 #else
1589 (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
1590 we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
1591 #endif
1592 }
1593 if (full) {
1594 int cnt = 0;
1595 (*pr)("bufs =");
1596 LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
1597 if (!LIST_NEXT(bp, b_wapbllist)) {
1598 (*pr)(" %p", bp);
1599 } else if ((++cnt % 6) == 0) {
1600 (*pr)(" %p,\n\t", bp);
1601 } else {
1602 (*pr)(" %p,", bp);
1603 }
1604 }
1605 (*pr)("\n");
1606
1607 (*pr)("dealloced blks = ");
1608 {
1609 int i;
1610 cnt = 0;
1611 for (i = 0; i < wl->wl_dealloccnt; i++) {
1612 (*pr)(" %"PRId64":%d,",
1613 wl->wl_deallocblks[i],
1614 wl->wl_dealloclens[i]);
1615 if ((++cnt % 4) == 0) {
1616 (*pr)("\n\t");
1617 }
1618 }
1619 }
1620 (*pr)("\n");
1621
1622 (*pr)("registered inodes = ");
1623 {
1624 int i;
1625 cnt = 0;
1626 for (i = 0; i <= wl->wl_inohashmask; i++) {
1627 struct wapbl_ino_head *wih;
1628 struct wapbl_ino *wi;
1629
1630 wih = &wl->wl_inohash[i];
1631 LIST_FOREACH(wi, wih, wi_hash) {
1632 if (wi->wi_ino == 0)
1633 continue;
1634 (*pr)(" %"PRId64"/0%06"PRIo32",",
1635 wi->wi_ino, wi->wi_mode);
1636 if ((++cnt % 4) == 0) {
1637 (*pr)("\n\t");
1638 }
1639 }
1640 }
1641 (*pr)("\n");
1642 }
1643 }
1644 }
1645
1646 #if defined(WAPBL_DEBUG) || defined(DDB)
1647 void
1648 wapbl_dump(struct wapbl *wl)
1649 {
1650 #if defined(WAPBL_DEBUG)
1651 if (!wl)
1652 wl = wapbl_debug_wl;
1653 #endif
1654 if (!wl)
1655 return;
1656 wapbl_print(wl, 1, printf);
1657 }
1658 #endif
1659
1660 /****************************************************************/
1661
1662 void
1663 wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
1664 {
1665
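/*
 * Record a freed block so that replay will know not to copy stale
 * journal contents over it; wapbl_write_revocations() turns this
 * array into WAPBL_WC_REVOCATIONS records at flush time.
 */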
1666 wapbl_jlock_assert(wl);
1667
1668 /* XXX should eventually instead tie this into resource estimation */
1669 /* XXX this KASSERT needs locking/mutex analysis */
1670 KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
1671 wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
1672 wl->wl_dealloclens[wl->wl_dealloccnt] = len;
1673 wl->wl_dealloccnt++;
1674 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1675 ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
1676 }
1677
1678 /****************************************************************/
1679
1680 static void
1681 wapbl_inodetrk_init(struct wapbl *wl, u_int size)
1682 {
1683
1684 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
1685 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
1686 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
1687 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
1688 }
1689 }
1690
1691 static void
1692 wapbl_inodetrk_free(struct wapbl *wl)
1693 {
1694
1695 /* XXX this KASSERT needs locking/mutex analysis */
1696 KASSERT(wl->wl_inohashcnt == 0);
1697 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
1698 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
1699 pool_destroy(&wapbl_ino_pool);
1700 }
1701 }
1702
1703 static struct wapbl_ino *
1704 wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
1705 {
1706 struct wapbl_ino_head *wih;
1707 struct wapbl_ino *wi;
1708
1709 KASSERT(mutex_owned(&wl->wl_mtx));
1710
1711 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1712 LIST_FOREACH(wi, wih, wi_hash) {
1713 if (ino == wi->wi_ino)
1714 return wi;
1715 }
1716 return 0;
1717 }
1718
1719 void
1720 wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1721 {
1722 struct wapbl_ino_head *wih;
1723 struct wapbl_ino *wi;
1724
1725 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
1726
1727 mutex_enter(&wl->wl_mtx);
1728 if (wapbl_inodetrk_get(wl, ino) == NULL) {
1729 wi->wi_ino = ino;
1730 wi->wi_mode = mode;
1731 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
1732 LIST_INSERT_HEAD(wih, wi, wi_hash);
1733 wl->wl_inohashcnt++;
1734 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1735 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
1736 mutex_exit(&wl->wl_mtx);
1737 } else {
1738 mutex_exit(&wl->wl_mtx);
1739 pool_put(&wapbl_ino_pool, wi);
1740 }
1741 }
1742
1743 void
1744 wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
1745 {
1746 struct wapbl_ino *wi;
1747
1748 mutex_enter(&wl->wl_mtx);
1749 wi = wapbl_inodetrk_get(wl, ino);
1750 if (wi) {
1751 WAPBL_PRINTF(WAPBL_PRINT_INODE,
1752 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
1753 KASSERT(wl->wl_inohashcnt > 0);
1754 wl->wl_inohashcnt--;
1755 LIST_REMOVE(wi, wi_hash);
1756 mutex_exit(&wl->wl_mtx);
1757
1758 pool_put(&wapbl_ino_pool, wi);
1759 } else {
1760 mutex_exit(&wl->wl_mtx);
1761 }
1762 }
1763
1764 /****************************************************************/
1765
1766 static __inline size_t
1767 wapbl_transaction_inodes_len(struct wapbl *wl)
1768 {
1769 int blocklen = 1<<wl->wl_log_dev_bshift;
1770 int iph;
1771
1772 /* Calculate number of inodes described in an inodelist header */
1773 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
1774 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
1775
1776 KASSERT(iph > 0);
1777
1778 return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
1779 }
1780
1781
1782 /* Calculate amount of space a transaction will take on disk */
1783 static size_t
1784 wapbl_transaction_len(struct wapbl *wl)
1785 {
1786 int blocklen = 1<<wl->wl_log_dev_bshift;
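/*
 * Worked example with made-up sizes: if blocklen is 512 bytes and a
 * blocklist header occupies, say, 32 bytes followed by 12-byte
 * per-block entries, then bph = (512 - 32) / 12 = 40, so a transaction
 * carrying 100 buffers needs howmany(100, 40) = 3 header blocks in
 * addition to the buffer data itself.
 */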
1787 size_t len;
1788 int bph;
1789
1790 /* Calculate number of blocks described in a blocklist header */
1791 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
1792 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
1793
1794 KASSERT(bph > 0);
1795
1796 len = wl->wl_bcount;
1797 len += howmany(wl->wl_bufcount, bph)*blocklen;
1798 len += howmany(wl->wl_dealloccnt, bph)*blocklen;
1799 len += wapbl_transaction_inodes_len(wl);
1800
1801 return len;
1802 }
1803
1804 /*
1805 * Perform commit operation
1806 *
1807 * Note that incrementing the generation number needs to
1808 * be protected against racing with other invocations
1809 * of wapbl_commit. This is ok since this routine
1810 * is only invoked from wapbl_flush.
1811 */
1812 static int
1813 wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
1814 {
1815 struct wapbl_wc_header *wc = wl->wl_wc_header;
1816 struct timespec ts;
1817 int error;
1818 int force = 1;
1819
1820 /* XXX Calculate the checksum here; instead we do a cache sync for now */
1821 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1822 if (error) {
1823 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1824 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1825 "returned %d\n", wl->wl_devvp->v_rdev, error));
1826 }
1827
1828 wc->wc_head = head;
1829 wc->wc_tail = tail;
1830 wc->wc_checksum = 0;
1831 wc->wc_version = 1;
1832 getnanotime(&ts);
1833 wc->wc_time = ts.tv_sec;
1834 wc->wc_timensec = ts.tv_nsec;
1835
1836 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
1837 ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
1838 (intmax_t)head, (intmax_t)tail));
1839
1840 /*
1841 * XXX if the generation number will roll over, then first zero
1842 * out the second commit header before trying to write both headers.
1843 */
1844
1845 error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
1846 wl->wl_logpbn + wc->wc_generation % 2);
1847 if (error)
1848 return error;
1849
1850 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
1851 if (error) {
1852 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
1853 ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
1854 "returned %d\n", wl->wl_devvp->v_rdev, error));
1855 }
1856
1857 /*
1858 * If the generation number was zero, write it out a second time.
1859 * This handles initialization and generation number rollover.
1860 */
1861 if (wc->wc_generation++ == 0) {
1862 error = wapbl_write_commit(wl, head, tail);
1863 /*
1864 * This panic should be able to be removed if we do the
1865 * zeroing mentioned above, and we are certain to roll
1866 * back the generation number on failure.
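 * (The two commit header slots alternate by generation --
 * wc_generation % 2 selects the block written above -- so replay can
 * choose the newer of the two headers; writing twice when the
 * generation was zero seeds both slots.)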
1867 */ 1868 if (error) 1869 panic("wapbl_write_commit: error writing duplicate " 1870 "log header: %d\n", error); 1871 } 1872 return 0; 1873 } 1874 1875 /* Returns new offset value */ 1876 static int 1877 wapbl_write_blocks(struct wapbl *wl, off_t *offp) 1878 { 1879 struct wapbl_wc_blocklist *wc = 1880 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 1881 int blocklen = 1<<wl->wl_log_dev_bshift; 1882 int bph; 1883 struct buf *bp; 1884 off_t off = *offp; 1885 int error; 1886 size_t padding; 1887 1888 KASSERT(rw_write_held(&wl->wl_rwlock)); 1889 1890 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1891 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1892 1893 bp = LIST_FIRST(&wl->wl_bufs); 1894 1895 while (bp) { 1896 int cnt; 1897 struct buf *obp = bp; 1898 1899 KASSERT(bp->b_flags & B_LOCKED); 1900 1901 wc->wc_type = WAPBL_WC_BLOCKS; 1902 wc->wc_len = blocklen; 1903 wc->wc_blkcount = 0; 1904 while (bp && (wc->wc_blkcount < bph)) { 1905 /* 1906 * Make sure all the physical block numbers are up to 1907 * date. If this is not always true on a given 1908 * filesystem, then VOP_BMAP must be called. We 1909 * could call VOP_BMAP here, or else in the filesystem 1910 * specific flush callback, although neither of those 1911 * solutions allow us to take the vnode lock. If a 1912 * filesystem requires that we must take the vnode lock 1913 * to call VOP_BMAP, then we can probably do it in 1914 * bwrite when the vnode lock should already be held 1915 * by the invoking code. 1916 */ 1917 KASSERT((bp->b_vp->v_type == VBLK) || 1918 (bp->b_blkno != bp->b_lblkno)); 1919 KASSERT(bp->b_blkno > 0); 1920 1921 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; 1922 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; 1923 wc->wc_len += bp->b_bcount; 1924 wc->wc_blkcount++; 1925 bp = LIST_NEXT(bp, b_wapbllist); 1926 } 1927 if (wc->wc_len % blocklen != 0) { 1928 padding = blocklen - wc->wc_len % blocklen; 1929 wc->wc_len += padding; 1930 } else { 1931 padding = 0; 1932 } 1933 1934 WAPBL_PRINTF(WAPBL_PRINT_WRITE, 1935 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", 1936 wc->wc_len, padding, (intmax_t)off)); 1937 1938 error = wapbl_circ_write(wl, wc, blocklen, &off); 1939 if (error) 1940 return error; 1941 bp = obp; 1942 cnt = 0; 1943 while (bp && (cnt++ < bph)) { 1944 error = wapbl_circ_write(wl, bp->b_data, 1945 bp->b_bcount, &off); 1946 if (error) 1947 return error; 1948 bp = LIST_NEXT(bp, b_wapbllist); 1949 } 1950 if (padding) { 1951 void *zero; 1952 1953 zero = wapbl_malloc(padding); 1954 memset(zero, 0, padding); 1955 error = wapbl_circ_write(wl, zero, padding, &off); 1956 wapbl_free(zero, padding); 1957 if (error) 1958 return error; 1959 } 1960 } 1961 *offp = off; 1962 return 0; 1963 } 1964 1965 static int 1966 wapbl_write_revocations(struct wapbl *wl, off_t *offp) 1967 { 1968 struct wapbl_wc_blocklist *wc = 1969 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; 1970 int i; 1971 int blocklen = 1<<wl->wl_log_dev_bshift; 1972 int bph; 1973 off_t off = *offp; 1974 int error; 1975 1976 if (wl->wl_dealloccnt == 0) 1977 return 0; 1978 1979 bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / 1980 sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); 1981 1982 i = 0; 1983 while (i < wl->wl_dealloccnt) { 1984 wc->wc_type = WAPBL_WC_REVOCATIONS; 1985 wc->wc_len = blocklen; 1986 wc->wc_blkcount = 0; 1987 while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { 1988 wc->wc_blocks[wc->wc_blkcount].wc_daddr = 1989 wl->wl_deallocblks[i]; 1990 
static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	int bph;
	off_t off = *offp;
	int error;

	if (wl->wl_dealloccnt == 0)
		return 0;

	/* Calculate the number of blocks described by one blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	i = 0;
	while (i < wl->wl_dealloccnt) {
		wc->wc_type = WAPBL_WC_REVOCATIONS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
			    wl->wl_deallocblks[i];
			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
			    wl->wl_dealloclens[i];
			wc->wc_blkcount++;
			i++;
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	}
	*offp = off;
	return 0;
}

static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	/* Calculate the number of inodes described by one inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}

#endif /* _KERNEL */

/****************************************************************/

struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;
	daddr_t wb_blk;
	off_t wb_off;		/* Offset of this block in the log */
};
#define	WAPBL_BLKPOOL_MIN 83

static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit */
	{
		unsigned long i, hashsize;
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}

static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
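/*
 * Sizing note (for reference): the userland path above rounds the
 * requested size up to a power of two, so a bucket index can be
 * computed as (blk & wr_blkhashmask) instead of with a modulus.  For
 * example, a request of size 100 yields hashsize 128 and a mask of
 * 127, i.e. buckets 0 through 127.
 */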
static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
	LIST_FOREACH(wb, wbh, wb_hash) {
		if (blk == wb->wb_blk)
			return wb;
	}
	return 0;
}

static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wb->wb_blk == blk);
		wb->wb_off = off;
	} else {
		wb = wapbl_malloc(sizeof(*wb));
		wb->wb_blk = blk;
		wb->wb_off = off;
		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
		LIST_INSERT_HEAD(wbh, wb, wb_hash);
		wr->wr_blkhashcnt++;
	}
}

static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wr->wr_blkhashcnt > 0);
		wr->wr_blkhashcnt--;
		LIST_REMOVE(wb, wb_hash);
		wapbl_free(wb, sizeof(*wb));
	}
}

static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
	unsigned long i;
	for (i = 0; i <= wr->wr_blkhashmask; i++) {
		struct wapbl_blk *wb;

		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
			KASSERT(wr->wr_blkhashcnt > 0);
			wr->wr_blkhashcnt--;
			LIST_REMOVE(wb, wb_hash);
			wapbl_free(wb, sizeof(*wb));
		}
	}
	KASSERT(wr->wr_blkhashcnt == 0);
}

/****************************************************************/

static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);
	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		error = wapbl_read(data, slen, wr->wr_devvp,
		    wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	error = wapbl_read(data, len, wr->wr_devvp,
	    wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}

static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		len -= slen;
		off = wr->wr_circ_off;
	}
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
}
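/*
 * Worked example of the wraparound arithmetic above (illustrative
 * figures only): with wr_circ_off = 1024 and wr_circ_size = 8192,
 * valid log offsets live in [1024, 9216).  Reading 2048 bytes starting
 * at off = 8704 splits into a 512-byte read at the end of the circle
 * and a 1536-byte read restarting at offset 1024, leaving *offp = 2560.
 */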
/****************************************************************/

int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = DEV_BSHIFT;
	size_t used;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_malloc(MAXBSIZE);

	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}

void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}

void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}

#ifdef _KERNEL
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif
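/*
 * Typical mount-time use of the replay interface, as a minimal sketch
 * (illustrative only; logvp, logstart, logcount, logblksize and
 * fsdevvp are placeholders supplied by the caller, and error handling
 * is elided):
 */
#if 0
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, logvp, logstart, logcount,
	    logblksize);
	if (error == 0) {
		/* Redo the logged blocks onto the filesystem device. */
		error = wapbl_replay_write(wr, fsdevvp);
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
	}
#endif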
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}

static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
	}
}

static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Remember where we found this record, so that the space it
	 * occupies in the log won't be overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}

static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	/* Scan the log from the tail (oldest record) to the head. */
	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/*
		 * Check that the record's stated length agrees with how
		 * far the type-specific handler advanced the offset.
		 */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

errout:
	wapbl_blkhash_clear(wr);
	return error;
}
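/*
 * Ordering note: because the scan above runs oldest-to-newest, a block
 * that is logged, later revoked, and then logged again ends up in the
 * hashtable with the offset of its newest copy, while a block whose
 * last record is a revocation stays out of the table and is never
 * replayed.
 */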
#if 0
/*
 * XXX: disabled and currently uncompilable: this routine still
 * references the on-disk header as "wch", which is no longer kept in
 * struct wapbl_replay.
 */
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_malloc(MAXBSIZE);
	void *scratch2 = wapbl_malloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
		{
			struct wapbl_wc_blocklist *wc =
			    (struct wapbl_wc_blocklist *)wr->wr_scratch;
			int i;
			for (i = 0; i < wc->wc_blkcount; i++) {
				int foundcnt = 0;
				int dirtycnt = 0;
				int j, n;
				/*
				 * Check each physical block against the
				 * hashtable independently.
				 */
				n = wc->wc_blocks[i].wc_dlen >>
				    wch->wc_fs_dev_bshift;
				for (j = 0; j < n; j++) {
					struct wapbl_blk *wb =
					    wapbl_blkhash_get(wr,
					    wc->wc_blocks[i].wc_daddr + j);
					if (wb && (wb->wb_off == off)) {
						foundcnt++;
						error = wapbl_circ_read(wr,
						    scratch1, fsblklen, &off);
						if (error)
							goto out;
						error = wapbl_read(scratch2,
						    fsblklen, fsdevvp,
						    wb->wb_blk);
						if (error)
							goto out;
						if (memcmp(scratch1, scratch2,
						    fsblklen)) {
							printf(
	"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
							    wb->wb_blk,
							    (intmax_t)off);
							dirtycnt++;
							mismatchcnt++;
						}
					} else {
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#if 0
				/*
				 * If all of the blocks in an entry
				 * are clean, then remove all of its
				 * blocks from the hashtable since they
				 * never will need replay.
				 */
				if ((foundcnt != 0) &&
				    (dirtycnt == 0)) {
					off = saveoff;
					wapbl_circ_advance(wr,
					    logblklen, &off);
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + j);
						if (wb &&
						    (wb->wb_off == off)) {
							wapbl_blkhash_rem(wr, wb->wb_blk);
						}
						wapbl_circ_advance(wr,
						    fsblklen, &off);
					}
				}
#endif
			}
		}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif

int
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	struct wapbl_blk *wb;
	size_t i;
	off_t off;
	void *scratch;
	int error = 0;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	scratch = wapbl_malloc(MAXBSIZE);

	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
			off = wb->wb_off;
			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
			if (error)
				break;
			error = wapbl_write(scratch, fsblklen, fsdevvp,
			    wb->wb_blk);
			if (error)
				break;
		}
	}

	wapbl_free(scratch, MAXBSIZE);
	return error;
}

int
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));
	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
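/*
 * Note on the contract above: wapbl_replay_can_read() reports whether
 * the log holds a newer copy of at least one filesystem block in the
 * range starting at blk and spanning len bytes; wapbl_replay_read()
 * below then overlays those copies onto the caller's buffer, leaving
 * untouched any blocks the log does not cover.
 */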
int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
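/*
 * Minimal usage sketch (illustrative only; wr, data, blk, len and
 * error are placeholders owned by a caller that has already read the
 * on-disk contents of the range into data):
 */
#if 0
	if (wapbl_replay_can_read(wr, blk, len)) {
		/* Overlay the newer, logged copies of these blocks. */
		error = wapbl_replay_read(wr, data, blk, len);
	}
#endif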