/*	$NetBSD: vfs_wapbl.c,v 1.28 2009/10/01 12:28:34 pooka Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This implements file-system-independent write-ahead logging.
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.28 2009/10/01 12:28:34 pooka Exp $");

#include <sys/param.h>

#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#include <miscfs/specfs/specdev.h>

#if 0 /* notyet */
#define	wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s) kmem_free((a), (s))
#define	wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
#else
MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define	wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define	wapbl_free(a, s) free((a), M_WAPBL)
#define	wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
#endif

#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#define	KDASSERT(x) assert(x)
#define	KASSERT(x) assert(x)
#define	wapbl_malloc(s) malloc(s)
#define	wapbl_free(a, s) free(a)
#define	wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * see assertions in wapbl_advance() for other boundary conditions.
	 * Only truncate moves the tail, except when flush sets it to
	 * wl_header_size.  Only flush moves the head, except when truncate
	 * sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* l:	address of block */
	int *wl_dealloclens;	/* l:	size of block */
	int wl_dealloccnt;	/* l:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};
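
/*
 * On-disk layout implied by the code below (illustrative summary,
 * derived from wapbl_start() and wapbl_write_commit(), not verbatim
 * from the original sources): the first two log device blocks hold
 * the two alternating commit headers (wapbl_write_commit() writes to
 * wl_logpbn + generation % 2), and the circular buffer of journal
 * data occupies the rest, starting wl_circ_off bytes into the log:
 *
 *	wl_logpbn: [ commit hdr 0 ][ commit hdr 1 ][ circular log data
 *	             ..................................  wl_circ_size ]
 */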

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);

static __inline size_t wapbl_space_free(size_t avail, off_t head,
	off_t tail);
static __inline size_t wapbl_space_used(size_t avail, off_t head,
	off_t tail);

#ifdef _KERNEL

#define	WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
 */
int wapbl_lazy_truncate = 0;

struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};

void
wapbl_init(void)
{

	malloc_type_attach(M_WAPBL);
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl **wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = DEV_BSHIFT;
	int fs_dev_bshift = DEV_BSHIFT;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
		    ("wapbl: log device's block size cannot be larger "
		    "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;
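
	/*
	 * For illustration (made-up numbers, not from the original
	 * source): with DEV_BSHIFT == 9 (512-byte device blocks) and a
	 * 16 MB log, wl_circ_off = 2 << 9 = 1024 bytes for the two
	 * commit headers, and wl_circ_size = 16777216 - 1024 =
	 * 16776192 bytes of circular journal space, which is already a
	 * multiple of the 512-byte log device block size.
	 */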

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction
	 * space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient to be a multiple of
	 *   1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of
	 * two is the same as the log of the maximum power of two.  So we
	 * can do the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/* Round wl_bufbytes_max to the largest power of two constraint */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
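
	/*
	 * Worked example (illustrative, not from the original source):
	 * each >>/<< pair clears the bits below one of the three
	 * alignment constraints.  With PAGE_SHIFT == 12 and both
	 * bshifts == 9, only the PAGE_SHIFT pair has any effect, e.g.
	 * 0x12345 -> (0x12345 >> 12) << 12 == 0x12000.  Because each
	 * constraint is a power of two, this is equivalent to rounding
	 * down to the largest of the three alignments.
	 */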

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);

	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
	    wl->wl_dealloclim);
	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
	    wl->wl_dealloclim);

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_malloc(len);
	}

	/*
	 * If there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * Like wapbl_flush, but discards the transaction
 * completely.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

#ifdef WAPBL_DEBUG_PRINT
	{
		struct wapbl_entry *we;
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * Clean the buffer list.
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries; free any that
	 * no longer have buffers.  The others will be freed in
	 * wapbl_biodone() when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}
	}

	/* Discard list of deallocs */
	wl->wl_dealloccnt = 0;
	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	struct vnode *vp;
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);

	vp = wl->wl_logvp;

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_deallocblks,
	    sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
	wapbl_free(wl->wl_dealloclens,
	    sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(&devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(&devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY; /* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%x failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/*
 * Off is a byte offset into the log; returns the new offset for the
 * next write and handles log wraparound.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		error = wapbl_write(data, slen, wl->wl_devvp,
		    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	error = wapbl_write(data, len, wl->wl_devvp,
	    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >=
		   (wl->wl_dealloclim - (wl->wl_dealloclim >> 8)));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}
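
/*
 * Typical caller pattern (illustrative sketch only; the metadata
 * operation is a placeholder, and real filesystems normally go
 * through wrapper macros such as UFS_WAPBL_BEGIN()/UFS_WAPBL_END()
 * rather than calling these directly):
 *
 *	error = wapbl_begin(wl, __FILE__, __LINE__);
 *	if (error)
 *		return error;
 *	... modify metadata buffers and wapbl_add_buf() them ...
 *	wapbl_end(wl);
 *
 * The rwlock is taken as a reader, so many transactions may be open
 * concurrently; flush takes it as a writer to drain them all.
 */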

void
wapbl_add_buf(struct wapbl *wl, struct buf *bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl *wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl *wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/* This is used to advance the pointer at old to new value at old+delta */
static __inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t new;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if ((old == 0) && (delta != 0))
		new = off + delta;
	else if ((old + delta) < (size + off))
		new = old + delta;
	else
		new = (old + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (new == old));
	KASSERT((delta == 0) || (new != 0));
	KASSERT((delta != (size)) || (new == old));

	/* Define acceptable ranges for output. */
	KASSERT((new == 0) || (new >= off));
	KASSERT(new < (size + off));
	return new;
}
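
/*
 * Worked example (illustrative numbers): with size = 8192 and
 * off = 1024, valid offsets lie in [1024, 9216).  Advancing
 * old = 8704 by delta = 1024 computes 8704 + 1024 = 9728 >= 9216, so
 * the result wraps to 9728 - 8192 = 1536.  Advancing the empty-log
 * marker old = 0 by a nonzero delta starts counting from off, giving
 * 1024 + delta.
 */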

static __inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

static __inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

static __inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

static __inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
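
/*
 * Worked example for the space helpers above (illustrative numbers):
 * with avail = 8192, head = 3072 and tail = 1024,
 * wapbl_space_used() = ((3072 + 8191 - 1024) % 8192) + 1 = 2048 and
 * wapbl_space_free() = 8192 - 2048 = 6144.  The formula also covers
 * the full case: head == tail (both nonzero) yields
 * (8191 % 8192) + 1 = 8192, i.e. no free space, matching the
 * "head == tail != 0 means log is full" convention above.
 */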

#ifdef _KERNEL

/****************************************************************/

/*
 * Remove transactions whose buffers are completely flushed to disk.
 * Will block until at least minfree space is available.
 * Only intended to be called from inside wapbl_flush and therefore
 * does not protect against commit races with itself or with flush.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/*
	 * If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded
	 * transactions, which could leave more bytes reserved than are
	 * reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against truncate itself or flush.  This is ok
	 * since we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/
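
/*
 * Truncate example (illustrative numbers): a caller needing
 * minfree = 4096 bytes when wapbl_space_free() already reports
 * avail = 1024 only has to wait until wl_reclaimable_bytes reaches
 * the remaining 4096 - 1024 = 3072 bytes; the tail is then advanced
 * past the fully flushed transactions and a new commit header
 * records the moved tail.
 */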

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we, sizeof(*we));
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_flags & B_DONE);
	KDASSERT(!(bp->b_flags & B_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_flags & B_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_flags & B_INVAL));
	KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		/*
		 * XXXpooka: interfaces not fully updated
		 * Note: this was not enabled in the original patch
		 * against netbsd4 either.  I don't know if comment
		 * above is true or not.
		 */

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has been successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		    (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we, sizeof(*we));
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
	    wl->wl_dealloccnt);

	/*
	 * Now that we are fully locked and flushed,
	 * do another check for nothing to do.
	 */
	if (wl->wl_bufcount == 0) {
		goto out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush\n");
	}

	error = wapbl_truncate(wl, flushsize, 0);
	if (error)
		goto out2;

	off = wl->wl_head;
	KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
	    (off < wl->wl_circ_off + wl->wl_circ_size)));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out2;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out2;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);
#ifdef WAPBL_DEBUG
	if (head != off) {
		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
		      " off=%"PRIdMAX" flush=%zu\n",
		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
		      flushsize);
	}
#else
	KASSERT(head == off);
#endif

	/* Opportunistically move the tail forward if we can */
	if (!wapbl_lazy_truncate) {
		mutex_enter(&wl->wl_mtx);
		delta = wl->wl_reclaimable_bytes;
		mutex_exit(&wl->wl_mtx);
		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
		    &head, &tail);
	}

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out2;

	we = wapbl_calloc(1, sizeof(*we));

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    " unsynced=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_unsynced_bufbytes, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
	    wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
	    "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
	    "inodes=%d\n",
	    curproc->p_pid, curlwp->l_lid, flushsize, delta,
	    wapbl_space_used(wl->wl_circ_size, head, tail),
	    wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	    wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse order from which they were
	 * queued.  It shouldn't matter, but if we care we could use a
	 * TAILQ instead.  XXX Note they will get put on the lru queue
	 * when they flush so we might actually want to change this to
	 * preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on-disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes, wapbl_lazy_truncate);
	}

out2:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
		    wl->wl_dealloclens, wl->wl_dealloccnt);
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
		int full,
		void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i < wl->wl_dealloccnt; i++) {
				(*pr)(" %"PRId64":%d,",
				      wl->wl_deallocblks[i],
				      wl->wl_dealloclens[i]);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRId32"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{

	wapbl_jlock_assert(wl);

	/* XXX should eventually instead tie this into resource estimation */
	/*
	 * XXX this panic needs locking/mutex analysis and the
	 * ability to cope with the failure.
	 */
	/* XXX this XXX doesn't have enough XXX */
	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim))
		panic("wapbl_register_deallocation: out of resources");

	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
	wl->wl_dealloccnt++;
	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
}

/****************************************************************/

static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}

static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		pool_destroy(&wapbl_ino_pool);
	}
}

static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}

void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}

void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}

/****************************************************************/

static __inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/* Calculate number of inodes described in an inodelist header */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
}
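
/*
 * For illustration (made-up sizes, not the actual struct layout): if
 * blocklen is 512, the header fields of struct wapbl_wc_inodelist
 * occupy 24 bytes and each wc_inodes[] entry takes 12 bytes, then
 * iph = (512 - 24) / 12 = 40 inodes fit per block, so 100 registered
 * inodes need howmany(100, 40) = 3 blocks, i.e. 1536 bytes.
 */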

/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;
	int bph;

	/* Calculate number of blocks described in a blocklist header */
	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	KASSERT(bph > 0);

	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, bph)*blocklen;
	len += howmany(wl->wl_dealloccnt, bph)*blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
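
/*
 * Example (made-up numbers): with blocklen = 512 and bph = 40, a
 * transaction carrying 50 buffers totalling wl_bcount = 100000 bytes
 * and 10 deallocations costs 100000 bytes of data, plus
 * howmany(50, 40) = 2 blocklist headers and howmany(10, 40) = 1
 * revocation header (3 * 512 = 1536 bytes), plus the inode list from
 * wapbl_transaction_inodes_len() above.
 */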

/*
 * Perform the commit operation
 *
 * Note that incrementing the generation number needs to
 * be protected against racing with other invocations
 * of wapbl_write_commit.  This is ok since this routine
 * is only invoked from wapbl_flush
 */
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	int force = 1;

	/* XXX Calc checksum here, instead we do this for now */
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX " tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * XXX if generation will rollover, then first zero
	 * over second commit header before trying to write both headers.
	 */

	error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
	    wl->wl_logpbn + wc->wc_generation % 2);
	if (error)
		return error;

	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
		    "returned %d\n", wl->wl_devvp->v_rdev, error));
	}

	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
	 */
	if (wc->wc_generation++ == 0) {
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zero'ing mentioned above, and we are certain to roll
		 * back generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			      "log header: %d\n", error);
	}
	return 0;
}

/* Returns new offset value */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int bph;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);

	bp = LIST_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;

		KASSERT(bp->b_flags & B_LOCKED);

		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		while (bp && (wc->wc_blkcount < bph)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode
			 * lock to call VOP_BMAP, then we can probably do it
			 * in bwrite when the vnode lock should already be
			 * held by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
			    (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) "
		    "off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < bph)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = LIST_NEXT(bp, b_wapbllist);
		}
		if (padding) {
			void *zero;

			zero = wapbl_malloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
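
/*
 * Padding example (made-up numbers): with blocklen = 512, a chunk of
 * three buffers of 300 bytes each gives wc_len = 512 + 900 = 1412
 * bytes, which is not a multiple of the log device block size, so
 * padding = 512 - (1412 % 512) = 124 zero bytes is appended and
 * wc_len rounds up to 1536.
 */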
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}

#endif /* _KERNEL */

/****************************************************************/

struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;
	daddr_t wb_blk;
	off_t wb_off; /* Offset of this block in the log */
};
#define WAPBL_BLKPOOL_MIN 83

static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit() */
	{
		unsigned long i, hashsize;
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}

static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
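
/*
 * Sketch of the power-of-two sizing used by the userland hashinit()
 * emulation above (illustrative only, not compiled in): the table
 * size is rounded up to a power of two so that (blk & mask) can
 * replace a modulus in the bucket lookup.
 */
#if 0
static unsigned long
wapbl_hashsize_example(u_int size)
{
	unsigned long hashsize;

	/* e.g. size = 83 -> hashsize = 128, so the mask is 127 */
	for (hashsize = 1; hashsize < size; hashsize <<= 1)
		continue;
	return hashsize;
}
#endif
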
static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
	LIST_FOREACH(wb, wbh, wb_hash) {
		if (blk == wb->wb_blk)
			return wb;
	}
	return 0;
}

static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wb->wb_blk == blk);
		wb->wb_off = off;
	} else {
		wb = wapbl_malloc(sizeof(*wb));
		wb->wb_blk = blk;
		wb->wb_off = off;
		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
		LIST_INSERT_HEAD(wbh, wb, wb_hash);
		wr->wr_blkhashcnt++;
	}
}

static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wr->wr_blkhashcnt > 0);
		wr->wr_blkhashcnt--;
		LIST_REMOVE(wb, wb_hash);
		wapbl_free(wb, sizeof(*wb));
	}
}

static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
	unsigned long i;
	for (i = 0; i <= wr->wr_blkhashmask; i++) {
		struct wapbl_blk *wb;

		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
			KASSERT(wr->wr_blkhashcnt > 0);
			wr->wr_blkhashcnt--;
			LIST_REMOVE(wb, wb_hash);
			wapbl_free(wb, sizeof(*wb));
		}
	}
	KASSERT(wr->wr_blkhashcnt == 0);
}

/****************************************************************/

static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);
	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		error = wapbl_read(data, slen, wr->wr_devvp,
		    wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	error = wapbl_read(data, len, wr->wr_devvp,
	    wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}

static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;

	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		len -= slen;
		off = wr->wr_circ_off;
	}
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
}
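
/*
 * Worked example of the wrap handling in wapbl_circ_read() above
 * (illustrative numbers, not compiled in): with circ_off = 1024 and
 * circ_size = 8192 the data region spans [1024, 9216).  A 2048-byte
 * read at off = 8192 has only 1024 bytes before the region's end,
 * so it is split into a 1024-byte read at 8192 and a 1024-byte read
 * starting back at 1024.
 */
#if 0
static size_t
wapbl_circ_first_span_example(off_t circ_off, size_t circ_size, off_t off,
    size_t len)
{
	size_t slen = circ_off + circ_size - off;

	/* circ_off=1024, circ_size=8192, off=8192, len=2048 -> 1024 */
	return (slen < len) ? slen : len;
}
#endif
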
/****************************************************************/

int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = DEV_BSHIFT;
	size_t used;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_malloc(MAXBSIZE);

	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}

void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}

void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}

#ifdef _KERNEL
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
#endif
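
/*
 * Usage sketch (a hypothetical mount-time caller, not part of this
 * file; a real file system typically keeps the replay open so that
 * wapbl_replay_can_read()/wapbl_replay_read() can satisfy metadata
 * reads until the log has been flushed): replay a dirty log by
 * scanning it, writing the logged blocks home, and tearing the
 * replay state down.
 */
#if 0
static int
wapbl_mount_replay_example(struct vnode *logvp, struct vnode *fsdevvp,
    daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;

	error = wapbl_replay_start(&wr, logvp, off, count, blksize);
	if (error)
		return error;
	error = wapbl_replay_write(wr, fsdevvp);
	wapbl_replay_stop(wr);
	wapbl_replay_free(wr);
	return error;
}
#endif
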
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}

static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
	}
}

static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}

static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

errout:
	wapbl_blkhash_clear(wr);
	return error;
}
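
/*
 * Sketch of the consistency check at the bottom of
 * wapbl_replay_process() above (illustrative only, not compiled
 * in): advancing the pre-record offset by the record's wc_len must
 * land exactly on the offset reached by parsing the record body,
 * otherwise the log contains corrupted records.
 */
#if 0
static int
wapbl_record_consistent_example(struct wapbl_replay *wr, off_t saveoff,
    off_t off, size_t wc_len)
{
	/* saveoff: offset before the record; off: offset after parsing */
	wapbl_circ_advance(wr, wc_len, &saveoff);
	return off == saveoff;	/* false means corrupted records */
}
#endif
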
#if 0
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	off_t off;
	int mismatchcnt = 0;
	int logblklen = 1 << wr->wr_log_dev_bshift;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	void *scratch1 = wapbl_malloc(MAXBSIZE);
	void *scratch2 = wapbl_malloc(MAXBSIZE);
	int error = 0;

	KDASSERT(wapbl_replay_isopen(wr));

	off = wch->wc_tail;
	while (off != wch->wc_head) {
		struct wapbl_wc_null *wcn;
#ifdef DEBUG
		off_t saveoff = off;
#endif
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto out;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			{
				struct wapbl_wc_blocklist *wc =
				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
				int i;
				for (i = 0; i < wc->wc_blkcount; i++) {
					int foundcnt = 0;
					int dirtycnt = 0;
					int j, n;
					/*
					 * Check each physical block into the
					 * hashtable independently
					 */
					n = wc->wc_blocks[i].wc_dlen >>
					    wch->wc_fs_dev_bshift;
					for (j = 0; j < n; j++) {
						struct wapbl_blk *wb =
						    wapbl_blkhash_get(wr,
						    wc->wc_blocks[i].wc_daddr + j);
						if (wb && (wb->wb_off == off)) {
							foundcnt++;
							error =
							    wapbl_circ_read(wr,
							    scratch1, fsblklen,
							    &off);
							if (error)
								goto out;
							error =
							    wapbl_read(scratch2,
							    fsblklen, fsdevvp,
							    wb->wb_blk);
							if (error)
								goto out;
							if (memcmp(scratch1,
							    scratch2,
							    fsblklen)) {
								printf(
		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
								    wb->wb_blk,
								    (intmax_t)off);
								dirtycnt++;
								mismatchcnt++;
							}
						} else {
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#if 0
					/*
					 * If all of the blocks in an entry
					 * are clean, then remove all of its
					 * blocks from the hashtable since they
					 * never will need replay.
					 */
					if ((foundcnt != 0) &&
					    (dirtycnt == 0)) {
						off = saveoff;
						wapbl_circ_advance(wr,
						    logblklen, &off);
						for (j = 0; j < n; j++) {
							struct wapbl_blk *wb =
							    wapbl_blkhash_get(wr,
							    wc->wc_blocks[i].wc_daddr + j);
							if (wb &&
							    (wb->wb_off == off)) {
								wapbl_blkhash_rem(wr, wb->wb_blk);
							}
							wapbl_circ_advance(wr,
							    fsblklen, &off);
						}
					}
#endif
				}
			}
			break;
		case WAPBL_WC_REVOCATIONS:
		case WAPBL_WC_INODES:
			break;
		default:
			KASSERT(0);
		}
#ifdef DEBUG
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		KASSERT(off == saveoff);
#endif
	}
out:
	wapbl_free(scratch1, MAXBSIZE);
	wapbl_free(scratch2, MAXBSIZE);
	if (!error && mismatchcnt)
		error = EFTYPE;
	return error;
}
#endif

int
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
	struct wapbl_blk *wb;
	size_t i;
	off_t off;
	void *scratch;
	int error = 0;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	scratch = wapbl_malloc(MAXBSIZE);

	for (i = 0; i <= wr->wr_blkhashmask; ++i) {
		LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
			off = wb->wb_off;
			error = wapbl_circ_read(wr, scratch, fsblklen, &off);
			if (error)
				break;
			error = wapbl_write(scratch, fsblklen, fsdevvp,
			    wb->wb_blk);
			if (error)
				break;
		}
	}

	wapbl_free(scratch, MAXBSIZE);
	return error;
}

int
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));
	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
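
/*
 * Sketch of the "last write wins" property relied on by
 * wapbl_replay_write() above (illustrative only, not compiled in;
 * assumes an initialized block hash): wapbl_blkhash_ins() overwrites
 * wb_off when a block is logged again, so replay writes only the
 * newest logged copy of each block.
 */
#if 0
static void
wapbl_blkhash_lastwrite_example(struct wapbl_replay *wr)
{
	/* The same block logged at two offsets: the later one wins. */
	wapbl_blkhash_ins(wr, 123, 4096);
	wapbl_blkhash_ins(wr, 123, 8192);
	KASSERT(wapbl_blkhash_get(wr, 123)->wb_off == 8192);
	wapbl_blkhash_rem(wr, 123);
}
#endif
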
int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
	int fsblklen = 1 << wr->wr_fs_dev_bshift;

	KDASSERT(wapbl_replay_isopen(wr));

	KASSERT((len % fsblklen) == 0);

	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb) {
			off_t off = wb->wb_off;
			int error;
			error = wapbl_circ_read(wr, data, fsblklen, &off);
			if (error)
				return error;
		}
		data = (uint8_t *)data + fsblklen;
		len -= fsblklen;
		blk++;
	}
	return 0;
}
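
/*
 * Usage sketch (a hypothetical caller, not part of this file): a
 * file system with an open replay reads the on-disk copy first and
 * then overlays any blocks that have newer copies in the log, which
 * is exactly what wapbl_replay_read() above leaves untouched or
 * replaces per fs block.
 */
#if 0
static int
wapbl_read_through_log_example(struct wapbl_replay *wr,
    struct vnode *fsdevvp, void *data, daddr_t blk, long len)
{
	int error;

	/* Stale on-disk contents first... */
	error = wapbl_read(data, len, fsdevvp, blk);
	/* ...then overlay the logged blocks, if any. */
	if (error == 0 && wapbl_replay_can_read(wr, blk, len))
		error = wapbl_replay_read(wr, data, blk, len);
	return error;
}
#endif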