/*	$NetBSD: lfs_segment.c,v 1.233 2013/10/29 09:53:51 hannken Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_segment.c	8.10 (Berkeley) 6/10/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.233 2013/10/29 09:53:51 hannken Exp $");

#ifdef DEBUG
# define vndebug(vp, str) do {						\
	if (VTOI(vp)->i_flag & IN_CLEANING)				\
		DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \
		     VTOI(vp)->i_number, (str), op));			\
} while(0)
#else
# define vndebug(vp, str)
#endif
#define ivndebug(vp, str) \
	DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str)))

#if defined(_KERNEL_OPT)
#include "opt_ddb.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kauth.h>
#include <sys/syslog.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>

#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>

MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");

static void lfs_generic_callback(struct buf *, void (*)(struct buf *));
static void lfs_free_aiodone(struct buf *);
static void lfs_super_aiodone(struct buf *);
static void lfs_cluster_aiodone(struct buf *);
static void lfs_cluster_callback(struct buf *);

/*
 * Determine if it's OK to start a partial in this segment, or if we need
 * to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
	(fs)->lfs_frag)

/*
 * Figure out whether we should do a checkpoint write or go ahead with
 * an ordinary write.
 */
#define LFS_SHOULD_CHECKPOINT(fs, flags) \
	((flags & SEGM_CLEAN) == 0 &&					\
	 ((fs->lfs_nactive > LFS_MAX_ACTIVE ||				\
	   (flags & SEGM_CKP) ||					\
	   fs->lfs_nclean < LFS_MAX_ACTIVE)))
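
/*
 * In other words: the cleaner's own writes never force a checkpoint;
 * any other write does if a checkpoint was explicitly requested
 * (SEGM_CKP), if too many segments have become active since the last
 * checkpoint (lfs_nactive > LFS_MAX_ACTIVE), or if clean segments are
 * getting scarce (lfs_nclean < LFS_MAX_ACTIVE).
 */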

int	 lfs_match_fake(struct lfs *, struct buf *);
void	 lfs_newseg(struct lfs *);
/* XXX ondisk32 */
void	 lfs_shellsort(struct buf **, int32_t *, int, int);
void	 lfs_supercallback(struct buf *);
void	 lfs_updatemeta(struct segment *);
void	 lfs_writesuper(struct lfs *, daddr_t);
int	 lfs_writevnodes(struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops);

int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */
int	lfs_writeindir = 1;		/* whether to flush indir on non-ckp */
int	lfs_clean_vnhead = 0;		/* Allow freeing to head of vn list */
int	lfs_dirvcount = 0;		/* # active dirops */

/* Statistics Counters */
int lfs_dostats = 1;
struct lfs_stats lfs_stats;

/* op values to lfs_writevnodes */
#define	VN_REG		0
#define	VN_DIROP	1
#define	VN_EMPTY	2
#define	VN_CLEAN	3

/*
 * XXX KS - Set modification time on the Ifile, so the cleaner can
 * read the fs mod time off of it.  We don't set IN_UPDATE here,
 * since we don't really need this to be flushed to disk (and in any
 * case that wouldn't happen to the Ifile until we checkpoint).
 */
void
lfs_imtime(struct lfs *fs)
{
	struct timespec ts;
	struct inode *ip;

	ASSERT_MAYBE_SEGLOCK(fs);
	vfs_timestamp(&ts);
	ip = VTOI(fs->lfs_ivnode);
	ip->i_ffs1_mtime = ts.tv_sec;
	ip->i_ffs1_mtimensec = ts.tv_nsec;
}

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

#define IS_FLUSHING(fs,vp)  ((fs)->lfs_flushvp == (vp))

int
lfs_vflush(struct vnode *vp)
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct buf *bp, *nbp, *tbp, *tnbp;
	int error;
	int flushed;
	int relock;

	ip = VTOI(vp);
	fs = VFSTOULFS(vp->v_mount)->um_lfs;
	relock = 0;

 top:
	KASSERT(mutex_owned(vp->v_interlock) == false);
	KASSERT(mutex_owned(&lfs_lock) == false);
	KASSERT(mutex_owned(&bufcache_lock) == false);
	ASSERT_NO_SEGLOCK(fs);
	if (ip->i_flag & IN_CLEANING) {
		ivndebug(vp,"vflush/in_cleaning");
		mutex_enter(&lfs_lock);
		LFS_CLR_UINO(ip, IN_CLEANING);
		LFS_SET_UINO(ip, IN_MODIFIED);
		mutex_exit(&lfs_lock);

		/*
		 * Toss any cleaning buffers that have real counterparts
		 * to avoid losing new data.
		 */
		mutex_enter(vp->v_interlock);
		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = LIST_NEXT(bp, b_vnbufs);
			if (!LFS_IS_MALLOC_BUF(bp))
				continue;
			/*
			 * Look for pages matching the range covered
			 * by cleaning blocks.  It's okay if more dirty
			 * pages appear, so long as none disappear out
			 * from under us.
			 */
			if (bp->b_lblkno > 0 && vp->v_type == VREG &&
			    vp != fs->lfs_ivnode) {
				struct vm_page *pg;
				voff_t off;

				for (off = lfs_lblktosize(fs, bp->b_lblkno);
				     off < lfs_lblktosize(fs, bp->b_lblkno + 1);
				     off += PAGE_SIZE) {
					pg = uvm_pagelookup(&vp->v_uobj, off);
					if (pg == NULL)
						continue;
					if ((pg->flags & PG_CLEAN) == 0 ||
					    pmap_is_modified(pg)) {
						fs->lfs_avail += lfs_btofsb(fs,
							bp->b_bcount);
						wakeup(&fs->lfs_avail);
						mutex_exit(vp->v_interlock);
						lfs_freebuf(fs, bp);
						mutex_enter(vp->v_interlock);
						bp = NULL;
						break;
					}
				}
			}
			for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp;
			    tbp = tnbp)
			{
				tnbp = LIST_NEXT(tbp, b_vnbufs);
				if (tbp->b_vp == bp->b_vp
				   && tbp->b_lblkno == bp->b_lblkno
				   && tbp != bp)
				{
					fs->lfs_avail += lfs_btofsb(fs,
						bp->b_bcount);
					wakeup(&fs->lfs_avail);
					mutex_exit(vp->v_interlock);
					lfs_freebuf(fs, bp);
					mutex_enter(vp->v_interlock);
					bp = NULL;
					break;
				}
			}
		}
	} else {
		mutex_enter(vp->v_interlock);
	}

	/* If the node is being written, wait until that is done */
	while (WRITEINPROG(vp)) {
		ivndebug(vp,"vflush/writeinprog");
		cv_wait(&vp->v_cv, vp->v_interlock);
	}
	mutex_exit(vp->v_interlock);

	/* Protect against VI_XLOCK deadlock in vinvalbuf() */
	lfs_seglock(fs, SEGM_SYNC | ((vp->v_iflag & VI_XLOCK) ? SEGM_RECLAIM : 0));
	if (vp->v_iflag & VI_XLOCK) {
		fs->lfs_reclino = ip->i_number;
	}

	/* If we're supposed to flush a freed inode, just toss it */
	if (ip->i_lfs_iflags & LFSI_DELETED) {
		DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n",
		      ip->i_number));
		/* Drain v_numoutput */
		mutex_enter(vp->v_interlock);
		while (vp->v_numoutput > 0) {
			cv_wait(&vp->v_cv, vp->v_interlock);
		}
		KASSERT(vp->v_numoutput == 0);
		mutex_exit(vp->v_interlock);

		mutex_enter(&bufcache_lock);
		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = LIST_NEXT(bp, b_vnbufs);

			KASSERT((bp->b_flags & B_GATHERED) == 0);
			if (bp->b_oflags & BO_DELWRI) { /* XXX always true? */
				fs->lfs_avail += lfs_btofsb(fs, bp->b_bcount);
				wakeup(&fs->lfs_avail);
			}
			/* Copied from lfs_writeseg */
			if (bp->b_iodone != NULL) {
				mutex_exit(&bufcache_lock);
				biodone(bp);
				mutex_enter(&bufcache_lock);
			} else {
				bremfree(bp);
				LFS_UNLOCK_BUF(bp);
				mutex_enter(vp->v_interlock);
				bp->b_flags &= ~(B_READ | B_GATHERED);
				bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE;
				bp->b_error = 0;
				reassignbuf(bp, vp);
				mutex_exit(vp->v_interlock);
				brelse(bp, 0);
			}
		}
		mutex_exit(&bufcache_lock);
		LFS_CLR_UINO(ip, IN_CLEANING);
		LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED);
		ip->i_flag &= ~IN_ALLMOD;
		DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n",
		      ip->i_number));
		lfs_segunlock(fs);

		KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);

		return 0;
	}

	fs->lfs_flushvp = vp;
	if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
		error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC);
		fs->lfs_flushvp = NULL;
		KASSERT(fs->lfs_flushvp_fakevref == 0);
		lfs_segunlock(fs);

		/* Make sure that any pending buffers get written */
		mutex_enter(vp->v_interlock);
		while (vp->v_numoutput > 0) {
			cv_wait(&vp->v_cv, vp->v_interlock);
		}
		KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
		KASSERT(vp->v_numoutput == 0);
		mutex_exit(vp->v_interlock);

		return error;
	}
	sp = fs->lfs_sp;

	flushed = 0;
	if (VPISEMPTY(vp)) {
		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
		++flushed;
	} else if ((ip->i_flag & IN_CLEANING) &&
		  (fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
		ivndebug(vp,"vflush/clean");
		lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
		++flushed;
	} else if (lfs_dostats) {
		if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD))
			++lfs_stats.vflush_invoked;
		ivndebug(vp,"vflush");
	}

#ifdef DIAGNOSTIC
	if (vp->v_uflag & VU_DIROP) {
		DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n"));
		/* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */
	}
#endif

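	/*
	 * Main write loop.  The inner loop writes the file's blocks and
	 * then its inode, repeating until lfs_writeinode() reports that
	 * no further Ifile update is needed; the outer loop runs the
	 * whole cycle again when writing the Ifile itself, since
	 * lfs_writeseg() can dirty the Ifile anew.
	 */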
	do {
#ifdef DEBUG
		int loopcount = 0;
#endif
		do {
			if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
				relock = lfs_writefile(fs, sp, vp);
				if (relock && vp != fs->lfs_ivnode) {
					/*
					 * Might have to wait for the
					 * cleaner to run; but we're
					 * still not done with this vnode.
					 * XXX we can do better than this.
					 */
					KDASSERT(ip->i_number != LFS_IFILE_INUM);
					lfs_writeinode(fs, sp, ip);
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_MODIFIED);
					mutex_exit(&lfs_lock);
					lfs_writeseg(fs, sp);
					lfs_segunlock(fs);
					lfs_segunlock_relock(fs);
					goto top;
				}
			}
			/*
			 * If we begin a new segment in the middle of writing
			 * the Ifile, it creates an inconsistent checkpoint,
			 * since the Ifile information for the new segment
			 * is not up-to-date.  Take care of this here by
			 * sending the Ifile through again in case there
			 * are newly dirtied blocks.  But wait, there's more!
			 * This second Ifile write could *also* cross a segment
			 * boundary, if the first one was large.  The second
			 * one is guaranteed to be no more than 8 blocks,
			 * though (two segment blocks and supporting indirects)
			 * so the third write *will not* cross the boundary.
			 */
			if (vp == fs->lfs_ivnode) {
				lfs_writefile(fs, sp, vp);
				lfs_writefile(fs, sp, vp);
			}
#ifdef DEBUG
			if (++loopcount > 2)
				log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount);
#endif
		} while (lfs_writeinode(fs, sp, ip));
	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

	if (lfs_dostats) {
		++lfs_stats.nwrites;
		if (sp->seg_flags & SEGM_SYNC)
			++lfs_stats.nsync_writes;
		if (sp->seg_flags & SEGM_CKP)
			++lfs_stats.ncheckpoints;
	}
	/*
	 * If we were called from somewhere that has already held the seglock
	 * (e.g., lfs_markv()), the lfs_segunlock will not wait for
	 * the write to complete because we are still locked.
	 * Since lfs_vflush() must return the vnode with no dirty buffers,
	 * we must explicitly wait, if that is the case.
	 *
	 * We compare the iocount against 1, not 0, because it is
	 * artificially incremented by lfs_seglock().
	 */
	mutex_enter(&lfs_lock);
	if (fs->lfs_seglock > 1) {
		while (fs->lfs_iocount > 1)
			(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
				     "lfs_vflush", 0, &lfs_lock);
	}
	mutex_exit(&lfs_lock);

	lfs_segunlock(fs);

	/* Wait for these buffers to be recovered by aiodoned */
	mutex_enter(vp->v_interlock);
	while (vp->v_numoutput > 0) {
		cv_wait(&vp->v_cv, vp->v_interlock);
	}
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	KASSERT(vp->v_numoutput == 0);
	mutex_exit(vp->v_interlock);

	fs->lfs_flushvp = NULL;
	KASSERT(fs->lfs_flushvp_fakevref == 0);

	return (0);
}

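/*
 * Write dirty vnodes on the given mount point.  The op argument selects
 * which vnodes are eligible: VN_REG writes ordinary (non-dirop) vnodes,
 * VN_DIROP writes only vnodes marked VU_DIROP, VN_EMPTY writes only
 * vnodes with no dirty blocks (to get their inodes on disk), and
 * VN_CLEAN writes only vnodes with blocks staged by the cleaner
 * (IN_CLEANING); the Ifile and the vnode being flushed are always
 * eligible in the VN_CLEAN case.
 */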
int
lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
{
	struct inode *ip;
	struct vnode *vp;
	int inodes_written = 0, only_cleaning;
	int error = 0;

	ASSERT_SEGLOCK(fs);
 loop:
	/* start at last (newest) vnode. */
	mutex_enter(&mntvnode_lock);
	TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp) {
			DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n"));
			/*
			 * After this, pages might be busy
			 * due to our own previous putpages.
			 * Start actual segment write here to avoid deadlock.
			 * If we were just writing one segment and we've done
			 * that, break out.
			 */
			mutex_exit(&mntvnode_lock);
			if (lfs_writeseg(fs, sp) &&
			    (sp->seg_flags & SEGM_SINGLE) &&
			    fs->lfs_curseg != fs->lfs_startseg) {
				DLOG((DLOG_VNODE, "lfs_writevnodes: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
				break;
			}
			goto loop;
		}

		mutex_enter(vp->v_interlock);
		if (vp->v_type == VNON || vismarker(vp) ||
		    (vp->v_iflag & VI_CLEAN) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}

		ip = VTOI(vp);
		if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) ||
		    (op != VN_DIROP && op != VN_CLEAN &&
		    (vp->v_uflag & VU_DIROP))) {
			mutex_exit(vp->v_interlock);
			vndebug(vp,"dirop");
			continue;
		}

		if (op == VN_EMPTY && !VPISEMPTY(vp)) {
			mutex_exit(vp->v_interlock);
			vndebug(vp,"empty");
			continue;
		}

		if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
		   && vp != fs->lfs_flushvp
		   && !(ip->i_flag & IN_CLEANING)) {
			mutex_exit(vp->v_interlock);
			vndebug(vp,"cleaning");
			continue;
		}

		mutex_exit(&mntvnode_lock);
		if (lfs_vref(vp)) {
			vndebug(vp,"vref");
			mutex_enter(&mntvnode_lock);
			continue;
		}

		only_cleaning = 0;
		/*
		 * Write the inode/file if dirty and it's not the IFILE.
		 */
		if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) {
			only_cleaning =
			    ((ip->i_flag & IN_ALLMOD) == IN_CLEANING);

			if (ip->i_number != LFS_IFILE_INUM) {
				error = lfs_writefile(fs, sp, vp);
				if (error) {
					lfs_vunref(vp);
					if (error == EAGAIN) {
						/*
						 * This error from lfs_putpages
						 * indicates we need to drop
						 * the segment lock and start
						 * over after the cleaner has
						 * had a chance to run.
						 */
						lfs_writeinode(fs, sp, ip);
						lfs_writeseg(fs, sp);
						if (!VPISEMPTY(vp) &&
						    !WRITEINPROG(vp) &&
						    !(ip->i_flag & IN_ALLMOD)) {
							mutex_enter(&lfs_lock);
							LFS_SET_UINO(ip, IN_MODIFIED);
							mutex_exit(&lfs_lock);
						}
						mutex_enter(&mntvnode_lock);
						break;
					}
					error = 0; /* XXX not quite right */
					mutex_enter(&mntvnode_lock);
					continue;
				}

				if (!VPISEMPTY(vp)) {
					if (WRITEINPROG(vp)) {
						ivndebug(vp,"writevnodes/write2");
					} else if (!(ip->i_flag & IN_ALLMOD)) {
						mutex_enter(&lfs_lock);
						LFS_SET_UINO(ip, IN_MODIFIED);
						mutex_exit(&lfs_lock);
					}
				}
				(void) lfs_writeinode(fs, sp, ip);
				inodes_written++;
			}
		}

		if (lfs_clean_vnhead && only_cleaning)
			lfs_vunref_head(vp);
		else
			lfs_vunref(vp);

		mutex_enter(&mntvnode_lock);
	}
	mutex_exit(&mntvnode_lock);
	return error;
}

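/*
 * A checkpoint gets the Ifile (and with it all inode addresses and
 * segment usage information) onto disk as one consistent unit, so the
 * filesystem can be brought up without roll-forward; an ordinary
 * partial segment write leaves that reconstruction to the roll-forward
 * agent.
 */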
/*
 * Do a checkpoint.
 */
int
lfs_segwrite(struct mount *mp, int flags)
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	int do_ckp, did_ckp, error;
	unsigned n, segleft, maxseg, sn, i, curseg;
	int writer_set = 0;
	int dirty;
	int redo;
	int um_error;

	fs = VFSTOULFS(mp)->um_lfs;
	ASSERT_MAYBE_SEGLOCK(fs);

	if (fs->lfs_ronly)
		return EROFS;

	lfs_imtime(fs);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);

	/* We can't do a partial write and checkpoint at the same time. */
	if (do_ckp)
		flags &= ~SEGM_SINGLE;

	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
	sp = fs->lfs_sp;
	if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
		do_ckp = 1;

	/*
	 * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
	 * in which case we have to flush *all* buffers off of this vnode.
	 * We don't care about other nodes, but write any non-dirop nodes
	 * anyway in anticipation of another getnewvnode().
	 *
	 * If we're cleaning we only write cleaning and ifile blocks, and
	 * no dirops, since otherwise we'd risk corruption in a crash.
	 */
	if (sp->seg_flags & SEGM_CLEAN)
		lfs_writevnodes(fs, mp, sp, VN_CLEAN);
	else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
		do {
			um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
			if ((sp->seg_flags & SEGM_SINGLE) &&
			    fs->lfs_curseg != fs->lfs_startseg) {
				DLOG((DLOG_SEG, "lfs_segwrite: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
				break;
			}

			if (do_ckp || fs->lfs_dirops == 0) {
				if (!writer_set) {
					lfs_writer_enter(fs, "lfs writer");
					writer_set = 1;
				}
				error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
				if (um_error == 0)
					um_error = error;
				/* In case writevnodes errored out */
				lfs_flush_dirops(fs);
				((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
				lfs_finalize_fs_seguse(fs);
			}
			if (do_ckp && um_error) {
				lfs_segunlock_relock(fs);
				sp = fs->lfs_sp;
			}
		} while (do_ckp && um_error != 0);
	}

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.
	 */
	if (do_ckp || fs->lfs_doifile) {
		segleft = fs->lfs_nseg;
		curseg = 0;
		for (n = 0; n < fs->lfs_segtabsz; n++) {
			dirty = 0;
			if (bread(fs->lfs_ivnode, fs->lfs_cleansz + n,
			    fs->lfs_bsize, NOCRED, B_MODIFY, &bp))
				panic("lfs_segwrite: ifile read");
			segusep = (SEGUSE *)bp->b_data;
			maxseg = min(segleft, fs->lfs_sepb);
			for (i = 0; i < maxseg; i++) {
				sn = curseg + i;
				if (sn != lfs_dtosn(fs, fs->lfs_curseg) &&
				    segusep->su_flags & SEGUSE_ACTIVE) {
					segusep->su_flags &= ~SEGUSE_ACTIVE;
					--fs->lfs_nactive;
					++dirty;
				}
				fs->lfs_suflags[fs->lfs_activesb][sn] =
					segusep->su_flags;
				if (fs->lfs_version > 1)
					++segusep;
				else
					segusep = (SEGUSE *)
						((SEGUSE_V1 *)segusep + 1);
			}

			if (dirty)
				error = LFS_BWRITE_LOG(bp); /* Ifile */
			else
				brelse(bp, 0);
			segleft -= fs->lfs_sepb;
			curseg += fs->lfs_sepb;
		}
	}

	KASSERT(LFS_SEGLOCK_HELD(fs));

	did_ckp = 0;
	if (do_ckp || fs->lfs_doifile) {
		vp = fs->lfs_ivnode;
#ifdef DEBUG
		int loopcount = 0;
#endif
		do {
#ifdef DEBUG
			LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid);
#endif
			mutex_enter(&lfs_lock);
			fs->lfs_flags &= ~LFS_IFDIRTY;
			mutex_exit(&lfs_lock);

			ip = VTOI(vp);

			if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
				/*
				 * Ifile has no pages, so we don't need
				 * to check error return here.
				 */
				lfs_writefile(fs, sp, vp);
				/*
				 * Ensure the Ifile takes the current segment
				 * into account.  See comment in lfs_vflush.
				 */
				lfs_writefile(fs, sp, vp);
				lfs_writefile(fs, sp, vp);
			}

			if (ip->i_flag & IN_ALLMOD)
				++did_ckp;
#if 0
			redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0);
#else
			redo = lfs_writeinode(fs, sp, ip);
#endif
			redo += lfs_writeseg(fs, sp);
			mutex_enter(&lfs_lock);
			redo += (fs->lfs_flags & LFS_IFDIRTY);
			mutex_exit(&lfs_lock);
#ifdef DEBUG
			if (++loopcount > 2)
				log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n",
					loopcount);
#endif
		} while (redo && do_ckp);

		/*
		 * Unless we are unmounting, the Ifile may continue to have
		 * dirty blocks even after a checkpoint, due to changes to
		 * inodes' atime.  If we're checkpointing, it's "impossible"
		 * for other parts of the Ifile to be dirty after the loop
		 * above, since we hold the segment lock.
		 */
		mutex_enter(vp->v_interlock);
		if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
			LFS_CLR_UINO(ip, IN_ALLMOD);
		}
#ifdef DIAGNOSTIC
		else if (do_ckp) {
			int do_panic = 0;
			LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
				if (bp->b_lblkno < fs->lfs_cleansz +
				    fs->lfs_segtabsz &&
				    !(bp->b_flags & B_GATHERED)) {
					printf("ifile lbn %ld still dirty (flags %lx)\n",
						(long)bp->b_lblkno,
						(long)bp->b_flags);
					++do_panic;
				}
			}
			if (do_panic)
				panic("dirty blocks");
		}
#endif
		mutex_exit(vp->v_interlock);
	} else {
		(void) lfs_writeseg(fs, sp);
	}

	/* Note Ifile no longer needs to be written */
	fs->lfs_doifile = 0;
	if (writer_set)
		lfs_writer_leave(fs);

	/*
	 * If we didn't write the Ifile, we didn't really do anything.
	 * That means that (1) there is a checkpoint on disk and (2)
	 * nothing has changed since it was written.
	 *
	 * Take the flags off of the segment so that lfs_segunlock
	 * doesn't have to write the superblock either.
	 */
	if (do_ckp && !did_ckp) {
		sp->seg_flags &= ~SEGM_CKP;
	}

	if (lfs_dostats) {
		++lfs_stats.nwrites;
		if (sp->seg_flags & SEGM_SYNC)
			++lfs_stats.nsync_writes;
		if (sp->seg_flags & SEGM_CKP)
			++lfs_stats.ncheckpoints;
	}
	lfs_segunlock(fs);
	return (0);
}

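/*
 * lfs_writefile collects a vnode's dirty buffers into the current
 * partial segment: data blocks first (via lfs_gather and a match
 * function), then, when it is safe to do so, the single, double and
 * triple indirect blocks.  The blocks are described by a FINFO entry
 * in the segment summary, bracketed by lfs_acquire_finfo and
 * lfs_release_finfo.
 */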
/*
 * Write the dirty blocks associated with a vnode.
 */
int
lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
{
	struct inode *ip;
	int i, frag;
	int error;

	ASSERT_SEGLOCK(fs);
	error = 0;
	ip = VTOI(vp);

	lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);

	if (vp->v_uflag & VU_DIROP)
		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);

	if (sp->seg_flags & SEGM_CLEAN) {
		lfs_gather(fs, sp, vp, lfs_match_fake);
		/*
		 * For a file being flushed, we need to write *all* blocks.
		 * This means writing the cleaning blocks first, and then
		 * immediately following with any non-cleaning blocks.
		 * The same is true of the Ifile since checkpoints assume
		 * that all valid Ifile blocks are written.
		 */
		if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) {
			lfs_gather(fs, sp, vp, lfs_match_data);
			/*
			 * Don't call VOP_PUTPAGES: if we're flushing,
			 * we've already done it, and the Ifile doesn't
			 * use the page cache.
			 */
		}
	} else {
		lfs_gather(fs, sp, vp, lfs_match_data);
		/*
		 * If we're flushing, we've already called VOP_PUTPAGES
		 * so don't do it again.  Otherwise, we want to write
		 * everything we've got.
		 */
		if (!IS_FLUSHING(fs, vp)) {
			mutex_enter(vp->v_interlock);
			error = VOP_PUTPAGES(vp, 0, 0,
				PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
		}
	}

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 *
	 * We have to write them anyway, though, under two conditions: (1) the
	 * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
	 * checkpointing.
	 *
	 * BUT if we are cleaning, we might have indirect blocks that refer to
	 * new blocks not being written yet, in addition to fragments being
	 * moved out of a cleaned segment.  If that is the case, don't
	 * write the indirect blocks, or the finfo will have a small block
	 * in the middle of it!
	 * XXX in this case isn't the inode size wrong too?
	 */
	frag = 0;
	if (sp->seg_flags & SEGM_CLEAN) {
		for (i = 0; i < ULFS_NDADDR; i++)
			if (ip->i_lfs_fragsize[i] > 0 &&
			    ip->i_lfs_fragsize[i] < fs->lfs_bsize)
				++frag;
	}
#ifdef DIAGNOSTIC
	if (frag > 1)
		panic("lfs_writefile: more than one fragment!");
#endif
	if (IS_FLUSHING(fs, vp) ||
	    (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) {
		lfs_gather(fs, sp, vp, lfs_match_indir);
		lfs_gather(fs, sp, vp, lfs_match_dindir);
		lfs_gather(fs, sp, vp, lfs_match_tindir);
	}
	lfs_release_finfo(fs);

	return error;
}

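/*
 * lfs_update_iaddr returns nonzero when the update itself dirtied the
 * Ifile, meaning the partial segment must carry a fresh copy of the
 * affected Ifile blocks before it is consistent; callers repeat the
 * write until it returns zero.
 */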
/*
 * Update segment accounting to reflect this inode's change of address.
 */
static int
lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr)
{
	struct buf *bp;
	daddr_t daddr;
	IFILE *ifp;
	SEGUSE *sup;
	ino_t ino;
	int redo_ifile;
	u_int32_t sn;

	redo_ifile = 0;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = LFS_DBTOFSB(fs, ndaddr);
	} else {
		LFS_IENTRY(ifp, fs, ino, bp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = LFS_DBTOFSB(fs, ndaddr);
		(void)LFS_BWRITE_LOG(bp); /* Ifile */
	}

	/*
	 * If this is the Ifile and lfs_offset is set to the first block
	 * in the segment, dirty the new segment's accounting block
	 * (XXX should already be dirty?) and tell the caller to do it again.
	 */
	if (ip->i_number == LFS_IFILE_INUM) {
		sn = lfs_dtosn(fs, fs->lfs_offset);
		if (lfs_sntod(fs, sn) + lfs_btofsb(fs, fs->lfs_sumsize) ==
		    fs->lfs_offset) {
			LFS_SEGENTRY(sup, fs, sn, bp);
			KASSERT(bp->b_oflags & BO_DELWRI);
			LFS_WRITESEGENTRY(sup, fs, sn, bp);
			/* fs->lfs_flags |= LFS_IFDIRTY; */
			redo_ifile |= 1;
		}
	}

	/*
	 * The inode's last address should not be in the current partial
	 * segment, except under exceptional circumstances (lfs_writevnodes
	 * had to start over, and in the meantime more blocks were written
	 * to a vnode).  Both inodes will be accounted to this segment
	 * in lfs_writeseg so we need to subtract the earlier version
	 * here anyway.  The segment count can temporarily dip below
	 * zero here; keep track of how many duplicates we have in
	 * "dupino" so we don't panic below.
	 */
	if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) {
		++sp->ndupino;
		DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg "
		      "(ino %d daddr 0x%llx) ndupino=%d\n", ino,
		      (long long)daddr, sp->ndupino));
	}
	/*
	 * Account the inode: it no longer belongs to its former segment,
	 * though it will not belong to the new segment until that segment
	 * is actually written.
	 */
	if (daddr != LFS_UNUSED_DADDR) {
		u_int32_t oldsn = lfs_dtosn(fs, daddr);
#ifdef DIAGNOSTIC
		int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : 0;
#endif
		LFS_SEGENTRY(sup, fs, oldsn, bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes +
		    sizeof (struct ulfs1_dinode) * ndupino
		      < sizeof (struct ulfs1_dinode)) {
			printf("lfs_writeinode: negative bytes "
			       "(segment %" PRIu32 " short by %d, "
			       "oldsn=%" PRIu32 ", cursn=%" PRIu32
			       ", daddr=%" PRId64 ", su_nbytes=%u, "
			       "ndupino=%d)\n",
			       lfs_dtosn(fs, daddr),
			       (int)sizeof (struct ulfs1_dinode) *
				   (1 - sp->ndupino) - sup->su_nbytes,
			       oldsn, sp->seg_number, daddr,
			       (unsigned int)sup->su_nbytes,
			       sp->ndupino);
			panic("lfs_writeinode: negative bytes");
			sup->su_nbytes = sizeof (struct ulfs1_dinode);
		}
#endif
		DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n",
		      lfs_dtosn(fs, daddr), sizeof (struct ulfs1_dinode), ino));
		sup->su_nbytes -= sizeof (struct ulfs1_dinode);
		redo_ifile |=
			(ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
		if (redo_ifile) {
			mutex_enter(&lfs_lock);
			fs->lfs_flags |= LFS_IFDIRTY;
			mutex_exit(&lfs_lock);
			/* Don't double-account */
			fs->lfs_idaddr = 0x0;
		}
		LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */
	}

	return redo_ifile;
}

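/*
 * Write the inode into the current partial segment: copy the dinode
 * into the segment's current inode block (allocating a new one if
 * necessary), update its Ifile entry, and adjust the segment usage
 * accounting.  Returns nonzero if the Ifile was dirtied in the process
 * and the caller should write it again.
 */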
int
lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
{
	struct buf *bp;
	struct ulfs1_dinode *cdp;
	struct vnode *vp = ITOV(ip);
	daddr_t daddr;
	int32_t *daddrp;	/* XXX ondisk32 */
	int i, ndx;
	int redo_ifile = 0;
	int gotblk = 0;
	int count;

	ASSERT_SEGLOCK(fs);
	if (!(ip->i_flag & IN_ALLMOD) && !(vp->v_uflag & VU_DIROP))
		return (0);

	/* Can't write ifile when writer is not set */
	KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 ||
		(sp->seg_flags & SEGM_CLEAN));

	/*
	 * If this is the Ifile, see if writing it here will generate a
	 * temporary misaccounting.  If it will, do the accounting and write
	 * the blocks, postponing the inode write until the accounting is
	 * solid.
	 */
	count = 0;
	while (vp == fs->lfs_ivnode) {
		int redo = 0;

		if (sp->idp == NULL && sp->ibp == NULL &&
		    (sp->seg_bytes_left < fs->lfs_ibsize ||
		     sp->sum_bytes_left < sizeof(int32_t))) {
			(void) lfs_writeseg(fs, sp);
			continue;
		}

		/* Look for dirty Ifile blocks */
		LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) {
			if (!(bp->b_flags & B_GATHERED)) {
				redo = 1;
				break;
			}
		}

		if (redo == 0)
			redo = lfs_update_iaddr(fs, sp, ip, 0x0);
		if (redo == 0)
			break;

		if (sp->idp) {
			sp->idp->di_inumber = 0;
			sp->idp = NULL;
		}
		++count;
		if (count > 2)
			log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count);
		lfs_writefile(fs, sp, fs->lfs_ivnode);
	}

	/* Allocate a new inode block if necessary. */
	if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) &&
	    sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_ibsize ||
		    sp->sum_bytes_left < sizeof(int32_t))
			(void) lfs_writeseg(fs, sp);

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += lfs_btofsb(fs, fs->lfs_ibsize);
		sp->ibp = *sp->cbpp++ =
			getblk(VTOI(fs->lfs_ivnode)->i_devvp,
			    LFS_FSBTODB(fs, daddr), fs->lfs_ibsize, 0, 0);
		gotblk++;

		/* Zero out inode numbers */
		for (i = 0; i < LFS_INOPB(fs); ++i)
			((struct ulfs1_dinode *)sp->ibp->b_data)[i].di_inumber =
			    0;

		++sp->start_bpp;
		fs->lfs_avail -= lfs_btofsb(fs, fs->lfs_ibsize);
		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_ibsize;
		sp->sum_bytes_left -= sizeof(int32_t);
		ndx = fs->lfs_sumsize / sizeof(int32_t) -
			sp->ninodes / LFS_INOPB(fs) - 1;
		((int32_t *)(sp->segsum))[ndx] = daddr;
	}

	/* Check VU_DIROP in case there is a new file with no data blocks */
	if (vp->v_uflag & VU_DIROP)
		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);

	/* Update the inode times and copy the inode onto the inode page. */
	/* XXX kludge --- don't redirty the ifile just to put times on it */
	if (ip->i_number != LFS_IFILE_INUM)
		LFS_ITIMES(ip, NULL, NULL, NULL);

	/*
	 * If this is the Ifile, and we've already written the Ifile in this
	 * partial segment, just overwrite it (it's not on disk yet) and
	 * continue.
	 *
	 * XXX we know that the bp that we get the second time around has
	 * already been gathered.
	 */
	if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
		*(sp->idp) = *ip->i_din.ffs1_din;
		ip->i_lfs_osize = ip->i_size;
		return 0;
	}

	bp = sp->ibp;
	cdp = ((struct ulfs1_dinode *)bp->b_data) + (sp->ninodes % LFS_INOPB(fs));
	*cdp = *ip->i_din.ffs1_din;

	/*
	 * This inode is on its way to disk; clear its VU_DIROP status when
	 * the write is complete.
	 */
	if (vp->v_uflag & VU_DIROP) {
		if (!(sp->seg_flags & SEGM_CLEAN))
			ip->i_flag |= IN_CDIROP;
		else {
			DLOG((DLOG_DIROP, "lfs_writeinode: not clearing dirop for cleaned ino %d\n", (int)ip->i_number));
		}
	}

	/*
	 * If cleaning, link counts and directory file sizes cannot change,
	 * since those would be directory operations---even if the file
	 * we are writing is marked VU_DIROP we should write the old values.
	 * If we're not cleaning, of course, update the values so we get
	 * current values the next time we clean.
	 */
	if (sp->seg_flags & SEGM_CLEAN) {
		if (vp->v_uflag & VU_DIROP) {
			cdp->di_nlink = ip->i_lfs_odnlink;
			/* if (vp->v_type == VDIR) */
			cdp->di_size = ip->i_lfs_osize;
		}
	} else {
		ip->i_lfs_odnlink = cdp->di_nlink;
		ip->i_lfs_osize = ip->i_size;
	}


	/* We can finish the segment accounting for truncations now */
	lfs_finalize_ino_seguse(fs, ip);

	/*
	 * If we are cleaning, ensure that we don't write UNWRITTEN disk
	 * addresses to disk; possibly change the on-disk record of
	 * the inode size, either by reverting to the previous size
	 * (in the case of cleaning) or by verifying the inode's block
	 * holdings (in the case of files being allocated as they are being
	 * written).
	 * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail
	 * XXX count on disk wrong by the same amount.  We should be
	 * XXX able to "borrow" from lfs_avail and return it after the
	 * XXX Ifile is written.  See also in lfs_writeseg.
	 */

	/* Check file size based on highest allocated block */
	if (((ip->i_ffs1_mode & LFS_IFMT) == LFS_IFREG ||
	     (ip->i_ffs1_mode & LFS_IFMT) == LFS_IFDIR) &&
	    ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) {
		cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift;
		DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %"
		      PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size));
	}
	if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) {
		DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)"
		      " at %x\n", ip->i_number, ip->i_lfs_effnblks,
		      ip->i_ffs1_blocks, fs->lfs_offset));
		for (daddrp = cdp->di_db; daddrp < cdp->di_ib + ULFS_NIADDR;
		     daddrp++) {
			if (*daddrp == UNWRITTEN) {
				DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
				*daddrp = 0;
			}
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * Check dinode held blocks against dinode size.
	 * This should be identical to the check in lfs_vget().
	 */
	for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift;
	     i < ULFS_NDADDR; i++) {
		KASSERT(i >= 0);
		if ((cdp->di_mode & LFS_IFMT) == LFS_IFLNK)
			continue;
		if (((cdp->di_mode & LFS_IFMT) == LFS_IFBLK ||
		     (cdp->di_mode & LFS_IFMT) == LFS_IFCHR) && i == 0)
			continue;
		if (cdp->di_db[i] != 0) {
# ifdef DEBUG
			lfs_dump_dinode(cdp);
# endif
			panic("writing inconsistent inode");
		}
	}
#endif /* DIAGNOSTIC */

	if (ip->i_flag & IN_CLEANING)
		LFS_CLR_UINO(ip, IN_CLEANING);
	else {
		/* XXX IN_ALLMOD */
		LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE |
			     IN_UPDATE | IN_MODIFY);
		if (ip->i_lfs_effnblks == ip->i_ffs1_blocks)
			LFS_CLR_UINO(ip, IN_MODIFIED);
		else {
			DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real "
			      "blks=%d, eff=%d\n", ip->i_number,
			      ip->i_ffs1_blocks, ip->i_lfs_effnblks));
		}
	}

	if (ip->i_number == LFS_IFILE_INUM) {
		/* We know sp->idp == NULL */
		sp->idp = ((struct ulfs1_dinode *)bp->b_data) +
			(sp->ninodes % LFS_INOPB(fs));

		/* Not dirty any more */
		mutex_enter(&lfs_lock);
		fs->lfs_flags &= ~LFS_IFDIRTY;
		mutex_exit(&lfs_lock);
	}

	if (gotblk) {
		mutex_enter(&bufcache_lock);
		LFS_LOCK_BUF(bp);
		brelsel(bp, 0);
		mutex_exit(&bufcache_lock);
	}

	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % LFS_INOPB(fs) == 0)
		sp->ibp = NULL;

	redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno);

	KASSERT(redo_ifile == 0);
	return (redo_ifile);
}

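/*
 * Add a single buffer to the list gathered for the current partial
 * segment.  If the block won't fit (no room left in the summary block
 * or in the segment itself), the partial segment is written out first
 * and we return 1 so the caller restarts its scan; otherwise the
 * buffer is marked B_GATHERED, entered into the FINFO, and we return 0.
 */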
int
lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr)
{
	struct lfs *fs;
	int vers;
	int j, blksinblk;

	ASSERT_SEGLOCK(sp->fs);
	/*
	 * If full, finish this segment.  We may be doing I/O, so
	 * release and reacquire the splbio().
	 */
#ifdef DIAGNOSTIC
	if (sp->vp == NULL)
		panic ("lfs_gatherblock: Null vp in segment");
#endif
	fs = sp->fs;
	blksinblk = howmany(bp->b_bcount, fs->lfs_bsize);
	if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk ||
	    sp->seg_bytes_left < bp->b_bcount) {
		if (mptr)
			mutex_exit(mptr);
		lfs_updatemeta(sp);

		vers = sp->fip->fi_version;
		(void) lfs_writeseg(fs, sp);

		/* Add the current file to the segment summary. */
		lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers);

		if (mptr)
			mutex_enter(mptr);
		return (1);
	}

	if (bp->b_flags & B_GATHERED) {
		DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d,"
		      " lbn %" PRId64 "\n",
		      sp->fip->fi_ino, bp->b_lblkno));
		return (0);
	}

	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;

	*sp->cbpp++ = bp;
	for (j = 0; j < blksinblk; j++) {
		sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j;
		/* This block's accounting moves from lfs_favail to lfs_avail */
		lfs_deregister_block(sp->vp, bp->b_lblkno + j);
	}

	sp->sum_bytes_left -= sizeof(int32_t) * blksinblk;
	sp->seg_bytes_left -= bp->b_bcount;
	return (0);
}

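/*
 * Walk the given vnode's dirty buffer list (back to front, unless
 * LFS_NO_BACKBUF_HACK is defined) and gather every buffer that the
 * match function accepts and that is not busy or already gathered.
 * Returns the number of buffers gathered.
 */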
int
lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp,
    int (*match)(struct lfs *, struct buf *))
{
	struct buf *bp, *nbp;
	int count = 0;

	ASSERT_SEGLOCK(fs);
	if (vp->v_type == VBLK)
		return 0;
	KASSERT(sp->vp == NULL);
	sp->vp = vp;
	mutex_enter(&bufcache_lock);

#ifndef LFS_NO_BACKBUF_HACK
/* This is a hack to see if ordering the blocks in LFS makes a difference. */
# define	BUF_OFFSET	\
	(((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp)
# define	BACK_BUF(BP)	\
	((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET))
# define	BEG_OF_LIST	\
	((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET))

loop:
	/* Find last buffer. */
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
	     bp && LIST_NEXT(bp, b_vnbufs) != NULL;
	     bp = LIST_NEXT(bp, b_vnbufs))
		/* nothing */;
	for (; bp && bp != BEG_OF_LIST; bp = nbp) {
		nbp = BACK_BUF(bp);
#else /* LFS_NO_BACKBUF_HACK */
loop:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
#endif /* LFS_NO_BACKBUF_HACK */
		if ((bp->b_cflags & BC_BUSY) != 0 ||
		    (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) {
#ifdef DEBUG
			if (vp == fs->lfs_ivnode &&
			    (bp->b_cflags & BC_BUSY) != 0 &&
			    (bp->b_flags & B_GATHERED) == 0)
				log(LOG_NOTICE, "lfs_gather: ifile lbn %"
				    PRId64 " busy (%x) at 0x%x",
				    bp->b_lblkno, bp->b_flags,
				    (unsigned)fs->lfs_offset);
#endif
			continue;
		}
#ifdef DIAGNOSTIC
# ifdef LFS_USE_B_INVAL
		if ((bp->b_flags & BC_INVAL) != 0 && bp->b_iodone == NULL) {
			DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
			      " is BC_INVAL\n", bp->b_lblkno));
			VOP_PRINT(bp->b_vp);
		}
# endif /* LFS_USE_B_INVAL */
		if (!(bp->b_oflags & BO_DELWRI))
			panic("lfs_gather: bp not BO_DELWRI");
		if (!(bp->b_flags & B_LOCKED)) {
			DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
			      " blk %" PRId64 " not B_LOCKED\n",
			      bp->b_lblkno,
			      LFS_DBTOFSB(fs, bp->b_blkno)));
			VOP_PRINT(bp->b_vp);
			panic("lfs_gather: bp not B_LOCKED");
		}
#endif
		if (lfs_gatherblock(sp, bp, &bufcache_lock)) {
			goto loop;
		}
		count++;
	}
	mutex_exit(&bufcache_lock);
	lfs_updatemeta(sp);
	KASSERT(sp->vp == vp);
	sp->vp = NULL;
	return count;
}

#if DEBUG
# define DEBUG_OOFF(n) do {						\
	if (ooff == 0) {						\
		DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \
			"ino %d lbn %" PRId64 " at 0x%" PRIx32		\
			", was 0x0 (or %" PRId64 ")\n",			\
			(n), ip->i_number, lbn, ndaddr, daddr));	\
	}								\
} while (0)
#else
# define DEBUG_OOFF(n)
#endif

/*
 * Change the given block's address to ndaddr, finding its previous
 * location using ulfs_bmaparray().
 *
 * Account for this change in the segment table.
 *
 * called with sp == NULL by roll-forwarding code.
 */
void
lfs_update_single(struct lfs *fs, struct segment *sp,
    struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size)
{
	SEGUSE *sup;
	struct buf *bp;
	struct indir a[ULFS_NIADDR + 2], *ap;
	struct inode *ip;
	daddr_t daddr, ooff;
	int num, error;
	int bb, osize, obb;

	ASSERT_SEGLOCK(fs);
	KASSERT(sp == NULL || sp->vp == vp);
	ip = VTOI(vp);

	error = ulfs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL);
	if (error)
		panic("lfs_updatemeta: ulfs_bmaparray returned %d", error);

	daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
	KASSERT(daddr <= LFS_MAX_DADDR);
	if (daddr > 0)
		daddr = LFS_DBTOFSB(fs, daddr);

	bb = lfs_numfrags(fs, size);
	switch (num) {
	    case 0:
		ooff = ip->i_ffs1_db[lbn];
		DEBUG_OOFF(0);
		if (ooff == UNWRITTEN)
			ip->i_ffs1_blocks += bb;
		else {
			/* possible fragment truncation or extension */
			obb = lfs_btofsb(fs, ip->i_lfs_fragsize[lbn]);
			ip->i_ffs1_blocks += (bb - obb);
		}
		ip->i_ffs1_db[lbn] = ndaddr;
		break;
	    case 1:
		ooff = ip->i_ffs1_ib[a[0].in_off];
		DEBUG_OOFF(1);
		if (ooff == UNWRITTEN)
			ip->i_ffs1_blocks += bb;
		ip->i_ffs1_ib[a[0].in_off] = ndaddr;
		break;
	    default:
		ap = &a[num - 1];
		if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED,
		    B_MODIFY, &bp))
			panic("lfs_updatemeta: bread bno %" PRId64,
			      ap->in_lbn);

		/* XXX ondisk32 */
		ooff = ((int32_t *)bp->b_data)[ap->in_off];
		DEBUG_OOFF(num);
		if (ooff == UNWRITTEN)
			ip->i_ffs1_blocks += bb;
		/* XXX ondisk32 */
		((int32_t *)bp->b_data)[ap->in_off] = ndaddr;
		(void) VOP_BWRITE(bp->b_vp, bp);
	}

	KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr);

	/* Update hiblk when extending the file */
	if (lbn > ip->i_lfs_hiblk)
		ip->i_lfs_hiblk = lbn;

	/*
	 * Though we'd rather it couldn't, this *can* happen right now
	 * if cleaning blocks and regular blocks coexist.
	 */
	/* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */

	/*
	 * Update segment usage information, based on old size
	 * and location.
	 */
	if (daddr > 0) {
		u_int32_t oldsn = lfs_dtosn(fs, daddr);
#ifdef DIAGNOSTIC
		int ndupino;

		if (sp && sp->seg_number == oldsn) {
			ndupino = sp->ndupino;
		} else {
			ndupino = 0;
		}
#endif
		KASSERT(oldsn < fs->lfs_nseg);
		if (lbn >= 0 && lbn < ULFS_NDADDR)
			osize = ip->i_lfs_fragsize[lbn];
		else
			osize = fs->lfs_bsize;
		LFS_SEGENTRY(sup, fs, oldsn, bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes + sizeof (struct ulfs1_dinode) * ndupino
		    < osize) {
			printf("lfs_updatemeta: negative bytes "
			       "(segment %" PRIu32 " short by %" PRId64
			       ")\n", lfs_dtosn(fs, daddr),
			       (int64_t)osize -
			       (sizeof (struct ulfs1_dinode) * ndupino +
				sup->su_nbytes));
			printf("lfs_updatemeta: ino %llu, lbn %" PRId64
			       ", addr = 0x%" PRIx64 "\n",
			       (unsigned long long)ip->i_number, lbn, daddr);
			printf("lfs_updatemeta: ndupino=%d\n", ndupino);
			panic("lfs_updatemeta: negative bytes");
			sup->su_nbytes = osize -
			    sizeof (struct ulfs1_dinode) * ndupino;
		}
#endif
		DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64
		      " db 0x%" PRIx64 "\n",
		      lfs_dtosn(fs, daddr), osize,
		      ip->i_number, lbn, daddr));
		sup->su_nbytes -= osize;
		if (!(bp->b_flags & B_GATHERED)) {
			mutex_enter(&lfs_lock);
			fs->lfs_flags |= LFS_IFDIRTY;
			mutex_exit(&lfs_lock);
		}
		LFS_WRITESEGENTRY(sup, fs, oldsn, bp);
	}
	/*
	 * Now that this block has a new address, and its old
	 * segment no longer owns it, we can forget about its
	 * old size.
	 */
	if (lbn >= 0 && lbn < ULFS_NDADDR)
		ip->i_lfs_fragsize[lbn] = size;
}

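/*
 * In effect, lfs_updatemeta assigns each gathered block the next free
 * disk address (fs->lfs_offset) and then uses lfs_update_single to
 * repoint the inode or indirect block at the new address, charging the
 * block to the new segment and crediting the old one.
 */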
/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.
 */
void
lfs_updatemeta(struct segment *sp)
{
	struct buf *sbp;
	struct lfs *fs;
	struct vnode *vp;
	daddr_t lbn;
	int i, nblocks, num;
	int bb;
	int bytesleft, size;

	ASSERT_SEGLOCK(sp->fs);
	vp = sp->vp;
	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	KASSERT(nblocks >= 0);
	KASSERT(vp != NULL);
	if (nblocks == 0)
		return;

	/*
	 * This count may be high due to oversize blocks from lfs_gop_write.
	 * Correct for this. (XXX we should be able to keep track of these.)
	 */
	fs = sp->fs;
	for (i = 0; i < nblocks; i++) {
		if (sp->start_bpp[i] == NULL) {
			DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks));
			nblocks = i;
			break;
		}
		num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize);
		KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1);
		nblocks -= num - 1;
	}

	KASSERT(vp->v_type == VREG ||
	   nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp);
	KASSERT(nblocks == sp->cbpp - sp->start_bpp);

	/*
	 * Sort the blocks.
	 *
	 * We have to sort even if the blocks come from the
	 * cleaner, because there might be other pending blocks on the
	 * same inode...and if we don't sort, and there are fragments
	 * present, blocks may be written in the wrong place.
	 */
	lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize);

	/*
	 * Record the length of the last block in case it's a fragment.
	 * If there are indirect blocks present, they sort last.  An
	 * indirect block will be lfs_bsize and its presence indicates
	 * that you cannot have fragments.
	 *
	 * XXX This last is a lie.  A cleaned fragment can coexist with
	 * XXX a later indirect block.  This will continue to be
	 * XXX true until lfs_markv is fixed to do everything with
	 * XXX fake blocks (including fake inodes and fake indirect blocks).
	 */
	sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) &
		fs->lfs_bmask) + 1;

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	for (i = nblocks; i--; ++sp->start_bpp) {
		sbp = *sp->start_bpp;
		lbn = *sp->start_lbp;
		KASSERT(sbp->b_lblkno == lbn);

		sbp->b_blkno = LFS_FSBTODB(fs, fs->lfs_offset);

		/*
		 * If we write a frag in the wrong place, the cleaner won't
		 * be able to correctly identify its size later, and the
		 * segment will be uncleanable.  (Even worse, it will assume
		 * that the indirect block that actually ends the list
		 * is of a smaller size!)
		 */
		if ((sbp->b_bcount & fs->lfs_bmask) && i != 0)
			panic("lfs_updatemeta: fragment is not last block");

		/*
		 * For each subblock in this possibly oversized block,
		 * update its address on disk.
		 */
		KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize);
		KASSERT(vp == sbp->b_vp);
		for (bytesleft = sbp->b_bcount; bytesleft > 0;
		     bytesleft -= fs->lfs_bsize) {
			size = MIN(bytesleft, fs->lfs_bsize);
			bb = lfs_numfrags(fs, size);
			lbn = *sp->start_lbp++;
			lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset,
			    size);
			fs->lfs_offset += bb;
		}

	}

	/* This inode has been modified */
	LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
}

/*
 * Move lfs_offset to a segment earlier than sn.
 */
int
lfs_rewind(struct lfs *fs, int newsn)
{
	int sn, osn, isdirty;
	struct buf *bp;
	SEGUSE *sup;

	ASSERT_SEGLOCK(fs);

	osn = lfs_dtosn(fs, fs->lfs_offset);
	if (osn < newsn)
		return 0;

	/* lfs_avail eats the remaining space in this segment */
	fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg);

	/* Find a low-numbered segment */
	for (sn = 0; sn < fs->lfs_nseg; ++sn) {
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp, 0);

		if (!isdirty)
			break;
	}
	if (sn == fs->lfs_nseg)
		panic("lfs_rewind: no clean segments");
	if (newsn >= 0 && sn >= newsn)
		return ENOENT;
	fs->lfs_nextseg = sn;
	lfs_newseg(fs);
	fs->lfs_offset = fs->lfs_curseg;

	return 0;
}

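/*
 * A partial segment is a summary block (SEGSUM plus FINFO entries and
 * inode-block addresses) followed by the inode and data blocks it
 * describes.  lfs_initseg lays the groundwork: it advances to a new
 * segment when the current one cannot hold another partial, allocates
 * the summary buffer at lfs_offset, and resets the per-partial-segment
 * bookkeeping in the segment structure.
 */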
/*
 * Start a new partial segment.
 *
 * Return 1 when we entered a new segment.
 * Otherwise, return 0.
 */
int
lfs_initseg(struct lfs *fs)
{
	struct segment *sp = fs->lfs_sp;
	SEGSUM *ssp;
	struct buf *sbp;	/* buffer for SEGSUM */
	int repeat = 0;		/* return value */

	ASSERT_SEGLOCK(fs);
	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		SEGUSE *sup;
		struct buf *bp;

		/* lfs_avail eats the remaining space */
		fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset -
						    fs->lfs_curseg);
		/* Wake up any cleaning procs waiting on this file system. */
		lfs_wakeup_cleaner(fs);
		lfs_newseg(fs);
		repeat = 1;
		fs->lfs_offset = fs->lfs_curseg;

		sp->seg_number = lfs_dtosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = lfs_fsbtob(fs, fs->lfs_fsbpseg);

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += lfs_btofsb(fs, LFS_SBPAD);
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp, 0);
		/* Segment zero could also contain the labelpad */
		if (fs->lfs_version > 1 && sp->seg_number == 0 &&
		    fs->lfs_start < lfs_btofsb(fs, LFS_LABELPAD)) {
			fs->lfs_offset +=
			    lfs_btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
			sp->seg_bytes_left -=
			    LFS_LABELPAD - lfs_fsbtob(fs, fs->lfs_start);
		}
	} else {
		sp->seg_number = lfs_dtosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = lfs_fsbtob(fs, fs->lfs_fsbpseg -
				      (fs->lfs_offset - fs->lfs_curseg));
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	/* Record first address of this partial segment */
	if (sp->seg_flags & SEGM_CLEAN) {
		fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset;
		if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) {
			/* "1" is the artificial inc in lfs_seglock */
			mutex_enter(&lfs_lock);
			while (fs->lfs_iocount > 1) {
				mtsleep(&fs->lfs_iocount, PRIBIO + 1,
				    "lfs_initseg", 0, &lfs_lock);
			}
			mutex_exit(&lfs_lock);
			fs->lfs_cleanind = 0;
		}
	}

	sp->fs = fs;
	sp->ibp = NULL;
	sp->idp = NULL;
	sp->ninodes = 0;
	sp->ndupino = 0;

	sp->cbpp = sp->bpp;

	/* Get a new buffer for SEGSUM */
	sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
	    LFS_FSBTODB(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY);

	/* ... and enter it into the buffer list. */
	*sp->cbpp = sbp;
	sp->cbpp++;
	fs->lfs_offset += lfs_btofsb(fs, fs->lfs_sumsize);

	sp->start_bpp = sp->cbpp;

	/* Set point to SEGSUM, initialize it. */
	ssp = sp->segsum = sbp->b_data;
	memset(ssp, 0, fs->lfs_sumsize);
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;
	ssp->ss_magic = SS_MAGIC;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs));
	sp->fip->fi_nblocks = 0;
	sp->start_lbp = &sp->fip->fi_blocks[0];
	sp->fip->fi_lastlength = 0;

	sp->seg_bytes_left -= fs->lfs_sumsize;
	sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs);

	return (repeat);
}

/*
 * Remove SEGUSE_INVAL from all segments.
 */
void
lfs_unset_inval_all(struct lfs *fs)
{
	SEGUSE *sup;
	struct buf *bp;
	int i;

	for (i = 0; i < fs->lfs_nseg; i++) {
		LFS_SEGENTRY(sup, fs, i, bp);
		if (sup->su_flags & SEGUSE_INVAL) {
			sup->su_flags &= ~SEGUSE_INVAL;
			LFS_WRITESEGENTRY(sup, fs, i, bp);
		} else
			brelse(bp, 0);
	}
}

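/*
 * Besides choosing the next segment to write, lfs_newseg marks the
 * chosen segment dirty and active, updates the cleaner info block
 * (one fewer clean segment), and then searches onward from the new
 * current segment (starting past an interleave offset) for the
 * following clean segment to use as lfs_nextseg.
 */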

/*
 * Remove SEGUSE_INVAL from all segments.
 */
void
lfs_unset_inval_all(struct lfs *fs)
{
	SEGUSE *sup;
	struct buf *bp;
	int i;

	for (i = 0; i < fs->lfs_nseg; i++) {
		LFS_SEGENTRY(sup, fs, i, bp);
		if (sup->su_flags & SEGUSE_INVAL) {
			sup->su_flags &= ~SEGUSE_INVAL;
			LFS_WRITESEGENTRY(sup, fs, i, bp);
		} else
			brelse(bp, 0);
	}
}

/*
 * Return the next segment to write.
 */
void
lfs_newseg(struct lfs *fs)
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, isdirty, sn, skip_inval;

	ASSERT_SEGLOCK(fs);

	/* Honor LFCNWRAPSTOP */
	mutex_enter(&lfs_lock);
	while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
		if (fs->lfs_wrappass) {
			log(LOG_NOTICE, "%s: wrappass=%d\n",
			    fs->lfs_fsmnt, fs->lfs_wrappass);
			fs->lfs_wrappass = 0;
			break;
		}
		fs->lfs_wrapstatus = LFS_WRAP_WAITING;
		wakeup(&fs->lfs_nowrap);
		log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt);
		mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz,
		    &lfs_lock);
	}
	fs->lfs_wrapstatus = LFS_WRAP_GOING;
	mutex_exit(&lfs_lock);

	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, fs->lfs_nextseg), bp);
	DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n",
	    lfs_dtosn(fs, fs->lfs_nextseg)));
	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
	sup->su_nbytes = 0;
	sup->su_nsums = 0;
	sup->su_ninos = 0;
	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, fs->lfs_nextseg), bp);

	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	fs->lfs_nclean = cip->clean;
	LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	skip_inval = 1;
	for (sn = curseg = lfs_dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) {
		sn = (sn + 1) % fs->lfs_nseg;

		if (sn == curseg) {
			if (skip_inval)
				skip_inval = 0;
			else
				panic("lfs_nextseg: no clean segments");
		}
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? SEGUSE_INVAL : 0));
		/* Check SEGUSE_EMPTY as we go along */
		if (isdirty && sup->su_nbytes == 0 &&
		    !(sup->su_flags & SEGUSE_EMPTY))
			LFS_WRITESEGENTRY(sup, fs, sn, bp);
		else
			brelse(bp, 0);

		if (!isdirty)
			break;
	}
	if (skip_inval == 0)
		lfs_unset_inval_all(fs);

	++fs->lfs_nactive;
	fs->lfs_nextseg = lfs_sntod(fs, sn);
	if (lfs_dostats) {
		++lfs_stats.segsused;
	}
}

static struct buf *
lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr,
    int n)
{
	struct lfs_cluster *cl;
	struct buf **bpp, *bp;

	ASSERT_SEGLOCK(fs);
	cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK);
	bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK);
	memset(cl, 0, sizeof(*cl));
	cl->fs = fs;
	cl->bpp = bpp;
	cl->bufcount = 0;
	cl->bufsize = 0;

	/* If this segment is being written synchronously, note that */
	if (fs->lfs_sp->seg_flags & SEGM_SYNC) {
		cl->flags |= LFS_CL_SYNC;
		cl->seg = fs->lfs_sp;
		++cl->seg->seg_iocount;
	}

	/* Get an empty buffer header, or maybe one with something on it */
	bp = getiobuf(vp, true);
	bp->b_dev = NODEV;
	bp->b_blkno = bp->b_lblkno = addr;
	bp->b_iodone = lfs_cluster_callback;
	bp->b_private = cl;

	return bp;
}
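
/*
 * The life cycle of a cluster buffer, in outline (sketch only; see
 * lfs_writeseg() below for the real sequence):
 */
#if 0
	cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
	cl = cbp->b_private;
	/* ... pack gathered buffers into cbp, growing cbp->b_bcount ... */
	VOP_STRATEGY(devvp, cbp);
	/*
	 * On completion lfs_cluster_callback() queues the buffer to the
	 * aiodone workqueue, where lfs_cluster_aiodone() releases the
	 * component buffers and frees cl and cbp.
	 */
#endif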

int
lfs_writeseg(struct lfs *fs, struct segment *sp)
{
	struct buf **bpp, *bp, *cbp, *newbp, *unbusybp;
	SEGUSE *sup;
	SEGSUM *ssp;
	int i;
	int do_again, nblocks, byteoffset;
	size_t el_size;
	struct lfs_cluster *cl;
	u_short ninos;
	struct vnode *devvp;
	char *p = NULL;
	struct vnode *vp;
	int32_t *daddrp;	/* XXX ondisk32 */
	int changed;
	u_int32_t sum;
#ifdef DEBUG
	FINFO *fip;
	int findex;
#endif

	ASSERT_SEGLOCK(fs);

	ssp = (SEGSUM *)sp->segsum;

	/*
	 * If there are no buffers other than the segment summary to write,
	 * don't do anything.  If we are at the end of a dirop sequence,
	 * however, write the empty segment summary anyway, to help out the
	 * roll-forward agent.
	 */
	if ((nblocks = sp->cbpp - sp->bpp) == 1) {
		if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP)
			return 0;
	}

	/* Note if partial segment is being written by the cleaner */
	if (sp->seg_flags & SEGM_CLEAN)
		ssp->ss_flags |= SS_CLEAN;

	/* Note if we are writing to reclaim */
	if (sp->seg_flags & SEGM_RECLAIM) {
		ssp->ss_flags |= SS_RECLAIM;
		ssp->ss_reclino = fs->lfs_reclino;
	}

	devvp = VTOI(fs->lfs_ivnode)->i_devvp;

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);

	/* Loop through all blocks, except the segment summary. */
	for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
		if ((*bpp)->b_vp != devvp) {
			sup->su_nbytes += (*bpp)->b_bcount;
			DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d"
			    " lbn %" PRId64 " db 0x%" PRIx64 "\n",
			    sp->seg_number, (*bpp)->b_bcount,
			    VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno,
			    (*bpp)->b_blkno));
		}
	}

#ifdef DEBUG
	/* Check for zero-length and zero-version FINFO entries. */
	fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs));
	for (findex = 0; findex < ssp->ss_nfinfo; findex++) {
		KDASSERT(fip->fi_nblocks > 0);
		KDASSERT(fip->fi_version > 0);
		fip = (FINFO *)((char *)fip + FINFOSIZE +
		    sizeof(int32_t) * fip->fi_nblocks);
	}
#endif /* DEBUG */

	ninos = (ssp->ss_ninos + LFS_INOPB(fs) - 1) / LFS_INOPB(fs);
	DLOG((DLOG_SU, "seg %d += %d for %d inodes\n",
	    sp->seg_number, ssp->ss_ninos * sizeof (struct ulfs1_dinode),
	    ssp->ss_ninos));
	sup->su_nbytes += ssp->ss_ninos * sizeof (struct ulfs1_dinode);
	/* sup->su_nbytes += fs->lfs_sumsize; */
	if (fs->lfs_version == 1)
		sup->su_olastmod = time_second;
	else
		sup->su_lastmod = time_second;
	sup->su_ninos += ninos;
	++sup->su_nsums;
	fs->lfs_avail -= lfs_btofsb(fs, fs->lfs_sumsize);

	do_again = !(bp->b_flags & B_GATHERED);
	LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */

	/*
	 * Mark blocks BC_BUSY, to prevent them from being changed between
	 * the checksum computation and the actual write.
	 *
	 * If we are cleaning, check indirect blocks for UNWRITTEN, and if
	 * there are any, replace them with copies that have UNASSIGNED
	 * instead.
	 */
	mutex_enter(&bufcache_lock);
	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
		++bpp;
		bp = *bpp;
		if (bp->b_iodone != NULL) {	/* UBC or malloced buffer */
			bp->b_cflags |= BC_BUSY;
			continue;
		}

		while (bp->b_cflags & BC_BUSY) {
			DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
			    " data summary corruption for ino %d, lbn %"
			    PRId64 "\n",
			    VTOI(bp->b_vp)->i_number, bp->b_lblkno));
			bp->b_cflags |= BC_WANTED;
			cv_wait(&bp->b_busy, &bufcache_lock);
		}
		bp->b_cflags |= BC_BUSY;
		mutex_exit(&bufcache_lock);
		unbusybp = NULL;

		/*
		 * Check and replace indirect block UNWRITTEN bogosity.
		 * XXX See comment in lfs_writefile.
		 */
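		/*
		 * (UNWRITTEN marks an indirect-block slot whose data block
		 * has been accounted for but not yet assigned an on-disk
		 * address; such a slot must go to disk as 0, UNASSIGNED,
		 * hence the copy-and-scrub below.)
		 */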
		if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp &&
		    VTOI(bp->b_vp)->i_ffs1_blocks !=
		    VTOI(bp->b_vp)->i_lfs_effnblks) {
			DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n",
			    VTOI(bp->b_vp)->i_number,
			    VTOI(bp->b_vp)->i_lfs_effnblks,
			    VTOI(bp->b_vp)->i_ffs1_blocks));
			/* Make a copy we'll make changes to */
			newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
			    bp->b_bcount, LFS_NB_IBLOCK);
			newbp->b_blkno = bp->b_blkno;
			memcpy(newbp->b_data, bp->b_data,
			    newbp->b_bcount);

			changed = 0;
			/* XXX ondisk32 */
			for (daddrp = (int32_t *)(newbp->b_data);
			     daddrp < (int32_t *)((char *)newbp->b_data +
				newbp->b_bcount); daddrp++) {
				if (*daddrp == UNWRITTEN) {
					++changed;
					*daddrp = 0;
				}
			}
			/*
			 * Get rid of the old buffer.  Don't mark it clean,
			 * though, if it still has dirty data on it.
			 */
			if (changed) {
				DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):"
				    " bp = %p newbp = %p\n", changed, bp,
				    newbp));
				*bpp = newbp;
				bp->b_flags &= ~B_GATHERED;
				bp->b_error = 0;
				if (bp->b_iodone != NULL) {
					DLOG((DLOG_SEG, "lfs_writeseg: "
					    "indir bp should not be B_CALL\n"));
					biodone(bp);
					bp = NULL;
				} else {
					/* Still on free list, leave it there */
					unbusybp = bp;
					/*
					 * We have to re-decrement lfs_avail
					 * since this block is going to come
					 * back around to us in the next
					 * segment.
					 */
					fs->lfs_avail -=
					    lfs_btofsb(fs, bp->b_bcount);
				}
			} else {
				lfs_freebuf(fs, newbp);
			}
		}
		mutex_enter(&bufcache_lock);
		if (unbusybp != NULL) {
			unbusybp->b_cflags &= ~BC_BUSY;
			if (unbusybp->b_cflags & BC_WANTED)
				cv_broadcast(&unbusybp->b_busy);
		}
	}
	mutex_exit(&bufcache_lock);

	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 */
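	/*
	 * Note that ss_datasum is a sampled checksum: only the first
	 * el_size bytes of each fs->lfs_bsize-sized piece of each buffer
	 * are folded in (e.g. 4 bytes per 8KB block on a v2 filesystem).
	 */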
	sum = 0;
	if (fs->lfs_version == 1)
		el_size = sizeof(u_long);
	else
		el_size = sizeof(u_int32_t);
	for (bpp = sp->bpp, i = nblocks - 1; i--; ) {
		++bpp;
		/* Loop through gop_write cluster blocks */
		for (byteoffset = 0; byteoffset < (*bpp)->b_bcount;
		     byteoffset += fs->lfs_bsize) {
#ifdef LFS_USE_B_INVAL
			/* XXX this branch has bit-rotted: `dp' is not
			   declared anywhere in this function. */
			if (((*bpp)->b_cflags & BC_INVAL) != 0 &&
			    (*bpp)->b_iodone != NULL) {
				if (copyin((void *)(*bpp)->b_saveaddr +
				    byteoffset, dp, el_size)) {
					panic("lfs_writeseg: copyin failed [1]:"
					    " ino %d blk %" PRId64,
					    VTOI((*bpp)->b_vp)->i_number,
					    (*bpp)->b_lblkno);
				}
			} else
#endif /* LFS_USE_B_INVAL */
			{
				sum = lfs_cksum_part((char *)
				    (*bpp)->b_data + byteoffset, el_size, sum);
			}
		}
	}
	if (fs->lfs_version == 1)
		ssp->ss_ocreate = time_second;
	else {
		ssp->ss_create = time_second;
		ssp->ss_serial = ++fs->lfs_serial;
		ssp->ss_ident = fs->lfs_ident;
	}
	ssp->ss_datasum = lfs_cksum_fold(sum);
	ssp->ss_sumsum = cksum(&ssp->ss_datasum,
	    fs->lfs_sumsize - sizeof(ssp->ss_sumsum));

	mutex_enter(&lfs_lock);
	fs->lfs_bfree -= (lfs_btofsb(fs, ninos * fs->lfs_ibsize) +
	    lfs_btofsb(fs, fs->lfs_sumsize));
	fs->lfs_dmeta += (lfs_btofsb(fs, ninos * fs->lfs_ibsize) +
	    lfs_btofsb(fs, fs->lfs_sumsize));
	mutex_exit(&lfs_lock);

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we cluster the buffers into a
	 * chunk and write the chunk.  MAXPHYS is the largest size an I/O
	 * device can handle, so use that for the size of the chunks.
	 *
	 * Blocks that are already clusters (from GOP_WRITE), however, we
	 * don't bother to copy into other clusters.
	 */

#define CHUNKSIZE MAXPHYS

	if (devvp == NULL)
		panic("devvp is NULL");
	for (bpp = sp->bpp, i = nblocks; i;) {
		cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
		cl = cbp->b_private;

		cbp->b_flags |= B_ASYNC;
		cbp->b_cflags |= BC_BUSY;
		cbp->b_bcount = 0;

#if defined(DEBUG) && defined(DIAGNOSTIC)
		if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs))
		    / sizeof(int32_t)) {
			panic("lfs_writeseg: real bpp overwrite");
		}
		if (bpp - sp->bpp > lfs_segsize(fs) / fs->lfs_fsize) {
			panic("lfs_writeseg: theoretical bpp overwrite");
		}
#endif

		/*
		 * Construct the cluster.
		 */
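		/*
		 * E.g. with CHUNKSIZE (MAXPHYS, commonly 64KB) and 8KB
		 * fs-blocks, the loop below packs at most eight blocks
		 * into this cluster before it is issued.
		 */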
		mutex_enter(&lfs_lock);
		++fs->lfs_iocount;
		mutex_exit(&lfs_lock);
		while (i && cbp->b_bcount < CHUNKSIZE) {
			bp = *bpp;

			if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
				break;
			if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC))
				break;

			/* Clusters from GOP_WRITE are expedited */
			if (bp->b_bcount > fs->lfs_bsize) {
				if (cbp->b_bcount > 0)
					/* Put in its own buffer */
					break;
				else {
					cbp->b_data = bp->b_data;
				}
			} else if (cbp->b_bcount == 0) {
				p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE,
				    LFS_NB_CLUSTER);
				cl->flags |= LFS_CL_MALLOC;
			}
#ifdef DIAGNOSTIC
			if (lfs_dtosn(fs, LFS_DBTOFSB(fs, bp->b_blkno +
			    btodb(bp->b_bcount - 1))) !=
			    sp->seg_number) {
				printf("blk size %d daddr %" PRIx64
				    " not in seg %d\n",
				    bp->b_bcount, bp->b_blkno,
				    sp->seg_number);
				panic("segment overwrite");
			}
#endif

#ifdef LFS_USE_B_INVAL
			/*
			 * Fake buffers from the cleaner are marked BC_INVAL.
			 * We need to copy the data from user space rather
			 * than from the buffer indicated.
			 * XXX == what do I do on an error?
			 */
			if ((bp->b_cflags & BC_INVAL) != 0 &&
			    bp->b_iodone != NULL) {
				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
					panic("lfs_writeseg: "
					    "copyin failed [2]");
			} else
#endif /* LFS_USE_B_INVAL */
			if (cl->flags & LFS_CL_MALLOC) {
				/* copy data into our cluster. */
				memcpy(p, bp->b_data, bp->b_bcount);
				p += bp->b_bcount;
			}

			cbp->b_bcount += bp->b_bcount;
			cl->bufsize += bp->b_bcount;

			bp->b_flags &= ~B_READ;
			bp->b_error = 0;
			cl->bpp[cl->bufcount++] = bp;

			vp = bp->b_vp;
			mutex_enter(&bufcache_lock);
			mutex_enter(vp->v_interlock);
			bp->b_oflags &= ~(BO_DELWRI | BO_DONE);
			reassignbuf(bp, vp);
			vp->v_numoutput++;
			mutex_exit(vp->v_interlock);
			mutex_exit(&bufcache_lock);

			bpp++;
			i--;
		}
		if (fs->lfs_sp->seg_flags & SEGM_SYNC)
			BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(cbp, BPRIO_TIMELIMITED);
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		VOP_STRATEGY(devvp, cbp);
		curlwp->l_ru.ru_oublock++;
	}

	if (lfs_dostats) {
		++lfs_stats.psegwrites;
		lfs_stats.blocktot += nblocks - 1;
		if (fs->lfs_sp->seg_flags & SEGM_SYNC)
			++lfs_stats.psyncwrites;
		if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
			++lfs_stats.pcleanwrites;
			lfs_stats.cleanblocks += nblocks - 1;
		}
	}

	return (lfs_initseg(fs) || do_again);
}
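
/*
 * A nonzero return from lfs_writeseg() means that lfs_initseg() began a
 * new segment, or that the Ifile block dirtied above still has to be
 * written, so callers typically loop; roughly as the Ifile write in
 * lfs_segwrite() does (sketch only):
 */
#if 0
	do {
		lfs_writefile(fs, sp, vp);
	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
#endif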

void
lfs_writesuper(struct lfs *fs, daddr_t daddr)
{
	struct buf *bp;
	struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp;
	int s;

	ASSERT_MAYBE_SEGLOCK(fs);
#ifdef DIAGNOSTIC
	KASSERT(fs->lfs_magic == LFS_MAGIC);
#endif
	/*
	 * If one superblock write were allowed to start while another
	 * was still in progress, we would risk not having a complete
	 * checkpoint if we crashed.  So, block here if a superblock
	 * write is in progress.
	 */
	mutex_enter(&lfs_lock);
	s = splbio();
	while (fs->lfs_sbactive) {
		mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0,
		    &lfs_lock);
	}
	fs->lfs_sbactive = daddr;
	splx(s);
	mutex_exit(&lfs_lock);

	/* Set timestamp of this version of the superblock */
	if (fs->lfs_version == 1)
		fs->lfs_otstamp = time_second;
	fs->lfs_tstamp = time_second;

	/* Checksum the superblock and copy it into a buffer. */
	fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs));
	bp = lfs_newbuf(fs, devvp,
	    LFS_FSBTODB(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK);
	memset((char *)bp->b_data + sizeof(struct dlfs), 0,
	    LFS_SBPAD - sizeof(struct dlfs));
	*(struct dlfs *)bp->b_data = fs->lfs_dlfs;

	bp->b_cflags |= BC_BUSY;
	bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC;
	bp->b_oflags &= ~(BO_DONE | BO_DELWRI);
	bp->b_error = 0;
	bp->b_iodone = lfs_supercallback;

	if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC)
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	else
		BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	curlwp->l_ru.ru_oublock++;

	mutex_enter(devvp->v_interlock);
	devvp->v_numoutput++;
	mutex_exit(devvp->v_interlock);

	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
	VOP_STRATEGY(devvp, bp);
}

/*
 * Logical block number match routines used when traversing the dirty block
 * chain.
 */
int
lfs_match_fake(struct lfs *fs, struct buf *bp)
{

	ASSERT_SEGLOCK(fs);
	return LFS_IS_MALLOC_BUF(bp);
}

#if 0
int
lfs_match_real(struct lfs *fs, struct buf *bp)
{

	ASSERT_SEGLOCK(fs);
	return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp));
}
#endif

int
lfs_match_data(struct lfs *fs, struct buf *bp)
{

	ASSERT_SEGLOCK(fs);
	return (bp->b_lblkno >= 0);
}

int
lfs_match_indir(struct lfs *fs, struct buf *bp)
{
	daddr_t lbn;

	ASSERT_SEGLOCK(fs);
	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == 0);
}

int
lfs_match_dindir(struct lfs *fs, struct buf *bp)
{
	daddr_t lbn;

	ASSERT_SEGLOCK(fs);
	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == 1);
}

int
lfs_match_tindir(struct lfs *fs, struct buf *bp)
{
	daddr_t lbn;

	ASSERT_SEGLOCK(fs);
	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == 2);
}
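
/*
 * For example, with ULFS_NDADDR == 12 (its usual value), lbn -12 names a
 * single indirect block (offset 0 mod LFS_NINDIR(fs)), -13 a double
 * indirect (offset 1) and -14 a triple indirect (offset 2); further
 * single indirects recur every LFS_NINDIR(fs) entries below that.
 */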

static void
lfs_free_aiodone(struct buf *bp)
{
	struct lfs *fs;

	KERNEL_LOCK(1, curlwp);
	fs = bp->b_private;
	ASSERT_NO_SEGLOCK(fs);
	lfs_freebuf(fs, bp);
	KERNEL_UNLOCK_LAST(curlwp);
}

static void
lfs_super_aiodone(struct buf *bp)
{
	struct lfs *fs;

	KERNEL_LOCK(1, curlwp);
	fs = bp->b_private;
	ASSERT_NO_SEGLOCK(fs);
	mutex_enter(&lfs_lock);
	fs->lfs_sbactive = 0;
	if (--fs->lfs_iocount <= 1)
		wakeup(&fs->lfs_iocount);
	wakeup(&fs->lfs_sbactive);
	mutex_exit(&lfs_lock);
	lfs_freebuf(fs, bp);
	KERNEL_UNLOCK_LAST(curlwp);
}

static void
lfs_cluster_aiodone(struct buf *bp)
{
	struct lfs_cluster *cl;
	struct lfs *fs;
	struct buf *tbp, *fbp;
	struct vnode *vp, *devvp, *ovp;
	struct inode *ip;
	int error;

	KERNEL_LOCK(1, curlwp);

	error = bp->b_error;
	cl = bp->b_private;
	fs = cl->fs;
	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
	ASSERT_NO_SEGLOCK(fs);

	/* Put the pages back, and release the buffer */
	while (cl->bufcount--) {
		tbp = cl->bpp[cl->bufcount];
		KASSERT(tbp->b_cflags & BC_BUSY);
		if (error) {
			tbp->b_error = error;
		}

		/*
		 * We're done with tbp.  If it has not been re-dirtied since
		 * the cluster was written, free it.  Otherwise, keep it on
		 * the locked list to be written again.
		 */
		vp = tbp->b_vp;

		tbp->b_flags &= ~B_GATHERED;

		LFS_BCLEAN_LOG(fs, tbp);

		mutex_enter(&bufcache_lock);
		if (tbp->b_iodone == NULL) {
			KASSERT(tbp->b_flags & B_LOCKED);
			bremfree(tbp);
			if (vp) {
				mutex_enter(vp->v_interlock);
				reassignbuf(tbp, vp);
				mutex_exit(vp->v_interlock);
			}
			tbp->b_flags |= B_ASYNC; /* for biodone */
		}

		if (((tbp->b_flags | tbp->b_oflags) &
		    (B_LOCKED | BO_DELWRI)) == B_LOCKED)
			LFS_UNLOCK_BUF(tbp);

		if (tbp->b_oflags & BO_DONE) {
			DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n",
			    cl->bufcount, (long)tbp->b_flags));
		}

		if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) {
			/*
			 * A buffer from the page daemon.
			 * We use the same iodone as it does,
			 * so we must manually disassociate its
			 * buffers from the vp.
			 */
			if ((ovp = tbp->b_vp) != NULL) {
				/* This is just silly */
				mutex_enter(ovp->v_interlock);
				brelvp(tbp);
				mutex_exit(ovp->v_interlock);
				tbp->b_vp = vp;
				tbp->b_objlock = vp->v_interlock;
			}
			/* Put it back the way it was */
			tbp->b_flags |= B_ASYNC;
			/* Master buffers have BC_AGE */
			if (tbp->b_private == tbp)
				tbp->b_cflags |= BC_AGE;
		}
		mutex_exit(&bufcache_lock);

		biodone(tbp);

		/*
		 * If this is the last block for this vnode, but
		 * there are other blocks on its dirty list,
		 * set IN_MODIFIED/IN_CLEANING depending on what
		 * sort of block.  Only do this for our mount point,
		 * not for, e.g., inode blocks that are attached to
		 * the devvp.
		 * XXX KS - Shouldn't we set *both* if both types
		 * of blocks are present (traverse the dirty list?)
		 */
		mutex_enter(vp->v_interlock);
		mutex_enter(&lfs_lock);
		if (vp != devvp && vp->v_numoutput == 0 &&
		    (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) {
			ip = VTOI(vp);
			DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n",
			    ip->i_number));
			if (LFS_IS_MALLOC_BUF(fbp))
				LFS_SET_UINO(ip, IN_CLEANING);
			else
				LFS_SET_UINO(ip, IN_MODIFIED);
		}
		cv_broadcast(&vp->v_cv);
		mutex_exit(&lfs_lock);
		mutex_exit(vp->v_interlock);
	}

	/* Fix up the cluster buffer, and release it */
	if (cl->flags & LFS_CL_MALLOC)
		lfs_free(fs, bp->b_data, LFS_NB_CLUSTER);
	putiobuf(bp);

	/* Note i/o done */
	if (cl->flags & LFS_CL_SYNC) {
		if (--cl->seg->seg_iocount == 0)
			wakeup(&cl->seg->seg_iocount);
	}
	mutex_enter(&lfs_lock);
#ifdef DIAGNOSTIC
	if (fs->lfs_iocount == 0)
		panic("lfs_cluster_aiodone: zero iocount");
#endif
	if (--fs->lfs_iocount <= 1)
		wakeup(&fs->lfs_iocount);
	mutex_exit(&lfs_lock);

	KERNEL_UNLOCK_LAST(curlwp);

	pool_put(&fs->lfs_bpppool, cl->bpp);
	cl->bpp = NULL;
	pool_put(&fs->lfs_clpool, cl);
}

static void
lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *))
{
	/* reset b_iodone for when this is a single-buf i/o. */
	bp->b_iodone = aiodone;

	workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL);
}

static void
lfs_cluster_callback(struct buf *bp)
{

	lfs_generic_callback(bp, lfs_cluster_aiodone);
}

void
lfs_supercallback(struct buf *bp)
{

	lfs_generic_callback(bp, lfs_super_aiodone);
}

/*
 * The only buffers that are going to hit these functions are the
 * segment write blocks, or the segment summaries, or the superblocks.
 *
 * All of the above are created by lfs_newbuf, and so do not need to be
 * released via brelse.
 */
void
lfs_callback(struct buf *bp)
{

	lfs_generic_callback(bp, lfs_free_aiodone);
}

/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we cast the array
 * of logical block numbers to unsigned in this routine so that the
 * negative block numbers (metadata blocks) sort AFTER the data blocks.
 */
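
/*
 * E.g. lbn -12 becomes 0xfffffff4 as a u_int32_t, so it sorts after
 * every data-block lbn in the range 0..0x7fffffff.
 */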

void
lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size)
{
	static int __rsshell_increments[] = { 4, 1, 0 };
	int incr, *incrp, t1, t2;
	struct buf *bp_temp;

#ifdef DEBUG
	incr = 0;
	for (t1 = 0; t1 < nmemb; t1++) {
		for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
			if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) {
				/* dump before panic */
				printf("lfs_shellsort: nmemb=%d, size=%d\n",
				    nmemb, size);
				incr = 0;
				for (t1 = 0; t1 < nmemb; t1++) {
					const struct buf *bp = bp_array[t1];

					printf("bp[%d]: lbn=%" PRIu64 ", size=%"
					    PRIu64 "\n", t1,
					    (uint64_t)bp->b_lblkno,
					    (uint64_t)bp->b_bcount);
					printf("lbns:");
					for (t2 = 0; t2 * size < bp->b_bcount;
					     t2++) {
						printf(" %" PRId32,
						    lb_array[incr++]);
					}
					printf("\n");
				}
				panic("lfs_shellsort: inconsistent input");
			}
		}
	}
#endif

	for (incrp = __rsshell_increments; (incr = *incrp++) != 0;)
		for (t1 = incr; t1 < nmemb; ++t1)
			for (t2 = t1 - incr; t2 >= 0;)
				if ((u_int32_t)bp_array[t2]->b_lblkno >
				    (u_int32_t)bp_array[t2 + incr]->b_lblkno) {
					bp_temp = bp_array[t2];
					bp_array[t2] = bp_array[t2 + incr];
					bp_array[t2 + incr] = bp_temp;
					t2 -= incr;
				} else
					break;

	/* Reform the list of logical blocks */
	incr = 0;
	for (t1 = 0; t1 < nmemb; t1++) {
		for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
			lb_array[incr++] = bp_array[t1]->b_lblkno + t2;
		}
	}
}

/*
 * Call vget with LK_NOWAIT.  If we are the one who holds VI_XLOCK,
 * however, we must press on.  Just fake success in that case.
 */
int
lfs_vref(struct vnode *vp)
{
	struct lfs *fs;

	KASSERT(mutex_owned(vp->v_interlock));

	fs = VTOI(vp)->i_lfs;

	ASSERT_MAYBE_SEGLOCK(fs);

	/*
	 * If we return nonzero here during a flush, we risk vinvalbuf()
	 * not being able to flush all of the pages from this vnode, which
	 * will cause it to panic.  So, return 0 if a flush is in progress.
	 */
	if (IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
		++fs->lfs_flushvp_fakevref;
		mutex_exit(vp->v_interlock);
		return 0;
	}

	return vget(vp, LK_NOWAIT);
}

/*
 * This is vrele except that we do not want to VOP_INACTIVE this vnode.  We
 * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end.
 */
void
lfs_vunref(struct vnode *vp)
{
	struct lfs *fs;

	fs = VTOI(vp)->i_lfs;
	ASSERT_MAYBE_SEGLOCK(fs);

	/*
	 * Analogous to lfs_vref, if the node is flushing, fake it.
	 */
	if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) {
		--fs->lfs_flushvp_fakevref;
		return;
	}

	/* does not call inactive XXX sure it does XXX */
	vrele(vp);
}
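
/*
 * The usual pairing, roughly as lfs_writevnodes() uses it (sketch only;
 * lfs_vref() is entered with v_interlock held and releases it on both
 * the success and failure paths):
 */
#if 0
	mutex_enter(vp->v_interlock);
	if (lfs_vref(vp) != 0)
		continue;	/* vnode is being recycled; skip it */
	/* ... write the vnode's dirty blocks ... */
	lfs_vunref(vp);
#endif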

/*
 * We use this when we have vnodes that were loaded in solely for cleaning.
 * There is no reason to believe that these vnodes will be referenced again
 * soon, since the cleaning process is unrelated to normal filesystem
 * activity.  Putting cleaned vnodes at the tail of the list has the effect
 * of flushing the vnode LRU.  So, put vnodes that were loaded only for
 * cleaning at the head of the list, instead.
 */
void
lfs_vunref_head(struct vnode *vp)
{

	ASSERT_SEGLOCK(VTOI(vp)->i_lfs);

	/* does not call inactive XXX sure it does XXX,
	   inserts non-held vnode at head of freelist */
	vrele(vp);
}

/*
 * Set up an FINFO entry for a new file.  The fip pointer is assumed to
 * point at uninitialized space.
 */
void
lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers)
{
	struct segment *sp = fs->lfs_sp;

	KASSERT(vers > 0);

	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo))
		(void) lfs_writeseg(fs, fs->lfs_sp);

	sp->sum_bytes_left -= FINFOSIZE;
	++((SEGSUM *)(sp->segsum))->ss_nfinfo;
	sp->fip->fi_nblocks = 0;
	sp->fip->fi_ino = ino;
	sp->fip->fi_version = vers;
}

/*
 * Release the FINFO entry, either clearing out an unused entry or
 * advancing us to the next available entry.
 */
void
lfs_release_finfo(struct lfs *fs)
{
	struct segment *sp = fs->lfs_sp;

	if (sp->fip->fi_nblocks != 0) {
		sp->fip = (FINFO *)((char *)sp->fip + FINFOSIZE +
		    sizeof(int32_t) * sp->fip->fi_nblocks);
		sp->start_lbp = &sp->fip->fi_blocks[0];
	} else {
		sp->sum_bytes_left += FINFOSIZE;
		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
	}
}
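
/*
 * lfs_acquire_finfo() and lfs_release_finfo() bracket the writing of a
 * single file's blocks, roughly as lfs_writefile() uses them (sketch
 * only):
 */
#if 0
	lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
	/* ... gather and write the file's dirty blocks ... */
	lfs_release_finfo(fs);
#endif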