1 /* $NetBSD: lfs_segment.c,v 1.209 2008/02/15 13:30:56 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 /* 39 * Copyright (c) 1991, 1993 40 * The Regents of the University of California. All rights reserved. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)lfs_segment.c 8.10 (Berkeley) 6/10/95 67 */ 68 69 #include <sys/cdefs.h> 70 __KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.209 2008/02/15 13:30:56 ad Exp $"); 71 72 #ifdef DEBUG 73 # define vndebug(vp, str) do { \ 74 if (VTOI(vp)->i_flag & IN_CLEANING) \ 75 DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \ 76 VTOI(vp)->i_number, (str), op)); \ 77 } while(0) 78 #else 79 # define vndebug(vp, str) 80 #endif 81 #define ivndebug(vp, str) \ 82 DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str))) 83 84 #if defined(_KERNEL_OPT) 85 #include "opt_ddb.h" 86 #endif 87 88 #include <sys/param.h> 89 #include <sys/systm.h> 90 #include <sys/namei.h> 91 #include <sys/kernel.h> 92 #include <sys/resourcevar.h> 93 #include <sys/file.h> 94 #include <sys/stat.h> 95 #include <sys/buf.h> 96 #include <sys/proc.h> 97 #include <sys/vnode.h> 98 #include <sys/mount.h> 99 #include <sys/kauth.h> 100 #include <sys/syslog.h> 101 102 #include <miscfs/specfs/specdev.h> 103 #include <miscfs/fifofs/fifo.h> 104 105 #include <ufs/ufs/inode.h> 106 #include <ufs/ufs/dir.h> 107 #include <ufs/ufs/ufsmount.h> 108 #include <ufs/ufs/ufs_extern.h> 109 110 #include <ufs/lfs/lfs.h> 111 #include <ufs/lfs/lfs_extern.h> 112 113 #include <uvm/uvm.h> 114 #include <uvm/uvm_extern.h> 115 116 MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS"); 117 118 extern int count_lock_queue(void); 119 extern kmutex_t vnode_free_list_lock; /* XXX */ 120 121 static void lfs_generic_callback(struct buf *, void (*)(struct buf *)); 122 static void lfs_free_aiodone(struct buf *); 123 static void lfs_super_aiodone(struct buf *); 124 static void lfs_cluster_aiodone(struct buf *); 125 static void lfs_cluster_callback(struct buf *); 126 127 /* 128 * Determine if it's OK to start a partial in this segment, or if we need 129 * to go on to a new segment. 130 */ 131 #define LFS_PARTIAL_FITS(fs) \ 132 ((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ 133 fragstofsb((fs), (fs)->lfs_frag)) 134 135 /* 136 * Figure out whether we should do a checkpoint write or go ahead with 137 * an ordinary write. 138 */ 139 #define LFS_SHOULD_CHECKPOINT(fs, flags) \ 140 ((flags & SEGM_CLEAN) == 0 && \ 141 ((fs->lfs_nactive > LFS_MAX_ACTIVE || \ 142 (flags & SEGM_CKP) || \ 143 fs->lfs_nclean < LFS_MAX_ACTIVE))) 144 145 int lfs_match_fake(struct lfs *, struct buf *); 146 void lfs_newseg(struct lfs *); 147 /* XXX ondisk32 */ 148 void lfs_shellsort(struct buf **, int32_t *, int, int); 149 void lfs_supercallback(struct buf *); 150 void lfs_updatemeta(struct segment *); 151 void lfs_writesuper(struct lfs *, daddr_t); 152 int lfs_writevnodes(struct lfs *fs, struct mount *mp, 153 struct segment *sp, int dirops); 154 155 int lfs_allclean_wakeup; /* Cleaner wakeup address. 
*/ 156 int lfs_writeindir = 1; /* whether to flush indir on non-ckp */ 157 int lfs_clean_vnhead = 0; /* Allow freeing to head of vn list */ 158 int lfs_dirvcount = 0; /* # active dirops */ 159 160 /* Statistics Counters */ 161 int lfs_dostats = 1; 162 struct lfs_stats lfs_stats; 163 164 /* op values to lfs_writevnodes */ 165 #define VN_REG 0 166 #define VN_DIROP 1 167 #define VN_EMPTY 2 168 #define VN_CLEAN 3 169 170 /* 171 * XXX KS - Set modification time on the Ifile, so the cleaner can 172 * read the fs mod time off of it. We don't set IN_UPDATE here, 173 * since we don't really need this to be flushed to disk (and in any 174 * case that wouldn't happen to the Ifile until we checkpoint). 175 */ 176 void 177 lfs_imtime(struct lfs *fs) 178 { 179 struct timespec ts; 180 struct inode *ip; 181 182 ASSERT_MAYBE_SEGLOCK(fs); 183 vfs_timestamp(&ts); 184 ip = VTOI(fs->lfs_ivnode); 185 ip->i_ffs1_mtime = ts.tv_sec; 186 ip->i_ffs1_mtimensec = ts.tv_nsec; 187 } 188 189 /* 190 * Ifile and meta data blocks are not marked busy, so segment writes MUST be 191 * single threaded. Currently, there are two paths into lfs_segwrite, sync() 192 * and getnewbuf(). They both mark the file system busy. Lfs_vflush() 193 * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 194 */ 195 196 #define IS_FLUSHING(fs,vp) ((fs)->lfs_flushvp == (vp)) 197 198 int 199 lfs_vflush(struct vnode *vp) 200 { 201 struct inode *ip; 202 struct lfs *fs; 203 struct segment *sp; 204 struct buf *bp, *nbp, *tbp, *tnbp; 205 int error; 206 int flushed; 207 int relock; 208 int loopcount; 209 210 ip = VTOI(vp); 211 fs = VFSTOUFS(vp->v_mount)->um_lfs; 212 relock = 0; 213 214 top: 215 ASSERT_NO_SEGLOCK(fs); 216 if (ip->i_flag & IN_CLEANING) { 217 ivndebug(vp,"vflush/in_cleaning"); 218 mutex_enter(&lfs_lock); 219 LFS_CLR_UINO(ip, IN_CLEANING); 220 LFS_SET_UINO(ip, IN_MODIFIED); 221 mutex_exit(&lfs_lock); 222 223 /* 224 * Toss any cleaning buffers that have real counterparts 225 * to avoid losing new data. 226 */ 227 mutex_enter(&vp->v_interlock); 228 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 229 nbp = LIST_NEXT(bp, b_vnbufs); 230 if (!LFS_IS_MALLOC_BUF(bp)) 231 continue; 232 /* 233 * Look for pages matching the range covered 234 * by cleaning blocks. It's okay if more dirty 235 * pages appear, so long as none disappear out 236 * from under us. 
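 *
 * Concretely, the two checks below discard a cleaning buffer when
 * either (a) a page in its logical range is no longer clean (or is
 * pmap-modified), for regular-file data blocks, or (b) another dirty
 * buffer with the same lblkno is already on the vnode's dirty list.
 * In both cases the buffer's space is returned to lfs_avail and any
 * waiters on lfs_avail are woken.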
237 */ 238 if (bp->b_lblkno > 0 && vp->v_type == VREG && 239 vp != fs->lfs_ivnode) { 240 struct vm_page *pg; 241 voff_t off; 242 243 for (off = lblktosize(fs, bp->b_lblkno); 244 off < lblktosize(fs, bp->b_lblkno + 1); 245 off += PAGE_SIZE) { 246 pg = uvm_pagelookup(&vp->v_uobj, off); 247 if (pg == NULL) 248 continue; 249 if ((pg->flags & PG_CLEAN) == 0 || 250 pmap_is_modified(pg)) { 251 fs->lfs_avail += btofsb(fs, 252 bp->b_bcount); 253 wakeup(&fs->lfs_avail); 254 mutex_exit(&vp->v_interlock); 255 lfs_freebuf(fs, bp); 256 mutex_enter(&vp->v_interlock); 257 bp = NULL; 258 break; 259 } 260 } 261 } 262 for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp; 263 tbp = tnbp) 264 { 265 tnbp = LIST_NEXT(tbp, b_vnbufs); 266 if (tbp->b_vp == bp->b_vp 267 && tbp->b_lblkno == bp->b_lblkno 268 && tbp != bp) 269 { 270 fs->lfs_avail += btofsb(fs, 271 bp->b_bcount); 272 wakeup(&fs->lfs_avail); 273 mutex_exit(&vp->v_interlock); 274 lfs_freebuf(fs, bp); 275 mutex_enter(&vp->v_interlock); 276 bp = NULL; 277 break; 278 } 279 } 280 } 281 } else { 282 mutex_enter(&vp->v_interlock); 283 } 284 285 /* If the node is being written, wait until that is done */ 286 while (WRITEINPROG(vp)) { 287 ivndebug(vp,"vflush/writeinprog"); 288 cv_wait(&vp->v_cv, &vp->v_interlock); 289 } 290 mutex_exit(&vp->v_interlock); 291 292 /* Protect against VI_XLOCK deadlock in vinvalbuf() */ 293 lfs_seglock(fs, SEGM_SYNC); 294 295 /* If we're supposed to flush a freed inode, just toss it */ 296 if (ip->i_lfs_iflags & LFSI_DELETED) { 297 DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n", 298 ip->i_number)); 299 /* Drain v_numoutput */ 300 mutex_enter(&vp->v_interlock); 301 while (vp->v_numoutput > 0) { 302 cv_wait(&vp->v_cv, &vp->v_interlock); 303 } 304 KASSERT(vp->v_numoutput == 0); 305 mutex_exit(&vp->v_interlock); 306 307 mutex_enter(&bufcache_lock); 308 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 309 nbp = LIST_NEXT(bp, b_vnbufs); 310 311 KASSERT((bp->b_flags & B_GATHERED) == 0); 312 if (bp->b_oflags & BO_DELWRI) { /* XXX always true? 
*/ 313 fs->lfs_avail += btofsb(fs, bp->b_bcount); 314 wakeup(&fs->lfs_avail); 315 } 316 /* Copied from lfs_writeseg */ 317 if (bp->b_iodone != NULL) { 318 mutex_exit(&bufcache_lock); 319 biodone(bp); 320 mutex_enter(&bufcache_lock); 321 } else { 322 bremfree(bp); 323 LFS_UNLOCK_BUF(bp); 324 mutex_enter(&vp->v_interlock); 325 bp->b_flags &= ~(B_READ | B_GATHERED); 326 bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE; 327 bp->b_error = 0; 328 reassignbuf(bp, vp); 329 mutex_exit(&vp->v_interlock); 330 brelse(bp, 0); 331 } 332 } 333 mutex_exit(&bufcache_lock); 334 LFS_CLR_UINO(ip, IN_CLEANING); 335 LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED); 336 ip->i_flag &= ~IN_ALLMOD; 337 DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n", 338 ip->i_number)); 339 lfs_segunlock(fs); 340 341 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 342 343 return 0; 344 } 345 346 fs->lfs_flushvp = vp; 347 if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) { 348 error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC); 349 fs->lfs_flushvp = NULL; 350 KASSERT(fs->lfs_flushvp_fakevref == 0); 351 lfs_segunlock(fs); 352 353 /* Make sure that any pending buffers get written */ 354 mutex_enter(&vp->v_interlock); 355 while (vp->v_numoutput > 0) { 356 cv_wait(&vp->v_cv, &vp->v_interlock); 357 } 358 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 359 KASSERT(vp->v_numoutput == 0); 360 mutex_exit(&vp->v_interlock); 361 362 return error; 363 } 364 sp = fs->lfs_sp; 365 366 flushed = 0; 367 if (VPISEMPTY(vp)) { 368 lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); 369 ++flushed; 370 } else if ((ip->i_flag & IN_CLEANING) && 371 (fs->lfs_sp->seg_flags & SEGM_CLEAN)) { 372 ivndebug(vp,"vflush/clean"); 373 lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN); 374 ++flushed; 375 } else if (lfs_dostats) { 376 if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD)) 377 ++lfs_stats.vflush_invoked; 378 ivndebug(vp,"vflush"); 379 } 380 381 #ifdef DIAGNOSTIC 382 if (vp->v_uflag & VU_DIROP) { 383 DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n")); 384 /* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */ 385 } 386 if (vp->v_usecount < 0) { 387 printf("usecount=%ld\n", (long)vp->v_usecount); 388 panic("lfs_vflush: usecount<0"); 389 } 390 #endif 391 392 do { 393 loopcount = 0; 394 do { 395 if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { 396 relock = lfs_writefile(fs, sp, vp); 397 if (relock) { 398 /* 399 * Might have to wait for the 400 * cleaner to run; but we're 401 * still not done with this vnode. 402 */ 403 KDASSERT(ip->i_number != LFS_IFILE_INUM); 404 lfs_writeinode(fs, sp, ip); 405 mutex_enter(&lfs_lock); 406 LFS_SET_UINO(ip, IN_MODIFIED); 407 mutex_exit(&lfs_lock); 408 lfs_writeseg(fs, sp); 409 lfs_segunlock(fs); 410 lfs_segunlock_relock(fs); 411 goto top; 412 } 413 } 414 /* 415 * If we begin a new segment in the middle of writing 416 * the Ifile, it creates an inconsistent checkpoint, 417 * since the Ifile information for the new segment 418 * is not up-to-date. Take care of this here by 419 * sending the Ifile through again in case there 420 * are newly dirtied blocks. But wait, there's more! 421 * This second Ifile write could *also* cross a segment 422 * boundary, if the first one was large. The second 423 * one is guaranteed to be no more than 8 blocks, 424 * though (two segment blocks and supporting indirects) 425 * so the third write *will not* cross the boundary. 
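 *
 * (The two extra lfs_writefile() calls just below are the second and
 * third writes described here; the first pass over the Ifile is the
 * lfs_writefile() call at the top of this loop, made when the vnode
 * still had dirty buffers.)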
426 */ 427 if (vp == fs->lfs_ivnode) { 428 lfs_writefile(fs, sp, vp); 429 lfs_writefile(fs, sp, vp); 430 } 431 #ifdef DEBUG 432 if (++loopcount > 2) 433 log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount); 434 #endif 435 } while (lfs_writeinode(fs, sp, ip)); 436 } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); 437 438 if (lfs_dostats) { 439 ++lfs_stats.nwrites; 440 if (sp->seg_flags & SEGM_SYNC) 441 ++lfs_stats.nsync_writes; 442 if (sp->seg_flags & SEGM_CKP) 443 ++lfs_stats.ncheckpoints; 444 } 445 /* 446 * If we were called from somewhere that has already held the seglock 447 * (e.g., lfs_markv()), the lfs_segunlock will not wait for 448 * the write to complete because we are still locked. 449 * Since lfs_vflush() must return the vnode with no dirty buffers, 450 * we must explicitly wait, if that is the case. 451 * 452 * We compare the iocount against 1, not 0, because it is 453 * artificially incremented by lfs_seglock(). 454 */ 455 mutex_enter(&lfs_lock); 456 if (fs->lfs_seglock > 1) { 457 while (fs->lfs_iocount > 1) 458 (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1, 459 "lfs_vflush", 0, &lfs_lock); 460 } 461 mutex_exit(&lfs_lock); 462 463 lfs_segunlock(fs); 464 465 /* Wait for these buffers to be recovered by aiodoned */ 466 mutex_enter(&vp->v_interlock); 467 while (vp->v_numoutput > 0) { 468 cv_wait(&vp->v_cv, &vp->v_interlock); 469 } 470 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 471 KASSERT(vp->v_numoutput == 0); 472 mutex_exit(&vp->v_interlock); 473 474 fs->lfs_flushvp = NULL; 475 KASSERT(fs->lfs_flushvp_fakevref == 0); 476 477 return (0); 478 } 479 480 int 481 lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op) 482 { 483 struct inode *ip; 484 struct vnode *vp; 485 int inodes_written = 0, only_cleaning; 486 int error = 0; 487 488 ASSERT_SEGLOCK(fs); 489 loop: 490 /* start at last (newest) vnode. */ 491 mutex_enter(&mntvnode_lock); 492 TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) { 493 /* 494 * If the vnode that we are about to sync is no longer 495 * associated with this mount point, start over. 496 */ 497 if (vp->v_mount != mp) { 498 DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n")); 499 /* 500 * After this, pages might be busy 501 * due to our own previous putpages. 502 * Start actual segment write here to avoid deadlock. 503 */ 504 mutex_exit(&mntvnode_lock); 505 (void)lfs_writeseg(fs, sp); 506 goto loop; 507 } 508 509 mutex_enter(&vp->v_interlock); 510 if (vp->v_type == VNON || vismarker(vp) || 511 (vp->v_iflag & VI_CLEAN) != 0) { 512 mutex_exit(&vp->v_interlock); 513 continue; 514 } 515 516 ip = VTOI(vp); 517 if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) || 518 (op != VN_DIROP && op != VN_CLEAN && 519 (vp->v_uflag & VU_DIROP))) { 520 mutex_exit(&vp->v_interlock); 521 vndebug(vp,"dirop"); 522 continue; 523 } 524 525 if (op == VN_EMPTY && !VPISEMPTY(vp)) { 526 mutex_exit(&vp->v_interlock); 527 vndebug(vp,"empty"); 528 continue; 529 } 530 531 if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM 532 && vp != fs->lfs_flushvp 533 && !(ip->i_flag & IN_CLEANING)) { 534 mutex_exit(&vp->v_interlock); 535 vndebug(vp,"cleaning"); 536 continue; 537 } 538 539 mutex_exit(&mntvnode_lock); 540 if (lfs_vref(vp)) { 541 vndebug(vp,"vref"); 542 mutex_enter(&mntvnode_lock); 543 continue; 544 } 545 546 only_cleaning = 0; 547 /* 548 * Write the inode/file if dirty and it's not the IFILE. 
549 */ 550 if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) { 551 only_cleaning = 552 ((ip->i_flag & IN_ALLMOD) == IN_CLEANING); 553 554 if (ip->i_number != LFS_IFILE_INUM) { 555 error = lfs_writefile(fs, sp, vp); 556 if (error) { 557 lfs_vunref(vp); 558 if (error == EAGAIN) { 559 /* 560 * This error from lfs_putpages 561 * indicates we need to drop 562 * the segment lock and start 563 * over after the cleaner has 564 * had a chance to run. 565 */ 566 lfs_writeinode(fs, sp, ip); 567 lfs_writeseg(fs, sp); 568 if (!VPISEMPTY(vp) && 569 !WRITEINPROG(vp) && 570 !(ip->i_flag & IN_ALLMOD)) { 571 mutex_enter(&lfs_lock); 572 LFS_SET_UINO(ip, IN_MODIFIED); 573 mutex_exit(&lfs_lock); 574 } 575 mutex_enter(&mntvnode_lock); 576 break; 577 } 578 error = 0; /* XXX not quite right */ 579 mutex_enter(&mntvnode_lock); 580 continue; 581 } 582 583 if (!VPISEMPTY(vp)) { 584 if (WRITEINPROG(vp)) { 585 ivndebug(vp,"writevnodes/write2"); 586 } else if (!(ip->i_flag & IN_ALLMOD)) { 587 mutex_enter(&lfs_lock); 588 LFS_SET_UINO(ip, IN_MODIFIED); 589 mutex_exit(&lfs_lock); 590 } 591 } 592 (void) lfs_writeinode(fs, sp, ip); 593 inodes_written++; 594 } 595 } 596 597 if (lfs_clean_vnhead && only_cleaning) 598 lfs_vunref_head(vp); 599 else 600 lfs_vunref(vp); 601 602 mutex_enter(&mntvnode_lock); 603 } 604 mutex_exit(&mntvnode_lock); 605 return error; 606 } 607 608 /* 609 * Do a checkpoint. 610 */ 611 int 612 lfs_segwrite(struct mount *mp, int flags) 613 { 614 struct buf *bp; 615 struct inode *ip; 616 struct lfs *fs; 617 struct segment *sp; 618 struct vnode *vp; 619 SEGUSE *segusep; 620 int do_ckp, did_ckp, error; 621 unsigned n, segleft, maxseg, sn, i, curseg; 622 int writer_set = 0; 623 int dirty; 624 int redo; 625 int um_error; 626 int loopcount; 627 628 fs = VFSTOUFS(mp)->um_lfs; 629 ASSERT_MAYBE_SEGLOCK(fs); 630 631 if (fs->lfs_ronly) 632 return EROFS; 633 634 lfs_imtime(fs); 635 636 /* 637 * Allocate a segment structure and enough space to hold pointers to 638 * the maximum possible number of buffers which can be described in a 639 * single summary block. 640 */ 641 do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags); 642 643 lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0)); 644 sp = fs->lfs_sp; 645 if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP)) 646 do_ckp = 1; 647 648 /* 649 * If lfs_flushvp is non-NULL, we are called from lfs_vflush, 650 * in which case we have to flush *all* buffers off of this vnode. 651 * We don't care about other nodes, but write any non-dirop nodes 652 * anyway in anticipation of another getnewvnode(). 653 * 654 * If we're cleaning we only write cleaning and ifile blocks, and 655 * no dirops, since otherwise we'd risk corruption in a crash. 656 */ 657 if (sp->seg_flags & SEGM_CLEAN) 658 lfs_writevnodes(fs, mp, sp, VN_CLEAN); 659 else if (!(sp->seg_flags & SEGM_FORCE_CKP)) { 660 do { 661 um_error = lfs_writevnodes(fs, mp, sp, VN_REG); 662 663 if (do_ckp || fs->lfs_dirops == 0) { 664 if (!writer_set) { 665 lfs_writer_enter(fs, "lfs writer"); 666 writer_set = 1; 667 } 668 error = lfs_writevnodes(fs, mp, sp, VN_DIROP); 669 if (um_error == 0) 670 um_error = error; 671 /* In case writevnodes errored out */ 672 lfs_flush_dirops(fs); 673 ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT); 674 lfs_finalize_fs_seguse(fs); 675 } 676 if (do_ckp && um_error) { 677 lfs_segunlock_relock(fs); 678 sp = fs->lfs_sp; 679 } 680 } while (do_ckp && um_error != 0); 681 } 682 683 /* 684 * If we are doing a checkpoint, mark everything since the 685 * last checkpoint as no longer ACTIVE. 
686 */ 687 if (do_ckp || fs->lfs_doifile) { 688 segleft = fs->lfs_nseg; 689 curseg = 0; 690 for (n = 0; n < fs->lfs_segtabsz; n++) { 691 dirty = 0; 692 if (bread(fs->lfs_ivnode, 693 fs->lfs_cleansz + n, fs->lfs_bsize, NOCRED, &bp)) 694 panic("lfs_segwrite: ifile read"); 695 segusep = (SEGUSE *)bp->b_data; 696 maxseg = min(segleft, fs->lfs_sepb); 697 for (i = 0; i < maxseg; i++) { 698 sn = curseg + i; 699 if (sn != dtosn(fs, fs->lfs_curseg) && 700 segusep->su_flags & SEGUSE_ACTIVE) { 701 segusep->su_flags &= ~SEGUSE_ACTIVE; 702 --fs->lfs_nactive; 703 ++dirty; 704 } 705 fs->lfs_suflags[fs->lfs_activesb][sn] = 706 segusep->su_flags; 707 if (fs->lfs_version > 1) 708 ++segusep; 709 else 710 segusep = (SEGUSE *) 711 ((SEGUSE_V1 *)segusep + 1); 712 } 713 714 if (dirty) 715 error = LFS_BWRITE_LOG(bp); /* Ifile */ 716 else 717 brelse(bp, 0); 718 segleft -= fs->lfs_sepb; 719 curseg += fs->lfs_sepb; 720 } 721 } 722 723 KASSERT(LFS_SEGLOCK_HELD(fs)); 724 725 did_ckp = 0; 726 if (do_ckp || fs->lfs_doifile) { 727 vp = fs->lfs_ivnode; 728 vn_lock(vp, LK_EXCLUSIVE); 729 loopcount = 0; 730 do { 731 #ifdef DEBUG 732 LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid); 733 #endif 734 mutex_enter(&lfs_lock); 735 fs->lfs_flags &= ~LFS_IFDIRTY; 736 mutex_exit(&lfs_lock); 737 738 ip = VTOI(vp); 739 740 if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { 741 /* 742 * Ifile has no pages, so we don't need 743 * to check error return here. 744 */ 745 lfs_writefile(fs, sp, vp); 746 /* 747 * Ensure the Ifile takes the current segment 748 * into account. See comment in lfs_vflush. 749 */ 750 lfs_writefile(fs, sp, vp); 751 lfs_writefile(fs, sp, vp); 752 } 753 754 if (ip->i_flag & IN_ALLMOD) 755 ++did_ckp; 756 #if 0 757 redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0); 758 #else 759 redo = lfs_writeinode(fs, sp, ip); 760 #endif 761 redo += lfs_writeseg(fs, sp); 762 mutex_enter(&lfs_lock); 763 redo += (fs->lfs_flags & LFS_IFDIRTY); 764 mutex_exit(&lfs_lock); 765 #ifdef DEBUG 766 if (++loopcount > 2) 767 log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n", 768 loopcount); 769 #endif 770 } while (redo && do_ckp); 771 772 /* 773 * Unless we are unmounting, the Ifile may continue to have 774 * dirty blocks even after a checkpoint, due to changes to 775 * inodes' atime. If we're checkpointing, it's "impossible" 776 * for other parts of the Ifile to be dirty after the loop 777 * above, since we hold the segment lock. 778 */ 779 mutex_enter(&vp->v_interlock); 780 if (LIST_EMPTY(&vp->v_dirtyblkhd)) { 781 LFS_CLR_UINO(ip, IN_ALLMOD); 782 } 783 #ifdef DIAGNOSTIC 784 else if (do_ckp) { 785 int do_panic = 0; 786 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 787 if (bp->b_lblkno < fs->lfs_cleansz + 788 fs->lfs_segtabsz && 789 !(bp->b_flags & B_GATHERED)) { 790 printf("ifile lbn %ld still dirty (flags %lx)\n", 791 (long)bp->b_lblkno, 792 (long)bp->b_flags); 793 ++do_panic; 794 } 795 } 796 if (do_panic) 797 panic("dirty blocks"); 798 } 799 #endif 800 mutex_exit(&vp->v_interlock); 801 VOP_UNLOCK(vp, 0); 802 } else { 803 (void) lfs_writeseg(fs, sp); 804 } 805 806 /* Note Ifile no longer needs to be written */ 807 fs->lfs_doifile = 0; 808 if (writer_set) 809 lfs_writer_leave(fs); 810 811 /* 812 * If we didn't write the Ifile, we didn't really do anything. 813 * That means that (1) there is a checkpoint on disk and (2) 814 * nothing has changed since it was written. 815 * 816 * Take the flags off of the segment so that lfs_segunlock 817 * doesn't have to write the superblock either. 
818 */ 819 if (do_ckp && !did_ckp) { 820 sp->seg_flags &= ~SEGM_CKP; 821 } 822 823 if (lfs_dostats) { 824 ++lfs_stats.nwrites; 825 if (sp->seg_flags & SEGM_SYNC) 826 ++lfs_stats.nsync_writes; 827 if (sp->seg_flags & SEGM_CKP) 828 ++lfs_stats.ncheckpoints; 829 } 830 lfs_segunlock(fs); 831 return (0); 832 } 833 834 /* 835 * Write the dirty blocks associated with a vnode. 836 */ 837 int 838 lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp) 839 { 840 struct finfo *fip; 841 struct inode *ip; 842 int i, frag; 843 int error; 844 845 ASSERT_SEGLOCK(fs); 846 error = 0; 847 ip = VTOI(vp); 848 849 fip = sp->fip; 850 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen); 851 852 if (vp->v_uflag & VU_DIROP) 853 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); 854 855 if (sp->seg_flags & SEGM_CLEAN) { 856 lfs_gather(fs, sp, vp, lfs_match_fake); 857 /* 858 * For a file being flushed, we need to write *all* blocks. 859 * This means writing the cleaning blocks first, and then 860 * immediately following with any non-cleaning blocks. 861 * The same is true of the Ifile since checkpoints assume 862 * that all valid Ifile blocks are written. 863 */ 864 if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) { 865 lfs_gather(fs, sp, vp, lfs_match_data); 866 /* 867 * Don't call VOP_PUTPAGES: if we're flushing, 868 * we've already done it, and the Ifile doesn't 869 * use the page cache. 870 */ 871 } 872 } else { 873 lfs_gather(fs, sp, vp, lfs_match_data); 874 /* 875 * If we're flushing, we've already called VOP_PUTPAGES 876 * so don't do it again. Otherwise, we want to write 877 * everything we've got. 878 */ 879 if (!IS_FLUSHING(fs, vp)) { 880 mutex_enter(&vp->v_interlock); 881 error = VOP_PUTPAGES(vp, 0, 0, 882 PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED); 883 } 884 } 885 886 /* 887 * It may not be necessary to write the meta-data blocks at this point, 888 * as the roll-forward recovery code should be able to reconstruct the 889 * list. 890 * 891 * We have to write them anyway, though, under two conditions: (1) the 892 * vnode is being flushed (for reuse by vinvalbuf); or (2) we are 893 * checkpointing. 894 * 895 * BUT if we are cleaning, we might have indirect blocks that refer to 896 * new blocks not being written yet, in addition to fragments being 897 * moved out of a cleaned segment. If that is the case, don't 898 * write the indirect blocks, or the finfo will have a small block 899 * in the middle of it! 900 * XXX in this case isn't the inode size wrong too? 901 */ 902 frag = 0; 903 if (sp->seg_flags & SEGM_CLEAN) { 904 for (i = 0; i < NDADDR; i++) 905 if (ip->i_lfs_fragsize[i] > 0 && 906 ip->i_lfs_fragsize[i] < fs->lfs_bsize) 907 ++frag; 908 } 909 #ifdef DIAGNOSTIC 910 if (frag > 1) 911 panic("lfs_writefile: more than one fragment!"); 912 #endif 913 if (IS_FLUSHING(fs, vp) || 914 (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) { 915 lfs_gather(fs, sp, vp, lfs_match_indir); 916 lfs_gather(fs, sp, vp, lfs_match_dindir); 917 lfs_gather(fs, sp, vp, lfs_match_tindir); 918 } 919 fip = sp->fip; 920 lfs_release_finfo(fs); 921 922 return error; 923 } 924 925 /* 926 * Update segment accounting to reflect this inode's change of address. 927 */ 928 static int 929 lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr) 930 { 931 struct buf *bp; 932 daddr_t daddr; 933 IFILE *ifp; 934 SEGUSE *sup; 935 ino_t ino; 936 int redo_ifile, error; 937 u_int32_t sn; 938 939 redo_ifile = 0; 940 941 /* 942 * If updating the ifile, update the super-block. 
Update the disk 943 * address and access times for this inode in the ifile. 944 */ 945 ino = ip->i_number; 946 if (ino == LFS_IFILE_INUM) { 947 daddr = fs->lfs_idaddr; 948 fs->lfs_idaddr = dbtofsb(fs, ndaddr); 949 } else { 950 LFS_IENTRY(ifp, fs, ino, bp); 951 daddr = ifp->if_daddr; 952 ifp->if_daddr = dbtofsb(fs, ndaddr); 953 error = LFS_BWRITE_LOG(bp); /* Ifile */ 954 } 955 956 /* 957 * If this is the Ifile and lfs_offset is set to the first block 958 * in the segment, dirty the new segment's accounting block 959 * (XXX should already be dirty?) and tell the caller to do it again. 960 */ 961 if (ip->i_number == LFS_IFILE_INUM) { 962 sn = dtosn(fs, fs->lfs_offset); 963 if (sntod(fs, sn) + btofsb(fs, fs->lfs_sumsize) == 964 fs->lfs_offset) { 965 LFS_SEGENTRY(sup, fs, sn, bp); 966 KASSERT(bp->b_oflags & BO_DELWRI); 967 LFS_WRITESEGENTRY(sup, fs, sn, bp); 968 /* fs->lfs_flags |= LFS_IFDIRTY; */ 969 redo_ifile |= 1; 970 } 971 } 972 973 /* 974 * The inode's last address should not be in the current partial 975 * segment, except under exceptional circumstances (lfs_writevnodes 976 * had to start over, and in the meantime more blocks were written 977 * to a vnode). Both inodes will be accounted to this segment 978 * in lfs_writeseg so we need to subtract the earlier version 979 * here anyway. The segment count can temporarily dip below 980 * zero here; keep track of how many duplicates we have in 981 * "dupino" so we don't panic below. 982 */ 983 if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) { 984 ++sp->ndupino; 985 DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg " 986 "(ino %d daddr 0x%llx) ndupino=%d\n", ino, 987 (long long)daddr, sp->ndupino)); 988 } 989 /* 990 * Account the inode: it no longer belongs to its former segment, 991 * though it will not belong to the new segment until that segment 992 * is actually written. 993 */ 994 if (daddr != LFS_UNUSED_DADDR) { 995 u_int32_t oldsn = dtosn(fs, daddr); 996 #ifdef DIAGNOSTIC 997 int ndupino = (sp->seg_number == oldsn) ? 
sp->ndupino : 0; 998 #endif 999 LFS_SEGENTRY(sup, fs, oldsn, bp); 1000 #ifdef DIAGNOSTIC 1001 if (sup->su_nbytes + 1002 sizeof (struct ufs1_dinode) * ndupino 1003 < sizeof (struct ufs1_dinode)) { 1004 printf("lfs_writeinode: negative bytes " 1005 "(segment %" PRIu32 " short by %d, " 1006 "oldsn=%" PRIu32 ", cursn=%" PRIu32 1007 ", daddr=%" PRId64 ", su_nbytes=%u, " 1008 "ndupino=%d)\n", 1009 dtosn(fs, daddr), 1010 (int)sizeof (struct ufs1_dinode) * 1011 (1 - sp->ndupino) - sup->su_nbytes, 1012 oldsn, sp->seg_number, daddr, 1013 (unsigned int)sup->su_nbytes, 1014 sp->ndupino); 1015 panic("lfs_writeinode: negative bytes"); 1016 sup->su_nbytes = sizeof (struct ufs1_dinode); 1017 } 1018 #endif 1019 DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n", 1020 dtosn(fs, daddr), sizeof (struct ufs1_dinode), ino)); 1021 sup->su_nbytes -= sizeof (struct ufs1_dinode); 1022 redo_ifile |= 1023 (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); 1024 if (redo_ifile) { 1025 mutex_enter(&lfs_lock); 1026 fs->lfs_flags |= LFS_IFDIRTY; 1027 mutex_exit(&lfs_lock); 1028 /* Don't double-account */ 1029 fs->lfs_idaddr = 0x0; 1030 } 1031 LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */ 1032 } 1033 1034 return redo_ifile; 1035 } 1036 1037 int 1038 lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip) 1039 { 1040 struct buf *bp; 1041 struct ufs1_dinode *cdp; 1042 daddr_t daddr; 1043 int32_t *daddrp; /* XXX ondisk32 */ 1044 int i, ndx; 1045 int redo_ifile = 0; 1046 int gotblk = 0; 1047 int count; 1048 1049 ASSERT_SEGLOCK(fs); 1050 if (!(ip->i_flag & IN_ALLMOD)) 1051 return (0); 1052 1053 /* Can't write ifile when writer is not set */ 1054 KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 || 1055 (sp->seg_flags & SEGM_CLEAN)); 1056 1057 /* 1058 * If this is the Ifile, see if writing it here will generate a 1059 * temporary misaccounting. If it will, do the accounting and write 1060 * the blocks, postponing the inode write until the accounting is 1061 * solid. 1062 */ 1063 count = 0; 1064 while (ip->i_number == LFS_IFILE_INUM) { 1065 int redo = 0; 1066 1067 if (sp->idp == NULL && sp->ibp == NULL && 1068 (sp->seg_bytes_left < fs->lfs_ibsize || 1069 sp->sum_bytes_left < sizeof(int32_t))) { 1070 (void) lfs_writeseg(fs, sp); 1071 continue; 1072 } 1073 1074 /* Look for dirty Ifile blocks */ 1075 LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) { 1076 if (!(bp->b_flags & B_GATHERED)) { 1077 redo = 1; 1078 break; 1079 } 1080 } 1081 1082 if (redo == 0) 1083 redo = lfs_update_iaddr(fs, sp, ip, 0x0); 1084 if (redo == 0) 1085 break; 1086 1087 if (sp->idp) { 1088 sp->idp->di_inumber = 0; 1089 sp->idp = NULL; 1090 } 1091 ++count; 1092 if (count > 2) 1093 log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count); 1094 lfs_writefile(fs, sp, fs->lfs_ivnode); 1095 } 1096 1097 /* Allocate a new inode block if necessary. */ 1098 if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) && 1099 sp->ibp == NULL) { 1100 /* Allocate a new segment if necessary. */ 1101 if (sp->seg_bytes_left < fs->lfs_ibsize || 1102 sp->sum_bytes_left < sizeof(int32_t)) 1103 (void) lfs_writeseg(fs, sp); 1104 1105 /* Get next inode block. 
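 *
 * The block is allocated at the current write pointer (lfs_offset),
 * which then advances past it; its inode slots are zeroed, lfs_avail
 * and the per-segment byte counters are charged, and its disk address
 * is recorded in the table of inode block addresses that grows
 * backward from the end of the segment summary.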
*/ 1106 daddr = fs->lfs_offset; 1107 fs->lfs_offset += btofsb(fs, fs->lfs_ibsize); 1108 sp->ibp = *sp->cbpp++ = 1109 getblk(VTOI(fs->lfs_ivnode)->i_devvp, 1110 fsbtodb(fs, daddr), fs->lfs_ibsize, 0, 0); 1111 gotblk++; 1112 1113 /* Zero out inode numbers */ 1114 for (i = 0; i < INOPB(fs); ++i) 1115 ((struct ufs1_dinode *)sp->ibp->b_data)[i].di_inumber = 1116 0; 1117 1118 ++sp->start_bpp; 1119 fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize); 1120 /* Set remaining space counters. */ 1121 sp->seg_bytes_left -= fs->lfs_ibsize; 1122 sp->sum_bytes_left -= sizeof(int32_t); 1123 ndx = fs->lfs_sumsize / sizeof(int32_t) - 1124 sp->ninodes / INOPB(fs) - 1; 1125 ((int32_t *)(sp->segsum))[ndx] = daddr; 1126 } 1127 1128 /* Check VU_DIROP in case there is a new file with no data blocks */ 1129 if (ITOV(ip)->v_uflag & VU_DIROP) 1130 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); 1131 1132 /* Update the inode times and copy the inode onto the inode page. */ 1133 /* XXX kludge --- don't redirty the ifile just to put times on it */ 1134 if (ip->i_number != LFS_IFILE_INUM) 1135 LFS_ITIMES(ip, NULL, NULL, NULL); 1136 1137 /* 1138 * If this is the Ifile, and we've already written the Ifile in this 1139 * partial segment, just overwrite it (it's not on disk yet) and 1140 * continue. 1141 * 1142 * XXX we know that the bp that we get the second time around has 1143 * already been gathered. 1144 */ 1145 if (ip->i_number == LFS_IFILE_INUM && sp->idp) { 1146 *(sp->idp) = *ip->i_din.ffs1_din; 1147 ip->i_lfs_osize = ip->i_size; 1148 return 0; 1149 } 1150 1151 bp = sp->ibp; 1152 cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs)); 1153 *cdp = *ip->i_din.ffs1_din; 1154 1155 /* 1156 * If cleaning, link counts and directory file sizes cannot change, 1157 * since those would be directory operations---even if the file 1158 * we are writing is marked VU_DIROP we should write the old values. 1159 * If we're not cleaning, of course, update the values so we get 1160 * current values the next time we clean. 1161 */ 1162 if (sp->seg_flags & SEGM_CLEAN) { 1163 if (ITOV(ip)->v_uflag & VU_DIROP) { 1164 cdp->di_nlink = ip->i_lfs_odnlink; 1165 /* if (ITOV(ip)->v_type == VDIR) */ 1166 cdp->di_size = ip->i_lfs_osize; 1167 } 1168 } else { 1169 ip->i_lfs_odnlink = cdp->di_nlink; 1170 ip->i_lfs_osize = ip->i_size; 1171 } 1172 1173 1174 /* We can finish the segment accounting for truncations now */ 1175 lfs_finalize_ino_seguse(fs, ip); 1176 1177 /* 1178 * If we are cleaning, ensure that we don't write UNWRITTEN disk 1179 * addresses to disk; possibly change the on-disk record of 1180 * the inode size, either by reverting to the previous size 1181 * (in the case of cleaning) or by verifying the inode's block 1182 * holdings (in the case of files being allocated as they are being 1183 * written). 1184 * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail 1185 * XXX count on disk wrong by the same amount. We should be 1186 * XXX able to "borrow" from lfs_avail and return it after the 1187 * XXX Ifile is written. See also in lfs_writeseg. 
1188 */ 1189 1190 /* Check file size based on highest allocated block */ 1191 if (((ip->i_ffs1_mode & IFMT) == IFREG || 1192 (ip->i_ffs1_mode & IFMT) == IFDIR) && 1193 ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) { 1194 cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift; 1195 DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %" 1196 PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size)); 1197 } 1198 if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) { 1199 DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)" 1200 " at %x\n", ip->i_number, ip->i_lfs_effnblks, 1201 ip->i_ffs1_blocks, fs->lfs_offset)); 1202 for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR; 1203 daddrp++) { 1204 if (*daddrp == UNWRITTEN) { 1205 DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n")); 1206 *daddrp = 0; 1207 } 1208 } 1209 } 1210 1211 #ifdef DIAGNOSTIC 1212 /* 1213 * Check dinode held blocks against dinode size. 1214 * This should be identical to the check in lfs_vget(). 1215 */ 1216 for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift; 1217 i < NDADDR; i++) { 1218 KASSERT(i >= 0); 1219 if ((cdp->di_mode & IFMT) == IFLNK) 1220 continue; 1221 if (((cdp->di_mode & IFMT) == IFBLK || 1222 (cdp->di_mode & IFMT) == IFCHR) && i == 0) 1223 continue; 1224 if (cdp->di_db[i] != 0) { 1225 # ifdef DEBUG 1226 lfs_dump_dinode(cdp); 1227 # endif 1228 panic("writing inconsistent inode"); 1229 } 1230 } 1231 #endif /* DIAGNOSTIC */ 1232 1233 if (ip->i_flag & IN_CLEANING) 1234 LFS_CLR_UINO(ip, IN_CLEANING); 1235 else { 1236 /* XXX IN_ALLMOD */ 1237 LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE | 1238 IN_UPDATE | IN_MODIFY); 1239 if (ip->i_lfs_effnblks == ip->i_ffs1_blocks) 1240 LFS_CLR_UINO(ip, IN_MODIFIED); 1241 else { 1242 DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real " 1243 "blks=%d, eff=%d\n", ip->i_number, 1244 ip->i_ffs1_blocks, ip->i_lfs_effnblks)); 1245 } 1246 } 1247 1248 if (ip->i_number == LFS_IFILE_INUM) { 1249 /* We know sp->idp == NULL */ 1250 sp->idp = ((struct ufs1_dinode *)bp->b_data) + 1251 (sp->ninodes % INOPB(fs)); 1252 1253 /* Not dirty any more */ 1254 mutex_enter(&lfs_lock); 1255 fs->lfs_flags &= ~LFS_IFDIRTY; 1256 mutex_exit(&lfs_lock); 1257 } 1258 1259 if (gotblk) { 1260 mutex_enter(&bufcache_lock); 1261 LFS_LOCK_BUF(bp); 1262 brelsel(bp, 0); 1263 mutex_exit(&bufcache_lock); 1264 } 1265 1266 /* Increment inode count in segment summary block. */ 1267 ++((SEGSUM *)(sp->segsum))->ss_ninos; 1268 1269 /* If this page is full, set flag to allocate a new page. */ 1270 if (++sp->ninodes % INOPB(fs) == 0) 1271 sp->ibp = NULL; 1272 1273 redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno); 1274 1275 KASSERT(redo_ifile == 0); 1276 return (redo_ifile); 1277 } 1278 1279 int 1280 lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr) 1281 { 1282 struct lfs *fs; 1283 int vers; 1284 int j, blksinblk; 1285 1286 ASSERT_SEGLOCK(sp->fs); 1287 /* 1288 * If full, finish this segment. We may be doing I/O, so 1289 * release and reacquire the splbio(). 1290 */ 1291 #ifdef DIAGNOSTIC 1292 if (sp->vp == NULL) 1293 panic ("lfs_gatherblock: Null vp in segment"); 1294 #endif 1295 fs = sp->fs; 1296 blksinblk = howmany(bp->b_bcount, fs->lfs_bsize); 1297 if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk || 1298 sp->seg_bytes_left < bp->b_bcount) { 1299 if (mptr) 1300 mutex_exit(mptr); 1301 lfs_updatemeta(sp); 1302 1303 vers = sp->fip->fi_version; 1304 (void) lfs_writeseg(fs, sp); 1305 1306 /* Add the current file to the segment summary. 
*/ 1307 lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers); 1308 1309 if (mptr) 1310 mutex_enter(mptr); 1311 return (1); 1312 } 1313 1314 if (bp->b_flags & B_GATHERED) { 1315 DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d," 1316 " lbn %" PRId64 "\n", 1317 sp->fip->fi_ino, bp->b_lblkno)); 1318 return (0); 1319 } 1320 1321 /* Insert into the buffer list, update the FINFO block. */ 1322 bp->b_flags |= B_GATHERED; 1323 1324 *sp->cbpp++ = bp; 1325 for (j = 0; j < blksinblk; j++) { 1326 sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j; 1327 /* This block's accounting moves from lfs_favail to lfs_avail */ 1328 lfs_deregister_block(sp->vp, bp->b_lblkno + j); 1329 } 1330 1331 sp->sum_bytes_left -= sizeof(int32_t) * blksinblk; 1332 sp->seg_bytes_left -= bp->b_bcount; 1333 return (0); 1334 } 1335 1336 int 1337 lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp, 1338 int (*match)(struct lfs *, struct buf *)) 1339 { 1340 struct buf *bp, *nbp; 1341 int count = 0; 1342 1343 ASSERT_SEGLOCK(fs); 1344 if (vp->v_type == VBLK) 1345 return 0; 1346 KASSERT(sp->vp == NULL); 1347 sp->vp = vp; 1348 mutex_enter(&bufcache_lock); 1349 1350 #ifndef LFS_NO_BACKBUF_HACK 1351 /* This is a hack to see if ordering the blocks in LFS makes a difference. */ 1352 # define BUF_OFFSET \ 1353 (((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp) 1354 # define BACK_BUF(BP) \ 1355 ((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET)) 1356 # define BEG_OF_LIST \ 1357 ((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET)) 1358 1359 loop: 1360 /* Find last buffer. */ 1361 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); 1362 bp && LIST_NEXT(bp, b_vnbufs) != NULL; 1363 bp = LIST_NEXT(bp, b_vnbufs)) 1364 /* nothing */; 1365 for (; bp && bp != BEG_OF_LIST; bp = nbp) { 1366 nbp = BACK_BUF(bp); 1367 #else /* LFS_NO_BACKBUF_HACK */ 1368 loop: 1369 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1370 nbp = LIST_NEXT(bp, b_vnbufs); 1371 #endif /* LFS_NO_BACKBUF_HACK */ 1372 if ((bp->b_cflags & BC_BUSY) != 0 || 1373 (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) { 1374 #ifdef DEBUG 1375 if (vp == fs->lfs_ivnode && 1376 (bp->b_cflags & BC_BUSY) != 0 && 1377 (bp->b_flags & B_GATHERED) == 0) 1378 log(LOG_NOTICE, "lfs_gather: ifile lbn %" 1379 PRId64 " busy (%x) at 0x%x", 1380 bp->b_lblkno, bp->b_flags, 1381 (unsigned)fs->lfs_offset); 1382 #endif 1383 continue; 1384 } 1385 #ifdef DIAGNOSTIC 1386 # ifdef LFS_USE_B_INVAL 1387 if ((bp->b_flags & BC_INVAL) != 0 && bp->b_iodone == NULL) { 1388 DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64 1389 " is BC_INVAL\n", bp->b_lblkno)); 1390 VOP_PRINT(bp->b_vp); 1391 } 1392 # endif /* LFS_USE_B_INVAL */ 1393 if (!(bp->b_oflags & BO_DELWRI)) 1394 panic("lfs_gather: bp not BO_DELWRI"); 1395 if (!(bp->b_flags & B_LOCKED)) { 1396 DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64 1397 " blk %" PRId64 " not B_LOCKED\n", 1398 bp->b_lblkno, 1399 dbtofsb(fs, bp->b_blkno))); 1400 VOP_PRINT(bp->b_vp); 1401 panic("lfs_gather: bp not B_LOCKED"); 1402 } 1403 #endif 1404 if (lfs_gatherblock(sp, bp, &bufcache_lock)) { 1405 goto loop; 1406 } 1407 count++; 1408 } 1409 mutex_exit(&bufcache_lock); 1410 lfs_updatemeta(sp); 1411 KASSERT(sp->vp == vp); 1412 sp->vp = NULL; 1413 return count; 1414 } 1415 1416 #if DEBUG 1417 # define DEBUG_OOFF(n) do { \ 1418 if (ooff == 0) { \ 1419 DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \ 1420 "ino %d lbn %" PRId64 " at 0x%" PRIx32 \ 1421 ", was 0x0 (or %" PRId64 ")\n", \ 1422 (n), ip->i_number, lbn, ndaddr, daddr)); \ 1423 } \ 
1424 } while (0) 1425 #else 1426 # define DEBUG_OOFF(n) 1427 #endif 1428 1429 /* 1430 * Change the given block's address to ndaddr, finding its previous 1431 * location using ufs_bmaparray(). 1432 * 1433 * Account for this change in the segment table. 1434 * 1435 * called with sp == NULL by roll-forwarding code. 1436 */ 1437 void 1438 lfs_update_single(struct lfs *fs, struct segment *sp, 1439 struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size) 1440 { 1441 SEGUSE *sup; 1442 struct buf *bp; 1443 struct indir a[NIADDR + 2], *ap; 1444 struct inode *ip; 1445 daddr_t daddr, ooff; 1446 int num, error; 1447 int bb, osize, obb; 1448 1449 ASSERT_SEGLOCK(fs); 1450 KASSERT(sp == NULL || sp->vp == vp); 1451 ip = VTOI(vp); 1452 1453 error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL); 1454 if (error) 1455 panic("lfs_updatemeta: ufs_bmaparray returned %d", error); 1456 1457 daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */ 1458 KASSERT(daddr <= LFS_MAX_DADDR); 1459 if (daddr > 0) 1460 daddr = dbtofsb(fs, daddr); 1461 1462 bb = fragstofsb(fs, numfrags(fs, size)); 1463 switch (num) { 1464 case 0: 1465 ooff = ip->i_ffs1_db[lbn]; 1466 DEBUG_OOFF(0); 1467 if (ooff == UNWRITTEN) 1468 ip->i_ffs1_blocks += bb; 1469 else { 1470 /* possible fragment truncation or extension */ 1471 obb = btofsb(fs, ip->i_lfs_fragsize[lbn]); 1472 ip->i_ffs1_blocks += (bb - obb); 1473 } 1474 ip->i_ffs1_db[lbn] = ndaddr; 1475 break; 1476 case 1: 1477 ooff = ip->i_ffs1_ib[a[0].in_off]; 1478 DEBUG_OOFF(1); 1479 if (ooff == UNWRITTEN) 1480 ip->i_ffs1_blocks += bb; 1481 ip->i_ffs1_ib[a[0].in_off] = ndaddr; 1482 break; 1483 default: 1484 ap = &a[num - 1]; 1485 if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp)) 1486 panic("lfs_updatemeta: bread bno %" PRId64, 1487 ap->in_lbn); 1488 1489 /* XXX ondisk32 */ 1490 ooff = ((int32_t *)bp->b_data)[ap->in_off]; 1491 DEBUG_OOFF(num); 1492 if (ooff == UNWRITTEN) 1493 ip->i_ffs1_blocks += bb; 1494 /* XXX ondisk32 */ 1495 ((int32_t *)bp->b_data)[ap->in_off] = ndaddr; 1496 (void) VOP_BWRITE(bp); 1497 } 1498 1499 KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr); 1500 1501 /* Update hiblk when extending the file */ 1502 if (lbn > ip->i_lfs_hiblk) 1503 ip->i_lfs_hiblk = lbn; 1504 1505 /* 1506 * Though we'd rather it couldn't, this *can* happen right now 1507 * if cleaning blocks and regular blocks coexist. 1508 */ 1509 /* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */ 1510 1511 /* 1512 * Update segment usage information, based on old size 1513 * and location. 
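 *
 * The block's former segment gives up "osize" bytes: the recorded
 * fragment size for direct blocks, a full lfs_bsize otherwise.  If
 * the segment usage block has not already been gathered, the Ifile
 * is marked dirty so the change makes it to disk.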
1514 */ 1515 if (daddr > 0) { 1516 u_int32_t oldsn = dtosn(fs, daddr); 1517 #ifdef DIAGNOSTIC 1518 int ndupino; 1519 1520 if (sp && sp->seg_number == oldsn) { 1521 ndupino = sp->ndupino; 1522 } else { 1523 ndupino = 0; 1524 } 1525 #endif 1526 KASSERT(oldsn < fs->lfs_nseg); 1527 if (lbn >= 0 && lbn < NDADDR) 1528 osize = ip->i_lfs_fragsize[lbn]; 1529 else 1530 osize = fs->lfs_bsize; 1531 LFS_SEGENTRY(sup, fs, oldsn, bp); 1532 #ifdef DIAGNOSTIC 1533 if (sup->su_nbytes + sizeof (struct ufs1_dinode) * ndupino 1534 < osize) { 1535 printf("lfs_updatemeta: negative bytes " 1536 "(segment %" PRIu32 " short by %" PRId64 1537 ")\n", dtosn(fs, daddr), 1538 (int64_t)osize - 1539 (sizeof (struct ufs1_dinode) * ndupino + 1540 sup->su_nbytes)); 1541 printf("lfs_updatemeta: ino %llu, lbn %" PRId64 1542 ", addr = 0x%" PRIx64 "\n", 1543 (unsigned long long)ip->i_number, lbn, daddr); 1544 printf("lfs_updatemeta: ndupino=%d\n", ndupino); 1545 panic("lfs_updatemeta: negative bytes"); 1546 sup->su_nbytes = osize - 1547 sizeof (struct ufs1_dinode) * ndupino; 1548 } 1549 #endif 1550 DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64 1551 " db 0x%" PRIx64 "\n", 1552 dtosn(fs, daddr), osize, 1553 ip->i_number, lbn, daddr)); 1554 sup->su_nbytes -= osize; 1555 if (!(bp->b_flags & B_GATHERED)) { 1556 mutex_enter(&lfs_lock); 1557 fs->lfs_flags |= LFS_IFDIRTY; 1558 mutex_exit(&lfs_lock); 1559 } 1560 LFS_WRITESEGENTRY(sup, fs, oldsn, bp); 1561 } 1562 /* 1563 * Now that this block has a new address, and its old 1564 * segment no longer owns it, we can forget about its 1565 * old size. 1566 */ 1567 if (lbn >= 0 && lbn < NDADDR) 1568 ip->i_lfs_fragsize[lbn] = size; 1569 } 1570 1571 /* 1572 * Update the metadata that points to the blocks listed in the FINFO 1573 * array. 1574 */ 1575 void 1576 lfs_updatemeta(struct segment *sp) 1577 { 1578 struct buf *sbp; 1579 struct lfs *fs; 1580 struct vnode *vp; 1581 daddr_t lbn; 1582 int i, nblocks, num; 1583 int bb; 1584 int bytesleft, size; 1585 1586 ASSERT_SEGLOCK(sp->fs); 1587 vp = sp->vp; 1588 nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; 1589 KASSERT(nblocks >= 0); 1590 KASSERT(vp != NULL); 1591 if (nblocks == 0) 1592 return; 1593 1594 /* 1595 * This count may be high due to oversize blocks from lfs_gop_write. 1596 * Correct for this. (XXX we should be able to keep track of these.) 1597 */ 1598 fs = sp->fs; 1599 for (i = 0; i < nblocks; i++) { 1600 if (sp->start_bpp[i] == NULL) { 1601 DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks)); 1602 nblocks = i; 1603 break; 1604 } 1605 num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize); 1606 KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1); 1607 nblocks -= num - 1; 1608 } 1609 1610 KASSERT(vp->v_type == VREG || 1611 nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp); 1612 KASSERT(nblocks == sp->cbpp - sp->start_bpp); 1613 1614 /* 1615 * Sort the blocks. 1616 * 1617 * We have to sort even if the blocks come from the 1618 * cleaner, because there might be other pending blocks on the 1619 * same inode...and if we don't sort, and there are fragments 1620 * present, blocks may be written in the wrong place. 1621 */ 1622 lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize); 1623 1624 /* 1625 * Record the length of the last block in case it's a fragment. 1626 * If there are indirect blocks present, they sort last. An 1627 * indirect block will be lfs_bsize and its presence indicates 1628 * that you cannot have fragments. 
1629 * 1630 * XXX This last is a lie. A cleaned fragment can coexist with 1631 * XXX a later indirect block. This will continue to be 1632 * XXX true until lfs_markv is fixed to do everything with 1633 * XXX fake blocks (including fake inodes and fake indirect blocks). 1634 */ 1635 sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) & 1636 fs->lfs_bmask) + 1; 1637 1638 /* 1639 * Assign disk addresses, and update references to the logical 1640 * block and the segment usage information. 1641 */ 1642 for (i = nblocks; i--; ++sp->start_bpp) { 1643 sbp = *sp->start_bpp; 1644 lbn = *sp->start_lbp; 1645 KASSERT(sbp->b_lblkno == lbn); 1646 1647 sbp->b_blkno = fsbtodb(fs, fs->lfs_offset); 1648 1649 /* 1650 * If we write a frag in the wrong place, the cleaner won't 1651 * be able to correctly identify its size later, and the 1652 * segment will be uncleanable. (Even worse, it will assume 1653 * that the indirect block that actually ends the list 1654 * is of a smaller size!) 1655 */ 1656 if ((sbp->b_bcount & fs->lfs_bmask) && i != 0) 1657 panic("lfs_updatemeta: fragment is not last block"); 1658 1659 /* 1660 * For each subblock in this possibly oversized block, 1661 * update its address on disk. 1662 */ 1663 KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize); 1664 KASSERT(vp == sbp->b_vp); 1665 for (bytesleft = sbp->b_bcount; bytesleft > 0; 1666 bytesleft -= fs->lfs_bsize) { 1667 size = MIN(bytesleft, fs->lfs_bsize); 1668 bb = fragstofsb(fs, numfrags(fs, size)); 1669 lbn = *sp->start_lbp++; 1670 lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset, 1671 size); 1672 fs->lfs_offset += bb; 1673 } 1674 1675 } 1676 1677 /* This inode has been modified */ 1678 LFS_SET_UINO(VTOI(vp), IN_MODIFIED); 1679 } 1680 1681 /* 1682 * Move lfs_offset to a segment earlier than sn. 1683 */ 1684 int 1685 lfs_rewind(struct lfs *fs, int newsn) 1686 { 1687 int sn, osn, isdirty; 1688 struct buf *bp; 1689 SEGUSE *sup; 1690 1691 ASSERT_SEGLOCK(fs); 1692 1693 osn = dtosn(fs, fs->lfs_offset); 1694 if (osn < newsn) 1695 return 0; 1696 1697 /* lfs_avail eats the remaining space in this segment */ 1698 fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg); 1699 1700 /* Find a low-numbered segment */ 1701 for (sn = 0; sn < fs->lfs_nseg; ++sn) { 1702 LFS_SEGENTRY(sup, fs, sn, bp); 1703 isdirty = sup->su_flags & SEGUSE_DIRTY; 1704 brelse(bp, 0); 1705 1706 if (!isdirty) 1707 break; 1708 } 1709 if (sn == fs->lfs_nseg) 1710 panic("lfs_rewind: no clean segments"); 1711 if (newsn >= 0 && sn >= newsn) 1712 return ENOENT; 1713 fs->lfs_nextseg = sn; 1714 lfs_newseg(fs); 1715 fs->lfs_offset = fs->lfs_curseg; 1716 1717 return 0; 1718 } 1719 1720 /* 1721 * Start a new partial segment. 1722 * 1723 * Return 1 when we entered to a new segment. 1724 * Otherwise, return 0. 1725 */ 1726 int 1727 lfs_initseg(struct lfs *fs) 1728 { 1729 struct segment *sp = fs->lfs_sp; 1730 SEGSUM *ssp; 1731 struct buf *sbp; /* buffer for SEGSUM */ 1732 int repeat = 0; /* return value */ 1733 1734 ASSERT_SEGLOCK(fs); 1735 /* Advance to the next segment. */ 1736 if (!LFS_PARTIAL_FITS(fs)) { 1737 SEGUSE *sup; 1738 struct buf *bp; 1739 1740 /* lfs_avail eats the remaining space */ 1741 fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - 1742 fs->lfs_curseg); 1743 /* Wake up any cleaning procs waiting on this file system. 
*/ 1744 lfs_wakeup_cleaner(fs); 1745 lfs_newseg(fs); 1746 repeat = 1; 1747 fs->lfs_offset = fs->lfs_curseg; 1748 1749 sp->seg_number = dtosn(fs, fs->lfs_curseg); 1750 sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg); 1751 1752 /* 1753 * If the segment contains a superblock, update the offset 1754 * and summary address to skip over it. 1755 */ 1756 LFS_SEGENTRY(sup, fs, sp->seg_number, bp); 1757 if (sup->su_flags & SEGUSE_SUPERBLOCK) { 1758 fs->lfs_offset += btofsb(fs, LFS_SBPAD); 1759 sp->seg_bytes_left -= LFS_SBPAD; 1760 } 1761 brelse(bp, 0); 1762 /* Segment zero could also contain the labelpad */ 1763 if (fs->lfs_version > 1 && sp->seg_number == 0 && 1764 fs->lfs_start < btofsb(fs, LFS_LABELPAD)) { 1765 fs->lfs_offset += 1766 btofsb(fs, LFS_LABELPAD) - fs->lfs_start; 1767 sp->seg_bytes_left -= 1768 LFS_LABELPAD - fsbtob(fs, fs->lfs_start); 1769 } 1770 } else { 1771 sp->seg_number = dtosn(fs, fs->lfs_curseg); 1772 sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg - 1773 (fs->lfs_offset - fs->lfs_curseg)); 1774 } 1775 fs->lfs_lastpseg = fs->lfs_offset; 1776 1777 /* Record first address of this partial segment */ 1778 if (sp->seg_flags & SEGM_CLEAN) { 1779 fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset; 1780 if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) { 1781 /* "1" is the artificial inc in lfs_seglock */ 1782 mutex_enter(&lfs_lock); 1783 while (fs->lfs_iocount > 1) { 1784 mtsleep(&fs->lfs_iocount, PRIBIO + 1, 1785 "lfs_initseg", 0, &lfs_lock); 1786 } 1787 mutex_exit(&lfs_lock); 1788 fs->lfs_cleanind = 0; 1789 } 1790 } 1791 1792 sp->fs = fs; 1793 sp->ibp = NULL; 1794 sp->idp = NULL; 1795 sp->ninodes = 0; 1796 sp->ndupino = 0; 1797 1798 sp->cbpp = sp->bpp; 1799 1800 /* Get a new buffer for SEGSUM */ 1801 sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, 1802 fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY); 1803 1804 /* ... and enter it into the buffer list. */ 1805 *sp->cbpp = sbp; 1806 sp->cbpp++; 1807 fs->lfs_offset += btofsb(fs, fs->lfs_sumsize); 1808 1809 sp->start_bpp = sp->cbpp; 1810 1811 /* Set point to SEGSUM, initialize it. */ 1812 ssp = sp->segsum = sbp->b_data; 1813 memset(ssp, 0, fs->lfs_sumsize); 1814 ssp->ss_next = fs->lfs_nextseg; 1815 ssp->ss_nfinfo = ssp->ss_ninos = 0; 1816 ssp->ss_magic = SS_MAGIC; 1817 1818 /* Set pointer to first FINFO, initialize it. */ 1819 sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs)); 1820 sp->fip->fi_nblocks = 0; 1821 sp->start_lbp = &sp->fip->fi_blocks[0]; 1822 sp->fip->fi_lastlength = 0; 1823 1824 sp->seg_bytes_left -= fs->lfs_sumsize; 1825 sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs); 1826 1827 return (repeat); 1828 } 1829 1830 /* 1831 * Remove SEGUSE_INVAL from all segments. 1832 */ 1833 void 1834 lfs_unset_inval_all(struct lfs *fs) 1835 { 1836 SEGUSE *sup; 1837 struct buf *bp; 1838 int i; 1839 1840 for (i = 0; i < fs->lfs_nseg; i++) { 1841 LFS_SEGENTRY(sup, fs, i, bp); 1842 if (sup->su_flags & SEGUSE_INVAL) { 1843 sup->su_flags &= ~SEGUSE_INVAL; 1844 LFS_WRITESEGENTRY(sup, fs, i, bp); 1845 } else 1846 brelse(bp, 0); 1847 } 1848 } 1849 1850 /* 1851 * Return the next segment to write. 
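 *
 * In outline: wait here if the cleaner has asked us to stop at the
 * log wrap (LFCNWRAPSTOP), mark the segment we are about to use
 * dirty and active and update the cleaner info block, then scan
 * forward (starting lfs_interleave segments past the current one)
 * for the next clean segment.  The first pass skips SEGUSE_INVAL
 * segments; if none is found a second pass accepts them, and only
 * then do we panic if nothing clean remains.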
1852 */ 1853 void 1854 lfs_newseg(struct lfs *fs) 1855 { 1856 CLEANERINFO *cip; 1857 SEGUSE *sup; 1858 struct buf *bp; 1859 int curseg, isdirty, sn, skip_inval; 1860 1861 ASSERT_SEGLOCK(fs); 1862 1863 /* Honor LFCNWRAPSTOP */ 1864 mutex_enter(&lfs_lock); 1865 while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) { 1866 if (fs->lfs_wrappass) { 1867 log(LOG_NOTICE, "%s: wrappass=%d\n", 1868 fs->lfs_fsmnt, fs->lfs_wrappass); 1869 fs->lfs_wrappass = 0; 1870 break; 1871 } 1872 fs->lfs_wrapstatus = LFS_WRAP_WAITING; 1873 wakeup(&fs->lfs_nowrap); 1874 log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt); 1875 mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz, 1876 &lfs_lock); 1877 } 1878 fs->lfs_wrapstatus = LFS_WRAP_GOING; 1879 mutex_exit(&lfs_lock); 1880 1881 LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); 1882 DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n", 1883 dtosn(fs, fs->lfs_nextseg))); 1884 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 1885 sup->su_nbytes = 0; 1886 sup->su_nsums = 0; 1887 sup->su_ninos = 0; 1888 LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); 1889 1890 LFS_CLEANERINFO(cip, fs, bp); 1891 --cip->clean; 1892 ++cip->dirty; 1893 fs->lfs_nclean = cip->clean; 1894 LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); 1895 1896 fs->lfs_lastseg = fs->lfs_curseg; 1897 fs->lfs_curseg = fs->lfs_nextseg; 1898 skip_inval = 1; 1899 for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) { 1900 sn = (sn + 1) % fs->lfs_nseg; 1901 1902 if (sn == curseg) { 1903 if (skip_inval) 1904 skip_inval = 0; 1905 else 1906 panic("lfs_nextseg: no clean segments"); 1907 } 1908 LFS_SEGENTRY(sup, fs, sn, bp); 1909 isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? SEGUSE_INVAL : 0)); 1910 /* Check SEGUSE_EMPTY as we go along */ 1911 if (isdirty && sup->su_nbytes == 0 && 1912 !(sup->su_flags & SEGUSE_EMPTY)) 1913 LFS_WRITESEGENTRY(sup, fs, sn, bp); 1914 else 1915 brelse(bp, 0); 1916 1917 if (!isdirty) 1918 break; 1919 } 1920 if (skip_inval == 0) 1921 lfs_unset_inval_all(fs); 1922 1923 ++fs->lfs_nactive; 1924 fs->lfs_nextseg = sntod(fs, sn); 1925 if (lfs_dostats) { 1926 ++lfs_stats.segsused; 1927 } 1928 } 1929 1930 static struct buf * 1931 lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr, 1932 int n) 1933 { 1934 struct lfs_cluster *cl; 1935 struct buf **bpp, *bp; 1936 1937 ASSERT_SEGLOCK(fs); 1938 cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK); 1939 bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK); 1940 memset(cl, 0, sizeof(*cl)); 1941 cl->fs = fs; 1942 cl->bpp = bpp; 1943 cl->bufcount = 0; 1944 cl->bufsize = 0; 1945 1946 /* If this segment is being written synchronously, note that */ 1947 if (fs->lfs_sp->seg_flags & SEGM_SYNC) { 1948 cl->flags |= LFS_CL_SYNC; 1949 cl->seg = fs->lfs_sp; 1950 ++cl->seg->seg_iocount; 1951 } 1952 1953 /* Get an empty buffer header, or maybe one with something on it */ 1954 bp = getiobuf(vp, true); 1955 bp->b_dev = NODEV; 1956 bp->b_blkno = bp->b_lblkno = addr; 1957 bp->b_iodone = lfs_cluster_callback; 1958 bp->b_private = cl; 1959 1960 return bp; 1961 } 1962 1963 int 1964 lfs_writeseg(struct lfs *fs, struct segment *sp) 1965 { 1966 struct buf **bpp, *bp, *cbp, *newbp, *unbusybp; 1967 SEGUSE *sup; 1968 SEGSUM *ssp; 1969 int i; 1970 int do_again, nblocks, byteoffset; 1971 size_t el_size; 1972 struct lfs_cluster *cl; 1973 u_short ninos; 1974 struct vnode *devvp; 1975 char *p = NULL; 1976 struct vnode *vp; 1977 int32_t *daddrp; /* XXX ondisk32 */ 1978 int changed; 1979 u_int32_t sum; 1980 #ifdef 
DEBUG
1981 	FINFO *fip;
1982 	int findex;
1983 #endif
1984 
1985 	ASSERT_SEGLOCK(fs);
1986 
1987 	ssp = (SEGSUM *)sp->segsum;
1988 
1989 	/*
1990 	 * If there are no buffers other than the segment summary to write,
1991 	 * don't do anything.  If we are at the end of a dirop sequence,
1992 	 * however, write the empty segment summary anyway, to help out the
1993 	 * roll-forward agent.
1994 	 */
1995 	if ((nblocks = sp->cbpp - sp->bpp) == 1) {
1996 		if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP)
1997 			return 0;
1998 	}
1999 
2000 	/* Note if partial segment is being written by the cleaner */
2001 	if (sp->seg_flags & SEGM_CLEAN)
2002 		ssp->ss_flags |= SS_CLEAN;
2003 
2004 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2005 
2006 	/* Update the segment usage information. */
2007 	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
2008 
2009 	/* Loop through all blocks, except the segment summary. */
2010 	for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
2011 		if ((*bpp)->b_vp != devvp) {
2012 			sup->su_nbytes += (*bpp)->b_bcount;
2013 			DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d"
2014 			    " lbn %" PRId64 " db 0x%" PRIx64 "\n",
2015 			    sp->seg_number, (*bpp)->b_bcount,
2016 			    VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno,
2017 			    (*bpp)->b_blkno));
2018 		}
2019 	}
2020 
2021 #ifdef DEBUG
2022 	/* Check for zero-length and zero-version FINFO entries. */
2023 	fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs));
2024 	for (findex = 0; findex < ssp->ss_nfinfo; findex++) {
2025 		KDASSERT(fip->fi_nblocks > 0);
2026 		KDASSERT(fip->fi_version > 0);
2027 		fip = (FINFO *)((char *)fip + FINFOSIZE +
2028 		    sizeof(int32_t) * fip->fi_nblocks);
2029 	}
2030 #endif /* DEBUG */
2031 
2032 	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
2033 	DLOG((DLOG_SU, "seg %d += %d for %d inodes\n",
2034 	    sp->seg_number, ssp->ss_ninos * sizeof (struct ufs1_dinode),
2035 	    ssp->ss_ninos));
2036 	sup->su_nbytes += ssp->ss_ninos * sizeof (struct ufs1_dinode);
2037 	/* sup->su_nbytes += fs->lfs_sumsize; */
2038 	if (fs->lfs_version == 1)
2039 		sup->su_olastmod = time_second;
2040 	else
2041 		sup->su_lastmod = time_second;
2042 	sup->su_ninos += ninos;
2043 	++sup->su_nsums;
2044 	fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize);
2045 
2046 	do_again = !(bp->b_flags & B_GATHERED);
2047 	LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */
2048 
2049 	/*
2050 	 * Mark blocks BC_BUSY, to prevent them from being changed between
2051 	 * the checksum computation and the actual write.
2052 	 *
2053 	 * If we are cleaning, check indirect blocks for UNWRITTEN, and if
2054 	 * there are any, replace them with copies that have UNASSIGNED
2055 	 * instead.
2056 	 */
2057 	mutex_enter(&bufcache_lock);
2058 	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
2059 		++bpp;
2060 		bp = *bpp;
2061 		if (bp->b_iodone != NULL) {	/* UBC or malloced buffer */
2062 			bp->b_cflags |= BC_BUSY;
2063 			continue;
2064 		}
2065 
2066 		while (bp->b_cflags & BC_BUSY) {
2067 			DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
2068 			    " data summary corruption for ino %d, lbn %"
2069 			    PRId64 "\n",
2070 			    VTOI(bp->b_vp)->i_number, bp->b_lblkno));
2071 			bp->b_cflags |= BC_WANTED;
2072 			cv_wait(&bp->b_busy, &bufcache_lock);
2073 		}
2074 		bp->b_cflags |= BC_BUSY;
2075 		mutex_exit(&bufcache_lock);
2076 		unbusybp = NULL;
2077 
2078 		/*
2079 		 * Check and replace indirect block UNWRITTEN bogosity.
2080 		 * XXX See comment in lfs_writefile.
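		 *
		 * (Descriptive note: UNWRITTEN disk addresses stand for
		 * blocks that are counted in i_lfs_effnblks but have not
		 * yet been assigned a location on disk.  Rather than write
		 * those placeholder addresses out, the indirect block is
		 * copied below and the UNWRITTEN entries are zeroed in the
		 * copy.)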
2081 */ 2082 if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp && 2083 VTOI(bp->b_vp)->i_ffs1_blocks != 2084 VTOI(bp->b_vp)->i_lfs_effnblks) { 2085 DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n", 2086 VTOI(bp->b_vp)->i_number, 2087 VTOI(bp->b_vp)->i_lfs_effnblks, 2088 VTOI(bp->b_vp)->i_ffs1_blocks)); 2089 /* Make a copy we'll make changes to */ 2090 newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno, 2091 bp->b_bcount, LFS_NB_IBLOCK); 2092 newbp->b_blkno = bp->b_blkno; 2093 memcpy(newbp->b_data, bp->b_data, 2094 newbp->b_bcount); 2095 2096 changed = 0; 2097 /* XXX ondisk32 */ 2098 for (daddrp = (int32_t *)(newbp->b_data); 2099 daddrp < (int32_t *)((char *)newbp->b_data + 2100 newbp->b_bcount); daddrp++) { 2101 if (*daddrp == UNWRITTEN) { 2102 ++changed; 2103 *daddrp = 0; 2104 } 2105 } 2106 /* 2107 * Get rid of the old buffer. Don't mark it clean, 2108 * though, if it still has dirty data on it. 2109 */ 2110 if (changed) { 2111 DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):" 2112 " bp = %p newbp = %p\n", changed, bp, 2113 newbp)); 2114 *bpp = newbp; 2115 bp->b_flags &= ~B_GATHERED; 2116 bp->b_error = 0; 2117 if (bp->b_iodone != NULL) { 2118 DLOG((DLOG_SEG, "lfs_writeseg: " 2119 "indir bp should not be B_CALL\n")); 2120 biodone(bp); 2121 bp = NULL; 2122 } else { 2123 /* Still on free list, leave it there */ 2124 unbusybp = bp; 2125 /* 2126 * We have to re-decrement lfs_avail 2127 * since this block is going to come 2128 * back around to us in the next 2129 * segment. 2130 */ 2131 fs->lfs_avail -= 2132 btofsb(fs, bp->b_bcount); 2133 } 2134 } else { 2135 lfs_freebuf(fs, newbp); 2136 } 2137 } 2138 mutex_enter(&bufcache_lock); 2139 if (unbusybp != NULL) { 2140 unbusybp->b_cflags &= ~BC_BUSY; 2141 if (unbusybp->b_cflags & BC_WANTED) 2142 cv_broadcast(&bp->b_busy); 2143 } 2144 } 2145 mutex_exit(&bufcache_lock); 2146 2147 /* 2148 * Compute checksum across data and then across summary; the first 2149 * block (the summary block) is skipped. Set the create time here 2150 * so that it's guaranteed to be later than the inode mod times. 
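	 *
	 * Note that ss_datasum is a sampled checksum: only the first
	 * el_size bytes of each lfs_bsize piece of each data block are
	 * folded in.  ss_sumsum then covers the summary block itself,
	 * from ss_datasum to the end of the summary.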
2151 	 */
2152 	sum = 0;
2153 	if (fs->lfs_version == 1)
2154 		el_size = sizeof(u_long);
2155 	else
2156 		el_size = sizeof(u_int32_t);
2157 	for (bpp = sp->bpp, i = nblocks - 1; i--; ) {
2158 		++bpp;
2159 		/* Loop through gop_write cluster blocks */
2160 		for (byteoffset = 0; byteoffset < (*bpp)->b_bcount;
2161 		     byteoffset += fs->lfs_bsize) {
2162 #ifdef LFS_USE_B_INVAL
2163 			if (((*bpp)->b_cflags & BC_INVAL) != 0 &&
2164 			    (*bpp)->b_iodone != NULL) {
2165 				if (copyin((void *)(*bpp)->b_saveaddr +
2166 				    byteoffset, dp, el_size)) {
2167 					panic("lfs_writeseg: copyin failed [1]:"
2168 					    " ino %d blk %" PRId64,
2169 					    VTOI((*bpp)->b_vp)->i_number,
2170 					    (*bpp)->b_lblkno);
2171 				}
2172 			} else
2173 #endif /* LFS_USE_B_INVAL */
2174 			{
2175 				sum = lfs_cksum_part((char *)
2176 				    (*bpp)->b_data + byteoffset, el_size, sum);
2177 			}
2178 		}
2179 	}
2180 	if (fs->lfs_version == 1)
2181 		ssp->ss_ocreate = time_second;
2182 	else {
2183 		ssp->ss_create = time_second;
2184 		ssp->ss_serial = ++fs->lfs_serial;
2185 		ssp->ss_ident = fs->lfs_ident;
2186 	}
2187 	ssp->ss_datasum = lfs_cksum_fold(sum);
2188 	ssp->ss_sumsum = cksum(&ssp->ss_datasum,
2189 	    fs->lfs_sumsize - sizeof(ssp->ss_sumsum));
2190 
2191 	mutex_enter(&lfs_lock);
2192 	fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) +
2193 	    btofsb(fs, fs->lfs_sumsize));
2194 	fs->lfs_dmeta += (btofsb(fs, ninos * fs->lfs_ibsize) +
2195 	    btofsb(fs, fs->lfs_sumsize));
2196 	mutex_exit(&lfs_lock);
2197 
2198 	/*
2199 	 * When we simply write the blocks we lose a rotation for every block
2200 	 * written.  To avoid this problem, we cluster the buffers into a
2201 	 * chunk and write the chunk.  MAXPHYS is the largest size an I/O
2202 	 * device can handle, so use that for the size of the chunks.
2203 	 *
2204 	 * Blocks that are already clusters (from GOP_WRITE), however, we
2205 	 * don't bother to copy into other clusters.
2206 	 */
2207 
2208 #define CHUNKSIZE MAXPHYS
2209 
2210 	if (devvp == NULL)
2211 		panic("devvp is NULL");
2212 	for (bpp = sp->bpp, i = nblocks; i;) {
2213 		cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
2214 		cl = cbp->b_private;
2215 
2216 		cbp->b_flags |= B_ASYNC;
2217 		cbp->b_cflags |= BC_BUSY;
2218 		cbp->b_bcount = 0;
2219 
2220 #if defined(DEBUG) && defined(DIAGNOSTIC)
2221 		if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs))
2222 		    / sizeof(int32_t)) {
2223 			panic("lfs_writeseg: real bpp overwrite");
2224 		}
2225 		if (bpp - sp->bpp > segsize(fs) / fs->lfs_fsize) {
2226 			panic("lfs_writeseg: theoretical bpp overwrite");
2227 		}
2228 #endif
2229 
2230 		/*
2231 		 * Construct the cluster.
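		 *
		 * Buffers of lfs_bsize or less are copied into a malloc'ed
		 * chunk of up to CHUNKSIZE bytes.  A buffer that is already
		 * larger than lfs_bsize (a cluster handed down from
		 * GOP_WRITE) is issued by itself, with the cluster buffer
		 * pointing directly at its data.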
2232 */ 2233 mutex_enter(&lfs_lock); 2234 ++fs->lfs_iocount; 2235 mutex_exit(&lfs_lock); 2236 while (i && cbp->b_bcount < CHUNKSIZE) { 2237 bp = *bpp; 2238 2239 if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount)) 2240 break; 2241 if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC)) 2242 break; 2243 2244 /* Clusters from GOP_WRITE are expedited */ 2245 if (bp->b_bcount > fs->lfs_bsize) { 2246 if (cbp->b_bcount > 0) 2247 /* Put in its own buffer */ 2248 break; 2249 else { 2250 cbp->b_data = bp->b_data; 2251 } 2252 } else if (cbp->b_bcount == 0) { 2253 p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE, 2254 LFS_NB_CLUSTER); 2255 cl->flags |= LFS_CL_MALLOC; 2256 } 2257 #ifdef DIAGNOSTIC 2258 if (dtosn(fs, dbtofsb(fs, bp->b_blkno + 2259 btodb(bp->b_bcount - 1))) != 2260 sp->seg_number) { 2261 printf("blk size %d daddr %" PRIx64 2262 " not in seg %d\n", 2263 bp->b_bcount, bp->b_blkno, 2264 sp->seg_number); 2265 panic("segment overwrite"); 2266 } 2267 #endif 2268 2269 #ifdef LFS_USE_B_INVAL 2270 /* 2271 * Fake buffers from the cleaner are marked as B_INVAL. 2272 * We need to copy the data from user space rather than 2273 * from the buffer indicated. 2274 * XXX == what do I do on an error? 2275 */ 2276 if ((bp->b_cflags & BC_INVAL) != 0 && 2277 bp->b_iodone != NULL) { 2278 if (copyin(bp->b_saveaddr, p, bp->b_bcount)) 2279 panic("lfs_writeseg: " 2280 "copyin failed [2]"); 2281 } else 2282 #endif /* LFS_USE_B_INVAL */ 2283 if (cl->flags & LFS_CL_MALLOC) { 2284 /* copy data into our cluster. */ 2285 memcpy(p, bp->b_data, bp->b_bcount); 2286 p += bp->b_bcount; 2287 } 2288 2289 cbp->b_bcount += bp->b_bcount; 2290 cl->bufsize += bp->b_bcount; 2291 2292 bp->b_flags &= ~B_READ; 2293 bp->b_error = 0; 2294 cl->bpp[cl->bufcount++] = bp; 2295 2296 vp = bp->b_vp; 2297 mutex_enter(&bufcache_lock); 2298 mutex_enter(&vp->v_interlock); 2299 bp->b_oflags &= ~(BO_DELWRI | BO_DONE); 2300 reassignbuf(bp, vp); 2301 vp->v_numoutput++; 2302 mutex_exit(&vp->v_interlock); 2303 mutex_exit(&bufcache_lock); 2304 2305 bpp++; 2306 i--; 2307 } 2308 if (fs->lfs_sp->seg_flags & SEGM_SYNC) 2309 BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL); 2310 else 2311 BIO_SETPRIO(cbp, BPRIO_TIMELIMITED); 2312 mutex_enter(&devvp->v_interlock); 2313 devvp->v_numoutput++; 2314 mutex_exit(&devvp->v_interlock); 2315 VOP_STRATEGY(devvp, cbp); 2316 curproc->p_stats->p_ru.ru_oublock++; 2317 } 2318 2319 if (lfs_dostats) { 2320 ++lfs_stats.psegwrites; 2321 lfs_stats.blocktot += nblocks - 1; 2322 if (fs->lfs_sp->seg_flags & SEGM_SYNC) 2323 ++lfs_stats.psyncwrites; 2324 if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { 2325 ++lfs_stats.pcleanwrites; 2326 lfs_stats.cleanblocks += nblocks - 1; 2327 } 2328 } 2329 2330 return (lfs_initseg(fs) || do_again); 2331 } 2332 2333 void 2334 lfs_writesuper(struct lfs *fs, daddr_t daddr) 2335 { 2336 struct buf *bp; 2337 struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp; 2338 int s; 2339 2340 ASSERT_MAYBE_SEGLOCK(fs); 2341 #ifdef DIAGNOSTIC 2342 KASSERT(fs->lfs_magic == LFS_MAGIC); 2343 #endif 2344 /* 2345 * If we can write one superblock while another is in 2346 * progress, we risk not having a complete checkpoint if we crash. 2347 * So, block here if a superblock write is in progress. 
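	 *
	 * (lfs_sbactive holds the disk address of the superblock currently
	 * being written; lfs_super_aiodone() clears it and wakes any
	 * waiters.)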
2348 */ 2349 mutex_enter(&lfs_lock); 2350 s = splbio(); 2351 while (fs->lfs_sbactive) { 2352 mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0, 2353 &lfs_lock); 2354 } 2355 fs->lfs_sbactive = daddr; 2356 splx(s); 2357 mutex_exit(&lfs_lock); 2358 2359 /* Set timestamp of this version of the superblock */ 2360 if (fs->lfs_version == 1) 2361 fs->lfs_otstamp = time_second; 2362 fs->lfs_tstamp = time_second; 2363 2364 /* Checksum the superblock and copy it into a buffer. */ 2365 fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs)); 2366 bp = lfs_newbuf(fs, devvp, 2367 fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK); 2368 memset((char *)bp->b_data + sizeof(struct dlfs), 0, 2369 LFS_SBPAD - sizeof(struct dlfs)); 2370 *(struct dlfs *)bp->b_data = fs->lfs_dlfs; 2371 2372 bp->b_cflags |= BC_BUSY; 2373 bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC; 2374 bp->b_oflags &= ~(BO_DONE | BO_DELWRI); 2375 bp->b_error = 0; 2376 bp->b_iodone = lfs_supercallback; 2377 2378 if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC) 2379 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 2380 else 2381 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 2382 curproc->p_stats->p_ru.ru_oublock++; 2383 2384 mutex_enter(&devvp->v_interlock); 2385 devvp->v_numoutput++; 2386 mutex_exit(&devvp->v_interlock); 2387 2388 mutex_enter(&lfs_lock); 2389 ++fs->lfs_iocount; 2390 mutex_exit(&lfs_lock); 2391 VOP_STRATEGY(devvp, bp); 2392 } 2393 2394 /* 2395 * Logical block number match routines used when traversing the dirty block 2396 * chain. 2397 */ 2398 int 2399 lfs_match_fake(struct lfs *fs, struct buf *bp) 2400 { 2401 2402 ASSERT_SEGLOCK(fs); 2403 return LFS_IS_MALLOC_BUF(bp); 2404 } 2405 2406 #if 0 2407 int 2408 lfs_match_real(struct lfs *fs, struct buf *bp) 2409 { 2410 2411 ASSERT_SEGLOCK(fs); 2412 return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp)); 2413 } 2414 #endif 2415 2416 int 2417 lfs_match_data(struct lfs *fs, struct buf *bp) 2418 { 2419 2420 ASSERT_SEGLOCK(fs); 2421 return (bp->b_lblkno >= 0); 2422 } 2423 2424 int 2425 lfs_match_indir(struct lfs *fs, struct buf *bp) 2426 { 2427 daddr_t lbn; 2428 2429 ASSERT_SEGLOCK(fs); 2430 lbn = bp->b_lblkno; 2431 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); 2432 } 2433 2434 int 2435 lfs_match_dindir(struct lfs *fs, struct buf *bp) 2436 { 2437 daddr_t lbn; 2438 2439 ASSERT_SEGLOCK(fs); 2440 lbn = bp->b_lblkno; 2441 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); 2442 } 2443 2444 int 2445 lfs_match_tindir(struct lfs *fs, struct buf *bp) 2446 { 2447 daddr_t lbn; 2448 2449 ASSERT_SEGLOCK(fs); 2450 lbn = bp->b_lblkno; 2451 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); 2452 } 2453 2454 static void 2455 lfs_free_aiodone(struct buf *bp) 2456 { 2457 struct lfs *fs; 2458 2459 KERNEL_LOCK(1, curlwp); 2460 fs = bp->b_private; 2461 ASSERT_NO_SEGLOCK(fs); 2462 lfs_freebuf(fs, bp); 2463 KERNEL_UNLOCK_LAST(curlwp); 2464 } 2465 2466 static void 2467 lfs_super_aiodone(struct buf *bp) 2468 { 2469 struct lfs *fs; 2470 2471 KERNEL_LOCK(1, curlwp); 2472 fs = bp->b_private; 2473 ASSERT_NO_SEGLOCK(fs); 2474 mutex_enter(&lfs_lock); 2475 fs->lfs_sbactive = 0; 2476 if (--fs->lfs_iocount <= 1) 2477 wakeup(&fs->lfs_iocount); 2478 wakeup(&fs->lfs_sbactive); 2479 mutex_exit(&lfs_lock); 2480 lfs_freebuf(fs, bp); 2481 KERNEL_UNLOCK_LAST(curlwp); 2482 } 2483 2484 static void 2485 lfs_cluster_aiodone(struct buf *bp) 2486 { 2487 struct lfs_cluster *cl; 2488 struct lfs *fs; 2489 struct buf *tbp, *fbp; 2490 struct vnode *vp, *devvp, *ovp; 2491 struct inode *ip; 2492 int error; 2493 2494 KERNEL_LOCK(1, curlwp); 2495 
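	/* The cluster descriptor set up by lfs_newclusterbuf() rides in b_private. */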
2496 error = bp->b_error; 2497 cl = bp->b_private; 2498 fs = cl->fs; 2499 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 2500 ASSERT_NO_SEGLOCK(fs); 2501 2502 /* Put the pages back, and release the buffer */ 2503 while (cl->bufcount--) { 2504 tbp = cl->bpp[cl->bufcount]; 2505 KASSERT(tbp->b_cflags & BC_BUSY); 2506 if (error) { 2507 tbp->b_error = error; 2508 } 2509 2510 /* 2511 * We're done with tbp. If it has not been re-dirtied since 2512 * the cluster was written, free it. Otherwise, keep it on 2513 * the locked list to be written again. 2514 */ 2515 vp = tbp->b_vp; 2516 2517 tbp->b_flags &= ~B_GATHERED; 2518 2519 LFS_BCLEAN_LOG(fs, tbp); 2520 2521 mutex_enter(&bufcache_lock); 2522 if (tbp->b_iodone == NULL) { 2523 KASSERT(tbp->b_flags & B_LOCKED); 2524 bremfree(tbp); 2525 if (vp) { 2526 mutex_enter(&vp->v_interlock); 2527 reassignbuf(tbp, vp); 2528 mutex_exit(&vp->v_interlock); 2529 } 2530 tbp->b_flags |= B_ASYNC; /* for biodone */ 2531 } 2532 2533 if (((tbp->b_flags | tbp->b_oflags) & 2534 (B_LOCKED | BO_DELWRI)) == B_LOCKED) 2535 LFS_UNLOCK_BUF(tbp); 2536 2537 if (tbp->b_oflags & BO_DONE) { 2538 DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n", 2539 cl->bufcount, (long)tbp->b_flags)); 2540 } 2541 2542 if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) { 2543 /* 2544 * A buffer from the page daemon. 2545 * We use the same iodone as it does, 2546 * so we must manually disassociate its 2547 * buffers from the vp. 2548 */ 2549 if ((ovp = tbp->b_vp) != NULL) { 2550 /* This is just silly */ 2551 mutex_enter(&ovp->v_interlock); 2552 brelvp(tbp); 2553 mutex_exit(&ovp->v_interlock); 2554 tbp->b_vp = vp; 2555 tbp->b_objlock = &vp->v_interlock; 2556 } 2557 /* Put it back the way it was */ 2558 tbp->b_flags |= B_ASYNC; 2559 /* Master buffers have BC_AGE */ 2560 if (tbp->b_private == tbp) 2561 tbp->b_flags |= BC_AGE; 2562 } 2563 mutex_exit(&bufcache_lock); 2564 2565 biodone(tbp); 2566 2567 /* 2568 * If this is the last block for this vnode, but 2569 * there are other blocks on its dirty list, 2570 * set IN_MODIFIED/IN_CLEANING depending on what 2571 * sort of block. Only do this for our mount point, 2572 * not for, e.g., inode blocks that are attached to 2573 * the devvp. 2574 * XXX KS - Shouldn't we set *both* if both types 2575 * of blocks are present (traverse the dirty list?) 
2576 */ 2577 mutex_enter(&lfs_lock); 2578 mutex_enter(&vp->v_interlock); 2579 if (vp != devvp && vp->v_numoutput == 0 && 2580 (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) { 2581 ip = VTOI(vp); 2582 DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n", 2583 ip->i_number)); 2584 if (LFS_IS_MALLOC_BUF(fbp)) 2585 LFS_SET_UINO(ip, IN_CLEANING); 2586 else 2587 LFS_SET_UINO(ip, IN_MODIFIED); 2588 } 2589 cv_broadcast(&vp->v_cv); 2590 mutex_exit(&vp->v_interlock); 2591 mutex_exit(&lfs_lock); 2592 } 2593 2594 /* Fix up the cluster buffer, and release it */ 2595 if (cl->flags & LFS_CL_MALLOC) 2596 lfs_free(fs, bp->b_data, LFS_NB_CLUSTER); 2597 putiobuf(bp); 2598 2599 /* Note i/o done */ 2600 if (cl->flags & LFS_CL_SYNC) { 2601 if (--cl->seg->seg_iocount == 0) 2602 wakeup(&cl->seg->seg_iocount); 2603 } 2604 mutex_enter(&lfs_lock); 2605 #ifdef DIAGNOSTIC 2606 if (fs->lfs_iocount == 0) 2607 panic("lfs_cluster_aiodone: zero iocount"); 2608 #endif 2609 if (--fs->lfs_iocount <= 1) 2610 wakeup(&fs->lfs_iocount); 2611 mutex_exit(&lfs_lock); 2612 2613 KERNEL_UNLOCK_LAST(curlwp); 2614 2615 pool_put(&fs->lfs_bpppool, cl->bpp); 2616 cl->bpp = NULL; 2617 pool_put(&fs->lfs_clpool, cl); 2618 } 2619 2620 static void 2621 lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *)) 2622 { 2623 /* reset b_iodone for when this is a single-buf i/o. */ 2624 bp->b_iodone = aiodone; 2625 2626 workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL); 2627 } 2628 2629 static void 2630 lfs_cluster_callback(struct buf *bp) 2631 { 2632 2633 lfs_generic_callback(bp, lfs_cluster_aiodone); 2634 } 2635 2636 void 2637 lfs_supercallback(struct buf *bp) 2638 { 2639 2640 lfs_generic_callback(bp, lfs_super_aiodone); 2641 } 2642 2643 /* 2644 * The only buffers that are going to hit these functions are the 2645 * segment write blocks, or the segment summaries, or the superblocks. 2646 * 2647 * All of the above are created by lfs_newbuf, and so do not need to be 2648 * released via brelse. 2649 */ 2650 void 2651 lfs_callback(struct buf *bp) 2652 { 2653 2654 lfs_generic_callback(bp, lfs_free_aiodone); 2655 } 2656 2657 /* 2658 * Shellsort (diminishing increment sort) from Data Structures and 2659 * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290; 2660 * see also Knuth Vol. 3, page 84. The increments are selected from 2661 * formula (8), page 95. Roughly O(N^3/2). 2662 */ 2663 /* 2664 * This is our own private copy of shellsort because we want to sort 2665 * two parallel arrays (the array of buffer pointers and the array of 2666 * logical block numbers) simultaneously. Note that we cast the array 2667 * of logical block numbers to a unsigned in this routine so that the 2668 * negative block numbers (meta data blocks) sort AFTER the data blocks. 
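 *
 * For example, under a 32-bit unsigned comparison a data block at lbn 3
 * compares as 0x00000003 while an indirect block at lbn -12 compares as
 * 0xfffffff4, so the metadata block sorts last.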
2669 */ 2670 2671 void 2672 lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size) 2673 { 2674 static int __rsshell_increments[] = { 4, 1, 0 }; 2675 int incr, *incrp, t1, t2; 2676 struct buf *bp_temp; 2677 2678 #ifdef DEBUG 2679 incr = 0; 2680 for (t1 = 0; t1 < nmemb; t1++) { 2681 for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { 2682 if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) { 2683 /* dump before panic */ 2684 printf("lfs_shellsort: nmemb=%d, size=%d\n", 2685 nmemb, size); 2686 incr = 0; 2687 for (t1 = 0; t1 < nmemb; t1++) { 2688 const struct buf *bp = bp_array[t1]; 2689 2690 printf("bp[%d]: lbn=%" PRIu64 ", size=%" 2691 PRIu64 "\n", t1, 2692 (uint64_t)bp->b_bcount, 2693 (uint64_t)bp->b_lblkno); 2694 printf("lbns:"); 2695 for (t2 = 0; t2 * size < bp->b_bcount; 2696 t2++) { 2697 printf(" %" PRId32, 2698 lb_array[incr++]); 2699 } 2700 printf("\n"); 2701 } 2702 panic("lfs_shellsort: inconsistent input"); 2703 } 2704 } 2705 } 2706 #endif 2707 2708 for (incrp = __rsshell_increments; (incr = *incrp++) != 0;) 2709 for (t1 = incr; t1 < nmemb; ++t1) 2710 for (t2 = t1 - incr; t2 >= 0;) 2711 if ((u_int32_t)bp_array[t2]->b_lblkno > 2712 (u_int32_t)bp_array[t2 + incr]->b_lblkno) { 2713 bp_temp = bp_array[t2]; 2714 bp_array[t2] = bp_array[t2 + incr]; 2715 bp_array[t2 + incr] = bp_temp; 2716 t2 -= incr; 2717 } else 2718 break; 2719 2720 /* Reform the list of logical blocks */ 2721 incr = 0; 2722 for (t1 = 0; t1 < nmemb; t1++) { 2723 for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) { 2724 lb_array[incr++] = bp_array[t1]->b_lblkno + t2; 2725 } 2726 } 2727 } 2728 2729 /* 2730 * Call vget with LK_NOWAIT. If we are the one who holds VI_XLOCK, 2731 * however, we must press on. Just fake success in that case. 2732 */ 2733 int 2734 lfs_vref(struct vnode *vp) 2735 { 2736 int error; 2737 struct lfs *fs; 2738 2739 KASSERT(mutex_owned(&vp->v_interlock)); 2740 2741 fs = VTOI(vp)->i_lfs; 2742 2743 ASSERT_MAYBE_SEGLOCK(fs); 2744 2745 /* 2746 * If we return 1 here during a flush, we risk vinvalbuf() not 2747 * being able to flush all of the pages from this vnode, which 2748 * will cause it to panic. So, return 0 if a flush is in progress. 2749 */ 2750 error = vget(vp, LK_NOWAIT | LK_INTERLOCK); 2751 if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) { 2752 ++fs->lfs_flushvp_fakevref; 2753 return 0; 2754 } 2755 return error; 2756 } 2757 2758 /* 2759 * This is vrele except that we do not want to VOP_INACTIVE this vnode. We 2760 * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end. 2761 */ 2762 void 2763 lfs_vunref(struct vnode *vp) 2764 { 2765 struct lfs *fs; 2766 2767 fs = VTOI(vp)->i_lfs; 2768 ASSERT_MAYBE_SEGLOCK(fs); 2769 2770 /* 2771 * Analogous to lfs_vref, if the node is flushing, fake it. 2772 */ 2773 if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) { 2774 --fs->lfs_flushvp_fakevref; 2775 return; 2776 } 2777 2778 /* does not call inactive */ 2779 mutex_enter(&vp->v_interlock); 2780 vrelel(vp, VRELEL_NOINACTIVE); 2781 } 2782 2783 /* 2784 * We use this when we have vnodes that were loaded in solely for cleaning. 2785 * There is no reason to believe that these vnodes will be referenced again 2786 * soon, since the cleaning process is unrelated to normal filesystem 2787 * activity. Putting cleaned vnodes at the tail of the list has the effect 2788 * of flushing the vnode LRU. So, put vnodes that were loaded only for 2789 * cleaning at the head of the list, instead. 
2790 */ 2791 void 2792 lfs_vunref_head(struct vnode *vp) 2793 { 2794 2795 ASSERT_SEGLOCK(VTOI(vp)->i_lfs); 2796 2797 /* does not call inactive, inserts non-held vnode at head of freelist */ 2798 mutex_enter(&vp->v_interlock); 2799 vrelel(vp, VRELEL_NOINACTIVE | VRELEL_ONHEAD); 2800 } 2801 2802 2803 /* 2804 * Set up an FINFO entry for a new file. The fip pointer is assumed to 2805 * point at uninitialized space. 2806 */ 2807 void 2808 lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers) 2809 { 2810 struct segment *sp = fs->lfs_sp; 2811 2812 KASSERT(vers > 0); 2813 2814 if (sp->seg_bytes_left < fs->lfs_bsize || 2815 sp->sum_bytes_left < sizeof(struct finfo)) 2816 (void) lfs_writeseg(fs, fs->lfs_sp); 2817 2818 sp->sum_bytes_left -= FINFOSIZE; 2819 ++((SEGSUM *)(sp->segsum))->ss_nfinfo; 2820 sp->fip->fi_nblocks = 0; 2821 sp->fip->fi_ino = ino; 2822 sp->fip->fi_version = vers; 2823 } 2824 2825 /* 2826 * Release the FINFO entry, either clearing out an unused entry or 2827 * advancing us to the next available entry. 2828 */ 2829 void 2830 lfs_release_finfo(struct lfs *fs) 2831 { 2832 struct segment *sp = fs->lfs_sp; 2833 2834 if (sp->fip->fi_nblocks != 0) { 2835 sp->fip = (FINFO*)((char *)sp->fip + FINFOSIZE + 2836 sizeof(int32_t) * sp->fip->fi_nblocks); 2837 sp->start_lbp = &sp->fip->fi_blocks[0]; 2838 } else { 2839 sp->sum_bytes_left += FINFOSIZE; 2840 --((SEGSUM *)(sp->segsum))->ss_nfinfo; 2841 } 2842 } 2843
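
/*
 * Editor's note: the following sketch is NOT part of lfs_segment.c.  It is
 * a minimal, stand-alone, user-space illustration (all names here are made
 * up) of the parallel-array ordering that lfs_shellsort() performs above:
 * buffer headers are ordered by their logical block number compared as an
 * unsigned quantity, so that negative (metadata) block numbers sort after
 * the data blocks, and the flat array of per-fragment block numbers is then
 * rebuilt from the sorted buffers.  qsort() stands in for the shellsort;
 * the bookkeeping is the point of the example.
 */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

struct fakebuf {			/* stand-in for struct buf */
	int32_t	fb_lblkno;		/* first logical block in the buffer */
	int	fb_bcount;		/* size in bytes */
};

static int
cmp_unsigned_lbn(const void *a, const void *b)
{
	uint32_t la = (uint32_t)(*(struct fakebuf * const *)a)->fb_lblkno;
	uint32_t lb = (uint32_t)(*(struct fakebuf * const *)b)->fb_lblkno;

	return (la > lb) - (la < lb);
}

int
main(void)
{
	/* A data buffer at lbn 3, an indirect block at lbn -12, data at lbn 0. */
	struct fakebuf bufs[] = { { 3, 8192 }, { -12, 4096 }, { 0, 12288 } };
	struct fakebuf *bpp[] = { &bufs[0], &bufs[1], &bufs[2] };
	const int fsize = 4096;		/* fragment size, like lfs_fsize */
	int32_t lbns[16];
	int i, j, n = 0;

	/* Order the buffer headers by unsigned logical block number. */
	qsort(bpp, 3, sizeof(bpp[0]), cmp_unsigned_lbn);

	/* Rebuild the parallel lbn array, one entry per fragment-sized piece. */
	for (i = 0; i < 3; i++)
		for (j = 0; j * fsize < bpp[i]->fb_bcount; j++)
			lbns[n++] = bpp[i]->fb_lblkno + j;

	for (i = 0; i < n; i++)
		printf("%" PRId32 " ", lbns[i]);
	printf("\n");	/* prints: 0 1 2 3 4 -12  (metadata block last) */
	return 0;
}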