/*	$NetBSD: lfs_segment.c,v 1.40 2000/01/19 00:03:04 perseant Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_segment.c	8.10 (Berkeley) 6/10/95
 */

#define ivndebug(vp,str) printf("ino %d: %s\n",VTOI(vp)->i_number,(str))

#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

extern int count_lock_queue __P((void));
extern struct simplelock vnode_free_list_slock;		/* XXX */

/*
 * Determine if it's OK to start a partial in this segment, or if we need
 * to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
	1 << (fs)->lfs_fsbtodb)

void	 lfs_callback __P((struct buf *));
int	 lfs_gather __P((struct lfs *, struct segment *,
	     struct vnode *, int (*) __P((struct lfs *, struct buf *))));
int	 lfs_gatherblock __P((struct segment *, struct buf *, int *));
void	 lfs_iset __P((struct inode *, ufs_daddr_t, time_t));
int	 lfs_match_fake __P((struct lfs *, struct buf *));
int	 lfs_match_data __P((struct lfs *, struct buf *));
int	 lfs_match_dindir __P((struct lfs *, struct buf *));
int	 lfs_match_indir __P((struct lfs *, struct buf *));
int	 lfs_match_tindir __P((struct lfs *, struct buf *));
void	 lfs_newseg __P((struct lfs *));
void	 lfs_shellsort __P((struct buf **, ufs_daddr_t *, register int));
void	 lfs_supercallback __P((struct buf *));
void	 lfs_updatemeta __P((struct segment *));
int	 lfs_vref __P((struct vnode *));
void	 lfs_vunref __P((struct vnode *));
void	 lfs_writefile __P((struct lfs *, struct segment *, struct vnode *));
int	 lfs_writeinode __P((struct lfs *, struct segment *, struct inode *));
int	 lfs_writeseg __P((struct lfs *, struct segment *));
void	 lfs_writesuper __P((struct lfs *, daddr_t));
int	 lfs_writevnodes __P((struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops));

int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */
int	lfs_writeindir = 1;		/* whether to flush indir on non-ckp */
int	lfs_clean_vnhead = 0;		/* Allow freeing to head of vn list */
int	lfs_dirvcount = 0;		/* # active dirops */

/* Statistics Counters */
int lfs_dostats = 1;
struct lfs_stats lfs_stats;

/* op values to lfs_writevnodes */
#define	VN_REG		0
#define	VN_DIROP	1
#define	VN_EMPTY	2
#define	VN_CLEAN	3

#define LFS_MAX_ACTIVE	10

/*
 * XXX KS - Set modification time on the Ifile, so the cleaner can
 * read the fs mod time off of it.  We don't set IN_UPDATE here,
 * since we don't really need this to be flushed to disk (and in any
 * case that wouldn't happen to the Ifile until we checkpoint).
 */
void
lfs_imtime(fs)
	struct lfs *fs;
{
	struct timespec ts;
	struct inode *ip;

	TIMEVAL_TO_TIMESPEC(&time, &ts);
	ip = VTOI(fs->lfs_ivnode);
	ip->i_ffs_mtime = ts.tv_sec;
	ip->i_ffs_mtimensec = ts.tv_nsec;
}

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

#define SET_FLUSHING(fs,vp) (fs)->lfs_flushvp = (vp)
#define IS_FLUSHING(fs,vp)  ((fs)->lfs_flushvp == (vp))
#define CLR_FLUSHING(fs,vp) (fs)->lfs_flushvp = NULL

int
lfs_vflush(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct buf *bp, *nbp, *tbp, *tnbp;
	int error, s;

	ip = VTOI(vp);
	fs = VFSTOUFS(vp->v_mount)->um_lfs;

	if (ip->i_flag & IN_CLEANING) {
#ifdef DEBUG_LFS
		ivndebug(vp,"vflush/in_cleaning");
#endif
		ip->i_flag &= ~IN_CLEANING;
		if (ip->i_flag & IN_MODIFIED) {
			fs->lfs_uinodes--;
		} else
			ip->i_flag |= IN_MODIFIED;
		/*
		 * Toss any cleaning buffers that have real counterparts
		 * to avoid losing new data.
		 */
		s = splbio();
		for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if (bp->b_flags & B_CALL) {
				for (tbp = vp->v_dirtyblkhd.lh_first; tbp;
				    tbp = tnbp)
				{
					tnbp = tbp->b_vnbufs.le_next;
					if (tbp->b_vp == bp->b_vp
					   && tbp->b_lblkno == bp->b_lblkno
					   && tbp != bp)
					{
						lfs_freebuf(bp);
						/* bp is gone; don't free it twice */
						break;
					}
				}
			}
		}
		splx(s);
	}

	/* If the node is being written, wait until that is done */
	if (WRITEINPROG(vp)) {
#ifdef DEBUG_LFS
		ivndebug(vp,"vflush/writeinprog");
#endif
		tsleep(vp, PRIBIO+1, "lfs_vw", 0);
	}

	/* Protect against VXLOCK deadlock in vinvalbuf() */
	lfs_seglock(fs, SEGM_SYNC);

	/* If we're supposed to flush a freed inode, just toss it */
	/* XXX - seglock, so these buffers can't be gathered, right? */
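	/*
	 * What follows discards, rather than writes, the buffers of an
	 * inode that has already been freed: buffers created by the
	 * cleaner (B_CALL) are freed outright, and ordinary dirty
	 * buffers are pulled off the locked queue and released.
	 */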
	if (ip->i_ffs_mode == 0) {
		printf("lfs_vflush: ino %d is freed, not flushing\n",
			ip->i_number);
		s = splbio();
		for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			/* Copied from lfs_writeseg */
			if (bp->b_flags & B_CALL) {
				/* if B_CALL, it was created with newbuf */
				lfs_freebuf(bp);
			} else {
				bremfree(bp);
				bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
						 B_LOCKED | B_GATHERED);
				bp->b_flags |= B_DONE;
				reassignbuf(bp, vp);
				brelse(bp);
			}
		}
		splx(s);
		if (ip->i_flag & IN_CLEANING)
			fs->lfs_uinodes--;
		if (ip->i_flag & IN_MODIFIED)
			fs->lfs_uinodes--;
		ip->i_flag &= ~(IN_MODIFIED|IN_UPDATE|IN_ACCESS|IN_CHANGE|IN_CLEANING);
		printf("lfs_vflush: done not flushing ino %d\n",
			ip->i_number);
		lfs_segunlock(fs);
		return 0;
	}

	SET_FLUSHING(fs,vp);
	if (fs->lfs_nactive > LFS_MAX_ACTIVE) {
		error = lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP);
		CLR_FLUSHING(fs,vp);
		lfs_segunlock(fs);
		return error;
	}
	sp = fs->lfs_sp;

	if (vp->v_dirtyblkhd.lh_first == NULL) {
		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
	} else if ((ip->i_flag & IN_CLEANING) &&
		   (fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
#ifdef DEBUG_LFS
		ivndebug(vp,"vflush/clean");
#endif
		lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
	} else if (lfs_dostats) {
		if (vp->v_dirtyblkhd.lh_first ||
		    (VTOI(vp)->i_flag & (IN_MODIFIED|IN_UPDATE|IN_ACCESS|IN_CHANGE|IN_CLEANING)))
			++lfs_stats.vflush_invoked;
#ifdef DEBUG_LFS
		ivndebug(vp,"vflush");
#endif
	}

#ifdef DIAGNOSTIC
	/* XXX KS This actually can happen right now, though it shouldn't(?) */
	if (vp->v_flag & VDIROP) {
		printf("lfs_vflush: flushing VDIROP, this shouldn't be\n");
		/* panic("VDIROP being flushed...this can't happen"); */
	}
	if (vp->v_usecount < 0) {
		printf("usecount=%ld\n", vp->v_usecount);
		panic("lfs_vflush: usecount<0");
	}
#endif

	do {
		do {
			if (vp->v_dirtyblkhd.lh_first != NULL)
				lfs_writefile(fs, sp, vp);
		} while (lfs_writeinode(fs, sp, ip));
	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

	if (lfs_dostats) {
		++lfs_stats.nwrites;
		if (sp->seg_flags & SEGM_SYNC)
			++lfs_stats.nsync_writes;
		if (sp->seg_flags & SEGM_CKP)
			++lfs_stats.ncheckpoints;
	}
	lfs_segunlock(fs);

	CLR_FLUSHING(fs,vp);
	return (0);
}

#ifdef DEBUG_LFS_VERBOSE
# define vndebug(vp,str) if (VTOI(vp)->i_flag & IN_CLEANING) printf("not writing ino %d because %s (op %d)\n", VTOI(vp)->i_number, (str), op)
#else
# define vndebug(vp,str)
#endif

int
lfs_writevnodes(fs, mp, sp, op)
	struct lfs *fs;
	struct mount *mp;
	struct segment *sp;
	int op;
{
	struct inode *ip;
	struct vnode *vp;
	int inodes_written = 0, only_cleaning;

#ifndef LFS_NO_BACKVP_HACK
	/* BEGIN HACK */
#define	VN_OFFSET	(((caddr_t)&vp->v_mntvnodes.le_next) - (caddr_t)vp)
#define	BACK_VP(VP)	((struct vnode *)(((caddr_t)VP->v_mntvnodes.le_prev) - VN_OFFSET))
#define	BEG_OF_VLIST	((struct vnode *)(((caddr_t)&mp->mnt_vnodelist.lh_first) - VN_OFFSET))

	/* Find last vnode. */
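	/*
	 * Walk the vnode list from its tail back toward its head; since
	 * new vnodes are inserted at the head of the list, this should
	 * visit the oldest vnodes first.
	 */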
loop:	for (vp = mp->mnt_vnodelist.lh_first;
	     vp && vp->v_mntvnodes.le_next != NULL;
	     vp = vp->v_mntvnodes.le_next);
	for (; vp && vp != BEG_OF_VLIST; vp = BACK_VP(vp)) {
#else
loop:
	for (vp = mp->mnt_vnodelist.lh_first;
	     vp != NULL;
	     vp = vp->v_mntvnodes.le_next) {
#endif
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp)
			goto loop;

		ip = VTOI(vp);
		if ((op == VN_DIROP && !(vp->v_flag & VDIROP)) ||
		    (op != VN_DIROP && op != VN_CLEAN && (vp->v_flag & VDIROP))) {
			vndebug(vp,"dirop");
			continue;
		}

		if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first) {
			vndebug(vp,"empty");
			continue;
		}

		if (vp->v_type == VNON) {
			continue;
		}

		if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
		   && vp != fs->lfs_flushvp
		   && !(ip->i_flag & IN_CLEANING)) {
			vndebug(vp,"cleaning");
			continue;
		}

		if (lfs_vref(vp)) {
			vndebug(vp,"vref");
			continue;
		}

#if 0 /* XXX KS - if we skip the ifile, things could go badly for us. */
		if (WRITEINPROG(vp)) {
			lfs_vunref(vp);
#ifdef DEBUG_LFS
			ivndebug(vp,"writevnodes/writeinprog");
#endif
			continue;
		}
#endif
		only_cleaning = 0;
		/*
		 * Write the inode/file if dirty and it's not the IFILE.
		 */
		if ((ip->i_flag &
		     (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE | IN_CLEANING) ||
		     vp->v_dirtyblkhd.lh_first != NULL))
		{
			only_cleaning = ((ip->i_flag & (IN_ACCESS|IN_CHANGE|IN_MODIFIED|IN_UPDATE|IN_CLEANING)) == IN_CLEANING);

			if (ip->i_number != LFS_IFILE_INUM
			   && vp->v_dirtyblkhd.lh_first != NULL)
			{
				lfs_writefile(fs, sp, vp);
			}
			if (vp->v_dirtyblkhd.lh_first != NULL) {
				if (WRITEINPROG(vp)) {
#ifdef DEBUG_LFS
					ivndebug(vp,"writevnodes/write2");
#endif
				} else if (!(ip->i_flag & (IN_ACCESS|IN_CHANGE|IN_MODIFIED|IN_UPDATE|IN_CLEANING))) {
#ifdef DEBUG_LFS
					printf("<%d>", ip->i_number);
#endif
					ip->i_flag |= IN_MODIFIED;
					++fs->lfs_uinodes;
				}
			}
			(void) lfs_writeinode(fs, sp, ip);
			inodes_written++;
		}

		if (lfs_clean_vnhead && only_cleaning)
			lfs_vunref_head(vp);
		else
			lfs_vunref(vp);
	}
	return inodes_written;
}

int
lfs_segwrite(mp, flags)
	struct mount *mp;
	int flags;			/* Do a checkpoint. */
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	ufs_daddr_t ibno;
	int do_ckp, error, i;
	int writer_set = 0;
	int need_unlock = 0;

	fs = VFSTOUFS(mp)->um_lfs;

	lfs_imtime(fs);

	/*
	 * If we are not the cleaner, and we have fewer than MIN_FREE_SEGS
	 * clean segments, wait until the cleaner writes.
	 */
	if (!(flags & SEGM_CLEAN)
	   && (!fs->lfs_seglock || !(fs->lfs_sp->seg_flags & SEGM_CLEAN)))
	{
		do {
			if (fs->lfs_nclean <= MIN_FREE_SEGS
			   || fs->lfs_avail <= 0)
			{
				wakeup(&lfs_allclean_wakeup);
				wakeup(&fs->lfs_nextseg);
				error = tsleep(&fs->lfs_avail, PRIBIO + 1,
					       "lfs_avail", 0);
				if (error) {
					return (error);
				}
			}
		} while (fs->lfs_nclean <= MIN_FREE_SEGS || fs->lfs_avail <= 0);
	}

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
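	/*
	 * A checkpoint happens either because the caller asked for one
	 * (SEGM_CKP) or because, judging by lfs_nactive, more than
	 * LFS_MAX_ACTIVE segments appear to have been activated since
	 * the last checkpoint.
	 */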
	do_ckp = (flags & SEGM_CKP) || fs->lfs_nactive > LFS_MAX_ACTIVE;
	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
	sp = fs->lfs_sp;

	/*
	 * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
	 * in which case we have to flush *all* buffers off of this vnode.
	 * We don't care about other nodes, but write any non-dirop nodes
	 * anyway in anticipation of another getnewvnode().
	 *
	 * If we're cleaning we only write cleaning and ifile blocks, and
	 * no dirops, since otherwise we'd risk corruption in a crash.
	 */
	if (sp->seg_flags & SEGM_CLEAN)
		lfs_writevnodes(fs, mp, sp, VN_CLEAN);
	else {
		lfs_writevnodes(fs, mp, sp, VN_REG);
		if (!fs->lfs_dirops || !fs->lfs_flushvp) {
			while (fs->lfs_dirops)
				if ((error = tsleep(&fs->lfs_writer, PRIBIO + 1,
						"lfs writer", 0)))
				{
					free(sp->bpp, M_SEGMENT);
					free(sp, M_SEGMENT);
					return (error);
				}
			fs->lfs_writer++;
			writer_set = 1;
			lfs_writevnodes(fs, mp, sp, VN_DIROP);
			((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
		}
	}

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.
	 */
	if (do_ckp) {
		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
		     --ibno >= fs->lfs_cleansz; ) {
			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_segwrite: ifile read");
			segusep = (SEGUSE *)bp->b_data;
			for (i = fs->lfs_sepb; i--; segusep++)
				segusep->su_flags &= ~SEGUSE_ACTIVE;

			/* But the current segment is still ACTIVE */
			if (fs->lfs_curseg / fs->lfs_sepb == (ibno - fs->lfs_cleansz))
				((SEGUSE *)(bp->b_data))[fs->lfs_curseg % fs->lfs_sepb].su_flags |= SEGUSE_ACTIVE;
			error = VOP_BWRITE(bp);
		}
	}

	if (do_ckp || fs->lfs_doifile) {
	redo:
		vp = fs->lfs_ivnode;
		/*
		 * Depending on the circumstances of our calling, the ifile
		 * inode might be locked.  If it is, and if it is locked by
		 * us, we should VREF instead of vget here.
		 */
		need_unlock = 0;
		if (VOP_ISLOCKED(vp)
		   && vp->v_lock.lk_lockholder == curproc->p_pid) {
			VREF(vp);
		} else {
			while (vget(vp, LK_EXCLUSIVE))
				continue;
			need_unlock = 1;
		}
		ip = VTOI(vp);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			lfs_writefile(fs, sp, vp);
		(void)lfs_writeinode(fs, sp, ip);

		/* Only vput if we used vget() above. */
		if (need_unlock)
			vput(vp);
		else
			vrele(vp);

		if (lfs_writeseg(fs, sp) && do_ckp)
			goto redo;
	} else {
		(void) lfs_writeseg(fs, sp);
	}

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.
	 * At the moment, the user's process hangs around so we can
	 * sleep.
	 */
	fs->lfs_doifile = 0;
	if (writer_set && --fs->lfs_writer == 0)
		wakeup(&fs->lfs_dirops);

	if (lfs_dostats) {
		++lfs_stats.nwrites;
		if (sp->seg_flags & SEGM_SYNC)
			++lfs_stats.nsync_writes;
		if (sp->seg_flags & SEGM_CKP)
			++lfs_stats.ncheckpoints;
	}
	lfs_segunlock(fs);
	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
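/*
 * Each file written into the partial segment is described by a FINFO
 * record in the segment summary; sum_bytes_left and seg_bytes_left
 * track the space remaining for FINFO entries and data blocks
 * respectively, and the FINFO is backed out again at the bottom if
 * no blocks were actually gathered for this file.
 */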
void
lfs_writefile(fs, sp, vp)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
{
	struct buf *bp;
	struct finfo *fip;
	IFILE *ifp;

	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo))
		(void) lfs_writeseg(fs, sp);

	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(ufs_daddr_t);
	++((SEGSUM *)(sp->segsum))->ss_nfinfo;

	if (vp->v_flag & VDIROP)
		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);

	fip = sp->fip;
	fip->fi_nblocks = 0;
	fip->fi_ino = VTOI(vp)->i_number;
	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
	fip->fi_version = ifp->if_version;
	brelse(bp);

	if (sp->seg_flags & SEGM_CLEAN) {
		lfs_gather(fs, sp, vp, lfs_match_fake);
		/*
		 * For a file being flushed, we need to write *all* blocks.
		 * This means writing the cleaning blocks first, and then
		 * immediately following with any non-cleaning blocks.
		 * The same is true of the Ifile since checkpoints assume
		 * that all valid Ifile blocks are written.
		 */
		if (IS_FLUSHING(fs,vp) || VTOI(vp)->i_number == LFS_IFILE_INUM)
			lfs_gather(fs, sp, vp, lfs_match_data);
	} else
		lfs_gather(fs, sp, vp, lfs_match_data);

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 *
	 * We have to write them anyway, though, under two conditions: (1) the
	 * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
	 * checkpointing.
	 */
	if (lfs_writeindir
	   || IS_FLUSHING(fs,vp)
	   || (sp->seg_flags & SEGM_CKP))
	{
		lfs_gather(fs, sp, vp, lfs_match_indir);
		lfs_gather(fs, sp, vp, lfs_match_dindir);
		lfs_gather(fs, sp, vp, lfs_match_tindir);
	}
	fip = sp->fip;
	if (fip->fi_nblocks != 0) {
		sp->fip = (FINFO*)((caddr_t)fip + sizeof(struct finfo) +
				   sizeof(ufs_daddr_t) * (fip->fi_nblocks - 1));
		sp->start_lbp = &sp->fip->fi_blocks[0];
	} else {
		sp->sum_bytes_left += sizeof(FINFO) - sizeof(ufs_daddr_t);
		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
	}
}

int
lfs_writeinode(fs, sp, ip)
	struct lfs *fs;
	struct segment *sp;
	struct inode *ip;
{
	struct buf *bp, *ibp;
	IFILE *ifp;
	SEGUSE *sup;
	ufs_daddr_t daddr;
	ino_t ino;
	int error, i, ndx;
	int redo_ifile = 0;
	struct timespec ts;
	int gotblk = 0;

	if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE | IN_CLEANING)))
		return (0);

	/* Allocate a new inode block if necessary. */
	if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) && sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_bsize ||
		    sp->sum_bytes_left < sizeof(ufs_daddr_t))
			(void) lfs_writeseg(fs, sp);

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += fsbtodb(fs, 1);
		sp->ibp = *sp->cbpp++ =
			getblk(VTOI(fs->lfs_ivnode)->i_devvp, daddr, fs->lfs_bsize, 0, 0);
		gotblk++;

		/* Zero out inode numbers */
		for (i = 0; i < INOPB(fs); ++i)
			((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0;

		++sp->start_bpp;
		fs->lfs_avail -= fsbtodb(fs, 1);
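		/*
		 * Note: the disk addresses of inode blocks are recorded in
		 * the segment summary itself, filling in from the end of
		 * the summary block backwards (one ufs_daddr_t slot per
		 * inode block); the ndx computation below indexes that
		 * table.
		 */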
		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_bsize;
		sp->sum_bytes_left -= sizeof(ufs_daddr_t);
		ndx = LFS_SUMMARY_SIZE / sizeof(ufs_daddr_t) -
			sp->ninodes / INOPB(fs) - 1;
		((ufs_daddr_t *)(sp->segsum))[ndx] = daddr;
	}

	/* Update the inode times and copy the inode onto the inode page. */
	if (ip->i_flag & (IN_CLEANING|IN_MODIFIED))
		--fs->lfs_uinodes;
	TIMEVAL_TO_TIMESPEC(&time, &ts);
	LFS_ITIMES(ip, &ts, &ts, &ts);

	if (ip->i_flag & IN_CLEANING)
		ip->i_flag &= ~IN_CLEANING;
	else
		ip->i_flag &= ~(IN_ACCESS|IN_CHANGE|IN_MODIFIED|IN_UPDATE);

	/*
	 * If this is the Ifile, and we've already written the Ifile in this
	 * partial segment, just overwrite it (it's not on disk yet) and
	 * continue.
	 *
	 * XXX we know that the bp that we get the second time around has
	 * already been gathered.
	 */
	if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
		*(sp->idp) = ip->i_din.ffs_din;
		return 0;
	}

	bp = sp->ibp;
	((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] =
		ip->i_din.ffs_din;

	if (ip->i_number == LFS_IFILE_INUM) /* We know sp->idp == NULL */
		sp->idp = ((struct dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
	if (gotblk) {
		bp->b_flags |= B_LOCKED;
		brelse(bp);
	}

	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % INOPB(fs) == 0)
		sp->ibp = NULL;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = bp->b_blkno;
	} else {
		LFS_IENTRY(ifp, fs, ino, ibp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = bp->b_blkno;
#ifdef LFS_DEBUG_NEXTFREE
		if (ino > 3 && ifp->if_nextfree) {
			vprint("lfs_writeinode", ITOV(ip));
			printf("lfs_writeinode: updating free ino %d\n",
				ip->i_number);
		}
#endif
		error = VOP_BWRITE(ibp);
	}

	/*
	 * No need to update segment usage if there was no former inode address
	 * or if the last inode address is in the current partial segment.
	 */
	if (daddr != LFS_UNUSED_DADDR &&
	    !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) {
		LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes < DINODE_SIZE) {
			/* XXX -- Change to a panic. */
			printf("lfs_writeinode: negative bytes (segment %d short by %d)\n",
			       datosn(fs, daddr), (int)DINODE_SIZE - sup->su_nbytes);
			panic("lfs_writeinode: negative bytes");
			sup->su_nbytes = DINODE_SIZE;
		}
#endif
		sup->su_nbytes -= DINODE_SIZE;
		redo_ifile =
			(ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
		error = VOP_BWRITE(bp);
	}
	return (redo_ifile);
}

int
lfs_gatherblock(sp, bp, sptr)
	struct segment *sp;
	struct buf *bp;
	int *sptr;
{
	struct lfs *fs;
	int version;

	/*
	 * If full, finish this segment.  We may be doing I/O, so
	 * release and reacquire the splbio().
	 */
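	/*
	 * Each gathered block consumes one ufs_daddr_t slot in the
	 * current FINFO, hence the sum_bytes_left check below;
	 * seg_bytes_left guards the remaining data space in the segment.
	 */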
#ifdef DIAGNOSTIC
	if (sp->vp == NULL)
		panic("lfs_gatherblock: Null vp in segment");
#endif
	fs = sp->fs;
	if (sp->sum_bytes_left < sizeof(ufs_daddr_t) ||
	    sp->seg_bytes_left < bp->b_bcount) {
		if (sptr)
			splx(*sptr);
		lfs_updatemeta(sp);

		version = sp->fip->fi_version;
		(void) lfs_writeseg(fs, sp);

		sp->fip->fi_version = version;
		sp->fip->fi_ino = VTOI(sp->vp)->i_number;
		/* Add the current file to the segment summary. */
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
		sp->sum_bytes_left -=
			sizeof(struct finfo) - sizeof(ufs_daddr_t);

		if (sptr)
			*sptr = splbio();
		return (1);
	}

#ifdef DEBUG
	if (bp->b_flags & B_GATHERED) {
		printf("lfs_gatherblock: already gathered! Ino %d, lbn %d\n",
		       sp->fip->fi_ino, bp->b_lblkno);
		return (0);
	}
#endif
	/* Insert into the buffer list, update the FINFO block. */
	bp->b_flags |= B_GATHERED;
	*sp->cbpp++ = bp;
	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;

	sp->sum_bytes_left -= sizeof(ufs_daddr_t);
	sp->seg_bytes_left -= bp->b_bcount;
	return (0);
}

int
lfs_gather(fs, sp, vp, match)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	int (*match) __P((struct lfs *, struct buf *));
{
	struct buf *bp;
	int s, count = 0;

	sp->vp = vp;
	s = splbio();

#ifndef LFS_NO_BACKBUF_HACK
loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {
#else /* LFS_NO_BACKBUF_HACK */
	/* This is a hack to see if ordering the blocks in LFS makes a difference. */
# define	BUF_OFFSET	(((void *)&bp->b_vnbufs.le_next) - (void *)bp)
# define	BACK_BUF(BP)	((struct buf *)(((void *)BP->b_vnbufs.le_prev) - BUF_OFFSET))
# define	BEG_OF_LIST	((struct buf *)(((void *)&vp->v_dirtyblkhd.lh_first) - BUF_OFFSET))
	/* Find last buffer. */
loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL;
	     bp = bp->b_vnbufs.le_next);
	for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) {
#endif /* LFS_NO_BACKBUF_HACK */
		if ((bp->b_flags & (B_BUSY|B_GATHERED)) || !match(fs, bp))
			continue;
		if (vp->v_type == VBLK) {
			/* For block devices, just write the blocks. */
			/* XXX Do we really need to even do this? */
#ifdef DEBUG_LFS
			if (count == 0)
				printf("BLK(");
			printf(".");
#endif
			/* Get the block before bwrite, so we don't corrupt the free list */
			bp->b_flags |= B_BUSY;
			bremfree(bp);
			bwrite(bp);
		} else {
#ifdef DIAGNOSTIC
			if (!(bp->b_flags & B_DELWRI))
				panic("lfs_gather: bp not B_DELWRI");
			if (!(bp->b_flags & B_LOCKED)) {
				printf("lfs_gather: lbn %d blk %d not B_LOCKED\n",
				       bp->b_lblkno, bp->b_blkno);
				VOP_PRINT(bp->b_vp);
				panic("lfs_gather: bp not B_LOCKED");
			}
#endif
			if (lfs_gatherblock(sp, bp, &s)) {
				goto loop;
			}
		}
		count++;
	}
	splx(s);
#ifdef DEBUG_LFS
	if (vp->v_type == VBLK && count)
		printf(")\n");
#endif
	lfs_updatemeta(sp);
	sp->vp = NULL;
	return count;
}

/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.
 */
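/*
 * This is the point at which blocks receive their final disk addresses:
 * each gathered block is assigned the next free address in the partial
 * segment, the inode or indirect block that referenced its old address
 * is updated, and the live-byte count of the segment that held the old
 * copy is decremented.
 */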
void
lfs_updatemeta(sp)
	struct segment *sp;
{
	SEGUSE *sup;
	struct buf *bp;
	struct lfs *fs;
	struct vnode *vp;
	struct indir a[NIADDR + 2], *ap;
	struct inode *ip;
	ufs_daddr_t daddr, lbn, off;
	int error, i, nblocks, num;

	vp = sp->vp;
	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
	if (nblocks < 0)
		panic("This is a bad thing\n");
	if (vp == NULL || nblocks == 0)
		return;

	/* Sort the blocks. */
	/*
	 * XXX KS - We have to sort even if the blocks come from the
	 * cleaner, because there might be other pending blocks on the
	 * same inode...and if we don't sort, and there are fragments
	 * present, blocks may be written in the wrong place.
	 */
	/* if (!(sp->seg_flags & SEGM_CLEAN)) */
	lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks);

	/*
	 * Record the length of the last block in case it's a fragment.
	 * If there are indirect blocks present, they sort last.  An
	 * indirect block will be lfs_bsize and its presence indicates
	 * that you cannot have fragments.
	 */
	sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount;

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	fs = sp->fs;
	for (i = nblocks; i--; ++sp->start_bpp) {
		lbn = *sp->start_lbp++;

		(*sp->start_bpp)->b_blkno = off = fs->lfs_offset;
		if ((*sp->start_bpp)->b_blkno == (*sp->start_bpp)->b_lblkno) {
			printf("lfs_updatemeta: ino %d blk %d has same lbn and daddr\n",
			       VTOI(vp)->i_number, off);
		}
		fs->lfs_offset +=
			fragstodb(fs, numfrags(fs, (*sp->start_bpp)->b_bcount));
		error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL);
		if (error)
			panic("lfs_updatemeta: ufs_bmaparray %d", error);
		ip = VTOI(vp);
		switch (num) {
		case 0:
			ip->i_ffs_db[lbn] = off;
			break;
		case 1:
			ip->i_ffs_ib[a[0].in_off] = off;
			break;
		default:
			ap = &a[num - 1];
			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_updatemeta: bread bno %d",
				      ap->in_lbn);
			/*
			 * Bread may create a new (indirect) block which needs
			 * to get counted for the inode.
			 */
			if (/* bp->b_blkno == -1 && */
			    !(bp->b_flags & (B_DELWRI|B_DONE))) {
				ip->i_ffs_blocks += fsbtodb(fs, 1);
				fs->lfs_bfree -= fragstodb(fs, fs->lfs_frag);
			}
			((ufs_daddr_t *)bp->b_data)[ap->in_off] = off;
			VOP_BWRITE(bp);
		}
		/* Update segment usage information. */
		if (daddr != UNASSIGNED && !(daddr >= fs->lfs_lastpseg && daddr <= off)) {
			LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
			if (sup->su_nbytes < (*sp->start_bpp)->b_bcount) {
				/* XXX -- Change to a panic. */
				printf("lfs_updatemeta: negative bytes (segment %d short by %ld)\n",
				       datosn(fs, daddr), (*sp->start_bpp)->b_bcount - sup->su_nbytes);
				printf("lfs_updatemeta: ino %d, lbn %d, addr = %x\n",
				       VTOI(sp->vp)->i_number, (*sp->start_bpp)->b_lblkno, daddr);
				panic("lfs_updatemeta: negative bytes");
				sup->su_nbytes = (*sp->start_bpp)->b_bcount;
			}
#endif
			sup->su_nbytes -= (*sp->start_bpp)->b_bcount;
			error = VOP_BWRITE(bp);
		}
	}
}

/*
 * Start a new segment.
 */
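/*
 * A partial segment is laid out as a summary block followed by the
 * gathered data and inode blocks.  lfs_initseg reserves the summary
 * block at lfs_offset, skipping over a superblock slot if the segment
 * contains one, and returns nonzero if it had to advance to a brand
 * new segment.
 */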
int
lfs_initseg(fs)
	struct lfs *fs;
{
	struct segment *sp;
	SEGUSE *sup;
	SEGSUM *ssp;
	struct buf *bp;
	int repeat;

	sp = fs->lfs_sp;

	repeat = 0;
	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* Wake up any cleaning procs waiting on this file system. */
		wakeup(&lfs_allclean_wakeup);
		wakeup(&fs->lfs_nextseg);
		lfs_newseg(fs);
		repeat = 1;
		fs->lfs_offset = fs->lfs_curseg;
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE;
		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += LFS_SBPAD / DEV_BSIZE;
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp);
	} else {
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = (fs->lfs_dbpseg -
				      (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE;
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	sp->fs = fs;
	sp->ibp = NULL;
	sp->idp = NULL;
	sp->ninodes = 0;

	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
	sp->cbpp = sp->bpp;
	*sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp,
			       fs->lfs_offset, LFS_SUMMARY_SIZE);
	sp->segsum = (*sp->cbpp)->b_data;
	bzero(sp->segsum, LFS_SUMMARY_SIZE);
	sp->start_bpp = ++sp->cbpp;
	fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE;

	/* Set pointer to SEGSUM, initialize it. */
	ssp = sp->segsum;
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;
	ssp->ss_magic = SS_MAGIC;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)((caddr_t)sp->segsum + sizeof(SEGSUM));
	sp->fip->fi_nblocks = 0;
	sp->start_lbp = &sp->fip->fi_blocks[0];
	sp->fip->fi_lastlength = 0;

	sp->seg_bytes_left -= LFS_SUMMARY_SIZE;
	sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM);

	return (repeat);
}

/*
 * Return the next segment to write.
 */
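/*
 * lfs_newseg makes the previously chosen lfs_nextseg the current
 * segment, then scans the segment table (wrapping around) for the next
 * clean segment to use after that, updating the cleanerinfo block's
 * clean/dirty counts along the way.
 */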
void
lfs_newseg(fs)
	struct lfs *fs;
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, isdirty, sn;

	LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp);
	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
	sup->su_nbytes = 0;
	sup->su_nsums = 0;
	sup->su_ninos = 0;
	(void) VOP_BWRITE(bp);

	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	fs->lfs_nclean = cip->clean;
	(void) VOP_BWRITE(bp);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
		sn = (sn + 1) % fs->lfs_nseg;
		if (sn == curseg)
			panic("lfs_nextseg: no clean segments");
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp);
		if (!isdirty)
			break;
	}

	++fs->lfs_nactive;
	fs->lfs_nextseg = sntoda(fs, sn);
	if (lfs_dostats) {
		++lfs_stats.segsused;
	}
}

int
lfs_writeseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	extern int locked_queue_count;
	extern long locked_queue_bytes;
	struct buf **bpp, *bp, *cbp;
	SEGUSE *sup;
	SEGSUM *ssp;
	dev_t i_dev;
	u_long *datap, *dp;
	int do_again, i, nblocks, s;
#ifdef LFS_TRACK_IOS
	int j;
#endif
	int (*strategy)__P((void *));
	struct vop_strategy_args vop_strategy_a;
	u_short ninos;
	struct vnode *devvp;
	char *p;
	struct vnode *vn;
	struct inode *ip;
#if defined(DEBUG) && defined(LFS_PROPELLER)
	static int propeller;
	char propstring[4] = "-\\|/";

	printf("%c\b", propstring[propeller++]);
	if (propeller == 4)
		propeller = 0;
#endif

	/*
	 * If there are no buffers other than the segment summary to write
	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
	 * even if there aren't any buffers, you need to write the superblock.
	 */
	if ((nblocks = sp->cbpp - sp->bpp) == 1)
		return (0);

#ifdef DEBUG_LFS
	lfs_check_bpp(fs, sp, __FILE__, __LINE__);
#endif
	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	devvp = VTOI(fs->lfs_ivnode)->i_devvp;

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);

	/* Loop through all blocks, except the segment summary. */
	for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
		if ((*bpp)->b_vp != devvp)
			sup->su_nbytes += (*bpp)->b_bcount;
	}

	ssp = (SEGSUM *)sp->segsum;

	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
	sup->su_nbytes += ssp->ss_ninos * DINODE_SIZE;
	/* sup->su_nbytes += LFS_SUMMARY_SIZE; */
	sup->su_lastmod = time.tv_sec;
	sup->su_ninos += ninos;
	++sup->su_nsums;

	do_again = !(bp->b_flags & B_GATHERED);
	(void)VOP_BWRITE(bp);
	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 *
	 * XXX
	 * Fix this to do it inline, instead of malloc/copy.
	 */
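	/*
	 * Note that ss_datasum covers only the first word
	 * (sizeof(u_long)) of each block, gathered into datap below;
	 * this is why each buffer must stay B_BUSY between the checksum
	 * calculation and the actual write.
	 */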
	datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK);
	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
		if (((*++bpp)->b_flags & (B_CALL|B_INVAL)) == (B_CALL|B_INVAL)) {
			if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long)))
				panic("lfs_writeseg: copyin failed [1]: ino %d blk %d",
				      VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno);
		} else {
			if (!((*bpp)->b_flags & B_CALL)) {
				/*
				 * Before we record data for a checksum,
				 * make sure the data won't change in between
				 * the checksum calculation and the write,
				 * by marking the buffer B_BUSY.  It will
				 * be freed later by brelse().
				 */
			again:
				s = splbio();
				if ((*bpp)->b_flags & B_BUSY) {
#ifdef DEBUG
					printf("lfs_writeseg: avoiding potential data summary corruption for ino %d, lbn %d\n",
					       VTOI((*bpp)->b_vp)->i_number,
					       (*bpp)->b_lblkno);
#endif
					(*bpp)->b_flags |= B_WANTED;
					tsleep((*bpp), (PRIBIO + 1),
					       "lfs_writeseg", 0);
					splx(s);
					goto again;
				}
				(*bpp)->b_flags |= B_BUSY;
				splx(s);
			}
			*dp++ = ((u_long *)(*bpp)->b_data)[0];
		}
	}
	ssp->ss_create = time.tv_sec;
	ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long));
	ssp->ss_sumsum =
		cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum));
	free(datap, M_SEGMENT);
#ifdef DIAGNOSTIC
	if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE)
		panic("lfs_writeseg: No diskspace for summary");
#endif
	fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE);

	strategy = devvp->v_op[VOFFSET(vop_strategy)];

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we allocate memory in chunks, copy
	 * the buffers into the chunk and write the chunk.  CHUNKSIZE is the
	 * largest size I/O devices can handle.
	 * When the data is copied to the chunk, turn off the B_LOCKED bit
	 * and brelse the buffer (which will move them to the LRU list).  Add
	 * the B_CALL flag to the buffer header so we can count I/O's for the
	 * checkpoints and so we can release the allocated memory.
	 *
	 * XXX
	 * This should be removed if the new virtual memory system allows us to
	 * easily make the buffers contiguous in kernel memory and if that's
	 * fast enough.
	 */

#define CHUNKSIZE MAXPHYS

	if (devvp == NULL)
		panic("devvp is NULL");
	for (bpp = sp->bpp, i = nblocks; i;) {
		cbp = lfs_newbuf(devvp, (*bpp)->b_blkno, CHUNKSIZE);
		cbp->b_dev = i_dev;
		cbp->b_flags |= B_ASYNC | B_BUSY;
		cbp->b_bcount = 0;

#ifdef DIAGNOSTIC
		if (datosn(fs, (*bpp)->b_blkno + ((*bpp)->b_bcount - 1)/DEV_BSIZE) != datosn(fs, cbp->b_blkno)) {
			panic("lfs_writeseg: Segment overwrite");
		}
#endif

		s = splbio();
		if (fs->lfs_iocount >= LFS_THROTTLE) {
			tsleep(&fs->lfs_iocount, PRIBIO+1, "lfs throttle", 0);
		}
		++fs->lfs_iocount;
#ifdef LFS_TRACK_IOS
		for (j = 0; j < LFS_THROTTLE; j++) {
			if (fs->lfs_pending[j] == LFS_UNUSED_DADDR) {
				fs->lfs_pending[j] = cbp->b_blkno;
				break;
			}
		}
#endif /* LFS_TRACK_IOS */
		for (p = cbp->b_data; i && cbp->b_bcount < CHUNKSIZE; i--) {
			bp = *bpp;

			if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
				break;

			/*
			 * Fake buffers from the cleaner are marked as B_INVAL.
			 * We need to copy the data from user space rather than
			 * from the buffer indicated.
			 * XXX == what do I do on an error?
			 */
			if ((bp->b_flags & (B_CALL|B_INVAL)) == (B_CALL|B_INVAL)) {
				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
					panic("lfs_writeseg: copyin failed [2]");
			} else
				bcopy(bp->b_data, p, bp->b_bcount);
			p += bp->b_bcount;
			cbp->b_bcount += bp->b_bcount;
			if (bp->b_flags & B_LOCKED) {
				--locked_queue_count;
				locked_queue_bytes -= bp->b_bufsize;
			}
			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
					 B_LOCKED | B_GATHERED);
			vn = bp->b_vp;
			/* XXX - must clear B_NEEDCOMMIT before bp is freed below */
			if (bp->b_flags & B_NEEDCOMMIT) {
				bp->b_flags &= ~B_NEEDCOMMIT;
				wakeup(bp);
			}
			if (bp->b_flags & B_CALL) {
				/* if B_CALL, it was created with newbuf */
				lfs_freebuf(bp);
			} else {
				bremfree(bp);
				bp->b_flags |= B_DONE;
				if (vn)
					reassignbuf(bp, vn);
				brelse(bp);
			}

			bpp++;

			/*
			 * If this is the last block for this vnode, but
			 * there are other blocks on its dirty list,
			 * set IN_MODIFIED/IN_CLEANING depending on what
			 * sort of block.  Only do this for our mount point,
			 * not for, e.g., inode blocks that are attached to
			 * the devvp.
			 */
			if (i > 1 && vn && *bpp && (*bpp)->b_vp != vn
			   && (*bpp)->b_vp && (bp = vn->v_dirtyblkhd.lh_first) != NULL &&
			   vn->v_mount == fs->lfs_ivnode->v_mount)
			{
				ip = VTOI(vn);
#ifdef DEBUG_LFS
				printf("lfs_writeseg: marking ino %d\n", ip->i_number);
#endif
				if (!(ip->i_flag & (IN_CLEANING|IN_MODIFIED))) {
					fs->lfs_uinodes++;
					if (bp->b_flags & B_CALL)
						ip->i_flag |= IN_CLEANING;
					else
						ip->i_flag |= IN_MODIFIED;
				}
			}
			/* if (vn->v_dirtyblkhd.lh_first == NULL) */
			wakeup(vn);
		}
		++cbp->b_vp->v_numoutput;
		splx(s);
		/*
		 * XXXX This is a gross and disgusting hack.  Since these
		 * buffers are physically addressed, they hang off the
		 * device vnode (devvp).  As a result, they have no way
		 * of getting to the LFS superblock or lfs structure to
		 * keep track of the number of I/O's pending.  So, I am
		 * going to stuff the fs into the saveaddr field of
		 * the buffer (yuk).
		 */
		cbp->b_saveaddr = (caddr_t)fs;
		vop_strategy_a.a_desc = VDESC(vop_strategy);
		vop_strategy_a.a_bp = cbp;
		(strategy)(&vop_strategy_a);
	}
	/*
	 * XXX
	 * Vinvalbuf can move locked buffers off the locked queue
	 * and we have no way of knowing about this.  So, after
	 * doing a big write, we recalculate how many buffers are
	 * really still left on the locked queue.
	 */
	lfs_countlocked(&locked_queue_count, &locked_queue_bytes);
	wakeup(&locked_queue_count);
	if (lfs_dostats) {
		++lfs_stats.psegwrites;
		lfs_stats.blocktot += nblocks - 1;
		if (fs->lfs_sp->seg_flags & SEGM_SYNC)
			++lfs_stats.psyncwrites;
		if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
			++lfs_stats.pcleanwrites;
			lfs_stats.cleanblocks += nblocks - 1;
		}
	}
	return (lfs_initseg(fs) || do_again);
}

void
lfs_writesuper(fs, daddr)
	struct lfs *fs;
	daddr_t daddr;
{
	struct buf *bp;
	dev_t i_dev;
	int (*strategy) __P((void *));
	int s;
	struct vop_strategy_args vop_strategy_a;

#ifdef LFS_CANNOT_ROLLFW
	/*
	 * If we can write one superblock while another is in
	 * progress, we risk not having a complete checkpoint if we crash.
	 * So, block here if a superblock write is in progress.
	 */
	s = splbio();
	while (fs->lfs_sbactive) {
		tsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0);
	}
	fs->lfs_sbactive = daddr;
	splx(s);
#endif
	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/* Set timestamp of this version of the superblock */
	fs->lfs_tstamp = time.tv_sec;

	/* Checksum the superblock and copy it into a buffer. */
	fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs));
	bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr, LFS_SBPAD);
	*(struct dlfs *)bp->b_data = fs->lfs_dlfs;

	bp->b_dev = i_dev;
	bp->b_flags |= B_BUSY | B_CALL | B_ASYNC;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
	bp->b_iodone = lfs_supercallback;
	/* XXX KS - same nasty hack as above */
	bp->b_saveaddr = (caddr_t)fs;

	vop_strategy_a.a_desc = VDESC(vop_strategy);
	vop_strategy_a.a_bp = bp;
	s = splbio();
	++bp->b_vp->v_numoutput;
	splx(s);
	(strategy)(&vop_strategy_a);
}

/*
 * Logical block number match routines used when traversing the dirty block
 * chain.
 */
int
lfs_match_fake(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	return (bp->b_flags & B_CALL);
}

int
lfs_match_data(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	return (bp->b_lblkno >= 0);
}

int
lfs_match_indir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
}

int
lfs_match_dindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
}

int
lfs_match_tindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
}

/*
 * XXX - The only buffers that are going to hit these functions are the
 * segment write blocks, or the segment summaries, or the superblocks.
 *
 * All of the above are created by lfs_newbuf, and so do not need to be
 * released via brelse.
 */
void
lfs_callback(bp)
	struct buf *bp;
{
	struct lfs *fs;
#ifdef LFS_TRACK_IOS
	int j;
#endif

	fs = (struct lfs *)bp->b_saveaddr;
#ifdef DIAGNOSTIC
	if (fs->lfs_iocount == 0)
		panic("lfs_callback: zero iocount\n");
#endif
	if (--fs->lfs_iocount < LFS_THROTTLE)
		wakeup(&fs->lfs_iocount);
#ifdef LFS_TRACK_IOS
	for (j = 0; j < LFS_THROTTLE; j++) {
		if (fs->lfs_pending[j] == bp->b_blkno) {
			fs->lfs_pending[j] = LFS_UNUSED_DADDR;
			wakeup(&(fs->lfs_pending[j]));
			break;
		}
	}
#endif /* LFS_TRACK_IOS */

	lfs_freebuf(bp);
}

void
lfs_supercallback(bp)
	struct buf *bp;
{
#ifdef LFS_CANNOT_ROLLFW
	struct lfs *fs;

	fs = (struct lfs *)bp->b_saveaddr;
	fs->lfs_sbactive = 0;
	wakeup(&fs->lfs_sbactive);
#endif
	lfs_freebuf(bp);
}

/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
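/*
 * For example, with 32-bit block numbers a metadata lbn of -1 becomes
 * 0xffffffff when taken as unsigned, and so compares greater than any
 * non-negative data-block lbn.
 */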
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we cast the array
 * of logical block numbers to an unsigned in this routine so that the
 * negative block numbers (meta data blocks) sort AFTER the data blocks.
 */

void
lfs_shellsort(bp_array, lb_array, nmemb)
	struct buf **bp_array;
	ufs_daddr_t *lb_array;
	register int nmemb;
{
	static int __rsshell_increments[] = { 4, 1, 0 };
	register int incr, *incrp, t1, t2;
	struct buf *bp_temp;
	u_long lb_temp;

	for (incrp = __rsshell_increments; (incr = *incrp++) != 0;)
		for (t1 = incr; t1 < nmemb; ++t1)
			for (t2 = t1 - incr; t2 >= 0;)
				if ((u_long)lb_array[t2] > (u_long)lb_array[t2 + incr]) {
					lb_temp = lb_array[t2];
					lb_array[t2] = lb_array[t2 + incr];
					lb_array[t2 + incr] = lb_temp;
					bp_temp = bp_array[t2];
					bp_array[t2] = bp_array[t2 + incr];
					bp_array[t2 + incr] = bp_temp;
					t2 -= incr;
				} else
					break;
}

/*
 * Check VXLOCK.  Return 1 if the vnode is locked.  Otherwise, vget it.
 */
int
lfs_vref(vp)
	register struct vnode *vp;
{
	/*
	 * If we return 1 here during a flush, we risk vinvalbuf() not
	 * being able to flush all of the pages from this vnode, which
	 * will cause it to panic.  So, return 0 if a flush is in progress.
	 */
	if (vp->v_flag & VXLOCK) {
		if (IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
			return 0;
		}
		return (1);
	}
	return (vget(vp, 0));
}

/*
 * This is vrele except that we do not want to VOP_INACTIVE this vnode.  We
 * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end.
 */
void
lfs_vunref(vp)
	register struct vnode *vp;
{
	/*
	 * Analogous to lfs_vref, if the node is flushing, fake it.
	 */
	if ((vp->v_flag & VXLOCK) && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
		return;
	}

	simple_lock(&vp->v_interlock);
#ifdef DIAGNOSTIC
	if (vp->v_usecount <= 0) {
		printf("lfs_vunref: flags are 0x%lx\n", vp->v_flag);
		printf("lfs_vunref: usecount = %ld\n", vp->v_usecount);
		panic("lfs_vunref: v_usecount<0");
	}
#endif
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (VOP_ISLOCKED(vp))
		panic("lfs_vunref: vnode locked");
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * We use this when we have vnodes that were loaded in solely for cleaning.
 * There is no reason to believe that these vnodes will be referenced again
 * soon, since the cleaning process is unrelated to normal filesystem
 * activity.  Putting cleaned vnodes at the tail of the list has the effect
 * of flushing the vnode LRU.  So, put vnodes that were loaded only for
 * cleaning at the head of the list, instead.
 */
void
lfs_vunref_head(vp)
	register struct vnode *vp;
{
	simple_lock(&vp->v_interlock);
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		panic("lfs_vunref_head: v_usecount == 0");
	}
#endif
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (VOP_ISLOCKED(vp))
		panic("lfs_vunref_head: vnode locked");
#endif
	/*
	 * Insert at head of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}