1 /* $NetBSD: lfs_segment.c,v 1.14 1998/11/09 01:18:35 mycroft Exp $ */ 2 3 /* 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)lfs_segment.c 8.10 (Berkeley) 6/10/95 36 */ 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/namei.h> 41 #include <sys/kernel.h> 42 #include <sys/resourcevar.h> 43 #include <sys/file.h> 44 #include <sys/stat.h> 45 #include <sys/buf.h> 46 #include <sys/proc.h> 47 #include <sys/conf.h> 48 #include <sys/vnode.h> 49 #include <sys/malloc.h> 50 #include <sys/mount.h> 51 52 #include <miscfs/specfs/specdev.h> 53 #include <miscfs/fifofs/fifo.h> 54 55 #include <ufs/ufs/quota.h> 56 #include <ufs/ufs/inode.h> 57 #include <ufs/ufs/dir.h> 58 #include <ufs/ufs/ufsmount.h> 59 #include <ufs/ufs/ufs_extern.h> 60 61 #include <ufs/lfs/lfs.h> 62 #include <ufs/lfs/lfs_extern.h> 63 64 extern int count_lock_queue __P((void)); 65 extern struct simplelock vnode_free_list_slock; /* XXX */ 66 extern TAILQ_HEAD(freelst, vnode) vnode_free_list; /* XXX */ 67 68 #define MAX_ACTIVE 10 69 /* 70 * Determine if it's OK to start a partial in this segment, or if we need 71 * to go on to a new segment. 72 */ 73 #define LFS_PARTIAL_FITS(fs) \ 74 ((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ 75 1 << (fs)->lfs_fsbtodb) 76 77 void lfs_callback __P((struct buf *)); 78 void lfs_gather __P((struct lfs *, struct segment *, 79 struct vnode *, int (*) __P((struct lfs *, struct buf *)))); 80 int lfs_gatherblock __P((struct segment *, struct buf *, int *)); 81 void lfs_iset __P((struct inode *, ufs_daddr_t, time_t)); 82 int lfs_match_data __P((struct lfs *, struct buf *)); 83 int lfs_match_dindir __P((struct lfs *, struct buf *)); 84 int lfs_match_indir __P((struct lfs *, struct buf *)); 85 int lfs_match_tindir __P((struct lfs *, struct buf *)); 86 void lfs_newseg __P((struct lfs *)); 87 void lfs_shellsort __P((struct buf **, ufs_daddr_t *, register int)); 88 void lfs_supercallback __P((struct buf *)); 89 void lfs_updatemeta __P((struct segment *)); 90 int lfs_vref __P((struct vnode *)); 91 void lfs_vunref __P((struct vnode *)); 92 void lfs_writefile __P((struct lfs *, struct segment *, struct vnode *)); 93 int lfs_writeinode __P((struct lfs *, struct segment *, struct inode *)); 94 int lfs_writeseg __P((struct lfs *, struct segment *)); 95 void lfs_writesuper __P((struct lfs *)); 96 void lfs_writevnodes __P((struct lfs *fs, struct mount *mp, 97 struct segment *sp, int dirops)); 98 99 int lfs_allclean_wakeup; /* Cleaner wakeup address. */ 100 101 /* Statistics Counters */ 102 #define DOSTATS 103 struct lfs_stats lfs_stats; 104 105 /* op values to lfs_writevnodes */ 106 #define VN_REG 0 107 #define VN_DIROP 1 108 #define VN_EMPTY 2 109 110 /* 111 * Ifile and meta data blocks are not marked busy, so segment writes MUST be 112 * single threaded. Currently, there are two paths into lfs_segwrite, sync() 113 * and getnewbuf(). They both mark the file system busy. Lfs_vflush() 114 * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 115 */ 116 117 int 118 lfs_vflush(vp) 119 struct vnode *vp; 120 { 121 struct inode *ip; 122 struct lfs *fs; 123 struct segment *sp; 124 125 fs = VFSTOUFS(vp->v_mount)->um_lfs; 126 if (fs->lfs_nactive > MAX_ACTIVE) 127 return(lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP)); 128 lfs_seglock(fs, SEGM_SYNC); 129 sp = fs->lfs_sp; 130 131 132 ip = VTOI(vp); 133 if (vp->v_dirtyblkhd.lh_first == NULL) 134 lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); 135 136 do { 137 do { 138 if (vp->v_dirtyblkhd.lh_first != NULL) 139 lfs_writefile(fs, sp, vp); 140 } while (lfs_writeinode(fs, sp, ip)); 141 142 } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); 143 144 #ifdef DOSTATS 145 ++lfs_stats.nwrites; 146 if (sp->seg_flags & SEGM_SYNC) 147 ++lfs_stats.nsync_writes; 148 if (sp->seg_flags & SEGM_CKP) 149 ++lfs_stats.ncheckpoints; 150 #endif 151 lfs_segunlock(fs); 152 return (0); 153 } 154 155 void 156 lfs_writevnodes(fs, mp, sp, op) 157 struct lfs *fs; 158 struct mount *mp; 159 struct segment *sp; 160 int op; 161 { 162 struct inode *ip; 163 struct vnode *vp; 164 165 /* BEGIN HACK */ 166 #define VN_OFFSET (((caddr_t)&vp->v_mntvnodes.le_next) - (caddr_t)vp) 167 #define BACK_VP(VP) ((struct vnode *)(((caddr_t)VP->v_mntvnodes.le_prev) - VN_OFFSET)) 168 #define BEG_OF_VLIST ((struct vnode *)(((caddr_t)&mp->mnt_vnodelist.lh_first) - VN_OFFSET)) 169 170 /* Find last vnode. */ 171 loop: for (vp = mp->mnt_vnodelist.lh_first; 172 vp && vp->v_mntvnodes.le_next != NULL; 173 vp = vp->v_mntvnodes.le_next); 174 for (; vp && vp != BEG_OF_VLIST; vp = BACK_VP(vp)) { 175 /* END HACK */ 176 /* 177 loop: 178 for (vp = mp->mnt_vnodelist.lh_first; 179 vp != NULL; 180 vp = vp->v_mntvnodes.le_next) { 181 */ 182 /* 183 * If the vnode that we are about to sync is no longer 184 * associated with this mount point, start over. 185 */ 186 if (vp->v_mount != mp) 187 goto loop; 188 189 /* XXX ignore dirops for now 190 if (op == VN_DIROP && !(vp->v_flag & VDIROP) || 191 op != VN_DIROP && (vp->v_flag & VDIROP)) 192 continue; 193 */ 194 195 if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first) 196 continue; 197 198 if (vp->v_type == VNON) 199 continue; 200 201 if (lfs_vref(vp)) 202 continue; 203 204 /* 205 * Write the inode/file if dirty and it's not the 206 * the IFILE. 207 */ 208 ip = VTOI(vp); 209 if ((ip->i_flag & 210 (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE) || 211 vp->v_dirtyblkhd.lh_first != NULL) && 212 ip->i_number != LFS_IFILE_INUM) { 213 if (vp->v_dirtyblkhd.lh_first != NULL) 214 lfs_writefile(fs, sp, vp); 215 (void) lfs_writeinode(fs, sp, ip); 216 } 217 vp->v_flag &= ~VDIROP; 218 lfs_vunref(vp); 219 } 220 } 221 222 int 223 lfs_segwrite(mp, flags) 224 struct mount *mp; 225 int flags; /* Do a checkpoint. */ 226 { 227 struct buf *bp; 228 struct inode *ip; 229 struct lfs *fs; 230 struct segment *sp; 231 struct vnode *vp; 232 SEGUSE *segusep; 233 ufs_daddr_t ibno; 234 CLEANERINFO *cip; 235 int clean, do_ckp, error, i; 236 237 fs = VFSTOUFS(mp)->um_lfs; 238 239 /* 240 * If we have fewer than 2 clean segments, wait until cleaner 241 * writes. 242 */ 243 do { 244 LFS_CLEANERINFO(cip, fs, bp); 245 clean = cip->clean; 246 brelse(bp); 247 if (clean <= 2 || fs->lfs_avail <= 0) { 248 /* printf ("segs clean: %d\n", clean); */ 249 wakeup(&lfs_allclean_wakeup); 250 wakeup(&fs->lfs_nextseg); 251 error = tsleep(&fs->lfs_avail, PRIBIO + 1, 252 "lfs writer", 0); 253 if (error) 254 return (error); 255 } 256 } while (clean <= 2 || fs->lfs_avail <= 0); 257 258 /* 259 * Allocate a segment structure and enough space to hold pointers to 260 * the maximum possible number of buffers which can be described in a 261 * single summary block. 262 */ 263 do_ckp = flags & SEGM_CKP || fs->lfs_nactive > MAX_ACTIVE; 264 lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0)); 265 sp = fs->lfs_sp; 266 267 lfs_writevnodes(fs, mp, sp, VN_REG); 268 269 /* XXX ignore ordering of dirops for now */ 270 /* XXX 271 fs->lfs_writer = 1; 272 if (fs->lfs_dirops && (error = 273 tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) { 274 free(sp->bpp, M_SEGMENT); 275 free(sp, M_SEGMENT); 276 fs->lfs_writer = 0; 277 return (error); 278 } 279 280 lfs_writevnodes(fs, mp, sp, VN_DIROP); 281 */ 282 283 /* 284 * If we are doing a checkpoint, mark everything since the 285 * last checkpoint as no longer ACTIVE. 286 */ 287 if (do_ckp) 288 for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz; 289 --ibno >= fs->lfs_cleansz; ) { 290 if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize, 291 NOCRED, &bp)) 292 293 panic("lfs: ifile read"); 294 segusep = (SEGUSE *)bp->b_data; 295 for (i = fs->lfs_sepb; i--; segusep++) 296 segusep->su_flags &= ~SEGUSE_ACTIVE; 297 298 error = VOP_BWRITE(bp); 299 } 300 301 if (do_ckp || fs->lfs_doifile) { 302 redo: 303 vp = fs->lfs_ivnode; 304 while (vget(vp, LK_EXCLUSIVE)) 305 continue; 306 ip = VTOI(vp); 307 if (vp->v_dirtyblkhd.lh_first != NULL) 308 lfs_writefile(fs, sp, vp); 309 (void)lfs_writeinode(fs, sp, ip); 310 vput(vp); 311 if (lfs_writeseg(fs, sp) && do_ckp) 312 goto redo; 313 } else 314 (void) lfs_writeseg(fs, sp); 315 316 /* 317 * If the I/O count is non-zero, sleep until it reaches zero. At the 318 * moment, the user's process hangs around so we can sleep. 319 */ 320 /* XXX ignore dirops for now 321 fs->lfs_writer = 0; 322 fs->lfs_doifile = 0; 323 wakeup(&fs->lfs_dirops); 324 */ 325 326 #ifdef DOSTATS 327 ++lfs_stats.nwrites; 328 if (sp->seg_flags & SEGM_SYNC) 329 ++lfs_stats.nsync_writes; 330 if (sp->seg_flags & SEGM_CKP) 331 ++lfs_stats.ncheckpoints; 332 #endif 333 lfs_segunlock(fs); 334 return (0); 335 } 336 337 /* 338 * Write the dirty blocks associated with a vnode. 339 */ 340 void 341 lfs_writefile(fs, sp, vp) 342 struct lfs *fs; 343 struct segment *sp; 344 struct vnode *vp; 345 { 346 struct buf *bp; 347 struct finfo *fip; 348 IFILE *ifp; 349 350 if (sp->seg_bytes_left < fs->lfs_bsize || 351 sp->sum_bytes_left < sizeof(struct finfo)) 352 (void) lfs_writeseg(fs, sp); 353 354 sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(ufs_daddr_t); 355 ++((SEGSUM *)(sp->segsum))->ss_nfinfo; 356 357 fip = sp->fip; 358 fip->fi_nblocks = 0; 359 fip->fi_ino = VTOI(vp)->i_number; 360 LFS_IENTRY(ifp, fs, fip->fi_ino, bp); 361 fip->fi_version = ifp->if_version; 362 brelse(bp); 363 364 /* 365 * It may not be necessary to write the meta-data blocks at this point, 366 * as the roll-forward recovery code should be able to reconstruct the 367 * list. 368 */ 369 lfs_gather(fs, sp, vp, lfs_match_data); 370 lfs_gather(fs, sp, vp, lfs_match_indir); 371 lfs_gather(fs, sp, vp, lfs_match_dindir); 372 #ifdef TRIPLE 373 lfs_gather(fs, sp, vp, lfs_match_tindir); 374 #endif 375 376 fip = sp->fip; 377 if (fip->fi_nblocks != 0) { 378 sp->fip = 379 (struct finfo *)((caddr_t)fip + sizeof(struct finfo) + 380 sizeof(ufs_daddr_t) * (fip->fi_nblocks - 1)); 381 sp->start_lbp = &sp->fip->fi_blocks[0]; 382 } else { 383 sp->sum_bytes_left += sizeof(struct finfo) - sizeof(ufs_daddr_t); 384 --((SEGSUM *)(sp->segsum))->ss_nfinfo; 385 } 386 } 387 388 int 389 lfs_writeinode(fs, sp, ip) 390 struct lfs *fs; 391 struct segment *sp; 392 struct inode *ip; 393 { 394 struct buf *bp, *ibp; 395 IFILE *ifp; 396 SEGUSE *sup; 397 ufs_daddr_t daddr; 398 ino_t ino; 399 int error, i, ndx; 400 int redo_ifile = 0; 401 struct timespec ts; 402 403 if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE))) 404 return(0); 405 406 /* Allocate a new inode block if necessary. */ 407 if (sp->ibp == NULL) { 408 /* Allocate a new segment if necessary. */ 409 if (sp->seg_bytes_left < fs->lfs_bsize || 410 sp->sum_bytes_left < sizeof(ufs_daddr_t)) 411 (void) lfs_writeseg(fs, sp); 412 413 /* Get next inode block. */ 414 daddr = fs->lfs_offset; 415 fs->lfs_offset += fsbtodb(fs, 1); 416 sp->ibp = *sp->cbpp++ = 417 lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr, 418 fs->lfs_bsize); 419 /* Zero out inode numbers */ 420 for (i = 0; i < INOPB(fs); ++i) 421 ((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0; 422 ++sp->start_bpp; 423 fs->lfs_avail -= fsbtodb(fs, 1); 424 /* Set remaining space counters. */ 425 sp->seg_bytes_left -= fs->lfs_bsize; 426 sp->sum_bytes_left -= sizeof(ufs_daddr_t); 427 ndx = LFS_SUMMARY_SIZE / sizeof(ufs_daddr_t) - 428 sp->ninodes / INOPB(fs) - 1; 429 ((ufs_daddr_t *)(sp->segsum))[ndx] = daddr; 430 } 431 432 /* Update the inode times and copy the inode onto the inode page. */ 433 if (ip->i_flag & IN_MODIFIED) 434 --fs->lfs_uinodes; 435 TIMEVAL_TO_TIMESPEC(&time, &ts); 436 FFS_ITIMES(ip, &ts, &ts, &ts); 437 ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); 438 bp = sp->ibp; 439 ((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] = ip->i_din.ffs_din; 440 /* Increment inode count in segment summary block. */ 441 ++((SEGSUM *)(sp->segsum))->ss_ninos; 442 443 /* If this page is full, set flag to allocate a new page. */ 444 if (++sp->ninodes % INOPB(fs) == 0) 445 sp->ibp = NULL; 446 447 /* 448 * If updating the ifile, update the super-block. Update the disk 449 * address and access times for this inode in the ifile. 450 */ 451 ino = ip->i_number; 452 if (ino == LFS_IFILE_INUM) { 453 daddr = fs->lfs_idaddr; 454 fs->lfs_idaddr = bp->b_blkno; 455 } else { 456 LFS_IENTRY(ifp, fs, ino, ibp); 457 daddr = ifp->if_daddr; 458 ifp->if_daddr = bp->b_blkno; 459 error = VOP_BWRITE(ibp); 460 } 461 462 /* 463 * No need to update segment usage if there was no former inode address 464 * or if the last inode address is in the current partial segment. 465 */ 466 if (daddr != LFS_UNUSED_DADDR && 467 !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) { 468 LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp); 469 #ifdef DIAGNOSTIC 470 if (sup->su_nbytes < DINODE_SIZE) { 471 /* XXX -- Change to a panic. */ 472 printf("lfs: negative bytes (segment %d)\n", 473 datosn(fs, daddr)); 474 panic("negative bytes"); 475 } 476 #endif 477 sup->su_nbytes -= DINODE_SIZE; 478 redo_ifile = 479 (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); 480 error = VOP_BWRITE(bp); 481 } 482 return (redo_ifile); 483 } 484 485 int 486 lfs_gatherblock(sp, bp, sptr) 487 struct segment *sp; 488 struct buf *bp; 489 int *sptr; 490 { 491 struct lfs *fs; 492 int version; 493 494 /* 495 * If full, finish this segment. We may be doing I/O, so 496 * release and reacquire the splbio(). 497 */ 498 #ifdef DIAGNOSTIC 499 if (sp->vp == NULL) 500 panic ("lfs_gatherblock: Null vp in segment"); 501 #endif 502 fs = sp->fs; 503 if (sp->sum_bytes_left < sizeof(ufs_daddr_t) || 504 sp->seg_bytes_left < bp->b_bcount) { 505 if (sptr) 506 splx(*sptr); 507 lfs_updatemeta(sp); 508 509 version = sp->fip->fi_version; 510 (void) lfs_writeseg(fs, sp); 511 512 sp->fip->fi_version = version; 513 sp->fip->fi_ino = VTOI(sp->vp)->i_number; 514 /* Add the current file to the segment summary. */ 515 ++((SEGSUM *)(sp->segsum))->ss_nfinfo; 516 sp->sum_bytes_left -= 517 sizeof(struct finfo) - sizeof(ufs_daddr_t); 518 519 if (sptr) 520 *sptr = splbio(); 521 return(1); 522 } 523 524 /* Insert into the buffer list, update the FINFO block. */ 525 bp->b_flags |= B_GATHERED; 526 *sp->cbpp++ = bp; 527 sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno; 528 529 sp->sum_bytes_left -= sizeof(ufs_daddr_t); 530 sp->seg_bytes_left -= bp->b_bcount; 531 return(0); 532 } 533 534 void 535 lfs_gather(fs, sp, vp, match) 536 struct lfs *fs; 537 struct segment *sp; 538 struct vnode *vp; 539 int (*match) __P((struct lfs *, struct buf *)); 540 { 541 struct buf *bp; 542 int s; 543 544 sp->vp = vp; 545 s = splbio(); 546 /* This is a hack to see if ordering the blocks in LFS makes a difference. */ 547 /* BEGIN HACK */ 548 #define BUF_OFFSET (((caddr_t)&bp->b_vnbufs.le_next) - (caddr_t)bp) 549 #define BACK_BUF(BP) ((struct buf *)(((caddr_t)BP->b_vnbufs.le_prev) - BUF_OFFSET)) 550 #define BEG_OF_LIST ((struct buf *)(((caddr_t)&vp->v_dirtyblkhd.lh_first) - BUF_OFFSET)) 551 552 553 /*loop: for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {*/ 554 /* Find last buffer. */ 555 loop: for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL; 556 bp = bp->b_vnbufs.le_next); 557 for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) { 558 /* END HACK */ 559 if (bp->b_flags & B_BUSY || !match(fs, bp) || 560 bp->b_flags & B_GATHERED) 561 continue; 562 #ifdef DIAGNOSTIC 563 if (!(bp->b_flags & B_DELWRI)) 564 panic("lfs_gather: bp not B_DELWRI"); 565 if (!(bp->b_flags & B_LOCKED)) 566 panic("lfs_gather: bp not B_LOCKED"); 567 #endif 568 if (lfs_gatherblock(sp, bp, &s)) 569 goto loop; 570 } 571 splx(s); 572 lfs_updatemeta(sp); 573 sp->vp = NULL; 574 } 575 576 577 /* 578 * Update the metadata that points to the blocks listed in the FINFO 579 * array. 580 */ 581 void 582 lfs_updatemeta(sp) 583 struct segment *sp; 584 { 585 SEGUSE *sup; 586 struct buf *bp; 587 struct lfs *fs; 588 struct vnode *vp; 589 struct indir a[NIADDR + 2], *ap; 590 struct inode *ip; 591 ufs_daddr_t daddr, lbn, off; 592 int error, i, nblocks, num; 593 594 vp = sp->vp; 595 nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; 596 if (nblocks < 0) 597 panic("This is a bad thing\n"); 598 if (vp == NULL || nblocks == 0) 599 return; 600 601 /* Sort the blocks. */ 602 if (!(sp->seg_flags & SEGM_CLEAN)) 603 lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks); 604 605 /* 606 * Record the length of the last block in case it's a fragment. 607 * If there are indirect blocks present, they sort last. An 608 * indirect block will be lfs_bsize and its presence indicates 609 * that you cannot have fragments. 610 */ 611 sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount; 612 613 /* 614 * Assign disk addresses, and update references to the logical 615 * block and the segment usage information. 616 */ 617 fs = sp->fs; 618 for (i = nblocks; i--; ++sp->start_bpp) { 619 lbn = *sp->start_lbp++; 620 (*sp->start_bpp)->b_blkno = off = fs->lfs_offset; 621 fs->lfs_offset += 622 fragstodb(fs, numfrags(fs, (*sp->start_bpp)->b_bcount)); 623 624 error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL); 625 if (error) 626 panic("lfs_updatemeta: ufs_bmaparray %d", error); 627 ip = VTOI(vp); 628 switch (num) { 629 case 0: 630 ip->i_ffs_db[lbn] = off; 631 break; 632 case 1: 633 ip->i_ffs_ib[a[0].in_off] = off; 634 break; 635 default: 636 ap = &a[num - 1]; 637 if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp)) 638 panic("lfs_updatemeta: bread bno %d", 639 ap->in_lbn); 640 /* 641 * Bread may create a new indirect block which needs 642 * to get counted for the inode. 643 */ 644 if (bp->b_blkno == -1 && 645 !(bp->b_flags & (B_DELWRI | B_DONE))) { 646 ip->i_ffs_blocks += fsbtodb(fs, 1); 647 fs->lfs_bfree -= fragstodb(fs, fs->lfs_frag); 648 } 649 ((ufs_daddr_t *)bp->b_data)[ap->in_off] = off; 650 VOP_BWRITE(bp); 651 } 652 653 /* Update segment usage information. */ 654 if (daddr != UNASSIGNED && 655 !(daddr >= fs->lfs_lastpseg && daddr <= off)) { 656 LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp); 657 #ifdef DIAGNOSTIC 658 if (sup->su_nbytes < (*sp->start_bpp)->b_bcount) { 659 /* XXX -- Change to a panic. */ 660 printf("lfs: negative bytes (segment %d)\n", 661 datosn(fs, daddr)); 662 printf("lfs: bp = 0x%p, addr = 0x%p\n", 663 bp, bp->b_un.b_addr); 664 panic ("Negative Bytes"); 665 } 666 #endif 667 sup->su_nbytes -= (*sp->start_bpp)->b_bcount; 668 error = VOP_BWRITE(bp); 669 } 670 } 671 } 672 673 /* 674 * Start a new segment. 675 */ 676 int 677 lfs_initseg(fs) 678 struct lfs *fs; 679 { 680 struct segment *sp; 681 SEGUSE *sup; 682 SEGSUM *ssp; 683 struct buf *bp; 684 int repeat; 685 686 sp = fs->lfs_sp; 687 688 repeat = 0; 689 /* Advance to the next segment. */ 690 if (!LFS_PARTIAL_FITS(fs)) { 691 /* Wake up any cleaning procs waiting on this file system. */ 692 wakeup(&lfs_allclean_wakeup); 693 wakeup(&fs->lfs_nextseg); 694 695 lfs_newseg(fs); 696 repeat = 1; 697 fs->lfs_offset = fs->lfs_curseg; 698 sp->seg_number = datosn(fs, fs->lfs_curseg); 699 sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE; 700 701 /* 702 * If the segment contains a superblock, update the offset 703 * and summary address to skip over it. 704 */ 705 LFS_SEGENTRY(sup, fs, sp->seg_number, bp); 706 if (sup->su_flags & SEGUSE_SUPERBLOCK) { 707 fs->lfs_offset += LFS_SBPAD / DEV_BSIZE; 708 sp->seg_bytes_left -= LFS_SBPAD; 709 } 710 brelse(bp); 711 } else { 712 sp->seg_number = datosn(fs, fs->lfs_curseg); 713 sp->seg_bytes_left = (fs->lfs_dbpseg - 714 (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE; 715 } 716 fs->lfs_lastpseg = fs->lfs_offset; 717 718 sp->fs = fs; 719 sp->ibp = NULL; 720 sp->ninodes = 0; 721 722 /* Get a new buffer for SEGSUM and enter it into the buffer list. */ 723 sp->cbpp = sp->bpp; 724 *sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset, 725 LFS_SUMMARY_SIZE); 726 sp->segsum = (*sp->cbpp)->b_data; 727 bzero(sp->segsum, LFS_SUMMARY_SIZE); 728 sp->start_bpp = ++sp->cbpp; 729 fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE; 730 731 /* Set point to SEGSUM, initialize it. */ 732 ssp = sp->segsum; 733 ssp->ss_next = fs->lfs_nextseg; 734 ssp->ss_nfinfo = ssp->ss_ninos = 0; 735 ssp->ss_magic = SS_MAGIC; 736 737 /* Set pointer to first FINFO, initialize it. */ 738 sp->fip = (struct finfo *)((caddr_t)sp->segsum + sizeof(SEGSUM)); 739 sp->fip->fi_nblocks = 0; 740 sp->start_lbp = &sp->fip->fi_blocks[0]; 741 sp->fip->fi_lastlength = 0; 742 743 sp->seg_bytes_left -= LFS_SUMMARY_SIZE; 744 sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM); 745 746 return(repeat); 747 } 748 749 /* 750 * Return the next segment to write. 751 */ 752 void 753 lfs_newseg(fs) 754 struct lfs *fs; 755 { 756 CLEANERINFO *cip; 757 SEGUSE *sup; 758 struct buf *bp; 759 int curseg, isdirty, sn; 760 761 LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp); 762 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; 763 sup->su_nbytes = 0; 764 sup->su_nsums = 0; 765 sup->su_ninos = 0; 766 (void) VOP_BWRITE(bp); 767 768 LFS_CLEANERINFO(cip, fs, bp); 769 --cip->clean; 770 ++cip->dirty; 771 (void) VOP_BWRITE(bp); 772 773 fs->lfs_lastseg = fs->lfs_curseg; 774 fs->lfs_curseg = fs->lfs_nextseg; 775 for (sn = curseg = datosn(fs, fs->lfs_curseg);;) { 776 sn = (sn + 1) % fs->lfs_nseg; 777 if (sn == curseg) 778 panic("lfs_nextseg: no clean segments"); 779 LFS_SEGENTRY(sup, fs, sn, bp); 780 isdirty = sup->su_flags & SEGUSE_DIRTY; 781 brelse(bp); 782 if (!isdirty) 783 break; 784 } 785 786 ++fs->lfs_nactive; 787 fs->lfs_nextseg = sntoda(fs, sn); 788 #ifdef DOSTATS 789 ++lfs_stats.segsused; 790 #endif 791 } 792 793 int 794 lfs_writeseg(fs, sp) 795 struct lfs *fs; 796 struct segment *sp; 797 { 798 extern int locked_queue_count; 799 struct buf **bpp, *bp, *cbp; 800 SEGUSE *sup; 801 SEGSUM *ssp; 802 dev_t i_dev; 803 u_long *datap, *dp; 804 int do_again, i, nblocks, s; 805 int (*strategy)__P((void *)); 806 struct vop_strategy_args vop_strategy_a; 807 u_short ninos; 808 char *p; 809 810 /* 811 * If there are no buffers other than the segment summary to write 812 * and it is not a checkpoint, don't do anything. On a checkpoint, 813 * even if there aren't any buffers, you need to write the superblock. 814 */ 815 if ((nblocks = sp->cbpp - sp->bpp) == 1) 816 return (0); 817 818 /* Update the segment usage information. */ 819 LFS_SEGENTRY(sup, fs, sp->seg_number, bp); 820 821 /* Loop through all blocks, except the segment summary. */ 822 for (bpp = sp->bpp; ++bpp < sp->cbpp; ) 823 sup->su_nbytes += (*bpp)->b_bcount; 824 825 ssp = (SEGSUM *)sp->segsum; 826 827 ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs); 828 sup->su_nbytes += ssp->ss_ninos * DINODE_SIZE; 829 sup->su_nbytes += LFS_SUMMARY_SIZE; 830 sup->su_lastmod = time.tv_sec; 831 sup->su_ninos += ninos; 832 ++sup->su_nsums; 833 do_again = !(bp->b_flags & B_GATHERED); 834 (void)VOP_BWRITE(bp); 835 /* 836 * Compute checksum across data and then across summary; the first 837 * block (the summary block) is skipped. Set the create time here 838 * so that it's guaranteed to be later than the inode mod times. 839 * 840 * XXX 841 * Fix this to do it inline, instead of malloc/copy. 842 */ 843 datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK); 844 for (bpp = sp->bpp, i = nblocks - 1; i--;) { 845 if ((*++bpp)->b_flags & B_INVAL) { 846 if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long))) 847 panic("lfs_writeseg: copyin failed"); 848 } else 849 *dp++ = ((u_long *)(*bpp)->b_data)[0]; 850 } 851 ssp->ss_create = time.tv_sec; 852 ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long)); 853 ssp->ss_sumsum = 854 cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum)); 855 free(datap, M_SEGMENT); 856 #ifdef DIAGNOSTIC 857 if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE) 858 panic("lfs_writeseg: No diskspace for summary"); 859 #endif 860 fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE); 861 862 i_dev = VTOI(fs->lfs_ivnode)->i_dev; 863 strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)]; 864 865 /* 866 * When we simply write the blocks we lose a rotation for every block 867 * written. To avoid this problem, we allocate memory in chunks, copy 868 * the buffers into the chunk and write the chunk. MAXPHYS is the 869 * largest size I/O devices can handle. 870 * When the data is copied to the chunk, turn off the the B_LOCKED bit 871 * and brelse the buffer (which will move them to the LRU list). Add 872 * the B_CALL flag to the buffer header so we can count I/O's for the 873 * checkpoints and so we can release the allocated memory. 874 * 875 * XXX 876 * This should be removed if the new virtual memory system allows us to 877 * easily make the buffers contiguous in kernel memory and if that's 878 * fast enough. 879 */ 880 for (bpp = sp->bpp, i = nblocks; i;) { 881 cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, 882 (*bpp)->b_blkno, MAXPHYS); 883 cbp->b_dev = i_dev; 884 cbp->b_flags |= B_ASYNC | B_BUSY; 885 cbp->b_bcount = 0; 886 887 s = splbio(); 888 ++fs->lfs_iocount; 889 for (p = cbp->b_data; i && cbp->b_bcount < MAXPHYS; i--) { 890 bp = *bpp; 891 if (bp->b_bcount > (MAXPHYS - cbp->b_bcount)) 892 break; 893 bpp++; 894 895 /* 896 * Fake buffers from the cleaner are marked as B_INVAL. 897 * We need to copy the data from user space rather than 898 * from the buffer indicated. 899 * XXX == what do I do on an error? 900 */ 901 if (bp->b_flags & B_INVAL) { 902 if (copyin(bp->b_saveaddr, p, bp->b_bcount)) 903 panic("lfs_writeseg: copyin failed"); 904 } else 905 bcopy(bp->b_data, p, bp->b_bcount); 906 p += bp->b_bcount; 907 cbp->b_bcount += bp->b_bcount; 908 if (bp->b_flags & B_LOCKED) 909 --locked_queue_count; 910 bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI | 911 B_LOCKED | B_GATHERED); 912 if (bp->b_flags & B_CALL) { 913 /* if B_CALL, it was created with newbuf */ 914 brelvp(bp); 915 if (!(bp->b_flags & B_INVAL)) 916 free(bp->b_data, M_SEGMENT); 917 free(bp, M_SEGMENT); 918 } else { 919 bremfree(bp); 920 bp->b_flags |= B_DONE; 921 reassignbuf(bp, bp->b_vp); 922 brelse(bp); 923 } 924 } 925 ++cbp->b_vp->v_numoutput; 926 splx(s); 927 /* 928 * XXXX This is a gross and disgusting hack. Since these 929 * buffers are physically addressed, they hang off the 930 * device vnode (devvp). As a result, they have no way 931 * of getting to the LFS superblock or lfs structure to 932 * keep track of the number of I/O's pending. So, I am 933 * going to stuff the fs into the saveaddr field of 934 * the buffer (yuk). 935 */ 936 cbp->b_saveaddr = (caddr_t)fs; 937 vop_strategy_a.a_desc = VDESC(vop_strategy); 938 vop_strategy_a.a_bp = cbp; 939 (strategy)(&vop_strategy_a); 940 } 941 /* 942 * XXX 943 * Vinvalbuf can move locked buffers off the locked queue 944 * and we have no way of knowing about this. So, after 945 * doing a big write, we recalculate how many bufers are 946 * really still left on the locked queue. 947 */ 948 locked_queue_count = count_lock_queue(); 949 wakeup(&locked_queue_count); 950 #ifdef DOSTATS 951 ++lfs_stats.psegwrites; 952 lfs_stats.blocktot += nblocks - 1; 953 if (fs->lfs_sp->seg_flags & SEGM_SYNC) 954 ++lfs_stats.psyncwrites; 955 if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { 956 ++lfs_stats.pcleanwrites; 957 lfs_stats.cleanblocks += nblocks - 1; 958 } 959 #endif 960 return (lfs_initseg(fs) || do_again); 961 } 962 963 void 964 lfs_writesuper(fs) 965 struct lfs *fs; 966 { 967 struct buf *bp; 968 dev_t i_dev; 969 int (*strategy) __P((void *)); 970 int s; 971 struct vop_strategy_args vop_strategy_a; 972 973 i_dev = VTOI(fs->lfs_ivnode)->i_dev; 974 strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)]; 975 976 /* Checksum the superblock and copy it into a buffer. */ 977 fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs)); 978 bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0], 979 LFS_SBPAD); 980 *(struct dlfs *)bp->b_data = fs->lfs_dlfs; 981 982 /* XXX Toggle between first two superblocks; for now just write first */ 983 bp->b_dev = i_dev; 984 bp->b_flags |= B_BUSY | B_CALL | B_ASYNC; 985 bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI); 986 bp->b_iodone = lfs_supercallback; 987 vop_strategy_a.a_desc = VDESC(vop_strategy); 988 vop_strategy_a.a_bp = bp; 989 s = splbio(); 990 ++bp->b_vp->v_numoutput; 991 splx(s); 992 (strategy)(&vop_strategy_a); 993 } 994 995 /* 996 * Logical block number match routines used when traversing the dirty block 997 * chain. 998 */ 999 int 1000 lfs_match_data(fs, bp) 1001 struct lfs *fs; 1002 struct buf *bp; 1003 { 1004 return (bp->b_lblkno >= 0); 1005 } 1006 1007 int 1008 lfs_match_indir(fs, bp) 1009 struct lfs *fs; 1010 struct buf *bp; 1011 { 1012 int lbn; 1013 1014 lbn = bp->b_lblkno; 1015 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); 1016 } 1017 1018 int 1019 lfs_match_dindir(fs, bp) 1020 struct lfs *fs; 1021 struct buf *bp; 1022 { 1023 int lbn; 1024 1025 lbn = bp->b_lblkno; 1026 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); 1027 } 1028 1029 int 1030 lfs_match_tindir(fs, bp) 1031 struct lfs *fs; 1032 struct buf *bp; 1033 { 1034 int lbn; 1035 1036 lbn = bp->b_lblkno; 1037 return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); 1038 } 1039 1040 /* 1041 * Allocate a new buffer header. 1042 */ 1043 struct buf * 1044 lfs_newbuf(vp, daddr, size) 1045 struct vnode *vp; 1046 ufs_daddr_t daddr; 1047 size_t size; 1048 { 1049 struct buf *bp; 1050 size_t nbytes; 1051 1052 nbytes = roundup(size, DEV_BSIZE); 1053 bp = malloc(sizeof(struct buf), M_SEGMENT, M_WAITOK); 1054 bzero(bp, sizeof(struct buf)); 1055 if (nbytes) 1056 bp->b_data = malloc(nbytes, M_SEGMENT, M_WAITOK); 1057 bgetvp(vp, bp); 1058 bp->b_bufsize = size; 1059 bp->b_bcount = size; 1060 bp->b_lblkno = daddr; 1061 bp->b_blkno = daddr; 1062 bp->b_error = 0; 1063 bp->b_resid = 0; 1064 bp->b_iodone = lfs_callback; 1065 bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE; 1066 return (bp); 1067 } 1068 1069 void 1070 lfs_callback(bp) 1071 struct buf *bp; 1072 { 1073 struct lfs *fs; 1074 1075 fs = (struct lfs *)bp->b_saveaddr; 1076 #ifdef DIAGNOSTIC 1077 if (fs->lfs_iocount == 0) 1078 panic("lfs_callback: zero iocount\n"); 1079 #endif 1080 if (--fs->lfs_iocount == 0) 1081 wakeup(&fs->lfs_iocount); 1082 1083 brelvp(bp); 1084 free(bp->b_data, M_SEGMENT); 1085 free(bp, M_SEGMENT); 1086 } 1087 1088 void 1089 lfs_supercallback(bp) 1090 struct buf *bp; 1091 { 1092 brelvp(bp); 1093 free(bp->b_data, M_SEGMENT); 1094 free(bp, M_SEGMENT); 1095 } 1096 1097 /* 1098 * Shellsort (diminishing increment sort) from Data Structures and 1099 * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290; 1100 * see also Knuth Vol. 3, page 84. The increments are selected from 1101 * formula (8), page 95. Roughly O(N^3/2). 1102 */ 1103 /* 1104 * This is our own private copy of shellsort because we want to sort 1105 * two parallel arrays (the array of buffer pointers and the array of 1106 * logical block numbers) simultaneously. Note that we cast the array 1107 * of logical block numbers to a unsigned in this routine so that the 1108 * negative block numbers (meta data blocks) sort AFTER the data blocks. 1109 */ 1110 void 1111 lfs_shellsort(bp_array, lb_array, nmemb) 1112 struct buf **bp_array; 1113 ufs_daddr_t *lb_array; 1114 register int nmemb; 1115 { 1116 static int __rsshell_increments[] = { 4, 1, 0 }; 1117 register int incr, *incrp, t1, t2; 1118 struct buf *bp_temp; 1119 u_long lb_temp; 1120 1121 for (incrp = __rsshell_increments; (incr = *incrp++) != 0;) 1122 for (t1 = incr; t1 < nmemb; ++t1) 1123 for (t2 = t1 - incr; t2 >= 0;) 1124 if (lb_array[t2] > lb_array[t2 + incr]) { 1125 lb_temp = lb_array[t2]; 1126 lb_array[t2] = lb_array[t2 + incr]; 1127 lb_array[t2 + incr] = lb_temp; 1128 bp_temp = bp_array[t2]; 1129 bp_array[t2] = bp_array[t2 + incr]; 1130 bp_array[t2 + incr] = bp_temp; 1131 t2 -= incr; 1132 } else 1133 break; 1134 } 1135 1136 /* 1137 * Check VXLOCK. Return 1 if the vnode is locked. Otherwise, vget it. 1138 */ 1139 int 1140 lfs_vref(vp) 1141 register struct vnode *vp; 1142 { 1143 if (vp->v_flag & VXLOCK) /* XXX */ 1144 return(1); 1145 return (vget(vp, 0)); 1146 } 1147 1148 /* 1149 * This is vrele except that we do not want to VOP_INACTIVE this vnode. We 1150 * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end. 1151 */ 1152 void 1153 lfs_vunref(vp) 1154 register struct vnode *vp; 1155 { 1156 simple_lock(&vp->v_interlock); 1157 vp->v_usecount--; 1158 if (vp->v_usecount > 0) { 1159 simple_unlock(&vp->v_interlock); 1160 return; 1161 } 1162 /* 1163 * insert at tail of LRU list 1164 */ 1165 simple_lock(&vnode_free_list_slock); 1166 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1167 simple_unlock(&vnode_free_list_slock); 1168 simple_unlock(&vp->v_interlock); 1169 } 1170