/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)lfs_segment.c	7.28 (Berkeley) 08/01/92
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

/* In-memory description of a segment about to be written. */
struct segment {
	struct buf	**bpp;		/* pointer to buffer array */
	struct buf	**cbpp;		/* pointer to next available bp */
	struct buf	*ibp;		/* buffer pointer to inode page */
	struct finfo	*fip;		/* current fileinfo pointer */
	void	*segsum;		/* segment summary info */
	u_long	ninodes;		/* number of inodes in this segment */
	u_long	seg_bytes_left;		/* bytes left in segment */
	u_long	sum_bytes_left;		/* bytes left in summary block */
	u_long	seg_number;		/* number of this segment */
#define	SEGM_CKP	0x01		/* doing a checkpoint */
	u_long	seg_flags;		/* run-time flags for this segment */
};
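
/*
 * A rough sketch of the partial-segment layout this structure tracks
 * (as implied by lfs_initseg() and lfs_writeinode() below): the summary
 * block begins with a SEGSUM header, FINFO entries grow up from just
 * after the header, and the disk addresses of inode blocks grow down
 * from the end of the same block.  The file data and inode blocks
 * described there follow the summary block on disk.
 */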

/*
 * Determine if it's OK to start a partial segment here, or if we need
 * to go on to a new segment.
 */
#define	LFS_PARTIAL_FITS(fs) \
	((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
	1 << (fs)->lfs_fsbtodb)
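/*
 * In other words, the disk blocks still unwritten in the current segment
 * ((fs)->lfs_dbpseg less the blocks already consumed) must exceed one
 * file-system block (1 << lfs_fsbtodb disk blocks) for a new partial
 * segment to be worth starting here.
 */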

void	 lfs_callback __P((struct buf *));
void	 lfs_gather __P((struct lfs *, struct segment *,
	     struct vnode *, int (*) __P((struct lfs *, struct buf *))));
void	 lfs_initseg __P((struct lfs *, struct segment *));
void	 lfs_iset __P((struct inode *, daddr_t, time_t));
int	 lfs_match_data __P((struct lfs *, struct buf *));
int	 lfs_match_dindir __P((struct lfs *, struct buf *));
int	 lfs_match_indir __P((struct lfs *, struct buf *));
int	 lfs_match_tindir __P((struct lfs *, struct buf *));
struct buf *
	 lfs_newbuf __P((struct lfs *, daddr_t, size_t));
void	 lfs_newseg __P((struct lfs *));
void	 lfs_shellsort __P((struct buf **, daddr_t *, register int));
void	 lfs_updatemeta __P((struct lfs *,
	    struct segment *, struct vnode *, daddr_t *, struct buf **, int));
void	 lfs_writefile __P((struct lfs *, struct segment *, struct vnode *));
int	 lfs_writeinode __P((struct lfs *, struct segment *, struct inode *));
int	 lfs_writeseg __P((struct lfs *, struct segment *));
void	 lfs_writesuper __P((struct lfs *, struct segment *));
void	 lfs_writevnodes __P((struct lfs *fs, struct mount *mp,
	    struct segment *sp, int dirops));

int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */

/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST be
 * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
 * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
 * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
 */

int
lfs_vflush(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	int error, s;

	fs = VFSTOUFS(vp->v_mount)->um_lfs;
	lfs_seglock(fs);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK);
	sp->bpp = malloc(((LFS_SUMMARY_SIZE - sizeof(SEGSUM)) /
	    sizeof(daddr_t) + 1) * sizeof(struct buf *), M_SEGMENT, M_WAITOK);
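	/*
	 * (The sizing above: a summary block can describe at most
	 * (LFS_SUMMARY_SIZE - sizeof(SEGSUM)) / sizeof(daddr_t) blocks;
	 * the extra slot holds the summary buffer itself.)
	 */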
	sp->seg_flags = SEGM_CKP;

	/*
	 * Keep a cumulative count of the outstanding I/O operations.  If the
	 * disk drive catches up with us it could go to zero before we finish,
	 * so we artificially increment it by one until we've scheduled all of
	 * the writes we intend to do.
	 */
	s = splbio();
	++fs->lfs_iocount;
	splx(s);

	ip = VTOI(vp);
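	/*
	 * Write the file and its inode until the inode write stops dirtying
	 * new blocks; if writing the segment redirtied the ifile
	 * (lfs_writeseg returns nonzero) and this vnode is the ifile, go
	 * around again so those blocks reach the disk as well.
	 */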
	do {
		lfs_initseg(fs, sp);
		do {
			if (vp->v_dirtyblkhd != NULL)
				lfs_writefile(fs, sp, vp);
		} while (lfs_writeinode(fs, sp, ip));
		ip->i_flag &= ~(IMOD | IACC | IUPD | ICHG);

	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	s = splbio();
	if (--fs->lfs_iocount && (error =
	    tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs vflush", 0))) {
		splx(s);
		lfs_segunlock(fs);
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		return (error);
	}
	splx(s);
	lfs_segunlock(fs);

	/*
	 * XXX
	 * Should we be writing a checkpoint here?
	 */
	free(sp->bpp, M_SEGMENT);
	free(sp, M_SEGMENT);

	return (0);
}

void
lfs_writevnodes(fs, mp, sp, dirops)
	struct lfs *fs;
	struct mount *mp;
	struct segment *sp;
	int dirops;
{
	struct inode *ip;
	struct vnode *vp;
	int error, s;

loop:	for (vp = mp->mnt_mounth; vp; vp = vp->v_mountf) {
		/*
		 * If the vnode that we are about to sync is no longer
		 * associated with this mount point, start over.
		 */
		if (vp->v_mount != mp)
			goto loop;

		if ((dirops && !(vp->v_flag & VDIROP)) ||
		    (!dirops && (vp->v_flag & VDIROP)))
			continue;
		/*
		 * XXX
		 * Up the ref count so we don't get tossed out of
		 * memory.
		 */
		VREF(vp);

		/*
		 * Write the inode/file if dirty and it's not the
		 * IFILE.
		 */
		ip = VTOI(vp);
		if ((ip->i_flag & (IMOD | IACC | IUPD | ICHG) ||
		    vp->v_dirtyblkhd != NULL) &&
		    ip->i_number != LFS_IFILE_INUM) {
			if (vp->v_dirtyblkhd != NULL)
				lfs_writefile(fs, sp, vp);
			(void) lfs_writeinode(fs, sp, ip);
			ip->i_flag &= ~(IMOD | IACC | IUPD | ICHG);
		}
		vp->v_flag &= ~VDIROP;
		vrele(vp);
	}
}

int
lfs_segwrite(mp, do_ckp)
	struct mount *mp;
	int do_ckp;			/* Do a checkpoint. */
{
	struct buf *bp;
	struct inode *ip;
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	SEGUSE *segusep;
	daddr_t ibno;
	int error, i, s;

	fs = VFSTOUFS(mp)->um_lfs;
	lfs_seglock(fs);

	/*
	 * Allocate a segment structure and enough space to hold pointers to
	 * the maximum possible number of buffers which can be described in a
	 * single summary block.
	 */
	sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK);
	sp->bpp = malloc(((LFS_SUMMARY_SIZE - sizeof(SEGSUM)) /
	    sizeof(daddr_t) + 1) * sizeof(struct buf *), M_SEGMENT, M_WAITOK);
	sp->seg_flags = do_ckp ? SEGM_CKP : 0;
	lfs_initseg(fs, sp);

	/*
	 * Keep a cumulative count of the outstanding I/O operations.  If the
	 * disk drive catches up with us it could go to zero before we finish,
	 * so we artificially increment it by one until we've scheduled all of
	 * the writes we intend to do.  If not a checkpoint, we never do the
	 * final decrement, avoiding the wakeup in the callback routine.
	 */
	s = splbio();
	++fs->lfs_iocount;
	splx(s);

	lfs_writevnodes(fs, mp, sp, 0);
	fs->lfs_writer = 1;
	if (fs->lfs_dirops && (error =
	    tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) {
		fs->lfs_writer = 0;
		lfs_segunlock(fs);
		free(sp->bpp, M_SEGMENT);
		free(sp, M_SEGMENT);
		return (error);
	}

	lfs_writevnodes(fs, mp, sp, 1);

	/*
	 * If we are doing a checkpoint, mark everything since the
	 * last checkpoint as no longer ACTIVE.
	 */
	if (do_ckp)
		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
		     --ibno >= fs->lfs_cleansz; ) {
			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize,
			    NOCRED, &bp))
				panic("lfs: ifile read");
			segusep = (SEGUSE *)bp->b_un.b_addr;
			for (i = fs->lfs_sepb; i--; segusep++)
				segusep->su_flags &= ~SEGUSE_ACTIVE;

			LFS_UBWRITE(bp);
		}

	if (do_ckp || fs->lfs_doifile) {
		vp = fs->lfs_ivnode;
		while (vget(vp));
		ip = VTOI(vp);
		if (vp->v_dirtyblkhd != NULL)
			lfs_writefile(fs, sp, vp);
		(void)lfs_writeinode(fs, sp, ip);
		ip->i_flag &= ~(IMOD | IACC | IUPD | ICHG);
		vput(vp);
		/*
		 * This should never happen because we just guaranteed
		 * that all the segment usage table blocks are dirty, so
		 * no new ones should get written.
		 */
		if (lfs_writeseg(fs, sp) && do_ckp)
			panic("lfs_segwrite: created dirty blocks on ckp");
	} else
		(void) lfs_writeseg(fs, sp);

	/*
	 * If the I/O count is non-zero, sleep until it reaches zero.  At the
	 * moment, the user's process hangs around so we can sleep.
	 */
	fs->lfs_writer = 0;
	fs->lfs_doifile = 0;
	wakeup(&fs->lfs_dirops);

	s = splbio();
	--fs->lfs_iocount;
	if (do_ckp) {
		if (fs->lfs_iocount && (error =
		    tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs sync", 0))) {
			splx(s);
			lfs_segunlock(fs);
			free(sp->bpp, M_SEGMENT);
			free(sp, M_SEGMENT);
			return (error);
		}
		splx(s);
		lfs_writesuper(fs, sp);
	} else
		splx(s);

	lfs_segunlock(fs);

	free(sp->bpp, M_SEGMENT);
	free(sp, M_SEGMENT);

	return (0);
}

/*
 * Write the dirty blocks associated with a vnode.
 */
void
lfs_writefile(fs, sp, vp)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
{
	struct buf *bp;
	struct finfo *fip;
	IFILE *ifp;

	if (sp->seg_bytes_left < fs->lfs_bsize ||
	    sp->sum_bytes_left < sizeof(struct finfo)) {
		(void) lfs_writeseg(fs, sp);
		lfs_initseg(fs, sp);
	}
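	/*
	 * Charge the summary for a FINFO header less one block pointer:
	 * struct finfo already contains one fi_blocks entry, and each
	 * block pointer is charged separately as blocks are gathered.
	 * The charge is refunded below if no blocks end up being written.
	 */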
	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(daddr_t);

	fip = sp->fip;
	fip->fi_nblocks = 0;
	fip->fi_ino = VTOI(vp)->i_number;
	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
	fip->fi_version = ifp->if_version;
	brelse(bp);

	/*
	 * It may not be necessary to write the meta-data blocks at this point,
	 * as the roll-forward recovery code should be able to reconstruct the
	 * list.
	 */
	lfs_gather(fs, sp, vp, lfs_match_data);
	lfs_gather(fs, sp, vp, lfs_match_indir);
	lfs_gather(fs, sp, vp, lfs_match_dindir);
#ifdef TRIPLE
	lfs_gather(fs, sp, vp, lfs_match_tindir);
#endif

	fip = sp->fip;
#ifdef META
	printf("lfs_writefile: adding %d blocks\n", fip->fi_nblocks);
#endif
	if (fip->fi_nblocks != 0) {
		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
		sp->fip =
		    (struct finfo *)((caddr_t)fip + sizeof(struct finfo) +
		    sizeof(daddr_t) * (fip->fi_nblocks - 1));
	} else
		sp->sum_bytes_left += sizeof(struct finfo) - sizeof(daddr_t);
}

int
lfs_writeinode(fs, sp, ip)
	struct lfs *fs;
	struct segment *sp;
	struct inode *ip;
{
	struct buf *bp, *ibp;
	IFILE *ifp;
	SEGUSE *sup;
	daddr_t daddr;
	ino_t ino;
	int ndx;
	int redo_ifile = 0;

	/* Allocate a new inode block if necessary. */
	if (sp->ibp == NULL) {
		/* Allocate a new segment if necessary. */
		if (sp->seg_bytes_left < fs->lfs_bsize ||
		    sp->sum_bytes_left < sizeof(daddr_t)) {
			(void) lfs_writeseg(fs, sp);
			lfs_initseg(fs, sp);
		}

		/* Get next inode block. */
		daddr = fs->lfs_offset;
		fs->lfs_offset += fsbtodb(fs, 1);
		sp->ibp = *sp->cbpp++ =
		    lfs_newbuf(fs, daddr, fs->lfs_bsize);

		/* Set remaining space counters. */
		sp->seg_bytes_left -= fs->lfs_bsize;
		sp->sum_bytes_left -= sizeof(daddr_t);
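		/*
		 * Record the inode block's disk address in the table that
		 * grows down from the end of the summary block; the first
		 * inode block's address occupies the very last daddr_t slot.
		 */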
		ndx = LFS_SUMMARY_SIZE / sizeof(daddr_t) -
		    sp->ninodes / INOPB(fs) - 1;
		((daddr_t *)(sp->segsum))[ndx] = daddr;
	}

	/* Update the inode times and copy the inode onto the inode page. */
	ITIMES(ip, &time, &time);
	bp = sp->ibp;
	bp->b_un.b_dino[sp->ninodes % INOPB(fs)] = ip->i_din;

	/* Increment inode count in segment summary block. */
	++((SEGSUM *)(sp->segsum))->ss_ninos;

	/* If this page is full, set flag to allocate a new page. */
	if (++sp->ninodes % INOPB(fs) == 0)
		sp->ibp = NULL;

	/*
	 * If updating the ifile, update the super-block.  Update the disk
	 * address and access times for this inode in the ifile.
	 */
	ino = ip->i_number;
	if (ino == LFS_IFILE_INUM) {
		daddr = fs->lfs_idaddr;
		fs->lfs_idaddr = bp->b_blkno;
	} else {
		LFS_IENTRY(ifp, fs, ino, ibp);
		daddr = ifp->if_daddr;
		ifp->if_daddr = bp->b_blkno;
		LFS_UBWRITE(ibp);
	}

	/*
	 * No need to update segment usage if there was no former inode address
	 * or if the last inode address is in the current partial segment.
	 */
	if (daddr != LFS_UNUSED_DADDR &&
	    !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) {
		LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
		if (sup->su_nbytes < sizeof(struct dinode)) {
			/* XXX -- Change to a panic. */
			printf("lfs: negative bytes (segment %d)\n",
			    datosn(fs, daddr));
			panic("negative bytes");
		}
#endif
		sup->su_nbytes -= sizeof(struct dinode);
		LFS_UBWRITE(bp);
		redo_ifile =
		    (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
	}
	return (redo_ifile);
}

void
lfs_gather(fs, sp, vp, match)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	int (*match) __P((struct lfs *, struct buf *));
{
	struct buf **bpp, *bp;
	struct buf *lastbp;
	struct finfo *fip;
	struct inode *ip;
	daddr_t *lbp, *start_lbp;
	u_long version;
	int s;

	ip = VTOI(vp);
	bpp = sp->cbpp;
	fip = sp->fip;
	start_lbp = lbp = &fip->fi_blocks[fip->fi_nblocks];

loop:	s = splbio();
	lastbp = NULL;
	for (bp = vp->v_dirtyblkhd; bp; lastbp = bp, bp = bp->b_blockf) {
		if (bp->b_flags & B_BUSY || !match(fs, bp) ||
		    bp->b_flags & B_GATHERED)
			continue;
#ifdef DIAGNOSTIC
		if (!(bp->b_flags & B_DELWRI))
			panic("lfs_gather: bp not B_DELWRI");
		if (!(bp->b_flags & B_LOCKED))
			panic("lfs_gather: bp not B_LOCKED");
#endif
		/*
		 * If full, finish this segment.  We may be doing I/O, so
		 * release and reacquire the splbio().
		 */
		if (sp->sum_bytes_left < sizeof(daddr_t) ||
		    sp->seg_bytes_left < fs->lfs_bsize) {
			splx(s);
			lfs_updatemeta(fs,
			    sp, vp, start_lbp, bpp, lbp - start_lbp);

			/* Add the current file to the segment summary. */
			++((SEGSUM *)(sp->segsum))->ss_nfinfo;

			version = fip->fi_version;
			(void) lfs_writeseg(fs, sp);
			lfs_initseg(fs, sp);

			fip = sp->fip;
			fip->fi_version = version;
			fip->fi_ino = ip->i_number;
			start_lbp = lbp = fip->fi_blocks;

			sp->sum_bytes_left -=
			    sizeof(struct finfo) - sizeof(daddr_t);

			bpp = sp->cbpp;
			goto loop;
		}

		/* Insert into the buffer list, update the FINFO block. */
		bp->b_flags |= B_GATHERED;
		*sp->cbpp++ = bp;
		++fip->fi_nblocks;
		*lbp++ = bp->b_lblkno;

		sp->sum_bytes_left -= sizeof(daddr_t);
		sp->seg_bytes_left -= bp->b_bufsize;
	}
	splx(s);
	lfs_updatemeta(fs, sp, vp, start_lbp, bpp, lbp - start_lbp);
}

/*
 * Update the metadata that points to the blocks listed in the FINFO
 * array.
 */
void
lfs_updatemeta(fs, sp, vp, lbp, bpp, nblocks)
	struct lfs *fs;
	struct segment *sp;
	struct vnode *vp;
	daddr_t *lbp;
	struct buf **bpp;
	int nblocks;
{
	SEGUSE *sup;
	struct buf *bp;
	INDIR a[NIADDR], *ap;
	struct inode *ip;
	daddr_t daddr, lbn, off;
	int db_per_fsb, error, i, num;

	if (nblocks == 0)
		return;

	/* Sort the blocks. */
	lfs_shellsort(bpp, lbp, nblocks);

	/*
	 * Assign disk addresses, and update references to the logical
	 * block and the segment usage information.
	 */
	db_per_fsb = fsbtodb(fs, 1);
	for (i = nblocks; i--; ++bpp) {
		lbn = *lbp++;
		(*bpp)->b_blkno = off = fs->lfs_offset;
		fs->lfs_offset += db_per_fsb;

		if (error = lfs_bmaparray(vp, lbn, &daddr, a, &num))
			panic("lfs_updatemeta: lfs_bmaparray %d", error);
		ip = VTOI(vp);
		switch (num) {
		case 0:
			ip->i_db[lbn] = off;
			break;
		case 1:
			ip->i_ib[a[0].in_off] = off;
			break;
		default:
			ap = &a[num - 1];
			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
				panic("lfs_updatemeta: bread bno %d",
				    ap->in_lbn);
			/*
			 * Bread may create a new indirect block which needs
			 * to get counted for the inode.
			 */
			if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) {
				ip->i_blocks += btodb(fs->lfs_bsize);
				fs->lfs_bfree -= btodb(fs->lfs_bsize);
			}
			bp->b_un.b_daddr[ap->in_off] = off;
			VOP_BWRITE(bp);
		}

		/* Update segment usage information. */
		if (daddr != UNASSIGNED) {
			LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp);
#ifdef DIAGNOSTIC
			if (sup->su_nbytes < fs->lfs_bsize) {
				/* XXX -- Change to a panic. */
				printf("lfs: negative bytes (segment %d)\n",
				    datosn(fs, daddr));
				panic("negative bytes");
			}
#endif
			sup->su_nbytes -= fs->lfs_bsize;
			LFS_UBWRITE(bp);
		}
	}
}

/*
 * Start a new segment.
 */
void
lfs_initseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	SEGUSE *sup;
	SEGSUM *ssp;
	struct buf *bp;
	daddr_t lbn, *lbnp;

	/* Advance to the next segment. */
	if (!LFS_PARTIAL_FITS(fs)) {
		/* Wake up any cleaning procs waiting on this file system. */
		wakeup(&fs->lfs_nextseg);
		wakeup(&lfs_allclean_wakeup);

		lfs_newseg(fs);
		fs->lfs_offset = fs->lfs_curseg;
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE;

		/*
		 * If the segment contains a superblock, update the offset
		 * and summary address to skip over it.
		 */
		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
			fs->lfs_offset += LFS_SBPAD / DEV_BSIZE;
			sp->seg_bytes_left -= LFS_SBPAD;
		}
		brelse(bp);
	} else {
		sp->seg_number = datosn(fs, fs->lfs_curseg);
		sp->seg_bytes_left = (fs->lfs_dbpseg -
		    (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE;
	}
	fs->lfs_lastpseg = fs->lfs_offset;

	sp->ibp = NULL;
	sp->ninodes = 0;

	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
	sp->cbpp = sp->bpp;
	*sp->cbpp = lfs_newbuf(fs, fs->lfs_offset, LFS_SUMMARY_SIZE);
	sp->segsum = (*sp->cbpp)->b_un.b_addr;
	++sp->cbpp;
	fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE;

	/* Set pointer to SEGSUM, initialize it. */
	ssp = sp->segsum;
	ssp->ss_next = fs->lfs_nextseg;
	ssp->ss_nfinfo = ssp->ss_ninos = 0;

	/* Set pointer to first FINFO, initialize it. */
	sp->fip = (struct finfo *)((caddr_t)sp->segsum + sizeof(SEGSUM));
	sp->fip->fi_nblocks = 0;

	sp->seg_bytes_left -= LFS_SUMMARY_SIZE;
	sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM);
}

/*
 * Return the next segment to write.
 */
void
lfs_newseg(fs)
	struct lfs *fs;
{
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	int curseg, isdirty, sn;

	LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp);
	sup->su_flags |= SEGUSE_DIRTY;
	LFS_UBWRITE(bp);

	LFS_CLEANERINFO(cip, fs, bp);
	--cip->clean;
	++cip->dirty;
	LFS_UBWRITE(bp);

	fs->lfs_lastseg = fs->lfs_curseg;
	fs->lfs_curseg = fs->lfs_nextseg;
	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
		sn = (sn + 1) % fs->lfs_nseg;
		if (sn == curseg)
			panic("lfs_nextseg: no clean segments");
		LFS_SEGENTRY(sup, fs, sn, bp);
		isdirty = sup->su_flags & SEGUSE_DIRTY;
		brelse(bp);
		if (!isdirty)
			break;
	}

	fs->lfs_nextseg = sntoda(fs, sn);
}

int
lfs_writeseg(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	struct buf **bpp, *bp, *cbp;
	SEGUSE *sup;
	SEGSUM *ssp;
	dev_t i_dev;
	size_t size;
	u_long *datap, *dp;
	int ch_per_blk, do_again, i, nblocks, num, s;
	int (*strategy)__P((struct vop_strategy_args *));
	struct vop_strategy_args vop_strategy_a;
	u_short ninos;
	char *p;

	/* Checkpoint always writes superblock, even if no data blocks. */
	if ((nblocks = sp->cbpp - sp->bpp) == 0 && !(sp->seg_flags & SEGM_CKP))
		return (0);

	/*
	 * Compute checksum across data and then across summary; the first
	 * block (the summary block) is skipped.  Set the create time here
	 * so that it's guaranteed to be later than the inode mod times.
	 *
	 * XXX
	 * Fix this to do it inline, instead of malloc/copy.
	 */
	datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK);
	for (bpp = sp->bpp, i = nblocks - 1; i--;)
		*dp++ = (*++bpp)->b_un.b_words[0];
	ssp = (SEGSUM *)sp->segsum;
	ssp->ss_create = time.tv_sec;
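	/*
	 * The data checksum covers the first word of each non-summary
	 * block; the summary checksum then covers the summary block from
	 * ss_datasum through its end, so ss_sumsum itself is excluded
	 * (assuming ss_sumsum immediately precedes ss_datasum in SEGSUM).
	 */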
	ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long));
	ssp->ss_sumsum =
	    cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum));
	free(datap, M_SEGMENT);

	/* Update the segment usage information. */
	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
	sup->su_nbytes += (nblocks - 1 - ninos) << fs->lfs_bshift;
	sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode);
	sup->su_nbytes += LFS_SUMMARY_SIZE;
	sup->su_lastmod = time.tv_sec;
	sup->su_flags |= SEGUSE_ACTIVE;
	sup->su_ninos += ninos;
	++sup->su_nsums;
	LFS_UBWRITE(bp);
	fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE);
	do_again = !(bp->b_flags & B_GATHERED);

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/*
	 * When we simply write the blocks we lose a rotation for every block
	 * written.  To avoid this problem, we allocate memory in chunks, copy
	 * the buffers into the chunk and write the chunk.  56K was chosen as
	 * some drivers/controllers can't handle unsigned 16 bit transfers.
	 * When the data is copied to the chunk, turn off the B_LOCKED bit
	 * and brelse the buffer (which will move them to the LRU list).  Add
	 * the B_CALL flag to the buffer header so we can count I/O's for the
	 * checkpoints and so we can release the allocated memory.
	 *
	 * XXX
	 * This should be removed if the new virtual memory system allows us to
	 * easily make the buffers contiguous in kernel memory and if that's
	 * fast enough.
	 */
#define	LFS_CHUNKSIZE	(56 * 1024)
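	/*
	 * For example, with an 8K file-system block size ch_per_blk is
	 * 56K / 8K = 7, so each pass below copies up to seven blocks into
	 * one chunk and issues them as a single strategy call.
	 */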
	ch_per_blk = LFS_CHUNKSIZE / fs->lfs_bsize;
	for (bpp = sp->bpp, i = nblocks; i;) {
		num = ch_per_blk;
		if (num > i)
			num = i;
		i -= num;
		size = num * fs->lfs_bsize;

		cbp = lfs_newbuf(fs, (*bpp)->b_blkno, 0);
		cbp->b_dev = i_dev;
		cbp->b_flags = B_ASYNC | B_BUSY | B_CALL;
		cbp->b_iodone = lfs_callback;
		cbp->b_saveaddr = cbp->b_un.b_addr;
		cbp->b_un.b_addr = malloc(size, M_SEGMENT, M_WAITOK);

		s = splbio();
		++fs->lfs_iocount;
		for (p = cbp->b_un.b_addr; num--;) {
			bp = *bpp++;
			bcopy(bp->b_un.b_addr, p, bp->b_bcount);
			p += bp->b_bcount;
			bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI |
			     B_LOCKED | B_GATHERED);
			if (!(bp->b_flags & (B_NOCACHE | B_INVAL))) {
				bremfree(bp);
				reassignbuf(bp, bp->b_vp);
			}
			brelse(bp);
		}
		splx(s);
		cbp->b_bcount = p - cbp->b_un.b_addr;
		vop_strategy_a.a_desc = VDESC(vop_strategy);
		vop_strategy_a.a_bp = cbp;
		(strategy)(&vop_strategy_a);
	}
	return (do_again);
}

void
lfs_writesuper(fs, sp)
	struct lfs *fs;
	struct segment *sp;
{
	struct buf *bp;
	dev_t i_dev;
	int (*strategy) __P((struct vop_strategy_args *));
	struct vop_strategy_args vop_strategy_a;

	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];

	/* Checksum the superblock and copy it into a buffer. */
	fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum));
	bp = lfs_newbuf(fs, fs->lfs_sboffs[0], LFS_SBPAD);
	*bp->b_un.b_lfs = *fs;

	/* Write the first superblock (wait). */
	bp->b_dev = i_dev;
	bp->b_flags |= B_BUSY;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
	vop_strategy_a.a_desc = VDESC(vop_strategy);
	vop_strategy_a.a_bp = bp;
	(strategy)(&vop_strategy_a);
	biowait(bp);

	/* Write the second superblock (don't wait). */
	bp->b_blkno = bp->b_lblkno = fs->lfs_sboffs[1];
	bp->b_flags |= B_ASYNC | B_BUSY;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
	(strategy)(&vop_strategy_a);
}

/*
 * Logical block number match routines used when traversing the dirty block
 * chain.
 */
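/*
 * Indirect blocks are hung on the vnode with negative logical block
 * numbers, so (-lbn - NDADDR) % NINDIR(fs) identifies the level:
 * 0 for single, 1 for double, and 2 for triple indirect blocks.
 */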
int
lfs_match_data(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	return (bp->b_lblkno >= 0);
}

int
lfs_match_indir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
}

int
lfs_match_dindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
}

int
lfs_match_tindir(fs, bp)
	struct lfs *fs;
	struct buf *bp;
{
	int lbn;

	lbn = bp->b_lblkno;
	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
}

/*
 * Allocate a new buffer header.
 */
struct buf *
lfs_newbuf(fs, daddr, size)
	struct lfs *fs;
	daddr_t daddr;
	size_t size;
{
	struct buf *bp;

	bp = getnewbuf();
	bremhash(bp);
	bgetvp(fs->lfs_ivnode, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = daddr;
	bp->b_blkno = daddr;
	bp->b_error = 0;
	bp->b_resid = 0;
	if (size)
		allocbuf(bp, size);
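	/*
	 * Mark the buffer B_NOCACHE and hash it onto the AGE free list so
	 * the header is recycled promptly once the write completes rather
	 * than lingering in the cache.
	 */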
	bp->b_flags |= B_NOCACHE;
	bp->b_saveaddr = NULL;
	binshash(bp, &bfreelist[BQ_AGE]);
	return (bp);
}

void
lfs_callback(bp)
	struct buf *bp;
{
	struct lfs *fs;

	fs = VFSTOUFS(bp->b_vp->v_mount)->um_lfs;
#ifdef DIAGNOSTIC
	if (fs->lfs_iocount == 0)
		panic("lfs_callback: zero iocount");
#endif
	if (--fs->lfs_iocount == 0)
		wakeup(&fs->lfs_iocount);

	if (bp->b_saveaddr) {
		free(bp->b_un.b_addr, M_SEGMENT);
		bp->b_un.b_addr = bp->b_saveaddr;
		bp->b_saveaddr = NULL;
	}
	brelse(bp);
}

/*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
 * see also Knuth Vol. 3, page 84.  The increments are selected from
 * formula (8), page 95.  Roughly O(N^3/2).
 */
/*
 * This is our own private copy of shellsort because we want to sort
 * two parallel arrays (the array of buffer pointers and the array of
 * logical block numbers) simultaneously.  Note that we cast the array
 * of logical block numbers to an unsigned in this routine so that the
 * negative block numbers (meta data blocks) sort AFTER the data blocks.
 */
void
lfs_shellsort(bp_array, lb_array, nmemb)
	struct buf **bp_array;
	daddr_t *lb_array;
	register int nmemb;
{
	static int __rsshell_increments[] = { 4, 1, 0 };
	register int incr, *incrp, t1, t2;
	struct buf *bp_temp;
	u_long lb_temp;

	for (incrp = __rsshell_increments; incr = *incrp++;)
		for (t1 = incr; t1 < nmemb; ++t1)
			for (t2 = t1 - incr; t2 >= 0;)
				if ((u_long)lb_array[t2] >
				    (u_long)lb_array[t2 + incr]) {
					lb_temp = lb_array[t2];
					lb_array[t2] = lb_array[t2 + incr];
					lb_array[t2 + incr] = lb_temp;
					bp_temp = bp_array[t2];
					bp_array[t2] = bp_array[t2 + incr];
					bp_array[t2 + incr] = bp_temp;
					t2 -= incr;
				} else
					break;
}