xref: /netbsd-src/sys/ufs/lfs/lfs_segment.c (revision 27578b9aac214cc7796ead81dcc5427e79d5f2a0)
1 /*	$NetBSD: lfs_segment.c,v 1.70 2001/07/26 20:20:15 jdolecek Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *      This product includes software developed by the NetBSD
21  *      Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 /*
39  * Copyright (c) 1991, 1993
40  *	The Regents of the University of California.  All rights reserved.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. All advertising materials mentioning features or use of this software
51  *    must display the following acknowledgement:
52  *	This product includes software developed by the University of
53  *	California, Berkeley and its contributors.
54  * 4. Neither the name of the University nor the names of its contributors
55  *    may be used to endorse or promote products derived from this software
56  *    without specific prior written permission.
57  *
58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  *
70  *	@(#)lfs_segment.c	8.10 (Berkeley) 6/10/95
71  */
72 
/*
 * Debug helper: print the inode number behind vnode `vp' followed by the
 * message `str'.
 */
#define ivndebug(vp,str) \
	printf("ino %d: %s\n", VTOI(vp)->i_number, (str))
74 
75 #if defined(_KERNEL_OPT)
76 #include "opt_ddb.h"
77 #endif
78 
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/namei.h>
82 #include <sys/kernel.h>
83 #include <sys/resourcevar.h>
84 #include <sys/file.h>
85 #include <sys/stat.h>
86 #include <sys/buf.h>
87 #include <sys/proc.h>
88 #include <sys/conf.h>
89 #include <sys/vnode.h>
90 #include <sys/malloc.h>
91 #include <sys/mount.h>
92 
93 #include <miscfs/specfs/specdev.h>
94 #include <miscfs/fifofs/fifo.h>
95 
96 #include <ufs/ufs/quota.h>
97 #include <ufs/ufs/inode.h>
98 #include <ufs/ufs/dir.h>
99 #include <ufs/ufs/ufsmount.h>
100 #include <ufs/ufs/ufs_extern.h>
101 
102 #include <ufs/lfs/lfs.h>
103 #include <ufs/lfs/lfs_extern.h>
104 
105 extern int count_lock_queue(void);
106 extern struct simplelock vnode_free_list_slock;		/* XXX */
107 
108 /*
109  * Determine if it's OK to start a partial in this segment, or if we need
110  * to go on to a new segment.
111  */
112 #define	LFS_PARTIAL_FITS(fs) \
113 	((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
114 	fragstofsb((fs), (fs)->lfs_frag))
115 
116 void	 lfs_callback(struct buf *);
117 int	 lfs_gather(struct lfs *, struct segment *,
118 	     struct vnode *, int (*)(struct lfs *, struct buf *));
119 int	 lfs_gatherblock(struct segment *, struct buf *, int *);
120 void	 lfs_iset(struct inode *, ufs_daddr_t, time_t);
121 int	 lfs_match_fake(struct lfs *, struct buf *);
122 int	 lfs_match_data(struct lfs *, struct buf *);
123 int	 lfs_match_dindir(struct lfs *, struct buf *);
124 int	 lfs_match_indir(struct lfs *, struct buf *);
125 int	 lfs_match_tindir(struct lfs *, struct buf *);
126 void	 lfs_newseg(struct lfs *);
127 void	 lfs_shellsort(struct buf **, ufs_daddr_t *, int);
128 void	 lfs_supercallback(struct buf *);
129 void	 lfs_updatemeta(struct segment *);
130 int	 lfs_vref(struct vnode *);
131 void	 lfs_vunref(struct vnode *);
132 void	 lfs_writefile(struct lfs *, struct segment *, struct vnode *);
133 int	 lfs_writeinode(struct lfs *, struct segment *, struct inode *);
134 int	 lfs_writeseg(struct lfs *, struct segment *);
135 void	 lfs_writesuper(struct lfs *, daddr_t);
136 int	 lfs_writevnodes(struct lfs *fs, struct mount *mp,
137 	    struct segment *sp, int dirops);
138 
139 int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */
140 int	lfs_writeindir = 1;             /* whether to flush indir on non-ckp */
141 int	lfs_clean_vnhead = 0;		/* Allow freeing to head of vn list */
142 int	lfs_dirvcount = 0;		/* # active dirops */
143 
144 /* Statistics Counters */
145 int lfs_dostats = 1;
146 struct lfs_stats lfs_stats;
147 
148 extern int locked_queue_count;
149 extern long locked_queue_bytes;
150 
151 /* op values to lfs_writevnodes */
152 #define	VN_REG	        0
153 #define	VN_DIROP	1
154 #define	VN_EMPTY	2
155 #define VN_CLEAN        3
156 
157 #define LFS_MAX_ACTIVE          10
158 
159 /*
160  * XXX KS - Set modification time on the Ifile, so the cleaner can
161  * read the fs mod time off of it.  We don't set IN_UPDATE here,
162  * since we don't really need this to be flushed to disk (and in any
163  * case that wouldn't happen to the Ifile until we checkpoint).
164  */
165 void
166 lfs_imtime(struct lfs *fs)
167 {
168 	struct timespec ts;
169 	struct inode *ip;
170 
171 	TIMEVAL_TO_TIMESPEC(&time, &ts);
172 	ip = VTOI(fs->lfs_ivnode);
173 	ip->i_ffs_mtime = ts.tv_sec;
174 	ip->i_ffs_mtimensec = ts.tv_nsec;
175 }
176 
177 /*
178  * Ifile and meta data blocks are not marked busy, so segment writes MUST be
179  * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
180  * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
181  * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
182  */
183 
/*
 * Bookkeeping for the vnode currently being flushed by lfs_vflush, kept
 * in fs->lfs_flushvp.  Each expansion is fully parenthesized so that it
 * behaves as a single expression wherever it is used.  CLR_FLUSHING
 * accepts `vp' for symmetry but does not use it.
 */
#define SET_FLUSHING(fs,vp) ((fs)->lfs_flushvp = (vp))
#define IS_FLUSHING(fs,vp)  ((fs)->lfs_flushvp == (vp))
#define CLR_FLUSHING(fs,vp) ((fs)->lfs_flushvp = NULL)
187 
/*
 * lfs_vflush: flush a single vnode.
 *
 * Steps, in order:
 *  - If the inode is flagged IN_CLEANING, clear that flag, set
 *    IN_MODIFIED, and free every B_CALL buffer on the dirty list for
 *    which another buffer with the same b_lblkno exists on the list.
 *  - If WRITEINPROG(vp), sleep once on vp.
 *  - Take the segment lock with SEGM_SYNC.
 *  - If i_ffs_mode is 0 (freed inode), discard all dirty buffers, clear
 *    the inode's modification flags, drop the segment lock and return 0
 *    without writing anything.
 *  - If fs->lfs_nactive exceeds LFS_MAX_ACTIVE, hand off to lfs_segwrite()
 *    with SEGM_SYNC|SEGM_CKP and return its result.
 *  - Otherwise write the file's blocks and inode, looping as directed by
 *    the return values of lfs_writeinode() and lfs_writeseg().
 *
 * Returns 0, except in the lfs_nactive case where the lfs_segwrite()
 * result is returned.
 */
188 int
189 lfs_vflush(struct vnode *vp)
190 {
191 	struct inode *ip;
192 	struct lfs *fs;
193 	struct segment *sp;
194 	struct buf *bp, *nbp, *tbp, *tnbp;
195 	int error, s;
196 
197 	ip = VTOI(vp);
198 	fs = VFSTOUFS(vp->v_mount)->um_lfs;
199 
200 	if(ip->i_flag & IN_CLEANING) {
201 #ifdef DEBUG_LFS
202 		ivndebug(vp,"vflush/in_cleaning");
203 #endif
204 		LFS_CLR_UINO(ip, IN_CLEANING);
205 		LFS_SET_UINO(ip, IN_MODIFIED);
206 
207 		/*
208 		 * Toss any cleaning buffers that have real counterparts
209 		 * to avoid losing new data
210 		 */
211 		s = splbio();
212 		for(bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
			/* Save the successor first: bp may be freed below. */
213 			nbp = bp->b_vnbufs.le_next;
214 			if(bp->b_flags & B_CALL) {
215 				for(tbp=vp->v_dirtyblkhd.lh_first; tbp;
216 				    tbp=tnbp)
217 				{
218 					tnbp = tbp->b_vnbufs.le_next;
219 					if(tbp->b_vp == bp->b_vp
220 					   && tbp->b_lblkno == bp->b_lblkno
221 					   && tbp != bp)
222 					{
						/* Give the space back and wake anyone waiting on lfs_avail. */
223 						fs->lfs_avail += btofsb(fs, bp->b_bcount);
224 						wakeup(&fs->lfs_avail);
225 						lfs_freebuf(bp);
226 						bp = NULL;
227 						break;
228 					}
229 				}
230 			}
231 		}
232 		splx(s);
233 	}
234 
235 	/* If the node is being written, wait until that is done */
236 	if(WRITEINPROG(vp)) {
237 #ifdef DEBUG_LFS
238 		ivndebug(vp,"vflush/writeinprog");
239 #endif
240 		tsleep(vp, PRIBIO+1, "lfs_vw", 0);
241 	}
242 
243 	/* Protect against VXLOCK deadlock in vinvalbuf() */
244 	lfs_seglock(fs, SEGM_SYNC);
245 
246 	/* If we're supposed to flush a freed inode, just toss it */
247 	/* XXX - seglock, so these buffers can't be gathered, right? */
248 	if(ip->i_ffs_mode == 0) {
249 		printf("lfs_vflush: ino %d is freed, not flushing\n",
250 			ip->i_number);
251 		s = splbio();
252 		for(bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
253 			nbp = bp->b_vnbufs.le_next;
254 			if (bp->b_flags & B_DELWRI) { /* XXX always true? */
255 				fs->lfs_avail += btofsb(fs, bp->b_bcount);
256 				wakeup(&fs->lfs_avail);
257 			}
258 			/* Copied from lfs_writeseg */
259 			if (bp->b_flags & B_CALL) {
260 				/* if B_CALL, it was created with newbuf */
261 				lfs_freebuf(bp);
262 				bp = NULL;
263 			} else {
264 				bremfree(bp);
265 				LFS_UNLOCK_BUF(bp);
266 				bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
267                                          B_GATHERED);
268 				bp->b_flags |= B_DONE;
269 				reassignbuf(bp, vp);
270 				brelse(bp);
271 			}
272 		}
273 		splx(s);
274 		LFS_CLR_UINO(ip, IN_CLEANING);
275 		LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED);
276 		ip->i_flag &= ~IN_ALLMOD;
277 		printf("lfs_vflush: done not flushing ino %d\n",
278 			ip->i_number);
279 		lfs_segunlock(fs);
280 		return 0;
281 	}
282 
	/*
	 * Record vp in fs->lfs_flushvp; lfs_writefile, lfs_writevnodes and
	 * lfs_segwrite consult it while the flush is in progress.
	 */
283 	SET_FLUSHING(fs,vp);
284 	if (fs->lfs_nactive > LFS_MAX_ACTIVE) {
285 		error = lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP);
286 		CLR_FLUSHING(fs,vp);
287 		lfs_segunlock(fs);
288 		return error;
289 	}
290 	sp = fs->lfs_sp;
291 
292 	if (vp->v_dirtyblkhd.lh_first == NULL) {
293 		lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
294 	} else if((ip->i_flag & IN_CLEANING) &&
295 		  (fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
296 #ifdef DEBUG_LFS
297 		ivndebug(vp,"vflush/clean");
298 #endif
299 		lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
300 	}
301 	else if(lfs_dostats) {
302 		if(vp->v_dirtyblkhd.lh_first || (VTOI(vp)->i_flag & IN_ALLMOD))
303 			++lfs_stats.vflush_invoked;
304 #ifdef DEBUG_LFS
305 		ivndebug(vp,"vflush");
306 #endif
307 	}
308 
309 #ifdef DIAGNOSTIC
310 	/* XXX KS This actually can happen right now, though it shouldn't(?) */
311 	if(vp->v_flag & VDIROP) {
312 		printf("lfs_vflush: flushing VDIROP, this shouldn\'t be\n");
313 		/* panic("VDIROP being flushed...this can\'t happen"); */
314 	}
315 	if(vp->v_usecount<0) {
316 		printf("usecount=%ld\n", (long)vp->v_usecount);
317 		panic("lfs_vflush: usecount<0");
318 	}
319 #endif
320 
	/*
	 * Inner loop: write the file's dirty blocks and then its inode,
	 * repeating while lfs_writeinode() returns nonzero.  Outer loop:
	 * repeat the whole thing while lfs_writeseg() returns nonzero and
	 * the vnode being flushed is the Ifile.
	 */
321 	do {
322 		do {
323 			if (vp->v_dirtyblkhd.lh_first != NULL)
324 				lfs_writefile(fs, sp, vp);
325 		} while (lfs_writeinode(fs, sp, ip));
326 	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
327 
328 	if(lfs_dostats) {
329 		++lfs_stats.nwrites;
330 		if (sp->seg_flags & SEGM_SYNC)
331 			++lfs_stats.nsync_writes;
332 		if (sp->seg_flags & SEGM_CKP)
333 			++lfs_stats.ncheckpoints;
334 	}
335 	lfs_segunlock(fs);
336 
337 	CLR_FLUSHING(fs,vp);
338 	return (0);
339 }
340 
/*
 * vndebug: when DEBUG_LFS_VERBOSE is defined, report why a vnode whose
 * inode is flagged IN_CLEANING is not being written; otherwise a no-op.
 * The verbose form refers to a variable named `op', so the macro may only
 * be used where one is in scope.  Both forms are wrapped in
 * do { } while (0) so that a use always behaves as exactly one statement
 * (safe as the unbraced body of an if/else).
 */
#ifdef DEBUG_LFS_VERBOSE
# define vndebug(vp,str) do {						\
	if (VTOI(vp)->i_flag & IN_CLEANING)				\
		printf("not writing ino %d because %s (op %d)\n",	\
		       VTOI(vp)->i_number, (str), op);			\
} while (0)
#else
# define vndebug(vp,str) do { } while (0)
#endif
346 
/*
 * lfs_writevnodes: walk the vnode list of mount `mp' and write the vnodes
 * selected by `op' into segment `sp'.
 *
 * `op' is one of the VN_* values:
 *   VN_REG    skip vnodes that have VDIROP set
 *   VN_DIROP  consider only vnodes that have VDIROP set
 *   VN_EMPTY  as VN_REG, and additionally skip vnodes with dirty buffers
 *   VN_CLEAN  consider only the Ifile, fs->lfs_flushvp, and inodes flagged
 *             IN_CLEANING (VDIROP is not used as a filter)
 *
 * Also skipped: VNON vnodes, vnodes for which lfs_vref() returns nonzero,
 * and vnodes (other than the Ifile) that are locked by a different process.
 * If a vnode is found that no longer belongs to `mp', the scan restarts.
 *
 * Returns the number of vnodes for which lfs_writeinode() was called.
 */
347 int
348 lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
349 {
350 	struct inode *ip;
351 	struct vnode *vp;
352 	int inodes_written=0, only_cleaning;
353 	int needs_unlock;
354 
355 #ifndef LFS_NO_BACKVP_HACK
356 	/* BEGIN HACK */
	/*
	 * Walk the list back to front: le_prev points at the previous
	 * element's le_next field, so subtracting that field's offset
	 * recovers the previous vnode.  BEG_OF_VLIST is the same
	 * computation applied to the list head, used as the stop marker.
	 */
357 #define	VN_OFFSET	(((caddr_t)&vp->v_mntvnodes.le_next) - (caddr_t)vp)
358 #define	BACK_VP(VP)	((struct vnode *)(((caddr_t)VP->v_mntvnodes.le_prev) - VN_OFFSET))
359 #define	BEG_OF_VLIST	((struct vnode *)(((caddr_t)&mp->mnt_vnodelist.lh_first) - VN_OFFSET))
360 
361 	/* Find last vnode. */
362  loop:	for (vp = mp->mnt_vnodelist.lh_first;
363 	     vp && vp->v_mntvnodes.le_next != NULL;
364 	     vp = vp->v_mntvnodes.le_next);
365 	for (; vp && vp != BEG_OF_VLIST; vp = BACK_VP(vp)) {
366 #else
367 	loop:
368 	for (vp = mp->mnt_vnodelist.lh_first;
369 	     vp != NULL;
370 	     vp = vp->v_mntvnodes.le_next) {
371 #endif
372 		/*
373 		 * If the vnode that we are about to sync is no longer
374 		 * associated with this mount point, start over.
375 		 */
376 		if (vp->v_mount != mp) {
377 			printf("lfs_writevnodes: starting over\n");
378 			goto loop;
379 		}
380 
381 		ip = VTOI(vp);
		/* Filter on VDIROP according to op (see header comment). */
382 		if ((op == VN_DIROP && !(vp->v_flag & VDIROP)) ||
383 		    (op != VN_DIROP && op != VN_CLEAN && (vp->v_flag & VDIROP))) {
384 			vndebug(vp,"dirop");
385 			continue;
386 		}
387 
388 		if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first) {
389 			vndebug(vp,"empty");
390 			continue;
391 		}
392 
393 		if (vp->v_type == VNON) {
394 			continue;
395 		}
396 
397 		if(op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
398 		   && vp != fs->lfs_flushvp
399 		   && !(ip->i_flag & IN_CLEANING)) {
400 			vndebug(vp,"cleaning");
401 			continue;
402 		}
403 
404 		if (lfs_vref(vp)) {
405 			vndebug(vp,"vref");
406 			continue;
407 		}
408 
		/*
		 * Locking: an already-locked vnode is written only if it is
		 * the Ifile or the lock holder is the current process; an
		 * unlocked non-Ifile vnode is locked here and unlocked below.
		 */
409 		needs_unlock = 0;
410 		if (VOP_ISLOCKED(vp)) {
411 			if (vp != fs->lfs_ivnode &&
412 			    vp->v_lock.lk_lockholder != curproc->p_pid) {
413 #ifdef DEBUG_LFS
414 				printf("lfs_writevnodes: not writing ino %d,"
415 				       " locked by pid %d\n",
416 				       VTOI(vp)->i_number,
417 				       vp->v_lock.lk_lockholder);
418 #endif
419 				lfs_vunref(vp);
420 				continue;
421 			}
422 		} else if (vp != fs->lfs_ivnode) {
423 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
424 			needs_unlock = 1;
425 		}
426 
427 		only_cleaning = 0;
428 		/*
429 		 * Write the inode/file if dirty and it's not the IFILE.
430 		 */
431 		if ((ip->i_flag & IN_ALLMOD) ||
432 		     (vp->v_dirtyblkhd.lh_first != NULL))
433 		{
			/* True when IN_CLEANING is the only IN_ALLMOD bit set. */
434 			only_cleaning = ((ip->i_flag & IN_ALLMOD)==IN_CLEANING);
435 
436 			if(ip->i_number != LFS_IFILE_INUM
437 			   && vp->v_dirtyblkhd.lh_first != NULL)
438 			{
439 				lfs_writefile(fs, sp, vp);
440 			}
			/*
			 * Dirty buffers remain: unless a write is in progress,
			 * make sure the inode carries a modification flag.
			 */
441 			if(vp->v_dirtyblkhd.lh_first != NULL) {
442 				if(WRITEINPROG(vp)) {
443 #ifdef DEBUG_LFS
444 					ivndebug(vp,"writevnodes/write2");
445 #endif
446 				} else if(!(ip->i_flag & IN_ALLMOD)) {
447 #ifdef DEBUG_LFS
448 					printf("<%d>",ip->i_number);
449 #endif
450 					LFS_SET_UINO(ip, IN_MODIFIED);
451 				}
452 			}
453 			(void) lfs_writeinode(fs, sp, ip);
454 			inodes_written++;
455 		}
456 
457 		if (needs_unlock)
458 			VOP_UNLOCK(vp, 0);
459 
460 		if (lfs_clean_vnhead && only_cleaning)
461 			lfs_vunref_head(vp);
462 		else
463 			lfs_vunref(vp);
464 	}
465 	return inodes_written;
466 }
467 
468 /*
469  * Do a checkpoint.
470  */
/*
 * lfs_segwrite: write the dirty vnodes of the file system mounted at `mp'
 * to the log.  A checkpoint is performed when `flags' contains SEGM_CKP
 * or when fs->lfs_nactive exceeds LFS_MAX_ACTIVE.
 *
 * Returns EROFS if the file system is read-only, the tsleep() error if
 * the wait for outstanding dirops is interrupted, and 0 otherwise.
 * Results of VOP_BWRITE() are stored in `error' but are not returned.
 */
471 int
472 lfs_segwrite(struct mount *mp, int flags)
473 {
474 	struct buf *bp;
475 	struct inode *ip;
476 	struct lfs *fs;
477 	struct segment *sp;
478 	struct vnode *vp;
479 	SEGUSE *segusep;
480 	ufs_daddr_t ibno;
481 	int do_ckp, did_ckp, error, i;
482 	int writer_set = 0;
483 	int dirty;
484 
485 	fs = VFSTOUFS(mp)->um_lfs;
486 
487 	if (fs->lfs_ronly)
488 		return EROFS;
489 
490 	lfs_imtime(fs);
491 
492 	/* printf("lfs_segwrite: ifile flags are 0x%lx\n",
493 	       (long)(VTOI(fs->lfs_ivnode)->i_flag)); */
494 
495 #if 0
496 	/*
497 	 * If we are not the cleaner, and there is no space available,
498 	 * wait until cleaner writes.
499 	 */
500 	if(!(flags & SEGM_CLEAN) && !(fs->lfs_seglock && fs->lfs_sp &&
501 				      (fs->lfs_sp->seg_flags & SEGM_CLEAN)))
502 	{
503 		while (fs->lfs_avail <= 0) {
504 			LFS_CLEANERINFO(cip, fs, bp);
505 			LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
506 
507 			wakeup(&lfs_allclean_wakeup);
508 			wakeup(&fs->lfs_nextseg);
509 			error = tsleep(&fs->lfs_avail, PRIBIO + 1, "lfs_av2",
510 				       0);
511 			if (error) {
512 				return (error);
513 			}
514 		}
515 	}
516 #endif
517 	/*
518 	 * Allocate a segment structure and enough space to hold pointers to
519 	 * the maximum possible number of buffers which can be described in a
520 	 * single summary block.
521 	 */
522 	do_ckp = (flags & SEGM_CKP) || fs->lfs_nactive > LFS_MAX_ACTIVE;
523 	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
524 	sp = fs->lfs_sp;
525 
526 	/*
527 	 * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
528 	 * in which case we have to flush *all* buffers off of this vnode.
529 	 * We don't care about other nodes, but write any non-dirop nodes
530 	 * anyway in anticipation of another getnewvnode().
531 	 *
532 	 * If we're cleaning we only write cleaning and ifile blocks, and
533 	 * no dirops, since otherwise we'd risk corruption in a crash.
534 	 */
535 	if(sp->seg_flags & SEGM_CLEAN)
536 		lfs_writevnodes(fs, mp, sp, VN_CLEAN);
537 	else {
538 		lfs_writevnodes(fs, mp, sp, VN_REG);
539 		if(!fs->lfs_dirops || !fs->lfs_flushvp) {
			/* Wait for fs->lfs_dirops to drop to zero. */
540 			while(fs->lfs_dirops)
541 				if((error = tsleep(&fs->lfs_writer, PRIBIO + 1,
542 						"lfs writer", 0)))
543 				{
					/*
					 * NOTE(review): this path frees the segment
					 * structure and returns without calling
					 * lfs_segunlock(); confirm the seglock state
					 * this leaves behind.
					 */
544 					/* XXX why not segunlock? */
545 					free(sp->bpp, M_SEGMENT);
546 					sp->bpp = NULL;
547 					free(sp, M_SEGMENT);
548 					fs->lfs_sp = NULL;
549 					return (error);
550 				}
			/* Count ourselves as a writer; undone near the end of the function. */
551 			fs->lfs_writer++;
552 			writer_set=1;
553 			lfs_writevnodes(fs, mp, sp, VN_DIROP);
554 			((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
555 		}
556 	}
557 
558 	/*
559 	 * If we are doing a checkpoint, mark everything since the
560 	 * last checkpoint as no longer ACTIVE.
561 	 */
562 	if (do_ckp) {
		/* Visit the lfs_segtabsz Ifile blocks that follow the first lfs_cleansz blocks, last to first. */
563 		for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz;
564 		     --ibno >= fs->lfs_cleansz; ) {
565 			dirty = 0;
566 			if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize, NOCRED, &bp))
567 
568 				panic("lfs_segwrite: ifile read");
569 			segusep = (SEGUSE *)bp->b_data;
570 			for (i = fs->lfs_sepb; i--;) {
571 				if (segusep->su_flags & SEGUSE_ACTIVE) {
572 					segusep->su_flags &= ~SEGUSE_ACTIVE;
573 					++dirty;
574 				}
				/* Entry stride depends on the on-disk version (SEGUSE vs SEGUSE_V1). */
575 				if (fs->lfs_version > 1)
576 					++segusep;
577 				else
578 					segusep = (SEGUSE *)
579 						((SEGUSE_V1 *)segusep + 1);
580 			}
581 
582 			/* But the current segment is still ACTIVE */
583 			segusep = (SEGUSE *)bp->b_data;
584 			if (dtosn(fs, fs->lfs_curseg) / fs->lfs_sepb ==
585 			    (ibno-fs->lfs_cleansz)) {
586 				if (fs->lfs_version > 1)
587 					segusep[dtosn(fs, fs->lfs_curseg) %
588 					     fs->lfs_sepb].su_flags |=
589 						     SEGUSE_ACTIVE;
590 				else
591 					((SEGUSE *)
592 					 ((SEGUSE_V1 *)(bp->b_data) +
593 					  (dtosn(fs, fs->lfs_curseg) %
594 					   fs->lfs_sepb)))->su_flags
595 						   |= SEGUSE_ACTIVE;
				/* Offsets the ++dirty counted when this entry's flag was cleared above. */
596 				--dirty;
597 			}
598 			if (dirty)
599 				error = VOP_BWRITE(bp); /* Ifile */
600 			else
601 				brelse(bp);
602 		}
603 	}
604 
605 	did_ckp = 0;
606 	if (do_ckp || fs->lfs_doifile) {
		/*
		 * Write the Ifile's blocks and inode; when checkpointing,
		 * repeat for as long as lfs_writeseg() returns nonzero.
		 */
607 		do {
608 			vp = fs->lfs_ivnode;
609 
610 			vget(vp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
611 
612 			ip = VTOI(vp);
613 			if (vp->v_dirtyblkhd.lh_first != NULL)
614 				lfs_writefile(fs, sp, vp);
615 			if (ip->i_flag & IN_ALLMOD)
616 				++did_ckp;
617 			(void) lfs_writeinode(fs, sp, ip);
618 
619 			vput(vp);
620 		} while (lfs_writeseg(fs, sp) && do_ckp);
621 
622 		/* The ifile should now be all clear */
623 		LFS_CLR_UINO(ip, IN_ALLMOD);
624 	} else {
625 		(void) lfs_writeseg(fs, sp);
626 	}
627 
628 	/*
629 	 * If the I/O count is non-zero, sleep until it reaches zero.
630 	 * At the moment, the user's process hangs around so we can
631 	 * sleep.
632 	 */
633 	fs->lfs_doifile = 0;
634 	if(writer_set && --fs->lfs_writer==0)
635 		wakeup(&fs->lfs_dirops);
636 
637 	/*
638 	 * If we didn't write the Ifile, we didn't really do anything.
639 	 * That means that (1) there is a checkpoint on disk and (2)
640 	 * nothing has changed since it was written.
641 	 *
642 	 * Take the flags off of the segment so that lfs_segunlock
643 	 * doesn't have to write the superblock either.
644 	 */
645 	if (did_ckp == 0) {
646 		sp->seg_flags &= ~(SEGM_SYNC|SEGM_CKP);
647 		/* if(do_ckp) printf("lfs_segwrite: no checkpoint\n"); */
648 	}
649 
650 	if(lfs_dostats) {
651 		++lfs_stats.nwrites;
652 		if (sp->seg_flags & SEGM_SYNC)
653 			++lfs_stats.nsync_writes;
654 		if (sp->seg_flags & SEGM_CKP)
655 			++lfs_stats.ncheckpoints;
656 	}
657 	lfs_segunlock(fs);
658 	return (0);
659 }
660 
661 /*
662  * Write the dirty blocks associated with a vnode.
663  */
664 void
665 lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
666 {
667 	struct buf *bp;
668 	struct finfo *fip;
669 	IFILE *ifp;
670 
671 
672 	if (sp->seg_bytes_left < fs->lfs_bsize ||
673 	    sp->sum_bytes_left < sizeof(struct finfo))
674 		(void) lfs_writeseg(fs, sp);
675 
676 	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(ufs_daddr_t);
677 	++((SEGSUM *)(sp->segsum))->ss_nfinfo;
678 
679 	if(vp->v_flag & VDIROP)
680 		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
681 
682 	fip = sp->fip;
683 	fip->fi_nblocks = 0;
684 	fip->fi_ino = VTOI(vp)->i_number;
685 	LFS_IENTRY(ifp, fs, fip->fi_ino, bp);
686 	fip->fi_version = ifp->if_version;
687 	brelse(bp);
688 
689 	if(sp->seg_flags & SEGM_CLEAN)
690 	{
691 		lfs_gather(fs, sp, vp, lfs_match_fake);
692 		/*
693 		 * For a file being flushed, we need to write *all* blocks.
694 		 * This means writing the cleaning blocks first, and then
695 		 * immediately following with any non-cleaning blocks.
696 		 * The same is true of the Ifile since checkpoints assume
697 		 * that all valid Ifile blocks are written.
698 		 */
699 	   	if(IS_FLUSHING(fs,vp) || VTOI(vp)->i_number == LFS_IFILE_INUM)
700 			lfs_gather(fs, sp, vp, lfs_match_data);
701 	} else
702 		lfs_gather(fs, sp, vp, lfs_match_data);
703 
704 	/*
705 	 * It may not be necessary to write the meta-data blocks at this point,
706 	 * as the roll-forward recovery code should be able to reconstruct the
707 	 * list.
708 	 *
709 	 * We have to write them anyway, though, under two conditions: (1) the
710 	 * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
711 	 * checkpointing.
712 	 */
713 	if(lfs_writeindir
714 	   || IS_FLUSHING(fs,vp)
715 	   || (sp->seg_flags & SEGM_CKP))
716 	{
717 		lfs_gather(fs, sp, vp, lfs_match_indir);
718 		lfs_gather(fs, sp, vp, lfs_match_dindir);
719 		lfs_gather(fs, sp, vp, lfs_match_tindir);
720 	}
721 	fip = sp->fip;
722 	if (fip->fi_nblocks != 0) {
723 		sp->fip = (FINFO*)((caddr_t)fip + sizeof(struct finfo) +
724 				   sizeof(ufs_daddr_t) * (fip->fi_nblocks-1));
725 		sp->start_lbp = &sp->fip->fi_blocks[0];
726 	} else {
727 		sp->sum_bytes_left += sizeof(FINFO) - sizeof(ufs_daddr_t);
728 		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
729 	}
730 }
731 
/*
 * lfs_writeinode: copy the in-core inode `ip' into the current inode
 * block of partial segment `sp' (allocating a new inode block first if
 * none is open), then record the inode's new disk address: in
 * fs->lfs_idaddr for the Ifile's own inode, in the inode's Ifile entry
 * otherwise.  Finally, DINODE_SIZE bytes are subtracted from the usage
 * count of the segment that held the previous copy.
 *
 * Returns 0 if none of the IN_ALLMOD flags is set on the inode, or if the
 * Ifile's inode was already placed in this partial segment and has just
 * been overwritten in place.  Otherwise returns nonzero only when `ip' is
 * the Ifile's inode and the segment-usage buffer updated at the end was
 * not flagged B_GATHERED; callers use this to decide whether to loop.
 */
732 int
733 lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
734 {
735 	struct buf *bp, *ibp;
736 	struct dinode *cdp;
737 	IFILE *ifp;
738 	SEGUSE *sup;
739 	ufs_daddr_t daddr;
740 	daddr_t *daddrp;
741 	ino_t ino;
742 	int error, i, ndx, fsb = 0;
743 	int redo_ifile = 0;
744 	struct timespec ts;
745 	int gotblk = 0;
746 
747 	if (!(ip->i_flag & IN_ALLMOD))
748 		return(0);
749 
750 	/* Allocate a new inode block if necessary. */
751 	if ((ip->i_number != LFS_IFILE_INUM || sp->idp==NULL) && sp->ibp == NULL) {
752 		/* Allocate a new segment if necessary. */
753 		if (sp->seg_bytes_left < fs->lfs_ibsize ||
754 		    sp->sum_bytes_left < sizeof(ufs_daddr_t))
755 			(void) lfs_writeseg(fs, sp);
756 
757 		/* Get next inode block. */
758 		daddr = fs->lfs_offset;
759 		fs->lfs_offset += btofsb(fs, fs->lfs_ibsize);
760 		sp->ibp = *sp->cbpp++ =
761 			getblk(VTOI(fs->lfs_ivnode)->i_devvp, fsbtodb(fs, daddr),
762 			       fs->lfs_ibsize, 0, 0);
		/* Remember that this call obtained the buffer, so it is released below. */
763 		gotblk++;
764 
765 		/* Zero out inode numbers */
766 		for (i = 0; i < INOPB(fs); ++i)
767 			((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0;
768 
769 		++sp->start_bpp;
770 		fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize);
771 		/* Set remaining space counters. */
772 		sp->seg_bytes_left -= fs->lfs_ibsize;
773 		sp->sum_bytes_left -= sizeof(ufs_daddr_t);
		/* Inode block addresses are stored from the end of the summary block, growing downward. */
774 		ndx = fs->lfs_sumsize / sizeof(ufs_daddr_t) -
775 			sp->ninodes / INOPB(fs) - 1;
776 		((ufs_daddr_t *)(sp->segsum))[ndx] = daddr;
777 	}
778 
779 	/* Update the inode times and copy the inode onto the inode page. */
780 	TIMEVAL_TO_TIMESPEC(&time, &ts);
781 	LFS_ITIMES(ip, &ts, &ts, &ts);
782 
783 	/*
784 	 * If this is the Ifile, and we've already written the Ifile in this
785 	 * partial segment, just overwrite it (it's not on disk yet) and
786 	 * continue.
787 	 *
788 	 * XXX we know that the bp that we get the second time around has
789 	 * already been gathered.
790 	 */
791 	if(ip->i_number == LFS_IFILE_INUM && sp->idp) {
792 		*(sp->idp) = ip->i_din.ffs_din;
793 		return 0;
794 	}
795 
796 	bp = sp->ibp;
	/* Slot for this inode within the current inode block. */
797 	cdp = ((struct dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
798 	*cdp = ip->i_din.ffs_din;
799 #ifdef LFS_IFILE_FRAG_ADDRESSING
800 	if (fs->lfs_version > 1)
801 		fsb = (sp->ninodes % INOPB(fs)) / INOPF(fs);
802 #endif
803 
804 	/*
805 	 * If we are cleaning, ensure that we don't write UNWRITTEN disk
806 	 * addresses to disk.
807 	 */
808 	if (ip->i_lfs_effnblks != ip->i_ffs_blocks) {
809 #ifdef DEBUG_LFS
810 		printf("lfs_writeinode: cleansing ino %d (%d != %d)\n",
811 		       ip->i_number, ip->i_lfs_effnblks, ip->i_ffs_blocks);
812 #endif
		/* Scan from di_db up to di_ib + NIADDR in the on-disk copy only. */
813 		for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR;
814 		     daddrp++) {
815 			if (*daddrp == UNWRITTEN) {
816 #ifdef DEBUG_LFS
817 				printf("lfs_writeinode: wiping UNWRITTEN\n");
818 #endif
819 				*daddrp = 0;
820 			}
821 		}
822 	}
823 
824 	if(ip->i_flag & IN_CLEANING)
825 		LFS_CLR_UINO(ip, IN_CLEANING);
826 	else {
827 		/* XXX IN_ALLMOD */
828 		LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE |
829 			     IN_UPDATE);
		/* IN_MODIFIED stays set while effective and real block counts differ. */
830 		if (ip->i_lfs_effnblks == ip->i_ffs_blocks)
831 			LFS_CLR_UINO(ip, IN_MODIFIED);
832 #ifdef DEBUG_LFS
833 		else
834 			printf("lfs_writeinode: ino %d: real blks=%d, "
835 			       "eff=%d\n", ip->i_number, ip->i_ffs_blocks,
836 			       ip->i_lfs_effnblks);
837 #endif
838 	}
839 
840 	if(ip->i_number == LFS_IFILE_INUM) /* We know sp->idp == NULL */
841 		sp->idp = ((struct dinode *)bp->b_data) +
842 			(sp->ninodes % INOPB(fs));
843 	if(gotblk) {
844 		LFS_LOCK_BUF(bp);
845 		brelse(bp);
846 	}
847 
848 	/* Increment inode count in segment summary block. */
849 	++((SEGSUM *)(sp->segsum))->ss_ninos;
850 
851 	/* If this page is full, set flag to allocate a new page. */
852 	if (++sp->ninodes % INOPB(fs) == 0)
853 		sp->ibp = NULL;
854 
855 	/*
856 	 * If updating the ifile, update the super-block.  Update the disk
857 	 * address and access times for this inode in the ifile.
858 	 */
859 	ino = ip->i_number;
	/* In both branches `daddr' ends up holding the inode's previous address. */
860 	if (ino == LFS_IFILE_INUM) {
861 		daddr = fs->lfs_idaddr;
862 		fs->lfs_idaddr = dbtofsb(fs, bp->b_blkno);
863 	} else {
864 		LFS_IENTRY(ifp, fs, ino, ibp);
865 		daddr = ifp->if_daddr;
866 		ifp->if_daddr = dbtofsb(fs, bp->b_blkno) + fsb;
867 #ifdef LFS_DEBUG_NEXTFREE
868 		if(ino > 3 && ifp->if_nextfree) {
869 			vprint("lfs_writeinode",ITOV(ip));
870 			printf("lfs_writeinode: updating free ino %d\n",
871 				ip->i_number);
872 		}
873 #endif
874 		error = VOP_BWRITE(ibp); /* Ifile */
875 	}
876 
877 	/*
878 	 * Account the inode: it no longer belongs to its former segment,
879 	 * though it will not belong to the new segment until that segment
880 	 * is actually written.
881 	 */
882 #ifdef DEBUG
883 	/*
884 	 * The inode's last address should not be in the current partial
885 	 * segment, except under exceptional circumstances (lfs_writevnodes
886 	 * had to start over, and in the meantime more blocks were written
887 	 * to a vnode).  Although the previous inode won't be accounted in
888 	 * su_nbytes until lfs_writeseg, this shouldn't be a problem as we
889 	 * have more data blocks in the current partial segment.
890 	 */
891 	if (daddr >= fs->lfs_lastpseg && daddr <= dbtofsb(fs, bp->b_blkno))
892 		printf("lfs_writeinode: last inode addr in current pseg "
893 		       "(ino %d daddr 0x%x)\n", ino, daddr);
894 #endif
895 	if (daddr != LFS_UNUSED_DADDR) {
		/* From here on `bp' refers to the segment-usage buffer, not the inode block. */
896 		LFS_SEGENTRY(sup, fs, dtosn(fs, daddr), bp);
897 #ifdef DIAGNOSTIC
898 		if (sup->su_nbytes < DINODE_SIZE) {
899 			printf("lfs_writeinode: negative bytes "
900 			       "(segment %d short by %d)\n",
901 			       dtosn(fs, daddr),
902 			       (int)DINODE_SIZE - sup->su_nbytes);
903 			panic("lfs_writeinode: negative bytes");
			/* NOTE(review): only reached if panic() returns. */
904 			sup->su_nbytes = DINODE_SIZE;
905 		}
906 #endif
907 #ifdef DEBUG_SU_NBYTES
908 		printf("seg %d -= %d for ino %d inode\n",
909 		       dtosn(fs, daddr), DINODE_SIZE, ino);
910 #endif
911 		sup->su_nbytes -= DINODE_SIZE;
912 		redo_ifile =
913 			(ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
914 		error = VOP_BWRITE(bp); /* Ifile */
915 	}
916 	return (redo_ifile);
917 }
918 
919 int
920 lfs_gatherblock(struct segment *sp, struct buf *bp, int *sptr)
921 {
922 	struct lfs *fs;
923 	int version;
924 
925 	/*
926 	 * If full, finish this segment.  We may be doing I/O, so
927 	 * release and reacquire the splbio().
928 	 */
929 #ifdef DIAGNOSTIC
930 	if (sp->vp == NULL)
931 		panic ("lfs_gatherblock: Null vp in segment");
932 #endif
933 	fs = sp->fs;
934 	if (sp->sum_bytes_left < sizeof(ufs_daddr_t) ||
935 	    sp->seg_bytes_left < bp->b_bcount) {
936 		if (sptr)
937 			splx(*sptr);
938 		lfs_updatemeta(sp);
939 
940 		version = sp->fip->fi_version;
941 		(void) lfs_writeseg(fs, sp);
942 
943 		sp->fip->fi_version = version;
944 		sp->fip->fi_ino = VTOI(sp->vp)->i_number;
945 		/* Add the current file to the segment summary. */
946 		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
947 		sp->sum_bytes_left -=
948 			sizeof(struct finfo) - sizeof(ufs_daddr_t);
949 
950 		if (sptr)
951 			*sptr = splbio();
952 		return(1);
953 	}
954 
955 #ifdef DEBUG
956 	if(bp->b_flags & B_GATHERED) {
957 		printf("lfs_gatherblock: already gathered! Ino %d, lbn %d\n",
958 		       sp->fip->fi_ino, bp->b_lblkno);
959 		return(0);
960 	}
961 #endif
962 	/* Insert into the buffer list, update the FINFO block. */
963 	bp->b_flags |= B_GATHERED;
964 	*sp->cbpp++ = bp;
965 	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;
966 
967 	sp->sum_bytes_left -= sizeof(ufs_daddr_t);
968 	sp->seg_bytes_left -= bp->b_bcount;
969 	return(0);
970 }
971 
/*
 * lfs_gather: scan the dirty buffer list of `vp' and hand every buffer
 * that `match' accepts (and that is neither B_BUSY nor already
 * B_GATHERED) to lfs_gatherblock() for inclusion in partial segment `sp'.
 * For VBLK vnodes the buffers are instead written directly with bwrite().
 *
 * The scan runs at splbio and restarts from the top whenever
 * lfs_gatherblock() returns nonzero.  lfs_updatemeta(sp) is called before
 * returning, and sp->vp is set to `vp' for the duration of the call.
 *
 * Returns the number of buffers processed.
 */
972 int
973 lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp, int (*match)(struct lfs *, struct buf *))
974 {
975 	struct buf *bp;
976 	int s, count=0;
977 
978 	sp->vp = vp;
979 	s = splbio();
980 
	/*
	 * Unless LFS_NO_BACKBUF_HACK is defined the list is walked front to
	 * back; with it defined, the list is walked back to front using the
	 * le_prev pointers.
	 */
981 #ifndef LFS_NO_BACKBUF_HACK
982 loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {
983 #else /* LFS_NO_BACKBUF_HACK */
984 /* This is a hack to see if ordering the blocks in LFS makes a difference. */
985 # define	BUF_OFFSET	(((void *)&bp->b_vnbufs.le_next) - (void *)bp)
986 # define	BACK_BUF(BP)	((struct buf *)(((void *)BP->b_vnbufs.le_prev) - BUF_OFFSET))
987 # define	BEG_OF_LIST	((struct buf *)(((void *)&vp->v_dirtyblkhd.lh_first) - BUF_OFFSET))
988 /* Find last buffer. */
989 loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL;
990 	    bp = bp->b_vnbufs.le_next);
991 	for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) {
992 #endif /* LFS_NO_BACKBUF_HACK */
993 		if ((bp->b_flags & (B_BUSY|B_GATHERED)) || !match(fs, bp))
994 			continue;
995 		if(vp->v_type == VBLK) {
996 			/* For block devices, just write the blocks. */
997 			/* XXX Do we really need to even do this? */
998 #ifdef DEBUG_LFS
999 			if(count==0)
1000 				printf("BLK(");
1001 			printf(".");
1002 #endif
1003 			/* Get the block before bwrite, so we don't corrupt the free list */
1004 			bp->b_flags |= B_BUSY;
1005 			bremfree(bp);
			/*
			 * NOTE(review): the loop advances through bp's list
			 * linkage after bwrite(); confirm bp is still on the
			 * dirty list at that point.
			 */
1006 			bwrite(bp);
1007 		} else {
1008 #ifdef DIAGNOSTIC
1009 			if ((bp->b_flags & (B_CALL|B_INVAL))==B_INVAL) {
1010 				printf("lfs_gather: lbn %d is B_INVAL\n",
1011 					bp->b_lblkno);
1012 				VOP_PRINT(bp->b_vp);
1013 			}
1014 			if (!(bp->b_flags & B_DELWRI))
1015 				panic("lfs_gather: bp not B_DELWRI");
1016 			if (!(bp->b_flags & B_LOCKED)) {
1017 				printf("lfs_gather: lbn %d blk %d"
1018 				       " not B_LOCKED\n", bp->b_lblkno,
1019 				       dbtofsb(fs, bp->b_blkno));
1020 				VOP_PRINT(bp->b_vp);
1021 				panic("lfs_gather: bp not B_LOCKED");
1022 			}
1023 #endif
			/*
			 * Nonzero means lfs_gatherblock() wrote out the partial
			 * segment instead of adding bp; it may have dropped
			 * splbio (updating `s'), so rescan from the top.
			 */
1024 			if (lfs_gatherblock(sp, bp, &s)) {
1025 				goto loop;
1026 			}
1027 		}
1028 		count++;
1029 	}
1030 	splx(s);
1031 #ifdef DEBUG_LFS
1032 	if(vp->v_type == VBLK && count)
1033 		printf(")\n");
1034 #endif
1035 	lfs_updatemeta(sp);
1036 	sp->vp = NULL;
1037 	return count;
1038 }
1039 
1040 /*
1041  * Update the metadata that points to the blocks listed in the FINFO
1042  * array.
1043  */
1044 void
1045 lfs_updatemeta(struct segment *sp)
1046 {
1047 	SEGUSE *sup;
1048 	struct buf *bp;
1049 	struct lfs *fs;
1050 	struct vnode *vp;
1051 	struct indir a[NIADDR + 2], *ap;
1052 	struct inode *ip;
1053 	ufs_daddr_t daddr, lbn, off;
1054 	daddr_t ooff;
1055 	int error, i, nblocks, num;
1056 	int bb;
1057 
1058 	vp = sp->vp;
1059 	nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
1060 	if (nblocks < 0)
1061 		panic("This is a bad thing\n");
1062 	if (vp == NULL || nblocks == 0)
1063 		return;
1064 
1065 	/* Sort the blocks. */
1066 	/*
1067 	 * XXX KS - We have to sort even if the blocks come from the
1068 	 * cleaner, because there might be other pending blocks on the
1069 	 * same inode...and if we don't sort, and there are fragments
1070 	 * present, blocks may be written in the wrong place.
1071 	 */
1072 	/* if (!(sp->seg_flags & SEGM_CLEAN)) */
1073 	lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks);
1074 
1075 	/*
1076 	 * Record the length of the last block in case it's a fragment.
1077 	 * If there are indirect blocks present, they sort last.  An
1078 	 * indirect block will be lfs_bsize and its presence indicates
1079 	 * that you cannot have fragments.
1080 	 */
1081 	sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount;
1082 
1083 	/*
1084 	 * Assign disk addresses, and update references to the logical
1085 	 * block and the segment usage information.
1086 	 */
1087 	fs = sp->fs;
1088 	for (i = nblocks; i--; ++sp->start_bpp) {
1089 		lbn = *sp->start_lbp++;
1090 
1091 		(*sp->start_bpp)->b_blkno = fsbtodb(fs, fs->lfs_offset);
1092 		off = fs->lfs_offset;
1093 		if((*sp->start_bpp)->b_blkno == (*sp->start_bpp)->b_lblkno) {
1094 			printf("lfs_updatemeta: ino %d blk %d"
1095 			       " has same lbn and daddr\n",
1096 			       VTOI(vp)->i_number, off);
1097 		}
1098 #ifdef DIAGNOSTIC
1099 		if((*sp->start_bpp)->b_bcount < fs->lfs_bsize && i != 0)
1100 			panic("lfs_updatemeta: fragment is not last block\n");
1101 #endif
1102 		bb = fragstofsb(fs, numfrags(fs, (*sp->start_bpp)->b_bcount));
1103 		fs->lfs_offset += bb;
1104 		error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL);
1105 		if (daddr > 0)
1106 			daddr = dbtofsb(fs, daddr);
1107 		if (error)
1108 			panic("lfs_updatemeta: ufs_bmaparray %d", error);
1109 		ip = VTOI(vp);
1110 		switch (num) {
1111 		case 0:
1112 			ooff = ip->i_ffs_db[lbn];
1113 #ifdef DEBUG
1114 			if (ooff == 0) {
1115 				printf("lfs_updatemeta[1]: warning: writing "
1116 				       "ino %d lbn %d at 0x%x, was 0x0\n",
1117 				       ip->i_number, lbn, off);
1118 			}
1119 #endif
1120 			if (ooff == UNWRITTEN)
1121 				ip->i_ffs_blocks += bb;
1122 			ip->i_ffs_db[lbn] = off;
1123 			break;
1124 		case 1:
1125 			ooff = ip->i_ffs_ib[a[0].in_off];
1126 #ifdef DEBUG
1127 			if (ooff == 0) {
1128 				printf("lfs_updatemeta[2]: warning: writing "
1129 				       "ino %d lbn %d at 0x%x, was 0x0\n",
1130 				       ip->i_number, lbn, off);
1131 			}
1132 #endif
1133 			if (ooff == UNWRITTEN)
1134 				ip->i_ffs_blocks += bb;
1135 			ip->i_ffs_ib[a[0].in_off] = off;
1136 			break;
1137 		default:
1138 			ap = &a[num - 1];
1139 			if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
1140 				panic("lfs_updatemeta: bread bno %d",
1141 				      ap->in_lbn);
1142 
1143 			ooff = ((ufs_daddr_t *)bp->b_data)[ap->in_off];
1144 #if DEBUG
1145 			if (ooff == 0) {
1146 				printf("lfs_updatemeta[3]: warning: writing "
1147 				       "ino %d lbn %d at 0x%x, was 0x0\n",
1148 				       ip->i_number, lbn, off);
1149 			}
1150 #endif
1151 			if (ooff == UNWRITTEN)
1152 				ip->i_ffs_blocks += bb;
1153 			((ufs_daddr_t *)bp->b_data)[ap->in_off] = off;
1154 			(void) VOP_BWRITE(bp);
1155 		}
1156 #ifdef DEBUG
1157 		if (daddr >= fs->lfs_lastpseg && daddr <= off) {
1158 			printf("lfs_updatemeta: ino %d, lbn %d, addr = %x "
1159 			       "in same pseg\n", VTOI(sp->vp)->i_number,
1160 			       (*sp->start_bpp)->b_lblkno, daddr);
1161 		}
1162 #endif
1163 		/* Update segment usage information. */
1164 		if (daddr > 0) {
1165 			LFS_SEGENTRY(sup, fs, dtosn(fs, daddr), bp);
1166 #ifdef DIAGNOSTIC
1167 			if (sup->su_nbytes < (*sp->start_bpp)->b_bcount) {
1168 				/* XXX -- Change to a panic. */
1169 				printf("lfs_updatemeta: negative bytes "
1170 				       "(segment %d short by %ld)\n",
1171 				       dtosn(fs, daddr),
1172 				       (*sp->start_bpp)->b_bcount -
1173 				       sup->su_nbytes);
1174 				printf("lfs_updatemeta: ino %d, lbn %d, "
1175 				       "addr = 0x%x\n", VTOI(sp->vp)->i_number,
1176 				       (*sp->start_bpp)->b_lblkno, daddr);
1177 				panic("lfs_updatemeta: negative bytes");
1178 				sup->su_nbytes = (*sp->start_bpp)->b_bcount;
1179 			}
1180 #endif
1181 #ifdef DEBUG_SU_NBYTES
1182 			printf("seg %d -= %ld for ino %d lbn %d db 0x%x\n",
1183 			       dtosn(fs, daddr), (*sp->start_bpp)->b_bcount,
1184 			       VTOI(sp->vp)->i_number,
1185 			       (*sp->start_bpp)->b_lblkno, daddr);
1186 #endif
1187 			sup->su_nbytes -= (*sp->start_bpp)->b_bcount;
1188 			error = VOP_BWRITE(bp); /* Ifile */
1189 		}
1190 	}
1191 }
1192 
1193 /*
1194  * Start a new segment.
1195  */
1196 int
1197 lfs_initseg(struct lfs *fs)
1198 {
1199 	struct segment *sp;
1200 	SEGUSE *sup;
1201 	SEGSUM *ssp;
1202 	struct buf *bp;
1203 	int repeat;
1204 
1205 	sp = fs->lfs_sp;
1206 
1207 	repeat = 0;
1208 	/* Advance to the next segment. */
1209 	if (!LFS_PARTIAL_FITS(fs)) {
1210 		/* lfs_avail eats the remaining space */
1211 		fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset -
1212 						   fs->lfs_curseg);
1213 		/* Wake up any cleaning procs waiting on this file system. */
1214 		wakeup(&lfs_allclean_wakeup);
1215 		wakeup(&fs->lfs_nextseg);
1216 		lfs_newseg(fs);
1217 		repeat = 1;
1218 		fs->lfs_offset = fs->lfs_curseg;
1219 		sp->seg_number = dtosn(fs, fs->lfs_curseg);
1220 		sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg);
1221 		/*
1222 		 * If the segment contains a superblock, update the offset
1223 		 * and summary address to skip over it.
1224 		 */
1225 		LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
1226 		if (sup->su_flags & SEGUSE_SUPERBLOCK) {
1227 			fs->lfs_offset += btofsb(fs, LFS_SBPAD);
1228 			sp->seg_bytes_left -= LFS_SBPAD;
1229 		}
1230 		brelse(bp);
1231 		/* Segment zero could also contain the labelpad */
1232 		if (fs->lfs_version > 1 && sp->seg_number == 0 &&
1233 		    fs->lfs_start < btofsb(fs, LFS_LABELPAD)) {
1234 			fs->lfs_offset += btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
1235 			sp->seg_bytes_left -= LFS_LABELPAD - fsbtob(fs, fs->lfs_start);
1236 		}
1237 	} else {
1238 		sp->seg_number = dtosn(fs, fs->lfs_curseg);
1239 		sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg -
1240 				      (fs->lfs_offset - fs->lfs_curseg));
1241 	}
1242 	fs->lfs_lastpseg = fs->lfs_offset;
1243 
1244 	sp->fs = fs;
1245 	sp->ibp = NULL;
1246 	sp->idp = NULL;
1247 	sp->ninodes = 0;
1248 
1249 	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
1250 	sp->cbpp = sp->bpp;
1251 	*sp->cbpp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
1252 			       fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize);
1253 	sp->segsum = (*sp->cbpp)->b_data;
1254 	bzero(sp->segsum, fs->lfs_sumsize);
1255 	sp->start_bpp = ++sp->cbpp;
1256 	fs->lfs_offset += btofsb(fs, fs->lfs_sumsize);
1257 
1258 	/* Set point to SEGSUM, initialize it. */
1259 	ssp = sp->segsum;
1260 	ssp->ss_next = fs->lfs_nextseg;
1261 	ssp->ss_nfinfo = ssp->ss_ninos = 0;
1262 	ssp->ss_magic = SS_MAGIC;
1263 
1264 	/* Set pointer to first FINFO, initialize it. */
1265 	sp->fip = (struct finfo *)((caddr_t)sp->segsum + SEGSUM_SIZE(fs));
1266 	sp->fip->fi_nblocks = 0;
1267 	sp->start_lbp = &sp->fip->fi_blocks[0];
1268 	sp->fip->fi_lastlength = 0;
1269 
1270 	sp->seg_bytes_left -= fs->lfs_sumsize;
1271 	sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs);
1272 
1273 	return(repeat);
1274 }
1275 
1276 /*
1277  * Return the next segment to write.
1278  */
1279 void
1280 lfs_newseg(struct lfs *fs)
1281 {
1282 	CLEANERINFO *cip;
1283 	SEGUSE *sup;
1284 	struct buf *bp;
1285 	int curseg, isdirty, sn;
1286 
1287 	LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
1288 #ifdef DEBUG_SU_NBYTES
1289 	printf("lfs_newseg: seg %d := 0 in newseg\n",   /* XXXDEBUG */
1290 	       dtosn(fs, fs->lfs_nextseg)); /* XXXDEBUG */
1291 #endif
1292 	sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
1293 	sup->su_nbytes = 0;
1294 	sup->su_nsums = 0;
1295 	sup->su_ninos = 0;
1296 	(void) VOP_BWRITE(bp); /* Ifile */
1297 
1298 	LFS_CLEANERINFO(cip, fs, bp);
1299 	--cip->clean;
1300 	++cip->dirty;
1301 	fs->lfs_nclean = cip->clean;
1302 	LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
1303 
1304 	fs->lfs_lastseg = fs->lfs_curseg;
1305 	fs->lfs_curseg = fs->lfs_nextseg;
1306 	for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) {
1307 		sn = (sn + 1) % fs->lfs_nseg;
1308 		if (sn == curseg)
1309 			panic("lfs_nextseg: no clean segments");
1310 		LFS_SEGENTRY(sup, fs, sn, bp);
1311 		isdirty = sup->su_flags & SEGUSE_DIRTY;
1312 		brelse(bp);
1313 		if (!isdirty)
1314 			break;
1315 	}
1316 
1317 	++fs->lfs_nactive;
1318 	fs->lfs_nextseg = sntod(fs, sn);
1319 	if(lfs_dostats) {
1320 		++lfs_stats.segsused;
1321 	}
1322 }
1323 
1324 int
1325 lfs_writeseg(struct lfs *fs, struct segment *sp)
1326 {
1327 	struct buf **bpp, *bp, *cbp, *newbp;
1328 	SEGUSE *sup;
1329 	SEGSUM *ssp;
1330 	dev_t i_dev;
1331 	char *datap, *dp;
1332 	int do_again, i, nblocks, s;
1333 	size_t el_size;
1334 #ifdef LFS_TRACK_IOS
1335 	int j;
1336 #endif
1337 	int (*strategy)(void *);
1338 	struct vop_strategy_args vop_strategy_a;
1339 	u_short ninos;
1340 	struct vnode *devvp;
1341 	char *p;
1342 	struct vnode *vp;
1343 	struct inode *ip;
1344 	daddr_t *daddrp;
1345 	int changed;
1346 #if defined(DEBUG) && defined(LFS_PROPELLER)
1347 	static int propeller;
1348 	char propstring[4] = "-\\|/";
1349 
1350 	printf("%c\b",propstring[propeller++]);
1351 	if(propeller==4)
1352 		propeller = 0;
1353 #endif
1354 
1355 	/*
1356 	 * If there are no buffers other than the segment summary to write
1357 	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
1358 	 * even if there aren't any buffers, you need to write the superblock.
1359 	 */
1360 	if ((nblocks = sp->cbpp - sp->bpp) == 1)
1361 		return (0);
1362 
1363 	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
1364 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
1365 
1366 	/* Update the segment usage information. */
1367 	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
1368 
1369 	/* Loop through all blocks, except the segment summary. */
1370 	for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
1371 		if((*bpp)->b_vp != devvp) {
1372 			sup->su_nbytes += (*bpp)->b_bcount;
1373 #ifdef DEBUG_SU_NBYTES
1374 		printf("seg %d += %ld for ino %d lbn %d db 0x%x\n",
1375 		       sp->seg_number, (*bpp)->b_bcount,
1376 		       VTOI((*bpp)->b_vp)->i_number,
1377 		       (*bpp)->b_lblkno, (*bpp)->b_blkno);
1378 #endif
1379 		}
1380 	}
1381 
1382 	ssp = (SEGSUM *)sp->segsum;
1383 
1384 	ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
1385 #ifdef DEBUG_SU_NBYTES
1386 	printf("seg %d += %d for %d inodes\n",   /* XXXDEBUG */
1387 	       sp->seg_number, ssp->ss_ninos * DINODE_SIZE,
1388 	       ssp->ss_ninos);
1389 #endif
1390 	sup->su_nbytes += ssp->ss_ninos * DINODE_SIZE;
1391 	/* sup->su_nbytes += fs->lfs_sumsize; */
1392 	if (fs->lfs_version == 1)
1393 		sup->su_olastmod = time.tv_sec;
1394 	else
1395 		sup->su_lastmod = time.tv_sec;
1396 	sup->su_ninos += ninos;
1397 	++sup->su_nsums;
1398 	fs->lfs_dmeta += (btofsb(fs, fs->lfs_sumsize) + btofsb(fs, ninos *
1399 							 fs->lfs_ibsize));
1400 	fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize);
1401 
1402 	do_again = !(bp->b_flags & B_GATHERED);
1403 	(void)VOP_BWRITE(bp); /* Ifile */
1404 	/*
1405 	 * Mark blocks B_BUSY, to prevent then from being changed between
1406 	 * the checksum computation and the actual write.
1407 	 *
1408 	 * If we are cleaning, check indirect blocks for UNWRITTEN, and if
1409 	 * there are any, replace them with copies that have UNASSIGNED
1410 	 * instead.
1411 	 */
1412 	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
1413 		++bpp;
1414 		if((*bpp)->b_flags & B_CALL)
1415 			continue;
1416 		bp = *bpp;
1417 	    again:
1418 		s = splbio();
1419 		if(bp->b_flags & B_BUSY) {
1420 #ifdef DEBUG
1421 			printf("lfs_writeseg: avoiding potential data "
1422 			       "summary corruption for ino %d, lbn %d\n",
1423 			       VTOI(bp->b_vp)->i_number, bp->b_lblkno);
1424 #endif
1425 			bp->b_flags |= B_WANTED;
1426 			tsleep(bp, (PRIBIO + 1), "lfs_writeseg", 0);
1427 			splx(s);
1428 			goto again;
1429 		}
1430 		bp->b_flags |= B_BUSY;
1431 		splx(s);
1432 		/* Check and replace indirect block UNWRITTEN bogosity */
1433 		if(bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp &&
1434 		   VTOI(bp->b_vp)->i_ffs_blocks !=
1435 		   VTOI(bp->b_vp)->i_lfs_effnblks) {
1436 #ifdef DEBUG_LFS
1437 			printf("lfs_writeseg: cleansing ino %d (%d != %d)\n",
1438 			       VTOI(bp->b_vp)->i_number,
1439 			       VTOI(bp->b_vp)->i_lfs_effnblks,
1440 			       VTOI(bp->b_vp)->i_ffs_blocks);
1441 #endif
1442 			/* Make a copy we'll make changes to */
1443 			newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
1444 					   bp->b_bcount);
1445 			newbp->b_blkno = bp->b_blkno;
1446 			memcpy(newbp->b_data, bp->b_data,
1447 			       newbp->b_bcount);
1448 			*bpp = newbp;
1449 
1450 			changed = 0;
1451 			for (daddrp = (daddr_t *)(newbp->b_data);
1452 			     daddrp < (daddr_t *)(newbp->b_data +
1453 						  newbp->b_bcount); daddrp++) {
1454 				if (*daddrp == UNWRITTEN) {
1455 					++changed;
1456 #ifdef DEBUG_LFS
1457 					printf("lfs_writeseg: replacing UNWRITTEN\n");
1458 #endif
1459 					*daddrp = 0;
1460 				}
1461 			}
1462 			/*
1463 			 * Get rid of the old buffer.  Don't mark it clean,
1464 			 * though, if it still has dirty data on it.
1465 			 */
1466 			if (changed) {
1467 				bp->b_flags &= ~(B_ERROR | B_GATHERED);
1468 				if (bp->b_flags & B_CALL) {
1469 					lfs_freebuf(bp);
1470 					bp = NULL;
1471 				} else {
1472 					/* Still on free list, leave it there */
1473 					s = splbio();
1474 					bp->b_flags &= ~B_BUSY;
1475 					if (bp->b_flags & B_WANTED)
1476 						wakeup(bp);
1477 				 	splx(s);
1478 					/*
1479 					 * We have to re-decrement lfs_avail
1480 					 * since this block is going to come
1481 					 * back around to us in the next
1482 					 * segment.
1483 					 */
1484 					fs->lfs_avail -= btofsb(fs, bp->b_bcount);
1485 				}
1486 			} else {
1487 				bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
1488 						 B_GATHERED);
1489 				LFS_UNLOCK_BUF(bp);
1490 				if (bp->b_flags & B_CALL) {
1491 					lfs_freebuf(bp);
1492 					bp = NULL;
1493 				} else {
1494 					bremfree(bp);
1495 					bp->b_flags |= B_DONE;
1496 					reassignbuf(bp, bp->b_vp);
1497 					brelse(bp);
1498 				}
1499 			}
1500 
1501 		}
1502 	}
1503 	/*
1504 	 * Compute checksum across data and then across summary; the first
1505 	 * block (the summary block) is skipped.  Set the create time here
1506 	 * so that it's guaranteed to be later than the inode mod times.
1507 	 *
1508 	 * XXX
1509 	 * Fix this to do it inline, instead of malloc/copy.
1510 	 */
1511 	if (fs->lfs_version == 1)
1512 		el_size = sizeof(u_long);
1513 	else
1514 		el_size = sizeof(u_int32_t);
1515 	datap = dp = malloc(nblocks * el_size, M_SEGMENT, M_WAITOK);
1516 	for (bpp = sp->bpp, i = nblocks - 1; i--;) {
1517 		if (((*++bpp)->b_flags & (B_CALL|B_INVAL)) == (B_CALL|B_INVAL)) {
1518 			if (copyin((*bpp)->b_saveaddr, dp, el_size))
1519 				panic("lfs_writeseg: copyin failed [1]: "
1520 				      "ino %d blk %d",
1521 				      VTOI((*bpp)->b_vp)->i_number,
1522 				      (*bpp)->b_lblkno);
1523 		} else
1524 			memcpy(dp, (*bpp)->b_data, el_size);
1525 		dp += el_size;
1526 	}
1527 	if (fs->lfs_version == 1)
1528 		ssp->ss_ocreate = time.tv_sec;
1529 	else {
1530 		ssp->ss_create = time.tv_sec;
1531 		ssp->ss_serial = ++fs->lfs_serial;
1532 		ssp->ss_ident  = fs->lfs_ident;
1533 	}
1534 	ssp->ss_datasum = cksum(datap, (nblocks - 1) * el_size);
1535 	ssp->ss_sumsum =
1536 	    cksum(&ssp->ss_datasum, fs->lfs_sumsize - sizeof(ssp->ss_sumsum));
1537 	free(datap, M_SEGMENT);
1538 	datap = dp = NULL;
1539 #ifdef DIAGNOSTIC
1540 	if (fs->lfs_bfree < btofsb(fs, ninos * fs->lfs_ibsize) + btofsb(fs, fs->lfs_sumsize))
1541 		panic("lfs_writeseg: No diskspace for summary");
1542 #endif
1543 	fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) +
1544 			  btofsb(fs, fs->lfs_sumsize));
1545 
1546 	strategy = devvp->v_op[VOFFSET(vop_strategy)];
1547 
1548 	/*
1549 	 * When we simply write the blocks we lose a rotation for every block
1550 	 * written.  To avoid this problem, we allocate memory in chunks, copy
1551 	 * the buffers into the chunk and write the chunk.  CHUNKSIZE is the
1552 	 * largest size I/O devices can handle.
1553 	 * When the data is copied to the chunk, turn off the B_LOCKED bit
1554 	 * and brelse the buffer (which will move them to the LRU list).  Add
1555 	 * the B_CALL flag to the buffer header so we can count I/O's for the
1556 	 * checkpoints and so we can release the allocated memory.
1557 	 *
1558 	 * XXX
1559 	 * This should be removed if the new virtual memory system allows us to
1560 	 * easily make the buffers contiguous in kernel memory and if that's
1561 	 * fast enough.
1562 	 */
1563 
1564 #define CHUNKSIZE MAXPHYS
1565 
1566 	if(devvp==NULL)
1567 		panic("devvp is NULL");
1568 	for (bpp = sp->bpp,i = nblocks; i;) {
1569 		cbp = lfs_newbuf(fs, devvp, (*bpp)->b_blkno, CHUNKSIZE);
1570 		cbp->b_dev = i_dev;
1571 		cbp->b_flags |= B_ASYNC | B_BUSY;
1572 		cbp->b_bcount = 0;
1573 
1574 #ifdef DIAGNOSTIC
1575 		if(dtosn(fs, dbtofsb(fs, (*bpp)->b_blkno) + btofsb(fs, (*bpp)->b_bcount) - 1) !=
1576 		   dtosn(fs, dbtofsb(fs, cbp->b_blkno))) {
1577 			panic("lfs_writeseg: Segment overwrite");
1578 		}
1579 #endif
1580 
1581 		s = splbio();
1582 		if(fs->lfs_iocount >= LFS_THROTTLE) {
1583 			tsleep(&fs->lfs_iocount, PRIBIO+1, "lfs throttle", 0);
1584 		}
1585 		++fs->lfs_iocount;
1586 #ifdef LFS_TRACK_IOS
1587 		for(j=0;j<LFS_THROTTLE;j++) {
1588 			if(fs->lfs_pending[j]==LFS_UNUSED_DADDR) {
1589 				fs->lfs_pending[j] = dbtofsb(fs, cbp->b_blkno);
1590 				break;
1591 			}
1592 		}
1593 #endif /* LFS_TRACK_IOS */
1594 		for (p = cbp->b_data; i && cbp->b_bcount < CHUNKSIZE; i--) {
1595 			bp = *bpp;
1596 
1597 			if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
1598 				break;
1599 
1600 			/*
1601 			 * Fake buffers from the cleaner are marked as B_INVAL.
1602 			 * We need to copy the data from user space rather than
1603 			 * from the buffer indicated.
1604 			 * XXX == what do I do on an error?
1605 			 */
1606 			if ((bp->b_flags & (B_CALL|B_INVAL)) == (B_CALL|B_INVAL)) {
1607 				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
1608 					panic("lfs_writeseg: copyin failed [2]");
1609 			} else
1610 				bcopy(bp->b_data, p, bp->b_bcount);
1611 			p += bp->b_bcount;
1612 			cbp->b_bcount += bp->b_bcount;
1613 			LFS_UNLOCK_BUF(bp);
1614 			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
1615 					 B_GATHERED);
1616 			vp = bp->b_vp;
1617 			if (bp->b_flags & B_CALL) {
1618 				/* if B_CALL, it was created with newbuf */
1619 				lfs_freebuf(bp);
1620 				bp = NULL;
1621 			} else {
1622 				bremfree(bp);
1623 				bp->b_flags |= B_DONE;
1624 				if(vp)
1625 					reassignbuf(bp, vp);
1626 				brelse(bp);
1627 			}
1628 
1629 			bpp++;
1630 
1631 			/*
1632 			 * If this is the last block for this vnode, but
1633 			 * there are other blocks on its dirty list,
1634 			 * set IN_MODIFIED/IN_CLEANING depending on what
1635 			 * sort of block.  Only do this for our mount point,
1636 			 * not for, e.g., inode blocks that are attached to
1637 			 * the devvp.
1638 			 * XXX KS - Shouldn't we set *both* if both types
1639 			 * of blocks are present (traverse the dirty list?)
1640 			 */
1641 			if((i == 1 ||
1642 			    (i > 1 && vp && *bpp && (*bpp)->b_vp != vp)) &&
1643 			   (bp = vp->v_dirtyblkhd.lh_first) != NULL &&
1644 			   vp->v_mount == fs->lfs_ivnode->v_mount)
1645 			{
1646 				ip = VTOI(vp);
1647 #ifdef DEBUG_LFS
1648 				printf("lfs_writeseg: marking ino %d\n",
1649 				       ip->i_number);
1650 #endif
1651 				if(bp->b_flags & B_CALL)
1652 					LFS_SET_UINO(ip, IN_CLEANING);
1653 				else
1654 					LFS_SET_UINO(ip, IN_MODIFIED);
1655 			}
1656 			wakeup(vp);
1657 		}
1658 		++cbp->b_vp->v_numoutput;
1659 		splx(s);
1660 		/*
1661 		 * XXXX This is a gross and disgusting hack.  Since these
1662 		 * buffers are physically addressed, they hang off the
1663 		 * device vnode (devvp).  As a result, they have no way
1664 		 * of getting to the LFS superblock or lfs structure to
1665 		 * keep track of the number of I/O's pending.  So, I am
1666 		 * going to stuff the fs into the saveaddr field of
1667 		 * the buffer (yuk).
1668 		 */
1669 		cbp->b_saveaddr = (caddr_t)fs;
1670 		vop_strategy_a.a_desc = VDESC(vop_strategy);
1671 		vop_strategy_a.a_bp = cbp;
1672 		(strategy)(&vop_strategy_a);
1673 	}
1674 #if 1 || defined(DEBUG)
1675 	/*
1676 	 * After doing a big write, we recalculate how many buffers are
1677 	 * really still left on the locked queue.
1678 	 */
1679 	s = splbio();
1680 	lfs_countlocked(&locked_queue_count, &locked_queue_bytes);
1681 	splx(s);
1682 	wakeup(&locked_queue_count);
1683 #endif /* 1 || DEBUG */
1684 	if(lfs_dostats) {
1685 		++lfs_stats.psegwrites;
1686 		lfs_stats.blocktot += nblocks - 1;
1687 		if (fs->lfs_sp->seg_flags & SEGM_SYNC)
1688 			++lfs_stats.psyncwrites;
1689 		if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
1690 			++lfs_stats.pcleanwrites;
1691 			lfs_stats.cleanblocks += nblocks - 1;
1692 		}
1693 	}
1694 	return (lfs_initseg(fs) || do_again);
1695 }
1696 
1697 void
1698 lfs_writesuper(struct lfs *fs, daddr_t daddr)
1699 {
1700 	struct buf *bp;
1701 	dev_t i_dev;
1702 	int (*strategy)(void *);
1703 	int s;
1704 	struct vop_strategy_args vop_strategy_a;
1705 
1706 	/*
1707 	 * If we can write one superblock while another is in
1708 	 * progress, we risk not having a complete checkpoint if we crash.
1709 	 * So, block here if a superblock write is in progress.
1710 	 */
1711 	s = splbio();
1712 	while(fs->lfs_sbactive) {
1713 		tsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0);
1714 	}
1715 	fs->lfs_sbactive = daddr;
1716 	splx(s);
1717 	i_dev = VTOI(fs->lfs_ivnode)->i_dev;
1718 	strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)];
1719 
1720 	/* Set timestamp of this version of the superblock */
1721 	if (fs->lfs_version == 1)
1722 		fs->lfs_otstamp = time.tv_sec;
1723 	fs->lfs_tstamp = time.tv_sec;
1724 
1725 	/* Checksum the superblock and copy it into a buffer. */
1726 	fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs));
1727 	bp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, fsbtodb(fs, daddr), LFS_SBPAD);
1728 	*(struct dlfs *)bp->b_data = fs->lfs_dlfs;
1729 
1730 	bp->b_dev = i_dev;
1731 	bp->b_flags |= B_BUSY | B_CALL | B_ASYNC;
1732 	bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI);
1733 	bp->b_iodone = lfs_supercallback;
1734 	/* XXX KS - same nasty hack as above */
1735 	bp->b_saveaddr = (caddr_t)fs;
1736 
1737 	vop_strategy_a.a_desc = VDESC(vop_strategy);
1738 	vop_strategy_a.a_bp = bp;
1739 	s = splbio();
1740 	++bp->b_vp->v_numoutput;
1741 	++fs->lfs_iocount;
1742 	splx(s);
1743 	(strategy)(&vop_strategy_a);
1744 }
1745 
1746 /*
1747  * Logical block number match routines used when traversing the dirty block
1748  * chain.
1749  */
1750 int
1751 lfs_match_fake(struct lfs *fs, struct buf *bp)
1752 {
1753 	return (bp->b_flags & B_CALL);
1754 }
1755 
1756 int
1757 lfs_match_data(struct lfs *fs, struct buf *bp)
1758 {
1759 	return (bp->b_lblkno >= 0);
1760 }
1761 
1762 int
1763 lfs_match_indir(struct lfs *fs, struct buf *bp)
1764 {
1765 	int lbn;
1766 
1767 	lbn = bp->b_lblkno;
1768 	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
1769 }
1770 
1771 int
1772 lfs_match_dindir(struct lfs *fs, struct buf *bp)
1773 {
1774 	int lbn;
1775 
1776 	lbn = bp->b_lblkno;
1777 	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
1778 }
1779 
1780 int
1781 lfs_match_tindir(struct lfs *fs, struct buf *bp)
1782 {
1783 	int lbn;
1784 
1785 	lbn = bp->b_lblkno;
1786 	return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
1787 }
1788 
1789 /*
1790  * XXX - The only buffers that are going to hit these functions are the
1791  * segment write blocks, or the segment summaries, or the superblocks.
1792  *
1793  * All of the above are created by lfs_newbuf, and so do not need to be
1794  * released via brelse.
1795  */
1796 void
1797 lfs_callback(struct buf *bp)
1798 {
1799 	struct lfs *fs;
1800 #ifdef LFS_TRACK_IOS
1801 	int j;
1802 #endif
1803 
1804 	fs = (struct lfs *)bp->b_saveaddr;
1805 #ifdef DIAGNOSTIC
1806 	if (fs->lfs_iocount == 0)
1807 		panic("lfs_callback: zero iocount\n");
1808 #endif
1809 	if (--fs->lfs_iocount < LFS_THROTTLE)
1810 		wakeup(&fs->lfs_iocount);
1811 #ifdef LFS_TRACK_IOS
1812 	for(j=0;j<LFS_THROTTLE;j++) {
1813 		if(fs->lfs_pending[j]==dbtofsb(fs, bp->b_blkno)) {
1814 			fs->lfs_pending[j] = LFS_UNUSED_DADDR;
1815 			wakeup(&(fs->lfs_pending[j]));
1816 			break;
1817 		}
1818 	}
1819 #endif /* LFS_TRACK_IOS */
1820 
1821 	lfs_freebuf(bp);
1822 }
1823 
1824 void
1825 lfs_supercallback(struct buf *bp)
1826 {
1827 	struct lfs *fs;
1828 
1829 	fs = (struct lfs *)bp->b_saveaddr;
1830 	fs->lfs_sbactive = 0;
1831 	wakeup(&fs->lfs_sbactive);
1832 	if (--fs->lfs_iocount < LFS_THROTTLE)
1833 		wakeup(&fs->lfs_iocount);
1834 	lfs_freebuf(bp);
1835 }
1836 
1837 /*
1838  * Shellsort (diminishing increment sort) from Data Structures and
1839  * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290;
1840  * see also Knuth Vol. 3, page 84.  The increments are selected from
1841  * formula (8), page 95.  Roughly O(N^3/2).
1842  */
1843 /*
1844  * This is our own private copy of shellsort because we want to sort
1845  * two parallel arrays (the array of buffer pointers and the array of
1846  * logical block numbers) simultaneously.  Note that we cast the array
1847  * of logical block numbers to a unsigned in this routine so that the
1848  * negative block numbers (meta data blocks) sort AFTER the data blocks.
1849  */
1850 
1851 void
1852 lfs_shellsort(struct buf **bp_array, ufs_daddr_t *lb_array, int nmemb)
1853 {
1854 	static int __rsshell_increments[] = { 4, 1, 0 };
1855 	int incr, *incrp, t1, t2;
1856 	struct buf *bp_temp;
1857 	u_long lb_temp;
1858 
1859 	for (incrp = __rsshell_increments; (incr = *incrp++) != 0;)
1860 		for (t1 = incr; t1 < nmemb; ++t1)
1861 			for (t2 = t1 - incr; t2 >= 0;)
1862 				if (lb_array[t2] > lb_array[t2 + incr]) {
1863 					lb_temp = lb_array[t2];
1864 					lb_array[t2] = lb_array[t2 + incr];
1865 					lb_array[t2 + incr] = lb_temp;
1866 					bp_temp = bp_array[t2];
1867 					bp_array[t2] = bp_array[t2 + incr];
1868 					bp_array[t2 + incr] = bp_temp;
1869 					t2 -= incr;
1870 				} else
1871 					break;
1872 }
1873 
1874 /*
1875  * Check VXLOCK.  Return 1 if the vnode is locked.  Otherwise, vget it.
1876  */
1877 int
1878 lfs_vref(struct vnode *vp)
1879 {
1880 	/*
1881 	 * If we return 1 here during a flush, we risk vinvalbuf() not
1882 	 * being able to flush all of the pages from this vnode, which
1883 	 * will cause it to panic.  So, return 0 if a flush is in progress.
1884 	 */
1885 	if (vp->v_flag & VXLOCK) {
1886 		if(IS_FLUSHING(VTOI(vp)->i_lfs,vp)) {
1887 			return 0;
1888 		}
1889 		return(1);
1890 	}
1891 	return (vget(vp, 0));
1892 }
1893 
1894 /*
1895  * This is vrele except that we do not want to VOP_INACTIVE this vnode. We
1896  * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end.
1897  */
1898 void
1899 lfs_vunref(struct vnode *vp)
1900 {
1901 	/*
1902 	 * Analogous to lfs_vref, if the node is flushing, fake it.
1903 	 */
1904 	if((vp->v_flag & VXLOCK) && IS_FLUSHING(VTOI(vp)->i_lfs,vp)) {
1905 		return;
1906 	}
1907 
1908 	simple_lock(&vp->v_interlock);
1909 #ifdef DIAGNOSTIC
1910 	if(vp->v_usecount<=0) {
1911 		printf("lfs_vunref: inum is %d\n", VTOI(vp)->i_number);
1912 		printf("lfs_vunref: flags are 0x%lx\n", (u_long)vp->v_flag);
1913 		printf("lfs_vunref: usecount = %ld\n", (long)vp->v_usecount);
1914 		panic("lfs_vunref: v_usecount<0");
1915 	}
1916 #endif
1917 	vp->v_usecount--;
1918 	if (vp->v_usecount > 0) {
1919 		simple_unlock(&vp->v_interlock);
1920 		return;
1921 	}
1922 	/*
1923 	 * insert at tail of LRU list
1924 	 */
1925 	simple_lock(&vnode_free_list_slock);
1926 	if (vp->v_holdcnt > 0)
1927 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
1928 	else
1929 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1930 	simple_unlock(&vnode_free_list_slock);
1931 	simple_unlock(&vp->v_interlock);
1932 }
1933 
1934 /*
1935  * We use this when we have vnodes that were loaded in solely for cleaning.
1936  * There is no reason to believe that these vnodes will be referenced again
1937  * soon, since the cleaning process is unrelated to normal filesystem
1938  * activity.  Putting cleaned vnodes at the tail of the list has the effect
1939  * of flushing the vnode LRU.  So, put vnodes that were loaded only for
1940  * cleaning at the head of the list, instead.
1941  */
1942 void
1943 lfs_vunref_head(struct vnode *vp)
1944 {
1945 	simple_lock(&vp->v_interlock);
1946 #ifdef DIAGNOSTIC
1947 	if(vp->v_usecount==0) {
1948 		panic("lfs_vunref: v_usecount<0");
1949 	}
1950 #endif
1951 	vp->v_usecount--;
1952 	if (vp->v_usecount > 0) {
1953 		simple_unlock(&vp->v_interlock);
1954 		return;
1955 	}
1956 	/*
1957 	 * insert at head of LRU list
1958 	 */
1959 	simple_lock(&vnode_free_list_slock);
1960 	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1961 	simple_unlock(&vnode_free_list_slock);
1962 	simple_unlock(&vp->v_interlock);
1963 }
1964 
1965