1 /*-
2  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This module is believed to contain source code proprietary to AT&T.
6  * Use and redistribution is subject to the Berkeley Software License
7  * Agreement and your Software Agreement with AT&T (Western Electric).
8  *
9  *	@(#)vfs_bio.c	7.59 (Berkeley) 05/10/93
10  */
11 
12 #include <sys/param.h>
13 #include <sys/proc.h>
14 #include <sys/buf.h>
15 #include <sys/vnode.h>
16 #include <sys/mount.h>
17 #include <sys/trace.h>
18 #include <sys/resourcevar.h>
19 #include <sys/malloc.h>
20 #include <libkern/libkern.h>
21 
22 /*
23  * Definitions for the buffer hash lists.
24  */
25 #define	BUFHASH(dvp, lbn)	\
26 	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
27 struct	list_entry *bufhashtbl, invalhash;
28 u_long	bufhash;
29 
30 /*
31  * Insq/Remq for the buffer hash lists.
32  */
33 #define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
34 #define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)
35 
36 /*
37  * Definitions for the buffer free lists.
38  */
39 #define	BQUEUES		4		/* number of free buffer queues */
40 
41 #define	BQ_LOCKED	0		/* super-blocks &c */
42 #define	BQ_LRU		1		/* lru, useful buffers */
43 #define	BQ_AGE		2		/* rubbish */
44 #define	BQ_EMPTY	3		/* buffer headers with no memory */
45 
46 struct queue_entry bufqueues[BQUEUES];
47 int needbuffer;		/* set when a process is waiting for a free buffer */
48 
49 /*
50  * Insq/Remq for the buffer free lists.
51  */
52 #define	binsheadfree(bp, dp) \
53 	queue_enter_head(dp, bp, struct buf *, b_freelist)
54 #define	binstailfree(bp, dp) \
55 	queue_enter_tail(dp, bp, struct buf *, b_freelist)
56 
57 /*
58  * Local declarations
59  */
60 struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
61 	    daddr_t, long, int));
62 struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
63 	    daddr_t, daddr_t, long, int, long));
64 void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
65 	    daddr_t start_lbn, int len, daddr_t lbn));
66 
67 void
68 bremfree(bp)
69 	struct buf *bp;
70 {
71 	struct queue_entry *dp;
72 
73 	/*
74 	 * We only need to find the head of the free list (by searching
75 	 * the queues) when removing the last element, since that is the
76 	 * only time the head is needed, i.e. to reset the tail pointer.
77 	 */
78 	if (bp->b_freelist.qe_next == NULL) {
79 		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
80 			if (dp->qe_prev == &bp->b_freelist.qe_next)
81 				break;
82 		if (dp == &bufqueues[BQUEUES])
83 			panic("bremfree: lost tail");
84 	}
85 	queue_remove(dp, bp, struct buf *, b_freelist);
86 }
87 
88 /*
89  * Initialize buffers and hash links for buffers.
90  */
91 void
92 bufinit()
93 {
94 	register struct buf *bp;
95 	struct queue_entry *dp;
96 	register int i;
97 	int base, residual;
98 
99 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
100 		queue_init(dp);
101 	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
102 	base = bufpages / nbuf;
103 	residual = bufpages % nbuf;
104 	for (i = 0; i < nbuf; i++) {
105 		bp = &buf[i];
106 		bzero((char *)bp, sizeof *bp);
107 		bp->b_dev = NODEV;
108 		bp->b_rcred = NOCRED;
109 		bp->b_wcred = NOCRED;
110 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
111 		if (i < residual)
112 			bp->b_bufsize = (base + 1) * CLBYTES;
113 		else
114 			bp->b_bufsize = base * CLBYTES;
115 		bp->b_flags = B_INVAL;
116 		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
117 		binsheadfree(bp, dp);
118 		binshash(bp, &invalhash);
119 	}
120 }
121 
122 /*
123  * Find the block in the buffer pool.
124  * If the buffer is not present, allocate a new buffer and load
125  * its contents according to the filesystem fill routine.
126  */
127 bread(vp, blkno, size, cred, bpp)
128 	struct vnode *vp;
129 	daddr_t blkno;
130 	int size;
131 	struct ucred *cred;
132 	struct buf **bpp;
133 {
134 	struct proc *p = curproc;		/* XXX */
135 	register struct buf *bp;
136 
137 	if (size == 0)
138 		panic("bread: size 0");
139 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
140 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
141 		trace(TR_BREADHIT, pack(vp, size), blkno);
142 		return (0);
143 	}
144 	bp->b_flags |= B_READ;
145 	if (bp->b_bcount > bp->b_bufsize)
146 		panic("bread");
147 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
148 		crhold(cred);
149 		bp->b_rcred = cred;
150 	}
151 	VOP_STRATEGY(bp);
152 	trace(TR_BREADMISS, pack(vp, size), blkno);
153 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
154 	return (biowait(bp));
155 }
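
/*
 * Illustrative sketch (not part of the original file): a minimal example
 * of how a filesystem read path might use bread() and brelse().  The
 * function name and its arguments are hypothetical; the code is wrapped
 * in "notdef" so it is never compiled.
 */
#ifdef notdef
int
example_read_block(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	/* Get the block, reading it from disk if it is not cached. */
	error = bread(vp, lbn, bsize, cred, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... consume the data at bp->b_un.b_addr ... */
	brelse(bp);		/* release the buffer; the data stays cached */
	return (0);
}
#endif /* notdef */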
156 
157 /*
158  * Operates like bread, but also starts I/O on the N specified
159  * read-ahead blocks.
160  */
161 breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
162 	struct vnode *vp;
163 	daddr_t blkno; int size;
164 	daddr_t rablkno[]; int rabsize[];
165 	int num;
166 	struct ucred *cred;
167 	struct buf **bpp;
168 {
169 	struct proc *p = curproc;		/* XXX */
170 	register struct buf *bp, *rabp;
171 	register int i;
172 
173 	bp = NULL;
174 	/*
175 	 * If the block is not memory resident,
176 	 * allocate a buffer and start I/O.
177 	 */
178 	if (!incore(vp, blkno)) {
179 		*bpp = bp = getblk(vp, blkno, size, 0, 0);
180 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
181 			bp->b_flags |= B_READ;
182 			if (bp->b_bcount > bp->b_bufsize)
183 				panic("breadn");
184 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
185 				crhold(cred);
186 				bp->b_rcred = cred;
187 			}
188 			VOP_STRATEGY(bp);
189 			trace(TR_BREADMISS, pack(vp, size), blkno);
190 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
191 		} else {
192 			trace(TR_BREADHIT, pack(vp, size), blkno);
193 		}
194 	}
195 
196 	/*
197 	 * If there are read-ahead blocks, start I/O
198 	 * on them as well (as above).
199 	 */
200 	for (i = 0; i < num; i++) {
201 		if (incore(vp, rablkno[i]))
202 			continue;
203 		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
204 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
205 			brelse(rabp);
206 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
207 		} else {
208 			rabp->b_flags |= B_ASYNC | B_READ;
209 			if (rabp->b_bcount > rabp->b_bufsize)
210 				panic("breadrabp");
211 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
212 				crhold(cred);
213 				rabp->b_rcred = cred;
214 			}
215 			VOP_STRATEGY(rabp);
216 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
217 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
218 		}
219 	}
220 
221 	/*
222 	 * If block was memory resident, let bread get it.
223 	 * If block was not memory resident, the read was
224 	 * started above, so just wait for the read to complete.
225 	 */
226 	if (bp == NULL)
227 		return (bread(vp, blkno, size, cred, bpp));
228 	return (biowait(bp));
229 }
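
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * caller asking breadn() for one block plus two read-ahead blocks.  The
 * function name and the choice of two read-ahead blocks are assumptions
 * for illustration only; the code is under "notdef" and never compiled.
 */
#ifdef notdef
int
example_read_with_readahead(vp, lbn, bsize, cred, bpp)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
	struct buf **bpp;
{
	daddr_t ralbn[2];
	int rasize[2];

	/* Read lbn synchronously; start lbn+1 and lbn+2 asynchronously. */
	ralbn[0] = lbn + 1;
	ralbn[1] = lbn + 2;
	rasize[0] = rasize[1] = bsize;
	return (breadn(vp, lbn, bsize, ralbn, rasize, 2, cred, bpp));
}
#endif /* notdef */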
230 
231 /*
232  * We could optimize this by keeping track of where the last read-ahead
233  * was, but it would involve adding fields to the vnode.  For now, let's
234  * just get it working.
235  *
236  * This replaces bread.  If this is a bread at the beginning of a file and
237  * lastr is 0, we assume this is the first read and we'll read up to two
238  * blocks if they are sequential.  After that, we'll do regular read ahead
239  * in clustered chunks.
240  *
241  * There are 4 or 5 cases depending on how you count:
242  *	Desired block is in the cache:
243  *	    1 Not sequential access (0 I/Os).
244  *	    2 Access is sequential, do read-ahead (1 ASYNC).
245  *	Desired block is not in cache:
246  *	    3 Not sequential access (1 SYNC).
247  *	    4 Sequential access, next block is contiguous (1 SYNC).
248  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
249  *
250  * There are potentially two buffers that require I/O.
251  * 	bp is the block requested.
252  *	rbp is the read-ahead block.
253  *	If either is NULL, then you don't have to do the I/O.
254  */
255 cluster_read(vp, filesize, lblkno, size, cred, bpp)
256 	struct vnode *vp;
257 	u_quad_t filesize;
258 	daddr_t lblkno;
259 	long size;
260 	struct ucred *cred;
261 	struct buf **bpp;
262 {
263 	struct buf *bp, *rbp;
264 	daddr_t blkno, ioblkno;
265 	long flags;
266 	int error, num_ra, alreadyincore;
267 
268 #ifdef DIAGNOSTIC
269 	if (size == 0)
270 		panic("cluster_read: size = 0");
271 #endif
272 
273 	error = 0;
274 	flags = B_READ;
275 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
276 	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
277 		/*
278 		 * Desired block is in cache; do any readahead ASYNC.
279 		 * Case 1, 2.
280 		 */
281 		trace(TR_BREADHIT, pack(vp, size), lblkno);
282 		flags |= B_ASYNC;
283 		ioblkno = lblkno +
284 		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
285 		alreadyincore = (int)incore(vp, ioblkno);
286 		bp = NULL;
287 	} else {
288 		/* Block wasn't in cache, case 3, 4, 5. */
289 		trace(TR_BREADMISS, pack(vp, size), lblkno);
290 		ioblkno = lblkno;
291 		bp->b_flags |= flags;
292 		alreadyincore = 0;
293 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
294 	}
295 	/*
296 	 * XXX
297 	 * Replace 1 with a window size based on some permutation of
298 	 * maxcontig and rot_delay.  This will let you figure out how
299 	 * many blocks you should read-ahead (case 2, 4, 5).
300 	 *
301 	 * If the access isn't sequential, cut the window size in half.
302 	 */
303 	rbp = NULL;
304 	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
305 		vp->v_ralen = max(vp->v_ralen >> 1, 1);
306 	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
307 	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
308 		/*
309 		 * Reading sequentially, and the next block is not in the
310 		 * cache.  We are going to try reading ahead. If this is
311 		 * the first read of a file, then limit read-ahead to a
312 		 * single block, else read as much as we're allowed.
313 		 */
314 		if (num_ra > vp->v_ralen) {
315 			num_ra = vp->v_ralen;
316 			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
317 		} else
318 			vp->v_ralen = num_ra + 1;
319 
320 
321 		if (num_ra)				/* case 2, 4 */
322 			rbp = cluster_rbuild(vp, filesize,
323 			    bp, ioblkno, blkno, size, num_ra, flags);
324 		else if (lblkno != 0 && ioblkno == lblkno) {
325 			/* Case 5: check how many blocks to read ahead */
326 			++ioblkno;
327 			if ((ioblkno + 1) * size > filesize ||
328 			    (error = VOP_BMAP(vp,
329 			    ioblkno, NULL, &blkno, &num_ra)))
330 				goto skip_readahead;
331 			flags |= B_ASYNC;
332 			if (num_ra)
333 				rbp = cluster_rbuild(vp, filesize,
334 				    NULL, ioblkno, blkno, size, num_ra, flags);
335 			else {
336 				rbp = getblk(vp, ioblkno, size, 0, 0);
337 				rbp->b_flags |= flags;
338 				rbp->b_blkno = blkno;
339 			}
340 		} else if (lblkno != 0) {
341 			/* case 2; read ahead single block */
342 			rbp = getblk(vp, ioblkno, size, 0, 0);
343 			rbp->b_flags |= flags;
344 			rbp->b_blkno = blkno;
345 		} else if (bp)				/* case 1, 3, block 0 */
346 			bp->b_blkno = blkno;
347 		/* Case 1 on block 0; not really doing sequential I/O */
348 
349 		if (rbp == bp)		/* case 4 */
350 			rbp = NULL;
351 		else if (rbp) {			/* case 2, 5 */
352 			trace(TR_BREADMISSRA,
353 			    pack(vp, (num_ra + 1) * size), ioblkno);
354 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
355 		}
356 	}
357 
358 	/* XXX Kirk, do we need to make sure the bp has creds? */
359 skip_readahead:
360 	if (bp)
361 		if (bp->b_flags & (B_DONE | B_DELWRI))
362 			panic("cluster_read: DONE bp");
363 		else
364 			error = VOP_STRATEGY(bp);
365 
366 	if (rbp)
367 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
368 			rbp->b_flags &= ~(B_ASYNC | B_READ);
369 			brelse(rbp);
370 		} else
371 			(void) VOP_STRATEGY(rbp);
372 
373 	if (bp)
374 		return(biowait(bp));
375 	return(error);
376 }
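
/*
 * Illustrative sketch (not part of the original file): cluster_read()
 * is intended as a drop-in replacement for bread() when the caller can
 * supply the file size.  The "docluster" switch and the function name
 * below are assumptions for illustration; the code is never compiled.
 */
#ifdef notdef
int
example_clustered_read(vp, filesize, lbn, bsize, cred, bpp, docluster)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	long bsize;
	struct ucred *cred;
	struct buf **bpp;
	int docluster;
{

	if (docluster)
		return (cluster_read(vp, filesize, lbn, bsize, cred, bpp));
	return (bread(vp, lbn, (int)bsize, cred, bpp));
}
#endif /* notdef */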
377 
378 /*
379  * If blocks are contiguous on disk, use this to provide clustered
380  * read ahead.  We will read as many blocks as possible sequentially
381  * and then parcel them up into logical blocks in the buffer hash table.
382  */
383 struct buf *
384 cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
385 	struct vnode *vp;
386 	u_quad_t filesize;
387 	struct buf *bp;
388 	daddr_t lbn;
389 	daddr_t blkno;
390 	long size;
391 	int run;
392 	long flags;
393 {
394 	struct cluster_save *b_save;
395 	struct buf *tbp;
396 	daddr_t bn;
397 	int i, inc;
398 
399 #ifdef DIAGNOSTIC
400 	if (size != vp->v_mount->mnt_stat.f_iosize)
401 		panic("cluster_rbuild: size %d != filesize %d\n",
402 			size, vp->v_mount->mnt_stat.f_iosize);
403 #endif
404 	if (size * (lbn + run + 1) > filesize)
405 		--run;
406 	if (run == 0) {
407 		if (!bp) {
408 			bp = getblk(vp, lbn, size, 0, 0);
409 			bp->b_blkno = blkno;
410 			bp->b_flags |= flags;
411 		}
412 		return(bp);
413 	}
414 
415 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
416 	if (bp->b_flags & (B_DONE | B_DELWRI))
417 		return (bp);
418 
419 	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
420 	    M_SEGMENT, M_WAITOK);
421 	b_save->bs_bufsize = b_save->bs_bcount = size;
422 	b_save->bs_nchildren = 0;
423 	b_save->bs_children = (struct buf **)(b_save + 1);
424 	b_save->bs_saveaddr = bp->b_saveaddr;
425 	bp->b_saveaddr = (caddr_t) b_save;
426 
427 	inc = size / DEV_BSIZE;
428 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
429 		if (incore(vp, lbn + i)) {
430 			if (i == 1) {
431 				bp->b_saveaddr = b_save->bs_saveaddr;
432 				bp->b_flags &= ~B_CALL;
433 				bp->b_iodone = NULL;
434 				allocbuf(bp, size);
435 				free(b_save, M_SEGMENT);
436 			} else
437 				allocbuf(bp, size * i);
438 			break;
439 		}
440 		tbp = getblk(vp, lbn + i, 0, 0, 0);
441 		tbp->b_bcount = tbp->b_bufsize = size;
442 		tbp->b_blkno = bn;
443 		tbp->b_flags |= flags | B_READ | B_ASYNC;
444 		++b_save->bs_nchildren;
445 		b_save->bs_children[i - 1] = tbp;
446 	}
447 	if (!(bp->b_flags & B_ASYNC))
448 		vp->v_ralen = max(vp->v_ralen - 1, 1);
449 	return(bp);
450 }
451 
452 /*
453  * Either get a new buffer or grow the existing one.
454  */
455 struct buf *
456 cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
457 	struct vnode *vp;
458 	struct buf *bp;
459 	long flags;
460 	daddr_t blkno;
461 	daddr_t lblkno;
462 	long size;
463 	int run;
464 {
465 	if (!bp) {
466 		bp = getblk(vp, lblkno, size, 0, 0);
467 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
468 			bp->b_blkno = blkno;
469 			return(bp);
470 		}
471 	}
472 	allocbuf(bp, run * size);
473 	bp->b_blkno = blkno;
474 	bp->b_iodone = cluster_callback;
475 	bp->b_flags |= flags | B_CALL;
476 	return(bp);
477 }
478 
479 /*
480  * Cleanup after a clustered read or write.
481  */
482 void
483 cluster_callback(bp)
484 	struct buf *bp;
485 {
486 	struct cluster_save *b_save;
487 	struct buf **tbp;
488 	long bsize;
489 	caddr_t cp;
490 	b_save = (struct cluster_save *)(bp->b_saveaddr);
491 	bp->b_saveaddr = b_save->bs_saveaddr;
492 
493 	cp = bp->b_un.b_addr + b_save->bs_bufsize;
494 	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
495 		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
496 		cp += (*tbp)->b_bufsize;
497 		bp->b_bufsize -= (*tbp)->b_bufsize;
498 		biodone(*tbp);
499 	}
500 #ifdef DIAGNOSTIC
501 	if (bp->b_bufsize != b_save->bs_bufsize)
502 		panic ("cluster_callback: more space to reclaim");
503 #endif
504 	bp->b_bcount = bp->b_bufsize;
505 	bp->b_iodone = NULL;
506 	free(b_save, M_SEGMENT);
507 	if (bp->b_flags & B_ASYNC)
508 		brelse(bp);
509 	else
510 		wakeup((caddr_t)bp);
511 }
512 
513 /*
514  * Synchronous write.
515  * Release buffer on completion.
516  */
517 bwrite(bp)
518 	register struct buf *bp;
519 {
520 	struct proc *p = curproc;		/* XXX */
521 	register int flag;
522 	int s, error = 0;
523 
524 	flag = bp->b_flags;
525 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
526 	if (flag & B_ASYNC) {
527 		if ((flag & B_DELWRI) == 0)
528 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
529 		else
530 			reassignbuf(bp, bp->b_vp);
531 	}
532 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
533 	if (bp->b_bcount > bp->b_bufsize)
534 		panic("bwrite");
535 	s = splbio();
536 	bp->b_vp->v_numoutput++;
537 	bp->b_flags |= B_WRITEINPROG;
538 	splx(s);
539 	VOP_STRATEGY(bp);
540 
541 	/*
542 	 * If the write was synchronous, then await I/O completion.
543 	 * If the write was "delayed", then we put the buffer on
544 	 * the queue of blocks awaiting I/O completion status.
545 	 */
546 	if ((flag & B_ASYNC) == 0) {
547 		error = biowait(bp);
548 		if ((flag&B_DELWRI) == 0)
549 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
550 		else
551 			reassignbuf(bp, bp->b_vp);
552 		if (bp->b_flags & B_EINTR) {
553 			bp->b_flags &= ~B_EINTR;
554 			error = EINTR;
555 		}
556 		brelse(bp);
557 	} else if (flag & B_DELWRI) {
558 		s = splbio();
559 		bp->b_flags |= B_AGE;
560 		splx(s);
561 	}
562 	return (error);
563 }
564 
565 int
566 vn_bwrite(ap)
567 	struct vop_bwrite_args *ap;
568 {
569 	return (bwrite(ap->a_bp));
570 }
571 
572 
573 /*
574  * Delayed write.
575  *
576  * The buffer is marked dirty, but is not queued for I/O.
577  * This routine should be used when the buffer is expected
578  * to be modified again soon, typically a small write that
579  * partially fills a buffer.
580  *
581  * NB: magnetic tapes cannot be delayed; they must be
582  * written in the order that the writes are requested.
583  */
584 bdwrite(bp)
585 	register struct buf *bp;
586 {
587 	struct proc *p = curproc;		/* XXX */
588 
589 	if ((bp->b_flags & B_DELWRI) == 0) {
590 		bp->b_flags |= B_DELWRI;
591 		reassignbuf(bp, bp->b_vp);
592 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
593 	}
594 	/*
595 	 * If this is a tape drive, the write must be initiated.
596 	 */
597 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
598 		bawrite(bp);
599 	} else {
600 		bp->b_flags |= (B_DONE | B_DELWRI);
601 		brelse(bp);
602 	}
603 }
604 
605 /*
606  * Asynchronous write.
607  * Start I/O on a buffer, but do not wait for it to complete.
608  * The buffer is released when the I/O completes.
609  */
610 bawrite(bp)
611 	register struct buf *bp;
612 {
613 
614 	/*
615 	 * Setting the ASYNC flag causes bwrite to return
616 	 * after starting the I/O.
617 	 */
618 	bp->b_flags |= B_ASYNC;
619 	(void) VOP_BWRITE(bp);
620 }
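
/*
 * Illustrative sketch (not part of the original file): the three write
 * paths above differ only in when I/O is started and whether the caller
 * waits.  A hypothetical caller that has dirtied a buffer obtained from
 * bread() or getblk() might choose among them as follows (never compiled).
 */
#ifdef notdef
int
example_write_block(bp, sync, expect_more)
	struct buf *bp;
	int sync;		/* data must be on disk before returning */
	int expect_more;	/* buffer will likely be modified again soon */
{

	if (sync)
		return (bwrite(bp));	/* start I/O and wait for completion */
	if (expect_more) {
		bdwrite(bp);		/* just mark dirty; write later */
		return (0);
	}
	bawrite(bp);			/* start I/O, do not wait */
	return (0);
}
#endif /* notdef */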
621 
622 /*
623  * Do clustered write for FFS.
624  *
625  * Four cases:
626  *	1. Write is not sequential (write asynchronously)
627  *	Write is sequential:
628  *	2.	beginning of cluster - begin cluster
629  *	3.	middle of a cluster - add to cluster
630  *	4.	end of a cluster - asynchronously write cluster
631  */
632 void
633 cluster_write(bp, filesize)
634 	struct buf *bp;
635 	u_quad_t filesize;
636 {
637 	struct vnode *vp;
638 	daddr_t lbn;
639 	int clen;
640 
641 	vp = bp->b_vp;
642 	lbn = bp->b_lblkno;
643 
644 	/* Initialize vnode to beginning of file. */
645 	if (lbn == 0)
646 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
647 
648 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
649 	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
650 		if (vp->v_clen != 0)
651 			/*
652 			 * Write is not sequential.
653 			 */
654 			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
655 			    vp->v_lastw - vp->v_cstart + 1, lbn);
656 		/*
657 		 * Consider beginning a cluster.
658 		 */
659 		if ((lbn + 1) * bp->b_bcount == filesize)
660 			/* End of file, make cluster as large as possible */
661 			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
662 		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
663 			bawrite(bp);
664 			vp->v_clen = 0;
665 			vp->v_lasta = bp->b_blkno;
666 			vp->v_cstart = lbn + 1;
667 			vp->v_lastw = lbn;
668 			return;
669 		} else
670 			clen = 0;
671 		vp->v_clen = clen;
672 		if (clen == 0) {		/* I/O not contiguous */
673 			vp->v_cstart = lbn + 1;
674 			bawrite(bp);
675 		} else {			/* Wait for rest of cluster */
676 			vp->v_cstart = lbn;
677 			bdwrite(bp);
678 		}
679 	} else if (lbn == vp->v_cstart + vp->v_clen) {
680 		/*
681 		 * At end of cluster, write it out.
682 		 */
683 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
684 		    vp->v_clen + 1, lbn);
685 		vp->v_clen = 0;
686 		vp->v_cstart = lbn + 1;
687 	} else
688 		/*
689 		 * In the middle of a cluster, so just delay the
690 		 * I/O for now.
691 		 */
692 		bdwrite(bp);
693 	vp->v_lastw = lbn;
694 	vp->v_lasta = bp->b_blkno;
695 }
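
/*
 * Illustrative sketch (not part of the original file): cluster_write()
 * replaces the caller's bdwrite()/bawrite() decision for sequential
 * writes.  The "docluster" switch and the function name are assumptions
 * for illustration; the code is never compiled.
 */
#ifdef notdef
void
example_clustered_write(bp, filesize, docluster)
	struct buf *bp;
	u_quad_t filesize;
	int docluster;
{

	if (docluster)
		cluster_write(bp, filesize);	/* delay, cluster, or start I/O */
	else
		bawrite(bp);			/* plain asynchronous write */
}
#endif /* notdef */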
696 
697 
698 /*
699  * This is an awful lot like cluster_rbuild...wish they could be combined.
700  * The last lbn argument is the current block on which I/O is being
701  * performed.  Check to see that it doesn't fall in the middle of
702  * the cluster being written.
703  */
704 void
705 cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
706 	struct vnode *vp;
707 	struct buf *last_bp;
708 	long size;
709 	daddr_t start_lbn;
710 	int len;
711 	daddr_t	lbn;
712 {
713 	struct cluster_save *b_save;
714 	struct buf *bp, *tbp;
715 	caddr_t	cp;
716 	int i, s;
717 
718 #ifdef DIAGNOSTIC
719 	if (size != vp->v_mount->mnt_stat.f_iosize)
720 		panic("cluster_wbuild: size %d != filesize %d\n",
721 			size, vp->v_mount->mnt_stat.f_iosize);
722 #endif
723 redo:
724 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
725 		++start_lbn;
726 		--len;
727 	}
728 
729 	/* At most one block left; nothing to cluster, just write it. */
730 	if (len <= 1) {
731 		if (last_bp) {
732 			bawrite(last_bp);
733 		} else if (len) {
734 			bp = getblk(vp, start_lbn, size, 0, 0);
735 			bawrite(bp);
736 		}
737 		return;
738 	}
739 
740 	bp = getblk(vp, start_lbn, size, 0, 0);
741 	if (!(bp->b_flags & B_DELWRI)) {
742 		++start_lbn;
743 		--len;
744 		brelse(bp);
745 		goto redo;
746 	}
747 
748 	--len;
749 	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
750 	    M_SEGMENT, M_WAITOK);
751 	b_save->bs_bcount = bp->b_bcount;
752 	b_save->bs_bufsize = bp->b_bufsize;
753 	b_save->bs_nchildren = 0;
754 	b_save->bs_children = (struct buf **)(b_save + 1);
755 	b_save->bs_saveaddr = bp->b_saveaddr;
756 	bp->b_saveaddr = (caddr_t) b_save;
757 
758 
759 	bp->b_flags |= B_CALL;
760 	bp->b_iodone = cluster_callback;
761 	cp = bp->b_un.b_addr + bp->b_bufsize;
762 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
763 		if (!incore(vp, start_lbn) || start_lbn == lbn)
764 			break;
765 
766 		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
767 			tbp = getblk(vp, start_lbn, size, 0, 0);
768 #ifdef DIAGNOSTIC
769 			if (tbp->b_bcount != tbp->b_bufsize)
770 				panic("cluster_wbuild: Buffer too big");
771 #endif
772 			if (!(tbp->b_flags & B_DELWRI)) {
773 				brelse(tbp);
774 				break;
775 			}
776 		} else
777 			tbp = last_bp;
778 
779 		++b_save->bs_nchildren;
780 
781 		/* Move memory from children to parent */
782 		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
783 			printf("Clustered Block: %d addr %x bufsize: %d\n",
784 			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
785 			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
786 			    tbp->b_blkno);
787 			panic("Clustered write to wrong blocks");
788 		}
789 
790 		pagemove(tbp->b_un.b_daddr, cp, size);
791 		bp->b_bcount += size;
792 		bp->b_bufsize += size;
793 
794 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
795 		tbp->b_flags |= B_ASYNC;
796 		s = splbio();
797 		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
798 		++tbp->b_vp->v_numoutput;
799 		splx(s);
800 		b_save->bs_children[i] = tbp;
801 
802 		cp += tbp->b_bufsize;
803 	}
804 
805 	if (i == 0) {
806 		/* None to cluster */
807 		bp->b_saveaddr = b_save->bs_saveaddr;
808 		bp->b_flags &= ~B_CALL;
809 		bp->b_iodone = NULL;
810 		free(b_save, M_SEGMENT);
811 	}
812 	bawrite(bp);
813 	if (i < len) {
814 		len -= i + 1;
815 		start_lbn += 1;
816 		goto redo;
817 	}
818 }
819 
820 /*
821  * Release a buffer.
822  * Even if the buffer is dirty, no I/O is started.
823  */
824 brelse(bp)
825 	register struct buf *bp;
826 {
827 	register struct queue_entry *flist;
828 	int s;
829 
830 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
831 	/*
832 	 * If a process is waiting for the buffer, or
833 	 * is waiting for a free buffer, awaken it.
834 	 */
835 	if (bp->b_flags & B_WANTED)
836 		wakeup((caddr_t)bp);
837 	if (needbuffer) {
838 		needbuffer = 0;
839 		wakeup((caddr_t)&needbuffer);
840 	}
841 	/*
842 	 * Retry I/O for locked buffers rather than invalidating them.
843 	 */
844 	s = splbio();
845 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
846 		bp->b_flags &= ~B_ERROR;
847 	/*
848 	 * Disassociate buffers that are no longer valid.
849 	 */
850 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
851 		bp->b_flags |= B_INVAL;
852 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
853 		if (bp->b_vp)
854 			brelvp(bp);
855 		bp->b_flags &= ~B_DELWRI;
856 	}
857 	/*
858 	 * Stick the buffer back on a free list.
859 	 */
860 	if (bp->b_bufsize <= 0) {
861 		/* block has no buffer ... put at front of unused buffer list */
862 		flist = &bufqueues[BQ_EMPTY];
863 		binsheadfree(bp, flist);
864 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
865 		/* block has no info ... put at front of most free list */
866 		flist = &bufqueues[BQ_AGE];
867 		binsheadfree(bp, flist);
868 	} else {
869 		if (bp->b_flags & B_LOCKED)
870 			flist = &bufqueues[BQ_LOCKED];
871 		else if (bp->b_flags & B_AGE)
872 			flist = &bufqueues[BQ_AGE];
873 		else
874 			flist = &bufqueues[BQ_LRU];
875 		binstailfree(bp, flist);
876 	}
877 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
878 	splx(s);
879 }
880 
881 /*
882  * Check to see if a block is currently memory resident.
883  */
884 struct buf *
885 incore(vp, blkno)
886 	struct vnode *vp;
887 	daddr_t blkno;
888 {
889 	register struct buf *bp;
890 
891 	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
892 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
893 		    (bp->b_flags & B_INVAL) == 0)
894 			return (bp);
895 	return (NULL);
896 }
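
/*
 * Illustrative sketch (not part of the original file): incore() only
 * reports residency; it does not lock the buffer or make it usable for
 * I/O.  A hypothetical caller might use it to decide whether read-ahead
 * is worthwhile (names and policy here are assumptions; never compiled).
 */
#ifdef notdef
int
example_maybe_prefetch(vp, lbn, bsize, cred, bpp)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
	struct buf **bpp;
{
	daddr_t ralbn;
	int rasize;

	if (incore(vp, lbn + 1))	/* next block already resident */
		return (bread(vp, lbn, bsize, cred, bpp));
	ralbn = lbn + 1;
	rasize = bsize;
	return (breadn(vp, lbn, bsize, &ralbn, &rasize, 1, cred, bpp));
}
#endif /* notdef */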
897 
898 /*
899  * Check to see if a block is currently memory resident.
900  * If it is resident, return it. If it is not resident,
901  * allocate a new buffer and assign it to the block.
902  */
903 struct buf *
904 getblk(vp, blkno, size, slpflag, slptimeo)
905 	register struct vnode *vp;
906 	daddr_t blkno;
907 	int size, slpflag, slptimeo;
908 {
909 	register struct buf *bp;
910 	struct list_entry *dp;
911 	int s, error;
912 
913 	if (size > MAXBSIZE)
914 		panic("getblk: size too big");
915 	/*
916 	 * Search the cache for the block. If the buffer is found,
917 	 * but it is currently locked, then we must wait for it to
918 	 * become available.
919 	 */
920 	dp = BUFHASH(vp, blkno);
921 loop:
922 	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
923 		if (bp->b_lblkno != blkno || bp->b_vp != vp)
924 			continue;
925 		s = splbio();
926 		if (bp->b_flags & B_BUSY) {
927 			bp->b_flags |= B_WANTED;
928 			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
929 				"getblk", slptimeo);
930 			splx(s);
931 			if (error)
932 				return (NULL);
933 			goto loop;
934 		}
935 		/*
936 		 * The test for B_INVAL is moved down here, since there
937 		 * are cases where B_INVAL is set before VOP_BWRITE() is
938 		 * called and for NFS, the process cannot be allowed to
939 		 * allocate a new buffer for the same block until the write
940 		 * back to the server has been completed. (ie. B_BUSY clears)
941 		 */
942 		if (bp->b_flags & B_INVAL) {
943 			splx(s);
944 			continue;
945 		}
946 		bremfree(bp);
947 		bp->b_flags |= B_BUSY;
948 		splx(s);
949 		if (bp->b_bcount != size) {
950 			printf("getblk: stray size\n");
951 			bp->b_flags |= B_INVAL;
952 			VOP_BWRITE(bp);
953 			goto loop;
954 		}
955 		bp->b_flags |= B_CACHE;
956 		return (bp);
957 	}
958 	/*
959 	 * The loop back to the top when getnewbuf() fails is because
960 	 * stateless filesystems like NFS have no node locks. Thus,
961 	 * there is a slight chance that more than one process will
962 	 * try and getnewbuf() for the same block concurrently when
963 	 * the first sleeps in getnewbuf(). So after a sleep, go back
964 	 * up to the top to check the hash lists again.
965 	 */
966 	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
967 		goto loop;
968 	bremhash(bp);
969 	bgetvp(vp, bp);
970 	bp->b_bcount = 0;
971 	bp->b_lblkno = blkno;
972 	bp->b_blkno = blkno;
973 	bp->b_error = 0;
974 	bp->b_resid = 0;
975 	binshash(bp, dp);
976 	allocbuf(bp, size);
977 	return (bp);
978 }
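
/*
 * Illustrative sketch (not part of the original file): getblk() returns
 * a busy buffer whether or not the data was resident, so a caller that
 * intends to overwrite the whole block need not read it first.  This
 * assumes the filesystem's strategy routine maps the logical block to a
 * physical one when the write is started (names are hypothetical; the
 * code is never compiled).
 */
#ifdef notdef
int
example_overwrite_block(vp, lbn, bsize)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
{
	struct buf *bp;

	bp = getblk(vp, lbn, bsize, 0, 0);
	bzero(bp->b_un.b_addr, bsize);	/* fill in the new contents */
	return (bwrite(bp));		/* synchronous write; releases bp */
}
#endif /* notdef */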
979 
980 /*
981  * Allocate a buffer.
982  * The caller will assign it to a block.
983  */
984 struct buf *
985 geteblk(size)
986 	int size;
987 {
988 	register struct buf *bp;
989 
990 	if (size > MAXBSIZE)
991 		panic("geteblk: size too big");
992 	while ((bp = getnewbuf(0, 0)) == NULL)
993 		/* void */;
994 	bp->b_flags |= B_INVAL;
995 	bremhash(bp);
996 	binshash(bp, &invalhash);
997 	bp->b_bcount = 0;
998 	bp->b_error = 0;
999 	bp->b_resid = 0;
1000 	allocbuf(bp, size);
1001 	return (bp);
1002 }
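
/*
 * Illustrative sketch (not part of the original file): geteblk() hands
 * back a buffer that is not associated with any vnode or block, which a
 * hypothetical caller can use as scratch space and then return with
 * brelse() (never compiled).
 */
#ifdef notdef
void
example_scratch_buffer()
{
	struct buf *ebp;

	ebp = geteblk(MAXBSIZE);	/* busy, invalid scratch buffer */
	/* ... use ebp->b_un.b_addr as temporary storage ... */
	brelse(ebp);			/* return it to the free lists */
}
#endif /* notdef */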
1003 
1004 /*
1005  * Expand or contract the actual memory allocated to a buffer.
1006  * If more memory is needed, it is taken from other buffers (waiting
1007  * for one to become free if necessary).
1007  */
1008 allocbuf(tp, size)
1009 	register struct buf *tp;
1010 	int size;
1011 {
1012 	register struct buf *bp, *ep;
1013 	int sizealloc, take, s;
1014 
1015 	sizealloc = roundup(size, CLBYTES);
1016 	/*
1017 	 * Buffer size does not change
1018 	 */
1019 	if (sizealloc == tp->b_bufsize)
1020 		goto out;
1021 	/*
1022 	 * Buffer size is shrinking.
1023 	 * Place excess space in a buffer header taken from the
1024 	 * BQ_EMPTY buffer list and placed on the "most free" list.
1025 	 * If no extra buffer headers are available, leave the
1026 	 * extra space in the present buffer.
1027 	 */
1028 	if (sizealloc < tp->b_bufsize) {
1029 		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
1030 			goto out;
1031 		s = splbio();
1032 		bremfree(ep);
1033 		ep->b_flags |= B_BUSY;
1034 		splx(s);
1035 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
1036 		    (int)tp->b_bufsize - sizealloc);
1037 		ep->b_bufsize = tp->b_bufsize - sizealloc;
1038 		tp->b_bufsize = sizealloc;
1039 		ep->b_flags |= B_INVAL;
1040 		ep->b_bcount = 0;
1041 		brelse(ep);
1042 		goto out;
1043 	}
1044 	/*
1045 	 * More buffer space is needed. Get it out of buffers on
1046 	 * the "most free" list, placing the empty headers on the
1047 	 * BQ_EMPTY buffer header list.
1048 	 */
1049 	while (tp->b_bufsize < sizealloc) {
1050 		take = sizealloc - tp->b_bufsize;
1051 		while ((bp = getnewbuf(0, 0)) == NULL)
1052 			/* void */;
1053 		if (take >= bp->b_bufsize)
1054 			take = bp->b_bufsize;
1055 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
1056 		    &tp->b_un.b_addr[tp->b_bufsize], take);
1057 		tp->b_bufsize += take;
1058 		bp->b_bufsize = bp->b_bufsize - take;
1059 		if (bp->b_bcount > bp->b_bufsize)
1060 			bp->b_bcount = bp->b_bufsize;
1061 		if (bp->b_bufsize <= 0) {
1062 			bremhash(bp);
1063 			binshash(bp, &invalhash);
1064 			bp->b_dev = NODEV;
1065 			bp->b_error = 0;
1066 			bp->b_flags |= B_INVAL;
1067 		}
1068 		brelse(bp);
1069 	}
1070 out:
1071 	tp->b_bcount = size;
1072 	return (1);
1073 }
1074 
1075 /*
1076  * Find a buffer which is available for use.
1077  * Select something from a free list.
1078  * Preference is to AGE list, then LRU list.
1079  */
1080 struct buf *
1081 getnewbuf(slpflag, slptimeo)
1082 	int slpflag, slptimeo;
1083 {
1084 	register struct buf *bp;
1085 	register struct queue_entry *dp;
1086 	register struct ucred *cred;
1087 	int s;
1088 
1089 loop:
1090 	s = splbio();
1091 	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
1092 		if (dp->qe_next)
1093 			break;
1094 	if (dp == bufqueues) {		/* no free blocks */
1095 		needbuffer = 1;
1096 		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
1097 			"getnewbuf", slptimeo);
1098 		splx(s);
1099 		return (NULL);
1100 	}
1101 	bp = dp->qe_next;
1102 	bremfree(bp);
1103 	bp->b_flags |= B_BUSY;
1104 	splx(s);
1105 	if (bp->b_flags & B_DELWRI) {
1106 		(void) bawrite(bp);
1107 		goto loop;
1108 	}
1109 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1110 	if (bp->b_vp)
1111 		brelvp(bp);
1112 	if (bp->b_rcred != NOCRED) {
1113 		cred = bp->b_rcred;
1114 		bp->b_rcred = NOCRED;
1115 		crfree(cred);
1116 	}
1117 	if (bp->b_wcred != NOCRED) {
1118 		cred = bp->b_wcred;
1119 		bp->b_wcred = NOCRED;
1120 		crfree(cred);
1121 	}
1122 	bp->b_flags = B_BUSY;
1123 	bp->b_dirtyoff = bp->b_dirtyend = 0;
1124 	bp->b_validoff = bp->b_validend = 0;
1125 	return (bp);
1126 }
1127 
1128 /*
1129  * Wait for I/O to complete.
1130  *
1131  * Extract and return any errors associated with the I/O.
1132  * If the error flag is set, but no specific error is
1133  * given, return EIO.
1134  */
1135 biowait(bp)
1136 	register struct buf *bp;
1137 {
1138 	int s;
1139 
1140 	s = splbio();
1141 	while ((bp->b_flags & B_DONE) == 0)
1142 		sleep((caddr_t)bp, PRIBIO);
1143 	splx(s);
1144 	if ((bp->b_flags & B_ERROR) == 0)
1145 		return (0);
1146 	if (bp->b_error)
1147 		return (bp->b_error);
1148 	return (EIO);
1149 }
1150 
1151 /*
1152  * Mark I/O complete on a buffer.
1153  *
1154  * If a callback has been requested, e.g. the pageout
1155  * daemon, do so. Otherwise, awaken waiting processes.
1156  */
1157 void
1158 biodone(bp)
1159 	register struct buf *bp;
1160 {
1161 
1162 	if (bp->b_flags & B_DONE)
1163 		panic("dup biodone");
1164 	bp->b_flags |= B_DONE;
1165 	if ((bp->b_flags & B_READ) == 0)
1166 		vwakeup(bp);
1167 	if (bp->b_flags & B_CALL) {
1168 		bp->b_flags &= ~B_CALL;
1169 		(*bp->b_iodone)(bp);
1170 		return;
1171 	}
1172 	if (bp->b_flags & B_ASYNC)
1173 		brelse(bp);
1174 	else {
1175 		bp->b_flags &= ~B_WANTED;
1176 		wakeup((caddr_t)bp);
1177 	}
1178 }
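
/*
 * Illustrative sketch (not part of the original file): a caller that does
 * not want to sleep in biowait() can ask biodone() to invoke a callback
 * instead, by setting B_CALL and b_iodone exactly as cluster_newbuf()
 * does with cluster_callback() above.  The buffer is assumed to have been
 * set up already (e.g. via getblk()); names are hypothetical and the code
 * is never compiled.
 */
#ifdef notdef
void	example_done __P((struct buf *));

void
example_start_async_read(bp)
	struct buf *bp;
{

	bp->b_flags |= B_READ | B_ASYNC | B_CALL;
	bp->b_iodone = example_done;	/* called from biodone() at completion */
	VOP_STRATEGY(bp);		/* no biowait(); the callback fires later */
}
#endif /* notdef */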
1179 
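/*
 * Return the number of buffers on the LOCKED free queue (buffers with
 * B_LOCKED set, which getnewbuf() will not reclaim).
 */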
1180 int
1181 count_lock_queue()
1182 {
1183 	register struct buf *bp;
1184 	register int ret;
1185 
1186 	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
1187 	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
1188 		++ret;
1189 	return(ret);
1190 }
1191 
1192 #ifdef DIAGNOSTIC
1193 /*
1194  * Print out statistics on the current allocation of the buffer pool.
1195  * Can be enabled to print out on every ``sync'' by setting "syncprt"
1196  * above.
1197  */
1198 void
1199 vfs_bufstats()
1200 {
1201 	int s, i, j, count;
1202 	register struct buf *bp;
1203 	register struct queue_entry *dp;
1204 	int counts[MAXBSIZE/CLBYTES+1];
1205 	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };
1206 
1207 	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1208 		count = 0;
1209 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1210 			counts[j] = 0;
1211 		s = splbio();
1212 		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
1213 			counts[bp->b_bufsize/CLBYTES]++;
1214 			count++;
1215 		}
1216 		splx(s);
1217 		printf("%s: total-%d", bname[i], count);
1218 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1219 			if (counts[j] != 0)
1220 				printf(", %d-%d", j * CLBYTES, counts[j]);
1221 		printf("\n");
1222 	}
1223 }
1224 #endif /* DIAGNOSTIC */
1225