xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 56607)
/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.55 (Berkeley) 10/22/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;
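/*
 * Editor's note: the hash key adds the logical block number to the vnode
 * address scaled down by the structure size (so distinct vnodes contribute
 * small, well spread values), then masks the sum with bufhash.  hashinit()
 * in bufinit() below is expected to size the table to a power of two and
 * leave the matching mask in bufhash, so the masked sum always indexes a
 * valid chain head in bufhashtbl.
 */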

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}
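/*
 * Editor's note: bufpages pages of buffer memory are spread over nbuf
 * headers, so base = bufpages / nbuf clusters go to every buffer and the
 * residual = bufpages % nbuf leftover clusters go one apiece to the first
 * buffers.  With, say, bufpages = 100 and nbuf = 30 (hypothetical figures),
 * the first 10 buffers start with 4 clusters each and the remaining 20
 * with 3.  Headers that received no memory at all start life on the
 * BQ_EMPTY queue instead of BQ_AGE.
 */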

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
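#ifdef notdef
/*
 * Editor's sketch, not part of the original file: a typical synchronous
 * read through the cache, the way a filesystem read routine would issue
 * it.  The function and variable names here are illustrative only.
 */
static int
example_read(vp, lbn, size, cred)
	struct vnode *vp;
	daddr_t lbn;
	int size;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, size, cred, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* bp->b_un.b_addr now holds the block contents; consume them ... */
	brelse(bp);
	return (0);
}
#endif /* notdef */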

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them as well (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
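/*
 * Editor's note: a caller supplies the read-ahead requests as parallel
 * arrays of block numbers and sizes.  A hypothetical single-block
 * read-ahead of the next logical block might look like:
 *
 *	daddr_t ranext = lbn + 1;
 *	int rasize = size;
 *
 *	error = breadn(vp, lbn, size, &ranext, &rasize, 1, cred, &bp);
 *
 * Only the block named by blkno is waited for; the read-ahead buffers are
 * started B_ASYNC and are released by biodone() when they complete.
 */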

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
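/*
 * Editor's note: the three write entry points differ only in when the
 * buffer reaches the driver and when the caller gets it back.  bwrite()
 * starts the I/O and sleeps in biowait() for the result; bawrite() sets
 * B_ASYNC so bwrite() returns as soon as VOP_STRATEGY() has been called,
 * and biodone() releases the buffer later; bdwrite() just marks the
 * buffer B_DELWRI and brelse()s it, so the data reaches disk only when
 * the buffer is reclaimed (see getnewbuf()) or explicitly flushed.
 */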

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size\n");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}
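/*
 * Editor's note: getblk() always returns the buffer marked B_BUSY, so the
 * caller owns it until it is given back with brelse(), bdwrite(),
 * bawrite(), or bwrite().  A buffer found in the cache is additionally
 * marked B_CACHE; a freshly allocated one is not, and carries no valid
 * data (bread() distinguishes the two cases by testing B_DONE|B_DELWRI).
 */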

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
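/*
 * Editor's note (sizes are illustrative; CLBYTES is machine dependent):
 * allocbuf() deals in whole clusters, so a request is first rounded up;
 * with CLBYTES of 4096, for example, a request for 6144 bytes becomes an
 * 8192 byte allocation.  Extra space is stolen from buffers handed back
 * by getnewbuf(), taking from each donor whatever is needed or available;
 * a donor drained to zero bytes is rehashed onto invalhash and ends up
 * on the BQ_EMPTY list via brelse().
 */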

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		sleep((caddr_t)&needbuffer, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}
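/*
 * Editor's note: the scan above runs from BQ_AGE down to (but not
 * including) BQ_LOCKED, so only the AGE and LRU queues are eligible;
 * locked buffers and empty headers are never reclaimed here.  A victim
 * that still holds delayed-write data is pushed to disk with bawrite()
 * and the search starts over, so the buffer actually returned is always
 * clean and already marked B_BUSY.
 */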

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting the
 * variable "syncprt".
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */