xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 54636)
1 /*-
2  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This module is believed to contain source code proprietary to AT&T.
6  * Use and redistribution is subject to the Berkeley Software License
7  * Agreement and your Software Agreement with AT&T (Western Electric).
8  *
9  *	@(#)vfs_cluster.c	7.50 (Berkeley) 07/03/92
10  */
11 
12 #include <sys/param.h>
13 #include <sys/proc.h>
14 #include <sys/buf.h>
15 #include <sys/vnode.h>
16 #include <sys/specdev.h>
17 #include <sys/mount.h>
18 #include <sys/trace.h>
19 #include <sys/resourcevar.h>
20 
21 /*
22  * Initialize buffers and hash links for buffers.
23  */
24 void
25 bufinit()
26 {
27 	register int i;
28 	register struct buf *bp, *dp;
29 	register struct bufhd *hp;
30 	int base, residual;
31 
32 	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
33 		hp->b_forw = hp->b_back = (struct buf *)hp;
34 
35 	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
36 		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
37 		dp->b_flags = B_HEAD;
38 	}
39 	base = bufpages / nbuf;
40 	residual = bufpages % nbuf;
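	/*
	 * Note: the division spreads the buffer memory as evenly as
	 * possible; the first "residual" buffers each get base + 1
	 * clusters and the rest get base.  For example, assuming
	 * bufpages == 103 (counted in CLBYTES units, as the sizing
	 * below implies) and nbuf == 20, base == 5 and residual == 3,
	 * so three buffers get 6 * CLBYTES and the other seventeen get
	 * 5 * CLBYTES, accounting for all 103.
	 */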
41 	for (i = 0; i < nbuf; i++) {
42 		bp = &buf[i];
43 		bp->b_dev = NODEV;
44 		bp->b_bcount = 0;
45 		bp->b_rcred = NOCRED;
46 		bp->b_wcred = NOCRED;
47 		bp->b_dirtyoff = 0;
48 		bp->b_dirtyend = 0;
49 		bp->b_validoff = 0;
50 		bp->b_validend = 0;
51 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
52 		if (i < residual)
53 			bp->b_bufsize = (base + 1) * CLBYTES;
54 		else
55 			bp->b_bufsize = base * CLBYTES;
56 		binshash(bp, &bfreelist[BQ_AGE]);
57 		bp->b_flags = B_INVAL;
58 		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
59 		binsheadfree(bp, dp);
60 	}
61 }
62 
63 /*
64  * Find the block in the buffer pool.
65  * If the buffer is not present, allocate a new buffer and load
66  * its contents according to the filesystem fill routine.
67  */
68 bread(vp, blkno, size, cred, bpp)
69 	struct vnode *vp;
70 	daddr_t blkno;
71 	int size;
72 	struct ucred *cred;
73 	struct buf **bpp;
74 {
75 	struct proc *p = curproc;		/* XXX */
76 	register struct buf *bp;
77 
78 	if (size == 0)
79 		panic("bread: size 0");
80 	*bpp = bp = getblk(vp, blkno, size);
81 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
82 		trace(TR_BREADHIT, pack(vp, size), blkno);
83 		return (0);
84 	}
85 	bp->b_flags |= B_READ;
86 	if (bp->b_bcount > bp->b_bufsize)
87 		panic("bread");
88 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
89 		crhold(cred);
90 		bp->b_rcred = cred;
91 	}
92 	VOP_STRATEGY(bp);
93 	trace(TR_BREADMISS, pack(vp, size), blkno);
94 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
95 	return (biowait(bp));
96 }
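
/*
 * Example (illustrative sketch): a typical filesystem read path uses
 * bread() to fetch a logical block and brelse() to release the buffer
 * once the data has been copied out.  The routine below and its
 * arguments are hypothetical; it is bracketed by "notdef" so that it
 * is never compiled.
 */
#ifdef notdef
int
example_read_block(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	/* bread() returns with the buffer busy; the caller releases it. */
	if (error = bread(vp, lbn, bsize, cred, &bp)) {
		brelse(bp);
		return (error);
	}
	/* ... copy data out of bp->b_un.b_addr ... */
	brelse(bp);
	return (0);
}
#endif /* notdef */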
97 
98 /*
99  * Operates like bread, but also starts I/O on the N specified
100  * read-ahead blocks.
101  */
102 breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
103 	struct vnode *vp;
104 	daddr_t blkno; int size;
105 	daddr_t rablkno[]; int rabsize[];
106 	int num;
107 	struct ucred *cred;
108 	struct buf **bpp;
109 {
110 	struct proc *p = curproc;		/* XXX */
111 	register struct buf *bp, *rabp;
112 	register int i;
113 
114 	bp = NULL;
115 	/*
116 	 * If the block is not memory resident,
117 	 * allocate a buffer and start I/O.
118 	 */
119 	if (!incore(vp, blkno)) {
120 		*bpp = bp = getblk(vp, blkno, size);
121 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
122 			bp->b_flags |= B_READ;
123 			if (bp->b_bcount > bp->b_bufsize)
124 				panic("breadn");
125 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
126 				crhold(cred);
127 				bp->b_rcred = cred;
128 			}
129 			VOP_STRATEGY(bp);
130 			trace(TR_BREADMISS, pack(vp, size), blkno);
131 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
132 		} else {
133 			trace(TR_BREADHIT, pack(vp, size), blkno);
134 		}
135 	}
136 
137 	/*
138 	 * If there are read-ahead blocks, start I/O
139 	 * on them as well (as above).
140 	 */
141 	for (i = 0; i < num; i++) {
142 		if (incore(vp, rablkno[i]))
143 			continue;
144 		rabp = getblk(vp, rablkno[i], rabsize[i]);
145 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
146 			brelse(rabp);
147 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
148 		} else {
149 			rabp->b_flags |= B_ASYNC | B_READ;
150 			if (rabp->b_bcount > rabp->b_bufsize)
151 				panic("breadrabp");
152 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
153 				crhold(cred);
154 				rabp->b_rcred = cred;
155 			}
156 			VOP_STRATEGY(rabp);
157 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
158 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
159 		}
160 	}
161 
162 	/*
163 	 * If block was memory resident, let bread get it.
164 	 * If block was not memory resident, the read was
165 	 * started above, so just wait for the read to complete.
166 	 */
167 	if (bp == NULL)
168 		return (bread(vp, blkno, size, cred, bpp));
169 	return (biowait(bp));
170 }
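
/*
 * Example (illustrative sketch): when the next logical block is likely
 * to be wanted soon, breadn() starts read-ahead on it while the caller
 * waits only for the block it asked for.  The routine below and its
 * arguments are hypothetical; bracketed by "notdef" so that it is
 * never compiled.
 */
#ifdef notdef
int
example_sequential_read(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	daddr_t ralbn;
	int rasize, error;

	ralbn = lbn + 1;		/* read ahead the next logical block */
	rasize = bsize;			/* assume a uniform block size */
	if (error = breadn(vp, lbn, bsize, &ralbn, &rasize, 1, cred, &bp)) {
		brelse(bp);
		return (error);
	}
	/* ... use bp->b_un.b_addr ... */
	brelse(bp);
	return (0);
}
#endif /* notdef */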
171 
172 /*
173  * Synchronous write.
174  * Release buffer on completion.
175  */
176 bwrite(bp)
177 	register struct buf *bp;
178 {
179 	struct proc *p = curproc;		/* XXX */
180 	register int flag;
181 	int s, error = 0;
182 
183 	flag = bp->b_flags;
184 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
185 	if (flag & B_ASYNC) {
186 		if ((flag & B_DELWRI) == 0)
187 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
188 		else
189 			reassignbuf(bp, bp->b_vp);
190 	}
191 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
192 	if (bp->b_bcount > bp->b_bufsize)
193 		panic("bwrite");
194 	s = splbio();
195 	bp->b_vp->v_numoutput++;
196 	splx(s);
197 	VOP_STRATEGY(bp);
198 
199 	/*
200 	 * If the write was synchronous, then await I/O completion.
201 	 * If the write was "delayed", then we put the buffer on
202 	 * the queue of blocks awaiting I/O completion status.
203 	 */
204 	if ((flag & B_ASYNC) == 0) {
205 		error = biowait(bp);
206 		if ((flag&B_DELWRI) == 0)
207 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
208 		else
209 			reassignbuf(bp, bp->b_vp);
210 		brelse(bp);
211 	} else if (flag & B_DELWRI) {
212 		s = splbio();
213 		bp->b_flags |= B_AGE;
214 		splx(s);
215 	}
216 	return (error);
217 }
218 
219 int
220 vn_bwrite(ap)
221 	struct vop_bwrite_args *ap;
222 {
223 	return (bwrite(ap->a_bp));
224 }
225 
226 
227 /*
228  * Delayed write.
229  *
230  * The buffer is marked dirty, but is not queued for I/O.
231  * This routine should be used when the buffer is expected
232  * to be modified again soon, typically a small write that
233  * partially fills a buffer.
234  *
235  * NB: magnetic tapes cannot be delayed; they must be
236  * written in the order that the writes are requested.
237  */
238 bdwrite(bp)
239 	register struct buf *bp;
240 {
241 	struct proc *p = curproc;		/* XXX */
242 
243 	if ((bp->b_flags & B_DELWRI) == 0) {
244 		bp->b_flags |= B_DELWRI;
245 		reassignbuf(bp, bp->b_vp);
246 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
247 	}
248 	/*
249 	 * If this is a tape drive, the write must be initiated.
250 	 */
251 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
252 		bawrite(bp);
253 	} else {
254 		bp->b_flags |= (B_DONE | B_DELWRI);
255 		brelse(bp);
256 	}
257 }
258 
259 /*
260  * Asynchronous write.
261  * Start I/O on a buffer, but do not wait for it to complete.
262  * The buffer is released when the I/O completes.
263  */
264 bawrite(bp)
265 	register struct buf *bp;
266 {
267 
268 	/*
269 	 * Setting the ASYNC flag causes bwrite to return
270 	 * after starting the I/O.
271 	 */
272 	bp->b_flags |= B_ASYNC;
273 	(void) bwrite(bp);
274 }
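
/*
 * Example (illustrative sketch): choosing among the three write
 * flavors above.  bwrite() starts the I/O and waits for it, bawrite()
 * starts it and returns, and bdwrite() only marks the buffer dirty so
 * that a later flush or buffer reuse writes it.  The routine below is
 * hypothetical and assumes the caller fills in the entire block
 * (otherwise it would be obtained with bread() first); bracketed by
 * "notdef" so that it is never compiled.
 */
#ifdef notdef
int
example_write_block(vp, lbn, bsize, delay, async)
	struct vnode *vp;
	daddr_t lbn;
	int bsize, delay, async;
{
	struct buf *bp;

	bp = getblk(vp, lbn, bsize);
	/* ... fill in all bsize bytes at bp->b_un.b_addr ... */
	if (delay) {
		bdwrite(bp);		/* expected to be modified again soon */
		return (0);
	}
	if (async) {
		bawrite(bp);		/* start the I/O, do not wait */
		return (0);
	}
	return (bwrite(bp));		/* wait for completion, return error */
}
#endif /* notdef */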
275 
276 /*
277  * Release a buffer.
278  * Even if the buffer is dirty, no I/O is started.
279  */
280 brelse(bp)
281 	register struct buf *bp;
282 {
283 	register struct buf *flist;
284 	int s;
285 
286 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
287 	/*
288 	 * If a process is waiting for the buffer, or
289 	 * is waiting for a free buffer, awaken it.
290 	 */
291 	if (bp->b_flags & B_WANTED)
292 		wakeup((caddr_t)bp);
293 	if (bfreelist[0].b_flags & B_WANTED) {
294 		bfreelist[0].b_flags &= ~B_WANTED;
295 		wakeup((caddr_t)bfreelist);
296 	}
297 	/*
298 	 * Retry I/O for locked buffers rather than invalidating them.
299 	 */
300 	s = splbio();
301 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
302 		bp->b_flags &= ~B_ERROR;
303 	/*
304 	 * Disassociate buffers that are no longer valid.
305 	 */
306 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
307 		bp->b_flags |= B_INVAL;
308 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
309 		if (bp->b_vp)
310 			brelvp(bp);
311 		bp->b_flags &= ~B_DELWRI;
312 	}
313 	/*
314 	 * Stick the buffer back on a free list.
315 	 */
316 	if (bp->b_bufsize <= 0) {
317 		/* block has no buffer ... put at front of unused buffer list */
318 		flist = &bfreelist[BQ_EMPTY];
319 		binsheadfree(bp, flist);
320 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
321 		/* block has no info ... put at front of most free list */
322 		flist = &bfreelist[BQ_AGE];
323 		binsheadfree(bp, flist);
324 	} else {
325 		if (bp->b_flags & B_LOCKED)
326 			flist = &bfreelist[BQ_LOCKED];
327 		else if (bp->b_flags & B_AGE)
328 			flist = &bfreelist[BQ_AGE];
329 		else
330 			flist = &bfreelist[BQ_LRU];
331 		binstailfree(bp, flist);
332 	}
333 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
334 	splx(s);
335 }
336 
337 /*
338  * Check to see if a block is currently memory resident.
339  */
340 incore(vp, blkno)
341 	struct vnode *vp;
342 	daddr_t blkno;
343 {
344 	register struct buf *bp;
345 	register struct buf *dp;
346 
347 	dp = BUFHASH(vp, blkno);
348 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
349 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
350 		    (bp->b_flags & B_INVAL) == 0)
351 			return (1);
352 	return (0);
353 }
354 
355 /*
356  * Check to see if a block is currently memory resident.
357  * If it is resident, return it. If it is not resident,
358  * allocate a new buffer and assign it to the block.
359  */
360 struct buf *
361 getblk(vp, blkno, size)
362 	register struct vnode *vp;
363 	daddr_t blkno;
364 	int size;
365 {
366 	register struct buf *bp, *dp;
367 	int s;
368 
369 	if (size > MAXBSIZE)
370 		panic("getblk: size too big");
371 	/*
372 	 * Search the cache for the block. If the buffer is found,
373 	 * but it is currently locked, then we must wait for it to
374 	 * become available.
375 	 */
376 	dp = BUFHASH(vp, blkno);
377 loop:
378 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
379 		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
380 		    (bp->b_flags & B_INVAL))
381 			continue;
382 		s = splbio();
383 		if (bp->b_flags & B_BUSY) {
384 			bp->b_flags |= B_WANTED;
385 			sleep((caddr_t)bp, PRIBIO + 1);
386 			splx(s);
387 			goto loop;
388 		}
389 		bremfree(bp);
390 		bp->b_flags |= B_BUSY;
391 		splx(s);
392 		if (bp->b_bcount != size) {
393 		printf("getblk: stray size\n");
394 			bp->b_flags |= B_INVAL;
395 			bwrite(bp);
396 			goto loop;
397 		}
398 		bp->b_flags |= B_CACHE;
399 		return (bp);
400 	}
401 	bp = getnewbuf();
402 	bremhash(bp);
403 	bgetvp(vp, bp);
404 	bp->b_bcount = 0;
405 	bp->b_lblkno = blkno;
406 	bp->b_blkno = blkno;
407 	bp->b_error = 0;
408 	bp->b_resid = 0;
409 	binshash(bp, dp);
410 	allocbuf(bp, size);
411 	return (bp);
412 }
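
/*
 * Example (illustrative sketch): unlike bread(), getblk() never starts
 * I/O, so a buffer for an uncached block comes back without valid
 * contents (B_DONE and B_CACHE clear).  A caller that is about to
 * overwrite the whole block can therefore use getblk() and skip the
 * read.  The routine below and its arguments are hypothetical;
 * bracketed by "notdef" so that it is never compiled.
 */
#ifdef notdef
int
example_overwrite_block(vp, lbn, bsize, data)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	caddr_t data;
{
	struct buf *bp;

	bp = getblk(vp, lbn, bsize);	/* no read; contents replaced below */
	bcopy(data, bp->b_un.b_addr, (unsigned)bsize);
	return (bwrite(bp));
}
#endif /* notdef */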
413 
414 /*
415  * Allocate a buffer.
416  * The caller will assign it to a block.
417  */
418 struct buf *
419 geteblk(size)
420 	int size;
421 {
422 	register struct buf *bp, *flist;
423 
424 	if (size > MAXBSIZE)
425 		panic("geteblk: size too big");
426 	bp = getnewbuf();
427 	bp->b_flags |= B_INVAL;
428 	bremhash(bp);
429 	flist = &bfreelist[BQ_AGE];
430 	bp->b_bcount = 0;
431 	bp->b_error = 0;
432 	bp->b_resid = 0;
433 	binshash(bp, flist);
434 	allocbuf(bp, size);
435 	return (bp);
436 }
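
/*
 * Example (illustrative sketch): geteblk() returns an anonymous buffer
 * with no vnode or block assigned; it is typically used as scratch
 * space and released with brelse().  Hypothetical fragment, bracketed
 * by "notdef" so that it is never compiled.
 */
#ifdef notdef
	struct buf *bp;

	bp = geteblk(bsize);
	/* ... use bp->b_un.b_addr as temporary storage ... */
	brelse(bp);
#endif /* notdef */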
437 
438 /*
439  * Expand or contract the actual memory allocated to a buffer.
440  * If no memory is available, release buffer and take error exit.
441  */
442 allocbuf(tp, size)
443 	register struct buf *tp;
444 	int size;
445 {
446 	register struct buf *bp, *ep;
447 	int sizealloc, take, s;
448 
449 	sizealloc = roundup(size, CLBYTES);
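	/*
	 * Note: the allocation is in whole clusters; e.g. assuming
	 * CLBYTES == 4096, a request for 6144 bytes allocates 8192
	 * bytes of buffer memory, while b_bcount is set to the
	 * requested 6144 at "out" below.
	 */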
450 	/*
451 	 * Buffer size does not change
452 	 */
453 	if (sizealloc == tp->b_bufsize)
454 		goto out;
455 	/*
456 	 * Buffer size is shrinking.
457 	 * Place excess space in a buffer header taken from the
458 	 * BQ_EMPTY buffer list and placed on the "most free" list.
459 	 * If no extra buffer headers are available, leave the
460 	 * extra space in the present buffer.
461 	 */
462 	if (sizealloc < tp->b_bufsize) {
463 		ep = bfreelist[BQ_EMPTY].av_forw;
464 		if (ep == &bfreelist[BQ_EMPTY])
465 			goto out;
466 		s = splbio();
467 		bremfree(ep);
468 		ep->b_flags |= B_BUSY;
469 		splx(s);
470 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
471 		    (int)tp->b_bufsize - sizealloc);
472 		ep->b_bufsize = tp->b_bufsize - sizealloc;
473 		tp->b_bufsize = sizealloc;
474 		ep->b_flags |= B_INVAL;
475 		ep->b_bcount = 0;
476 		brelse(ep);
477 		goto out;
478 	}
479 	/*
480 	 * More buffer space is needed. Get it out of buffers on
481 	 * the "most free" list, placing the empty headers on the
482 	 * BQ_EMPTY buffer header list.
483 	 */
484 	while (tp->b_bufsize < sizealloc) {
485 		take = sizealloc - tp->b_bufsize;
486 		bp = getnewbuf();
487 		if (take >= bp->b_bufsize)
488 			take = bp->b_bufsize;
489 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
490 		    &tp->b_un.b_addr[tp->b_bufsize], take);
491 		tp->b_bufsize += take;
492 		bp->b_bufsize = bp->b_bufsize - take;
493 		if (bp->b_bcount > bp->b_bufsize)
494 			bp->b_bcount = bp->b_bufsize;
495 		if (bp->b_bufsize <= 0) {
496 			bremhash(bp);
497 			binshash(bp, &bfreelist[BQ_EMPTY]);
498 			bp->b_dev = NODEV;
499 			bp->b_error = 0;
500 			bp->b_flags |= B_INVAL;
501 		}
502 		brelse(bp);
503 	}
504 out:
505 	tp->b_bcount = size;
506 	return (1);
507 }
508 
509 /*
510  * Find a buffer which is available for use.
511  * Select something from a free list.
512  * Preference is to AGE list, then LRU list.
513  */
514 struct buf *
515 getnewbuf()
516 {
517 	register struct buf *bp, *dp;
518 	register struct ucred *cred;
519 	int s;
520 
521 #ifdef LFS
522 	lfs_flush();
523 #endif
524 loop:
525 	s = splbio();
526 	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
527 		if (dp->av_forw != dp)
528 			break;
529 	if (dp == bfreelist) {		/* no free blocks */
530 		dp->b_flags |= B_WANTED;
531 		sleep((caddr_t)dp, PRIBIO + 1);
532 		splx(s);
533 		goto loop;
534 	}
535 	bp = dp->av_forw;
536 	bremfree(bp);
537 	bp->b_flags |= B_BUSY;
538 	splx(s);
539 	if (bp->b_flags & B_DELWRI) {
540 		(void) bawrite(bp);
541 		goto loop;
542 	}
543 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
544 	if (bp->b_vp)
545 		brelvp(bp);
546 	if (bp->b_rcred != NOCRED) {
547 		cred = bp->b_rcred;
548 		bp->b_rcred = NOCRED;
549 		crfree(cred);
550 	}
551 	if (bp->b_wcred != NOCRED) {
552 		cred = bp->b_wcred;
553 		bp->b_wcred = NOCRED;
554 		crfree(cred);
555 	}
556 	bp->b_flags = B_BUSY;
557 	bp->b_dirtyoff = bp->b_dirtyend = 0;
558 	bp->b_validoff = bp->b_validend = 0;
559 	return (bp);
560 }
561 
562 /*
563  * Wait for I/O to complete.
564  *
565  * Extract and return any errors associated with the I/O.
566  * If the error flag is set, but no specific error is
567  * given, return EIO.
568  */
569 biowait(bp)
570 	register struct buf *bp;
571 {
572 	int s;
573 
574 	s = splbio();
575 	while ((bp->b_flags & B_DONE) == 0)
576 		sleep((caddr_t)bp, PRIBIO);
577 	splx(s);
578 	if ((bp->b_flags & B_ERROR) == 0)
579 		return (0);
580 	if (bp->b_error)
581 		return (bp->b_error);
582 	return (EIO);
583 }
584 
585 /*
586  * Mark I/O complete on a buffer.
587  *
588  * If a callback has been requested, e.g. the pageout
589  * daemon, do so. Otherwise, awaken waiting processes.
590  */
591 void
592 biodone(bp)
593 	register struct buf *bp;
594 {
595 
596 	if (bp->b_flags & B_DONE)
597 		panic("dup biodone");
598 	bp->b_flags |= B_DONE;
599 	if ((bp->b_flags & B_READ) == 0)
600 		vwakeup(bp);
601 	if (bp->b_flags & B_CALL) {
602 		bp->b_flags &= ~B_CALL;
603 		(*bp->b_iodone)(bp);
604 		return;
605 	}
606 	if (bp->b_flags & B_ASYNC)
607 		brelse(bp);
608 	else {
609 		bp->b_flags &= ~B_WANTED;
610 		wakeup((caddr_t)bp);
611 	}
612 }
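
/*
 * Example (illustrative sketch): a consumer that does not want to
 * sleep in biowait() can set B_CALL and b_iodone before starting the
 * I/O; biodone() then invokes the callback when the transfer
 * completes.  The routines below are hypothetical and assume b_vp,
 * b_blkno and b_bcount have already been set up; bracketed by
 * "notdef" so that they are never compiled.
 */
#ifdef notdef
void	example_iodone();

void
example_start_async_read(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_READ | B_ASYNC | B_CALL;
	bp->b_iodone = example_iodone;
	VOP_STRATEGY(bp);		/* example_iodone() runs at completion */
}

void
example_iodone(bp)
	register struct buf *bp;
{

	/* check B_ERROR / b_error here, then dispose of the buffer */
	brelse(bp);
}
#endif /* notdef */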
613