xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 52189)
1 /*-
2  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This module is believed to contain source code proprietary to AT&T.
6  * Use and redistribution is subject to the Berkeley Software License
7  * Agreement and your Software Agreement with AT&T (Western Electric).
8  *
9  *	@(#)vfs_cluster.c	7.45 (Berkeley) 01/14/92
10  */
11 
12 #include <sys/param.h>
13 #include <sys/proc.h>
14 #include <sys/buf.h>
15 #include <sys/vnode.h>
16 #include <sys/specdev.h>
17 #include <sys/mount.h>
18 #include <sys/trace.h>
19 #include <sys/resourcevar.h>
20 
21 /*
22  * Initialize buffers and hash links for buffers.
23  */
24 void
25 bufinit()
26 {
27 	register int i;
28 	register struct buf *bp, *dp;
29 	register struct bufhd *hp;
30 	int base, residual;
31 
32 	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
33 		hp->b_forw = hp->b_back = (struct buf *)hp;
34 
35 	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
36 		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
37 		dp->b_flags = B_HEAD;
38 	}
39 	base = bufpages / nbuf;
40 	residual = bufpages % nbuf;
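	/*
	 * Each buffer gets "base" clusters of memory and the first
	 * "residual" buffers get one extra cluster, so that all of
	 * bufpages is handed out.  For example (illustrative figures
	 * only): bufpages = 70 and nbuf = 32 give base = 2 and
	 * residual = 6, so buffers 0-5 get 3 clusters each and
	 * buffers 6-31 get 2.
	 */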
41 	for (i = 0; i < nbuf; i++) {
42 		bp = &buf[i];
43 		bp->b_dev = NODEV;
44 		bp->b_bcount = 0;
45 		bp->b_rcred = NOCRED;
46 		bp->b_wcred = NOCRED;
47 		bp->b_dirtyoff = 0;
48 		bp->b_dirtyend = 0;
49 		bp->b_validoff = 0;
50 		bp->b_validend = 0;
51 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
52 		if (i < residual)
53 			bp->b_bufsize = (base + 1) * CLBYTES;
54 		else
55 			bp->b_bufsize = base * CLBYTES;
56 		binshash(bp, &bfreelist[BQ_AGE]);
57 		bp->b_flags = B_BUSY|B_INVAL;
58 		brelse(bp);
59 	}
60 }
61 
62 /*
63  * Find the block in the buffer pool.
64  * If the buffer is not present, allocate a new buffer and load
65  * its contents according to the filesystem fill routine.
66  */
67 bread(vp, blkno, size, cred, bpp)
68 	struct vnode *vp;
69 	daddr_t blkno;
70 	int size;
71 	struct ucred *cred;
72 	struct buf **bpp;
73 {
74 	struct proc *p = curproc;		/* XXX */
75 	register struct buf *bp;
76 
77 	if (size == 0)
78 		panic("bread: size 0");
79 	*bpp = bp = getblk(vp, blkno, size);
80 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
81 		trace(TR_BREADHIT, pack(vp, size), blkno);
82 		return (0);
83 	}
84 	bp->b_flags |= B_READ;
85 	if (bp->b_bcount > bp->b_bufsize)
86 		panic("bread");
87 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
88 		crhold(cred);
89 		bp->b_rcred = cred;
90 	}
91 	VOP_STRATEGY(bp);
92 	trace(TR_BREADMISS, pack(vp, size), blkno);
93 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
94 	return (biowait(bp));
95 }
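
/*
 * Illustrative sketch of a typical bread() caller; the name
 * example_readblk and its arguments are placeholders, not part of
 * any real filesystem.  The caller gets the buffer, uses its data,
 * and must release it with brelse().
 */
#ifdef notdef
example_readblk(vp, lbn, size, cred)
	struct vnode *vp;
	daddr_t lbn;
	int size;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	if (error = bread(vp, lbn, size, cred, &bp)) {
		brelse(bp);		/* release even on error */
		return (error);
	}
	/* ... use size bytes at bp->b_un.b_addr ... */
	brelse(bp);			/* return the buffer to the cache */
	return (0);
}
#endif /* notdef */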
96 
97 /*
98  * Operates like bread, but also starts I/O on the N specified
99  * read-ahead blocks.
100  */
101 breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
102 	struct vnode *vp;
103 	daddr_t blkno; int size;
104 	daddr_t rablkno[]; int rabsize[];
105 	int num;
106 	struct ucred *cred;
107 	struct buf **bpp;
108 {
109 	struct proc *p = curproc;		/* XXX */
110 	register struct buf *bp, *rabp;
111 	register int i;
112 
113 	bp = NULL;
114 	/*
115 	 * If the block is not memory resident,
116 	 * allocate a buffer and start I/O.
117 	 */
118 	if (!incore(vp, blkno)) {
119 		*bpp = bp = getblk(vp, blkno, size);
120 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
121 			bp->b_flags |= B_READ;
122 			if (bp->b_bcount > bp->b_bufsize)
123 				panic("breadn");
124 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
125 				crhold(cred);
126 				bp->b_rcred = cred;
127 			}
128 			VOP_STRATEGY(bp);
129 			trace(TR_BREADMISS, pack(vp, size), blkno);
130 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
131 		} else
132 			trace(TR_BREADHIT, pack(vp, size), blkno);
133 	}
134 
135 	/*
136 	 * If there are read-ahead blocks, also start I/O
137 	 * on them (as above).
138 	 */
139 	for (i = 0; i < num; i++) {
140 		if (incore(vp, rablkno[i]))
141 			continue;
142 		rabp = getblk(vp, rablkno[i], rabsize[i]);
143 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
144 			brelse(rabp);
145 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
146 		} else {
147 			rabp->b_flags |= B_ASYNC | B_READ;
148 			if (rabp->b_bcount > rabp->b_bufsize)
149 				panic("breadrabp");
150 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
151 				crhold(cred);
152 				rabp->b_rcred = cred;
153 			}
154 			VOP_STRATEGY(rabp);
155 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
156 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
157 		}
158 	}
159 
160 	/*
161 	 * If block was memory resident, let bread get it.
162 	 * If block was not memory resident, the read was
163 	 * started above, so just wait for the read to complete.
164 	 */
165 	if (bp == NULL)
166 		return (bread(vp, blkno, size, cred, bpp));
167 	return (biowait(bp));
168 }
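
/*
 * Illustrative sketch of breadn() with a single read-ahead block;
 * the name example_readahead and its arguments are placeholders.
 * The caller waits only for lbn; the read of lbn + 1 is started
 * asynchronously and will be found in the cache later.
 */
#ifdef notdef
example_readahead(vp, lbn, size, rasize, cred, bpp)
	struct vnode *vp;
	daddr_t lbn;
	int size, rasize;
	struct ucred *cred;
	struct buf **bpp;
{
	daddr_t ralbn[1];
	int rabsize[1];

	ralbn[0] = lbn + 1;		/* next logical block */
	rabsize[0] = rasize;
	return (breadn(vp, lbn, size, ralbn, rabsize, 1, cred, bpp));
}
#endif /* notdef */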
169 
170 /*
171  * Synchronous write.
172  * Release buffer on completion.
173  */
174 bwrite(bp)
175 	register struct buf *bp;
176 {
177 	struct proc *p = curproc;		/* XXX */
178 	register int flag;
179 	int s, error;
180 	int s, error = 0;
181 	flag = bp->b_flags;
182 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
183 	if (flag & B_ASYNC) {
184 		if ((flag & B_DELWRI) == 0)
185 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
186 		else
187 			reassignbuf(bp, bp->b_vp);
188 	}
189 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
190 	if (bp->b_bcount > bp->b_bufsize)
191 		panic("bwrite");
192 	s = splbio();
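	/*
	 * Count the write against the vnode; vwakeup(), called from
	 * biodone(), drops the count again when the I/O completes.
	 */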
193 	bp->b_vp->v_numoutput++;
194 	splx(s);
195 	VOP_STRATEGY(bp);
196 
197 	/*
198 	 * If the write was synchronous, then await I/O completion.
199 	 * If the write was "delayed", then we put the buffer on
200 	 * the queue of blocks awaiting I/O completion status.
201 	 */
202 	if ((flag & B_ASYNC) == 0) {
203 		error = biowait(bp);
204 		if ((flag&B_DELWRI) == 0)
205 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
206 		else
207 			reassignbuf(bp, bp->b_vp);
208 		brelse(bp);
209 	} else if (flag & B_DELWRI) {
210 		bp->b_flags |= B_AGE;
211 		error = 0;
212 	}
213 	return (error);
214 }
215 
216 /*
217  * Delayed write.
218  *
219  * The buffer is marked dirty, but is not queued for I/O.
220  * This routine should be used when the buffer is expected
221  * to be modified again soon, typically a small write that
222  * partially fills a buffer.
223  *
224  * NB: magnetic tapes cannot be delayed; they must be
225  * written in the order that the writes are requested.
226  */
227 bdwrite(bp)
228 	register struct buf *bp;
229 {
230 	struct proc *p = curproc;		/* XXX */
231 
232 	if ((bp->b_flags & B_DELWRI) == 0) {
233 		bp->b_flags |= B_DELWRI;
234 		reassignbuf(bp, bp->b_vp);
235 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
236 	}
237 	/*
238 	 * If this is a tape drive, the write must be initiated.
239 	 */
240 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
241 		bawrite(bp);
242 	} else {
243 		bp->b_flags |= (B_DONE | B_DELWRI);
244 		brelse(bp);
245 	}
246 }
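
/*
 * Illustrative sketch of the read-modify-write pattern that
 * bdwrite() is meant for; example_update, data, off and len are
 * placeholders.  The block is read in, partially overwritten, and
 * left dirty in the cache rather than being written immediately.
 */
#ifdef notdef
example_update(vp, lbn, size, data, off, len, cred)
	struct vnode *vp;
	daddr_t lbn;
	int size, off, len;
	caddr_t data;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	if (error = bread(vp, lbn, size, cred, &bp)) {
		brelse(bp);
		return (error);
	}
	bcopy(data, bp->b_un.b_addr + off, (unsigned)len);
	bdwrite(bp);			/* mark dirty, defer the write */
	return (0);
}
#endif /* notdef */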
247 
248 /*
249  * Asynchronous write.
250  * Start I/O on a buffer, but do not wait for it to complete.
251  * The buffer is released when the I/O completes.
252  */
253 bawrite(bp)
254 	register struct buf *bp;
255 {
256 
257 	/*
258 	 * Setting the ASYNC flag causes bwrite to return
259 	 * after starting the I/O.
260 	 */
261 	bp->b_flags |= B_ASYNC;
262 	(void) bwrite(bp);
263 }
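
/*
 * The three write interfaces above differ only in when the I/O is
 * done: bwrite() starts the write and waits for it to finish,
 * bawrite() starts the write and returns at once, and bdwrite()
 * defers the write entirely, leaving the dirty buffer in the cache.
 */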
264 
265 /*
266  * Release a buffer.
267  * Even if the buffer is dirty, no I/O is started.
268  */
269 brelse(bp)
270 	register struct buf *bp;
271 {
272 	register struct buf *flist;
273 	int s;
274 
275 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
276 	/*
277 	 * If a process is waiting for the buffer, or
278 	 * is waiting for a free buffer, awaken it.
279 	 */
280 	if (bp->b_flags & B_WANTED)
281 		wakeup((caddr_t)bp);
282 	if (bfreelist[0].b_flags & B_WANTED) {
283 		bfreelist[0].b_flags &= ~B_WANTED;
284 		wakeup((caddr_t)bfreelist);
285 	}
286 	/*
287 	 * Retry I/O for locked buffers rather than invalidating them.
288 	 */
289 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
290 		bp->b_flags &= ~B_ERROR;
291 	/*
292 	 * Disassociate buffers that are no longer valid.
293 	 */
294 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
295 		bp->b_flags |= B_INVAL;
296 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
297 		if (bp->b_vp)
298 			brelvp(bp);
299 		bp->b_flags &= ~B_DELWRI;
300 	}
301 	/*
302 	 * Stick the buffer back on a free list.
303 	 */
304 	s = splbio();
305 	if (bp->b_bufsize <= 0) {
306 		/* block has no buffer ... put at front of unused buffer list */
307 		flist = &bfreelist[BQ_EMPTY];
308 		binsheadfree(bp, flist);
309 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
310 		/* block has no info ... put at front of most free list */
311 		flist = &bfreelist[BQ_AGE];
312 		binsheadfree(bp, flist);
313 	} else {
314 		if (bp->b_flags & B_LOCKED)
315 			flist = &bfreelist[BQ_LOCKED];
316 		else if (bp->b_flags & B_AGE)
317 			flist = &bfreelist[BQ_AGE];
318 		else
319 			flist = &bfreelist[BQ_LRU];
320 		binstailfree(bp, flist);
321 	}
322 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
323 	splx(s);
324 }
325 
326 /*
327  * Check to see if a block is currently memory resident.
328  */
329 incore(vp, blkno)
330 	struct vnode *vp;
331 	daddr_t blkno;
332 {
333 	register struct buf *bp;
334 	register struct buf *dp;
335 
336 	dp = BUFHASH(vp, blkno);
337 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
338 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
339 		    (bp->b_flags & B_INVAL) == 0)
340 			return (1);
341 	return (0);
342 }
343 
344 /*
345  * Check to see if a block is currently memory resident.
346  * If it is resident, return it. If it is not resident,
347  * allocate a new buffer and assign it to the block.
348  */
349 struct buf *
350 getblk(vp, blkno, size)
351 	register struct vnode *vp;
352 	daddr_t blkno;
353 	int size;
354 {
355 	register struct buf *bp, *dp;
356 	int s;
357 
358 	if (size > MAXBSIZE)
359 		panic("getblk: size too big");
360 	/*
361 	 * Search the cache for the block. If the buffer is found,
362 	 * but it is currently locked, then we must wait for it to
363 	 * become available.
364 	 */
365 	dp = BUFHASH(vp, blkno);
366 loop:
367 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
368 		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
369 		    (bp->b_flags & B_INVAL))
370 			continue;
371 		s = splbio();
372 		if (bp->b_flags & B_BUSY) {
373 			bp->b_flags |= B_WANTED;
374 			sleep((caddr_t)bp, PRIBIO + 1);
375 			splx(s);
376 			goto loop;
377 		}
378 		bremfree(bp);
379 		bp->b_flags |= B_BUSY;
380 		splx(s);
381 		if (bp->b_bcount != size) {
382 			printf("getblk: stray size\n");
383 			bp->b_flags |= B_INVAL;
384 			bwrite(bp);
385 			goto loop;
386 		}
387 		bp->b_flags |= B_CACHE;
388 		return (bp);
389 	}
390 	bp = getnewbuf();
391 	bremhash(bp);
392 	bgetvp(vp, bp);
393 	bp->b_bcount = 0;
394 	bp->b_lblkno = blkno;
395 	bp->b_blkno = blkno;
396 	bp->b_error = 0;
397 	bp->b_resid = 0;
398 	binshash(bp, dp);
399 	allocbuf(bp, size);
400 	return (bp);
401 }
402 
403 /*
404  * Allocate a buffer.
405  * The caller will assign it to a block.
406  */
407 struct buf *
408 geteblk(size)
409 	int size;
410 {
411 	register struct buf *bp, *flist;
412 
413 	if (size > MAXBSIZE)
414 		panic("geteblk: size too big");
415 	bp = getnewbuf();
416 	bp->b_flags |= B_INVAL;
417 	bremhash(bp);
418 	flist = &bfreelist[BQ_AGE];
419 	bp->b_bcount = 0;
420 	bp->b_error = 0;
421 	bp->b_resid = 0;
422 	binshash(bp, flist);
423 	allocbuf(bp, size);
424 	return (bp);
425 }
426 
427 /*
428  * Expand or contract the actual memory allocated to a buffer.
429  * Space is stolen from, or returned to, other buffers as needed.
430  */
431 allocbuf(tp, size)
432 	register struct buf *tp;
433 	int size;
434 {
435 	register struct buf *bp, *ep;
436 	int sizealloc, take, s;
437 
438 	sizealloc = roundup(size, CLBYTES);
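	/*
	 * Buffer memory is managed in whole clusters; for example
	 * (illustrative figure only), with CLBYTES of 4096 a request
	 * for 6144 bytes yields sizealloc = 8192.
	 */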
439 	/*
440 	 * Buffer size does not change
441 	 */
442 	if (sizealloc == tp->b_bufsize)
443 		goto out;
444 	/*
445 	 * Buffer size is shrinking.
446 	 * Place excess space in a buffer header taken from the
447 	 * BQ_EMPTY buffer list and placed on the "most free" list.
448 	 * If no extra buffer headers are available, leave the
449 	 * extra space in the present buffer.
450 	 */
451 	if (sizealloc < tp->b_bufsize) {
452 		ep = bfreelist[BQ_EMPTY].av_forw;
453 		if (ep == &bfreelist[BQ_EMPTY])
454 			goto out;
455 		s = splbio();
456 		bremfree(ep);
457 		ep->b_flags |= B_BUSY;
458 		splx(s);
459 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
460 		    (int)tp->b_bufsize - sizealloc);
461 		ep->b_bufsize = tp->b_bufsize - sizealloc;
462 		tp->b_bufsize = sizealloc;
463 		ep->b_flags |= B_INVAL;
464 		ep->b_bcount = 0;
465 		brelse(ep);
466 		goto out;
467 	}
468 	/*
469 	 * More buffer space is needed. Get it out of buffers on
470 	 * the "most free" list, placing the empty headers on the
471 	 * BQ_EMPTY buffer header list.
472 	 */
473 	while (tp->b_bufsize < sizealloc) {
474 		take = sizealloc - tp->b_bufsize;
475 		bp = getnewbuf();
476 		if (take >= bp->b_bufsize)
477 			take = bp->b_bufsize;
478 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
479 		    &tp->b_un.b_addr[tp->b_bufsize], take);
480 		tp->b_bufsize += take;
481 		bp->b_bufsize = bp->b_bufsize - take;
482 		if (bp->b_bcount > bp->b_bufsize)
483 			bp->b_bcount = bp->b_bufsize;
484 		if (bp->b_bufsize <= 0) {
485 			bremhash(bp);
486 			binshash(bp, &bfreelist[BQ_EMPTY]);
487 			bp->b_dev = NODEV;
488 			bp->b_error = 0;
489 			bp->b_flags |= B_INVAL;
490 		}
491 		brelse(bp);
492 	}
493 out:
494 	tp->b_bcount = size;
495 	return (1);
496 }
497 
498 /*
499  * Find a buffer which is available for use.
500  * Select something from a free list.
501  * Preference is to AGE list, then LRU list.
502  */
503 struct buf *
504 getnewbuf()
505 {
506 	register struct buf *bp, *dp;
507 	register struct ucred *cred;
508 	int s;
509 
510 #ifdef LFS
511 	lfs_flush();
512 #endif
513 loop:
514 	s = splbio();
515 	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
516 		if (dp->av_forw != dp)
517 			break;
518 	if (dp == bfreelist) {		/* no free blocks */
519 		dp->b_flags |= B_WANTED;
520 		sleep((caddr_t)dp, PRIBIO + 1);
521 		splx(s);
522 		goto loop;
523 	}
524 	bp = dp->av_forw;
525 	bremfree(bp);
526 	bp->b_flags |= B_BUSY;
527 	splx(s);
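	/*
	 * A delayed-write buffer cannot be reused until its contents
	 * reach disk: start the write asynchronously and go pick
	 * another buffer.
	 */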
528 	if (bp->b_flags & B_DELWRI) {
529 		(void) bawrite(bp);
530 		goto loop;
531 	}
532 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
533 	if (bp->b_vp)
534 		brelvp(bp);
535 	if (bp->b_rcred != NOCRED) {
536 		cred = bp->b_rcred;
537 		bp->b_rcred = NOCRED;
538 		crfree(cred);
539 	}
540 	if (bp->b_wcred != NOCRED) {
541 		cred = bp->b_wcred;
542 		bp->b_wcred = NOCRED;
543 		crfree(cred);
544 	}
545 	bp->b_flags = B_BUSY;
546 	bp->b_dirtyoff = bp->b_dirtyend = 0;
547 	bp->b_validoff = bp->b_validend = 0;
548 	return (bp);
549 }
550 
551 /*
552  * Wait for I/O to complete.
553  *
554  * Extract and return any errors associated with the I/O.
555  * If the error flag is set, but no specific error is
556  * given, return EIO.
557  */
558 biowait(bp)
559 	register struct buf *bp;
560 {
561 	int s;
562 
563 	s = splbio();
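	/*
	 * splbio() keeps biodone() (which runs at interrupt level)
	 * from slipping in between the test of B_DONE and the sleep.
	 */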
564 	while ((bp->b_flags & B_DONE) == 0)
565 		sleep((caddr_t)bp, PRIBIO);
566 	splx(s);
567 	if ((bp->b_flags & B_ERROR) == 0)
568 		return (0);
569 	if (bp->b_error)
570 		return (bp->b_error);
571 	return (EIO);
572 }
573 
574 /*
575  * Mark I/O complete on a buffer.
576  *
577  * If a callback has been requested, e.g. the pageout
578  * daemon, do so. Otherwise, awaken waiting processes.
579  */
580 void
581 biodone(bp)
582 	register struct buf *bp;
583 {
584 
585 	if (bp->b_flags & B_DONE)
586 		panic("dup biodone");
587 	bp->b_flags |= B_DONE;
588 	if ((bp->b_flags & B_READ) == 0)
589 		vwakeup(bp);
590 	if (bp->b_flags & B_CALL) {
591 		bp->b_flags &= ~B_CALL;
592 		(*bp->b_iodone)(bp);
593 		return;
594 	}
595 	if (bp->b_flags & B_ASYNC)
596 		brelse(bp);
597 	else {
598 		bp->b_flags &= ~B_WANTED;
599 		wakeup((caddr_t)bp);
600 	}
601 }
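
/*
 * Illustrative sketch of the B_CALL completion-callback interface
 * used by asynchronous callers such as the pageout daemon; the
 * names example_start and example_done are placeholders.  Note
 * that biodone() does not release a B_CALL buffer itself; the
 * callback is responsible for disposing of it.
 */
#ifdef notdef
example_done(bp)
	register struct buf *bp;
{

	/* Runs from biodone() when the I/O finishes. */
	brelse(bp);
}

example_start(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_CALL | B_ASYNC;
	bp->b_iodone = example_done;
	VOP_STRATEGY(bp);
}
#endif /* notdef */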
602