xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 49459)
/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	7.40 (Berkeley) 05/08/91
 */

#include "param.h"
#include "proc.h"
#include "buf.h"
#include "vnode.h"
#include "specdev.h"
#include "mount.h"
#include "trace.h"
#include "resourcevar.h"

/*
 * Initialize buffers and hash links for buffers.
 */
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
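	/*
	 * Distribute the bufpages clusters as evenly as possible:
	 * every buffer gets "base" clusters and the first "residual"
	 * buffers get one extra.
	 */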
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_BUSY|B_INVAL;
		brelse(bp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
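	/*
	 * B_DONE or B_DELWRI means the buffer already holds valid
	 * data, so the read is a cache hit and no I/O is needed.
	 */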
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the specified
 * read-ahead block.
 */
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there is a read-ahead block, start I/O on it too.
	 */
	if (!incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
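			/*
			 * The read-ahead block is already valid; the
			 * caller never sees this buffer, so release it.
			 */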
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
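	/*
	 * The vnode's count of writes in progress is also updated at
	 * interrupt level when the I/O completes (vwakeup), so raise
	 * the priority around the increment.
	 */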
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
		error = 0;
	}
	return (error);
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
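		/*
		 * Move the buffer onto the vnode's dirty buffer list
		 * now that it holds modified data.
		 */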
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
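	/*
	 * Clear the per-operation flags; state such as B_DELWRI and
	 * B_LOCKED persists across the release.
	 */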
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
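	/* BUFHASH maps (vp, blkno) to the hash chain headed by dp. */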
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
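	/*
	 * The buffer is not associated with any block, so hash it on
	 * the BQ_AGE header, which serves as a dummy hash chain.
	 */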
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
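		/*
		 * Shift the surplus pages off the tail of tp into the
		 * spare header ep, then release ep as an invalid buffer.
		 */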
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
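	/*
	 * A dirty buffer cannot be reused until its contents reach
	 * disk; push it out asynchronously and look for another.
	 */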
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

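	/*
	 * Block I/O interrupts so that the B_DONE test and the sleep
	 * are atomic with respect to biodone.
	 */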
	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
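	/*
	 * No callback: release an asynchronous buffer outright;
	 * otherwise wake any process sleeping in biowait.
	 */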
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
587