xref: /csrg-svn/sys/kern/vfs_bio.c (revision 30749)
/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	7.1.1.1 (Berkeley) 04/02/87
 */

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
#ifdef SECSIZE
bread(dev, blkno, size, secsize)
#else SECSIZE
bread(dev, blkno, size)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno;
	int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
#ifdef SECSIZE
	bp = getblk(dev, blkno, size, secsize);
#else SECSIZE
	bp = getblk(dev, blkno, size);
#endif SECSIZE
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}


/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
struct buf *
#ifdef SECSIZE
breada(dev, blkno, size, secsize, rablkno, rabsize)
#else SECSIZE
breada(dev, blkno, size, rablkno, rabsize)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno; int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
#ifdef SECSIZE
		bp = getblk(dev, blkno, size, secsize);
#else SECSIZE
		bp = getblk(dev, blkno, size);
#endif SECSIZE
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
#ifdef SECSIZE
		rabp = getblk(dev, rablkno, rabsize, secsize);
#else SECSIZE
		rabp = getblk(dev, rablkno, rabsize);
#endif SECSIZE
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
#ifdef SECSIZE
		return (bread(dev, blkno, size, secsize));
#else SECSIZE
		return (bread(dev, blkno, size));
#endif SECSIZE
	biowait(bp);
	return (bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for this buffer, or for a
	 * buffer to become free, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
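	/*
	 * After an I/O error, a locked buffer keeps its identity so the
	 * operation can be retried; any other buffer is dissociated from
	 * its device so the stale contents cannot be found in the cache.
	 */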
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;  		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

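/*
 * If the block is already in core, read it (via bread) and return the
 * buffer; otherwise return 0 without initiating any I/O.
 */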
struct buf *
#ifdef SECSIZE
baddr(dev, blkno, size, secsize)
#else SECSIZE
baddr(dev, blkno, size)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno;
	int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{

	if (incore(dev, blkno))
#ifdef SECSIZE
		return (bread(dev, blkno, size, secsize));
#else SECSIZE
		return (bread(dev, blkno, size));
#endif SECSIZE
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
#ifdef SECSIZE
getblk(dev, blkno, size, secsize)
#else SECSIZE
getblk(dev, blkno, size)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno;
	int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow. This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute. mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
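		/* note for the caller that the block was found in the cache */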
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
#ifdef SECSIZE
	bp->b_blksize = secsize;
#endif SECSIZE
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
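	/*
	 * Hash the buffer onto the BQ_AGE free-list head, which doubles
	 * as a dummy hash chain for buffers with no device association.
	 */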
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
#ifdef SECSIZE
	bp->b_blksize = DEV_BSIZE;
#endif SECSIZE
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 * If the space cannot be obtained, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping
	 * previous I/O is dispensed with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
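	/*
	 * The buffer is growing: its current contents no longer cover
	 * the whole block, so clear B_DONE to force the caller (e.g.
	 * bread) to fill it again.
	 */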
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI). A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
#ifdef SECSIZE
	last = start + size/bp->b_blksize - 1;
#else SECSIZE
	last = start + btodb(size) - 1;
#endif SECSIZE
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
#ifdef SECSIZE
		    ep->b_blkno + ep->b_bcount/ep->b_blksize <= start)
#else SECSIZE
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
#endif SECSIZE
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
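	/*
	 * If the victim buffer holds a delayed write, push it out
	 * asynchronously and pick another buffer; its contents must
	 * reach the disk before the buffer can be reused.
	 */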
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

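	/*
	 * Block I/O interrupts so that the test of B_DONE and the
	 * sleep are atomic with respect to biodone().
	 */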
	s = splbio();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Ensure that no part of a specified block is in an incore buffer.
#ifdef SECSIZE
 * "size" is given in device blocks (the units of b_blkno).
#endif SECSIZE
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
#ifdef SECSIZE
	int size;
#else SECSIZE
	long size;
#endif SECSIZE
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
#ifdef SECSIZE
	last = start + size - 1;
#else SECSIZE
	last = start + btodb(size) - 1;
#endif SECSIZE
	dp = BUFHASH(dev, blkno);
loop:
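	/*
	 * Walk the hash chain for the device.  Busy buffers that overlap
	 * the range are waited for, and dirty ones are written out
	 * synchronously; either case restarts the scan.
	 */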
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
#ifdef SECSIZE
		    ep->b_blkno + ep->b_bcount / ep->b_blksize <= start)
#else SECSIZE
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
#endif SECSIZE
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
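	/*
	 * Scan every free list for delayed-write buffers.  Each one found
	 * is pushed out asynchronously, and the scan is restarted from the
	 * top because the lists may change while the write is started.
	 */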
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			splx(s);
			goto loop;
		}
	}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized code.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)
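/* treat each hash header as a dummy buffer so its b_forw chain can be walked */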

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}