xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 30751)
1 /*
2  * Copyright (c) 1982, 1986 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  *
6  *	@(#)vfs_cluster.c	7.2 (Berkeley) 04/02/87
7  */

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}

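/*
 * Usage sketch (illustrative, not from this file): read-path code
 * normally pairs bread with brelse once it is done with the data;
 * "ip", "bn", "dst" and "n" are hypothetical names.
 *
 *	bp = bread(ip->i_dev, bn, size);
 *	if (u.u_error) {
 *		brelse(bp);
 *		return;
 *	}
 *	bcopy(bp->b_un.b_addr, dst, (unsigned)n);
 *	brelse(bp);
 */
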
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breada: rabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}

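/*
 * The read-ahead buffer is never handed to the caller: if its read
 * was started above, B_ASYNC is set and biodone will brelse it when
 * the transfer completes.
 */
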
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register int flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed" and asynchronous, mark the
	 * buffer B_AGE so that it is reclaimed promptly once the
	 * i/o completes.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

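/*
 * A buffer marked B_DELWRI here is not pushed to disk immediately;
 * it typically goes out later, when getnewbuf reclaims it or when
 * bflush (called from update or umount) forces it out.
 */
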
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

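/*
 * To summarize the three write paths: bwrite starts the I/O and
 * waits for it; bawrite starts the I/O and returns at once;
 * bdwrite starts no I/O at all, only marking the buffer dirty.
 */
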
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register int s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for this buffer, or
	 * if anyone is waiting for any buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR) {
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;  		/* no assoc */
	}

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

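/*
 * If the block is already in core, read it (normally a pure cache
 * hit) and return its buffer; otherwise return 0 without starting
 * any I/O.
 */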
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow. This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute. mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

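/*
 * Sketch of a typical writer, which wants the buffer but not its
 * old contents ("bn" and the fill step are illustrative):
 *
 *	bp = getblk(dev, bn, size);
 *	bzero(bp->b_un.b_addr, (unsigned)size);
 *	...fill in the new contents...
 *	bdwrite(bp);
 */
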
/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

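/*
 * A geteblk buffer carries no cache identity (NODEV, B_INVAL), so
 * it serves as scratch space; the caller simply brelse's it when done.
 */
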
/*
 * Allocate space associated with a buffer.
 * If space cannot be obtained, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping previous
	 * I/O has been disposed of.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI). A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
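	/*
	 * The overlap test below, by example: a buffer of 8192 bytes
	 * at b_blkno 32 covers device blocks 32..47 (btodb(8192) == 16
	 * with 512-byte device blocks), so any ep with ep->b_blkno <=
	 * last (47) and ep->b_blkno + btodb(ep->b_bcount) > start (32)
	 * overlaps it.
	 */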
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

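/*
 * Note that a dirty (B_DELWRI) victim is written out asynchronously
 * and the search is restarted, so getnewbuf may push several delayed
 * writes to disk before it finds a clean buffer to hand back.
 */
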
/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

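/*
 * biodone is called from driver completion (interrupt) code when a
 * transfer finishes; for B_ASYNC buffers this is where the buffer
 * finally returns to the free lists via brelse.
 */
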
/*
 * Ensure that no part of a specified block is in an incore buffer.
 * "size" is given in bytes and is converted with btodb to device
 * blocks (the units of b_blkno).
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				bwrite(bp);
				splx(s);
				goto loop;
			}
		}
	splx(s);
}

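/*
 * Each write restarts the scan from the top: notavail takes the
 * buffer off its free list, invalidating the scan position, and
 * the write itself may sleep and reshuffle the lists.
 */
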
/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error) == 0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in core blocks belonging to closed or umounted filesystem
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty,
 * then properly flush the queues. Until that happy day, this suffices
 * for correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
#undef dp