xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 17098)
/*	vfs_cluster.c	6.4	84/08/29	*/

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return(bp);
}
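/*
 * Usage sketch: the typical calling pattern for bread().  A caller reads
 * one block, copies out some bytes, and must release the buffer with
 * brelse() when done.  The function name example_bread_use is hypothetical,
 * and the data is assumed to be reachable through b_un.b_addr as declared
 * in the buf.h of this era.
 */
example_bread_use(dev, blkno, size, cp)
	dev_t dev;
	daddr_t blkno;
	int size;
	caddr_t cp;
{
	register struct buf *bp;
	int error;

	bp = bread(dev, blkno, size);		/* buffer comes back busy */
	error = geterror(bp);			/* 0 unless B_ERROR is set */
	if (error == 0)
		bcopy(bp->b_un.b_addr, cp, (unsigned)size);
	brelse(bp);				/* always hand the buffer back */
	return (error);
}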

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), blkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}
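/*
 * Usage sketch: how a caller might use breada() to pick up one block and
 * prefetch its successor.  The helper name example_breada_use is
 * hypothetical, and the assumption that the next block lives btodb(bsize)
 * sectors further on is only true for a contiguous layout; a real file
 * system would derive both block numbers from its own block map.
 */
struct buf *
example_breada_use(dev, lbn, bsize)
	dev_t dev;
	daddr_t lbn;
	int bsize;
{

	/*
	 * Read block lbn now and start an asynchronous read of the block
	 * after it, so it is (hopefully) resident by the time it is wanted.
	 * The returned buffer must still be brelse'd by the caller.
	 */
	return (breada(dev, lbn, bsize, lbn + btodb(bsize), bsize));
}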

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}
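/*
 * Usage sketch: a synchronous read-modify-write of one block.  The buffer
 * returned by bread() is handed to bwrite(), which does the brelse()
 * itself once the write completes.  The helper name, the byte offset,
 * and the value stored are all hypothetical.
 */
example_bwrite_use(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = bread(dev, blkno, size);		/* read the current contents */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	bp->b_un.b_addr[0] = 0;			/* hypothetical in-core change */
	bwrite(bp);				/* write it back; bwrite releases bp */
}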

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
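/*
 * Usage sketch: choosing among the three write flavors.  A caller that
 * has just filled a buffer can wait for the write (bwrite), start it and
 * keep running (bawrite), or simply mark it dirty and let it trickle out
 * later (bdwrite).  The helper name and the "async"/"delay" flags are
 * made up for the example.
 */
example_write_flavor(bp, async, delay)
	register struct buf *bp;
	int async, delay;
{

	if (delay)
		bdwrite(bp);		/* mark B_DELWRI, write later */
	else if (async)
		bawrite(bp);		/* start i/o, don't wait */
	else
		bwrite(bp);		/* write and wait for completion */
}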

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}
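/*
 * Usage sketch: releasing a buffer whose contents are unlikely to be
 * wanted again soon.  Setting B_AGE before brelse() puts the buffer on
 * the AGE free list, which getnewbuf() reclaims ahead of the LRU list.
 * The helper name is hypothetical.
 */
example_brelse_age(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_AGE;		/* reclaim this one before LRU data */
	brelse(bp);
}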

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}
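/*
 * Usage sketch: baddr() only returns a buffer when the block is already
 * cached, so a caller that merely wants to update an in-core copy
 * (without forcing a disk read) can use it like this.  The helper name
 * and the modification shown are hypothetical.
 */
example_baddr_use(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = baddr(dev, blkno, size);
	if (bp == NULL)
		return;			/* not cached; nothing to patch */
	bp->b_un.b_addr[0] = 0;		/* touch the cached copy */
	bdwrite(bp);			/* delayed write; releases the buffer */
}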

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))	/* XXX */
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}
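/*
 * Usage sketch: overwriting a whole block.  Because the caller supplies
 * every byte of the new contents, there is no need to read the old data
 * first, so getblk() is used instead of bread() and the buffer is simply
 * filled and scheduled for writing.  The helper name and the source
 * pointer are hypothetical.
 */
example_getblk_use(dev, blkno, size, cp)
	dev_t dev;
	daddr_t blkno;
	int size;
	caddr_t cp;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);		/* no read is started */
	bcopy(cp, bp->b_un.b_addr, (unsigned)size);
	bdwrite(bp);				/* schedule the write */
}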

/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}
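/*
 * Usage sketch: using geteblk() to borrow a scratch buffer that is not
 * bound to any device block, for instance to stage data for a transfer.
 * The buffer must be handed back with brelse() when the caller is done.
 * The helper name is hypothetical.
 */
example_geteblk_use(size)
	int size;
{
	register struct buf *bp;

	bp = geteblk(size);
	/* ... use bp->b_un.b_addr as size bytes of scratch space ... */
	brelse(bp);
}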

/*
 * Allocate space associated with a buffer.
 * If we can't get the space, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that all overlapping previous I/O
	 * has been disposed of.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (as indicated
	 * by B_DELWRI). A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
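/*
 * Usage sketch: completion callbacks.  A caller that wants to be notified
 * from biodone() rather than sleep in biowait() can set B_CALL and point
 * b_iodone at its own routine; biodone() then invokes that routine instead
 * of releasing the buffer, so the routine must do the brelse() (or requeue
 * the buffer) itself.  The names example_iodone and example_async_read are
 * hypothetical.
 */
example_iodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_ERROR)
		printf("example_iodone: error %d\n", geterror(bp));
	brelse(bp);
}

example_async_read(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	if (bp->b_flags & B_DONE) {		/* already cached */
		example_iodone(bp);
		return;
	}
	bp->b_flags |= B_READ|B_ASYNC|B_CALL;
	bp->b_iodone = example_iodone;
	(*bdevsw[major(dev)].d_strategy)(bp);	/* completion arrives via biodone() */
}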

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}
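/*
 * Usage sketch: a caller about to do raw (unbuffered) I/O over a range of
 * a block device can call blkflush() first, so that any delayed-write data
 * the cache holds for that range reaches the disk before the raw transfer
 * looks at it.  The helper name is hypothetical.
 */
example_blkflush_use(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{

	blkflush(dev, blkno, size);
	/* ... proceed with the raw transfer covering [blkno, blkno + btodb(size)) ... */
}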

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			splx(s);
			goto loop;
		}
	}
	splx(s);
}
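/*
 * Usage sketch: the two typical shapes of a bflush() caller.  A periodic
 * update()/sync-style routine pushes every delayed write in the system,
 * while an unmount path flushes just the file system's own device and
 * then drops its cached blocks.  The function names are hypothetical
 * stand-ins for those callers.
 */
example_sync_all()
{

	bflush(NODEV);			/* every dirty buffer, any device */
}

example_unmount_flush(dev)
	dev_t dev;
{

	bflush(dev);			/* dirty buffers for this device */
	binval(dev);			/* then invalidate its cached blocks */
}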

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);
	return (error);
}
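/*
 * Usage sketch: how a caller that cannot rely on biowait() filling in
 * u.u_error might report a failed transfer, with a zero b_error mapped
 * to the generic EIO.  The helper name is hypothetical.
 */
example_check_error(bp)
	register struct buf *bp;
{
	register int error;

	error = geterror(bp);		/* 0, bp->b_error, or EIO */
	brelse(bp);
	return (error);
}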

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
595