/* xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 14221) */
/*	vfs_cluster.c	6.1	83/07/29	*/

#include "../machine/pte.h"

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}
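
/*
 * Illustrative caller pattern (a sketch for the reader, not code from
 * this file): a file system typically fetches a block with bread(),
 * checks for an error, uses the data through the buffer, and then
 * releases it with brelse() (or bdwrite() if it modified the block):
 *
 *	bp = bread(dev, bn, size);
 *	if (bp->b_flags & B_ERROR) {
 *		brelse(bp);
 *		return;
 *	}
 *	... use bp->b_un.b_addr ...
 *	brelse(bp);
 */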

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
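
/*
 * The three write entry points above differ only in when the I/O
 * happens and who waits for it: bwrite() queues the write and, unless
 * the caller set B_ASYNC, waits for completion and releases the buffer;
 * bdwrite() defers the write by marking the buffer B_DELWRI and
 * releasing it (except to tape, where ordering forces bawrite());
 * bawrite() sets B_ASYNC and starts the write without waiting, so
 * biodone() will release the buffer when the transfer completes.
 */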

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone is waiting for this buffer, or someone is
	 * waiting for any free buffer, wake them up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;  		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

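/*
 * Return the buffer for the given block if it is already in core
 * (bread will then find it without doing any I/O); otherwise return
 * 0 without initiating a read.
 */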
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))	/* XXX */
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 * Returns 1 on success.  If the buffer must first be written out,
 * or the space cannot be obtained, the buffer is released and 0 is
 * returned; the caller must then start over.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping
	 * previous I/O has been dealt with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI). A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If a completion routine has been requested (e.g. by the
 * pageout daemon), call it.  Otherwise release the buffer
 * if the I/O was asynchronous, or wake up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
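
/*
 * Driver side of this interface (an illustrative sketch, not code from
 * this file): a block device's interrupt routine records any error in
 * the buffer and then calls biodone() when the transfer finishes:
 *
 *	bp->b_flags |= B_ERROR;		(only if the transfer failed)
 *	bp->b_error = EIO;
 *	biodone(bp);
 *
 * geterror() below is what turns B_ERROR/b_error back into an errno
 * for the process waiting in biowait().
 */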

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			splx(s);
			goto loop;
		}
	}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
593