xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 6914)
1 /*	vfs_cluster.c	4.30	82/05/22	*/
2 
3 /* merged into kernel:	 @(#)bio.c 2.3 4/8/82 */
4 
5 #include "../h/param.h"
6 #include "../h/systm.h"
7 #include "../h/dir.h"
8 #include "../h/user.h"
9 #include "../h/buf.h"
10 #include "../h/conf.h"
11 #include "../h/proc.h"
12 #include "../h/seg.h"
13 #include "../h/pte.h"
14 #include "../h/vm.h"
15 #include "../h/trace.h"
16 
17 /*
18  * The following several routines allocate and free
19  * buffers with various side effects.  In general the
20  * arguments to an allocate routine are a device and
21  * a block number, and the value is a pointer to
22  * the buffer header; the buffer is marked "busy"
23  * so that no one else can touch it.  If the block was
24  * already in core, no I/O need be done; if it is
25  * already busy, the process waits until it becomes free.
26  * The following routines allocate a buffer:
27  *	getblk
28  *	bread
29  *	breada
30  *	baddr	(if it is incore)
31  * Eventually the buffer must be released, possibly with the
32  * side effect of writing it out, by using one of
33  *	bwrite
34  *	bdwrite
35  *	bawrite
36  *	brelse
37  */
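/*
 * Editorial sketch (not part of the original source): the usual pairing
 * of the routines listed above, with hypothetical dev/bn/bsize values.
 * A buffer obtained from bread() is busy and must eventually be handed
 * back through one of the release routines; b_un.b_addr is the data
 * area declared in buf.h.
 *
 *	register struct buf *bp;
 *
 *	bp = bread(dev, bn, bsize);
 *	if ((bp->b_flags & B_ERROR) == 0)
 *		... inspect or copy the data at bp->b_un.b_addr ...
 *	brelse(bp);
 *
 * Had the block been modified, it would instead be released with
 * bwrite() (synchronous), bawrite() (asynchronous) or bdwrite()
 * (delayed write), as described above.
 */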
38 
39 struct	buf bfreelist[BQUEUES];
40 struct	buf bswlist, *bclnlist;
41 
42 #define	BUFHSZ	63
43 #define RND	(MAXBSIZE/DEV_BSIZE)
44 struct	bufhd bufhash[BUFHSZ];
45 #define	BUFHASH(dev, dblkno)	\
46 	((struct buf *)&bufhash[((int)(dev)+(((int)(dblkno))/RND)) % BUFHSZ])
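/*
 * Editorial note (not part of the original source): BUFHASH folds the
 * device number and the block number, scaled down by RND (the number of
 * DEV_BSIZE sectors in a maximal-size buffer), into one of the BUFHSZ
 * buckets.  For example, if MAXBSIZE were 8192 and DEV_BSIZE 512, then
 * RND == 16 and dev 5, block 100 would hash to (5 + 100/16) % 63 == 11,
 * i.e. &bufhash[11] treated as a struct buf chain head.
 */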
47 
48 /*
49  * Initialize hash links for buffers.
50  */
51 bhinit()
52 {
53 	register int i;
54 	register struct bufhd *bp;
55 
56 	for (bp = bufhash, i = 0; i < BUFHSZ; i++, bp++)
57 		bp->b_forw = bp->b_back = (struct buf *)bp;
58 }
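/*
 * Editorial note (not part of the original source): each hash bucket
 * starts out as an empty circular chain whose head points to itself;
 * "bp != dp" is exactly the termination test used by the chain scans
 * in incore(), getblk() and brealloc() below.
 */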
59 
60 /* #define	DISKMON	1 */
61 
62 #ifdef	DISKMON
63 struct {
64 	int	nbuf;
65 	long	nread;
66 	long	nreada;
67 	long	ncache;
68 	long	nwrite;
69 	long	bufcount[64];
70 } io_info;
71 #endif
72 
73 /* Fast in-line form of notavail() (see the UNFAST routine below):
74    unlink a buffer from the available list and mark it busy. */
75 #ifndef	UNFAST
76 #define	notavail(bp) \
77 { \
78 	int x = spl6(); \
79 	(bp)->av_back->av_forw = (bp)->av_forw; \
80 	(bp)->av_forw->av_back = (bp)->av_back; \
81 	(bp)->b_flags |= B_BUSY; \
82 	splx(x); \
83 }
84 #endif
85 
86 /*
87  * Read in (if necessary) the block and return a buffer pointer.
88  */
89 struct buf *
90 bread(dev, blkno, size)
91 	dev_t dev;
92 	daddr_t blkno;
93 	int size;
94 {
95 	register struct buf *bp;
96 
97 	bp = getblk(dev, blkno, size);
98 	if (bp->b_flags&B_DONE) {
99 #ifdef	TRACE
100 		trace(TR_BREADHIT, dev, blkno);
101 #endif
102 #ifdef	DISKMON
103 		io_info.ncache++;
104 #endif
105 		return(bp);
106 	}
107 	bp->b_flags |= B_READ;
108 	(*bdevsw[major(dev)].d_strategy)(bp);
109 #ifdef	TRACE
110 	trace(TR_BREADMISS, dev, blkno);
111 #endif
112 #ifdef	DISKMON
113 	io_info.nread++;
114 #endif
115 	u.u_vm.vm_inblk++;		/* pay for read */
116 	iowait(bp);
117 	return(bp);
118 }
119 
120 /*
121  * Read in the block, like bread, but also start I/O on the
122  * read-ahead block (which is not allocated to the caller).
123  */
124 struct buf *
125 breada(dev, blkno, rablkno, size)
126 	dev_t dev;
127 	daddr_t blkno, rablkno;
128 	int size;
129 {
130 	register struct buf *bp, *rabp;
131 
132 	bp = NULL;
133 	if (!incore(dev, blkno)) {
134 		bp = getblk(dev, blkno, size);
135 		if ((bp->b_flags&B_DONE) == 0) {
136 			bp->b_flags |= B_READ;
137 			(*bdevsw[major(dev)].d_strategy)(bp);
138 #ifdef	TRACE
139 			trace(TR_BREADMISS, dev, blkno);
140 #endif
141 #ifdef	DISKMON
142 			io_info.nread++;
143 #endif
144 			u.u_vm.vm_inblk++;		/* pay for read */
145 		}
146 #ifdef	TRACE
147 		else
148 			trace(TR_BREADHIT, dev, blkno);
149 #endif
150 	}
151 	if (rablkno && !incore(dev, rablkno)) {
152 		rabp = getblk(dev, rablkno, size);
153 		if (rabp->b_flags & B_DONE) {
154 			brelse(rabp);
155 #ifdef	TRACE
156 			trace(TR_BREADHITRA, dev, rablkno);
157 #endif
158 		} else {
159 			rabp->b_flags |= B_READ|B_ASYNC;
160 			(*bdevsw[major(dev)].d_strategy)(rabp);
161 #ifdef	TRACE
162 			trace(TR_BREADMISSRA, dev, rablkno);
163 #endif
164 #ifdef	DISKMON
165 			io_info.nreada++;
166 #endif
167 			u.u_vm.vm_inblk++;		/* pay in advance */
168 		}
169 	}
170 	if(bp == NULL)
171 		return(bread(dev, blkno, size));
172 	iowait(bp);
173 	return(bp);
174 }
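/*
 * Editorial sketch (not part of the original source): a sequential read
 * path might use breada() so that the next block's I/O is started while
 * the current one is consumed.  The names are hypothetical, and the
 * simple "bn + bsize/DEV_BSIZE" read-ahead hint assumes contiguously
 * allocated blocks.
 *
 *	bp = breada(dev, bn, bn + bsize / DEV_BSIZE, bsize);
 *	... use the data ...
 *	brelse(bp);
 */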
175 
176 /*
177  * Write the buffer, waiting for completion.
178  * Then release the buffer.
179  */
180 bwrite(bp)
181 register struct buf *bp;
182 {
183 	register flag;
184 
185 	flag = bp->b_flags;
186 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
187 #ifdef	DISKMON
188 	io_info.nwrite++;
189 #endif
190 	if ((flag&B_DELWRI) == 0)
191 		u.u_vm.vm_oublk++;		/* no one paid yet */
192 #ifdef	TRACE
193 	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
194 #endif
195 	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
196 	if ((flag&B_ASYNC) == 0) {
197 		iowait(bp);
198 		brelse(bp);
199 	} else if (flag & B_DELWRI)
200 		bp->b_flags |= B_AGE;
201 	else
202 		geterror(bp);
203 }
204 
205 /*
206  * Release the buffer, marking it so that if it is grabbed
207  * for another purpose it will be written out before being
208  * given up (e.g. when writing a partial block where it is
209  * assumed that another write for the same block will soon follow).
210  * This can't be done for magtape, since writes must be done
211  * in the same order as requested.
212  */
213 bdwrite(bp)
214 register struct buf *bp;
215 {
216 	register int flags;
217 
218 	if ((bp->b_flags&B_DELWRI) == 0)
219 		u.u_vm.vm_oublk++;		/* no one paid yet */
220 	flags = bdevsw[major(bp->b_dev)].d_flags;
221 	if(flags & B_TAPE)
222 		bawrite(bp);
223 	else {
224 		bp->b_flags |= B_DELWRI | B_DONE;
225 		brelse(bp);
226 	}
227 }
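/*
 * Editorial sketch (not part of the original source): the partial-block
 * update the comment above alludes to, with hypothetical names.  The
 * delayed-write buffer is pushed out later, when it is reclaimed in
 * getblk()/geteblk() or swept by bflush().
 *
 *	bp = bread(dev, bn, bsize);
 *	... copy the new bytes into bp->b_un.b_addr at the right offset ...
 *	bdwrite(bp);
 */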
228 
229 /*
230  * Release the buffer, start I/O on it, but don't wait for completion.
231  */
232 bawrite(bp)
233 register struct buf *bp;
234 {
235 
236 	bp->b_flags |= B_ASYNC;
237 	bwrite(bp);
238 }
239 
240 /*
241  * release the buffer, with no I/O implied.
242  */
243 brelse(bp)
244 register struct buf *bp;
245 {
246 	register struct buf *flist;
247 	register s;
248 
249 	if (bp->b_flags&B_WANTED)
250 		wakeup((caddr_t)bp);
251 	if (bfreelist[0].b_flags&B_WANTED) {
252 		bfreelist[0].b_flags &= ~B_WANTED;
253 		wakeup((caddr_t)bfreelist);
254 	}
255 	if (bp->b_flags&B_ERROR)
256 		if (bp->b_flags & B_LOCKED)
257 			bp->b_flags &= ~B_ERROR;	/* try again later */
258 		else
259 			bp->b_dev = NODEV;  		/* no assoc */
260 	s = spl6();
261 	if (bp->b_flags & (B_ERROR|B_INVAL)) {
262 		/* block has no info ... put at front of most free list */
263 		flist = &bfreelist[BQUEUES-1];
264 		flist->av_forw->av_back = bp;
265 		bp->av_forw = flist->av_forw;
266 		flist->av_forw = bp;
267 		bp->av_back = flist;
268 	} else {
269 		if (bp->b_flags & B_LOCKED)
270 			flist = &bfreelist[BQ_LOCKED];
271 		else if (bp->b_flags & B_AGE)
272 			flist = &bfreelist[BQ_AGE];
273 		else
274 			flist = &bfreelist[BQ_LRU];
275 		flist->av_back->av_forw = bp;
276 		bp->av_back = flist->av_back;
277 		flist->av_back = bp;
278 		bp->av_forw = flist;
279 	}
280 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
281 	splx(s);
282 }
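/*
 * Editorial note (not part of the original source): brelse() sorts
 * released buffers onto the free lists declared at the top of the file.
 * Buffers whose contents are useless (B_ERROR or B_INVAL) go to the
 * front of the "most free" list so they are reused first; the rest go
 * to the tail of BQ_LOCKED, BQ_AGE or BQ_LRU as their flags dictate.
 * getblk() and geteblk() reclaim from the other direction, scanning
 * from bfreelist[BQUEUES-1] down to, but not including, bfreelist[0],
 * so buffers on that first (locked) queue are never stolen.
 */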
283 
284 /*
285  * See if the block is associated with some buffer
286  * (mainly to avoid getting hung up on a wait in breada)
287  */
288 incore(dev, blkno)
289 dev_t dev;
290 daddr_t blkno;
291 {
292 	register struct buf *bp;
293 	register struct buf *dp;
294 
295 	dp = BUFHASH(dev, blkno);
296 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
297 		if (bp->b_blkno == blkno && bp->b_dev == dev &&
298 		    !(bp->b_flags & B_INVAL))
299 			return (1);
300 	return (0);
301 }
302 
303 struct buf *
304 baddr(dev, blkno, size)
305 	dev_t dev;
306 	daddr_t blkno;
307 	int size;
308 {
309 
310 	if (incore(dev, blkno))
311 		return (bread(dev, blkno, size));
312 	return (0);
313 }
314 
315 /*
316  * Assign a buffer for the given block.  If the appropriate
317  * block is already associated, return it; otherwise search
318  * for the oldest non-busy buffer and reassign it.
319  *
320  * We use splx here because this routine may be called
321  * on the interrupt stack during a dump, and we don't
322  * want to lower the ipl back to 0.
323  */
324 struct buf *
325 getblk(dev, blkno, size)
326 	dev_t dev;
327 	daddr_t blkno;
328 	int size;
329 {
330 	register struct buf *bp, *dp, *ep;
331 #ifdef	DISKMON
332 	register int i;
333 #endif
334 	int s;
335 
336 	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
337 		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
338 	dp = BUFHASH(dev, blkno);
339     loop:
340 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
341 		if (bp->b_blkno != blkno || bp->b_dev != dev ||
342 		    bp->b_flags&B_INVAL)
343 			continue;
344 		s = spl6();
345 		if (bp->b_flags&B_BUSY) {
346 			bp->b_flags |= B_WANTED;
347 			sleep((caddr_t)bp, PRIBIO+1);
348 			splx(s);
349 			goto loop;
350 		}
351 		splx(s);
352 #ifdef	DISKMON
353 		i = 0;
354 		dp = bp->av_forw;
355 		while ((dp->b_flags & B_HEAD) == 0) {
356 			i++;
357 			dp = dp->av_forw;
358 		}
359 		if (i<64)
360 			io_info.bufcount[i]++;
361 #endif
362 		notavail(bp);
363 		brealloc(bp, size);
364 		bp->b_flags |= B_CACHE;
365 		return(bp);
366 	}
367 	if (major(dev) >= nblkdev)
368 		panic("blkdev");
369 	s = spl6();
370 	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
371 		if (ep->av_forw != ep)
372 			break;
373 	if (ep == bfreelist) {		/* no free blocks at all */
374 		ep->b_flags |= B_WANTED;
375 		sleep((caddr_t)ep, PRIBIO+1);
376 		splx(s);
377 		goto loop;
378 	}
379 	splx(s);
380 	bp = ep->av_forw;
381 	notavail(bp);
382 	if (bp->b_flags & B_DELWRI) {
383 		bp->b_flags |= B_ASYNC;
384 		bwrite(bp);
385 		goto loop;
386 	}
387 #ifdef TRACE
388 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
389 #endif
390 	bp->b_flags = B_BUSY;
391 	bfree(bp);
392 	bp->b_back->b_forw = bp->b_forw;
393 	bp->b_forw->b_back = bp->b_back;
394 	bp->b_forw = dp->b_forw;
395 	bp->b_back = dp;
396 	dp->b_forw->b_back = bp;
397 	dp->b_forw = bp;
398 	bp->b_dev = dev;
399 	bp->b_blkno = blkno;
400 	brealloc(bp, size);
401 	return(bp);
402 }
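/*
 * Editorial sketch (not part of the original source): when the previous
 * contents of a block are irrelevant (e.g. a newly allocated file system
 * block), a caller can skip the read entirely: getblk() just associates
 * a buffer, clrbuf() zeros it, and one of the write routines pushes it
 * out.  Names here are hypothetical.
 *
 *	bp = getblk(dev, newbn, bsize);
 *	clrbuf(bp);
 *	... fill in the interesting part of bp->b_un.b_addr ...
 *	bawrite(bp);
 */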
403 
404 /*
405  * get an empty block,
406  * not assigned to any particular device
407  */
408 struct buf *
409 geteblk(size)
410 	int size;
411 {
412 	register struct buf *bp, *dp;
413 	int s;
414 
415 loop:
416 	s = spl6();
417 	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
418 		if (dp->av_forw != dp)
419 			break;
420 	if (dp == bfreelist) {		/* no free blocks */
421 		dp->b_flags |= B_WANTED;
422 		sleep((caddr_t)dp, PRIBIO+1);
423 		goto loop;
424 	}
425 	splx(s);
426 	bp = dp->av_forw;
427 	notavail(bp);
428 	if (bp->b_flags & B_DELWRI) {
429 		bp->b_flags |= B_ASYNC;
430 		bwrite(bp);
431 		goto loop;
432 	}
433 #ifdef TRACE
434 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
435 #endif
436 	bp->b_flags = B_BUSY|B_INVAL;
437 	bp->b_back->b_forw = bp->b_forw;
438 	bp->b_forw->b_back = bp->b_back;
439 	bp->b_forw = dp->b_forw;
440 	bp->b_back = dp;
441 	dp->b_forw->b_back = bp;
442 	dp->b_forw = bp;
443 	bp->b_dev = (dev_t)NODEV;
444 	bp->b_bcount = size;
445 	return(bp);
446 }
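/*
 * Editorial sketch (not part of the original source): a buffer from
 * geteblk() is marked B_INVAL and associated with NODEV, so it can be
 * used as scratch space without ever being mistaken for cached data;
 * on release, brelse() puts it on the "most free" list.
 *
 *	bp = geteblk(bsize);
 *	... use bp->b_un.b_addr as temporary storage ...
 *	brelse(bp);
 */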
447 
448 /*
449  * Allocate space associated with a buffer.
450  */
451 brealloc(bp, size)
452 	register struct buf *bp;
453 	int size;
454 {
455 	daddr_t start, last;
456 	register struct buf *ep;
457 	struct buf *dp;
458 	int s;
459 
460 	/*
461 	 * First we need to make sure that any overlapping
462 	 * previous I/O has completed.
463 	 */
464 	if (size == bp->b_bcount)
465 		return;
466 	if (size < bp->b_bcount) {
467 		bp->b_bcount = size;
468 		return;
469 	}
470 	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
471 	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
472 	if (bp->b_bcount == 0) {
473 		start++;
474 		if (start == last)
475 			goto allocit;
476 	}
477 	dp = BUFHASH(bp->b_dev, bp->b_blkno);
478 loop:
479 	(void) spl0();
480 	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
481 		if (ep->b_blkno < start || ep->b_blkno > last ||
482 		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
483 			continue;
484 		s = spl6();
485 		if (ep->b_flags&B_BUSY) {
486 			ep->b_flags |= B_WANTED;
487 			sleep((caddr_t)ep, PRIBIO+1);
488 			splx(s);
489 			goto loop;
490 		}
491 		(void) spl0();
492 		/*
493 		 * What we would really like to do is kill this
494 		 * I/O since it is now useless. We cannot do that
495 		 * so we force it to complete, so that it cannot
496 		 * over-write our useful data later.
497 		 */
498 		if (ep->b_flags & B_DELWRI) {
499 			notavail(ep);
500 			ep->b_flags |= B_ASYNC;
501 			bwrite(ep);
502 			goto loop;
503 		}
504 	}
505 allocit:
506 	/*
507 	 * Here the buffer is already available, so all we
508 	 * need to do is set the size. Someday a better memory
509 	 * management scheme will be implemented.
510 	 */
511 	bp->b_bcount = size;
512 }
513 
514 /*
515  * Release space associated with a buffer.
516  */
517 bfree(bp)
518 	struct buf *bp;
519 {
520 	/*
521 	 * Here the buffer does not change, so all we
522 	 * need to do is set the size. Someday a better memory
523 	 * management scheme will be implemented.
524 	 */
525 	bp->b_bcount = 0;
526 }
527 
528 /*
529  * Wait for I/O completion on the buffer; return errors
530  * to the user.
531  */
532 iowait(bp)
533 	register struct buf *bp;
534 {
535 	int s;
536 
537 	s = spl6();
538 	while ((bp->b_flags&B_DONE)==0)
539 		sleep((caddr_t)bp, PRIBIO);
540 	splx(s);
541 	geterror(bp);
542 }
543 
544 #ifdef UNFAST
545 /*
546  * Unlink a buffer from the available list and mark it busy.
547  * (internal interface)
548  */
549 notavail(bp)
550 register struct buf *bp;
551 {
552 	register s;
553 
554 	s = spl6();
555 	bp->av_back->av_forw = bp->av_forw;
556 	bp->av_forw->av_back = bp->av_back;
557 	bp->b_flags |= B_BUSY;
558 	splx(s);
559 }
560 #endif
561 
562 /*
563  * Mark I/O complete on a buffer. If the header
564  * indicates a dirty page push completion, the
565  * header is inserted into the ``cleaned'' list
566  * to be processed by the pageout daemon. Otherwise
567  * release it if I/O is asynchronous, and wake
568  * up anyone waiting for it.
569  */
570 iodone(bp)
571 register struct buf *bp;
572 {
573 	register int s;
574 
575 	if (bp->b_flags & B_DONE)
576 		panic("dup iodone");
577 	bp->b_flags |= B_DONE;
578 	if (bp->b_flags & B_DIRTY) {
579 		if (bp->b_flags & B_ERROR)
580 			panic("IO err in push");
581 		s = spl6();
582 		bp->av_forw = bclnlist;
583 		bp->b_bcount = swsize[bp - swbuf];
584 		bp->b_pfcent = swpf[bp - swbuf];
585 		cnt.v_pgout++;
586 		cnt.v_pgpgout += bp->b_bcount / NBPG;
587 		bclnlist = bp;
588 		if (bswlist.b_flags & B_WANTED)
589 			wakeup((caddr_t)&proc[2]);
590 		splx(s);
591 		return;
592 	}
593 	if (bp->b_flags&B_ASYNC)
594 		brelse(bp);
595 	else {
596 		bp->b_flags &= ~B_WANTED;
597 		wakeup((caddr_t)bp);
598 	}
599 }
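/*
 * Editorial sketch (not part of the original source): iodone() is what
 * a block driver's interrupt code calls when a transfer finishes.  A
 * hypothetical completion fragment ("hard_error" is a stand-in for the
 * driver's own error test):
 *
 *	bp->b_resid = 0;
 *	if (hard_error) {
 *		bp->b_flags |= B_ERROR;
 *		bp->b_error = EIO;
 *	}
 *	iodone(bp);
 *
 * For a synchronous request this wakes the process sleeping in iowait();
 * for an asynchronous one iodone() itself brelse()s the buffer.
 */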
600 
601 /*
602  * Zero the core associated with a buffer.
603  */
604 clrbuf(bp)
605 	struct buf *bp;
606 {
607 	register int *p;
608 	register int c;
609 
610 	p = bp->b_un.b_words;
611 	c = bp->b_bcount/sizeof(int);
612 	do
613 		*p++ = 0;
614 	while (--c);
615 	bp->b_resid = 0;
616 }
617 
618 /*
619  * make sure all write-behind blocks
620  * on dev (or NODEV for all)
621  * are flushed out.
622  * (from umount and update)
623  * (and temporarily pagein)
624  */
625 bflush(dev)
626 dev_t dev;
627 {
628 	register struct buf *bp;
629 	register struct buf *flist;
630 	int s;
631 
632 loop:
633 	s = spl6();
634 	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
635 	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
636 		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
637 			bp->b_flags |= B_ASYNC;
638 			notavail(bp);
639 			bwrite(bp);
640 			goto loop;
641 		}
642 	}
643 	splx(s);
644 }
645 
646 /*
647  * Pick up the device's error number and pass it to the user;
648  * if there is an error but the number is 0, set a generalized
649  * code.  Actually the latter is always true because devices
650  * don't yet return specific errors.
651  */
652 geterror(bp)
653 register struct buf *bp;
654 {
655 
656 	if (bp->b_flags&B_ERROR)
657 		if ((u.u_error = bp->b_error)==0)
658 			u.u_error = EIO;
659 }
660 
661 /*
662  * Invalidate in core blocks belonging to closed or umounted filesystem
663  *
664  * This is not nicely done at all - the buffer ought to be removed from the
665  * hash chains & have its dev/blkno fields clobbered, but unfortunately we
666  * can't do that here, as it is quite possible that the block is still
667  * being used for i/o. Eventually, all disc drivers should be forced to
668  * have a close routine, which ought to ensure that the queue is empty, then
669  * properly flush the queues. Until that happy day, this suffices for
670  * correctness.						... kre
671  */
672 binval(dev)
673 dev_t dev;
674 {
675 	register struct buf *bp;
676 	register struct bufhd *hp;
677 #define dp ((struct buf *)hp)
678 
679 	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
680 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
681 			if (bp->b_dev == dev)
682 				bp->b_flags |= B_INVAL;
683 }
684