/*	vfs_cluster.c	4.35	82/08/13	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

int bioprintfs = 0;

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_vm.vm_inblk++;		/* pay for read */
	biowait(bp);
	return(bp);
}

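/*
 * Illustrative sketch only, not part of the original source: a typical
 * synchronous read through the cache.  The routine name "readblk" and its
 * destination argument are hypothetical; the buffer data is assumed to be
 * reachable through b_un.b_addr as declared in buf.h.
 */
#ifdef notdef
readblk(dev, blkno, size, dst)
	dev_t dev;
	daddr_t blkno;
	int size;
	caddr_t dst;
{
	register struct buf *bp;

	bp = bread(dev, blkno, size);		/* sleeps until the data is in */
	if ((bp->b_flags & B_ERROR) == 0)
		bcopy(bp->b_un.b_addr, dst, size);
	brelse(bp);				/* hand the buffer back to the cache */
}
#endif
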
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rasize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rasize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_vm.vm_inblk++;		/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rasize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}

	/*
	 * If the block was in core, let bread get it.
	 * If the block wasn't in core, then the read was started
	 * above, so just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}

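/*
 * Illustrative sketch only, not part of the original source: a sequential
 * reader asking for the next block while priming read-ahead on the one
 * after it.  "nextblk" and its block-number arithmetic are hypothetical;
 * a real caller (e.g. the file system read path) gets both block numbers
 * from bmap.
 */
#ifdef notdef
nextblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = breada(dev, blkno, size, blkno + (size / DEV_BSIZE), size);
	/* ... consume the data at bp->b_un.b_addr ... */
	brelse(bp);
}
#endif
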
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
if (bioprintfs)
printf("write %x blk %d count %d\n", bp->b_dev, bp->b_blkno, bp->b_bcount);
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion
	 * and release the buffer.
	 * If the write was "delayed", mark the buffer B_AGE so that
	 * it is reused sooner once the i/o completes.
	 * Otherwise the write is asynchronous; just pick up any
	 * error already posted on the buffer.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		u.u_error = geterror(bp);
}

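/*
 * Illustrative sketch only, not part of the original source: a synchronous
 * update of one block - read it, modify it in core, and wait for the write
 * to reach the device before reporting status.  "patchblk", "src", "off"
 * and "len" are hypothetical names.
 */
#ifdef notdef
patchblk(dev, blkno, size, src, off, len)
	dev_t dev;
	daddr_t blkno;
	int size, off, len;
	caddr_t src;
{
	register struct buf *bp;

	bp = bread(dev, blkno, size);
	bcopy(src, bp->b_un.b_addr + off, len);
	bwrite(bp);			/* waits, then releases the buffer */
	return (u.u_error);		/* set by biowait/geterror */
}
#endif
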
/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

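/*
 * Illustrative sketch only, not part of the original source: the delayed
 * write case described above - a partial block is updated and bdwrite is
 * used on the expectation that more of the block will be written soon.
 * All names other than the buffer routines are hypothetical.
 */
#ifdef notdef
appendfrag(dev, blkno, size, src, off, len)
	dev_t dev;
	daddr_t blkno;
	int size, off, len;
	caddr_t src;
{
	register struct buf *bp;

	bp = bread(dev, blkno, size);		/* need the old contents */
	bcopy(src, bp->b_un.b_addr + off, len);
	bdwrite(bp);				/* mark dirty, write later */
}
#endif
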
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone's waiting for this buffer, or
	 * someone is waiting for a free buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;  		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQUEUES-1];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Return the block only if it is already resident in the cache;
 * no i/o is started for a block that is not in core.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

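/*
 * Illustrative sketch only, not part of the original source: using baddr
 * to take advantage of a block that happens to be cached without forcing
 * any i/o when it is not.  "peekblk" is a hypothetical name.
 */
#ifdef notdef
peekblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = baddr(dev, blkno, size);
	if (bp == NULL)
		return (0);		/* not resident; caller does without */
	/* ... use the data at bp->b_un.b_addr ... */
	brelse(bp);
	return (1);
}
#endif
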
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp, *ep;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	/*
	 * Not found in the cache, select something from
	 * a free list.  Preference is given to the higher-numbered
	 * queues: buffers with no useful contents first, then the
	 * AGE list, then the LRU list.
	 */
	s = spl6();
	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
		if (ep->av_forw != ep)
			break;
	if (ep == bfreelist) {		/* no free blocks at all */
		ep->b_flags |= B_WANTED;
		sleep((caddr_t)ep, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = ep->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}

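/*
 * Illustrative sketch only, not part of the original source: when a block
 * is about to be completely overwritten there is no reason to read the old
 * contents, so getblk is used instead of bread.  "overwriteblk" and "src"
 * are hypothetical.
 */
#ifdef notdef
overwriteblk(dev, blkno, size, src)
	dev_t dev;
	daddr_t blkno;
	int size;
	caddr_t src;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);		/* no read is started */
	bcopy(src, bp->b_un.b_addr, size);
	bawrite(bp);				/* start the write, don't wait */
}
#endif
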
/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY|B_INVAL;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = (dev_t)NODEV;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}

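/*
 * Illustrative sketch only, not part of the original source: geteblk gives
 * a scratch buffer with no device association, handy for temporary storage;
 * it is returned with brelse when done.  "scratch" is a hypothetical name.
 */
#ifdef notdef
scratch()
{
	register struct buf *bp;

	bp = geteblk(DEV_BSIZE);
	/* ... use DEV_BSIZE bytes at bp->b_un.b_addr ... */
	brelse(bp);
}
#endif
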
/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any previous I/O
	 * overlapping the new allocation has been dealt with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		goto allocit;
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		goto allocit;

	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI). A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + (size / DEV_BSIZE) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) <= start)
			continue;
if (bioprintfs)
if (ep->b_flags&B_BUSY)
printf("sleeping on:dev 0x%x, blks %d-%d, flg 0%o allocing dev 0x%x, blks %d-%d, flg 0%o\n",
ep->b_dev, ep->b_blkno, ep->b_blkno + (ep->b_bcount / DEV_BSIZE) - 1,
ep->b_flags, bp->b_dev, start, last, bp->b_flags);
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			(void) splx(s);
			goto loop;
		}
		(void) splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
if (bioprintfs)
printf("DELWRI:dev 0x%x, blks %d-%d, flg 0%o allocing dev 0x%x, blks %d-%d, flg 0%o\n",
ep->b_dev, ep->b_blkno, ep->b_blkno + (ep->b_bcount / DEV_BSIZE) - 1,
ep->b_flags, bp->b_dev, start, last, bp->b_flags);
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
allocit:
	/*
	 * Here the buffer is already available, so all we
	 * need to do is set the size. Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = size;
	return (1);
}

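/*
 * Worked example of the overlap test above (illustrative, not part of the
 * original source), assuming DEV_BSIZE is 512: a cached 2048-byte buffer
 * at b_blkno 32 covers device blocks 32-35.  Growing another buffer to map
 * blocks 34-37 gives start = 34, last = 37; since 32 <= 37 and 32 + 4 > 34,
 * the old buffer overlaps and must be written out (if B_DELWRI) and then
 * invalidated before the new one may cover those blocks.
 */
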
/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * Here the buffer does not change, so all we
	 * need to do is set the size. Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = 0;
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer. If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon. Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

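/*
 * Illustrative sketch only, not part of the original source: the half of
 * the protocol that lives in a block device driver.  When a transfer
 * finishes (normally at interrupt level) the driver posts any error on the
 * buffer and calls biodone, which wakes biowait or releases an async
 * buffer.  "xxcomplete" and "hard_error" are hypothetical names.
 */
#ifdef notdef
xxcomplete(bp, hard_error)
	register struct buf *bp;
	int hard_error;
{

	if (hard_error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	biodone(bp);
}
#endif
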
/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update, and temporarily from pagein)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	splx(s);
}

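/*
 * Illustrative sketch only, not part of the original source: the periodic
 * update of the file systems pushes every delayed-write block for every
 * device by passing NODEV, much as the comment above describes.
 * "flushall" is a hypothetical name.
 */
#ifdef notdef
flushall()
{

	bflush(NODEV);		/* push every delayed write on every device */
}
#endif
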
/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to closed or unmounted filesystems.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
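
/*
 * Illustrative sketch only, not part of the original source: the unmount
 * path this routine serves - push out any delayed writes for the device,
 * then invalidate whatever the cache still holds for it.  "unmountdev" is
 * a hypothetical name.
 */
#ifdef notdef
unmountdev(dev)
	dev_t dev;
{

	bflush(dev);		/* write out delayed-write blocks */
	binval(dev);		/* then forget the cached copies */
}
#endif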