xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 7016)
1 /*	vfs_cluster.c	4.32	82/06/01	*/
2 
3 #include "../h/param.h"
4 #include "../h/systm.h"
5 #include "../h/dir.h"
6 #include "../h/user.h"
7 #include "../h/buf.h"
8 #include "../h/conf.h"
9 #include "../h/proc.h"
10 #include "../h/seg.h"
11 #include "../h/pte.h"
12 #include "../h/vm.h"
13 #include "../h/trace.h"
14 
15 /*
16  * Read in (if necessary) the block and return a buffer pointer.
17  */
18 struct buf *
19 bread(dev, blkno, size)
20 	dev_t dev;
21 	daddr_t blkno;
22 	int size;
23 {
24 	register struct buf *bp;
25 
26 	bp = getblk(dev, blkno, size);
27 	if (bp->b_flags&B_DONE) {
28 		trace(TR_BREADHIT, dev, blkno);
29 		return(bp);
30 	}
31 	bp->b_flags |= B_READ;
32 	(*bdevsw[major(dev)].d_strategy)(bp);
33 	trace(TR_BREADMISS, dev, blkno);
34 	u.u_vm.vm_inblk++;		/* pay for read */
35 	biowait(bp);
36 	return(bp);
37 }
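
/*
 * A caller of bread owns the returned buffer (it is left B_BUSY) until
 * it releases it.  A rough, hypothetical usage sketch (the names and
 * the error check are only illustrative):
 *
 *	struct buf *bp;
 *
 *	bp = bread(dev, blkno, size);
 *	if ((bp->b_flags & B_ERROR) == 0)
 *		... use up to bp->b_bcount bytes at bp->b_un.b_addr ...
 *	brelse(bp);
 */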
38 
39 /*
40  * Read in the block, like bread, but also start I/O on the
41  * read-ahead block (which is not allocated to the caller)
42  */
43 struct buf *
44 breada(dev, blkno, rablkno, size)
45 	dev_t dev;
46 	daddr_t blkno, rablkno;
47 	int size;
48 {
49 	register struct buf *bp, *rabp;
50 
51 	bp = NULL;
52 	/*
53 	 * If the block isn't in core, then allocate
54 	 * a buffer and initiate i/o (getblk checks
55 	 * for a cache hit).
56 	 */
57 	if (!incore(dev, blkno)) {
58 		bp = getblk(dev, blkno, size);
59 		if ((bp->b_flags&B_DONE) == 0) {
60 			bp->b_flags |= B_READ;
61 			(*bdevsw[major(dev)].d_strategy)(bp);
62 			trace(TR_BREADMISS, dev, blkno);
63 			u.u_vm.vm_inblk++;		/* pay for read */
64 		} else
65 			trace(TR_BREADHIT, dev, blkno);
66 	}
67 
68 	/*
69 	 * If there's a read-ahead block, start i/o
70 	 * on it also (as above).
71 	 */
72 	if (rablkno && !incore(dev, rablkno)) {
73 		rabp = getblk(dev, rablkno, size);
74 		if (rabp->b_flags & B_DONE) {
75 			brelse(rabp);
76 			trace(TR_BREADHITRA, dev, rablkno);
77 		} else {
78 			rabp->b_flags |= B_READ|B_ASYNC;
79 			(*bdevsw[major(dev)].d_strategy)(rabp);
80 			trace(TR_BREADMISSRA, dev, rablkno);
81 			u.u_vm.vm_inblk++;		/* pay in advance */
82 		}
83 	}
84 
85 	/*
86 	 * If we get here with bp NULL, then the block
87 	 * must've been in core and bread will find it for us.
88 	 */
89 	if(bp == NULL)
90 		return(bread(dev, blkno, size));
91 	biowait(bp);
92 	return(bp);
93 }
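
/*
 * Only bp is handed back to the caller; the read-ahead buffer is
 * either released at once (cache hit) or, being marked B_ASYNC,
 * released by biodone when its i/o finishes.  A hypothetical caller
 * normally passes the next block of the file as rablkno (in the file
 * system proper that block typically comes from bmap), e.g.:
 *
 *	bp = breada(dev, bn, nextbn, size);
 */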
94 
95 /*
96  * Write the buffer, waiting for completion.
97  * Then release the buffer.
98  */
99 bwrite(bp)
100 	register struct buf *bp;
101 {
102 	register flag;
103 
104 	flag = bp->b_flags;
105 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
106 	if ((flag&B_DELWRI) == 0)
107 		u.u_vm.vm_oublk++;		/* no one paid yet */
108 	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
109 	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
110 
111 	/*
112 	 * If the write was synchronous, then await i/o completion.
113 	 * If the write was "delayed" (a delayed write being pushed out),
114 	 * mark the buffer for quick reuse (B_AGE) once the i/o completes.
115 	 * Otherwise the write is asynchronous; just pick up any error
116 	 * already posted on the buffer.
117 	 */
118 	if ((flag&B_ASYNC) == 0) {
119 		biowait(bp);
120 		brelse(bp);
121 	} else if (flag & B_DELWRI)
122 		bp->b_flags |= B_AGE;
123 	else
124 		geterror(bp);
125 }
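
/*
 * The three write interfaces differ only in when the data reaches the
 * device and when the buffer comes back: bwrite writes and waits
 * (unless B_ASYNC was already set), bawrite below starts the write and
 * returns at once, and bdwrite below ordinarily just marks the buffer
 * dirty and releases it, deferring the actual write.  A hypothetical
 * caller that has modified a whole block in place might simply do:
 *
 *	bp = bread(dev, bn, size);
 *	... modify the data at bp->b_un.b_addr ...
 *	bwrite(bp);
 */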
126 
127 /*
128  * Release the buffer, marking it so that if it is grabbed
129  * for another purpose it will be written out before being
130  * given up (e.g. when writing a partial block where it is
131  * assumed that another write for the same block will soon follow).
132  * This can't be done for magtape, since writes must be done
133  * in the same order as requested.
134  */
135 bdwrite(bp)
136 	register struct buf *bp;
137 {
138 	register int flags;
139 
140 	if ((bp->b_flags&B_DELWRI) == 0)
141 		u.u_vm.vm_oublk++;		/* no one paid yet */
142 	flags = bdevsw[major(bp->b_dev)].d_flags;
143 	if(flags & B_TAPE)
144 		bawrite(bp);
145 	else {
146 		bp->b_flags |= B_DELWRI | B_DONE;
147 		brelse(bp);
148 	}
149 }
150 
151 /*
152  * Release the buffer, start I/O on it, but don't wait for completion.
153  */
154 bawrite(bp)
155 	register struct buf *bp;
156 {
157 
158 	bp->b_flags |= B_ASYNC;
159 	bwrite(bp);
160 }
161 
162 /*
163  * Release the buffer, with no I/O implied.
164  */
165 brelse(bp)
166 	register struct buf *bp;
167 {
168 	register struct buf *flist;
169 	register s;
170 
171 	/*
172 	 * If someone's waiting for this buffer, or is waiting
173 	 * for any free buffer, wake 'em up.
174 	 */
175 	if (bp->b_flags&B_WANTED)
176 		wakeup((caddr_t)bp);
177 	if (bfreelist[0].b_flags&B_WANTED) {
178 		bfreelist[0].b_flags &= ~B_WANTED;
179 		wakeup((caddr_t)bfreelist);
180 	}
181 	if (bp->b_flags&B_ERROR)
182 		if (bp->b_flags & B_LOCKED)
183 			bp->b_flags &= ~B_ERROR;	/* try again later */
184 		else
185 			bp->b_dev = NODEV;  		/* no assoc */
186 
187 	/*
188 	 * Stick the buffer back on a free list.
189 	 */
190 	s = spl6();
191 	if (bp->b_flags & (B_ERROR|B_INVAL)) {
192 		/* block has no info ... put at front of most free list */
193 		flist = &bfreelist[BQUEUES-1];
194 		binsheadfree(bp, flist);
195 	} else {
196 		if (bp->b_flags & B_LOCKED)
197 			flist = &bfreelist[BQ_LOCKED];
198 		else if (bp->b_flags & B_AGE)
199 			flist = &bfreelist[BQ_AGE];
200 		else
201 			flist = &bfreelist[BQ_LRU];
202 		binstailfree(bp, flist);
203 	}
204 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
205 	splx(s);
206 }
207 
208 /*
209  * See if the block is associated with some buffer
210  * (mainly to avoid getting hung up on a wait in breada)
211  */
212 incore(dev, blkno)
213 	dev_t dev;
214 	daddr_t blkno;
215 {
216 	register struct buf *bp;
217 	register struct buf *dp;
218 
219 	dp = BUFHASH(dev, blkno);
220 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
221 		if (bp->b_blkno == blkno && bp->b_dev == dev &&
222 		    (bp->b_flags & B_INVAL) == 0)
223 			return (1);
224 	return (0);
225 }
226 
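/*
 * Return the buffer for the indicated block if it is already in core
 * (bread will then find it in the cache); otherwise return 0 without
 * starting any i/o.
 */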
227 struct buf *
228 baddr(dev, blkno, size)
229 	dev_t dev;
230 	daddr_t blkno;
231 	int size;
232 {
233 
234 	if (incore(dev, blkno))
235 		return (bread(dev, blkno, size));
236 	return (0);
237 }
238 
239 /*
240  * Assign a buffer for the given block.  If the appropriate
241  * block is already associated, return it; otherwise search
242  * for the oldest non-busy buffer and reassign it.
243  *
244  * We use splx here because this routine may be called
245  * on the interrupt stack during a dump, and we don't
246  * want to lower the ipl back to 0.
247  */
248 struct buf *
249 getblk(dev, blkno, size)
250 	dev_t dev;
251 	daddr_t blkno;
252 	int size;
253 {
254 	register struct buf *bp, *dp, *ep;
255 	int s;
256 
257 	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
258 		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
259 	/*
260 	 * Search the cache for the block.  If we hit, but
261 	 * the buffer is in use for i/o, then we wait until
262 	 * the i/o has completed.
263 	 */
264 	dp = BUFHASH(dev, blkno);
265 loop:
266 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
267 		if (bp->b_blkno != blkno || bp->b_dev != dev ||
268 		    bp->b_flags&B_INVAL)
269 			continue;
270 		s = spl6();
271 		if (bp->b_flags&B_BUSY) {
272 			bp->b_flags |= B_WANTED;
273 			sleep((caddr_t)bp, PRIBIO+1);
274 			splx(s);
275 			goto loop;
276 		}
277 		splx(s);
278 		notavail(bp);
279 		brealloc(bp, size);
280 		bp->b_flags |= B_CACHE;
281 		return(bp);
282 	}
283 	if (major(dev) >= nblkdev)
284 		panic("blkdev");
285 	/*
286 	 * Not found in the cache, select something from
287 	 * a free list.  Preference is to the AGE list, then the LRU list.
288 	 */
289 	s = spl6();
290 	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
291 		if (ep->av_forw != ep)
292 			break;
293 	if (ep == bfreelist) {		/* no free blocks at all */
294 		ep->b_flags |= B_WANTED;
295 		sleep((caddr_t)ep, PRIBIO+1);
296 		splx(s);
297 		goto loop;
298 	}
299 	splx(s);
300 	bp = ep->av_forw;
301 	notavail(bp);
302 	if (bp->b_flags & B_DELWRI) {
303 		bp->b_flags |= B_ASYNC;
304 		bwrite(bp);
305 		goto loop;
306 	}
307 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
308 	bp->b_flags = B_BUSY;
309 	bfree(bp);
310 	bremhash(bp);
311 	binshash(bp, dp);
312 	bp->b_dev = dev;
313 	bp->b_blkno = blkno;
314 	brealloc(bp, size);
315 	return(bp);
316 }
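
/*
 * getblk is also useful on its own when the old contents of the block
 * do not matter, for instance (hypothetically) when the file system
 * has just allocated a fresh block and will overwrite all of it:
 *
 *	bp = getblk(dev, newbn, size);
 *	... fill in bp->b_un.b_addr ...
 *	bdwrite(bp);
 */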
317 
318 /*
319  * Get an empty block,
320  * not assigned to any particular device.
321  */
322 struct buf *
323 geteblk(size)
324 	int size;
325 {
326 	register struct buf *bp, *dp;
327 	int s;
328 
329 loop:
330 	s = spl6();
331 	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
332 		if (dp->av_forw != dp)
333 			break;
334 	if (dp == bfreelist) {		/* no free blocks */
335 		dp->b_flags |= B_WANTED;
336 		sleep((caddr_t)dp, PRIBIO+1);
337 		goto loop;
338 	}
339 	splx(s);
340 	bp = dp->av_forw;
341 	notavail(bp);
342 	if (bp->b_flags & B_DELWRI) {
343 		bp->b_flags |= B_ASYNC;
344 		bwrite(bp);
345 		goto loop;
346 	}
347 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
348 	bp->b_flags = B_BUSY|B_INVAL;
349 	bfree(bp);
350 	bremhash(bp);
351 	binshash(bp, dp);
352 	bp->b_dev = (dev_t)NODEV;
353 	brealloc(bp, size);
354 	return(bp);
355 }
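
/*
 * Buffers from geteblk belong to no device (NODEV, B_INVAL), so brelse
 * queues them for immediate reuse.  A hypothetical user of scratch
 * space might do:
 *
 *	bp = geteblk(size);
 *	... use bp->b_un.b_addr as temporary storage ...
 *	brelse(bp);
 */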
356 
357 /*
358  * Allocate space associated with a buffer.
359  */
360 brealloc(bp, size)
361 	register struct buf *bp;
362 	int size;
363 {
364 	daddr_t start, last;
365 	register struct buf *ep;
366 	struct buf *dp;
367 	int s;
368 
369 	/*
370 	 * First we need to make sure that all overlapping previous I/O
371 	 * is out of the way (completed, or forced out if delayed).
372 	 */
373 	if (size == bp->b_bcount)
374 		return;
375 	if (size < bp->b_bcount || bp->b_dev == NODEV)
376 		goto allocit;
377 
378 	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
379 	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
380 	if (bp->b_bcount == 0) {
381 		start++;
382 		if (start == last)
383 			goto allocit;
384 	}
385 	dp = BUFHASH(bp->b_dev, bp->b_blkno);
386 loop:
387 	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
388 		if (ep->b_blkno < start || ep->b_blkno > last ||
389 		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
390 			continue;
391 		s = spl6();
392 		if (ep->b_flags&B_BUSY) {
393 			ep->b_flags |= B_WANTED;
394 			sleep((caddr_t)ep, PRIBIO+1);
395 			(void) splx(s);
396 			goto loop;
397 		}
398 		(void) splx(s);
399 		/*
400 		 * What we would really like to do is kill this
401 		 * I/O since it is now useless.  We cannot do that,
402 		 * so we force it to complete, so that it cannot
403 		 * overwrite our useful data later.
404 		 */
405 		if (ep->b_flags & B_DELWRI) {
406 			notavail(ep);
407 			ep->b_flags |= B_ASYNC;
408 			bwrite(ep);
409 			goto loop;
410 		}
411 	}
412 allocit:
413 	/*
414 	 * Here the buffer is already available, so all we
415 	 * need to do is set the size. Someday a better memory
416 	 * management scheme will be implemented.
417 	 */
418 	bp->b_bcount = size;
419 }
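
/*
 * The overlap arithmetic above is in DEV_BSIZE units.  As a worked
 * (hypothetical) example with 512-byte device blocks: growing a
 * 1024-byte buffer at block bn to 4096 bytes gives start = bn + 2 and
 * last = bn + 7, so any other buffer whose b_blkno falls in bn+2..bn+7
 * is waited for if busy, or pushed out if it holds a delayed write,
 * before the new size is recorded.
 */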
420 
421 /*
422  * Release space associated with a buffer.
423  */
424 bfree(bp)
425 	struct buf *bp;
426 {
427 	/*
428 	 * Here the buffer does not change, so all we
429 	 * need to do is set the size. Someday a better memory
430 	 * management scheme will be implemented.
431 	 */
432 	bp->b_bcount = 0;
433 }
434 
435 /*
436  * Wait for I/O completion on the buffer; return errors
437  * to the user.
438  */
439 biowait(bp)
440 	register struct buf *bp;
441 {
442 	int s;
443 
444 	s = spl6();
445 	while ((bp->b_flags&B_DONE)==0)
446 		sleep((caddr_t)bp, PRIBIO);
447 	splx(s);
448 	geterror(bp);
449 }
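
/*
 * biowait pairs with biodone below: the strategy routine starts the
 * transfer, the driver's interrupt code calls biodone when it
 * completes, and that wakes any process sleeping here.
 */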
450 
451 /*
452  * Mark I/O complete on a buffer. If the header
453  * indicates a dirty page push completion, the
454  * header is inserted into the ``cleaned'' list
455  * to be processed by the pageout daemon. Otherwise
456  * release it if I/O is asynchronous, and wake
457  * up anyone waiting for it.
458  */
459 biodone(bp)
460 	register struct buf *bp;
461 {
462 	register int s;
463 
464 	if (bp->b_flags & B_DONE)
465 		panic("dup biodone");
466 	bp->b_flags |= B_DONE;
467 	if (bp->b_flags & B_DIRTY) {
468 		if (bp->b_flags & B_ERROR)
469 			panic("IO err in push");
470 		s = spl6();
471 		bp->av_forw = bclnlist;
472 		bp->b_bcount = swsize[bp - swbuf];
473 		bp->b_pfcent = swpf[bp - swbuf];
474 		cnt.v_pgout++;
475 		cnt.v_pgpgout += bp->b_bcount / NBPG;
476 		bclnlist = bp;
477 		if (bswlist.b_flags & B_WANTED)
478 			wakeup((caddr_t)&proc[2]);
479 		splx(s);
480 		return;
481 	}
482 	if (bp->b_flags&B_ASYNC)
483 		brelse(bp);
484 	else {
485 		bp->b_flags &= ~B_WANTED;
486 		wakeup((caddr_t)bp);
487 	}
488 }
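
/*
 * The B_DIRTY case above is the pageout path: the header is one of the
 * swap buffers (hence the bp - swbuf indexing into swsize and swpf),
 * it is chained onto bclnlist for the pageout daemon to process, and
 * proc[2] (the pageout daemon) is awakened if the swap buffer list was
 * wanted.
 */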
489 
490 /*
491  * Make sure all write-behind blocks
492  * on dev (or on all devices, if dev is NODEV)
493  * are flushed out.
494  * (Called from umount and update,
495  * and for the time being from pagein.)
496  */
497 bflush(dev)
498 	dev_t dev;
499 {
500 	register struct buf *bp;
501 	register struct buf *flist;
502 	int s;
503 
504 loop:
505 	s = spl6();
506 	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
507 	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
508 		if ((bp->b_flags & B_DELWRI) == 0)
509 			continue;
510 		if (dev == NODEV || dev == bp->b_dev) {
511 			bp->b_flags |= B_ASYNC;
512 			notavail(bp);
513 			bwrite(bp);
514 			goto loop;
515 		}
516 	}
517 	splx(s);
518 }
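
/*
 * update presumably passes NODEV here to push every delayed write in
 * the cache, while umount passes the device being unmounted so that
 * only its blocks are forced out before the file system goes away.
 */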
519 
520 /*
521  * Pick up the device's error number and pass it to the user;
522  * if there is an error but the number is 0, set a generalized
523  * code.  Actually the latter is always true because devices
524  * don't yet return specific errors.
525  */
526 geterror(bp)
527 	register struct buf *bp;
528 {
529 
530 	if (bp->b_flags&B_ERROR)
531 		if ((u.u_error = bp->b_error)==0)
532 			u.u_error = EIO;
533 }
534 
535 /*
536  * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
537  *
538  * This is not nicely done at all - the buffer ought to be removed from the
539  * hash chains & have its dev/blkno fields clobbered, but unfortunately we
540  * can't do that here, as it is quite possible that the block is still
541  * being used for i/o. Eventually, all disc drivers should be forced to
542  * have a close routine, which ought to ensure that the queue is empty, then
543  * properly flush the queues. Until that happy day, this suffices for
544  * correctness.						... kre
545  */
546 binval(dev)
547 	dev_t dev;
548 {
549 	register struct buf *bp;
550 	register struct bufhd *hp;
551 #define dp ((struct buf *)hp)
552 
553 	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
554 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
555 			if (bp->b_dev == dev)
556 				bp->b_flags |= B_INVAL;
557 }
558