xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 7015)
1 /*	vfs_cluster.c	4.31	82/05/31	*/
2 
3 #include "../h/param.h"
4 #include "../h/systm.h"
5 #include "../h/dir.h"
6 #include "../h/user.h"
7 #include "../h/buf.h"
8 #include "../h/conf.h"
9 #include "../h/proc.h"
10 #include "../h/seg.h"
11 #include "../h/pte.h"
12 #include "../h/vm.h"
13 #include "../h/trace.h"
14 
15 /*
16  * Read in (if necessary) the block and return a buffer pointer.
17  */
18 struct buf *
19 bread(dev, blkno, size)
20 	dev_t dev;
21 	daddr_t blkno;
22 	int size;
23 {
24 	register struct buf *bp;
25 
26 	bp = getblk(dev, blkno, size);
27 	if (bp->b_flags&B_DONE) {
28 		trace(TR_BREADHIT, dev, blkno);
29 		return(bp);
30 	}
31 	bp->b_flags |= B_READ;
32 	(*bdevsw[major(dev)].d_strategy)(bp);
33 	trace(TR_BREADMISS, dev, blkno);
34 	u.u_vm.vm_inblk++;		/* pay for read */
35 	biowait(bp);
36 	return(bp);
37 }
38 
39 /*
40  * Read in the block, like bread, but also start I/O on the
41  * read-ahead block (which is not allocated to the caller)
42  */
43 struct buf *
44 breada(dev, blkno, rablkno, size)
45 	dev_t dev;
46 	daddr_t blkno, rablkno;
47 	int size;
48 {
49 	register struct buf *bp, *rabp;
50 
51 	bp = NULL;
52 	/*
53 	 * If the block isn't in core, then allocate
54 	 * a buffer and initiate i/o (getblk checks
55 	 * for a cache hit).
56 	 */
57 	if (!incore(dev, blkno)) {
58 		bp = getblk(dev, blkno, size);
59 		if ((bp->b_flags&B_DONE) == 0) {
60 			bp->b_flags |= B_READ;
61 			(*bdevsw[major(dev)].d_strategy)(bp);
62 			trace(TR_BREADMISS, dev, blkno);
63 			u.u_vm.vm_inblk++;		/* pay for read */
64 		} else
65 			trace(TR_BREADHIT, dev, blkno);
66 	}
67 
68 	/*
69 	 * If there's a read-ahead block, start i/o
70 	 * on it also (as above).
71 	 */
72 	if (rablkno && !incore(dev, rablkno)) {
73 		rabp = getblk(dev, rablkno, size);
74 		if (rabp->b_flags & B_DONE) {
75 			brelse(rabp);
76 			trace(TR_BREADHITRA, dev, blkno);
77 		} else {
78 			rabp->b_flags |= B_READ|B_ASYNC;
79 			(*bdevsw[major(dev)].d_strategy)(rabp);
80 			trace(TR_BREADMISSRA, dev, rablock);
81 			u.u_vm.vm_inblk++;		/* pay in advance */
82 		}
83 	}
84 
85 	/*
86 	 * If we get here with bp NULL, then the block
87 	 * must've been in core and bread will find it for us.
88 	 */
89 	if(bp == NULL)
90 		return(bread(dev, blkno, size));
91 	biowait(bp);
92 	return(bp);
93 }
94 
95 /*
96  * Write the buffer, waiting for completion.
97  * Then release the buffer.
98  */
99 bwrite(bp)
100 	register struct buf *bp;
101 {
102 	register flag;
103 
104 	flag = bp->b_flags;
105 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
106 	if ((flag&B_DELWRI) == 0)
107 		u.u_vm.vm_oublk++;		/* noone paid yet */
108 	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
109 	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
110 
111 	/*
112 	 * If the write was synchronous, then await i/o completion.
113 	 * If the write was "delayed", then we put the buffer on
114 	 * the q of blocks awaiting i/o completion status.
115 	 * Otherwise, the i/o must be finished and we check for
116 	 * an error.
117 	 */
118 	if ((flag&B_ASYNC) == 0) {
119 		biowait(bp);
120 		brelse(bp);
121 	} else if (flag & B_DELWRI)
122 		bp->b_flags |= B_AGE;
123 	else
124 		geterror(bp);
125 }
126 
127 /*
128  * Release the buffer, marking it so that if it is grabbed
129  * for another purpose it will be written out before being
130  * given up (e.g. when writing a partial block where it is
131  * assumed that another write for the same block will soon follow).
132  * This can't be done for magtape, since writes must be done
133  * in the same order as requested.
134  */
135 bdwrite(bp)
136 	register struct buf *bp;
137 {
138 	register int flags;
139 
140 	if ((bp->b_flags&B_DELWRI) == 0)
141 		u.u_vm.vm_oublk++;		/* noone paid yet */
142 	flags = bdevsw[major(bp->b_dev)].d_flags;
143 	if(flags & B_TAPE)
144 		bawrite(bp);
145 	else {
146 		bp->b_flags |= B_DELWRI | B_DONE;
147 		brelse(bp);
148 	}
149 }
150 
151 /*
152  * Release the buffer, start I/O on it, but don't wait for completion.
153  */
154 bawrite(bp)
155 	register struct buf *bp;
156 {
157 
158 	bp->b_flags |= B_ASYNC;
159 	bwrite(bp);
160 }
161 
162 /*
163  * Release the buffer, with no I/O implied.
164  */
165 brelse(bp)
166 	register struct buf *bp;
167 {
168 	register struct buf *flist;
169 	register s;
170 
171 	/*
172 	 * If someone's waiting for the buffer, or
173 	 * is waiting for a buffer wake 'em up.
174 	 */
175 	if (bp->b_flags&B_WANTED)
176 		wakeup((caddr_t)bp);
177 	if (bfreelist[0].b_flags&B_WANTED) {
178 		bfreelist[0].b_flags &= ~B_WANTED;
179 		wakeup((caddr_t)bfreelist);
180 	}
181 	if (bp->b_flags&B_ERROR)
182 		if (bp->b_flags & B_LOCKED)
183 			bp->b_flags &= ~B_ERROR;	/* try again later */
184 		else
185 			bp->b_dev = NODEV;  		/* no assoc */
186 
187 	/*
188 	 * Stick the buffer back on a free list.
189 	 */
190 	s = spl6();
191 	if (bp->b_flags & (B_ERROR|B_INVAL)) {
192 		/* block has no info ... put at front of most free list */
193 		flist = &bfreelist[BQUEUES-1];
194 		binsheadfree(bp, flist);
195 	} else {
196 		if (bp->b_flags & B_LOCKED)
197 			flist = &bfreelist[BQ_LOCKED];
198 		else if (bp->b_flags & B_AGE)
199 			flist = &bfreelist[BQ_AGE];
200 		else
201 			flist = &bfreelist[BQ_LRU];
202 		binstailfree(bp, flist);
203 	}
204 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
205 	splx(s);
206 }
207 
208 /*
209  * See if the block is associated with some buffer
210  * (mainly to avoid getting hung up on a wait in breada)
211  */
212 incore(dev, blkno)
213 	dev_t dev;
214 	daddr_t blkno;
215 {
216 	register struct buf *bp;
217 	register struct buf *dp;
218 
219 	dp = BUFHASH(dev, blkno);
220 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
221 		if (bp->b_blkno == blkno && bp->b_dev == dev &&
222 		    (bp->b_flags & B_INVAL) == 0)
223 			return (1);
224 	return (0);
225 }
226 
227 struct buf *
228 baddr(dev, blkno, size)
229 	dev_t dev;
230 	daddr_t blkno;
231 	int size;
232 {
233 
234 	if (incore(dev, blkno))
235 		return (bread(dev, blkno, size));
236 	return (0);
237 }
238 
239 /*
240  * Assign a buffer for the given block.  If the appropriate
241  * block is already associated, return it; otherwise search
242  * for the oldest non-busy buffer and reassign it.
243  *
244  * We use splx here because this routine may be called
245  * on the interrupt stack during a dump, and we don't
246  * want to lower the ipl back to 0.
247  */
248 struct buf *
249 getblk(dev, blkno, size)
250 	dev_t dev;
251 	daddr_t blkno;
252 	int size;
253 {
254 	register struct buf *bp, *dp, *ep;
255 	int s;
256 
257 	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
258 		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
259 	/*
260 	 * Search the cache for the block.  If we hit, but
261 	 * the buffer is in use for i/o, then we wait until
262 	 * the i/o has completed.
263 	 */
264 	dp = BUFHASH(dev, blkno);
265 loop:
266 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
267 		if (bp->b_blkno != blkno || bp->b_dev != dev ||
268 		    bp->b_flags&B_INVAL)
269 			continue;
270 		s = spl6();
271 		if (bp->b_flags&B_BUSY) {
272 			bp->b_flags |= B_WANTED;
273 			sleep((caddr_t)bp, PRIBIO+1);
274 			splx(s);
275 			goto loop;
276 		}
277 		splx(s);
278 		notavail(bp);
279 		brealloc(bp, size);
280 		bp->b_flags |= B_CACHE;
281 		return(bp);
282 	}
283 	if (major(dev) >= nblkdev)
284 		panic("blkdev");
285 	/*
286 	 * Not found in the cache, select something from
287 	 * a free list.  Preference is to LRU list, then AGE list.
288 	 */
289 	s = spl6();
290 	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
291 		if (ep->av_forw != ep)
292 			break;
293 	if (ep == bfreelist) {		/* no free blocks at all */
294 		ep->b_flags |= B_WANTED;
295 		sleep((caddr_t)ep, PRIBIO+1);
296 		splx(s);
297 		goto loop;
298 	}
299 	splx(s);
300 	bp = ep->av_forw;
301 	notavail(bp);
302 	if (bp->b_flags & B_DELWRI) {
303 		bp->b_flags |= B_ASYNC;
304 		bwrite(bp);
305 		goto loop;
306 	}
307 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
308 	bp->b_flags = B_BUSY;
309 	bfree(bp);
310 	bremhash(bp);
311 	binshash(bp, dp);
312 	bp->b_dev = dev;
313 	bp->b_blkno = blkno;
314 	brealloc(bp, size);
315 	return(bp);
316 }
317 
318 /*
319  * get an empty block,
320  * not assigned to any particular device
321  */
322 struct buf *
323 geteblk(size)
324 	int size;
325 {
326 	register struct buf *bp, *dp;
327 	int s;
328 
329 loop:
330 	s = spl6();
331 	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
332 		if (dp->av_forw != dp)
333 			break;
334 	if (dp == bfreelist) {		/* no free blocks */
335 		dp->b_flags |= B_WANTED;
336 		sleep((caddr_t)dp, PRIBIO+1);
337 		goto loop;
338 	}
339 	splx(s);
340 	bp = dp->av_forw;
341 	notavail(bp);
342 	if (bp->b_flags & B_DELWRI) {
343 		bp->b_flags |= B_ASYNC;
344 		bwrite(bp);
345 		goto loop;
346 	}
347 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
348 	bp->b_flags = B_BUSY|B_INVAL;
349 	bfree(bp);
350 	bremhash(bp);
351 	binshash(bp, dp);
352 	bp->b_dev = (dev_t)NODEV;
353 	bp->b_bcount = size;
354 	return(bp);
355 }
356 
357 /*
358  * Allocate space associated with a buffer.
359  */
360 brealloc(bp, size)
361 	register struct buf *bp;
362 	int size;
363 {
364 	daddr_t start, last;
365 	register struct buf *ep;
366 	struct buf *dp;
367 	int s;
368 
369 	/*
370 	 * First need to make sure that all overlaping previous I/O
371 	 * is dispatched with.
372 	 */
373 	if (size == bp->b_bcount)
374 		return;
375 	if (size < bp->b_bcount) {
376 		bp->b_bcount = size;
377 		return;
378 	}
379 	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
380 	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
381 	if (bp->b_bcount == 0) {
382 		start++;
383 		if (start == last)
384 			goto allocit;
385 	}
386 	dp = BUFHASH(bp->b_dev, bp->b_blkno);
387 loop:
388 	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
389 		if (ep->b_blkno < start || ep->b_blkno > last ||
390 		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
391 			continue;
392 		s = spl6();
393 		if (ep->b_flags&B_BUSY) {
394 			ep->b_flags |= B_WANTED;
395 			sleep((caddr_t)ep, PRIBIO+1);
396 			(void) splx(s);
397 			goto loop;
398 		}
399 		(void) splx(s);
400 		/*
401 		 * What we would really like to do is kill this
402 		 * I/O since it is now useless. We cannot do that
403 		 * so we force it to complete, so that it cannot
404 		 * over-write our useful data later.
405 		 */
406 		if (ep->b_flags & B_DELWRI) {
407 			notavail(ep);
408 			ep->b_flags |= B_ASYNC;
409 			bwrite(ep);
410 			goto loop;
411 		}
412 	}
413 allocit:
414 	/*
415 	 * Here the buffer is already available, so all we
416 	 * need to do is set the size. Someday a better memory
417 	 * management scheme will be implemented.
418 	 */
419 	bp->b_bcount = size;
420 }
421 
422 /*
423  * Release space associated with a buffer.
424  */
425 bfree(bp)
426 	struct buf *bp;
427 {
428 	/*
429 	 * Here the buffer does not change, so all we
430 	 * need to do is set the size. Someday a better memory
431 	 * management scheme will be implemented.
432 	 */
433 	bp->b_bcount = 0;
434 }
435 
436 /*
437  * Wait for I/O completion on the buffer; return errors
438  * to the user.
439  */
440 biowait(bp)
441 	register struct buf *bp;
442 {
443 	int s;
444 
445 	s = spl6();
446 	while ((bp->b_flags&B_DONE)==0)
447 		sleep((caddr_t)bp, PRIBIO);
448 	splx(s);
449 	geterror(bp);
450 }
451 
452 /*
453  * Mark I/O complete on a buffer. If the header
454  * indicates a dirty page push completion, the
455  * header is inserted into the ``cleaned'' list
456  * to be processed by the pageout daemon. Otherwise
457  * release it if I/O is asynchronous, and wake
458  * up anyone waiting for it.
459  */
460 biodone(bp)
461 	register struct buf *bp;
462 {
463 	register int s;
464 
465 	if (bp->b_flags & B_DONE)
466 		panic("dup biodone");
467 	bp->b_flags |= B_DONE;
468 	if (bp->b_flags & B_DIRTY) {
469 		if (bp->b_flags & B_ERROR)
470 			panic("IO err in push");
471 		s = spl6();
472 		bp->av_forw = bclnlist;
473 		bp->b_bcount = swsize[bp - swbuf];
474 		bp->b_pfcent = swpf[bp - swbuf];
475 		cnt.v_pgout++;
476 		cnt.v_pgpgout += bp->b_bcount / NBPG;
477 		bclnlist = bp;
478 		if (bswlist.b_flags & B_WANTED)
479 			wakeup((caddr_t)&proc[2]);
480 		splx(s);
481 		return;
482 	}
483 	if (bp->b_flags&B_ASYNC)
484 		brelse(bp);
485 	else {
486 		bp->b_flags &= ~B_WANTED;
487 		wakeup((caddr_t)bp);
488 	}
489 }
490 
491 /*
492  * make sure all write-behind blocks
493  * on dev (or NODEV for all)
494  * are flushed out.
495  * (from umount and update)
496  * (and temporarily pagein)
497  */
498 bflush(dev)
499 	dev_t dev;
500 {
501 	register struct buf *bp;
502 	register struct buf *flist;
503 	int s;
504 
505 loop:
506 	s = spl6();
507 	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
508 	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
509 		if ((bp->b_flags & B_DELWRI) == 0)
510 			continue;
511 		if (dev == NODEV || dev == bp->b_dev) {
512 			bp->b_flags |= B_ASYNC;
513 			notavail(bp);
514 			bwrite(bp);
515 			goto loop;
516 		}
517 	}
518 	splx(s);
519 }
520 
521 /*
522  * Pick up the device's error number and pass it to the user;
523  * if there is an error but the number is 0 set a generalized
524  * code.  Actually the latter is always true because devices
525  * don't yet return specific errors.
526  */
527 geterror(bp)
528 	register struct buf *bp;
529 {
530 
531 	if (bp->b_flags&B_ERROR)
532 		if ((u.u_error = bp->b_error)==0)
533 			u.u_error = EIO;
534 }
535 
536 /*
537  * Invalidate in core blocks belonging to closed or umounted filesystem
538  *
539  * This is not nicely done at all - the buffer ought to be removed from the
540  * hash chains & have its dev/blkno fields clobbered, but unfortunately we
541  * can't do that here, as it is quite possible that the block is still
542  * being used for i/o. Eventually, all disc drivers should be forced to
543  * have a close routine, which ought ensure that the queue is empty, then
544  * properly flush the queues. Until that happy day, this suffices for
545  * correctness.						... kre
546  */
547 binval(dev)
548 	dev_t dev;
549 {
550 	register struct buf *bp;
551 	register struct bufhd *hp;
552 #define dp ((struct buf *)hp)
553 
554 	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
555 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
556 			if (bp->b_dev == dev)
557 				bp->b_flags |= B_INVAL;
558 }
559