xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 7114)
/*	vfs_cluster.c	4.33	82/06/07	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_vm.vm_inblk++;		/* pay for read */
	biowait(bp);
	return(bp);
}

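/*
 * Illustrative sketch (not part of the original source): how a typical
 * caller might use bread() together with brelse() from this file's
 * interface.  The function name, block size, and error handling shown
 * here are made up for illustration.
 */
#ifdef notdef
example_read(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;

	bp = bread(dev, blkno, 1024);		/* find or read the block */
	if ((bp->b_flags & B_ERROR) == 0) {
		/* ... examine bp->b_un.b_addr here ... */
	}
	brelse(bp);				/* always return the buffer */
}
#endif
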
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rasize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rasize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_vm.vm_inblk++;		/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rasize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 * Otherwise, the i/o must be finished and we check for
	 * an error.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

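/*
 * Illustrative sketch (not part of the original source): the three
 * write interfaces above differ only in how completion is handled,
 * and a hypothetical caller might choose among them like this.
 */
#ifdef notdef
example_write(bp, mustwait, willreuse)
	register struct buf *bp;
	int mustwait, willreuse;
{

	if (mustwait)
		bwrite(bp);	/* synchronous: wait, check errors, release */
	else if (willreuse)
		bdwrite(bp);	/* delayed: more of this block expected soon */
	else
		bawrite(bp);	/* asynchronous: start the write and move on */
}
#endif
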
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone's waiting for this buffer, or someone's
	 * waiting for any free buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQUEUES-1];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * If the block is already in core, read it via bread and return
 * the buffer; otherwise return 0 without starting any i/o.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp, *ep;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		brealloc(bp, size);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	/*
	 * Not found in the cache, select something from
	 * a free list.  Preference is to LRU list, then AGE list.
	 */
	s = spl6();
	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
		if (ep->av_forw != ep)
			break;
	if (ep == bfreelist) {		/* no free blocks at all */
		ep->b_flags |= B_WANTED;
		sleep((caddr_t)ep, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = ep->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	brealloc(bp, size);
	return(bp);
}

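/*
 * Illustrative sketch (not part of the original source): a caller that
 * intends to overwrite an entire block can use getblk() directly
 * instead of bread(), since reading the old contents from disk would
 * be wasted work.  The function name is made up for illustration.
 */
#ifdef notdef
example_overwrite(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);	/* claim the buffer, no read done */
	/* ... fill bp->b_un.b_addr with the new contents ... */
	bdwrite(bp);			/* let the block trickle out later */
}
#endif
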
/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY|B_INVAL;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = (dev_t)NODEV;
	brealloc(bp, size);
	return(bp);
}

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping
	 * previous i/o has been dispatched.
	 */
	if (size == bp->b_bcount)
		return;
	if (size < bp->b_bcount || bp->b_dev == NODEV)
		goto allocit;

	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
	if (bp->b_bcount == 0) {
		start++;
		if (start == last)
			goto allocit;
	}
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_blkno < start || ep->b_blkno > last ||
		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			(void) splx(s);
			goto loop;
		}
		(void) splx(s);
		/*
		 * What we would really like to do is kill this
		 * I/O since it is now useless. We cannot do that
		 * so we force it to complete, so that it cannot
		 * over-write our useful data later.
		 */
		if (ep->b_flags & B_DELWRI) {
			notavail(ep);
			ep->b_flags |= B_ASYNC;
			bwrite(ep);
			goto loop;
		}
	}
allocit:
	/*
	 * Here the buffer is already available, so all we
	 * need to do is set the size. Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = size;
}

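/*
 * Worked example of the start/last computation in brealloc() above
 * (the numbers are illustrative, not taken from any caller): growing
 * a buffer with b_blkno = 100 and b_bcount = 512 to size = 2048, with
 * DEV_BSIZE = 512, gives start = 100 + 512/512 = 101 and
 * last = 100 + 2048/512 - 1 = 103, so any other buffer covering blocks
 * 101..103 of the same device must be flushed or waited for before
 * this buffer may legitimately grow over them.
 */
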
/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * Here the buffer does not change, so all we
	 * need to do is set the size. Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = 0;
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	geterror(bp);
}

/*
 * Mark I/O complete on a buffer. If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon. Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

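/*
 * Illustrative sketch (not part of the original source): the usual
 * division of labor around biowait()/biodone().  The top half queues
 * the buffer through the driver's strategy routine and, for
 * synchronous requests, sleeps in biowait(); the driver's interrupt
 * code calls biodone() when the transfer finishes, which releases
 * B_ASYNC buffers itself and wakes any sleeper otherwise.
 */
#ifdef notdef
example_start(bp, async)
	register struct buf *bp;
	int async;
{

	if (async)
		bp->b_flags |= B_ASYNC;		/* biodone() will brelse it */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if (!async) {
		biowait(bp);			/* sleep until biodone() */
		brelse(bp);
	}
}
#endif
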
/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (Called from umount and update, and temporarily from pagein.)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	splx(s);
}

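/*
 * Illustrative sketch (not part of the original source): a plausible
 * unmount-time sequence pushes out delayed writes for the device with
 * bflush() and then invalidates what is left in the cache with
 * binval() (defined below).  Real unmount code does considerably more.
 */
#ifdef notdef
example_umount(dev)
	dev_t dev;
{

	bflush(dev);		/* write out delayed-write blocks */
	binval(dev);		/* then drop the stale cache entries */
}
#endif
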
/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error)==0)
			u.u_error = EIO;
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
559