xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 6563)
1*6563Smckusic /*	vfs_cluster.c	4.29	82/04/19	*/
28Sbill 
3*6563Smckusic /* merged into kernel:	 @(#)bio.c 2.3 4/8/82 */
4*6563Smckusic 
58Sbill #include "../h/param.h"
68Sbill #include "../h/systm.h"
78Sbill #include "../h/dir.h"
88Sbill #include "../h/user.h"
98Sbill #include "../h/buf.h"
108Sbill #include "../h/conf.h"
118Sbill #include "../h/proc.h"
128Sbill #include "../h/seg.h"
138Sbill #include "../h/pte.h"
148Sbill #include "../h/vm.h"
152045Swnj #include "../h/trace.h"
168Sbill 
1791Sbill /*
1891Sbill  * The following several routines allocate and free
1991Sbill  * buffers with various side effects.  In general the
2091Sbill  * arguments to an allocate routine are a device, a
2191Sbill  * block number, and a size, and the value is a pointer
2291Sbill  * to the buffer header; the buffer is marked "busy"
2391Sbill  * so that no one else can touch it.  If the block was
2491Sbill  * already in core, no I/O need be done; if it is
2591Sbill  * already busy, the process waits until it becomes free.
2691Sbill  * The following routines allocate a buffer:
2791Sbill  *	getblk
2891Sbill  *	bread
2991Sbill  *	breada
3091Sbill  *	baddr	(if it is incore)
3191Sbill  * Eventually the buffer must be released, possibly with the
3291Sbill  * side effect of writing it out, by using one of
3391Sbill  *	bwrite
3491Sbill  *	bdwrite
3591Sbill  *	bawrite
3691Sbill  *	brelse
3791Sbill  */
3891Sbill 
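/*
 * For illustration only, a typical caller might look like the
 * sketch below (under "#ifdef notdef" so it is not compiled).
 * The device, block numbers and block size are hypothetical.
 */
#ifdef notdef
struct buf *bread();

examplebio(dev)
	dev_t dev;
{
	register struct buf *bp;

	/*
	 * Read block 10, look at it, and release it unchanged.
	 */
	bp = bread(dev, (daddr_t)10, 1024);
	if ((bp->b_flags&B_ERROR) == 0) {
		/* ... inspect bp->b_un.b_addr ... */
	}
	brelse(bp);

	/*
	 * Read block 11, modify part of it, and schedule a
	 * delayed write; a later write of the same block may
	 * then be combined with this one before it reaches disk.
	 */
	bp = bread(dev, (daddr_t)11, 1024);
	bp->b_un.b_addr[0] = 0;
	bdwrite(bp);
}
#endif
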
393099Swnj struct	buf bfreelist[BQUEUES];
403099Swnj struct	buf bswlist, *bclnlist;
413099Swnj 
4291Sbill #define	BUFHSZ	63
43*6563Smckusic #define RND	(MAXBSIZE/DEV_BSIZE)
442325Swnj struct	bufhd bufhash[BUFHSZ];
452328Swnj #define	BUFHASH(dev, dblkno)	\
46*6563Smckusic 	((struct buf *)&bufhash[((int)(dev)+(((int)(dblkno))/RND)) % BUFHSZ])
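/*
 * For example, with the illustrative values MAXBSIZE 8192 and
 * DEV_BSIZE 512 (so RND == 16), dev 1 and dblkno 40 hash to bucket
 * (1 + 40/16) % 63 == 3; dividing by RND keeps all the device blocks
 * of one maximal-size file system block on the same hash chain.
 */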
4791Sbill 
4891Sbill /*
4991Sbill  * Initialize hash links for buffers.
5091Sbill  */
5191Sbill bhinit()
5291Sbill {
5391Sbill 	register int i;
542325Swnj 	register struct bufhd *bp;
5591Sbill 
562325Swnj 	for (bp = bufhash, i = 0; i < BUFHSZ; i++, bp++)
572325Swnj 		bp->b_forw = bp->b_back = (struct buf *)bp;
5891Sbill }
5991Sbill 
608Sbill /* #define	DISKMON	1 */
618Sbill 
628Sbill #ifdef	DISKMON
638Sbill struct {
648Sbill 	int	nbuf;
658Sbill 	long	nread;
668Sbill 	long	nreada;
678Sbill 	long	ncache;
688Sbill 	long	nwrite;
692771Swnj 	long	bufcount[64];
708Sbill } io_info;
718Sbill #endif
728Sbill 
738Sbill /*
748Sbill  * Swap IO headers -
758Sbill  * They contain the necessary information for the swap I/O.
768Sbill  * At any given time, a swap header can be in three
778Sbill  * different lists. When free it is in the free list,
788Sbill  * when allocated and the I/O queued, it is on the swap
798Sbill  * device list, and finally, if the operation was a dirty
808Sbill  * page push, when the I/O completes, it is inserted
818Sbill  * in a list of cleaned pages to be processed by the pageout daemon.
828Sbill  */
832771Swnj struct	buf *swbuf;
842771Swnj short	*swsize;		/* CAN WE JUST USE B_BCOUNT? */
852771Swnj int	*swpf;
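/*
 * In terms of the code below: swap() takes a header from the free
 * list bswlist; for a dirty page push iodone() then moves it to the
 * cleaned list bclnlist, otherwise swap() returns it to bswlist
 * when the I/O completes.
 */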
868Sbill 
878Sbill 
882706Swnj #ifndef	UNFAST
898Sbill #define	notavail(bp) \
908Sbill { \
916156Ssam 	int x = spl6(); \
928Sbill 	(bp)->av_back->av_forw = (bp)->av_forw; \
938Sbill 	(bp)->av_forw->av_back = (bp)->av_back; \
948Sbill 	(bp)->b_flags |= B_BUSY; \
956156Ssam 	splx(x); \
968Sbill }
978Sbill #endif
988Sbill 
998Sbill /*
1008Sbill  * Read in (if necessary) the block and return a buffer pointer.
1018Sbill  */
1028Sbill struct buf *
103*6563Smckusic bread(dev, blkno, size)
104*6563Smckusic 	dev_t dev;
105*6563Smckusic 	daddr_t blkno;
106*6563Smckusic 	int size;
1078Sbill {
1088Sbill 	register struct buf *bp;
1098Sbill 
110*6563Smckusic 	bp = getblk(dev, blkno, size);
1118Sbill 	if (bp->b_flags&B_DONE) {
1123199Swnj #ifdef	TRACE
1133199Swnj 		trace(TR_BREADHIT, dev, blkno);
1142045Swnj #endif
1158Sbill #ifdef	DISKMON
1168Sbill 		io_info.ncache++;
1178Sbill #endif
1188Sbill 		return(bp);
1198Sbill 	}
1208Sbill 	bp->b_flags |= B_READ;
1218Sbill 	(*bdevsw[major(dev)].d_strategy)(bp);
1223199Swnj #ifdef	TRACE
1233199Swnj 	trace(TR_BREADMISS, dev, blkno);
1242045Swnj #endif
1258Sbill #ifdef	DISKMON
1268Sbill 	io_info.nread++;
1278Sbill #endif
1288Sbill 	u.u_vm.vm_inblk++;		/* pay for read */
1298Sbill 	iowait(bp);
1308Sbill 	return(bp);
1318Sbill }
1328Sbill 
1338Sbill /*
1348Sbill  * Read in the block, like bread, but also start I/O on the
1358Sbill  * read-ahead block (which is not allocated to the caller)
1368Sbill  */
1378Sbill struct buf *
138*6563Smckusic breada(dev, blkno, rablkno, size)
139*6563Smckusic 	dev_t dev;
140*6563Smckusic 	daddr_t blkno, rablkno;
141*6563Smckusic 	int size;
1428Sbill {
1438Sbill 	register struct buf *bp, *rabp;
1448Sbill 
1458Sbill 	bp = NULL;
1468Sbill 	if (!incore(dev, blkno)) {
147*6563Smckusic 		bp = getblk(dev, blkno, size);
1488Sbill 		if ((bp->b_flags&B_DONE) == 0) {
1498Sbill 			bp->b_flags |= B_READ;
1508Sbill 			(*bdevsw[major(dev)].d_strategy)(bp);
1513199Swnj #ifdef	TRACE
1523199Swnj 			trace(TR_BREADMISS, dev, blkno);
1532045Swnj #endif
1548Sbill #ifdef	DISKMON
1558Sbill 			io_info.nread++;
1568Sbill #endif
1578Sbill 			u.u_vm.vm_inblk++;		/* pay for read */
1588Sbill 		}
1593199Swnj #ifdef	TRACE
1602045Swnj 		else
1613199Swnj 			trace(TR_BREADHIT, dev, blkno);
1622045Swnj #endif
1638Sbill 	}
1648Sbill 	if (rablkno && !incore(dev, rablkno)) {
165*6563Smckusic 		rabp = getblk(dev, rablkno, size);
1662045Swnj 		if (rabp->b_flags & B_DONE) {
1678Sbill 			brelse(rabp);
1683199Swnj #ifdef	TRACE
1693199Swnj 			trace(TR_BREADHITRA, dev, rablkno);
1702045Swnj #endif
1712045Swnj 		} else {
1728Sbill 			rabp->b_flags |= B_READ|B_ASYNC;
1738Sbill 			(*bdevsw[major(dev)].d_strategy)(rabp);
1743199Swnj #ifdef	TRACE
1753199Swnj 			trace(TR_BREADMISSRA, dev, rablkno);
1762045Swnj #endif
1778Sbill #ifdef	DISKMON
1788Sbill 			io_info.nreada++;
1798Sbill #endif
1808Sbill 			u.u_vm.vm_inblk++;		/* pay in advance */
1818Sbill 		}
1828Sbill 	}
1838Sbill 	if (bp == NULL)
184*6563Smckusic 		return(bread(dev, blkno, size));
1858Sbill 	iowait(bp);
1868Sbill 	return(bp);
1878Sbill }
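
/*
 * For illustration only: read block bn and start read-ahead on
 * bn + 1, the common sequential-access pattern.  The block size is
 * hypothetical; the caller releases the buffer with brelse when done.
 */
#ifdef notdef
struct buf *
examplera(dev, bn)
	dev_t dev;
	daddr_t bn;
{

	return (breada(dev, bn, bn + 1, 1024));
}
#endif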
1888Sbill 
1898Sbill /*
1908Sbill  * Write the buffer, waiting for completion.
1918Sbill  * Then release the buffer.
1928Sbill  */
1938Sbill bwrite(bp)
1948Sbill register struct buf *bp;
1958Sbill {
1968Sbill 	register flag;
1978Sbill 
1988Sbill 	flag = bp->b_flags;
1998Sbill 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
2008Sbill #ifdef	DISKMON
2018Sbill 	io_info.nwrite++;
2028Sbill #endif
2038Sbill 	if ((flag&B_DELWRI) == 0)
2048Sbill 		u.u_vm.vm_oublk++;		/* no one paid yet */
2053199Swnj #ifdef	TRACE
2064033Swnj 	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
2072045Swnj #endif
2088Sbill 	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
2098Sbill 	if ((flag&B_ASYNC) == 0) {
2108Sbill 		iowait(bp);
2118Sbill 		brelse(bp);
2128Sbill 	} else if (flag & B_DELWRI)
2138Sbill 		bp->b_flags |= B_AGE;
2148Sbill 	else
2158Sbill 		geterror(bp);
2168Sbill }
2178Sbill 
2188Sbill /*
2198Sbill  * Release the buffer, marking it so that if it is grabbed
2208Sbill  * for another purpose it will be written out before being
2218Sbill  * given up (e.g. when writing a partial block where it is
2228Sbill  * assumed that another write for the same block will soon follow).
2238Sbill  * This can't be done for magtape, since writes must be done
2248Sbill  * in the same order as requested.
2258Sbill  */
2268Sbill bdwrite(bp)
2278Sbill register struct buf *bp;
2288Sbill {
2292403Skre 	register int flags;
2308Sbill 
2318Sbill 	if ((bp->b_flags&B_DELWRI) == 0)
2328Sbill 		u.u_vm.vm_oublk++;		/* no one paid yet */
2332403Skre 	flags = bdevsw[major(bp->b_dev)].d_flags;
2342403Skre 	if (flags & B_TAPE)
2358Sbill 		bawrite(bp);
2368Sbill 	else {
2378Sbill 		bp->b_flags |= B_DELWRI | B_DONE;
2388Sbill 		brelse(bp);
2398Sbill 	}
2408Sbill }
2418Sbill 
2428Sbill /*
2438Sbill  * Release the buffer, start I/O on it, but don't wait for completion.
2448Sbill  */
2458Sbill bawrite(bp)
2468Sbill register struct buf *bp;
2478Sbill {
2488Sbill 
2498Sbill 	bp->b_flags |= B_ASYNC;
2508Sbill 	bwrite(bp);
2518Sbill }
2528Sbill 
2538Sbill /*
2548Sbill  * release the buffer, with no I/O implied.
2558Sbill  */
2568Sbill brelse(bp)
2578Sbill register struct buf *bp;
2588Sbill {
2592325Swnj 	register struct buf *flist;
2608Sbill 	register s;
2618Sbill 
2628Sbill 	if (bp->b_flags&B_WANTED)
2638Sbill 		wakeup((caddr_t)bp);
2642325Swnj 	if (bfreelist[0].b_flags&B_WANTED) {
2652325Swnj 		bfreelist[0].b_flags &= ~B_WANTED;
2662325Swnj 		wakeup((caddr_t)bfreelist);
2678Sbill 	}
2682683Swnj 	if (bp->b_flags&B_ERROR)
2692683Swnj 		if (bp->b_flags & B_LOCKED)
2702683Swnj 			bp->b_flags &= ~B_ERROR;	/* try again later */
2712683Swnj 		else
2722683Swnj 			bp->b_dev = NODEV;  		/* no assoc */
2738Sbill 	s = spl6();
2742325Swnj 	if (bp->b_flags & (B_ERROR|B_INVAL)) {
2752325Swnj 		/* block has no info ... put at front of the most-free list */
2762325Swnj 		flist = &bfreelist[BQUEUES-1];
2772325Swnj 		flist->av_forw->av_back = bp;
2782325Swnj 		bp->av_forw = flist->av_forw;
2792325Swnj 		flist->av_forw = bp;
2802325Swnj 		bp->av_back = flist;
2818Sbill 	} else {
2822325Swnj 		if (bp->b_flags & B_LOCKED)
2832325Swnj 			flist = &bfreelist[BQ_LOCKED];
2842325Swnj 		else if (bp->b_flags & B_AGE)
2852325Swnj 			flist = &bfreelist[BQ_AGE];
2862325Swnj 		else
2872325Swnj 			flist = &bfreelist[BQ_LRU];
2882325Swnj 		flist->av_back->av_forw = bp;
2892325Swnj 		bp->av_back = flist->av_back;
2902325Swnj 		flist->av_back = bp;
2912325Swnj 		bp->av_forw = flist;
2928Sbill 	}
2938Sbill 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
2948Sbill 	splx(s);
2958Sbill }
2968Sbill 
2978Sbill /*
2988Sbill  * See if the block is associated with some buffer
2998Sbill  * (mainly to avoid getting hung up on a wait in breada)
3008Sbill  */
3018Sbill incore(dev, blkno)
3028Sbill dev_t dev;
3038Sbill daddr_t blkno;
3048Sbill {
3058Sbill 	register struct buf *bp;
3062325Swnj 	register struct buf *dp;
3078Sbill 
308*6563Smckusic 	dp = BUFHASH(dev, blkno);
3092325Swnj 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
310*6563Smckusic 		if (bp->b_blkno == blkno && bp->b_dev == dev &&
3112325Swnj 		    !(bp->b_flags & B_INVAL))
31291Sbill 			return (1);
31391Sbill 	return (0);
3148Sbill }
3158Sbill 
3168Sbill struct buf *
317*6563Smckusic baddr(dev, blkno, size)
318*6563Smckusic 	dev_t dev;
319*6563Smckusic 	daddr_t blkno;
320*6563Smckusic 	int size;
3218Sbill {
3228Sbill 
3238Sbill 	if (incore(dev, blkno))
324*6563Smckusic 		return (bread(dev, blkno, size));
3258Sbill 	return (0);
3268Sbill }
3278Sbill 
3288Sbill /*
3298Sbill  * Assign a buffer for the given block.  If the appropriate
3308Sbill  * block is already associated, return it; otherwise search
3318Sbill  * for the oldest non-busy buffer and reassign it.
3325424Swnj  *
3335424Swnj  * We use splx here because this routine may be called
3345424Swnj  * on the interrupt stack during a dump, and we don't
3355424Swnj  * want to lower the ipl back to 0.
3368Sbill  */
3378Sbill struct buf *
338*6563Smckusic getblk(dev, blkno, size)
339*6563Smckusic 	dev_t dev;
340*6563Smckusic 	daddr_t blkno;
341*6563Smckusic 	int size;
3428Sbill {
34391Sbill 	register struct buf *bp, *dp, *ep;
3442423Skre #ifdef	DISKMON
3452423Skre 	register int i;
3462423Skre #endif
3475424Swnj 	int s;
3488Sbill 
3491831Sbill 	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
3501831Sbill 		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
351*6563Smckusic 	dp = BUFHASH(dev, blkno);
3528Sbill     loop:
3532325Swnj 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
354*6563Smckusic 		if (bp->b_blkno != blkno || bp->b_dev != dev ||
3552325Swnj 		    bp->b_flags&B_INVAL)
3568Sbill 			continue;
3575424Swnj 		s = spl6();
3588Sbill 		if (bp->b_flags&B_BUSY) {
3598Sbill 			bp->b_flags |= B_WANTED;
3608Sbill 			sleep((caddr_t)bp, PRIBIO+1);
3615424Swnj 			splx(s);
3628Sbill 			goto loop;
3638Sbill 		}
3645424Swnj 		splx(s);
3658Sbill #ifdef	DISKMON
3668Sbill 		i = 0;
3678Sbill 		dp = bp->av_forw;
3682325Swnj 		while ((dp->b_flags & B_HEAD) == 0) {
3698Sbill 			i++;
3708Sbill 			dp = dp->av_forw;
3718Sbill 		}
3722771Swnj 		if (i<64)
3738Sbill 			io_info.bufcount[i]++;
3748Sbill #endif
3758Sbill 		notavail(bp);
376*6563Smckusic 		brealloc(bp, size);
3778Sbill 		bp->b_flags |= B_CACHE;
3788Sbill 		return(bp);
3798Sbill 	}
38091Sbill 	if (major(dev) >= nblkdev)
38191Sbill 		panic("blkdev");
3825424Swnj 	s = spl6();
3832325Swnj 	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
3842325Swnj 		if (ep->av_forw != ep)
3852325Swnj 			break;
3862325Swnj 	if (ep == bfreelist) {		/* no free blocks at all */
3872325Swnj 		ep->b_flags |= B_WANTED;
3882325Swnj 		sleep((caddr_t)ep, PRIBIO+1);
3895424Swnj 		splx(s);
3908Sbill 		goto loop;
3918Sbill 	}
3925424Swnj 	splx(s);
3932325Swnj 	bp = ep->av_forw;
3948Sbill 	notavail(bp);
3958Sbill 	if (bp->b_flags & B_DELWRI) {
3968Sbill 		bp->b_flags |= B_ASYNC;
3978Sbill 		bwrite(bp);
3988Sbill 		goto loop;
3998Sbill 	}
4003199Swnj #ifdef TRACE
4014033Swnj 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
4022045Swnj #endif
4038Sbill 	bp->b_flags = B_BUSY;
404*6563Smckusic 	bfree(bp);
4058Sbill 	bp->b_back->b_forw = bp->b_forw;
4068Sbill 	bp->b_forw->b_back = bp->b_back;
4078Sbill 	bp->b_forw = dp->b_forw;
4088Sbill 	bp->b_back = dp;
4098Sbill 	dp->b_forw->b_back = bp;
4108Sbill 	dp->b_forw = bp;
4118Sbill 	bp->b_dev = dev;
412*6563Smckusic 	bp->b_blkno = blkno;
413*6563Smckusic 	brealloc(bp, size);
4148Sbill 	return(bp);
4158Sbill }
4168Sbill 
4178Sbill /*
4188Sbill  * get an empty block,
4198Sbill  * not assigned to any particular device
4208Sbill  */
4218Sbill struct buf *
422*6563Smckusic geteblk(size)
423*6563Smckusic 	int size;
4248Sbill {
425182Sbill 	register struct buf *bp, *dp;
4265431Sroot 	int s;
4278Sbill 
4288Sbill loop:
4295431Sroot 	s = spl6();
4302325Swnj 	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
4312325Swnj 		if (dp->av_forw != dp)
4322325Swnj 			break;
4332325Swnj 	if (dp == bfreelist) {		/* no free blocks */
4342325Swnj 		dp->b_flags |= B_WANTED;
4352325Swnj 		sleep((caddr_t)dp, PRIBIO+1);
4362325Swnj 		goto loop;
4378Sbill 	}
4385431Sroot 	splx(s);
4392325Swnj 	bp = dp->av_forw;
4408Sbill 	notavail(bp);
4418Sbill 	if (bp->b_flags & B_DELWRI) {
4428Sbill 		bp->b_flags |= B_ASYNC;
4438Sbill 		bwrite(bp);
4448Sbill 		goto loop;
4458Sbill 	}
4463199Swnj #ifdef TRACE
4474033Swnj 	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
4482045Swnj #endif
4492325Swnj 	bp->b_flags = B_BUSY|B_INVAL;
4508Sbill 	bp->b_back->b_forw = bp->b_forw;
4518Sbill 	bp->b_forw->b_back = bp->b_back;
4528Sbill 	bp->b_forw = dp->b_forw;
4538Sbill 	bp->b_back = dp;
4548Sbill 	dp->b_forw->b_back = bp;
4558Sbill 	dp->b_forw = bp;
4568Sbill 	bp->b_dev = (dev_t)NODEV;
457*6563Smckusic 	bp->b_bcount = size;
4588Sbill 	return(bp);
4598Sbill }
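
/*
 * For illustration only: allocate a scratch buffer not tied to any
 * device, zero it, use it, and release it.  The size is hypothetical.
 */
#ifdef notdef
examplescratch()
{
	register struct buf *bp;

	bp = geteblk(1024);
	clrbuf(bp);
	/* ... use bp->b_un.b_addr ... */
	brelse(bp);
}
#endif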
4608Sbill 
4618Sbill /*
462*6563Smckusic  * Allocate space associated with a buffer.
463*6563Smckusic  */
464*6563Smckusic brealloc(bp, size)
465*6563Smckusic 	register struct buf *bp;
466*6563Smckusic 	int size;
467*6563Smckusic {
468*6563Smckusic 	daddr_t start, last;
469*6563Smckusic 	register struct buf *ep;
470*6563Smckusic 	struct buf *dp;
471*6563Smckusic 	int s;
472*6563Smckusic 
473*6563Smckusic 	/*
474*6563Smckusic 	 * First we need to make sure that any overlapping
475*6563Smckusic 	 * previous I/O is disposed of.
476*6563Smckusic 	 */
477*6563Smckusic 	if (size == bp->b_bcount)
478*6563Smckusic 		return;
479*6563Smckusic 	if (size < bp->b_bcount) {
480*6563Smckusic 		bp->b_bcount = size;
481*6563Smckusic 		return;
482*6563Smckusic 	}
483*6563Smckusic 	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
484*6563Smckusic 	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
485*6563Smckusic 	if (bp->b_bcount == 0) {
486*6563Smckusic 		start++;
487*6563Smckusic 		if (start == last)
488*6563Smckusic 			goto allocit;
489*6563Smckusic 	}
490*6563Smckusic 	dp = BUFHASH(bp->b_dev, bp->b_blkno);
491*6563Smckusic loop:
492*6563Smckusic 	(void) spl0();
493*6563Smckusic 	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
494*6563Smckusic 		if (ep->b_blkno < start || ep->b_blkno > last ||
495*6563Smckusic 		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
496*6563Smckusic 			continue;
497*6563Smckusic 		s = spl6();
498*6563Smckusic 		if (ep->b_flags&B_BUSY) {
499*6563Smckusic 			ep->b_flags |= B_WANTED;
500*6563Smckusic 			sleep((caddr_t)ep, PRIBIO+1);
501*6563Smckusic 			splx(s);
502*6563Smckusic 			goto loop;
503*6563Smckusic 		}
504*6563Smckusic 		(void) spl0();
505*6563Smckusic 		/*
506*6563Smckusic 		 * What we would really like to do is kill this
507*6563Smckusic 		 * I/O since it is now useless.  We cannot do that,
508*6563Smckusic 		 * so we force it to complete so that it cannot
509*6563Smckusic 		 * overwrite our useful data later.
510*6563Smckusic 		 */
511*6563Smckusic 		if (ep->b_flags & B_DELWRI) {
512*6563Smckusic 			notavail(ep);
513*6563Smckusic 			ep->b_flags |= B_ASYNC;
514*6563Smckusic 			bwrite(ep);
515*6563Smckusic 			goto loop;
516*6563Smckusic 		}
517*6563Smckusic 	}
518*6563Smckusic allocit:
519*6563Smckusic 	/*
520*6563Smckusic 	 * Here the buffer is already available, so all we
521*6563Smckusic 	 * need to do is set the size. Someday a better memory
522*6563Smckusic 	 * management scheme will be implemented.
523*6563Smckusic 	 */
524*6563Smckusic 	bp->b_bcount = size;
525*6563Smckusic }
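
/*
 * For example, with illustrative values: growing a buffer at
 * b_blkno 32 from b_bcount 1024 to size 2048, with DEV_BSIZE 512,
 * gives start = 32 + 1024/512 = 34 and last = 32 + 2048/512 - 1 = 35;
 * any other buffer of the same device claiming block 34 or 35 is
 * slept on or flushed in the loop above before the new size is set.
 */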
526*6563Smckusic 
527*6563Smckusic /*
528*6563Smckusic  * Release space associated with a buffer.
529*6563Smckusic  */
530*6563Smckusic bfree(bp)
531*6563Smckusic 	struct buf *bp;
532*6563Smckusic {
533*6563Smckusic 	/*
534*6563Smckusic 	 * Here no space is actually released, so all we
535*6563Smckusic 	 * need to do is clear the count.  Someday a better memory
536*6563Smckusic 	 * management scheme will be implemented.
537*6563Smckusic 	 */
538*6563Smckusic 	bp->b_bcount = 0;
539*6563Smckusic }
540*6563Smckusic 
541*6563Smckusic /*
5428Sbill  * Wait for I/O completion on the buffer; return errors
5438Sbill  * to the user.
5448Sbill  */
5458Sbill iowait(bp)
546*6563Smckusic 	register struct buf *bp;
5478Sbill {
5485431Sroot 	int s;
5498Sbill 
5505431Sroot 	s = spl6();
5518Sbill 	while ((bp->b_flags&B_DONE)==0)
5528Sbill 		sleep((caddr_t)bp, PRIBIO);
5535431Sroot 	splx(s);
5548Sbill 	geterror(bp);
5558Sbill }
5568Sbill 
5572706Swnj #ifdef UNFAST
5588Sbill /*
5598Sbill  * Unlink a buffer from the available list and mark it busy.
5608Sbill  * (internal interface)
5618Sbill  */
5628Sbill notavail(bp)
5638Sbill register struct buf *bp;
5648Sbill {
5658Sbill 	register s;
5668Sbill 
5678Sbill 	s = spl6();
5688Sbill 	bp->av_back->av_forw = bp->av_forw;
5698Sbill 	bp->av_forw->av_back = bp->av_back;
5708Sbill 	bp->b_flags |= B_BUSY;
5718Sbill 	splx(s);
5728Sbill }
5738Sbill #endif
5748Sbill 
5758Sbill /*
5768Sbill  * Mark I/O complete on a buffer. If the header
5778Sbill  * indicates a dirty page push completion, the
5788Sbill  * header is inserted into the ``cleaned'' list
5798Sbill  * to be processed by the pageout daemon. Otherwise
5808Sbill  * release it if I/O is asynchronous, and wake
5818Sbill  * up anyone waiting for it.
5828Sbill  */
5838Sbill iodone(bp)
5848Sbill register struct buf *bp;
5858Sbill {
5868Sbill 	register int s;
5878Sbill 
588420Sbill 	if (bp->b_flags & B_DONE)
589420Sbill 		panic("dup iodone");
5908Sbill 	bp->b_flags |= B_DONE;
5918Sbill 	if (bp->b_flags & B_DIRTY) {
5928Sbill 		if (bp->b_flags & B_ERROR)
5938Sbill 			panic("IO err in push");
5948Sbill 		s = spl6();
5958Sbill 		bp->av_forw = bclnlist;
5968Sbill 		bp->b_bcount = swsize[bp - swbuf];
5978Sbill 		bp->b_pfcent = swpf[bp - swbuf];
5983601Swnj 		cnt.v_pgout++;
5993601Swnj 		cnt.v_pgpgout += bp->b_bcount / NBPG;
6008Sbill 		bclnlist = bp;
6018Sbill 		if (bswlist.b_flags & B_WANTED)
6028Sbill 			wakeup((caddr_t)&proc[2]);
6038Sbill 		splx(s);
604383Sbill 		return;
6058Sbill 	}
6068Sbill 	if (bp->b_flags&B_ASYNC)
6078Sbill 		brelse(bp);
6088Sbill 	else {
6098Sbill 		bp->b_flags &= ~B_WANTED;
6108Sbill 		wakeup((caddr_t)bp);
6118Sbill 	}
6128Sbill }
6138Sbill 
6148Sbill /*
6158Sbill  * Zero the core associated with a buffer.
6168Sbill  */
6178Sbill clrbuf(bp)
618*6563Smckusic 	struct buf *bp;
6198Sbill {
620*6563Smckusic 	register int *p;
621*6563Smckusic 	register int c;
6228Sbill 
6238Sbill 	p = bp->b_un.b_words;
624*6563Smckusic 	c = bp->b_bcount/sizeof(int);
6258Sbill 	do
6268Sbill 		*p++ = 0;
6278Sbill 	while (--c);
6288Sbill 	bp->b_resid = 0;
6298Sbill }
6308Sbill 
6318Sbill /*
6328Sbill  * swap I/O -
6338Sbill  *
6348Sbill  * If the flag indicates a dirty page push initiated
6358Sbill  * by the pageout daemon, we map the page into the i'th
6368Sbill  * virtual page of process 2 (the daemon itself) where i is
6378Sbill  * the index of the swap header that has been allocated.
6388Sbill  * We simply initialize the header and queue the I/O but
6398Sbill  * do not wait for completion. When the I/O completes,
6408Sbill  * iodone() will link the header to a list of cleaned
6418Sbill  * pages to be processed by the pageout daemon.
6428Sbill  */
6438Sbill swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
6448Sbill 	struct proc *p;
6458Sbill 	swblk_t dblkno;
6468Sbill 	caddr_t addr;
6478Sbill 	int flag, nbytes;
6488Sbill 	dev_t dev;
6498Sbill 	unsigned pfcent;
6508Sbill {
6518Sbill 	register struct buf *bp;
6528Sbill 	register int c;
6538Sbill 	int p2dp;
6548Sbill 	register struct pte *dpte, *vpte;
6555431Sroot 	int s;
6568Sbill 
6575431Sroot 	s = spl6();
6588Sbill 	while (bswlist.av_forw == NULL) {
6598Sbill 		bswlist.b_flags |= B_WANTED;
6608Sbill 		sleep((caddr_t)&bswlist, PSWP+1);
6618Sbill 	}
6628Sbill 	bp = bswlist.av_forw;
6638Sbill 	bswlist.av_forw = bp->av_forw;
6645431Sroot 	splx(s);
6658Sbill 
6668Sbill 	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
6678Sbill 	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
6688Sbill 		if (rdflg == B_READ)
6698Sbill 			sum.v_pswpin += btoc(nbytes);
6708Sbill 		else
6718Sbill 			sum.v_pswpout += btoc(nbytes);
6728Sbill 	bp->b_proc = p;
6738Sbill 	if (flag & B_DIRTY) {
6748Sbill 		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
6758Sbill 		dpte = dptopte(&proc[2], p2dp);
6768Sbill 		vpte = vtopte(p, btop(addr));
6778Sbill 		for (c = 0; c < nbytes; c += NBPG) {
6788Sbill 			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
6798Sbill 				panic("swap bad pte");
6808Sbill 			*dpte++ = *vpte++;
6818Sbill 		}
6828Sbill 		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
6838Sbill 	} else
6848Sbill 		bp->b_un.b_addr = addr;
6858Sbill 	while (nbytes > 0) {
6868Sbill 		c = imin(ctob(120), nbytes);
6878Sbill 		bp->b_bcount = c;
6888Sbill 		bp->b_blkno = dblkno;
6898Sbill 		bp->b_dev = dev;
690718Sbill 		if (flag & B_DIRTY) {
691718Sbill 			swpf[bp - swbuf] = pfcent;
692718Sbill 			swsize[bp - swbuf] = nbytes;
693718Sbill 		}
6944033Swnj #ifdef TRACE
6954033Swnj 		trace(TR_SWAPIO, dev, bp->b_blkno);
6964033Swnj #endif
6978Sbill 		(*bdevsw[major(dev)].d_strategy)(bp);
6988Sbill 		if (flag & B_DIRTY) {
6998Sbill 			if (c < nbytes)
7008Sbill 				panic("big push");
7018Sbill 			return;
7028Sbill 		}
7035431Sroot 		s = spl6();
7048Sbill 		while ((bp->b_flags&B_DONE)==0)
7058Sbill 			sleep((caddr_t)bp, PSWP);
7065431Sroot 		splx(s);
7078Sbill 		bp->b_un.b_addr += c;
7088Sbill 		bp->b_flags &= ~B_DONE;
7098Sbill 		if (bp->b_flags & B_ERROR) {
7108Sbill 			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
7118Sbill 				panic("hard IO err in swap");
7128Sbill 			swkill(p, (char *)0);
7138Sbill 		}
7148Sbill 		nbytes -= c;
7158Sbill 		dblkno += btoc(c);
7168Sbill 	}
7175431Sroot 	s = spl6();
7188Sbill 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
7198Sbill 	bp->av_forw = bswlist.av_forw;
7208Sbill 	bswlist.av_forw = bp;
7218Sbill 	if (bswlist.b_flags & B_WANTED) {
7228Sbill 		bswlist.b_flags &= ~B_WANTED;
7238Sbill 		wakeup((caddr_t)&bswlist);
7248Sbill 		wakeup((caddr_t)&proc[2]);
7258Sbill 	}
7265431Sroot 	splx(s);
7278Sbill }
7288Sbill 
7298Sbill /*
7308Sbill  * If rout == 0 then killed on swap error, else
7318Sbill  * rout is the name of the routine where we ran out of
7328Sbill  * swap space.
7338Sbill  */
7348Sbill swkill(p, rout)
7358Sbill 	struct proc *p;
7368Sbill 	char *rout;
7378Sbill {
7382922Swnj 	char *mesg;
7398Sbill 
7402922Swnj 	printf("pid %d: ", p->p_pid);
7418Sbill 	if (rout)
7422922Swnj 		printf(mesg = "killed due to no swap space\n");
7438Sbill 	else
7442922Swnj 		printf(mesg = "killed on swap error\n");
7452922Swnj 	uprintf("sorry, pid %d was %s", p->p_pid, mesg);
7468Sbill 	/*
7478Sbill 	 * To be sure it doesn't loop (e.g. in vmsched trying to
7488Sbill 	 * swap out), mark the process locked in core (as though
7498Sbill 	 * done by the user) after killing it so no one will try
7508Sbill 	 * to swap it out.
7518Sbill 	 */
752165Sbill 	psignal(p, SIGKILL);
7538Sbill 	p->p_flag |= SULOCK;
7548Sbill }
7558Sbill 
7568Sbill /*
7578Sbill  * make sure all write-behind blocks
7588Sbill  * on dev (or NODEV for all)
7598Sbill  * are flushed out.
7608Sbill  * (from umount and update)
761*6563Smckusic  * (and temporarily pagein)
7628Sbill  */
7638Sbill bflush(dev)
7648Sbill dev_t dev;
7658Sbill {
7668Sbill 	register struct buf *bp;
7672325Swnj 	register struct buf *flist;
7685431Sroot 	int s;
7698Sbill 
7708Sbill loop:
7715431Sroot 	s = spl6();
7722325Swnj 	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
7732325Swnj 	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
7748Sbill 		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
7758Sbill 			bp->b_flags |= B_ASYNC;
7768Sbill 			notavail(bp);
7778Sbill 			bwrite(bp);
7788Sbill 			goto loop;
7798Sbill 		}
7808Sbill 	}
7815431Sroot 	splx(s);
7828Sbill }
7838Sbill 
7848Sbill /*
7858Sbill  * Raw I/O. The arguments are
7868Sbill  *	The strategy routine for the device
7878Sbill  *	A buffer, which will always be a special buffer
7888Sbill  *	  header owned exclusively by the device for this purpose
7898Sbill  *	The device number
7908Sbill  *	Read/write flag
7918Sbill  * Essentially all the work is computing physical addresses and
7928Sbill  * validating them.
7938Sbill  * If the user has the proper access privileges, the process is
7948Sbill  * marked 'delayed unlock' and the pages involved in the I/O are
7958Sbill  * faulted and locked. After the completion of the I/O, the above pages
7968Sbill  * are unlocked.
7978Sbill  */
7988Sbill physio(strat, bp, dev, rw, mincnt)
7998Sbill int (*strat)();
8008Sbill register struct buf *bp;
8018Sbill unsigned (*mincnt)();
8028Sbill {
8038Sbill 	register int c;
8048Sbill 	char *a;
8055431Sroot 	int s;
8068Sbill 
8078Sbill 	if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
8088Sbill 		u.u_error = EFAULT;
8098Sbill 		return;
8108Sbill 	}
8115431Sroot 	s = spl6();
8128Sbill 	while (bp->b_flags&B_BUSY) {
8138Sbill 		bp->b_flags |= B_WANTED;
8148Sbill 		sleep((caddr_t)bp, PRIBIO+1);
8158Sbill 	}
8166319Swnj 	splx(s);
8178Sbill 	bp->b_error = 0;
8188Sbill 	bp->b_proc = u.u_procp;
8198Sbill 	bp->b_un.b_addr = u.u_base;
8203667Swnj 	while (u.u_count != 0) {
8218Sbill 		bp->b_flags = B_BUSY | B_PHYS | rw;
8228Sbill 		bp->b_dev = dev;
8238Sbill 		bp->b_blkno = u.u_offset >> PGSHIFT;
8248Sbill 		bp->b_bcount = u.u_count;
8258Sbill 		(*mincnt)(bp);
8268Sbill 		c = bp->b_bcount;
8278Sbill 		u.u_procp->p_flag |= SPHYSIO;
8288Sbill 		vslock(a = bp->b_un.b_addr, c);
8298Sbill 		(*strat)(bp);
830124Sbill 		(void) spl6();
8318Sbill 		while ((bp->b_flags&B_DONE) == 0)
8328Sbill 			sleep((caddr_t)bp, PRIBIO);
8338Sbill 		vsunlock(a, c, rw);
8348Sbill 		u.u_procp->p_flag &= ~SPHYSIO;
8358Sbill 		if (bp->b_flags&B_WANTED)
8368Sbill 			wakeup((caddr_t)bp);
8375431Sroot 		splx(s);
8388Sbill 		bp->b_un.b_addr += c;
8398Sbill 		u.u_count -= c;
8408Sbill 		u.u_offset += c;
8413667Swnj 		if (bp->b_flags&B_ERROR)
8423667Swnj 			break;
8438Sbill 	}
8448Sbill 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
8458Sbill 	u.u_count = bp->b_resid;
8468Sbill 	geterror(bp);
8478Sbill }
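
/*
 * For illustration only: a character device read entry built on
 * physio.  The names xxread, xxstrategy and xxbuf are hypothetical;
 * minphys is the default transfer limiter defined below.
 */
#ifdef notdef
int	xxstrategy();
struct	buf xxbuf;
unsigned minphys();

xxread(dev)
	dev_t dev;
{

	physio(xxstrategy, &xxbuf, dev, B_READ, minphys);
}
#endif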
8488Sbill 
8498Sbill /*ARGSUSED*/
8508Sbill unsigned
8518Sbill minphys(bp)
8528Sbill struct buf *bp;
8538Sbill {
8548Sbill 
8556379Swnj 	if (bp->b_bcount > 63 * 1024)
8566379Swnj 		bp->b_bcount = 63 * 1024;
8578Sbill }
8588Sbill 
859*6563Smckusic 
8608Sbill /*
8618Sbill  * Pick up the device's error number and pass it to the user;
8628Sbill  * if there is an error but the number is 0 set a generalized
8638Sbill  * code.  Actually the latter is always true because devices
8648Sbill  * don't yet return specific errors.
8658Sbill  */
8668Sbill geterror(bp)
8678Sbill register struct buf *bp;
8688Sbill {
8698Sbill 
8708Sbill 	if (bp->b_flags&B_ERROR)
8718Sbill 		if ((u.u_error = bp->b_error)==0)
8728Sbill 			u.u_error = EIO;
8738Sbill }
8742299Skre 
8752299Skre /*
8762299Skre  * Invalidate in-core blocks belonging to a closed or unmounted filesystem
8772299Skre  *
8782299Skre  * This is not nicely done at all - the buffer ought to be removed from the
8792299Skre  * hash chains & have its dev/blkno fields clobbered, but unfortunately we
8802299Skre  * can't do that here, as it is quite possible that the block is still
8812299Skre  * being used for i/o. Eventually, all disc drivers should be forced to
8822299Skre  * have a close routine, which ought to ensure that the queue is empty, then
8832299Skre  * properly flush the queues. Until that happy day, this suffices for
8842299Skre  * correctness.						... kre
8852299Skre  */
8862299Skre binval(dev)
8872299Skre dev_t dev;
8882299Skre {
8892361Skre 	register struct buf *bp;
8902361Skre 	register struct bufhd *hp;
8912361Skre #define dp ((struct buf *)hp)
8922299Skre 
8932361Skre 	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
8942361Skre 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
8952361Skre 			if (bp->b_dev == dev)
8962361Skre 				bp->b_flags |= B_INVAL;
8972299Skre }
898