xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 2683)
1*2683Swnj /*	vfs_cluster.c	4.12	02/25/81	*/
28Sbill 
38Sbill #include "../h/param.h"
48Sbill #include "../h/systm.h"
58Sbill #include "../h/dir.h"
68Sbill #include "../h/user.h"
78Sbill #include "../h/buf.h"
88Sbill #include "../h/conf.h"
98Sbill #include "../h/proc.h"
108Sbill #include "../h/seg.h"
118Sbill #include "../h/pte.h"
128Sbill #include "../h/vm.h"
132045Swnj #include "../h/trace.h"
148Sbill 
1591Sbill /*
1691Sbill  * The following several routines allocate and free
1791Sbill  * buffers with various side effects.  In general the
1891Sbill  * arguments to an allocate routine are a device and
1991Sbill  * a block number, and the value is a pointer to
2091Sbill  * the buffer header; the buffer is marked "busy"
2191Sbill  * so that no one else can touch it.  If the block was
2291Sbill  * already in core, no I/O need be done; if it is
2391Sbill  * already busy, the process waits until it becomes free.
2491Sbill  * The following routines allocate a buffer:
2591Sbill  *	getblk
2691Sbill  *	bread
2791Sbill  *	breada
2891Sbill  *	baddr	(if it is incore)
2991Sbill  * Eventually the buffer must be released, possibly with the
3091Sbill  * side effect of writing it out, by using one of
3191Sbill  *	bwrite
3291Sbill  *	bdwrite
3391Sbill  *	bawrite
3491Sbill  *	brelse
3591Sbill  */
3691Sbill 
/*
 * Buffer hash: a (dev, disk-block) pair hashes to one of BUFHSZ
 * doubly-linked chains headed by bufhash[].  The cast is safe only
 * because struct bufhd's b_forw/b_back overlay struct buf's.
 */
3791Sbill #define	BUFHSZ	63
382325Swnj struct	bufhd bufhash[BUFHSZ];
392328Swnj #define	BUFHASH(dev, dblkno)	\
402328Swnj 		((struct buf *)&bufhash[((int)(dev)+(int)(dblkno)) % BUFHSZ])
4191Sbill 
4291Sbill /*
4391Sbill  * Initialize hash links for buffers.
4491Sbill  */
4591Sbill bhinit()
4691Sbill {
4791Sbill 	register int i;
482325Swnj 	register struct bufhd *bp;
4991Sbill 
	/* each chain starts empty: the header points back at itself */
502325Swnj 	for (bp = bufhash, i = 0; i < BUFHSZ; i++, bp++)
512325Swnj 		bp->b_forw = bp->b_back = (struct buf *)bp;
5291Sbill }
5391Sbill 
548Sbill /* #define	DISKMON	1 */
558Sbill 
568Sbill #ifdef	DISKMON
/*
 * Buffer-cache instrumentation, compiled in only with DISKMON:
 * counts reads, read-aheads, cache hits, writes, and a histogram
 * (bufcount) of how deep in the free list each hit was found.
 */
578Sbill struct {
588Sbill 	int	nbuf;
598Sbill 	long	nread;
608Sbill 	long	nreada;
618Sbill 	long	ncache;
628Sbill 	long	nwrite;
638Sbill 	long	bufcount[NBUF];
648Sbill } io_info;
658Sbill #endif
668Sbill 
678Sbill /*
688Sbill  * Swap IO headers -
698Sbill  * They contain the necessary information for the swap I/O.
708Sbill  * At any given time, a swap header can be in three
718Sbill  * different lists. When free it is in the free list,
728Sbill  * when allocated and the I/O queued, it is on the swap
738Sbill  * device list, and finally, if the operation was a dirty
748Sbill  * page push, when the I/O completes, it is inserted
758Sbill  * in a list of cleaned pages to be processed by the pageout daemon.
768Sbill  */
778Sbill struct	buf swbuf[NSWBUF];
788Sbill short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
/* swpf[i]: page-frame/cluster info saved per swap header for dirty
 * page pushes; handed back to the pageout daemon via iodone() */
798Sbill int	swpf[NSWBUF];
808Sbill 
818Sbill 
828Sbill #ifdef	FASTVAX
/*
 * Inline (macro) version of notavail() for the VAX: unlink bp from
 * its available list and mark it busy, at elevated priority.  Must
 * stay in lockstep with the function version compiled #ifndef FASTVAX.
 */
838Sbill #define	notavail(bp) \
848Sbill { \
858Sbill 	int s = spl6(); \
868Sbill 	(bp)->av_back->av_forw = (bp)->av_forw; \
878Sbill 	(bp)->av_forw->av_back = (bp)->av_back; \
888Sbill 	(bp)->b_flags |= B_BUSY; \
898Sbill 	splx(s); \
908Sbill }
918Sbill #endif
928Sbill 
938Sbill /*
948Sbill  * Read in (if necessary) the block and return a buffer pointer.
958Sbill  */
968Sbill struct buf *
978Sbill bread(dev, blkno)
988Sbill dev_t dev;
998Sbill daddr_t blkno;
1008Sbill {
1018Sbill 	register struct buf *bp;
1028Sbill 
1038Sbill 	bp = getblk(dev, blkno);
	/* cache hit: buffer contents already valid, no I/O needed */
1048Sbill 	if (bp->b_flags&B_DONE) {
1052045Swnj #ifdef	EPAWNJ
1062045Swnj 		trace(TR_BREAD|TR_HIT, dev, blkno);
1072045Swnj #endif
1088Sbill #ifdef	DISKMON
1098Sbill 		io_info.ncache++;
1108Sbill #endif
1118Sbill 		return(bp);
1128Sbill 	}
	/* cache miss: queue the read and wait for completion */
1138Sbill 	bp->b_flags |= B_READ;
1148Sbill 	bp->b_bcount = BSIZE;
1158Sbill 	(*bdevsw[major(dev)].d_strategy)(bp);
1162045Swnj #ifdef	EPAWNJ
1172045Swnj 	trace(TR_BREAD|TR_MISS, dev, blkno);
1182045Swnj #endif
1198Sbill #ifdef	DISKMON
1208Sbill 	io_info.nread++;
1218Sbill #endif
1228Sbill 	u.u_vm.vm_inblk++;		/* pay for read */
1238Sbill 	iowait(bp);
1248Sbill 	return(bp);
1258Sbill }
1268Sbill 
1278Sbill /*
1288Sbill  * Read in the block, like bread, but also start I/O on the
1298Sbill  * read-ahead block (which is not allocated to the caller)
1308Sbill  */
1318Sbill struct buf *
1328Sbill breada(dev, blkno, rablkno)
1338Sbill dev_t dev;
1348Sbill daddr_t blkno, rablkno;
1358Sbill {
1368Sbill 	register struct buf *bp, *rabp;
1378Sbill 
1388Sbill 	bp = NULL;
	/* start the requested block only if not already cached */
1398Sbill 	if (!incore(dev, blkno)) {
1408Sbill 		bp = getblk(dev, blkno);
1418Sbill 		if ((bp->b_flags&B_DONE) == 0) {
1428Sbill 			bp->b_flags |= B_READ;
1438Sbill 			bp->b_bcount = BSIZE;
1448Sbill 			(*bdevsw[major(dev)].d_strategy)(bp);
1452045Swnj #ifdef	EPAWNJ
1462045Swnj 			trace(TR_BREAD|TR_MISS, dev, blkno);
1472045Swnj #endif
1488Sbill #ifdef	DISKMON
1498Sbill 			io_info.nread++;
1508Sbill #endif
1518Sbill 			u.u_vm.vm_inblk++;		/* pay for read */
1528Sbill 		}
1532045Swnj #ifdef	EPAWNJ
1542045Swnj 		else
1552045Swnj 			trace(TR_BREAD|TR_HIT, dev, blkno);
1562045Swnj #endif
1578Sbill 	}
	/* fire off the read-ahead block asynchronously; it is not
	 * handed to the caller, merely primed into the cache */
1588Sbill 	if (rablkno && !incore(dev, rablkno)) {
1598Sbill 		rabp = getblk(dev, rablkno);
1602045Swnj 		if (rabp->b_flags & B_DONE) {
1618Sbill 			brelse(rabp);
1622045Swnj #ifdef	EPAWNJ
			/* fix: trace the read-ahead block, not blkno */
			trace(TR_BREAD|TR_HIT|TR_RA, dev, rablkno);
1642045Swnj #endif
1652045Swnj 		} else {
1668Sbill 			rabp->b_flags |= B_READ|B_ASYNC;
1678Sbill 			rabp->b_bcount = BSIZE;
1688Sbill 			(*bdevsw[major(dev)].d_strategy)(rabp);
1692045Swnj #ifdef	EPAWNJ
			/* fix: was "rablock", an undefined identifier */
			trace(TR_BREAD|TR_MISS|TR_RA, dev, rablkno);
1712045Swnj #endif
1728Sbill #ifdef	DISKMON
1738Sbill 			io_info.nreada++;
1748Sbill #endif
1758Sbill 			u.u_vm.vm_inblk++;		/* pay in advance */
1768Sbill 		}
1778Sbill 	}
	/* requested block was already in core: do a plain bread */
1788Sbill 	if(bp == NULL)
1798Sbill 		return(bread(dev, blkno));
1808Sbill 	iowait(bp);
1818Sbill 	return(bp);
1828Sbill }
1838Sbill 
1848Sbill /*
1858Sbill  * Write the buffer, waiting for completion.
1868Sbill  * Then release the buffer.
1878Sbill  */
1888Sbill bwrite(bp)
1898Sbill register struct buf *bp;
1908Sbill {
1918Sbill 	register flag;
1928Sbill 
	/* snapshot the flags; B_ASYNC/B_DELWRI decide the tail behavior */
1938Sbill 	flag = bp->b_flags;
1948Sbill 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
1958Sbill 	bp->b_bcount = BSIZE;
1968Sbill #ifdef	DISKMON
1978Sbill 	io_info.nwrite++;
1988Sbill #endif
1998Sbill 	if ((flag&B_DELWRI) == 0)
2008Sbill 		u.u_vm.vm_oublk++;		/* noone paid yet */
2012045Swnj #ifdef	EPAWNJ
2022045Swnj 	trace(TR_BWRITE, bp->b_dev, dbtofsb(bp->b_blkno));
2032045Swnj #endif
2048Sbill 	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	/* synchronous: wait and release.  Async delayed-write: age the
	 * buffer for quick reuse.  Plain async: only harvest any error
	 * already posted (I/O may still be in flight). */
2058Sbill 	if ((flag&B_ASYNC) == 0) {
2068Sbill 		iowait(bp);
2078Sbill 		brelse(bp);
2088Sbill 	} else if (flag & B_DELWRI)
2098Sbill 		bp->b_flags |= B_AGE;
2108Sbill 	else
2118Sbill 		geterror(bp);
2128Sbill }
2138Sbill 
2148Sbill /*
2158Sbill  * Release the buffer, marking it so that if it is grabbed
2168Sbill  * for another purpose it will be written out before being
2178Sbill  * given up (e.g. when writing a partial block where it is
2188Sbill  * assumed that another write for the same block will soon follow).
2198Sbill  * This can't be done for magtape, since writes must be done
2208Sbill  * in the same order as requested.
2218Sbill  */
2228Sbill bdwrite(bp)
2238Sbill register struct buf *bp;
2248Sbill {
2252403Skre 	register int flags;
2268Sbill 
2278Sbill 	if ((bp->b_flags&B_DELWRI) == 0)
2288Sbill 		u.u_vm.vm_oublk++;		/* noone paid yet */
	/* tape devices cannot reorder writes: force the write out now
	 * instead of marking it delayed */
2292403Skre 	flags = bdevsw[major(bp->b_dev)].d_flags;
2302403Skre 	if(flags & B_TAPE)
2318Sbill 		bawrite(bp);
2328Sbill 	else {
2338Sbill 		bp->b_flags |= B_DELWRI | B_DONE;
2348Sbill 		brelse(bp);
2358Sbill 	}
2368Sbill }
2378Sbill 
2388Sbill /*
2398Sbill  * Release the buffer, start I/O on it, but don't wait for completion.
2408Sbill  */
2418Sbill bawrite(bp)
2428Sbill register struct buf *bp;
2438Sbill {
2448Sbill 
	/* asynchronous write: bwrite() queues the I/O and returns
	 * without waiting; the buffer is released at iodone() time */
2458Sbill 	bp->b_flags |= B_ASYNC;
2468Sbill 	bwrite(bp);
2478Sbill }
2488Sbill 
2498Sbill /*
2508Sbill  * release the buffer, with no I/O implied.
2518Sbill  */
2528Sbill brelse(bp)
2538Sbill register struct buf *bp;
2548Sbill {
2552325Swnj 	register struct buf *flist;
2568Sbill 	register s;
2578Sbill 
	/* wake anyone waiting for this specific buffer, and anyone
	 * waiting for any buffer at all (bfreelist[0] carries that flag) */
2588Sbill 	if (bp->b_flags&B_WANTED)
2598Sbill 		wakeup((caddr_t)bp);
2602325Swnj 	if (bfreelist[0].b_flags&B_WANTED) {
2612325Swnj 		bfreelist[0].b_flags &= ~B_WANTED;
2622325Swnj 		wakeup((caddr_t)bfreelist);
2638Sbill 	}
	/* on error: locked buffers get another chance later; others
	 * lose their device association so the data is not trusted */
264*2683Swnj 	if (bp->b_flags&B_ERROR)
265*2683Swnj 		if (bp->b_flags & B_LOCKED)
266*2683Swnj 			bp->b_flags &= ~B_ERROR;	/* try again later */
267*2683Swnj 		else
268*2683Swnj 			bp->b_dev = NODEV;  		/* no assoc */
2698Sbill 	s = spl6();
2702325Swnj 	if (bp->b_flags & (B_ERROR|B_INVAL)) {
2712325Swnj 		/* block has no info ... put at front of most free list */
2722325Swnj 		flist = &bfreelist[BQUEUES-1];
2732325Swnj 		flist->av_forw->av_back = bp;
2742325Swnj 		bp->av_forw = flist->av_forw;
2752325Swnj 		flist->av_forw = bp;
2762325Swnj 		bp->av_back = flist;
2778Sbill 	} else {
		/* valid data: queue at the tail of the appropriate list
		 * (locked, aged, or normal LRU) */
2782325Swnj 		if (bp->b_flags & B_LOCKED)
2792325Swnj 			flist = &bfreelist[BQ_LOCKED];
2802325Swnj 		else if (bp->b_flags & B_AGE)
2812325Swnj 			flist = &bfreelist[BQ_AGE];
2822325Swnj 		else
2832325Swnj 			flist = &bfreelist[BQ_LRU];
2842325Swnj 		flist->av_back->av_forw = bp;
2852325Swnj 		bp->av_back = flist->av_back;
2862325Swnj 		flist->av_back = bp;
2872325Swnj 		bp->av_forw = flist;
2888Sbill 	}
2898Sbill 	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
2908Sbill 	splx(s);
2918Sbill }
2928Sbill 
2938Sbill /*
2948Sbill  * See if the block is associated with some buffer
2958Sbill  * (mainly to avoid getting hung up on a wait in breada)
2968Sbill  */
2978Sbill incore(dev, blkno)
2988Sbill dev_t dev;
2998Sbill daddr_t blkno;
3008Sbill {
3018Sbill 	register struct buf *bp;
3022325Swnj 	register struct buf *dp;
3038Sbill 	register int dblkno = fsbtodb(blkno);
3048Sbill 
	/* walk the hash chain; invalidated buffers do not count as
	 * resident even if dev/blkno still match */
3052328Swnj 	dp = BUFHASH(dev, dblkno);
3062325Swnj 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
3072325Swnj 		if (bp->b_blkno == dblkno && bp->b_dev == dev &&
3082325Swnj 		    !(bp->b_flags & B_INVAL))
30991Sbill 			return (1);
31091Sbill 	return (0);
3118Sbill }
3128Sbill 
3138Sbill struct buf *
3148Sbill baddr(dev, blkno)
3158Sbill dev_t dev;
3168Sbill daddr_t blkno;
3178Sbill {
3188Sbill 
	/* read the block only if it is already resident; otherwise
	 * return 0 without doing any I/O */
3198Sbill 	if (incore(dev, blkno))
3208Sbill 		return (bread(dev, blkno));
3218Sbill 	return (0);
3228Sbill }
3238Sbill 
3248Sbill /*
3258Sbill  * Assign a buffer for the given block.  If the appropriate
3268Sbill  * block is already associated, return it; otherwise search
3278Sbill  * for the oldest non-busy buffer and reassign it.
3288Sbill  */
3298Sbill struct buf *
3308Sbill getblk(dev, blkno)
3318Sbill dev_t dev;
3328Sbill daddr_t blkno;
3338Sbill {
33491Sbill 	register struct buf *bp, *dp, *ep;
3352325Swnj 	register int dblkno = fsbtodb(blkno);
3362423Skre #ifdef	DISKMON
3372423Skre 	register int i;
3382423Skre #endif
3398Sbill 
	/* clamp insane block numbers to a harmless out-of-range value
	 * (presumably guards page-table arithmetic — NOTE(review):
	 * confirm intent; the clamp value itself is also out of range) */
3401831Sbill 	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
3411831Sbill 		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
3421831Sbill 	dblkno = fsbtodb(blkno);
3432325Swnj 	dp = BUFHASH(dev, dblkno);
3448Sbill     loop:
345124Sbill 	(void) spl0();
	/* first: is the block already on its hash chain? */
3462325Swnj 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
3472325Swnj 		if (bp->b_blkno != dblkno || bp->b_dev != dev ||
3482325Swnj 		    bp->b_flags&B_INVAL)
3498Sbill 			continue;
350124Sbill 		(void) spl6();
		/* busy: wait for release, then rescan from the top */
3518Sbill 		if (bp->b_flags&B_BUSY) {
3528Sbill 			bp->b_flags |= B_WANTED;
3538Sbill 			sleep((caddr_t)bp, PRIBIO+1);
3548Sbill 			goto loop;
3558Sbill 		}
356124Sbill 		(void) spl0();
3578Sbill #ifdef	DISKMON
		/* histogram: depth of this hit in the available list
		 * (dp is clobbered here, but we return before reusing it) */
3588Sbill 		i = 0;
3598Sbill 		dp = bp->av_forw;
3602325Swnj 		while ((dp->b_flags & B_HEAD) == 0) {
3618Sbill 			i++;
3628Sbill 			dp = dp->av_forw;
3638Sbill 		}
3648Sbill 		if (i<NBUF)
3658Sbill 			io_info.bufcount[i]++;
3668Sbill #endif
3678Sbill 		notavail(bp);
3688Sbill 		bp->b_flags |= B_CACHE;
3698Sbill 		return(bp);
3708Sbill 	}
37191Sbill 	if (major(dev) >= nblkdev)
37291Sbill 		panic("blkdev");
	/* miss: steal the oldest buffer from the emptiest-priority
	 * free list (highest-index non-empty queue first) */
373124Sbill 	(void) spl6();
3742325Swnj 	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
3752325Swnj 		if (ep->av_forw != ep)
3762325Swnj 			break;
3772325Swnj 	if (ep == bfreelist) {		/* no free blocks at all */
3782325Swnj 		ep->b_flags |= B_WANTED;
3792325Swnj 		sleep((caddr_t)ep, PRIBIO+1);
3808Sbill 		goto loop;
3818Sbill 	}
3821792Sbill 	(void) spl0();
3832325Swnj 	bp = ep->av_forw;
3848Sbill 	notavail(bp);
	/* victim holds delayed-write data: push it out first and
	 * start the whole search over */
3858Sbill 	if (bp->b_flags & B_DELWRI) {
3868Sbill 		bp->b_flags |= B_ASYNC;
3878Sbill 		bwrite(bp);
3888Sbill 		goto loop;
3898Sbill 	}
3902045Swnj #ifdef EPAWNJ
3912045Swnj 	trace(TR_BRELSE, bp->b_dev, dbtofsb(bp->b_blkno));
3922045Swnj #endif
	/* re-key the buffer: unlink from its old hash chain, insert
	 * on the new one, and tag with the new dev/block */
3938Sbill 	bp->b_flags = B_BUSY;
3948Sbill 	bp->b_back->b_forw = bp->b_forw;
3958Sbill 	bp->b_forw->b_back = bp->b_back;
3968Sbill 	bp->b_forw = dp->b_forw;
3978Sbill 	bp->b_back = dp;
3988Sbill 	dp->b_forw->b_back = bp;
3998Sbill 	dp->b_forw = bp;
4008Sbill 	bp->b_dev = dev;
4018Sbill 	bp->b_blkno = dblkno;
4028Sbill 	return(bp);
4038Sbill }
4048Sbill 
4058Sbill /*
4068Sbill  * get an empty block,
4078Sbill  * not assigned to any particular device
4088Sbill  */
4098Sbill struct buf *
4108Sbill geteblk()
4118Sbill {
412182Sbill 	register struct buf *bp, *dp;
4138Sbill 
4148Sbill loop:
415124Sbill 	(void) spl6();
	/* take a victim from the highest-index non-empty free list,
	 * same policy as getblk() */
4162325Swnj 	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
4172325Swnj 		if (dp->av_forw != dp)
4182325Swnj 			break;
4192325Swnj 	if (dp == bfreelist) {		/* no free blocks */
4202325Swnj 		dp->b_flags |= B_WANTED;
4212325Swnj 		sleep((caddr_t)dp, PRIBIO+1);
4222325Swnj 		goto loop;
4238Sbill 	}
424124Sbill 	(void) spl0();
4252325Swnj 	bp = dp->av_forw;
4268Sbill 	notavail(bp);
4278Sbill 	if (bp->b_flags & B_DELWRI) {
4288Sbill 		bp->b_flags |= B_ASYNC;
4298Sbill 		bwrite(bp);
4308Sbill 		goto loop;
4318Sbill 	}
4322045Swnj #ifdef EPAWNJ
4332325Swnj 	trace(TR_BRELSE, bp->b_dev, dbtofsb(bp->b_blkno));
4342045Swnj #endif
	/* mark invalid (no device association) and park the buffer on
	 * the free-list header's b_forw chain rather than a hash chain;
	 * B_INVAL keeps lookups from ever matching it */
4352325Swnj 	bp->b_flags = B_BUSY|B_INVAL;
4368Sbill 	bp->b_back->b_forw = bp->b_forw;
4378Sbill 	bp->b_forw->b_back = bp->b_back;
4388Sbill 	bp->b_forw = dp->b_forw;
4398Sbill 	bp->b_back = dp;
4408Sbill 	dp->b_forw->b_back = bp;
4418Sbill 	dp->b_forw = bp;
4428Sbill 	bp->b_dev = (dev_t)NODEV;
4438Sbill 	return(bp);
4448Sbill }
4458Sbill 
4468Sbill /*
4478Sbill  * Wait for I/O completion on the buffer; return errors
4488Sbill  * to the user.
4498Sbill  */
4508Sbill iowait(bp)
4518Sbill register struct buf *bp;
4528Sbill {
4538Sbill 
	/* sleep until iodone() sets B_DONE, then propagate any device
	 * error into u.u_error */
454124Sbill 	(void) spl6();
4558Sbill 	while ((bp->b_flags&B_DONE)==0)
4568Sbill 		sleep((caddr_t)bp, PRIBIO);
457124Sbill 	(void) spl0();
4588Sbill 	geterror(bp);
4598Sbill }
4608Sbill 
4618Sbill #ifndef FASTVAX
4628Sbill /*
4638Sbill  * Unlink a buffer from the available list and mark it busy.
4648Sbill  * (internal interface)
4658Sbill  */
/* function form of the FASTVAX macro above; the two must match */
4668Sbill notavail(bp)
4678Sbill register struct buf *bp;
4688Sbill {
4698Sbill 	register s;
4708Sbill 
4718Sbill 	s = spl6();
4728Sbill 	bp->av_back->av_forw = bp->av_forw;
4738Sbill 	bp->av_forw->av_back = bp->av_back;
4748Sbill 	bp->b_flags |= B_BUSY;
4758Sbill 	splx(s);
4768Sbill }
4778Sbill #endif
4788Sbill 
4798Sbill /*
4808Sbill  * Mark I/O complete on a buffer. If the header
4818Sbill  * indicates a dirty page push completion, the
4828Sbill  * header is inserted into the ``cleaned'' list
4838Sbill  * to be processed by the pageout daemon. Otherwise
4848Sbill  * release it if I/O is asynchronous, and wake
4858Sbill  * up anyone waiting for it.
4868Sbill  */
4878Sbill iodone(bp)
4888Sbill register struct buf *bp;
4898Sbill {
4908Sbill 	register int s;
4918Sbill 
4928Sbill 	if (bp->b_flags & B_DONE)
4938Sbill 		panic("dup iodone");
4948Sbill 	bp->b_flags |= B_DONE;
	/* dirty page push completion: restore the saved size and page
	 * frame info, chain onto the cleaned list, and poke the pageout
	 * daemon (proc[2]) if it is waiting */
4958Sbill 	if (bp->b_flags & B_DIRTY) {
4968Sbill 		if (bp->b_flags & B_ERROR)
4978Sbill 			panic("IO err in push");
4988Sbill 		s = spl6();
4998Sbill 		cnt.v_pgout++;
5008Sbill 		bp->av_forw = bclnlist;
5018Sbill 		bp->b_bcount = swsize[bp - swbuf];
5028Sbill 		bp->b_pfcent = swpf[bp - swbuf];
5038Sbill 		bclnlist = bp;
5048Sbill 		if (bswlist.b_flags & B_WANTED)
5058Sbill 			wakeup((caddr_t)&proc[2]);
5068Sbill 		splx(s);
5078Sbill 		return;
5088Sbill 	}
	/* ordinary I/O: release async buffers, wake sync waiters */
5098Sbill 	if (bp->b_flags&B_ASYNC)
5108Sbill 		brelse(bp);
5118Sbill 	else {
5128Sbill 		bp->b_flags &= ~B_WANTED;
5138Sbill 		wakeup((caddr_t)bp);
5148Sbill 	}
5158Sbill }
5168Sbill 
5178Sbill /*
5188Sbill  * Zero the core associated with a buffer.
5198Sbill  */
5208Sbill clrbuf(bp)
5218Sbill struct buf *bp;
5228Sbill {
5238Sbill 	register *p;
5248Sbill 	register c;
5258Sbill 
	/* zero the buffer one int at a time (BSIZE is a multiple of
	 * sizeof(int)) and mark the transfer fully complete */
5268Sbill 	p = bp->b_un.b_words;
5278Sbill 	c = BSIZE/sizeof(int);
5288Sbill 	do
5298Sbill 		*p++ = 0;
5308Sbill 	while (--c);
5318Sbill 	bp->b_resid = 0;
5328Sbill }
5338Sbill 
5348Sbill /*
5358Sbill  * swap I/O -
5368Sbill  *
5378Sbill  * If the flag indicates a dirty page push initiated
5388Sbill  * by the pageout daemon, we map the page into the i th
5398Sbill  * virtual page of process 2 (the daemon itself) where i is
5408Sbill  * the index of the swap header that has been allocated.
5418Sbill  * We simply initialize the header and queue the I/O but
5428Sbill  * do not wait for completion. When the I/O completes,
5438Sbill  * iodone() will link the header to a list of cleaned
5448Sbill  * pages to be processed by the pageout daemon.
5458Sbill  */
5468Sbill swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
5478Sbill 	struct proc *p;
5488Sbill 	swblk_t dblkno;
5498Sbill 	caddr_t addr;
5508Sbill 	int flag, nbytes;
5518Sbill 	dev_t dev;
5528Sbill 	unsigned pfcent;
5538Sbill {
5548Sbill 	register struct buf *bp;
5558Sbill 	register int c;
5568Sbill 	int p2dp;
5578Sbill 	register struct pte *dpte, *vpte;
5588Sbill 
	/* grab a free swap header, sleeping until one is available */
559124Sbill 	(void) spl6();
5608Sbill 	while (bswlist.av_forw == NULL) {
5618Sbill 		bswlist.b_flags |= B_WANTED;
5628Sbill 		sleep((caddr_t)&bswlist, PSWP+1);
5638Sbill 	}
5648Sbill 	bp = bswlist.av_forw;
5658Sbill 	bswlist.av_forw = bp->av_forw;
566124Sbill 	(void) spl0();
5678Sbill 
5688Sbill 	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
5698Sbill 	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
5708Sbill 		if (rdflg == B_READ)
5718Sbill 			sum.v_pswpin += btoc(nbytes);
5728Sbill 		else
5738Sbill 			sum.v_pswpout += btoc(nbytes);
5748Sbill 	bp->b_proc = p;
	/* dirty page push: double-map the page(s) into proc[2]'s (the
	 * pageout daemon's) address space at a slot derived from this
	 * swap header's index, so the I/O address is valid after the
	 * victim process's map changes */
5758Sbill 	if (flag & B_DIRTY) {
5768Sbill 		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
5778Sbill 		dpte = dptopte(&proc[2], p2dp);
5788Sbill 		vpte = vtopte(p, btop(addr));
5798Sbill 		for (c = 0; c < nbytes; c += NBPG) {
5808Sbill 			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
5818Sbill 				panic("swap bad pte");
5828Sbill 			*dpte++ = *vpte++;
5838Sbill 		}
5848Sbill 		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
5858Sbill 	} else
5868Sbill 		bp->b_un.b_addr = addr;
	/* transfer in chunks of at most 120 clicks; dirty pushes must
	 * fit in one chunk and return without waiting (iodone() will
	 * queue the header on the cleaned list) */
5878Sbill 	while (nbytes > 0) {
5888Sbill 		c = imin(ctob(120), nbytes);
5898Sbill 		bp->b_bcount = c;
5908Sbill 		bp->b_blkno = dblkno;
5918Sbill 		bp->b_dev = dev;
592718Sbill 		if (flag & B_DIRTY) {
593718Sbill 			swpf[bp - swbuf] = pfcent;
594718Sbill 			swsize[bp - swbuf] = nbytes;
595718Sbill 		}
5968Sbill 		(*bdevsw[major(dev)].d_strategy)(bp);
5978Sbill 		if (flag & B_DIRTY) {
5988Sbill 			if (c < nbytes)
5998Sbill 				panic("big push");
6008Sbill 			return;
6018Sbill 		}
602124Sbill 		(void) spl6();
6038Sbill 		while((bp->b_flags&B_DONE)==0)
6048Sbill 			sleep((caddr_t)bp, PSWP);
605124Sbill 		(void) spl0();
6068Sbill 		bp->b_un.b_addr += c;
6078Sbill 		bp->b_flags &= ~B_DONE;
		/* swap-in errors on plain pages are survivable: kill the
		 * process; u-area/page-table or swap-out errors are not */
6088Sbill 		if (bp->b_flags & B_ERROR) {
6098Sbill 			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
6108Sbill 				panic("hard IO err in swap");
6118Sbill 			swkill(p, (char *)0);
6128Sbill 		}
6138Sbill 		nbytes -= c;
6148Sbill 		dblkno += btoc(c);
6158Sbill 	}
	/* done: return the swap header to the free list and wake both
	 * header waiters and the pageout daemon */
616124Sbill 	(void) spl6();
6178Sbill 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
6188Sbill 	bp->av_forw = bswlist.av_forw;
6198Sbill 	bswlist.av_forw = bp;
6208Sbill 	if (bswlist.b_flags & B_WANTED) {
6218Sbill 		bswlist.b_flags &= ~B_WANTED;
6228Sbill 		wakeup((caddr_t)&bswlist);
6238Sbill 		wakeup((caddr_t)&proc[2]);
6248Sbill 	}
625124Sbill 	(void) spl0();
6268Sbill }
6278Sbill 
6288Sbill /*
6298Sbill  * If rout == 0 then killed on swap error, else
6308Sbill  * rout is the name of the routine where we ran out of
6318Sbill  * swap space.
6328Sbill  */
6338Sbill swkill(p, rout)
6348Sbill 	struct proc *p;
6358Sbill 	char *rout;
6368Sbill {
6378Sbill 
6388Sbill 	printf("%d: ", p->p_pid);
6398Sbill 	if (rout)
6408Sbill 		printf("out of swap space in %s\n", rout);
6418Sbill 	else
6428Sbill 		printf("killed on swap error\n");
6438Sbill 	/*
6448Sbill 	 * To be sure no looping (e.g. in vmsched trying to
6458Sbill 	 * swap out) mark process locked in core (as though
6468Sbill 	 * done by user) after killing it so noone will try
6478Sbill 	 * to swap it out.
6488Sbill 	 */
6498Sbill 	psignal(p, SIGKILL);
6508Sbill 	p->p_flag |= SULOCK;
6518Sbill }
6528Sbill 
6538Sbill /*
6548Sbill  * make sure all write-behind blocks
6558Sbill  * on dev (or NODEV for all)
6568Sbill  * are flushed out.
6578Sbill  * (from umount and update)
6588Sbill  */
6598Sbill bflush(dev)
6608Sbill dev_t dev;
6618Sbill {
6628Sbill 	register struct buf *bp;
6632325Swnj 	register struct buf *flist;
6648Sbill 
6658Sbill loop:
666124Sbill 	(void) spl6();
	/* scan every free list for delayed-write buffers on dev; each
	 * bwrite() drops the priority and may reshuffle the lists, so
	 * restart the scan from scratch after each push */
6678Sbill 	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
6682325Swnj 	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
6698Sbill 		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
6708Sbill 			bp->b_flags |= B_ASYNC;
6718Sbill 			notavail(bp);
6728Sbill 			bwrite(bp);
6738Sbill 			goto loop;
6748Sbill 		}
6758Sbill 	}
676124Sbill 	(void) spl0();
6778Sbill }
6788Sbill 
6798Sbill /*
6808Sbill  * Raw I/O. The arguments are
6818Sbill  *	The strategy routine for the device
6828Sbill  *	A buffer, which will always be a special buffer
6838Sbill  *	  header owned exclusively by the device for this purpose
6848Sbill  *	The device number
6858Sbill  *	Read/write flag
6868Sbill  * Essentially all the work is computing physical addresses and
6878Sbill  * validating them.
6888Sbill  * If the user has the proper access privileges, the process is
6898Sbill  * marked 'delayed unlock' and the pages involved in the I/O are
6908Sbill  * faulted and locked. After the completion of the I/O, the above pages
6918Sbill  * are unlocked.
6928Sbill  */
6938Sbill physio(strat, bp, dev, rw, mincnt)
6948Sbill int (*strat)();
6958Sbill register struct buf *bp;
/* NOTE(review): dev and rw have no declarations and so default to
 * int under K&R rules — confirm dev_t fits in int on this machine */
6968Sbill unsigned (*mincnt)();
6978Sbill {
6988Sbill 	register int c;
6998Sbill 	char *a;
7008Sbill 
	/* verify the user can access the whole transfer area in the
	 * opposite direction of the I/O (a read writes user memory) */
7018Sbill 	if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
7028Sbill 		u.u_error = EFAULT;
7038Sbill 		return;
7048Sbill 	}
705124Sbill 	(void) spl6();
7068Sbill 	while (bp->b_flags&B_BUSY) {
7078Sbill 		bp->b_flags |= B_WANTED;
7088Sbill 		sleep((caddr_t)bp, PRIBIO+1);
7098Sbill 	}
7108Sbill 	bp->b_error = 0;
7118Sbill 	bp->b_proc = u.u_procp;
7128Sbill 	bp->b_un.b_addr = u.u_base;
	/* transfer in mincnt-limited chunks: lock the user pages,
	 * start the I/O, wait, then unlock and advance */
7138Sbill 	while (u.u_count != 0 && bp->b_error==0) {
7148Sbill 		bp->b_flags = B_BUSY | B_PHYS | rw;
7158Sbill 		bp->b_dev = dev;
7168Sbill 		bp->b_blkno = u.u_offset >> PGSHIFT;
7178Sbill 		bp->b_bcount = u.u_count;
7188Sbill 		(*mincnt)(bp);
7198Sbill 		c = bp->b_bcount;
7208Sbill 		u.u_procp->p_flag |= SPHYSIO;
7218Sbill 		vslock(a = bp->b_un.b_addr, c);
7228Sbill 		(*strat)(bp);
723124Sbill 		(void) spl6();
7248Sbill 		while ((bp->b_flags&B_DONE) == 0)
7258Sbill 			sleep((caddr_t)bp, PRIBIO);
7268Sbill 		vsunlock(a, c, rw);
7278Sbill 		u.u_procp->p_flag &= ~SPHYSIO;
7288Sbill 		if (bp->b_flags&B_WANTED)
7298Sbill 			wakeup((caddr_t)bp);
730124Sbill 		(void) spl0();
7318Sbill 		bp->b_un.b_addr += c;
7328Sbill 		u.u_count -= c;
7338Sbill 		u.u_offset += c;
7348Sbill 	}
7358Sbill 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
7368Sbill 	u.u_count = bp->b_resid;
7378Sbill 	geterror(bp);
7388Sbill }
7398Sbill 
7408Sbill /*ARGSUSED*/
7418Sbill unsigned
7428Sbill minphys(bp)
7438Sbill struct buf *bp;
7448Sbill {
7458Sbill 
7468Sbill 	if (bp->b_bcount > 60 * 1024)
7478Sbill 		bp->b_bcount = 60 * 1024;
7488Sbill }
7498Sbill 
7508Sbill /*
7518Sbill  * Pick up the device's error number and pass it to the user;
7528Sbill  * if there is an error but the number is 0 set a generalized
7538Sbill  * code.  Actually the latter is always true because devices
7548Sbill  * don't yet return specific errors.
7558Sbill  */
7568Sbill geterror(bp)
7578Sbill register struct buf *bp;
7588Sbill {
7598Sbill 
	/* propagate a device error to the user; devices never set
	 * b_error yet, so EIO is the effective result */
7608Sbill 	if (bp->b_flags&B_ERROR)
7618Sbill 		if ((u.u_error = bp->b_error)==0)
7628Sbill 			u.u_error = EIO;
7638Sbill }
7642299Skre 
7652299Skre /*
7662299Skre  * Invalidate in core blocks belonging to closed or umounted filesystem
7672299Skre  *
7682299Skre  * This is not nicely done at all - the buffer ought to be removed from the
7692299Skre  * hash chains & have its dev/blkno fields clobbered, but unfortunately we
7702299Skre  * can't do that here, as it is quite possible that the block is still
7712299Skre  * being used for i/o. Eventually, all disc drivers should be forced to
7722299Skre  * have a close routine, which ought ensure that the queue is empty, then
7732299Skre  * properly flush the queues. Until that happy day, this suffices for
7742299Skre  * correctness.						... kre
7752299Skre  */
7762299Skre binval(dev)
7772299Skre dev_t dev;
7782299Skre {
7792361Skre 	register struct buf *bp;
7802361Skre 	register struct bufhd *hp;
7812361Skre #define dp ((struct buf *)hp)
7822299Skre 
	/* walk every hash chain and mark matching buffers B_INVAL so
	 * incore()/getblk() will never match them again; the buffers
	 * stay linked in case I/O is still in progress (see above) */
7832361Skre 	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
7842361Skre 		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
7852361Skre 			if (bp->b_dev == dev)
7862361Skre 				bp->b_flags |= B_INVAL;
7872299Skre }
788