xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 91)
/*	vfs_cluster.c	3.2	10/14/12	*/

int	distrust = 1;		/* TEST */
#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 */

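/*
 * An illustrative sketch of typical use, assuming a hypothetical
 * caller that has dev and blkno in hand (this is not a routine in
 * this file, and the error handling is a guess at common usage):
 *
 *	struct buf *bp;
 *
 *	bp = bread(dev, blkno);
 *	if ((bp->b_flags & B_ERROR) == 0) {
 *		... modify the data at bp->b_un.b_addr ...
 *		bdwrite(bp);	release; the write may be delayed
 *	} else
 *		brelse(bp);	release with no I/O implied
 */
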
#define	BUFHSZ	63
#define	BUFHASH(blkno)	((blkno) % BUFHSZ)
short	bufhash[BUFHSZ];

/*
 * Initialize hash links for buffers.
 */
bhinit()
{
	register int i;

	for (i = 0; i < BUFHSZ; i++)
		bufhash[i] = -1;
}
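
/*
 * A sketch of the hash convention used below: bufhash[h] holds the
 * index into buf[] of the first buffer on chain h, each buffer's
 * b_hlink holds the index of the next, and -1 terminates a chain
 * (hence the comparisons against &buf[-1]).  A chain walk therefore
 * looks like:
 *
 *	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
 *	    bp = &buf[bp->b_hlink])
 *		if (bp->b_blkno == dblkno && bp->b_dev == dev)
 *			...
 */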

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[NBUF];
} io_info;
#endif

/*
 * Swap I/O headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be on one of three
 * different lists.  When free it is on the free list;
 * when allocated and the I/O queued, it is on the swap
 * device list; and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * on a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf swbuf[NSWBUF];
short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
int	swpf[NSWBUF];

#ifdef	FASTVAX
#define	notavail(bp) \
{ \
	int s = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(s); \
}
#endif

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno)
dev_t dev;
daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;		/* pay for read */
		}
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}
	if (bp == NULL)
		return(bread(dev, blkno));
	iowait(bp);
	return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
	register struct buf *dp;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	dp = bdevsw[major(bp->b_dev)].d_tab;
	if (dp->b_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}
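
/*
 * A sketch of the partial-block case the comment above describes
 * (hypothetical caller, not a routine in this file): when only part
 * of a block is overwritten, the block is first read so the untouched
 * bytes survive, then released with a delayed write in the
 * expectation that a neighboring write will soon follow.
 *
 *	bp = bread(dev, blkno);
 *	... copy the user's bytes into part of bp->b_un.b_addr ...
 *	bdwrite(bp);
 */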

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
register struct buf *bp;
{
	register struct buf **backp;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist.b_flags&B_WANTED) {
		bfreelist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bfreelist);
	}
	if ((bp->b_flags&B_ERROR) && bp->b_dev != NODEV) {
		bunhash(bp);
		bp->b_dev = NODEV;  /* no assoc. on error */
	}
	s = spl6();
	if (bp->b_flags & (B_AGE|B_ERROR)) {
		backp = &bfreelist.av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = &bfreelist;
	} else {
		backp = &bfreelist.av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = &bfreelist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/* HASHING IS A GUN-LIKE CHANGE; THIS IS THE SAFETY */
struct buf *
oincore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;
	register int dblkno = fsbtodb(blkno);

	dp = bdevsw[major(dev)].d_tab;
	for (bp=dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno==dblkno && bp->b_dev==dev &&
		    bp >= buf && bp < &buf[NBUF])
			return (bp);
	return ((struct buf *)0);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register int dblkno = fsbtodb(blkno);

	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink])
		if (bp->b_blkno == dblkno && bp->b_dev == dev) {
			if (distrust && oincore(dev, blkno) != bp)	/* TEST */
				panic("incore 1");			/* TEST */
			return (1);
		}
	if (distrust && oincore(dev, blkno))				/* TEST */
		panic("incore 2");					/* TEST */
	return (0);
}

struct buf *
baddr(dev, blkno)
dev_t dev;
daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp, *dp, *ep;
	register int i, x;
	register int dblkno = fsbtodb(blkno);

    loop:
	VOID spl0();
	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink]) {
		if (bp->b_blkno != dblkno || bp->b_dev != dev)
			continue;
		if (distrust && bp != oincore(dev, blkno))	/* TEST */
			panic("getblk 1");			/* TEST */
		VOID spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			goto loop;
		}
		VOID spl0();
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while (dp != &bfreelist) {
			i++;
			dp = dp->av_forw;
		}
		if (i<NBUF)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (distrust && oincore(dev, blkno))		/* TEST */
		panic("getblk 2");			/* TEST */
	if (major(dev) >= nblkdev)
		panic("blkdev");
	dp = bdevsw[major(dev)].d_tab;
	if (dp == NULL)
		panic("devtab");
	VOID spl6();
	if (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
		goto loop;
	}
	spl0();
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev == NODEV)
		goto done;
	/* INLINE EXPANSION OF bunhash(bp) */
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
	} else {
		for (ep = &buf[bufhash[i]]; ep != &buf[-1];
		    ep = &buf[ep->b_hlink])
			if (ep->b_hlink == x) {
				ep->b_hlink = bp->b_hlink;
				goto done;
			}
		panic("getblk");
	}
done:
	/* END INLINE EXPANSION */
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	i = BUFHASH(blkno);
	bp->b_hlink = bufhash[i];
	bufhash[i] = bp - buf;
	return(bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk()
{
	register struct buf *bp, *dp;

loop:
	VOID spl6();
	while (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
	}
	VOID spl0();
	dp = &bfreelist;
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev != NODEV)
		bunhash(bp);
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	bp->b_hlink = -1;
	return(bp);
}

bunhash(bp)
	register struct buf *bp;
{
	register struct buf *ep;
	register int i, x;

	if (bp->b_dev == NODEV)
		return;
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
		return;
	}
	for (ep = &buf[bufhash[i]]; ep != &buf[-1];
	    ep = &buf[ep->b_hlink])
		if (ep->b_hlink == x) {
			ep->b_hlink = bp->b_hlink;
			return;
		}
	panic("bunhash");
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
register struct buf *bp;
{

	VOID spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	VOID spl0();
	geterror(bp);
}

#ifndef FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
	register int s;

	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		cnt.v_pgout++;
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
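
/*
 * Sketch of the expected call site (hypothetical driver code, not in
 * this file; the names xxintr and xxtab are placeholders): a block
 * device's interrupt routine hands the finished buffer to iodone(),
 * which releases it or awakens the waiter as appropriate.
 *
 *	xxintr()
 *	{
 *		register struct buf *bp = xxtab.b_actf;
 *
 *		... retire the transfer, set B_ERROR on failure ...
 *		iodone(bp);
 *	}
 */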

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i'th
 * virtual page of process 2 (the daemon itself), where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
	struct proc *p;
	swblk_t dblkno;
	caddr_t addr;
	int rdflg, flag, nbytes;
	dev_t dev;
	unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;

	VOID spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	VOID spl0();

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
	while (nbytes > 0) {
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		if (dev == swapdev)
			bp->b_blkno += swplo;
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			swsize[bp - swbuf] = nbytes;
			swpf[bp - swbuf] = pfcent;
			return;
		}
		VOID spl6();
		while ((bp->b_flags&B_DONE)==0)
			sleep((caddr_t)bp, PSWP);
		VOID spl0();
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	VOID spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	VOID spl0();
}

/*
 * If rout == 0 the process was killed on a swap error; otherwise
 * rout is the name of the routine where we ran out of swap space.
 */
swkill(p, rout)
	struct proc *p;
	char *rout;
{

	printf("%d: ", p->p_pid);
	if (rout)
		printf("out of swap space in %s\n", rout);
	else
		printf("killed on swap error\n");
	/*
	 * To prevent looping (e.g. in vmsched trying to
	 * swap out), mark the process locked in core (as though
	 * done by the user) after killing it, so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKIL);
	p->p_flag |= SULOCK;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
dev_t dev;
{
	register struct buf *bp;

loop:
	VOID spl6();
	for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	VOID spl0();
}

/*
 * Raw I/O.  The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above
 * pages are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
int (*strat)();
register struct buf *bp;
dev_t dev;
int rw;
unsigned (*mincnt)();
{
	register int c;
	char *a;

	if (useracc(u.u_base, u.u_count, rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	VOID spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0 && bp->b_error==0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		(*mincnt)(bp);
		c = bp->b_bcount;
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		VOID spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		VOID spl0();
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}

/*ARGSUSED*/
unsigned
minphys(bp)
struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}
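
/*
 * Sketch of the expected call site (hypothetical driver code, not in
 * this file; xxread, xxstrategy, and rxxbuf are placeholder names):
 * a device's raw-read entry typically passes its strategy routine, a
 * private buffer header, and minphys (or its own size-clamping
 * routine) to physio.
 *
 *	xxread(dev)
 *	dev_t dev;
 *	{
 *		physio(xxstrategy, &rxxbuf, dev, B_READ, minphys);
 *	}
 */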

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error)==0)
			u.u_error = EIO;
}