/*	vfs_cluster.c	4.26	82/03/13	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 */
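
/*
 * Illustrative sketch only (not part of the system): the typical
 * life cycle of a cache buffer as seen by a caller; "dev" and
 * "blkno" are placeholder values.
 *
 *	bp = bread(dev, blkno);		read, or find in core; bp is busy
 *	... use the data at bp->b_un.b_addr ...
 *	brelse(bp);			release it for reuse by others
 *
 * A caller that has dirtied the buffer would instead finish with
 * bwrite (synchronous), bawrite (asynchronous) or bdwrite (delayed).
 */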

struct	buf bfreelist[BQUEUES];
struct	buf bswlist, *bclnlist;

#define	BUFHSZ	63
struct	bufhd bufhash[BUFHSZ];
#define	BUFHASH(dev, dblkno)	\
		((struct buf *)&bufhash[((int)(dev)+(int)(dblkno)) % BUFHSZ])
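
/*
 * BUFHASH folds the device number and the device block number
 * together and reduces the sum mod BUFHSZ, yielding the head of one
 * of the hash chains; a struct bufhd carries the b_forw/b_back links
 * (see bhinit below), so the cast lets a hash head stand in for a
 * buf at the start of its circular chain.  A lookup walks the chain,
 * as incore() and getblk() do below:
 *
 *	dp = BUFHASH(dev, dblkno);
 *	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
 *		... compare bp->b_dev and bp->b_blkno ...
 */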

/*
 * Initialize hash links for buffers.
 */
bhinit()
{
	register int i;
	register struct bufhd *bp;

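	/* point each hash header at itself: an empty circular queue */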
	for (bp = bufhash, i = 0; i < BUFHSZ; i++, bp++)
		bp->b_forw = bp->b_back = (struct buf *)bp;
}

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[64];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be on one of three
 * different lists. When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf *swbuf;
short	*swsize;		/* CAN WE JUST USE B_BCOUNT? */
int	*swpf;


#ifndef	UNFAST
#define	notavail(bp) \
{ \
	int x = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(x); \
}
#endif
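
/*
 * When UNFAST is not defined, notavail is expanded in line at each
 * call, presumably to avoid procedure call overhead; the equivalent
 * function version appears further down under #ifdef UNFAST with the
 * same semantics: pull bp off its free list at high ipl and mark it
 * busy.
 */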

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	TRACE
		trace(TR_BREADHIT, dev, blkno);
#endif
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	TRACE
	trace(TR_BREADMISS, dev, blkno);
#endif
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno)
dev_t dev;
daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	TRACE
			trace(TR_BREADMISS, dev, blkno);
#endif
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;		/* pay for read */
		}
#ifdef	TRACE
		else
			trace(TR_BREADHIT, dev, blkno);
#endif
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
#ifdef	TRACE
			trace(TR_BREADHITRA, dev, blkno);
#endif
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	TRACE
			trace(TR_BREADMISSRA, dev, rablkno);
#endif
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}
	if (bp == NULL)
		return(bread(dev, blkno));
	iowait(bp);
	return(bp);
}
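
/*
 * Illustrative sketch only: a caller reading sequentially would
 * typically name the next logical block as the read-ahead block,
 * e.g.
 *
 *	bp = breada(dev, bn, bn + 1);
 *
 * Here "bn + 1" is just a plausible placeholder for whatever block
 * the caller expects to want next; breada attaches no meaning to
 * rablkno beyond starting an asynchronous read on it.
 */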

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
#ifdef	TRACE
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
#endif
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}
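
/*
 * Note on the tail of bwrite: for a synchronous write we wait and
 * then release the buffer; for an asynchronous write of a delayed
 * block we only mark it B_AGE so brelse will age it for early reuse
 * (iodone releases asynchronous buffers); for other asynchronous
 * writes we just pick up any error already posted.  Errors reach the
 * caller through u.u_error (see geterror below).
 */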

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
register struct buf *bp;
{
	register struct buf *flist;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;  		/* no assoc */
	s = spl6();
	if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQUEUES-1];
		flist->av_forw->av_back = bp;
		bp->av_forw = flist->av_forw;
		flist->av_forw = bp;
		bp->av_back = flist;
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		flist->av_back->av_forw = bp;
		bp->av_back = flist->av_back;
		flist->av_back = bp;
		bp->av_forw = flist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}
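
/*
 * The effect of the above: buffers whose contents are worthless go
 * on the front of the "most free" queue (bfreelist[BQUEUES-1]),
 * where they will be claimed first, while useful buffers go on the
 * back of the locked, age, or LRU queue as appropriate, so each
 * queue stays ordered from least to most recently released.
 */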

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;
	register int dblkno = fsbtodb(blkno);

	dp = BUFHASH(dev, dblkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == dblkno && bp->b_dev == dev &&
		    !(bp->b_flags & B_INVAL))
			return (1);
	return (0);
}

struct buf *
baddr(dev, blkno)
dev_t dev;
daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp, *dp, *ep;
	register int dblkno = fsbtodb(blkno);
#ifdef	DISKMON
	register int i;
#endif
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	dblkno = fsbtodb(blkno);
	dp = BUFHASH(dev, dblkno);
    loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != dblkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while ((dp->b_flags & B_HEAD) == 0) {
			i++;
			dp = dp->av_forw;
		}
		if (i<64)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	s = spl6();
	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
		if (ep->av_forw != ep)
			break;
	if (ep == bfreelist) {		/* no free blocks at all */
		ep->b_flags |= B_WANTED;
		sleep((caddr_t)ep, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = ep->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
#ifdef TRACE
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
#endif
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	return(bp);
}
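
/*
 * Note on the replacement scan above: getblk steals from the
 * highest-numbered non-empty free queue, and the scan stops before
 * bfreelist[0] (presumably BQ_LOCKED, given brelse above), so locked
 * buffers are never reclaimed here.  A victim that is a delayed-write
 * block is first pushed out with an asynchronous bwrite and the
 * search is restarted.
 */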

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
#ifdef TRACE
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
#endif
	bp->b_flags = B_BUSY|B_INVAL;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	return(bp);
}
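
/*
 * Illustrative sketch only: a caller wanting scratch space rather
 * than a cached disk block does
 *
 *	bp = geteblk();
 *	... use the BSIZE bytes at bp->b_un.b_addr ...
 *	brelse(bp);
 *
 * The buffer comes back marked B_BUSY|B_INVAL with b_dev == NODEV,
 * so brelse will put it at the front of the most free list rather
 * than cache it.
 */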

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	geterror(bp);
}

#ifdef UNFAST
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer. If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon. Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup iodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion. When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
	struct proc *p;
	swblk_t dblkno;
	caddr_t addr;
	int flag, nbytes;
	dev_t dev;
	unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;
	int s;

	s = spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	splx(s);

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
	while (nbytes > 0) {
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		if (flag & B_DIRTY) {
			swpf[bp - swbuf] = pfcent;
			swsize[bp - swbuf] = nbytes;
		}
#ifdef TRACE
		trace(TR_SWAPIO, dev, bp->b_blkno);
#endif
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			return;
		}
		s = spl6();
		while ((bp->b_flags&B_DONE)==0)
			sleep((caddr_t)bp, PSWP);
		splx(s);
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	s = spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	splx(s);
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
	struct proc *p;
	char *rout;
{
	char *mesg;

	printf("pid %d: ", p->p_pid);
	if (rout)
		printf(mesg = "killed due to no swap space\n");
	else
		printf(mesg = "killed on swap error\n");
	uprintf("sorry, pid %d was %s", p->p_pid, mesg);
	/*
	 * To be sure there is no looping (e.g. in vmsched trying to
	 * swap out) mark the process locked in core (as though
	 * done by the user) after killing it so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKILL);
	p->p_flag |= SULOCK;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	splx(s);
}
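
/*
 * Note on the goto above: bwrite (via notavail) takes the buffer off
 * the free list we are walking, so the av_forw chain we were
 * following is no longer trustworthy; restarting the scan from the
 * first queue is the simple safe course.
 */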

/*
 * Raw I/O. The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked. After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
int (*strat)();
register struct buf *bp;
unsigned (*mincnt)();
{
	register int c;
	char *a;
	int s;

	if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	s = spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		(*mincnt)(bp);
		c = bp->b_bcount;
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		splx(s);
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
		if (bp->b_flags&B_ERROR)
			break;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}
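
/*
 * The mincnt argument is a routine that trims bp->b_bcount down to
 * the largest transfer the device can handle in one operation;
 * physio calls it once per iteration and issues the possibly
 * shortened transfer, looping until u.u_count is exhausted.  minphys
 * below serves as a generic mincnt, clipping transfers to 60 Kbytes.
 */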

/*ARGSUSED*/
unsigned
minphys(bp)
struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error)==0)
			u.u_error = EIO;
}

/*
 * Invalidate in core blocks belonging to closed or unmounted filesystems.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}