/*	vfs_cluster.c	3.1	10/14/12	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[NBUF];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be in three
 * different lists. When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf swbuf[NSWBUF];
short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
int	swpf[NSWBUF];

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 */
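
/*
 * A minimal usage sketch, not part of the original source and
 * compiled out: a hypothetical filesystem routine pairs each
 * allocation with exactly one release, e.g. bread() followed by
 * brelse() for a pure read, or by bdwrite() when part of the
 * block is rewritten.  The name "examplerw" is made up.
 */
#ifdef notdef
examplerw(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	struct buf *bread();

	bp = bread(dev, blkno);		/* block is in core and marked busy */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);		/* always release what was allocated */
		return;
	}
	bp->b_un.b_words[0] = 0;	/* ... modify part of the block ... */
	bdwrite(bp);			/* mark for delayed write; releases bp */
}
#endif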

#ifdef	FASTVAX
#define	notavail(bp) \
{ \
	int s = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(s); \
}
#endif

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
struct buf *
breada(dev, blkno, rablkno)
dev_t dev;
daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;		/* pay for read */
		}
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}
	if(bp == NULL)
		return(bread(dev, blkno));
	iowait(bp);
	return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
	register struct buf *dp;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	dp = bdevsw[major(bp->b_dev)].d_tab;
	if(dp->b_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * release the buffer, with no I/O implied.
 */
brelse(bp)
register struct buf *bp;
{
	register struct buf **backp;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist.b_flags&B_WANTED) {
		bfreelist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		bp->b_dev = NODEV;  /* no assoc. on error */
	s = spl6();
	if(bp->b_flags & (B_AGE|B_ERROR)) {
		backp = &bfreelist.av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = &bfreelist;
	} else {
		backp = &bfreelist.av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = &bfreelist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;
	register int dblkno = fsbtodb(blkno);

	dp = bdevsw[major(dev)].d_tab;
	for (bp=dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno==dblkno && bp->b_dev==dev)
			return(1);
	return(0);
}

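/*
 * Return the block, via bread, if it is resident in core;
 * otherwise return 0 and let the caller start its own I/O.
 */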
struct buf *
baddr(dev, blkno)
dev_t dev;
daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;
#ifdef	DISKMON
	register i;
#endif
	register int dblkno = fsbtodb(blkno);

	if(major(dev) >= nblkdev)
		panic("blkdev");

    loop:
	VOID spl0();
	dp = bdevsw[major(dev)].d_tab;
	if(dp == NULL)
		panic("devtab");
	for (bp=dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno!=dblkno || bp->b_dev!=dev)
			continue;
		VOID spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			goto loop;
		}
		VOID spl0();
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while (dp != &bfreelist) {
			i++;
			dp = dp->av_forw;
		}
		if (i<NBUF)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	VOID spl6();
	if (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
		goto loop;
	}
	spl0();
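	/*
	 * No buffer holds the block; take the buffer at the head of
	 * the free list (least recently used, or one marked B_AGE).
	 * A delayed-write buffer must first be pushed out
	 * asynchronously and the search restarted.
	 */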
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	return(bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk()
{
	register struct buf *bp;
	register struct buf *dp;

loop:
	VOID spl6();
	while (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
	}
	VOID spl0();
	dp = &bfreelist;
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	return(bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
register struct buf *bp;
{

	VOID spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	VOID spl0();
	geterror(bp);
}

#ifndef FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer. If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon. Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
	register int s;

	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		cnt.v_pgout++;
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i'th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion. When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
	struct proc *p;
	swblk_t dblkno;
	caddr_t addr;
	int flag, nbytes;
	dev_t dev;
	unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;

	VOID spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	VOID spl0();

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
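	/*
	 * Transfer in pieces of at most ctob(120) bytes, waiting for
	 * each; a dirty page push must fit in a single piece and is
	 * not waited for (iodone links it on the cleaned list).
	 */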
	while (nbytes > 0) {
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		if (dev == swapdev)
			bp->b_blkno += swplo;
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			swsize[bp - swbuf] = nbytes;
			swpf[bp - swbuf] = pfcent;
			return;
		}
		VOID spl6();
		while((bp->b_flags&B_DONE)==0)
			sleep((caddr_t)bp, PSWP);
		VOID spl0();
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	VOID spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	VOID spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
	struct proc *p;
	char *rout;
{

	printf("%d: ", p->p_pid);
	if (rout)
		printf("out of swap space in %s\n", rout);
	else
		printf("killed on swap error\n");
	/*
	 * To be sure no looping (e.g. in vmsched trying to
	 * swap out) mark process locked in core (as though
	 * done by user) after killing it so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKIL);
	p->p_flag |= SULOCK;
}

/*
 * make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
dev_t dev;
{
	register struct buf *bp;

loop:
	VOID spl6();
	for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	VOID spl0();
}

/*
 * Raw I/O. The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked. After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
int (*strat)();
register struct buf *bp;
unsigned (*mincnt)();
{
	register int c;
	char *a;

	if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	VOID spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0 && bp->b_error==0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		(*mincnt)(bp);
		c = bp->b_bcount;
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		VOID spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		VOID spl0();
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}

/*ARGSUSED*/
unsigned
minphys(bp)
struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}
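
/*
 * A minimal usage sketch, not part of the original source and
 * compiled out: a hypothetical raw-device read entry point passes
 * its strategy routine, a private buffer header, the device, the
 * transfer direction and a count limiter (here minphys) to physio.
 * The names "xxstrategy", "rxxbuf" and "xxread" are made up.
 */
#ifdef notdef
int	xxstrategy();
struct	buf rxxbuf;

xxread(dev)
dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_READ, minphys);
}
#endif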

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error)==0)
			u.u_error = EIO;
}