/*	vfs_cluster.c	4.3	11/24/80	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 */

#define	BUFHSZ	63
#define	BUFHASH(blkno)	(blkno % BUFHSZ)
short	bufhash[BUFHSZ];

/*
 * Initialize hash links for buffers.
 */
bhinit()
{
	register int i;

	for (i = 0; i < BUFHSZ; i++)
		bufhash[i] = -1;
}

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[NBUF];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be in three
 * different lists.  When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf swbuf[NSWBUF];
short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
int	swpf[NSWBUF];

#ifdef	FASTVAX
#define	notavail(bp) \
{ \
	int s = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(s); \
}
#endif
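
/*
 * Illustrative sketch (not part of the original source): the typical
 * pairing of the allocate and release routines described above.
 * A caller such as the file system read path obtains the block with
 * bread, uses the data, and gives the buffer back with brelse.
 * The routine name and the error handling shown are hypothetical.
 */
#ifdef notdef
example_use(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;

	bp = bread(dev, blkno);		/* busy buffer, data valid on return */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);		/* error already posted via geterror */
		return;
	}
	/* ... use bp->b_un.b_addr, BSIZE bytes ... */
	brelse(bp);			/* release; no I/O implied */
}
#endif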

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno)
	dev_t dev;
	daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;	/* pay for read */
		}
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;	/* pay in advance */
		}
	}
	if (bp == NULL)
		return(bread(dev, blkno));
	iowait(bp);
	return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register struct buf *dp;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	dp = bdevsw[major(bp->b_dev)].d_tab;
	if (dp->b_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
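
/*
 * Illustrative sketch (not part of the original source): the three ways
 * a modified buffer is normally disposed of.  bwrite writes and waits,
 * bawrite starts the write and returns immediately, and bdwrite only
 * marks the buffer B_DELWRI so a later write of the same block (e.g.
 * the rest of a partially filled block) can be folded into one
 * transfer.  The routine name and its arguments are hypothetical.
 */
#ifdef notdef
example_update(dev, blkno, partial, critical)
	dev_t dev;
	daddr_t blkno;
	int partial, critical;
{
	register struct buf *bp;

	bp = bread(dev, blkno);		/* read-modify-write of the block */
	/* ... copy new data into bp->b_un.b_addr ... */
	if (critical)
		bwrite(bp);		/* synchronous: wait for completion */
	else if (partial)
		bdwrite(bp);		/* delayed: another write expected soon */
	else
		bawrite(bp);		/* asynchronous: start I/O, don't wait */
}
#endif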

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf **backp;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist.b_flags&B_WANTED) {
		bfreelist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bfreelist);
	}
	if ((bp->b_flags&B_ERROR) && bp->b_dev != NODEV) {
		bunhash(bp);
		bp->b_dev = NODEV;	/* no assoc. on error */
	}
	s = spl6();
	if (bp->b_flags & (B_AGE|B_ERROR)) {
		backp = &bfreelist.av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = &bfreelist;
	} else {
		backp = &bfreelist.av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = &bfreelist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register int dblkno = fsbtodb(blkno);

	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink])
		if (bp->b_blkno == dblkno && bp->b_dev == dev)
			return (1);
	return (0);
}

struct buf *
baddr(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp, *dp, *ep;
	register int i, x, dblkno;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	dblkno = fsbtodb(blkno);
loop:
	(void) spl0();
	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink]) {
		if (bp->b_blkno != dblkno || bp->b_dev != dev)
			continue;
		(void) spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			goto loop;
		}
		(void) spl0();
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while (dp != &bfreelist) {
			i++;
			dp = dp->av_forw;
		}
		if (i < NBUF)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	dp = bdevsw[major(dev)].d_tab;
	if (dp == NULL)
		panic("devtab");
	(void) spl6();
	if (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
		goto loop;
	}
	(void) spl0();
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev == NODEV)
		goto done;
	/* INLINE EXPANSION OF bunhash(bp) */
	(void) spl6();
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
	} else {
		for (ep = &buf[bufhash[i]]; ep != &buf[-1];
		    ep = &buf[ep->b_hlink])
			if (ep->b_hlink == x) {
				ep->b_hlink = bp->b_hlink;
				goto done;
			}
		panic("getblk");
	}
done:
	(void) spl0();
	/* END INLINE EXPANSION */
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	i = BUFHASH(blkno);
	bp->b_hlink = bufhash[i];
	bufhash[i] = bp - buf;
	return(bp);
}
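
/*
 * Illustrative sketch (not part of the original source): when a block
 * is about to be written in its entirety its old contents are of no
 * interest, so the buffer is claimed with getblk rather than bread,
 * zeroed, and scheduled for writing.  The file system block allocator
 * uses this pattern; the routine shown here is hypothetical.
 */
#ifdef notdef
example_newblock(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);	/* no read; old contents are stale */
	clrbuf(bp);			/* zero the data area */
	bdwrite(bp);			/* write behind and release */
}
#endif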

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk()
{
	register struct buf *bp, *dp;

loop:
	(void) spl6();
	while (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
	}
	(void) spl0();
	dp = &bfreelist;
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev != NODEV)
		bunhash(bp);
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	bp->b_hlink = -1;
	return(bp);
}

bunhash(bp)
	register struct buf *bp;
{
	register struct buf *ep;
	register int i, x, s;

	if (bp->b_dev == NODEV)
		return;
	s = spl6();
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
		goto ret;
	}
	for (ep = &buf[bufhash[i]]; ep != &buf[-1];
	    ep = &buf[ep->b_hlink])
		if (ep->b_hlink == x) {
			ep->b_hlink = bp->b_hlink;
			goto ret;
		}
	panic("bunhash");
ret:
	splx(s);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
	register struct buf *bp;
{

	(void) spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	(void) spl0();
	geterror(bp);
}

#ifndef	FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
	register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup iodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		cnt.v_pgout++;
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
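
/*
 * Illustrative sketch (not part of the original source): how a block
 * device driver hands a finished transfer back to this module.  The
 * strategy routine queued the buffer earlier; when the controller
 * interrupts, the driver posts any error and calls iodone, which will
 * brelse an asynchronous buffer or awaken a process sleeping in
 * iowait.  The xx names and the simplified calling convention are
 * hypothetical.
 */
#ifdef notdef
xxintr(bp)
	register struct buf *bp;	/* buffer just completed by the drive */
{

	if (xxerror())			/* hypothetical controller status check */
		bp->b_flags |= B_ERROR;
	bp->b_resid = 0;
	iodone(bp);
}
#endif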

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
	struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
	struct proc *p;
	swblk_t dblkno;
	caddr_t addr;
	int flag, nbytes;
	dev_t dev;
	unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;

	(void) spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	(void) spl0();

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
	while (nbytes > 0) {
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		if (flag & B_DIRTY) {
			swpf[bp - swbuf] = pfcent;
			swsize[bp - swbuf] = nbytes;
		}
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			return;
		}
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PSWP);
		(void) spl0();
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	(void) spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	(void) spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
	struct proc *p;
	char *rout;
{

	printf("%d: ", p->p_pid);
	if (rout)
		printf("out of swap space in %s\n", rout);
	else
		printf("killed on swap error\n");
	/*
	 * To be sure no looping (e.g. in vmsched trying to
	 * swap out) mark process locked in core (as though
	 * done by user) after killing it so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKILL);
	p->p_flag |= SULOCK;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;

loop:
	(void) spl6();
	for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV || dev == bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	(void) spl0();
}
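
/*
 * Illustrative sketch (not part of the original source): the callers
 * named in the comment above.  The periodic update routine flushes
 * write-behind blocks for every device, while umount flushes only the
 * device being detached.  The routine name is hypothetical.
 */
#ifdef notdef
example_sync(dev)
	dev_t dev;
{

	bflush(NODEV);			/* as from update: all devices */
	bflush(dev);			/* as from umount: this device only */
}
#endif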

/*
 * Raw I/O.  The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
	int (*strat)();
	register struct buf *bp;
	unsigned (*mincnt)();
{
	register int c;
	char *a;

	if (useracc(u.u_base, u.u_count, rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	(void) spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0 && bp->b_error == 0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		(*mincnt)(bp);
		c = bp->b_bcount;
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		(void) spl0();
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}

/*ARGSUSED*/
unsigned
minphys(bp)
	struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error) == 0)
			u.u_error = EIO;
}
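
/*
 * Illustrative sketch (not part of the original source): how a raw
 * (character) device interface typically calls physio.  Each device
 * dedicates a private buffer header to raw transfers and passes its
 * strategy routine together with minphys, which bounds the size of
 * each transfer.  The xx names are hypothetical.
 */
#ifdef notdef
int	xxstrategy();
struct	buf rxxbuf;		/* private header for raw I/O on xx */

xxread(dev)
	dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_READ, minphys);
}

xxwrite(dev)
	dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_WRITE, minphys);
}
#endif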