/*	vfs_cluster.c	3.7	06/07/80	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 * (A hypothetical pairing of these calls is sketched below,
 * just before bread.)
 */

#define	BUFHSZ	63
#define	BUFHASH(blkno)	(blkno % BUFHSZ)
short	bufhash[BUFHSZ];

/*
 * Initialize hash links for buffers.
 */
bhinit()
{
	register int i;

	for (i = 0; i < BUFHSZ; i++)
		bufhash[i] = -1;
}

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[NBUF];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be on one of three
 * different lists.  When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf swbuf[NSWBUF];
short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
int	swpf[NSWBUF];

#ifdef	FASTVAX
#define	notavail(bp) \
{ \
	int s = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(s); \
}
#endif
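/*
 * Illustrative sketch only (kept under "notdef" so it is never
 * compiled): a hypothetical caller showing how the allocate and
 * release routines described above are typically paired for a
 * read-modify-write of one block.  The name bexample is invented
 * purely for illustration.
 */
#ifdef notdef
struct buf *bread();

static
bexample(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;

	bp = bread(dev, blkno);		/* buffer comes back busy */
	if (u.u_error) {
		brelse(bp);		/* on error, just release it */
		return;
	}
	bp->b_un.b_words[0] = 0;	/* modify the in-core copy */
	bdwrite(bp);			/* mark delayed-write and release */
}
#endif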
/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno)
	dev_t dev;
	daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;	/* pay for read */
		}
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;	/* pay in advance */
		}
	}
	if (bp == NULL)
		return (bread(dev, blkno));
	iowait(bp);
	return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register struct buf *dp;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	dp = bdevsw[major(bp->b_dev)].d_tab;
	if (dp->b_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
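/*
 * Illustrative sketch only (never compiled): a hypothetical helper
 * showing how a caller chooses among the three write interfaces
 * above.  bwrite() writes and waits, bawrite() starts the write and
 * returns, and bdwrite() defers the write entirely by marking the
 * buffer B_DELWRI.  The name bputblk is invented for illustration.
 */
#ifdef notdef
static
bputblk(bp, async, delay)
	register struct buf *bp;
	int async, delay;
{

	if (delay)
		bdwrite(bp);		/* write-behind; no I/O now */
	else if (async)
		bawrite(bp);		/* start the write, don't wait */
	else
		bwrite(bp);		/* write, wait, and release */
}
#endif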
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf **backp;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist.b_flags&B_WANTED) {
		bfreelist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bfreelist);
	}
	if ((bp->b_flags&B_ERROR) && bp->b_dev != NODEV) {
		bunhash(bp);
		bp->b_dev = NODEV;		/* no assoc. on error */
	}
	s = spl6();
	if (bp->b_flags & (B_AGE|B_ERROR)) {
		backp = &bfreelist.av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = &bfreelist;
	} else {
		backp = &bfreelist.av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = &bfreelist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register int dblkno = fsbtodb(blkno);

	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink])
		if (bp->b_blkno == dblkno && bp->b_dev == dev)
			return (1);
	return (0);
}

struct buf *
baddr(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp, *dp, *ep;
	register int i, x;
	register int dblkno = fsbtodb(blkno);

loop:
	(void) spl0();
	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink]) {
		if (bp->b_blkno != dblkno || bp->b_dev != dev)
			continue;
		(void) spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			goto loop;
		}
		(void) spl0();
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while (dp != &bfreelist) {
			i++;
			dp = dp->av_forw;
		}
		if (i < NBUF)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	dp = bdevsw[major(dev)].d_tab;
	if (dp == NULL)
		panic("devtab");
	(void) spl6();
	if (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
		goto loop;
	}
	spl0();
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev == NODEV)
		goto done;
	/* INLINE EXPANSION OF bunhash(bp) */
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
	} else {
		for (ep = &buf[bufhash[i]]; ep != &buf[-1];
		    ep = &buf[ep->b_hlink])
			if (ep->b_hlink == x) {
				ep->b_hlink = bp->b_hlink;
				goto done;
			}
		panic("getblk");
	}
done:
	/* END INLINE EXPANSION */
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	i = BUFHASH(blkno);
	bp->b_hlink = bufhash[i];
	bufhash[i] = bp - buf;
	return(bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk()
{
	register struct buf *bp, *dp;

loop:
	(void) spl6();
	while (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
	}
	(void) spl0();
	dp = &bfreelist;
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev != NODEV)
		bunhash(bp);
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	bp->b_hlink = -1;
	return(bp);
}

bunhash(bp)
	register struct buf *bp;
{
	register struct buf *ep;
	register int i, x;

	if (bp->b_dev == NODEV)
		return;
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
		return;
	}
	for (ep = &buf[bufhash[i]]; ep != &buf[-1];
	    ep = &buf[ep->b_hlink])
		if (ep->b_hlink == x) {
			ep->b_hlink = bp->b_hlink;
			return;
		}
	panic("bunhash");
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
	register struct buf *bp;
{

	(void) spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	(void) spl0();
	geterror(bp);
}

#ifndef	FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
	register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
	register struct buf *bp;
{
	register int s;

	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		cnt.v_pgout++;
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
	struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}
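/*
 * Illustrative sketch only (never compiled): a hypothetical routine
 * that needs a block of scratch core unrelated to any device.  It
 * takes a buffer from the pool with geteblk(), zeroes it with
 * clrbuf(), and must eventually hand it back with brelse().
 */
#ifdef notdef
static
scratch()
{
	register struct buf *bp;

	bp = geteblk();		/* comes back busy, b_dev == NODEV */
	clrbuf(bp);		/* zero its BSIZE bytes of core */
	/* use bp->b_un.b_addr as temporary storage here */
	brelse(bp);		/* return it to the free list */
}
#endif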
/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i'th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
	struct proc *p;
	swblk_t dblkno;
	caddr_t addr;
	int flag, nbytes;
	dev_t dev;
	unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;

	(void) spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	(void) spl0();

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
	while (nbytes > 0) {
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		if (dev == swapdev)
			bp->b_blkno += swplo;
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			swsize[bp - swbuf] = nbytes;
			swpf[bp - swbuf] = pfcent;
			return;
		}
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PSWP);
		(void) spl0();
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	(void) spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	(void) spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
	struct proc *p;
	char *rout;
{

	printf("%d: ", p->p_pid);
	if (rout)
		printf("out of swap space in %s\n", rout);
	else
		printf("killed on swap error\n");
	/*
	 * To be sure no looping (e.g. in vmsched trying to
	 * swap out) mark process locked in core (as though
	 * done by user) after killing it so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKILL);
	p->p_flag |= SULOCK;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;

loop:
	(void) spl6();
	for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV || dev == bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	(void) spl0();
}
/*
 * Raw I/O.  The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
	int (*strat)();
	register struct buf *bp;
	dev_t dev;
	int rw;
	unsigned (*mincnt)();
{
	register int c;
	char *a;

	if (useracc(u.u_base, u.u_count, rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	(void) spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0 && bp->b_error == 0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		(*mincnt)(bp);
		c = bp->b_bcount;
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		(void) spl0();
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}

/*ARGSUSED*/
unsigned
minphys(bp)
	struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error) == 0)
			u.u_error = EIO;
}
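/*
 * Illustrative sketch only (never compiled): the usual way a
 * character-device "raw" read or write entry funnels into physio()
 * above, handing it the device strategy routine, a private buffer
 * header, and minphys() as the transfer-size limiter.  The "xx"
 * driver names below are invented for illustration.
 */
#ifdef notdef
struct	buf rxxbuf;		/* hypothetical private raw-I/O header */
int	xxstrategy();		/* hypothetical device strategy routine */

xxread(dev)
	dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_READ, minphys);
}

xxwrite(dev)
	dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_WRITE, minphys);
}
#endif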