/*	vfs_cluster.c	3.10	07/29/80	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 */

#define	BUFHSZ	63
#define	BUFHASH(blkno)	(blkno % BUFHSZ)
short	bufhash[BUFHSZ];

/*
 * Initialize hash links for buffers.
 */
bhinit()
{
	register int i;

	for (i = 0; i < BUFHSZ; i++)
		bufhash[i] = -1;
}

/* #define	DISKMON	1 */

#ifdef	DISKMON
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[NBUF];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be in one of three
 * different lists.  When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf swbuf[NSWBUF];
short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT? */
int	swpf[NSWBUF];


#ifdef	FASTVAX
#define	notavail(bp) \
{ \
	int s = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(s); \
}
#endif
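
/*
 * [Editorial example; not part of the original source.]  A minimal
 * sketch of the calling pattern described in the comment at the top
 * of this file: allocate a buffer with bread, check for an I/O error,
 * modify the in-core copy, and release it with a delayed write.  The
 * routine name "patchword" and its word index are hypothetical, and
 * the code is guarded so it is never compiled.
 */
#ifdef notdef
patchword(dev, blkno, ix, val)
dev_t dev;
daddr_t blkno;
int ix, val;
{
	register struct buf *bp;

	bp = bread(dev, blkno);		/* buffer comes back busy */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);		/* give it up on error */
		return;
	}
	bp->b_un.b_words[ix] = val;	/* modify the cached block */
	bdwrite(bp);			/* mark dirty; written later */
}
#endif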

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno)
dev_t dev;
daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;	/* pay for read */
		}
	}
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;	/* pay in advance */
		}
	}
	if (bp == NULL)
		return (bread(dev, blkno));
	iowait(bp);
	return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
	register struct buf *dp;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;	/* no one paid yet */
	dp = bdevsw[major(bp->b_dev)].d_tab;
	if (dp->b_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
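
/*
 * [Editorial example; not part of the original source.]  A sketch of
 * how a caller reading sequentially might use breada: ask for the
 * current block and start read-ahead on the next one, passing a
 * read-ahead block number of 0 when there is nothing to prefetch.
 * The routine name "readrun" is hypothetical and the code is guarded
 * so it is never compiled.
 */
#ifdef notdef
readrun(dev, blkno, n)
dev_t dev;
daddr_t blkno;
int n;
{
	register struct buf *bp;

	for (; n > 0; n--, blkno++) {
		bp = breada(dev, blkno, n > 1 ? blkno+1 : (daddr_t)0);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return;
		}
		/* consume the BSIZE bytes at bp->b_un.b_addr here */
		brelse(bp);
	}
}
#endif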

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
register struct buf *bp;
{
	register struct buf **backp;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist.b_flags&B_WANTED) {
		bfreelist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bfreelist);
	}
	if ((bp->b_flags&B_ERROR) && bp->b_dev != NODEV) {
		bunhash(bp);
		bp->b_dev = NODEV;	/* no assoc. on error */
	}
	s = spl6();
	if (bp->b_flags & (B_AGE|B_ERROR)) {
		backp = &bfreelist.av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = &bfreelist;
	} else {
		backp = &bfreelist.av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = &bfreelist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register int dblkno = fsbtodb(blkno);

	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink])
		if (bp->b_blkno == dblkno && bp->b_dev == dev)
			return (1);
	return (0);
}

/*
 * If the block is already in core, return it (through bread);
 * otherwise return 0 without starting any I/O.
 */
struct buf *
baddr(dev, blkno)
dev_t dev;
daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp, *dp, *ep;
	register int i, x;
	register int dblkno = fsbtodb(blkno);

loop:
	(void) spl0();
	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink]) {
		if (bp->b_blkno != dblkno || bp->b_dev != dev)
			continue;
		(void) spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			goto loop;
		}
		(void) spl0();
#ifdef	DISKMON
		i = 0;
		dp = bp->av_forw;
		while (dp != &bfreelist) {
			i++;
			dp = dp->av_forw;
		}
		if (i < NBUF)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	dp = bdevsw[major(dev)].d_tab;
	if (dp == NULL)
		panic("devtab");
	(void) spl6();
	if (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
		goto loop;
	}
	spl0();
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev == NODEV)
		goto done;
	/* INLINE EXPANSION OF bunhash(bp) */
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
	} else {
		for (ep = &buf[bufhash[i]]; ep != &buf[-1];
		    ep = &buf[ep->b_hlink])
			if (ep->b_hlink == x) {
				ep->b_hlink = bp->b_hlink;
				goto done;
			}
		panic("getblk");
	}
done:
	/* END INLINE EXPANSION */
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	i = BUFHASH(blkno);
	bp->b_hlink = bufhash[i];
	bufhash[i] = bp - buf;
	return(bp);
}
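
/*
 * [Editorial example; not part of the original source.]  The buffer
 * hash used above is an array of chain heads, bufhash[BUFHASH(blkno)],
 * linked through b_hlink and terminated by -1 (hence the &buf[-1]
 * sentinel).  The hypothetical routine below, never compiled, simply
 * walks one chain and counts its buffers, the same traversal that
 * incore, getblk and bunhash perform.
 */
#ifdef notdef
hashcount(blkno)
daddr_t blkno;
{
	register struct buf *bp;
	register int n;

	n = 0;
	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink])
		n++;
	return (n);
}
#endif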

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk()
{
	register struct buf *bp, *dp;

loop:
	(void) spl6();
	while (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
	}
	(void) spl0();
	dp = &bfreelist;
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev != NODEV)
		bunhash(bp);
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	bp->b_hlink = -1;
	return(bp);
}

/*
 * Remove a buffer from its hash chain
 * (the chain headed by bufhash[] and linked through b_hlink).
 */
bunhash(bp)
register struct buf *bp;
{
	register struct buf *ep;
	register int i, x;

	if (bp->b_dev == NODEV)
		return;
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
		return;
	}
	for (ep = &buf[bufhash[i]]; ep != &buf[-1];
	    ep = &buf[ep->b_hlink])
		if (ep->b_hlink == x) {
			ep->b_hlink = bp->b_hlink;
			return;
		}
	panic("bunhash");
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
register struct buf *bp;
{

	(void) spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	(void) spl0();
	geterror(bp);
}

#ifndef	FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup iodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		cnt.v_pgout++;
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}
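
/*
 * [Editorial example; not part of the original source.]  clrbuf is
 * typically wanted when a brand new block is being allocated: getblk
 * assigns a buffer without reading the disk, so its contents are
 * stale and must be zeroed before use.  The routine name "allocclr"
 * is hypothetical and the code is guarded so it is never compiled.
 */
#ifdef notdef
struct buf *
allocclr(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);	/* no read; contents are stale */
	clrbuf(bp);			/* zero all BSIZE bytes */
	return (bp);			/* caller will bdwrite or brelse */
}
#endif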

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i'th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
struct proc *p;
swblk_t dblkno;
caddr_t addr;
int flag, nbytes;
dev_t dev;
unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;

	(void) spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	(void) spl0();

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
	while (nbytes > 0) {
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			if (c < nbytes)
				panic("big push");
			swsize[bp - swbuf] = nbytes;
			swpf[bp - swbuf] = pfcent;
			return;
		}
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PSWP);
		(void) spl0();
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	(void) spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags & B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	(void) spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
struct proc *p;
char *rout;
{

	printf("%d: ", p->p_pid);
	if (rout)
		printf("out of swap space in %s\n", rout);
	else
		printf("killed on swap error\n");
	/*
	 * To be sure no looping (e.g. in vmsched trying to
	 * swap out) mark process locked in core (as though
	 * done by user) after killing it so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKILL);
	p->p_flag |= SULOCK;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
dev_t dev;
{
	register struct buf *bp;

loop:
	(void) spl6();
	for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV || dev == bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	(void) spl0();
}
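
/*
 * [Editorial example; not part of the original source.]  As the
 * comment above notes, bflush is called from update and umount; the
 * fragment below, never compiled, shows the two forms of the call.
 * The routine name "syncexample" is hypothetical.
 */
#ifdef notdef
syncexample(dev)
dev_t dev;
{

	bflush(dev);		/* as in umount: just this device */
	bflush(NODEV);		/* as in update: every device */
}
#endif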

/*
 * Raw I/O.  The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
int (*strat)();
register struct buf *bp;
unsigned (*mincnt)();
{
	register int c;
	char *a;

	if (useracc(u.u_base, u.u_count, rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	(void) spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0 && bp->b_error == 0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		(*mincnt)(bp);
		c = bp->b_bcount;
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		(void) spl0();
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}

/*
 * Trim a raw transfer to at most 60 Kbytes;
 * passed to physio as its mincnt routine.
 */
/*ARGSUSED*/
unsigned
minphys(bp)
struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error) == 0)
			u.u_error = EIO;
}
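
/*
 * [Editorial example; not part of the original source.]  A raw
 * character device typically exposes physio through read and write
 * entries that pass the device's strategy routine, a private buffer
 * header, and a transfer-trimming routine such as minphys above.
 * The "xx" names are hypothetical and the code is guarded so it is
 * never compiled.
 */
#ifdef notdef
int	xxstrategy();		/* hypothetical device strategy routine */
struct	buf rxxbuf;		/* private header for raw xx I/O */

xxread(dev)
dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_READ, minphys);
}

xxwrite(dev)
dev_t dev;
{

	physio(xxstrategy, &rxxbuf, dev, B_WRITE, minphys);
}
#endif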