/* vfs_cluster.c 3.1 10/14/12 */

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/* #define DISKMON 1 */

#ifdef DISKMON
struct {
        int     nbuf;
        long    nread;
        long    nreada;
        long    ncache;
        long    nwrite;
        long    bufcount[NBUF];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be in three
 * different lists.  When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct  buf swbuf[NSWBUF];
short   swsize[NSWBUF];         /* CAN WE JUST USE B_BCOUNT? */
int     swpf[NSWBUF];

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *      getblk
 *      bread
 *      breada
 *      baddr   (if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *      bwrite
 *      bdwrite
 *      bawrite
 *      brelse
 */

#ifdef FASTVAX
#define notavail(bp) \
{ \
        int s = spl6(); \
        (bp)->av_back->av_forw = (bp)->av_forw; \
        (bp)->av_forw->av_back = (bp)->av_back; \
        (bp)->b_flags |= B_BUSY; \
        splx(s); \
}
#endif
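/*
 * Illustrative sketch (hypothetical caller; example_getdata and the
 * nextblk argument are assumptions, not part of this system): a
 * typical consumer allocates a buffer with bread(), or with breada()
 * when the following block is predictable, looks at the data through
 * b_un.b_addr, and then gives the buffer back with one of the release
 * routines listed in the comment above.
 */
example_getdata(dev, blkno, nextblk)
        dev_t dev;
        daddr_t blkno, nextblk;
{
        register struct buf *bp;
        struct buf *bread(), *breada();

        bp = nextblk ? breada(dev, blkno, nextblk) : bread(dev, blkno);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return;
        }
        /* ... examine the BSIZE bytes at bp->b_un.b_addr here ... */
        brelse(bp);
}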
/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{
        register struct buf *bp;

        bp = getblk(dev, blkno);
        if (bp->b_flags&B_DONE) {
#ifdef DISKMON
                io_info.ncache++;
#endif
                return(bp);
        }
        bp->b_flags |= B_READ;
        bp->b_bcount = BSIZE;
        (*bdevsw[major(dev)].d_strategy)(bp);
#ifdef DISKMON
        io_info.nread++;
#endif
        u.u_vm.vm_inblk++;              /* pay for read */
        iowait(bp);
        return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
struct buf *
breada(dev, blkno, rablkno)
        dev_t dev;
        daddr_t blkno, rablkno;
{
        register struct buf *bp, *rabp;

        bp = NULL;
        if (!incore(dev, blkno)) {
                bp = getblk(dev, blkno);
                if ((bp->b_flags&B_DONE) == 0) {
                        bp->b_flags |= B_READ;
                        bp->b_bcount = BSIZE;
                        (*bdevsw[major(dev)].d_strategy)(bp);
#ifdef DISKMON
                        io_info.nread++;
#endif
                        u.u_vm.vm_inblk++;              /* pay for read */
                }
        }
        if (rablkno && !incore(dev, rablkno)) {
                rabp = getblk(dev, rablkno);
                if (rabp->b_flags & B_DONE)
                        brelse(rabp);
                else {
                        rabp->b_flags |= B_READ|B_ASYNC;
                        rabp->b_bcount = BSIZE;
                        (*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef DISKMON
                        io_info.nreada++;
#endif
                        u.u_vm.vm_inblk++;              /* pay in advance */
                }
        }
        if (bp == NULL)
                return(bread(dev, blkno));
        iowait(bp);
        return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
        register struct buf *bp;
{
        register flag;

        flag = bp->b_flags;
        bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
        bp->b_bcount = BSIZE;
#ifdef DISKMON
        io_info.nwrite++;
#endif
        if ((flag&B_DELWRI) == 0)
                u.u_vm.vm_oublk++;              /* no one paid yet */
        (*bdevsw[major(bp->b_dev)].d_strategy)(bp);
        if ((flag&B_ASYNC) == 0) {
                iowait(bp);
                brelse(bp);
        } else if (flag & B_DELWRI)
                bp->b_flags |= B_AGE;
        else
                geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
        register struct buf *bp;
{
        register struct buf *dp;

        if ((bp->b_flags&B_DELWRI) == 0)
                u.u_vm.vm_oublk++;              /* no one paid yet */
        dp = bdevsw[major(bp->b_dev)].d_tab;
        if (dp->b_flags & B_TAPE)
                bawrite(bp);
        else {
                bp->b_flags |= B_DELWRI | B_DONE;
                brelse(bp);
        }
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
        register struct buf *bp;
{

        bp->b_flags |= B_ASYNC;
        bwrite(bp);
}
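/*
 * Illustrative sketch (hypothetical helper; example_putblk and its
 * arguments are assumptions, not part of this system): the three
 * release-with-write paths above differ only in when the caller gives
 * the buffer up.  bwrite() waits for the transfer, bawrite() starts
 * it and returns, and bdwrite() merely marks the buffer B_DELWRI so
 * the write happens when the buffer is reclaimed (useful for a
 * partial block that will probably be written again soon).
 */
example_putblk(bp, partial, async)
        register struct buf *bp;
        int partial, async;
{

        if (partial)
                bdwrite(bp);            /* delay; another write likely soon */
        else if (async)
                bawrite(bp);            /* start I/O, do not wait */
        else
                bwrite(bp);             /* write, wait, then release */
}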
/*
 * release the buffer, with no I/O implied.
 */
brelse(bp)
        register struct buf *bp;
{
        register struct buf **backp;
        register s;

        if (bp->b_flags&B_WANTED)
                wakeup((caddr_t)bp);
        if (bfreelist.b_flags&B_WANTED) {
                bfreelist.b_flags &= ~B_WANTED;
                wakeup((caddr_t)&bfreelist);
        }
        if (bp->b_flags&B_ERROR)
                bp->b_dev = NODEV;              /* no assoc. on error */
        s = spl6();
        if (bp->b_flags & (B_AGE|B_ERROR)) {
                backp = &bfreelist.av_forw;
                (*backp)->av_back = bp;
                bp->av_forw = *backp;
                *backp = bp;
                bp->av_back = &bfreelist;
        } else {
                backp = &bfreelist.av_back;
                (*backp)->av_forw = bp;
                bp->av_back = *backp;
                *backp = bp;
                bp->av_forw = &bfreelist;
        }
        bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
        splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;
        register int dblkno = fsbtodb(blkno);

        dp = bdevsw[major(dev)].d_tab;
        for (bp=dp->b_forw; bp != dp; bp = bp->b_forw)
                if (bp->b_blkno==dblkno && bp->b_dev==dev)
                        return(1);
        return(0);
}

struct buf *
baddr(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{

        if (incore(dev, blkno))
                return (bread(dev, blkno));
        return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
        dev_t dev;
        daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;
#ifdef DISKMON
        register i;
#endif
        register int dblkno = fsbtodb(blkno);

        if (major(dev) >= nblkdev)
                panic("blkdev");

loop:
        VOID spl0();
        dp = bdevsw[major(dev)].d_tab;
        if (dp == NULL)
                panic("devtab");
        for (bp=dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno!=dblkno || bp->b_dev!=dev)
                        continue;
                VOID spl6();
                if (bp->b_flags&B_BUSY) {
                        bp->b_flags |= B_WANTED;
                        sleep((caddr_t)bp, PRIBIO+1);
                        goto loop;
                }
                VOID spl0();
#ifdef DISKMON
                i = 0;
                dp = bp->av_forw;
                while (dp != &bfreelist) {
                        i++;
                        dp = dp->av_forw;
                }
                if (i<NBUF)
                        io_info.bufcount[i]++;
#endif
                notavail(bp);
                bp->b_flags |= B_CACHE;
                return(bp);
        }
        VOID spl6();
        if (bfreelist.av_forw == &bfreelist) {
                bfreelist.b_flags |= B_WANTED;
                sleep((caddr_t)&bfreelist, PRIBIO+1);
                goto loop;
        }
        spl0();
        bp = bfreelist.av_forw;
        notavail(bp);
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags |= B_ASYNC;
                bwrite(bp);
                goto loop;
        }
        bp->b_flags = B_BUSY;
        bp->b_back->b_forw = bp->b_forw;
        bp->b_forw->b_back = bp->b_back;
        bp->b_forw = dp->b_forw;
        bp->b_back = dp;
        dp->b_forw->b_back = bp;
        dp->b_forw = bp;
        bp->b_dev = dev;
        bp->b_blkno = dblkno;
        return(bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk()
{
        register struct buf *bp;
        register struct buf *dp;

loop:
        VOID spl6();
        while (bfreelist.av_forw == &bfreelist) {
                bfreelist.b_flags |= B_WANTED;
                sleep((caddr_t)&bfreelist, PRIBIO+1);
        }
        VOID spl0();
        dp = &bfreelist;
        bp = bfreelist.av_forw;
        notavail(bp);
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags |= B_ASYNC;
                bwrite(bp);
                goto loop;
        }
        bp->b_flags = B_BUSY;
        bp->b_back->b_forw = bp->b_forw;
        bp->b_forw->b_back = bp->b_back;
        bp->b_forw = dp->b_forw;
        bp->b_back = dp;
        dp->b_forw->b_back = bp;
        dp->b_forw = bp;
        bp->b_dev = (dev_t)NODEV;
        return(bp);
}
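/*
 * Illustrative sketch (hypothetical helper; example_release_aged is
 * an assumed name, not a routine in this system): brelse() keeps the
 * free list in LRU order -- getblk() and geteblk() reclaim from the
 * head (bfreelist.av_forw) while brelse() normally returns buffers to
 * the tail, but puts B_AGE or B_ERROR buffers at the head so they are
 * reused first.  A caller that will not need a block again soon can
 * take advantage of this:
 */
example_release_aged(bp)
        register struct buf *bp;
{

        bp->b_flags |= B_AGE;           /* reclaim this buffer before others */
        brelse(bp);
}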
/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
        register struct buf *bp;
{

        VOID spl6();
        while ((bp->b_flags&B_DONE)==0)
                sleep((caddr_t)bp, PRIBIO);
        VOID spl0();
        geterror(bp);
}

#ifndef FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface)
 */
notavail(bp)
        register struct buf *bp;
{
        register s;

        s = spl6();
        bp->av_back->av_forw = bp->av_forw;
        bp->av_forw->av_back = bp->av_back;
        bp->b_flags |= B_BUSY;
        splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
        register struct buf *bp;
{
        register int s;

        bp->b_flags |= B_DONE;
        if (bp->b_flags & B_DIRTY) {
                if (bp->b_flags & B_ERROR)
                        panic("IO err in push");
                s = spl6();
                cnt.v_pgout++;
                bp->av_forw = bclnlist;
                bp->b_bcount = swsize[bp - swbuf];
                bp->b_pfcent = swpf[bp - swbuf];
                bclnlist = bp;
                if (bswlist.b_flags & B_WANTED)
                        wakeup((caddr_t)&proc[2]);
                splx(s);
        }
        if (bp->b_flags&B_ASYNC)
                brelse(bp);
        else {
                bp->b_flags &= ~B_WANTED;
                wakeup((caddr_t)bp);
        }
}

/*
 * Zero the core associated with a buffer.
 */
clrbuf(bp)
        struct buf *bp;
{
        register *p;
        register c;

        p = bp->b_un.b_words;
        c = BSIZE/sizeof(int);
        do
                *p++ = 0;
        while (--c);
        bp->b_resid = 0;
}
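/*
 * Illustrative sketch (hypothetical helper; example_scratch is an
 * assumed name, not a routine in this system): geteblk() and clrbuf()
 * combine to give a zeroed scratch buffer that is not associated with
 * any device block, and brelse() returns it when done.
 */
example_scratch()
{
        register struct buf *bp;

        bp = geteblk();                 /* BSIZE bytes, b_dev == NODEV */
        clrbuf(bp);                     /* zero the words at b_un.b_words */
        /* ... use bp->b_un.b_addr as temporary storage here ... */
        brelse(bp);
}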
/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i'th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
        struct proc *p;
        swblk_t dblkno;
        caddr_t addr;
        int flag, nbytes;
        dev_t dev;
        unsigned pfcent;
{
        register struct buf *bp;
        register int c;
        int p2dp;
        register struct pte *dpte, *vpte;

        VOID spl6();
        while (bswlist.av_forw == NULL) {
                bswlist.b_flags |= B_WANTED;
                sleep((caddr_t)&bswlist, PSWP+1);
        }
        bp = bswlist.av_forw;
        bswlist.av_forw = bp->av_forw;
        VOID spl0();

        bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
        if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
                if (rdflg == B_READ)
                        sum.v_pswpin += btoc(nbytes);
                else
                        sum.v_pswpout += btoc(nbytes);
        bp->b_proc = p;
        if (flag & B_DIRTY) {
                p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
                dpte = dptopte(&proc[2], p2dp);
                vpte = vtopte(p, btop(addr));
                for (c = 0; c < nbytes; c += NBPG) {
                        if (vpte->pg_pfnum == 0 || vpte->pg_fod)
                                panic("swap bad pte");
                        *dpte++ = *vpte++;
                }
                bp->b_un.b_addr = (caddr_t)ctob(p2dp);
        } else
                bp->b_un.b_addr = addr;
        while (nbytes > 0) {
                c = imin(ctob(120), nbytes);
                bp->b_bcount = c;
                bp->b_blkno = dblkno;
                bp->b_dev = dev;
                if (dev == swapdev)
                        bp->b_blkno += swplo;
                (*bdevsw[major(dev)].d_strategy)(bp);
                if (flag & B_DIRTY) {
                        if (c < nbytes)
                                panic("big push");
                        swsize[bp - swbuf] = nbytes;
                        swpf[bp - swbuf] = pfcent;
                        return;
                }
                VOID spl6();
                while ((bp->b_flags&B_DONE)==0)
                        sleep((caddr_t)bp, PSWP);
                VOID spl0();
                bp->b_un.b_addr += c;
                bp->b_flags &= ~B_DONE;
                if (bp->b_flags & B_ERROR) {
                        if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
                                panic("hard IO err in swap");
                        swkill(p, (char *)0);
                }
                nbytes -= c;
                dblkno += btoc(c);
        }
        VOID spl6();
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
        bp->av_forw = bswlist.av_forw;
        bswlist.av_forw = bp;
        if (bswlist.b_flags & B_WANTED) {
                bswlist.b_flags &= ~B_WANTED;
                wakeup((caddr_t)&bswlist);
                wakeup((caddr_t)&proc[2]);
        }
        VOID spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
        struct proc *p;
        char *rout;
{

        printf("%d: ", p->p_pid);
        if (rout)
                printf("out of swap space in %s\n", rout);
        else
                printf("killed on swap error\n");
        /*
         * To be sure no looping (e.g. in vmsched trying to
         * swap out) mark process locked in core (as though
         * done by user) after killing it so no one will try
         * to swap it out.
         */
        psignal(p, SIGKIL);
        p->p_flag |= SULOCK;
}

/*
 * make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
        dev_t dev;
{
        register struct buf *bp;

loop:
        VOID spl6();
        for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
                if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
                        bp->b_flags |= B_ASYNC;
                        notavail(bp);
                        bwrite(bp);
                        goto loop;
                }
        }
        VOID spl0();
}
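/*
 * Illustrative sketch (hypothetical; example_chunks is an assumed
 * name, not a routine in this system): the main loop of swap() above
 * breaks a transfer into pieces of at most ctob(120) bytes and then
 * advances the address, the byte count and the disk block number by
 * the amount just transferred.  The arithmetic alone:
 */
example_chunks(dblkno, addr, nbytes)
        swblk_t dblkno;
        caddr_t addr;
        register int nbytes;
{
        register int c;

        while (nbytes > 0) {
                c = imin(ctob(120), nbytes);    /* at most 120 clicks per I/O */
                /* ... start one transfer of c bytes here and wait for it ... */
                addr += c;
                nbytes -= c;
                dblkno += btoc(c);              /* same advance as in swap() */
        }
}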
/*
 * Raw I/O.  The arguments are
 *      The strategy routine for the device
 *      A buffer, which will always be a special buffer
 *        header owned exclusively by the device for this purpose
 *      The device number
 *      Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
        int (*strat)();
        register struct buf *bp;
        unsigned (*mincnt)();
{
        register int c;
        char *a;

        if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
                u.u_error = EFAULT;
                return;
        }
        VOID spl6();
        while (bp->b_flags&B_BUSY) {
                bp->b_flags |= B_WANTED;
                sleep((caddr_t)bp, PRIBIO+1);
        }
        bp->b_error = 0;
        bp->b_proc = u.u_procp;
        bp->b_un.b_addr = u.u_base;
        while (u.u_count != 0 && bp->b_error==0) {
                bp->b_flags = B_BUSY | B_PHYS | rw;
                bp->b_dev = dev;
                bp->b_blkno = u.u_offset >> PGSHIFT;
                bp->b_bcount = u.u_count;
                (*mincnt)(bp);
                c = bp->b_bcount;
                u.u_procp->p_flag |= SPHYSIO;
                vslock(a = bp->b_un.b_addr, c);
                (*strat)(bp);
                VOID spl6();
                while ((bp->b_flags&B_DONE) == 0)
                        sleep((caddr_t)bp, PRIBIO);
                vsunlock(a, c, rw);
                u.u_procp->p_flag &= ~SPHYSIO;
                if (bp->b_flags&B_WANTED)
                        wakeup((caddr_t)bp);
                VOID spl0();
                bp->b_un.b_addr += c;
                u.u_count -= c;
                u.u_offset += c;
        }
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
        u.u_count = bp->b_resid;
        geterror(bp);
}

/*ARGSUSED*/
unsigned
minphys(bp)
        struct buf *bp;
{

        if (bp->b_bcount > 60 * 1024)
                bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
        register struct buf *bp;
{

        if (bp->b_flags&B_ERROR)
                if ((u.u_error = bp->b_error)==0)
                        u.u_error = EIO;
}
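/*
 * Illustrative sketch (hypothetical driver code; xxread, xxstrategy
 * and xxrbuf are assumed names, not part of this system): a character
 * device read routine typically hands its strategy routine, a private
 * buffer header and minphys() to physio(), which does the address
 * validation, page locking and chunking loop shown above.
 */
int     xxstrategy();
struct  buf xxrbuf;

xxread(dev)
        dev_t dev;
{

        physio(xxstrategy, &xxrbuf, dev, B_READ, minphys);
}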