/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.55 (Berkeley) 10/22/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}
/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
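
/*
 * A minimal sketch (illustrative only, not compiled) of a typical
 * bread consumer: a filesystem read path obtains the block, checks
 * for an I/O error, uses the data, and releases the buffer with
 * brelse.  The vnode, logical block number, and block size are
 * hypothetical placeholders, as is the routine name.
 */
#ifdef notdef
example_read(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	if (error = bread(vp, lbn, bsize, cred, &bp)) {
		brelse(bp);		/* buffer is returned even on error */
		return (error);
	}
	/* ... consume bp->b_un.b_addr, at most bp->b_bcount bytes ... */
	brelse(bp);			/* keep the block cached on the LRU list */
	return (0);
}
#endif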
/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{

	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
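
/*
 * A minimal sketch (illustrative only, not compiled) of how a caller
 * might choose among the three write strategies above: a partially
 * filled block that will likely be extended soon is held back with
 * bdwrite, a block that will not be touched again soon is pushed with
 * bawrite, and data that must reach stable storage before the caller
 * proceeds uses bwrite.  The routine name and the "blockfull" and
 * "wantsync" parameters are hypothetical.
 */
#ifdef notdef
example_write(bp, blockfull, wantsync)
	register struct buf *bp;
	int blockfull, wantsync;
{

	if (wantsync)
		return (bwrite(bp));	/* wait for completion, return error */
	if (blockfull)
		bawrite(bp);		/* start I/O now, release on completion */
	else
		bdwrite(bp);		/* just mark dirty, write later */
	return (0);
}
#endif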
/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}
/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		sleep((caddr_t)&needbuffer, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}
/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */
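
/*
 * A minimal sketch (illustrative only, not compiled) of the B_CALL
 * convention handled by biodone above: a subsystem that wants to be
 * notified at interrupt time sets B_CALL and b_iodone before starting
 * the I/O, and the handler it supplies must then dispose of the buffer
 * itself, since biodone returns without releasing it.  The handler and
 * setup routine names are hypothetical.
 */
#ifdef notdef
void
example_iodone(bp)
	register struct buf *bp;
{

	/* called from biodone at interrupt level; must release the buffer */
	if (bp->b_flags & B_ERROR)
		bp->b_flags |= B_INVAL;		/* do not cache bad data */
	brelse(bp);
}

example_start(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_CALL | B_ASYNC;
	bp->b_iodone = example_iodone;
	VOP_STRATEGY(bp);	/* completion arrives via example_iodone */
}
#endif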