/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.50 (Berkeley) 07/03/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
		binsheadfree(bp, dp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

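#ifdef notdef
/*
 * Illustrative sketch only; not part of the original source.
 * A typical synchronous, cached read of one logical block: bread
 * either finds the block in the cache or reads it in, and the
 * caller releases the buffer when done with its contents.  The
 * names example_read, lbn and bsize are invented for illustration.
 */
int
example_read(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, bsize, cred, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... consume bsize bytes at bp->b_un.b_addr ... */
	brelse(bp);
	return (0);
}
#endif /* notdef */
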
/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there are read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

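#ifdef notdef
/*
 * Illustrative sketch only; not part of the original source.
 * Reading a block while asynchronously starting one read-ahead on
 * the next logical block, as a filesystem read routine might do
 * during sequential access.  example_readahead, lbn and bsize are
 * invented for illustration; the caller must brelse *bpp when it is
 * done with the data.
 */
int
example_readahead(vp, lbn, bsize, cred, bpp)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
	struct buf **bpp;
{
	daddr_t rablkno;
	int rabsize;

	rablkno = lbn + 1;
	rabsize = bsize;
	return (breadn(vp, lbn, bsize, &rablkno, &rabsize, 1, cred, bpp));
}
#endif /* notdef */
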
/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

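#ifdef notdef
/*
 * Illustrative sketch only; not part of the original source.
 * Choosing among the write flavors after modifying a buffer: a
 * partially filled block that is likely to be written again soon is
 * left to bdwrite, a completed block can be pushed with bawrite
 * without waiting, and bwrite is used when the caller must wait for
 * and check the result of the I/O.  example_flush and the "full"
 * and "sync" flags are invented for illustration.
 */
int
example_flush(bp, full, sync)
	register struct buf *bp;
	int full, sync;
{

	if (!full) {
		bdwrite(bp);		/* likely to be written again */
		return (0);
	}
	if (!sync) {
		bawrite(bp);		/* start I/O, do not wait */
		return (0);
	}
	return (bwrite(bp));		/* wait and return any error */
}
#endif /* notdef */
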
/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change.
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

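#ifdef notdef
/*
 * Illustrative sketch only; not part of the original source.
 * When a block is about to be overwritten in its entirety there is
 * no need to read its old contents: getblk associates a buffer with
 * the block (allocbuf sizes it), the caller fills it, and a delayed
 * write queues it for later I/O.  example_rewrite, lbn, bsize and
 * data are invented for illustration.
 */
void
example_rewrite(vp, lbn, bsize, data)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	caddr_t data;
{
	register struct buf *bp;

	bp = getblk(vp, lbn, bsize);
	bcopy(data, bp->b_un.b_addr, (unsigned)bsize);
	bdwrite(bp);
}
#endif /* notdef */
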
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
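
#ifdef notdef
/*
 * Illustrative sketch only; not part of the original source.
 * An asynchronous read whose completion is handled by a callback
 * instead of a waiting process: the caller sets B_CALL and b_iodone
 * before starting the I/O, and biodone invokes the callback with
 * the buffer still busy, so the callback is responsible for
 * releasing it.  example_start and example_iodone are invented for
 * illustration; the buffer is assumed to be otherwise set up for
 * the transfer (b_vp, b_blkno, b_bcount, ...).
 */
void	example_iodone();

void
example_start(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_READ | B_ASYNC | B_CALL;
	bp->b_iodone = example_iodone;
	VOP_STRATEGY(bp);
}

void
example_iodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_ERROR) {
		/* inspect bp->b_error here */
	}
	brelse(bp);
}
#endif /* notdef */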