/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.47 (Berkeley) 05/14/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
        register int i;
        register struct buf *bp, *dp;
        register struct bufhd *hp;
        int base, residual;

        for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
                hp->b_forw = hp->b_back = (struct buf *)hp;

        for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
                dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
                dp->b_flags = B_HEAD;
        }
        base = bufpages / nbuf;
        residual = bufpages % nbuf;
        for (i = 0; i < nbuf; i++) {
                bp = &buf[i];
                bp->b_dev = NODEV;
                bp->b_bcount = 0;
                bp->b_rcred = NOCRED;
                bp->b_wcred = NOCRED;
                bp->b_dirtyoff = 0;
                bp->b_dirtyend = 0;
                bp->b_validoff = 0;
                bp->b_validend = 0;
                bp->b_un.b_addr = buffers + i * MAXBSIZE;
                if (i < residual)
                        bp->b_bufsize = (base + 1) * CLBYTES;
                else
                        bp->b_bufsize = base * CLBYTES;
                binshash(bp, &bfreelist[BQ_AGE]);
                bp->b_flags = B_INVAL;
                dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
                binsheadfree(bp, dp);
        }
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
        struct vnode *vp;
        daddr_t blkno;
        int size;
        struct ucred *cred;
        struct buf **bpp;
{
        USES_VOP_STRATEGY;
        struct proc *p = curproc;               /* XXX */
        register struct buf *bp;

        if (size == 0)
                panic("bread: size 0");
        *bpp = bp = getblk(vp, blkno, size);
        if (bp->b_flags & (B_DONE | B_DELWRI)) {
                trace(TR_BREADHIT, pack(vp, size), blkno);
                return (0);
        }
        bp->b_flags |= B_READ;
        if (bp->b_bcount > bp->b_bufsize)
                panic("bread");
        if (bp->b_rcred == NOCRED && cred != NOCRED) {
                crhold(cred);
                bp->b_rcred = cred;
        }
        VOP_STRATEGY(bp);
        trace(TR_BREADMISS, pack(vp, size), blkno);
        p->p_stats->p_ru.ru_inblock++;          /* pay for read */
        return (biowait(bp));
}
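/*
 * Illustrative sketch (not part of the original source): the typical
 * consumer pattern for bread().  The routine name and its arguments
 * are hypothetical; only bread() and brelse() are real.  A caller
 * reads a logical block, inspects the data through b_un.b_addr, and
 * returns the buffer to the cache with brelse().
 */
#ifdef notdef
static int
example_read(vp, lbn, bsize, cred)
        struct vnode *vp;
        daddr_t lbn;
        int bsize;
        struct ucred *cred;
{
        struct buf *bp;
        int error;

        if (error = bread(vp, lbn, bsize, cred, &bp))
                return (error);         /* buffer already released on error */
        /* ... examine bsize bytes at bp->b_un.b_addr ... */
        brelse(bp);                     /* hand the buffer back to the cache */
        return (0);
}
#endif /* notdef */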
/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
        struct vnode *vp;
        daddr_t blkno; int size;
        daddr_t rablkno[]; int rabsize[];
        int num;
        struct ucred *cred;
        struct buf **bpp;
{
        USES_VOP_STRATEGY;
        struct proc *p = curproc;               /* XXX */
        register struct buf *bp, *rabp;
        register int i;

        bp = NULL;
        /*
         * If the block is not memory resident,
         * allocate a buffer and start I/O.
         */
        if (!incore(vp, blkno)) {
                *bpp = bp = getblk(vp, blkno, size);
                if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
                        bp->b_flags |= B_READ;
                        if (bp->b_bcount > bp->b_bufsize)
                                panic("breadn");
                        if (bp->b_rcred == NOCRED && cred != NOCRED) {
                                crhold(cred);
                                bp->b_rcred = cred;
                        }
                        VOP_STRATEGY(bp);
                        trace(TR_BREADMISS, pack(vp, size), blkno);
                        p->p_stats->p_ru.ru_inblock++;  /* pay for read */
                } else
                        trace(TR_BREADHIT, pack(vp, size), blkno);
        }

        /*
         * If there are read-ahead blocks, start I/O
         * on them as well (as above).
         */
        for (i = 0; i < num; i++) {
                if (incore(vp, rablkno[i]))
                        continue;
                rabp = getblk(vp, rablkno[i], rabsize[i]);
                if (rabp->b_flags & (B_DONE | B_DELWRI)) {
                        brelse(rabp);
                        trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
                } else {
                        rabp->b_flags |= B_ASYNC | B_READ;
                        if (rabp->b_bcount > rabp->b_bufsize)
                                panic("breadrabp");
                        if (rabp->b_rcred == NOCRED && cred != NOCRED) {
                                crhold(cred);
                                rabp->b_rcred = cred;
                        }
                        VOP_STRATEGY(rabp);
                        trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
                        p->p_stats->p_ru.ru_inblock++;  /* pay in advance */
                }
        }

        /*
         * If block was memory resident, let bread get it.
         * If block was not memory resident, the read was
         * started above, so just wait for the read to complete.
         */
        if (bp == NULL)
                return (bread(vp, blkno, size, cred, bpp));
        return (biowait(bp));
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
        register struct buf *bp;
{
        USES_VOP_STRATEGY;
        struct proc *p = curproc;               /* XXX */
        register int flag;
        int s, error = 0;

        flag = bp->b_flags;
        bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
        if (flag & B_ASYNC) {
                if ((flag & B_DELWRI) == 0)
                        p->p_stats->p_ru.ru_oublock++;  /* no one paid yet */
                else
                        reassignbuf(bp, bp->b_vp);
        }
        trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
        if (bp->b_bcount > bp->b_bufsize)
                panic("bwrite");
        s = splbio();
        bp->b_vp->v_numoutput++;
        splx(s);
        VOP_STRATEGY(bp);

        /*
         * If the write was synchronous, then await I/O completion.
         * If the write was "delayed", then we put the buffer on
         * the queue of blocks awaiting I/O completion status.
         */
        if ((flag & B_ASYNC) == 0) {
                error = biowait(bp);
                if ((flag & B_DELWRI) == 0)
                        p->p_stats->p_ru.ru_oublock++;  /* no one paid yet */
                else
                        reassignbuf(bp, bp->b_vp);
                brelse(bp);
        } else if (flag & B_DELWRI) {
                s = splbio();
                bp->b_flags |= B_AGE;
                splx(s);
        }
        return (error);
}
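/*
 * Illustrative sketch (not part of the original source): choosing
 * among the three write interfaces defined in this file.  The routine
 * and its "sync"/"partial" arguments are hypothetical; bwrite(),
 * bdwrite() and bawrite() are real.
 */
#ifdef notdef
static int
example_write(bp, sync, partial)
        struct buf *bp;
        int sync, partial;
{
        if (sync)
                return (bwrite(bp));    /* start I/O and wait for it */
        if (partial) {
                bdwrite(bp);            /* mark dirty, expect rewrite soon */
                return (0);
        }
        bawrite(bp);                    /* start I/O, do not wait */
        return (0);
}
#endif /* notdef */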
/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
        register struct buf *bp;
{
        USES_VOP_IOCTL;
        struct proc *p = curproc;               /* XXX */

        if ((bp->b_flags & B_DELWRI) == 0) {
                bp->b_flags |= B_DELWRI;
                reassignbuf(bp, bp->b_vp);
                p->p_stats->p_ru.ru_oublock++;  /* no one paid yet */
        }
        /*
         * If this is a tape drive, the write must be initiated.
         */
        if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
                bawrite(bp);
        } else {
                bp->b_flags |= (B_DONE | B_DELWRI);
                brelse(bp);
        }
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
        register struct buf *bp;
{

        /*
         * Setting the ASYNC flag causes bwrite to return
         * after starting the I/O.
         */
        bp->b_flags |= B_ASYNC;
        (void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
        register struct buf *bp;
{
        register struct buf *flist;
        int s;

        trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
        /*
         * If a process is waiting for the buffer, or
         * is waiting for a free buffer, awaken it.
         */
        if (bp->b_flags & B_WANTED)
                wakeup((caddr_t)bp);
        if (bfreelist[0].b_flags & B_WANTED) {
                bfreelist[0].b_flags &= ~B_WANTED;
                wakeup((caddr_t)bfreelist);
        }
        /*
         * Retry I/O for locked buffers rather than invalidating them.
         */
        s = splbio();
        if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
                bp->b_flags &= ~B_ERROR;
        /*
         * Disassociate buffers that are no longer valid.
         */
        if (bp->b_flags & (B_NOCACHE | B_ERROR))
                bp->b_flags |= B_INVAL;
        if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
                if (bp->b_vp)
                        brelvp(bp);
                bp->b_flags &= ~B_DELWRI;
        }
        /*
         * Stick the buffer back on a free list.
         */
        if (bp->b_bufsize <= 0) {
                /* block has no buffer ... put at front of unused buffer list */
                flist = &bfreelist[BQ_EMPTY];
                binsheadfree(bp, flist);
        } else if (bp->b_flags & (B_ERROR | B_INVAL)) {
                /* block has no info ... put at front of most free list */
                flist = &bfreelist[BQ_AGE];
                binsheadfree(bp, flist);
        } else {
                if (bp->b_flags & B_LOCKED)
                        flist = &bfreelist[BQ_LOCKED];
                else if (bp->b_flags & B_AGE)
                        flist = &bfreelist[BQ_AGE];
                else
                        flist = &bfreelist[BQ_LRU];
                binstailfree(bp, flist);
        }
        bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
        splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
        struct vnode *vp;
        daddr_t blkno;
{
        register struct buf *bp;
        register struct buf *dp;

        dp = BUFHASH(vp, blkno);
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
                if (bp->b_lblkno == blkno && bp->b_vp == vp &&
                    (bp->b_flags & B_INVAL) == 0)
                        return (1);
        return (0);
}
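/*
 * Illustrative sketch (not part of the original source): incore()
 * lets a caller test residency without acquiring the buffer, e.g. to
 * skip read-ahead that is already cached, mirroring the loop in
 * breadn() above.  The routine name and its arguments are hypothetical.
 */
#ifdef notdef
static void
example_readahead(vp, nextbn, bsize)
        struct vnode *vp;
        daddr_t nextbn;
        int bsize;
{
        USES_VOP_STRATEGY;
        struct buf *rabp;

        if (incore(vp, nextbn))
                return;                 /* already resident; nothing to do */
        rabp = getblk(vp, nextbn, bsize);
        if (rabp->b_flags & (B_DONE | B_DELWRI)) {
                brelse(rabp);           /* raced: buffer is valid after all */
                return;
        }
        rabp->b_flags |= B_ASYNC | B_READ;
        VOP_STRATEGY(rabp);             /* fire and forget; biodone releases */
}
#endif /* notdef */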
/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
        register struct vnode *vp;
        daddr_t blkno;
        int size;
{
        register struct buf *bp, *dp;
        int s;

        if (size > MAXBSIZE)
                panic("getblk: size too big");
        /*
         * Search the cache for the block. If the buffer is found,
         * but it is currently locked, then we must wait for it to
         * become available.
         */
        dp = BUFHASH(vp, blkno);
loop:
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_lblkno != blkno || bp->b_vp != vp ||
                    (bp->b_flags & B_INVAL))
                        continue;
                s = splbio();
                if (bp->b_flags & B_BUSY) {
                        bp->b_flags |= B_WANTED;
                        sleep((caddr_t)bp, PRIBIO + 1);
                        splx(s);
                        goto loop;
                }
                bremfree(bp);
                bp->b_flags |= B_BUSY;
                splx(s);
                if (bp->b_bcount != size) {
                        printf("getblk: stray size");
                        bp->b_flags |= B_INVAL;
                        bwrite(bp);
                        goto loop;
                }
                bp->b_flags |= B_CACHE;
                return (bp);
        }
        bp = getnewbuf();
        bremhash(bp);
        bgetvp(vp, bp);
        bp->b_bcount = 0;
        bp->b_lblkno = blkno;
        bp->b_blkno = blkno;
        bp->b_error = 0;
        bp->b_resid = 0;
        binshash(bp, dp);
        allocbuf(bp, size);
        return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
        int size;
{
        register struct buf *bp, *flist;

        if (size > MAXBSIZE)
                panic("geteblk: size too big");
        bp = getnewbuf();
        bp->b_flags |= B_INVAL;
        bremhash(bp);
        flist = &bfreelist[BQ_AGE];
        bp->b_bcount = 0;
        bp->b_error = 0;
        bp->b_resid = 0;
        binshash(bp, flist);
        allocbuf(bp, size);
        return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
        register struct buf *tp;
        int size;
{
        register struct buf *bp, *ep;
        int sizealloc, take, s;

        sizealloc = roundup(size, CLBYTES);
        /*
         * Buffer size does not change.
         */
        if (sizealloc == tp->b_bufsize)
                goto out;
        /*
         * Buffer size is shrinking.
         * Place excess space in a buffer header taken from the
         * BQ_EMPTY buffer list and placed on the "most free" list.
         * If no extra buffer headers are available, leave the
         * extra space in the present buffer.
         */
        if (sizealloc < tp->b_bufsize) {
                ep = bfreelist[BQ_EMPTY].av_forw;
                if (ep == &bfreelist[BQ_EMPTY])
                        goto out;
                s = splbio();
                bremfree(ep);
                ep->b_flags |= B_BUSY;
                splx(s);
                pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
                    (int)tp->b_bufsize - sizealloc);
                ep->b_bufsize = tp->b_bufsize - sizealloc;
                tp->b_bufsize = sizealloc;
                ep->b_flags |= B_INVAL;
                ep->b_bcount = 0;
                brelse(ep);
                goto out;
        }
        /*
         * More buffer space is needed. Get it out of buffers on
         * the "most free" list, placing the empty headers on the
         * BQ_EMPTY buffer header list.
         */
        while (tp->b_bufsize < sizealloc) {
                take = sizealloc - tp->b_bufsize;
                bp = getnewbuf();
                if (take >= bp->b_bufsize)
                        take = bp->b_bufsize;
                pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
                    &tp->b_un.b_addr[tp->b_bufsize], take);
                tp->b_bufsize += take;
                bp->b_bufsize = bp->b_bufsize - take;
                if (bp->b_bcount > bp->b_bufsize)
                        bp->b_bcount = bp->b_bufsize;
                if (bp->b_bufsize <= 0) {
                        bremhash(bp);
                        binshash(bp, &bfreelist[BQ_EMPTY]);
                        bp->b_dev = NODEV;
                        bp->b_error = 0;
                        bp->b_flags |= B_INVAL;
                }
                brelse(bp);
        }
out:
        tp->b_bcount = size;
        return (1);
}
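/*
 * Illustrative sketch (not part of the original source): geteblk()
 * provides an anonymous buffer not associated with any block, useful
 * as scratch space.  The arithmetic in the comments assumes
 * CLBYTES == 1024 (it is machine-dependent): allocbuf() rounds the
 * request up to a click multiple, so asking for 2500 bytes leaves
 * b_bufsize == 3072 and b_bcount == 2500.
 */
#ifdef notdef
static void
example_scratch()
{
        struct buf *bp;

        bp = geteblk(2500);     /* b_bufsize rounded up to 3072 */
        /* ... use bp->b_un.b_addr as scratch space ... */
        brelse(bp);             /* B_INVAL from geteblk sends it to BQ_AGE */
}
#endif /* notdef */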
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
        register struct buf *bp, *dp;
        register struct ucred *cred;
        int s;

#ifdef LFS
        lfs_flush();
#endif
loop:
        s = splbio();
        for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
                if (dp->av_forw != dp)
                        break;
        if (dp == bfreelist) {          /* no free blocks */
                dp->b_flags |= B_WANTED;
                sleep((caddr_t)dp, PRIBIO + 1);
                splx(s);
                goto loop;
        }
        bp = dp->av_forw;
        bremfree(bp);
        bp->b_flags |= B_BUSY;
        splx(s);
        if (bp->b_flags & B_DELWRI) {
                (void) bawrite(bp);
                goto loop;
        }
        trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
        if (bp->b_vp)
                brelvp(bp);
        if (bp->b_rcred != NOCRED) {
                cred = bp->b_rcred;
                bp->b_rcred = NOCRED;
                crfree(cred);
        }
        if (bp->b_wcred != NOCRED) {
                cred = bp->b_wcred;
                bp->b_wcred = NOCRED;
                crfree(cred);
        }
        bp->b_flags = B_BUSY;
        bp->b_dirtyoff = bp->b_dirtyend = 0;
        bp->b_validoff = bp->b_validend = 0;
        return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
        register struct buf *bp;
{
        int s;

        s = splbio();
        while ((bp->b_flags & B_DONE) == 0)
                sleep((caddr_t)bp, PRIBIO);
        splx(s);
        if ((bp->b_flags & B_ERROR) == 0)
                return (0);
        if (bp->b_error)
                return (bp->b_error);
        return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
        register struct buf *bp;
{

        if (bp->b_flags & B_DONE)
                panic("dup biodone");
        bp->b_flags |= B_DONE;
        if ((bp->b_flags & B_READ) == 0)
                vwakeup(bp);
        if (bp->b_flags & B_CALL) {
                bp->b_flags &= ~B_CALL;
                (*bp->b_iodone)(bp);
                return;
        }
        if (bp->b_flags & B_ASYNC)
                brelse(bp);
        else {
                bp->b_flags &= ~B_WANTED;
                wakeup((caddr_t)bp);
        }
}
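/*
 * Illustrative sketch (not part of the original source): how a caller
 * such as the pageout daemon arranges an I/O completion callback with
 * B_CALL, so that biodone() invokes b_iodone instead of waking a
 * sleeper or releasing the buffer itself.  Both routine names here
 * are hypothetical.
 */
#ifdef notdef
static void
example_iodone(bp)
        struct buf *bp;
{

        /* called from biodone(), typically at interrupt level */
        /* ... note completion, reclaim any private resources ... */
        brelse(bp);             /* callback owns the buffer; release it */
}

static void
example_start_async(bp)
        struct buf *bp;
{
        USES_VOP_STRATEGY;

        bp->b_flags |= B_CALL | B_ASYNC;
        bp->b_iodone = example_iodone;
        VOP_STRATEGY(bp);       /* no biowait(); example_iodone fires later */
}
#endif /* notdef */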