/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.45 (Berkeley) 01/14/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_BUSY|B_INVAL;
		brelse(bp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
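
/*
 * Illustrative sketch (not part of this module): a typical
 * filesystem read path calls bread() and releases the buffer
 * when done with its contents.  The vnode "vp", logical block
 * "lbn", block size "bsize", and destination "dst" below are
 * hypothetical.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_un.b_addr, dst, (unsigned)bsize);
 *	brelse(bp);
 */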

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
		error = 0;
	} else
		error = 0;	/* async write; no status to report yet */
	return (error);
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
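
/*
 * Illustrative sketch (not part of this module): having dirtied a
 * buffer, a caller picks one of the three write routines above.
 * bwrite() waits for completion status, bawrite() starts the I/O
 * without waiting, and bdwrite() merely marks the buffer dirty in
 * anticipation of another write soon.  The selection conditions
 * below are a hypothetical policy, not a rule of this interface.
 *
 *	if (synchronous_update)
 *		error = bwrite(bp);
 *	else if (bp->b_bcount == bp->b_bufsize)
 *		bawrite(bp);
 *	else
 *		bdwrite(bp);
 */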

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}
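
/*
 * Illustrative sketch (not part of this module): a caller that does
 * not expect to reference a block again can age the buffer before
 * releasing it, so that brelse() queues it on the BQ_AGE list, where
 * getnewbuf() looks first for buffers to recycle.  "bp" is assumed
 * to be a busy buffer held by the caller.
 *
 *	bp->b_flags |= B_AGE;
 *	brelse(bp);
 */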

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
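
/*
 * Illustrative sketch (not part of this module): a filesystem
 * growing a partial block in place can read the old fragment and
 * then expand the buffer's memory with allocbuf(), clearing the
 * newly added space.  The sizes "osize" and "nsize" below are
 * hypothetical.
 *
 *	if (error = bread(vp, lbn, osize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	allocbuf(bp, nsize);
 *	bzero(bp->b_un.b_addr + osize, (unsigned)(nsize - osize));
 */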

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
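
/*
 * Illustrative sketch (not part of this module): a subsystem such
 * as the pageout daemon can request a completion callback instead
 * of sleeping in biowait().  The handler "mydone" below is
 * hypothetical; biodone() invokes it when the transfer finishes,
 * and because B_CALL bypasses the brelse() in biodone(), the
 * handler is then responsible for disposing of the buffer.
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = mydone;
 *	VOP_STRATEGY(bp);
 */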