/*
 * Copyright (c) 1982 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)vfs_cluster.c	6.6 (Berkeley) 09/13/85
 */

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), blkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}
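
/*
 * Example (not part of the original source): a minimal sketch of how a
 * caller might use bread()/brelse() to fetch one block and copy it out.
 * The routine name, its arguments, and the fixed DEV_BSIZE transfer size
 * are hypothetical; the code is compiled out with "#ifdef notdef".
 */
#ifdef notdef
example_read(dev, blkno, cp)
	dev_t dev;
	daddr_t blkno;
	caddr_t cp;
{
	register struct buf *bp;
	int error;

	bp = bread(dev, blkno, DEV_BSIZE);	/* read and wait for completion */
	if (bp->b_flags & B_ERROR) {
		error = geterror(bp);		/* EIO if no specific error */
		brelse(bp);
		return (error);
	}
	bcopy(bp->b_un.b_addr, cp, (unsigned)DEV_BSIZE);
	brelse(bp);				/* return the buffer to the cache */
	return (0);
}
#endif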

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone is waiting for this buffer, or is waiting
	 * for any buffer to come free, wake them up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * If the block is already in the cache, read it (a hit is then
 * guaranteed); otherwise return 0 without starting any I/O.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}
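
/*
 * Example (not part of the original source): a minimal sketch of the
 * read-modify-write pattern that bdwrite() is meant for, where only part
 * of a block is updated and another write is expected to follow soon.
 * The routine and its arguments are hypothetical; compiled out.
 */
#ifdef notdef
example_update(dev, blkno, cp, off, len)
	dev_t dev;
	daddr_t blkno;
	caddr_t cp;
	int off, len;
{
	register struct buf *bp;

	bp = bread(dev, blkno, DEV_BSIZE);	/* bring the whole block into core */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	bcopy(cp, bp->b_un.b_addr + off, (unsigned)len);
	bdwrite(bp);		/* mark dirty and release; the write is deferred */
}
#endif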

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}
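
/*
 * Example (not part of the original source): a sketch of using geteblk()
 * for a scratch buffer that is not associated with any disk block; the
 * routine is hypothetical and compiled out.
 */
#ifdef notdef
example_scratch()
{
	register struct buf *bp;

	bp = geteblk(DEV_BSIZE);	/* buffer is marked B_INVAL, dev is NODEV */
	/* ... use bp->b_un.b_addr as temporary storage ... */
	brelse(bp);			/* B_INVAL sends it back to the AGE list */
}
#endif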

/*
 * Allocate space associated with a buffer.
 * If space cannot be obtained, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping
	 * previous I/O has been dealt with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
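
/*
 * Example (not part of the original source): a sketch of an asynchronous
 * write whose completion is reported through the B_CALL/b_iodone hook
 * honored by biodone() above.  The routines are hypothetical, the caller
 * is assumed to already own bp (B_BUSY, e.g. from getblk), and the
 * resource accounting done by bwrite() is omitted; compiled out.
 */
#ifdef notdef
example_iodone(bp)
	register struct buf *bp;
{

	brelse(bp);		/* biodone() does not release B_CALL buffers */
}

example_asyncwrite(bp)
	register struct buf *bp;
{

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_ASYNC | B_CALL;
	bp->b_iodone = example_iodone;
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
}
#endif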

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			splx(s);
			goto loop;
		}
	}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error) == 0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted file system.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness.				... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
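
/*
 * Example (not part of the original source): a sketch of the order in
 * which an unmount (or update) might use the routines above, first pushing
 * out delayed writes for the device and then invalidating what remains in
 * the cache.  The routine is hypothetical and compiled out.
 */
#ifdef notdef
example_umountflush(dev)
	dev_t dev;
{

	bflush(dev);		/* write out any delayed-write blocks */
	binval(dev);		/* then mark remaining cached blocks invalid */
}
#endif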