/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.59 (Berkeley) 05/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}
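/*
 * Illustrative sketch of how a hypothetical filesystem read path uses
 * the routines below ("lbn" and "bsize" are placeholders for the
 * caller's logical block number and block size): bread() returns the
 * buffer locked and filled, and the caller releases it with brelse()
 * when done.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_un.b_addr ...
 *	brelse(bp);
 */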
/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
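/*
 * Example (sketch, hypothetical caller): read block "lbn" and ask for
 * one asynchronous read-ahead block behind it; "bsize" stands in for
 * the filesystem block size.
 *
 *	daddr_t rablk = lbn + 1;
 *	int rasize = bsize;
 *
 *	error = breadn(vp, lbn, bsize, &rablk, &rasize, 1, NOCRED, &bp);
 */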
/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 * If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.  If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
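		/*
		 * (Sketch of the window arithmetic below, as read from the
		 * code: num_ra, from the VOP_BMAP call above, is how many
		 * blocks beyond ioblkno are physically contiguous on disk.
		 * v_ralen is the current read-ahead window: when contiguity
		 * exceeds it, this pass is capped at v_ralen blocks and the
		 * window doubles, bounded by MAXPHYS / size; otherwise the
		 * window tracks the contiguity as num_ra + 1.)
		 */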
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;


		if (num_ra)			/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)			/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}
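/*
 * Overview (a reading of the code below): cluster_rbuild() builds one
 * large "parent" buffer covering up to run + 1 contiguous blocks and
 * records the per-block "child" buffers in a cluster_save structure
 * hung off b_saveaddr; when the single large I/O completes,
 * cluster_callback() uses pagemove() to hand each child its share of
 * the parent's memory and calls biodone() on it.
 */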
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}
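/*
 * The write interfaces below differ mainly in when the I/O is started
 * and who waits for it (a summary of the code that follows):
 *	bwrite	- start the write now; unless B_ASYNC is set, wait for
 *		  completion and release the buffer.
 *	bdwrite	- mark the buffer dirty (B_DELWRI) and release it without
 *		  starting I/O, expecting further writes soon (tape
 *		  devices are the exception and are written immediately).
 *	bawrite	- start the write now; completion releases the buffer.
 */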
/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}
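/*
 * Sketch of how the clustered-write path below is intended to be used
 * (hypothetical caller; the call shown is illustrative): a filesystem's
 * write routine calls cluster_write(bp, file_size) in place of
 * bawrite() or bdwrite() for full blocks, letting logically sequential
 * blocks accumulate as delayed writes between v_cstart and v_lastw
 * until they can be pushed out as one large transfer by
 * cluster_wbuild().
 */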
/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;


	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;
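		/*
		 * (Note on the check below: a child may only be folded into
		 * the cluster if its disk address equals the parent's
		 * b_blkno plus the sectors already accumulated, i.e. it is
		 * physically adjacent; a mismatch means the bookkeeping is
		 * corrupt and is fatal.)
		 */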
		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}
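/*
 * A note on the contract of getblk() below (a summary of the code):
 * the buffer returned is marked B_BUSY, so the caller owns it
 * exclusively until it is released again with brelse(), or written
 * via bwrite(), bawrite(), or bdwrite(), each of which releases it.
 */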
/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed. (i.e. B_BUSY clears)
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
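	/*
	 * (The loop below steals space from victim buffers returned by
	 * getnewbuf(): up to "take" bytes are shifted from the tail of
	 * the victim's memory onto the end of tp with pagemove(); a
	 * victim left with no memory becomes an empty, invalid header.)
	 */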
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
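/*
 * Return the number of buffers currently on the locked (BQ_LOCKED)
 * free list.
 */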
int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */