/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 *	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#include <sys/buf2.h>
#include <vm/vm_page2.h>

#include <machine/limits.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
	    int blksize);
static struct buf *
	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
	    off_t doffset, int blksize, int run, struct buf *fbp);
static void cluster_callback (struct bio *);
static void cluster_setram (struct buf *);
static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
	    off_t start_loffset, int bytes);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind setting");
static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
    &write_behind_minfilesize, 0, "Cluster write-behind setting");
static int max_readahead = 2 * 1024 * 1024;
SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
    "Limit in bytes for desired cluster read-ahead");
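
/*
 * Note: the three knobs above are runtime-tunable via sysctl(8), for
 * example (illustrative; the values shown are the compiled-in defaults):
 *
 *	sysctl vfs.write_behind=1
 *	sysctl vfs.write_behind_minfilesize=10485760
 *	sysctl vfs.max_readahead=2097152
 */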

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * This replaces bread(), providing a synchronous read of the requested
 * buffer plus asynchronous read-ahead within the specified bounds.
 *
 * The caller may pre-populate *bpp if it already has the requested buffer
 * in-hand, else must set *bpp to NULL.  Note that the cluster_read() inline
 * sets *bpp to NULL and then calls cluster_readx() for compatibility.
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typ ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
int
cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
    int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int error;
	int i;
	int maxra;
	int maxrbuild;

	error = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where userland is issuing big
	 * read()'s even though the I/O is random.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	if (*bpp)
		reqbp = bp = *bpp;
	else
		*bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			return 0;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			return 0;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Shortcut the scan.  Typically the way this works is that
		 * we've built up all the blocks in between except for the
		 * last in previous iterations, so if the second-to-last
		 * block is present we just skip ahead to it.
		 *
		 * This algorithm has O(1) cpu in the steady state no
		 * matter how large maxra is.
		 */
		bp->b_flags &= ~B_RAM;

		if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
			i = maxra - 1;
		else
			i = 1;
		while (i < maxra) {
			if (findblk(vp, loffset + i * blksize,
			    FINDBLK_TEST) == NULL) {
				break;
			}
			++i;
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			return 0;

		/*
		 * Calculate where to start the read-ahead and how much
		 * to do.  Generally speaking we want to read-ahead by
		 * (maxra) when we've found a read-ahead mark.  We do
		 * not want to reduce maxra here as it will cause
		 * successive read-ahead I/O's to be smaller and smaller.
		 *
		 * However, we have to make sure we don't break the
		 * filesize limitation for the clustered operation.
		 */
		loffset += i * blksize;
		reqbp = bp = NULL;

		if (loffset >= filesize)
			return 0;
		if (loffset + maxra * blksize > filesize) {
			maxreq = filesize - loffset;
			maxra = (int)(maxreq / blksize);
		}
	} else {
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;

		/*
		 * Set-up synchronous read for bp.
		 */
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = biodone_sync;
		bp->b_bio1.bio_flags |= BIO_SYNC;

		KASSERT(firstread != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		/*
		 * nblks is our cluster_rbuild request size, limited
		 * primarily by the device.
		 */
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
			    &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
			    doffset, blksize, nblks, bp);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			cluster_setram(bp);
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If B_CACHE was not set issue bp.  bp will either be an
	 * asynchronous cluster buf or a synchronous single-buf.
	 * If it is a single buf it will be the same as reqbp.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		vn_strategy(vp, &bp->b_bio1);
		error = 0;
		/* bp invalid now */
		bp = NULL;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	while (error == 0 && maxra > 0) {
		int burstbytes;
		int tmp_error;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
		    GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * An error from the read-ahead bmap has nothing to do
		 * with the caller's original request.
		 */
		tmp_error = VOP_BMAP(vp, loffset, &doffset,
		    &burstbytes, NULL, BUF_CMD_READ);
		if (tmp_error || doffset == NOOFFSET) {
			rbp->b_flags |= B_INVAL;
			brelse(rbp);
			rbp = NULL;
			goto no_read_ahead;
		}
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;
		if (nblks > burstbytes / blksize)
			nblks = burstbytes / blksize;

		/*
		 * rbp: async read
		 */
		rbp->b_cmd = BUF_CMD_READ;
		/*rbp->b_flags |= B_AGE*/;
		cluster_setram(rbp);

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
			    doffset, blksize, nblks, rbp);
		} else {
			rbp->b_bio2.bio_offset = doffset;
		}

		rbp->b_flags &= ~(B_ERROR|B_INVAL);

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * Wait for our original buffer to complete its I/O.  reqbp will
	 * be NULL if the original buffer was B_CACHE.  We are returning
	 * (*bpp) which is the same as reqbp when reqbp != NULL.
	 */
no_read_ahead:
	if (reqbp) {
		KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
		error = biowait(&reqbp->b_bio1, "clurd");
	}
	return (error);
}

/*
 * This replaces breadcb(), providing an asynchronous read of the requested
 * buffer with a callback, plus an asynchronous read-ahead within the
 * specified bounds.
 *
 * The callback must check whether BIO_DONE is set in the bio and issue
 * bpdone(bp, 0) if it isn't.  The callback is responsible for clearing
 * BIO_DONE and disposing of the I/O (bqrelse()ing it).
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typ ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
void
cluster_readcb(struct vnode *vp, off_t filesize, off_t loffset,
    int blksize, size_t minreq, size_t maxreq,
    void (*func)(struct bio *), void *arg)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int i;
	int maxra;
	int maxrbuild;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where userland is issuing big
	 * read()'s even though the I/O is random.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Setup for func() call whether we do read-ahead or not.
		 */
		bp->b_bio1.bio_caller_info1.ptr = arg;
		bp->b_bio1.bio_flags |= BIO_DONE;

		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			goto no_read_ahead;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			goto no_read_ahead;
		bp->b_flags &= ~B_RAM;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Shortcut the scan.  Typically the way this works is that
		 * we've built up all the blocks in between except for the
		 * last in previous iterations, so if the second-to-last
		 * block is present we just skip ahead to it.
		 *
		 * This algorithm has O(1) cpu in the steady state no
		 * matter how large maxra is.
		 */
		if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
			i = maxra - 1;
		else
			i = 1;
		while (i < maxra) {
			if (findblk(vp, loffset + i * blksize,
			    FINDBLK_TEST) == NULL) {
				break;
			}
			++i;
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			goto no_read_ahead;

		/*
		 * Calculate where to start the read-ahead and how much
		 * to do.  Generally speaking we want to read-ahead by
		 * (maxra) when we've found a read-ahead mark.  We do
		 * not want to reduce maxra here as it will cause
		 * successive read-ahead I/O's to be smaller and smaller.
		 *
		 * However, we have to make sure we don't break the
		 * filesize limitation for the clustered operation.
		 */
		loffset += i * blksize;
		bp = NULL;
		/* leave reqbp intact to force function callback */

		if (loffset >= filesize)
			goto no_read_ahead;
		if (loffset + maxra * blksize > filesize) {
			maxreq = filesize - loffset;
			maxra = (int)(maxreq / blksize);
		}
	} else {
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;
		int tmp_error;

		/*
		 * Set-up asynchronous read for bp.
		 */
		bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = func;
		bp->b_bio1.bio_caller_info1.ptr = arg;
		BUF_KERNPROC(bp);
		reqbp = NULL;	/* don't func() reqbp, it's running async */

		KASSERT(firstread != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		/*
		 * nblks is our cluster_rbuild request size, limited
		 * primarily by the device.
		 */
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;

		if (nblks > 1) {
			int burstbytes;

			tmp_error = VOP_BMAP(vp, loffset, &doffset,
			    &burstbytes, NULL, BUF_CMD_READ);
			if (tmp_error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
			    doffset, blksize, nblks, bp);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			cluster_setram(bp);
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If bp != NULL then B_CACHE was *NOT* set and bp must be issued.
	 * bp will either be an asynchronous cluster buf or an asynchronous
	 * single-buf.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		vn_strategy(vp, &bp->b_bio1);
		/* bp invalid now */
		bp = NULL;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	while (maxra > 0) {
		int burstbytes;
		int tmp_error;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
		    GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * An error from the read-ahead bmap has nothing to do
		 * with the caller's original request.
		 */
		tmp_error = VOP_BMAP(vp, loffset, &doffset,
		    &burstbytes, NULL, BUF_CMD_READ);
		if (tmp_error || doffset == NOOFFSET) {
			rbp->b_flags |= B_INVAL;
			brelse(rbp);
			rbp = NULL;
			goto no_read_ahead;
		}
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;
		if (nblks > burstbytes / blksize)
			nblks = burstbytes / blksize;

		/*
		 * rbp: async read
		 */
		rbp->b_cmd = BUF_CMD_READ;
		/*rbp->b_flags |= B_AGE*/;
		cluster_setram(rbp);

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
			    doffset, blksize, nblks, rbp);
		} else {
			rbp->b_bio2.bio_offset = doffset;
		}

		rbp->b_flags &= ~(B_ERROR|B_INVAL);

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * If reqbp is non-NULL it had B_CACHE set and we issue the
	 * function callback synchronously.
	 *
	 * Note that we may start additional asynchronous I/O before doing
	 * the func() callback for the B_CACHE case.
	 */
no_read_ahead:
	if (reqbp)
		func(&reqbp->b_bio1);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * This function either returns a cluster buf or it returns fbp.  fbp is
 * already expected to be set up as a synchronous or asynchronous request.
 *
 * If a cluster buf is returned it will always be async.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
    int blksize, int run, struct buf *fbp)
{
	struct buf *bp, *tbp;
	off_t boffset;
	int i, j;
	int maxiosize = vmaxiosize(vp);

	/*
	 * avoid a division
	 */
	while (loffset + run * blksize > filesize) {
		--run;
	}

	tbp = fbp;
	tbp->b_bio2.bio_offset = doffset;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
		return tbp;
	}

	bp = trypbuf_kva(&cluster_pbuf_freecnt);
	if (bp == NULL) {
		return tbp;
	}

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags |= B_CLUSTER | B_VMIO;
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bio1.bio_done = cluster_callback;		/* default to async */
	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = doffset;
	KASSERT(bp->b_loffset != NOOFFSET,
	    ("cluster_rbuild: no buffer offset"));

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_xio.xio_npages = 0;

	for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
		if (i) {
			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
			    round_page(blksize) > maxiosize) {
				break;
			}

			/*
			 * Shortcut some checks and try to avoid buffers that
			 * would block in the lock.  The same checks have to
			 * be made again after we officially get the buffer.
			 */
			tbp = getblk(vp, loffset + i * blksize, blksize,
			    GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
			if (tbp == NULL)
				break;
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
			    (tbp->b_flags & B_VMIO) == 0 ||
			    (LIST_FIRST(&tbp->b_dep) != NULL &&
			     buf_checkread(tbp))
			) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate.  Always
			 * set the read-ahead mark at (run - 1).  It is
			 * unclear why we were also setting it at i == 1.
			 */
			if (/*i == 1 ||*/ i == (run - 1))
				cluster_setram(tbp);

			/*
			 * Depress the priority of buffers not explicitly
			 * requested.
			 */
			/* tbp->b_flags |= B_AGE; */

			/*
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			if (tbp->b_bio2.bio_offset == NOOFFSET) {
				tbp->b_bio2.bio_offset = boffset;
			} else if (tbp->b_bio2.bio_offset != boffset) {
				brelse(tbp);
				break;
			}
		}

		/*
		 * The passed-in tbp (i == 0) will already be set up for
		 * async or sync operation.  All other tbp's acquired in
		 * our loop are set up for async operation.
		 */
		tbp->b_cmd = BUF_CMD_READ;
		BUF_KERNPROC(tbp);
		cluster_append(&bp->b_bio1, tbp);
		for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
			vm_page_t m;

			m = tbp->b_xio.xio_pages[j];
			vm_page_busy_wait(m, FALSE, "clurpg");
			vm_page_io_start(m);
			vm_page_wakeup(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_xio.xio_npages == 0) ||
			    (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
				bp->b_xio.xio_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_xio.xio_pages[j] = bogus_page;
		}

		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'blksize'.
		 */
		if (tbp->b_bcount != blksize)
			kprintf("warning: tbp->b_bcount wrong %d vs %d\n",
			    tbp->b_bcount, blksize);
		if (tbp->b_bufsize != blksize)
			kprintf("warning: tbp->b_bufsize wrong %d vs %d\n",
			    tbp->b_bufsize, blksize);
		bp->b_bcount += blksize;
		bp->b_bufsize += blksize;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page.
	 */
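	/*
	 * (Substituting the throw-away bogus_page for a fully valid page
	 * means the device I/O, which transfers through the cluster buf's
	 * mapping established by pmap_qenter() below, cannot clobber the
	 * already-valid contents of that page.)
	 */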
	for (j = 0; j < bp->b_xio.xio_npages; j++) {
		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_xio.xio_pages[j] = bogus_page;
		}
	}
	if (bp->b_bufsize > bp->b_kvasize) {
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
		    bp->b_bufsize, bp->b_kvasize);
	}
	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
	    (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	BUF_KERNPROC(bp);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * The returned bio is &bp->b_bio1
 */
void
cluster_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.  A short read (EOF)
	 * is a critical error.
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	} else if (bp->b_bcount != bp->b_bufsize) {
		panic("cluster_callback: unexpected EOF on cluster %p!", bio);
	}

	pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
	    bp->b_xio.xio_npages);

	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.  Since the memory map
	 * is the same, no actual copying is required.
	 */
	while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
		bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
		if (error) {
			tbp->b_flags |= B_ERROR | B_IODEBUG;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~(B_ERROR|B_INVAL);
			tbp->b_flags |= B_IODEBUG;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		biodone(&tbp->b_bio1);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 * Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 *
 * In addition, write_behind is only activated for files that have
 * grown past a certain size (default 10MB).  Otherwise temporary files
 * wind up generating a lot of unnecessary disk I/O.
 */
static __inline int
cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
{
	int r = 0;

	switch(write_behind) {
	case 2:
		if (start_loffset < len)
			break;
		start_loffset -= len;
		/* fall through */
	case 1:
		if (vp->v_filesize >= write_behind_minfilesize) {
			r = cluster_wbuild(vp, NULL, blksize,
			    start_loffset, len);
		}
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
{
	struct vnode *vp;
	off_t loffset;
	int maxclen, cursize;
	int async;

	vp = bp->b_vp;
	if (vp->v_type == VREG)
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
	else
		async = 0;
	loffset = bp->b_loffset;
	KASSERT(bp->b_loffset != NOOFFSET,
	    ("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (loffset == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
	    bp->b_bio2.bio_offset == NOOFFSET ||
	    (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
		maxclen = vmaxiosize(vp);
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + blksize;
			if (bp->b_loffset + blksize < filesize ||
			    loffset != vp->v_lastw + blksize ||
			    vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, blksize,
					    vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp, blksize);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					kfree(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    blksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					kfree(buflist, M_SEGMENT);
					vp->v_lastw = loffset;
					vp->v_lasta = bp->b_bio2.bio_offset;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    bp->b_loffset + blksize < filesize &&
		    (bp->b_bio2.bio_offset == NOOFFSET) &&
		    (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset,
			&maxclen, NULL, BUF_CMD_WRITE) ||
		     bp->b_bio2.bio_offset == NOOFFSET)) {
			bdwrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_bio2.bio_offset;
			vp->v_cstart = loffset + blksize;
			vp->v_lastw = loffset;
			return;
		}
		if (maxclen > blksize)
			vp->v_clen = maxclen - blksize;
		else
			vp->v_clen = 0;
		if (!async && vp->v_clen == 0) {	/* I/O not contiguous */
			vp->v_cstart = loffset + blksize;
			bdwrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = loffset;
			bdwrite(bp);
		}
	} else if (loffset == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, blksize, vp->v_cstart,
			    vp->v_clen + blksize);
		vp->v_clen = 0;
		vp->v_cstart = loffset + blksize;
	} else if (vm_page_count_severe() &&
		   bp->b_loffset + blksize < filesize) {
		/*
		 * We are low on memory, get it going NOW.  However, do not
		 * try to push out a partial block at the end of the file
		 * as this could lead to extremely non-optimal write activity.
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = loffset;
	vp->v_lasta = bp->b_bio2.bio_offset;
}

/*
 * This is the clustered version of bawrite().  It works similarly to
 * cluster_write() except I/O on the buffer is guaranteed to occur.
 */
int
cluster_awrite(struct buf *bp)
{
	int total;

	/*
	 * Don't bother if it isn't clusterable.
	 */
	if ((bp->b_flags & B_CLUSTEROK) == 0 ||
	    bp->b_vp == NULL ||
	    (bp->b_vp->v_flag & VOBJBUF) == 0) {
		total = bp->b_bufsize;
		bawrite(bp);
		return (total);
	}

	total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
	    bp->b_loffset, vmaxiosize(bp->b_vp));
	if (bp)
		bawrite(bp);

	return total;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * cluster_wbuild() normally does not guarantee anything.  If bpp is
 * non-NULL and cluster_wbuild() is able to incorporate it into the
 * I/O it will set *bpp to NULL, otherwise it will leave it alone and
 * the caller must dispose of *bpp.
 */
static int
cluster_wbuild(struct vnode *vp, struct buf **bpp,
    int blksize, off_t start_loffset, int bytes)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int must_initiate;
	int maxiosize = vmaxiosize(vp);

	while (bytes > 0) {
		/*
		 * If the buffer matches the passed locked & removed buffer
		 * we use the passed buffer (which might not be B_DELWRI).
		 *
		 * Otherwise locate the buffer and determine if it is
		 * compatible.
		 */
		if (bpp && (*bpp)->b_loffset == start_loffset) {
			tbp = *bpp;
			*bpp = NULL;
			bpp = NULL;
		} else {
			tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
			if (tbp == NULL ||
			    (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
			     B_DELWRI ||
			    (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
				if (tbp)
					BUF_UNLOCK(tbp);
				start_loffset += blksize;
				bytes -= blksize;
				continue;
			}
			bremfree(tbp);
		}
		KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != blksize) ||
		    (bytes == blksize) ||
		    ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}

		/*
		 * Set up the pbuf.  Track our append point with b_bcount
		 * and b_bufsize.  b_bufsize is not used by the device but
		 * our caller uses it to loop clusters and we use it to
		 * detect a premature EOF on the block device.
		 */
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_xio.xio_npages = 0;
		bp->b_loffset = tbp->b_loffset;
		bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags &= ~B_ERROR;
		bp->b_flags |= B_CLUSTER | B_BNOCLIP |
		    (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_bio1.bio_caller_info1.cluster_head = NULL;
		bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 *
		 * IO *must* be initiated on index 0 at this point
		 * (particularly when called from cluster_awrite()).
		 */
		for (i = 0; i < bytes;
		     (i += blksize), (start_loffset += blksize)) {
			if (i == 0) {
				must_initiate = 1;
			} else {
				/*
				 * Not first buffer.
				 */
				must_initiate = 0;
				tbp = findblk(vp, start_loffset,
				    FINDBLK_NBLOCK);
				/*
				 * Buffer not found or could not be locked
				 * non-blocking.
				 */
				if (tbp == NULL)
					break;

				/*
				 * If it IS in core, but has different
				 * characteristics, then don't cluster
				 * with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				     B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED)
				) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 *
				 * WARNING! buf_checkwrite() must be the last
				 *	    check made.  If it returns 0 then
				 *	    we must initiate the I/O.
				 */
				if ((tbp->b_bcount != blksize) ||
				    ((bp->b_bio2.bio_offset + i) !=
				     tbp->b_bio2.bio_offset) ||
				    ((tbp->b_xio.xio_npages +
				      bp->b_xio.xio_npages) >
				     (maxiosize / PAGE_SIZE)) ||
				    (LIST_FIRST(&tbp->b_dep) &&
				     buf_checkwrite(tbp))
				) {
					BUF_UNLOCK(tbp);
					break;
				}
				if (LIST_FIRST(&tbp->b_dep))
					must_initiate = 1;
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy.  We will use it.
				 */
				bremfree(tbp);
				KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
			}

			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				/*
				 * Try to avoid deadlocks with the VM system.
				 * However, we cannot abort the I/O if
				 * must_initiate is non-zero.
				 */
				if (must_initiate == 0) {
					for (j = 0;
					     j < tbp->b_xio.xio_npages;
					     ++j) {
						m = tbp->b_xio.xio_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
					m = tbp->b_xio.xio_pages[j];
					vm_page_busy_wait(m, FALSE, "clurpg");
					vm_page_io_start(m);
					vm_page_wakeup(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_xio.xio_npages == 0) ||
					    (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
						bp->b_xio.xio_npages++;
					}
				}
			}
			bp->b_bcount += blksize;
			bp->b_bufsize += blksize;

			bundirty(tbp);
			tbp->b_flags &= ~B_ERROR;
			tbp->b_cmd = BUF_CMD_WRITE;
			BUF_KERNPROC(tbp);
			cluster_append(&bp->b_bio1, tbp);

			/*
			 * check for latent dependencies to be handled
			 */
			if (LIST_FIRST(&tbp->b_dep) != NULL)
				buf_start(tbp);
		}
finishcluster:
		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
		    (vm_page_t *)bp->b_xio.xio_pages,
		    bp->b_xio.xio_npages);
		if (bp->b_bufsize > bp->b_kvasize) {
			panic("cluster_wbuild: b_bufsize(%d) "
			      "> b_kvasize(%d)\n",
			      bp->b_bufsize, bp->b_kvasize);
		}
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bp->b_bio1.bio_done = cluster_callback;
		bp->b_cmd = BUF_CMD_WRITE;

		vfs_busy_pages(vp, bp);
		bsetrunningbufspace(bp, bp->b_bufsize);
		BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);

		bytes -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed-in.
 *
 * Only pre-existing buffers whose block size matches blksize are collected.
 * (this is primarily because HAMMER1 uses varying block sizes and we don't
 * want to override its choices).
 */
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
{
	struct cluster_save *buflist;
	struct buf *bp;
	off_t loffset;
	int i, len;
	int j;
	int k;

	len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
	buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (loffset = vp->v_cstart, i = 0, j = 0;
	     i < len;
	     (loffset += blksize), i++) {
		bp = getcacheblk(vp, loffset,
		    last_bp->b_bcount, GETBLK_SZMATCH);
		buflist->bs_children[i] = bp;
		if (bp == NULL) {
			j = i + 1;
		} else if (bp->b_bio2.bio_offset == NOOFFSET) {
			VOP_BMAP(bp->b_vp, bp->b_loffset,
			    &bp->b_bio2.bio_offset,
			    NULL, NULL, BUF_CMD_WRITE);
		}
	}

	/*
	 * Get rid of gaps
	 */
	for (k = 0; k < j; ++k) {
		if (buflist->bs_children[k]) {
			bqrelse(buflist->bs_children[k]);
			buflist->bs_children[k] = NULL;
		}
	}
	if (j != 0) {
		if (j != i) {
			bcopy(buflist->bs_children + j,
			    buflist->bs_children + 0,
			    sizeof(buflist->bs_children[0]) * (i - j));
		}
		i -= j;
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
		    NULL, NULL, BUF_CMD_WRITE);
	}
	buflist->bs_nchildren = i + 1;
	return (buflist);
}

void
cluster_append(struct bio *bio, struct buf *tbp)
{
	tbp->b_cluster_next = NULL;
	if (bio->bio_caller_info1.cluster_head == NULL) {
		bio->bio_caller_info1.cluster_head = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	} else {
		bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	}
}

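/*
 * Set a read-ahead mark: B_RAM on the buffer and PG_RAM on its first
 * page.  When a later read finds the mark (see the B_RAM tests in
 * cluster_readx()/cluster_readcb() above) the access pattern is treated
 * as sequential and further read-ahead is issued.
 */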
static void
cluster_setram(struct buf *bp)
{
	bp->b_flags |= B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}