1 /* $NetBSD: rumpblk.c,v 1.37 2010/01/31 13:15:08 pooka Exp $ */ 2 3 /* 4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved. 5 * 6 * Development of this software was supported by the 7 * Finnish Cultural Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 /* 32 * Block device emulation. Presents a block device interface and 33 * uses rumpuser system calls to satisfy I/O requests. 34 * 35 * We provide fault injection. The driver can be made to fail 36 * I/O occasionally. 37 * 38 * The driver also provides an optimization for regular files by 39 * using memory-mapped I/O. This avoids kernel access for every 40 * I/O operation. It also gives finer-grained control of how to 41 * flush data. Additionally, in case the rump kernel dumps core, 42 * we get way less carnage. 43 * 44 * However, it is quite costly in writing large amounts of 45 * file data, since old contents cannot merely be overwritten, but 46 * must be paged in first before replacing (i.e. r/m/w). Ideally, 47 * we should use directio. The problem is that directio can fail 48 * silently causing improper file system semantics (i.e. unflushed 49 * data). Therefore, default to mmap for now. Even so, directio 50 * _should_ be safe and can be enabled by compiling this module 51 * with -DHAS_DIRECTIO. 52 */ 53 54 #include <sys/cdefs.h> 55 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.37 2010/01/31 13:15:08 pooka Exp $"); 56 57 #include <sys/param.h> 58 #include <sys/buf.h> 59 #include <sys/conf.h> 60 #include <sys/condvar.h> 61 #include <sys/disklabel.h> 62 #include <sys/evcnt.h> 63 #include <sys/fcntl.h> 64 #include <sys/kmem.h> 65 #include <sys/malloc.h> 66 #include <sys/queue.h> 67 #include <sys/stat.h> 68 69 #include <rump/rumpuser.h> 70 71 #include "rump_private.h" 72 #include "rump_vfs_private.h" 73 74 #if 0 75 #define DPRINTF(x) printf x 76 #else 77 #define DPRINTF(x) 78 #endif 79 80 /* Default: 16 x 1MB windows */ 81 unsigned memwinsize = (1<<20); 82 unsigned memwincnt = 16; 83 84 #define STARTWIN(off) ((off) & ~((off_t)memwinsize-1)) 85 #define INWIN(win,off) ((win)->win_off == STARTWIN(off)) 86 #define WINSIZE(rblk, win) (MIN((rblk->rblk_size-win->win_off),memwinsize)) 87 #define WINVALID(win) ((win)->win_off != (off_t)-1) 88 #define WINVALIDATE(win) ((win)->win_off = (off_t)-1) 89 struct blkwin { 90 off_t win_off; 91 void *win_mem; 92 int win_refcnt; 93 94 TAILQ_ENTRY(blkwin) win_lru; 95 }; 96 97 #define RUMPBLK_SIZE 16 98 static struct rblkdev { 99 char *rblk_path; 100 int rblk_fd; 101 int rblk_opencnt; 102 #ifdef HAS_ODIRECT 103 int rblk_dfd; 104 #endif 105 uint64_t rblk_size; 106 uint64_t rblk_hostoffset; 107 int rblk_ftype; 108 109 /* for mmap */ 110 int rblk_mmflags; 111 kmutex_t rblk_memmtx; 112 kcondvar_t rblk_memcv; 113 TAILQ_HEAD(winlru, blkwin) rblk_lruq; 114 bool rblk_waiting; 115 116 struct disklabel rblk_label; 117 } minors[RUMPBLK_SIZE]; 118 119 static struct evcnt ev_io_total; 120 static struct evcnt ev_io_async; 121 122 static struct evcnt ev_memblk_hits; 123 static struct evcnt ev_memblk_busy; 124 125 static struct evcnt ev_bwrite_total; 126 static struct evcnt ev_bwrite_async; 127 static struct evcnt ev_bread_total; 128 129 dev_type_open(rumpblk_open); 130 dev_type_close(rumpblk_close); 131 dev_type_read(rumpblk_read); 132 dev_type_write(rumpblk_write); 133 dev_type_ioctl(rumpblk_ioctl); 134 dev_type_strategy(rumpblk_strategy); 135 dev_type_strategy(rumpblk_strategy_fail); 136 dev_type_dump(rumpblk_dump); 137 dev_type_size(rumpblk_size); 138 139 static const struct bdevsw rumpblk_bdevsw = { 140 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl, 141 nodump, nosize, D_DISK 142 }; 143 144 static const struct bdevsw rumpblk_bdevsw_fail = { 145 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl, 146 nodump, nosize, D_DISK 147 }; 148 149 static const struct cdevsw rumpblk_cdevsw = { 150 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write, 151 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK 152 }; 153 154 /* fail every n out of BLKFAIL_MAX */ 155 #define BLKFAIL_MAX 10000 156 static int blkfail; 157 static unsigned randstate; 158 static kmutex_t rumpblk_lock; 159 static int sectshift = DEV_BSHIFT; 160 161 static void 162 makedefaultlabel(struct disklabel *lp, off_t size, int part) 163 { 164 int i; 165 166 memset(lp, 0, sizeof(*lp)); 167 168 lp->d_secperunit = size; 169 lp->d_secsize = 1 << sectshift; 170 lp->d_nsectors = size >> sectshift; 171 lp->d_ntracks = 1; 172 lp->d_ncylinders = 1; 173 lp->d_secpercyl = lp->d_nsectors; 174 175 /* oh dear oh dear */ 176 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename)); 177 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); 178 179 lp->d_type = DTYPE_RUMPD; 180 lp->d_rpm = 11; 181 lp->d_interleave = 1; 182 lp->d_flags = 0; 183 184 /* XXX: RAW_PART handling? */ 185 for (i = 0; i < part; i++) { 186 lp->d_partitions[i].p_fstype = FS_UNUSED; 187 } 188 lp->d_partitions[part].p_size = size >> sectshift; 189 lp->d_npartitions = part+1; 190 /* XXX: file system type? */ 191 192 lp->d_magic = DISKMAGIC; 193 lp->d_magic2 = DISKMAGIC; 194 lp->d_checksum = 0; /* XXX */ 195 } 196 197 static struct blkwin * 198 getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error) 199 { 200 struct blkwin *win; 201 202 mutex_enter(&rblk->rblk_memmtx); 203 retry: 204 /* search for window */ 205 TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) { 206 if (INWIN(win, off) && WINVALID(win)) 207 break; 208 } 209 210 /* found? return */ 211 if (win) { 212 ev_memblk_hits.ev_count++; 213 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 214 goto good; 215 } 216 217 /* 218 * Else, create new window. If the least recently used is not 219 * currently in use, reuse that. Otherwise we need to wait. 220 */ 221 win = TAILQ_LAST(&rblk->rblk_lruq, winlru); 222 if (win->win_refcnt == 0) { 223 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 224 mutex_exit(&rblk->rblk_memmtx); 225 226 if (WINVALID(win)) { 227 DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n", 228 win, win->win_mem, win->win_off)); 229 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); 230 WINVALIDATE(win); 231 } 232 233 win->win_off = STARTWIN(off); 234 win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off, 235 WINSIZE(rblk, win), rblk->rblk_mmflags, error); 236 DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n", 237 win, win->win_off, win->win_mem)); 238 239 mutex_enter(&rblk->rblk_memmtx); 240 if (win->win_mem == NULL) { 241 WINVALIDATE(win); 242 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); 243 mutex_exit(&rblk->rblk_memmtx); 244 return NULL; 245 } 246 } else { 247 DPRINTF(("memwin wait\n")); 248 ev_memblk_busy.ev_count++; 249 250 rblk->rblk_waiting = true; 251 cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx); 252 goto retry; 253 } 254 255 good: 256 KASSERT(win); 257 win->win_refcnt++; 258 TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru); 259 mutex_exit(&rblk->rblk_memmtx); 260 *wsize = MIN(*wsize, memwinsize - (off-win->win_off)); 261 KASSERT(*wsize); 262 263 return win; 264 } 265 266 static void 267 putwindow(struct rblkdev *rblk, struct blkwin *win) 268 { 269 270 mutex_enter(&rblk->rblk_memmtx); 271 if (--win->win_refcnt == 0 && rblk->rblk_waiting) { 272 rblk->rblk_waiting = false; 273 cv_signal(&rblk->rblk_memcv); 274 } 275 KASSERT(win->win_refcnt >= 0); 276 mutex_exit(&rblk->rblk_memmtx); 277 } 278 279 static void 280 wincleanup(struct rblkdev *rblk) 281 { 282 struct blkwin *win; 283 284 while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) { 285 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 286 if (WINVALID(win)) { 287 DPRINTF(("cleanup win %p addr %p\n", 288 win, win->win_mem)); 289 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); 290 } 291 kmem_free(win, sizeof(*win)); 292 } 293 rblk->rblk_mmflags = 0; 294 } 295 296 int 297 rumpblk_init(void) 298 { 299 char buf[64]; 300 int rumpblk = RUMPBLK; 301 unsigned tmp; 302 int error, i; 303 304 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE); 305 306 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) { 307 blkfail = strtoul(buf, NULL, 10); 308 /* fail everything */ 309 if (blkfail > BLKFAIL_MAX) 310 blkfail = BLKFAIL_MAX; 311 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf), 312 &error) == 0) { 313 randstate = strtoul(buf, NULL, 10); 314 } else { 315 randstate = arc4random(); 316 } 317 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. " 318 "seed %u\n", blkfail, BLKFAIL_MAX, randstate); 319 } else { 320 blkfail = 0; 321 } 322 323 if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) { 324 printf("rumpblk: "); 325 tmp = strtoul(buf, NULL, 10); 326 if (tmp && !(tmp & (tmp-1))) 327 memwinsize = tmp; 328 else 329 printf("invalid RUMP_BLKWINSIZE %d, ", tmp); 330 printf("using %d for memwinsize\n", memwinsize); 331 } 332 if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){ 333 printf("rumpblk: "); 334 tmp = strtoul(buf, NULL, 10); 335 if (tmp) 336 memwincnt = tmp; 337 else 338 printf("invalid RUMP_BLKWINCOUNT %d, ", tmp); 339 printf("using %d for memwincount\n", memwincnt); 340 } 341 if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error)==0){ 342 printf("rumpblk: "); 343 tmp = strtoul(buf, NULL, 10); 344 if (tmp >= DEV_BSHIFT) 345 sectshift = tmp; 346 else 347 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ", 348 DEV_BSHIFT, tmp); 349 printf("using %d for sector shift (size %d)\n", 350 sectshift, 1<<sectshift); 351 } 352 353 memset(minors, 0, sizeof(minors)); 354 for (i = 0; i < RUMPBLK_SIZE; i++) { 355 mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE); 356 cv_init(&minors[i].rblk_memcv, "rblkmcv"); 357 } 358 359 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL, 360 "rumpblk", "rumpblk I/O reqs"); 361 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL, 362 "rumpblk", "rumpblk async I/O"); 363 364 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL, 365 "rumpblk", "rumpblk bytes read"); 366 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL, 367 "rumpblk", "rumpblk bytes written"); 368 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL, 369 "rumpblk", "rumpblk bytes written async"); 370 371 evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL, 372 "rumpblk", "memblk window hits"); 373 evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL, 374 "rumpblk", "memblk all windows busy"); 375 376 if (blkfail) { 377 return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk, 378 &rumpblk_cdevsw, &rumpblk); 379 } else { 380 return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk, 381 &rumpblk_cdevsw, &rumpblk); 382 } 383 } 384 385 /* XXX: no deregister */ 386 int 387 rumpblk_register(const char *path, devminor_t *dmin, 388 uint64_t offset, uint64_t size) 389 { 390 struct rblkdev *rblk; 391 uint64_t flen; 392 size_t len; 393 int ftype, error, i; 394 395 /* devices might not report correct size unless they're open */ 396 if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1) 397 return error; 398 399 /* verify host file is of supported type */ 400 if (!(ftype == RUMPUSER_FT_REG 401 || ftype == RUMPUSER_FT_BLK 402 || ftype == RUMPUSER_FT_CHR)) 403 return EINVAL; 404 405 mutex_enter(&rumpblk_lock); 406 for (i = 0; i < RUMPBLK_SIZE; i++) { 407 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) { 408 mutex_exit(&rumpblk_lock); 409 *dmin = i; 410 return 0; 411 } 412 } 413 414 for (i = 0; i < RUMPBLK_SIZE; i++) 415 if (minors[i].rblk_path == NULL) 416 break; 417 if (i == RUMPBLK_SIZE) { 418 mutex_exit(&rumpblk_lock); 419 return EBUSY; 420 } 421 422 rblk = &minors[i]; 423 len = strlen(path); 424 rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK); 425 strcpy(rblk->rblk_path, path); 426 rblk->rblk_fd = -1; 427 rblk->rblk_hostoffset = offset; 428 if (size != RUMPBLK_SIZENOTSET) { 429 KASSERT(size + offset <= flen); 430 rblk->rblk_size = size; 431 } else { 432 KASSERT(offset < flen); 433 rblk->rblk_size = flen - offset; 434 } 435 rblk->rblk_ftype = ftype; 436 makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i); 437 mutex_exit(&rumpblk_lock); 438 439 *dmin = i; 440 return 0; 441 } 442 443 int 444 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l) 445 { 446 struct rblkdev *rblk = &minors[minor(dev)]; 447 int error, fd; 448 449 if (rblk->rblk_path == NULL) 450 return ENXIO; 451 452 if (rblk->rblk_fd != -1) 453 return 0; /* XXX: refcount, open mode */ 454 fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error); 455 if (error) 456 return error; 457 458 #ifdef HAS_ODIRECT 459 rblk->rblk_dfd = rumpuser_open(rblk->rblk_path, 460 OFLAGS(flag) | O_DIRECT, &error); 461 if (error) 462 return error; 463 #endif 464 465 if (rblk->rblk_ftype == RUMPUSER_FT_REG) { 466 uint64_t fsize = rblk->rblk_size, off = rblk->rblk_hostoffset; 467 struct blkwin *win; 468 int i, winsize; 469 470 /* 471 * Use mmap to access a regular file. Allocate and 472 * cache initial windows here. Failure to allocate one 473 * means fallback to read/write i/o. 474 */ 475 476 rblk->rblk_mmflags = 0; 477 if (flag & FREAD) 478 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ; 479 if (flag & FWRITE) { 480 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE; 481 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED; 482 } 483 484 TAILQ_INIT(&rblk->rblk_lruq); 485 rblk->rblk_fd = fd; 486 487 for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) { 488 win = kmem_zalloc(sizeof(*win), KM_SLEEP); 489 WINVALIDATE(win); 490 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); 491 492 /* 493 * Allocate first windows. Here we just generally 494 * make sure a) we can mmap at all b) we have the 495 * necessary VA available 496 */ 497 winsize = memwinsize; 498 win = getwindow(rblk, off + i*memwinsize, &winsize, 499 &error); 500 if (win) { 501 putwindow(rblk, win); 502 } else { 503 wincleanup(rblk); 504 break; 505 } 506 } 507 } else { 508 rblk->rblk_fd = fd; 509 } 510 511 KASSERT(rblk->rblk_fd != -1); 512 return 0; 513 } 514 515 int 516 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l) 517 { 518 struct rblkdev *rblk = &minors[minor(dev)]; 519 int dummy; 520 521 if (rblk->rblk_mmflags) 522 wincleanup(rblk); 523 rumpuser_fsync(rblk->rblk_fd, &dummy); 524 rumpuser_close(rblk->rblk_fd, &dummy); 525 rblk->rblk_fd = -1; 526 527 return 0; 528 } 529 530 int 531 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l) 532 { 533 devminor_t dmin = minor(dev); 534 struct rblkdev *rblk = &minors[dmin]; 535 struct partinfo *pi; 536 int error = 0; 537 538 /* well, me should support a few more, but we don't for now */ 539 switch (xfer) { 540 case DIOCGDINFO: 541 *(struct disklabel *)addr = rblk->rblk_label; 542 break; 543 544 case DIOCGPART: 545 pi = addr; 546 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)]; 547 pi->disklab = &rblk->rblk_label; 548 break; 549 550 /* it's synced enough along the write path */ 551 case DIOCCACHESYNC: 552 break; 553 554 default: 555 error = ENOTTY; 556 break; 557 } 558 559 return error; 560 } 561 562 static int 563 do_physio(dev_t dev, struct uio *uio, int which) 564 { 565 void (*strat)(struct buf *); 566 567 if (blkfail) 568 strat = rumpblk_strategy_fail; 569 else 570 strat = rumpblk_strategy; 571 572 return physio(strat, NULL, dev, which, minphys, uio); 573 } 574 575 int 576 rumpblk_read(dev_t dev, struct uio *uio, int flags) 577 { 578 579 return do_physio(dev, uio, B_READ); 580 } 581 582 int 583 rumpblk_write(dev_t dev, struct uio *uio, int flags) 584 { 585 586 return do_physio(dev, uio, B_WRITE); 587 } 588 589 static void 590 dostrategy(struct buf *bp) 591 { 592 struct rblkdev *rblk = &minors[minor(bp->b_dev)]; 593 off_t off; 594 int async = bp->b_flags & B_ASYNC; 595 int error; 596 597 /* collect statistics */ 598 ev_io_total.ev_count++; 599 if (async) 600 ev_io_async.ev_count++; 601 if (BUF_ISWRITE(bp)) { 602 ev_bwrite_total.ev_count += bp->b_bcount; 603 if (async) 604 ev_bwrite_async.ev_count += bp->b_bcount; 605 } else { 606 ev_bread_total.ev_count++; 607 } 608 609 off = bp->b_blkno << sectshift; 610 /* 611 * Do bounds checking if we're working on a file. Otherwise 612 * invalid file systems might attempt to read beyond EOF. This 613 * is bad(tm) especially on mmapped images. This is essentially 614 * the kernel bounds_check() routines. 615 */ 616 if (off + bp->b_bcount > rblk->rblk_size) { 617 int64_t sz = rblk->rblk_size - off; 618 619 /* EOF */ 620 if (sz == 0) { 621 rump_biodone(bp, 0, 0); 622 return; 623 } 624 /* beyond EOF ==> error */ 625 if (sz < 0) { 626 rump_biodone(bp, 0, EINVAL); 627 return; 628 } 629 630 /* truncate to device size */ 631 bp->b_bcount = sz; 632 } 633 634 off += rblk->rblk_hostoffset; 635 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64 636 " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n", 637 bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE", 638 off, off, (off + bp->b_bcount), async ? "a" : "")); 639 640 /* mmap? handle here and return */ 641 if (rblk->rblk_mmflags) { 642 struct blkwin *win; 643 int winsize, iodone; 644 uint8_t *ioaddr, *bufaddr; 645 646 for (iodone = 0; iodone < bp->b_bcount; 647 iodone += winsize, off += winsize) { 648 winsize = bp->b_bcount - iodone; 649 win = getwindow(rblk, off, &winsize, &error); 650 if (win == NULL) { 651 rump_biodone(bp, iodone, error); 652 return; 653 } 654 655 ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off)); 656 bufaddr = (uint8_t *)bp->b_data + iodone; 657 658 DPRINTF(("strat: %p off 0x%" PRIx64 659 ", ioaddr %p (%p)/buf %p\n", win, 660 win->win_off, ioaddr, win->win_mem, bufaddr)); 661 if (BUF_ISREAD(bp)) { 662 memcpy(bufaddr, ioaddr, winsize); 663 } else { 664 memcpy(ioaddr, bufaddr, winsize); 665 } 666 667 /* synchronous write, sync bits back to disk */ 668 if (BUF_ISWRITE(bp) && !async) { 669 rumpuser_memsync(ioaddr, winsize, &error); 670 } 671 putwindow(rblk, win); 672 } 673 674 rump_biodone(bp, bp->b_bcount, 0); 675 return; 676 } 677 678 /* 679 * Do I/O. We have different paths for async and sync I/O. 680 * Async I/O is done by passing a request to rumpuser where 681 * it is executed. The rumpuser routine then calls 682 * biodone() to signal any waiters in the kernel. I/O's are 683 * executed in series. Technically executing them in parallel 684 * would produce better results, but then we'd need either 685 * more threads or posix aio. Maybe worth investigating 686 * this later. 687 * 688 * Using bufq here might be a good idea. 689 */ 690 691 if (rump_threads) { 692 struct rumpuser_aio *rua; 693 int op, fd; 694 695 fd = rblk->rblk_fd; 696 if (BUF_ISREAD(bp)) { 697 op = RUA_OP_READ; 698 } else { 699 op = RUA_OP_WRITE; 700 if (!async) { 701 /* O_DIRECT not fully automatic yet */ 702 #ifdef HAS_ODIRECT 703 if ((off & ((1<<sectshift)-1)) == 0 704 && ((intptr_t)bp->b_data 705 & ((1<<sectshift)-1)) == 0 706 && (bp->b_bcount & ((1<<sectshift)-1)) == 0) 707 fd = rblk->rblk_dfd; 708 else 709 #endif 710 op |= RUA_OP_SYNC; 711 } 712 } 713 714 rumpuser_mutex_enter(&rumpuser_aio_mtx); 715 while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) { 716 rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx); 717 } 718 719 rua = &rumpuser_aios[rumpuser_aio_head]; 720 KASSERT(rua->rua_bp == NULL); 721 rua->rua_fd = fd; 722 rua->rua_data = bp->b_data; 723 rua->rua_dlen = bp->b_bcount; 724 rua->rua_off = off; 725 rua->rua_bp = bp; 726 rua->rua_op = op; 727 728 /* insert into queue & signal */ 729 rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS; 730 rumpuser_cv_signal(&rumpuser_aio_cv); 731 rumpuser_mutex_exit(&rumpuser_aio_mtx); 732 } else { 733 if (BUF_ISREAD(bp)) { 734 rumpuser_read_bio(rblk->rblk_fd, bp->b_data, 735 bp->b_bcount, off, rump_biodone, bp); 736 } else { 737 rumpuser_write_bio(rblk->rblk_fd, bp->b_data, 738 bp->b_bcount, off, rump_biodone, bp); 739 } 740 if (BUF_ISWRITE(bp) && !async) 741 rumpuser_fsync(rblk->rblk_fd, &error); 742 } 743 } 744 745 void 746 rumpblk_strategy(struct buf *bp) 747 { 748 749 dostrategy(bp); 750 } 751 752 /* 753 * Simple random number generator. This is private so that we can 754 * very repeatedly control which blocks will fail. 755 * 756 * <mlelstv> pooka, rand() 757 * <mlelstv> [paste] 758 */ 759 static unsigned 760 gimmerand(void) 761 { 762 763 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L); 764 } 765 766 /* 767 * Block device with very simple fault injection. Fails every 768 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env 769 * variable RUMP_BLKFAIL. 770 */ 771 void 772 rumpblk_strategy_fail(struct buf *bp) 773 { 774 775 if (gimmerand() % BLKFAIL_MAX >= blkfail) { 776 dostrategy(bp); 777 } else { 778 printf("block fault injection: failing I/O on block %lld\n", 779 (long long)bp->b_blkno); 780 bp->b_error = EIO; 781 biodone(bp); 782 } 783 } 784