1 /* $NetBSD: rumpblk.c,v 1.46 2011/02/03 22:16:11 pooka Exp $ */ 2 3 /* 4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved. 5 * 6 * Development of this software was supported by the 7 * Finnish Cultural Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 /* 32 * Block device emulation. Presents a block device interface and 33 * uses rumpuser system calls to satisfy I/O requests. 34 * 35 * We provide fault injection. The driver can be made to fail 36 * I/O occasionally. 37 * 38 * The driver also provides an optimization for regular files by 39 * using memory-mapped I/O. This avoids kernel access for every 40 * I/O operation. It also gives finer-grained control of how to 41 * flush data. 
Additionally, in case the rump kernel dumps core, 42 * we get way less carnage. 43 * 44 * However, it is quite costly in writing large amounts of 45 * file data, since old contents cannot merely be overwritten, but 46 * must be paged in first before replacing (i.e. r/m/w). Ideally, 47 * we should use directio. The problem is that directio can fail 48 * silently causing improper file system semantics (i.e. unflushed 49 * data). Therefore, default to mmap for now. Even so, directio 50 * _should_ be safe and can be enabled by compiling this module 51 * with -DHAS_ODIRECT. 52 */ 53 54 #include <sys/cdefs.h> 55 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.46 2011/02/03 22:16:11 pooka Exp $"); 56 57 #include <sys/param.h> 58 #include <sys/buf.h> 59 #include <sys/conf.h> 60 #include <sys/condvar.h> 61 #include <sys/disklabel.h> 62 #include <sys/evcnt.h> 63 #include <sys/fcntl.h> 64 #include <sys/kmem.h> 65 #include <sys/malloc.h> 66 #include <sys/queue.h> 67 #include <sys/stat.h> 68 69 #include <rump/rumpuser.h> 70 71 #include "rump_private.h" 72 #include "rump_vfs_private.h" 73 74 /* 75 * O_DIRECT is the fastest alternative, but since it falls back to 76 * non-direct writes silently, I am not sure it will always be 100% safe. 77 * Use it and play with it, but do that with caution. 
78 */ 79 #if 0 80 #define HAS_ODIRECT 81 #endif 82 83 #if 0 84 #define DPRINTF(x) printf x 85 #else 86 #define DPRINTF(x) 87 #endif 88 89 /* Default: 16 x 1MB windows */ 90 unsigned memwinsize = (1<<20); 91 unsigned memwincnt = 16; 92 93 #define STARTWIN(off) ((off) & ~((off_t)memwinsize-1)) 94 #define INWIN(win,off) ((win)->win_off == STARTWIN(off)) 95 #define WINSIZE(rblk, win) (MIN((rblk->rblk_hostsize-win->win_off), \ 96 memwinsize)) 97 #define WINVALID(win) ((win)->win_off != (off_t)-1) 98 #define WINVALIDATE(win) ((win)->win_off = (off_t)-1) 99 struct blkwin { 100 off_t win_off; 101 void *win_mem; 102 int win_refcnt; 103 104 TAILQ_ENTRY(blkwin) win_lru; 105 }; 106 107 #define RUMPBLK_SIZE 16 108 static struct rblkdev { 109 char *rblk_path; 110 int rblk_fd; 111 int rblk_mode; 112 #ifdef HAS_ODIRECT 113 int rblk_dfd; 114 #endif 115 uint64_t rblk_size; 116 uint64_t rblk_hostoffset; 117 uint64_t rblk_hostsize; 118 int rblk_ftype; 119 120 /* for mmap */ 121 int rblk_mmflags; 122 kmutex_t rblk_memmtx; 123 kcondvar_t rblk_memcv; 124 TAILQ_HEAD(winlru, blkwin) rblk_lruq; 125 bool rblk_waiting; 126 127 struct disklabel rblk_label; 128 } minors[RUMPBLK_SIZE]; 129 130 static struct evcnt ev_io_total; 131 static struct evcnt ev_io_async; 132 133 static struct evcnt ev_memblk_hits; 134 static struct evcnt ev_memblk_busy; 135 136 static struct evcnt ev_bwrite_total; 137 static struct evcnt ev_bwrite_async; 138 static struct evcnt ev_bread_total; 139 140 dev_type_open(rumpblk_open); 141 dev_type_close(rumpblk_close); 142 dev_type_read(rumpblk_read); 143 dev_type_write(rumpblk_write); 144 dev_type_ioctl(rumpblk_ioctl); 145 dev_type_strategy(rumpblk_strategy); 146 dev_type_strategy(rumpblk_strategy_fail); 147 dev_type_dump(rumpblk_dump); 148 dev_type_size(rumpblk_size); 149 150 static const struct bdevsw rumpblk_bdevsw = { 151 rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl, 152 nodump, nosize, D_DISK 153 }; 154 155 static const struct bdevsw rumpblk_bdevsw_fail 
= { 156 rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl, 157 nodump, nosize, D_DISK 158 }; 159 160 static const struct cdevsw rumpblk_cdevsw = { 161 rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write, 162 rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK 163 }; 164 165 static int backend_open(struct rblkdev *, const char *); 166 static int backend_close(struct rblkdev *); 167 168 /* fail every n out of BLKFAIL_MAX */ 169 #define BLKFAIL_MAX 10000 170 static int blkfail; 171 static unsigned randstate; 172 static kmutex_t rumpblk_lock; 173 static int sectshift = DEV_BSHIFT; 174 175 static void 176 makedefaultlabel(struct disklabel *lp, off_t size, int part) 177 { 178 int i; 179 180 memset(lp, 0, sizeof(*lp)); 181 182 lp->d_secperunit = size; 183 lp->d_secsize = 1 << sectshift; 184 lp->d_nsectors = size >> sectshift; 185 lp->d_ntracks = 1; 186 lp->d_ncylinders = 1; 187 lp->d_secpercyl = lp->d_nsectors; 188 189 /* oh dear oh dear */ 190 strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename)); 191 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); 192 193 lp->d_type = DTYPE_RUMPD; 194 lp->d_rpm = 11; 195 lp->d_interleave = 1; 196 lp->d_flags = 0; 197 198 /* XXX: RAW_PART handling? */ 199 for (i = 0; i < part; i++) { 200 lp->d_partitions[i].p_fstype = FS_UNUSED; 201 } 202 lp->d_partitions[part].p_size = size >> sectshift; 203 lp->d_npartitions = part+1; 204 /* XXX: file system type? */ 205 206 lp->d_magic = DISKMAGIC; 207 lp->d_magic2 = DISKMAGIC; 208 lp->d_checksum = 0; /* XXX */ 209 } 210 211 static struct blkwin * 212 getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error) 213 { 214 struct blkwin *win; 215 216 mutex_enter(&rblk->rblk_memmtx); 217 retry: 218 /* search for window */ 219 TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) { 220 if (INWIN(win, off) && WINVALID(win)) 221 break; 222 } 223 224 /* found? 
return */ 225 if (win) { 226 ev_memblk_hits.ev_count++; 227 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 228 goto good; 229 } 230 231 /* 232 * Else, create new window. If the least recently used is not 233 * currently in use, reuse that. Otherwise we need to wait. 234 */ 235 win = TAILQ_LAST(&rblk->rblk_lruq, winlru); 236 if (win->win_refcnt == 0) { 237 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 238 mutex_exit(&rblk->rblk_memmtx); 239 240 if (WINVALID(win)) { 241 DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n", 242 win, win->win_mem, win->win_off)); 243 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); 244 WINVALIDATE(win); 245 } 246 247 win->win_off = STARTWIN(off); 248 win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off, 249 WINSIZE(rblk, win), rblk->rblk_mmflags, error); 250 DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n", 251 win, win->win_off, win->win_mem)); 252 253 mutex_enter(&rblk->rblk_memmtx); 254 if (win->win_mem == NULL) { 255 WINVALIDATE(win); 256 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); 257 mutex_exit(&rblk->rblk_memmtx); 258 return NULL; 259 } 260 } else { 261 DPRINTF(("memwin wait\n")); 262 ev_memblk_busy.ev_count++; 263 264 rblk->rblk_waiting = true; 265 cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx); 266 goto retry; 267 } 268 269 good: 270 KASSERT(win); 271 win->win_refcnt++; 272 TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru); 273 mutex_exit(&rblk->rblk_memmtx); 274 *wsize = MIN(*wsize, memwinsize - (off-win->win_off)); 275 KASSERT(*wsize); 276 277 return win; 278 } 279 280 static void 281 putwindow(struct rblkdev *rblk, struct blkwin *win) 282 { 283 284 mutex_enter(&rblk->rblk_memmtx); 285 if (--win->win_refcnt == 0 && rblk->rblk_waiting) { 286 rblk->rblk_waiting = false; 287 cv_broadcast(&rblk->rblk_memcv); 288 } 289 KASSERT(win->win_refcnt >= 0); 290 mutex_exit(&rblk->rblk_memmtx); 291 } 292 293 static void 294 wincleanup(struct rblkdev *rblk) 295 { 296 struct blkwin *win; 297 298 while ((win = 
TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) { 299 TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru); 300 if (WINVALID(win)) { 301 DPRINTF(("cleanup win %p addr %p\n", 302 win, win->win_mem)); 303 rumpuser_unmap(win->win_mem, WINSIZE(rblk, win)); 304 } 305 kmem_free(win, sizeof(*win)); 306 } 307 rblk->rblk_mmflags = 0; 308 } 309 310 int 311 rumpblk_init(void) 312 { 313 char buf[64]; 314 devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR; 315 unsigned tmp; 316 int error, i; 317 318 mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE); 319 320 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) { 321 blkfail = strtoul(buf, NULL, 10); 322 /* fail everything */ 323 if (blkfail > BLKFAIL_MAX) 324 blkfail = BLKFAIL_MAX; 325 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf), 326 &error) == 0) { 327 randstate = strtoul(buf, NULL, 10); 328 } else { 329 randstate = arc4random(); 330 } 331 printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. " 332 "seed %u\n", blkfail, BLKFAIL_MAX, randstate); 333 } else { 334 blkfail = 0; 335 } 336 337 if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) { 338 printf("rumpblk: "); 339 tmp = strtoul(buf, NULL, 10); 340 if (tmp && !(tmp & (tmp-1))) 341 memwinsize = tmp; 342 else 343 printf("invalid RUMP_BLKWINSIZE %d, ", tmp); 344 printf("using %d for memwinsize\n", memwinsize); 345 } 346 if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){ 347 printf("rumpblk: "); 348 tmp = strtoul(buf, NULL, 10); 349 if (tmp) 350 memwincnt = tmp; 351 else 352 printf("invalid RUMP_BLKWINCOUNT %d, ", tmp); 353 printf("using %d for memwincount\n", memwincnt); 354 } 355 if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error)==0){ 356 printf("rumpblk: "); 357 tmp = strtoul(buf, NULL, 10); 358 if (tmp >= DEV_BSHIFT) 359 sectshift = tmp; 360 else 361 printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ", 362 DEV_BSHIFT, tmp); 363 printf("using %d for sector shift (size %d)\n", 364 sectshift, 1<<sectshift); 365 } 
366 367 memset(minors, 0, sizeof(minors)); 368 for (i = 0; i < RUMPBLK_SIZE; i++) { 369 mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE); 370 cv_init(&minors[i].rblk_memcv, "rblkmcv"); 371 minors[i].rblk_fd = -1; 372 } 373 374 evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL, 375 "rumpblk", "I/O reqs"); 376 evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL, 377 "rumpblk", "async I/O"); 378 379 evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL, 380 "rumpblk", "bytes read"); 381 evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL, 382 "rumpblk", "bytes written"); 383 evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL, 384 "rumpblk", "bytes written async"); 385 386 evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL, 387 "rumpblk", "window hits"); 388 evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL, 389 "rumpblk", "all windows busy"); 390 391 if (blkfail) { 392 return devsw_attach("rumpblk", 393 &rumpblk_bdevsw_fail, &rumpblkmaj, 394 &rumpblk_cdevsw, &rumpblkmaj); 395 } else { 396 return devsw_attach("rumpblk", 397 &rumpblk_bdevsw, &rumpblkmaj, 398 &rumpblk_cdevsw, &rumpblkmaj); 399 } 400 } 401 402 int 403 rumpblk_register(const char *path, devminor_t *dmin, 404 uint64_t offset, uint64_t size) 405 { 406 struct rblkdev *rblk; 407 uint64_t flen; 408 size_t len; 409 int ftype, error, i; 410 411 /* devices might not report correct size unless they're open */ 412 if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1) 413 return error; 414 415 /* verify host file is of supported type */ 416 if (!(ftype == RUMPUSER_FT_REG 417 || ftype == RUMPUSER_FT_BLK 418 || ftype == RUMPUSER_FT_CHR)) 419 return EINVAL; 420 421 mutex_enter(&rumpblk_lock); 422 for (i = 0; i < RUMPBLK_SIZE; i++) { 423 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) { 424 mutex_exit(&rumpblk_lock); 425 *dmin = i; 426 return 0; 427 } 428 } 429 430 for (i = 0; i < RUMPBLK_SIZE; i++) 431 if (minors[i].rblk_path == 
NULL) 432 break; 433 if (i == RUMPBLK_SIZE) { 434 mutex_exit(&rumpblk_lock); 435 return EBUSY; 436 } 437 438 rblk = &minors[i]; 439 rblk->rblk_path = __UNCONST("taken"); 440 mutex_exit(&rumpblk_lock); 441 442 len = strlen(path); 443 rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK); 444 strcpy(rblk->rblk_path, path); 445 rblk->rblk_hostoffset = offset; 446 if (size != RUMPBLK_SIZENOTSET) { 447 KASSERT(size + offset <= flen); 448 rblk->rblk_size = size; 449 } else { 450 KASSERT(offset < flen); 451 rblk->rblk_size = flen - offset; 452 } 453 rblk->rblk_hostsize = flen; 454 rblk->rblk_ftype = ftype; 455 makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i); 456 457 if ((error = backend_open(rblk, path)) != 0) { 458 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label)); 459 free(rblk->rblk_path, M_TEMP); 460 rblk->rblk_path = NULL; 461 return error; 462 } 463 464 *dmin = i; 465 return 0; 466 } 467 468 /* 469 * Unregister rumpblk. It's the callers responsibility to make 470 * sure it's no longer in use. 
471 */ 472 int 473 rumpblk_deregister(const char *path) 474 { 475 struct rblkdev *rblk; 476 int i; 477 478 mutex_enter(&rumpblk_lock); 479 for (i = 0; i < RUMPBLK_SIZE; i++) { 480 if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) { 481 break; 482 } 483 } 484 mutex_exit(&rumpblk_lock); 485 486 if (i == RUMPBLK_SIZE) 487 return ENOENT; 488 489 rblk = &minors[i]; 490 backend_close(rblk); 491 492 wincleanup(rblk); 493 free(rblk->rblk_path, M_TEMP); 494 memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label)); 495 rblk->rblk_path = NULL; 496 497 return 0; 498 } 499 500 static int 501 backend_open(struct rblkdev *rblk, const char *path) 502 { 503 int error, fd; 504 505 KASSERT(rblk->rblk_fd == -1); 506 fd = rumpuser_open(path, O_RDWR, &error); 507 if (error) { 508 fd = rumpuser_open(path, O_RDONLY, &error); 509 if (error) 510 return error; 511 rblk->rblk_mode = FREAD; 512 513 #ifdef HAS_ODIRECT 514 rblk->rblk_dfd = rumpuser_open(path, 515 O_RDONLY | O_DIRECT, &error); 516 if (error) { 517 close(fd); 518 return error; 519 } 520 #endif 521 } else { 522 rblk->rblk_mode = FREAD|FWRITE; 523 524 #ifdef HAS_ODIRECT 525 rblk->rblk_dfd = rumpuser_open(path, 526 O_RDWR | O_DIRECT, &error); 527 if (error) { 528 close(fd); 529 return error; 530 } 531 #endif 532 } 533 534 if (rblk->rblk_ftype == RUMPUSER_FT_REG) { 535 uint64_t fsize= rblk->rblk_hostsize, off= rblk->rblk_hostoffset; 536 struct blkwin *win; 537 int i, winsize; 538 539 /* 540 * Use mmap to access a regular file. Allocate and 541 * cache initial windows here. Failure to allocate one 542 * means fallback to read/write i/o. 
543 */ 544 545 rblk->rblk_mmflags = 0; 546 if (rblk->rblk_mode & FREAD) 547 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ; 548 if (rblk->rblk_mode & FWRITE) { 549 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE; 550 rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED; 551 } 552 553 TAILQ_INIT(&rblk->rblk_lruq); 554 rblk->rblk_fd = fd; 555 556 for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) { 557 win = kmem_zalloc(sizeof(*win), KM_SLEEP); 558 WINVALIDATE(win); 559 TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru); 560 561 /* 562 * Allocate first windows. Here we just generally 563 * make sure a) we can mmap at all b) we have the 564 * necessary VA available 565 */ 566 winsize = memwinsize; 567 win = getwindow(rblk, off + i*memwinsize, &winsize, 568 &error); 569 if (win) { 570 putwindow(rblk, win); 571 } else { 572 wincleanup(rblk); 573 break; 574 } 575 } 576 } else { 577 rblk->rblk_fd = fd; 578 } 579 580 KASSERT(rblk->rblk_fd != -1); 581 return 0; 582 } 583 584 static int 585 backend_close(struct rblkdev *rblk) 586 { 587 int dummy; 588 589 if (rblk->rblk_mmflags) 590 wincleanup(rblk); 591 rumpuser_fsync(rblk->rblk_fd, &dummy); 592 rumpuser_close(rblk->rblk_fd, &dummy); 593 rblk->rblk_fd = -1; 594 #ifdef HAS_ODIRECT 595 if (rblk->rblk_dfd != -1) { 596 rumpuser_close(rblk->rblk_dfd, &dummy); 597 rblk->rblk_dfd = -1; 598 } 599 #endif 600 601 return 0; 602 } 603 604 int 605 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l) 606 { 607 struct rblkdev *rblk = &minors[minor(dev)]; 608 609 if (rblk->rblk_fd == -1) 610 return ENXIO; 611 612 if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) { 613 return EACCES; 614 } 615 616 return 0; 617 } 618 619 int 620 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l) 621 { 622 623 return 0; 624 } 625 626 int 627 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l) 628 { 629 devminor_t dmin = minor(dev); 630 struct rblkdev *rblk = &minors[dmin]; 631 struct partinfo *pi; 632 int error = 0; 
633 634 /* well, me should support a few more, but we don't for now */ 635 switch (xfer) { 636 case DIOCGDINFO: 637 *(struct disklabel *)addr = rblk->rblk_label; 638 break; 639 640 case DIOCGPART: 641 pi = addr; 642 pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)]; 643 pi->disklab = &rblk->rblk_label; 644 break; 645 646 /* it's synced enough along the write path */ 647 case DIOCCACHESYNC: 648 break; 649 650 default: 651 error = ENOTTY; 652 break; 653 } 654 655 return error; 656 } 657 658 static int 659 do_physio(dev_t dev, struct uio *uio, int which) 660 { 661 void (*strat)(struct buf *); 662 663 if (blkfail) 664 strat = rumpblk_strategy_fail; 665 else 666 strat = rumpblk_strategy; 667 668 return physio(strat, NULL, dev, which, minphys, uio); 669 } 670 671 int 672 rumpblk_read(dev_t dev, struct uio *uio, int flags) 673 { 674 675 return do_physio(dev, uio, B_READ); 676 } 677 678 int 679 rumpblk_write(dev_t dev, struct uio *uio, int flags) 680 { 681 682 return do_physio(dev, uio, B_WRITE); 683 } 684 685 static void 686 dostrategy(struct buf *bp) 687 { 688 struct rblkdev *rblk = &minors[minor(bp->b_dev)]; 689 off_t off; 690 int async = bp->b_flags & B_ASYNC; 691 int error; 692 693 if (bp->b_bcount % (1<<sectshift) != 0) { 694 rump_biodone(bp, 0, EINVAL); 695 return; 696 } 697 698 /* collect statistics */ 699 ev_io_total.ev_count++; 700 if (async) 701 ev_io_async.ev_count++; 702 if (BUF_ISWRITE(bp)) { 703 ev_bwrite_total.ev_count += bp->b_bcount; 704 if (async) 705 ev_bwrite_async.ev_count += bp->b_bcount; 706 } else { 707 ev_bread_total.ev_count++; 708 } 709 710 /* 711 * b_blkno is always in terms of DEV_BSIZE, and since we need 712 * to translate to a byte offset for the host read, this 713 * calculation does not need sectshift. 714 */ 715 off = bp->b_blkno << DEV_BSHIFT; 716 717 /* 718 * Do bounds checking if we're working on a file. Otherwise 719 * invalid file systems might attempt to read beyond EOF. This 720 * is bad(tm) especially on mmapped images. 
This is essentially 721 * the kernel bounds_check() routines. 722 */ 723 if (off + bp->b_bcount > rblk->rblk_size) { 724 int64_t sz = rblk->rblk_size - off; 725 726 /* EOF */ 727 if (sz == 0) { 728 rump_biodone(bp, 0, 0); 729 return; 730 } 731 /* beyond EOF ==> error */ 732 if (sz < 0) { 733 rump_biodone(bp, 0, EINVAL); 734 return; 735 } 736 737 /* truncate to device size */ 738 bp->b_bcount = sz; 739 } 740 741 off += rblk->rblk_hostoffset; 742 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64 743 " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n", 744 bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE", 745 off, off, (off + bp->b_bcount), async ? "a" : "")); 746 747 /* mmap? handle here and return */ 748 if (rblk->rblk_mmflags) { 749 struct blkwin *win; 750 int winsize, iodone; 751 uint8_t *ioaddr, *bufaddr; 752 753 for (iodone = 0; iodone < bp->b_bcount; 754 iodone += winsize, off += winsize) { 755 winsize = bp->b_bcount - iodone; 756 win = getwindow(rblk, off, &winsize, &error); 757 if (win == NULL) { 758 rump_biodone(bp, iodone, error); 759 return; 760 } 761 762 ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off)); 763 bufaddr = (uint8_t *)bp->b_data + iodone; 764 765 DPRINTF(("strat: %p off 0x%" PRIx64 766 ", ioaddr %p (%p)/buf %p\n", win, 767 win->win_off, ioaddr, win->win_mem, bufaddr)); 768 if (BUF_ISREAD(bp)) { 769 memcpy(bufaddr, ioaddr, winsize); 770 } else { 771 memcpy(ioaddr, bufaddr, winsize); 772 } 773 774 /* synchronous write, sync bits back to disk */ 775 if (BUF_ISWRITE(bp) && !async) { 776 rumpuser_memsync(ioaddr, winsize, &error); 777 } 778 putwindow(rblk, win); 779 } 780 781 rump_biodone(bp, bp->b_bcount, 0); 782 return; 783 } 784 785 /* 786 * Do I/O. We have different paths for async and sync I/O. 787 * Async I/O is done by passing a request to rumpuser where 788 * it is executed. The rumpuser routine then calls 789 * biodone() to signal any waiters in the kernel. I/O's are 790 * executed in series. 
Technically executing them in parallel 791 * would produce better results, but then we'd need either 792 * more threads or posix aio. Maybe worth investigating 793 * this later. 794 * 795 * Using bufq here might be a good idea. 796 */ 797 798 if (rump_threads) { 799 struct rumpuser_aio *rua; 800 int op, fd; 801 802 fd = rblk->rblk_fd; 803 if (BUF_ISREAD(bp)) { 804 op = RUA_OP_READ; 805 } else { 806 op = RUA_OP_WRITE; 807 if (!async) { 808 /* O_DIRECT not fully automatic yet */ 809 #ifdef HAS_ODIRECT 810 if ((off & ((1<<sectshift)-1)) == 0 811 && ((intptr_t)bp->b_data 812 & ((1<<sectshift)-1)) == 0 813 && (bp->b_bcount & ((1<<sectshift)-1)) == 0) 814 fd = rblk->rblk_dfd; 815 else 816 #endif 817 op |= RUA_OP_SYNC; 818 } 819 } 820 821 rumpuser_mutex_enter(&rumpuser_aio_mtx); 822 while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) { 823 rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx); 824 } 825 826 rua = &rumpuser_aios[rumpuser_aio_head]; 827 KASSERT(rua->rua_bp == NULL); 828 rua->rua_fd = fd; 829 rua->rua_data = bp->b_data; 830 rua->rua_dlen = bp->b_bcount; 831 rua->rua_off = off; 832 rua->rua_bp = bp; 833 rua->rua_op = op; 834 835 /* insert into queue & signal */ 836 rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS; 837 rumpuser_cv_signal(&rumpuser_aio_cv); 838 rumpuser_mutex_exit(&rumpuser_aio_mtx); 839 } else { 840 if (BUF_ISREAD(bp)) { 841 rumpuser_read_bio(rblk->rblk_fd, bp->b_data, 842 bp->b_bcount, off, rump_biodone, bp); 843 } else { 844 rumpuser_write_bio(rblk->rblk_fd, bp->b_data, 845 bp->b_bcount, off, rump_biodone, bp); 846 } 847 if (BUF_ISWRITE(bp) && !async) 848 rumpuser_fsync(rblk->rblk_fd, &error); 849 } 850 } 851 852 void 853 rumpblk_strategy(struct buf *bp) 854 { 855 856 dostrategy(bp); 857 } 858 859 /* 860 * Simple random number generator. This is private so that we can 861 * very repeatedly control which blocks will fail. 
862 * 863 * <mlelstv> pooka, rand() 864 * <mlelstv> [paste] 865 */ 866 static unsigned 867 gimmerand(void) 868 { 869 870 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L); 871 } 872 873 /* 874 * Block device with very simple fault injection. Fails every 875 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env 876 * variable RUMP_BLKFAIL. 877 */ 878 void 879 rumpblk_strategy_fail(struct buf *bp) 880 { 881 882 if (gimmerand() % BLKFAIL_MAX >= blkfail) { 883 dostrategy(bp); 884 } else { 885 printf("block fault injection: failing I/O on block %lld\n", 886 (long long)bp->b_blkno); 887 bp->b_error = EIO; 888 biodone(bp); 889 } 890 } 891