1 /* $NetBSD: rumpblk.c,v 1.48 2012/09/14 16:29:21 pooka Exp $ */ 2 3 /* 4 * Copyright (c) 2009 Antti Kantee. All Rights Reserved. 5 * 6 * Development of this software was supported by the 7 * Finnish Cultural Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 19 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 /* 32 * Block device emulation. Presents a block device interface and 33 * uses rumpuser system calls to satisfy I/O requests. 34 * 35 * We provide fault injection. The driver can be made to fail 36 * I/O occasionally. 37 * 38 * The driver also provides an optimization for regular files by 39 * using memory-mapped I/O. This avoids kernel access for every 40 * I/O operation. It also gives finer-grained control of how to 41 * flush data. 
Additionally, in case the rump kernel dumps core, 42 * we get way less carnage. 43 * 44 * However, it is quite costly in writing large amounts of 45 * file data, since old contents cannot merely be overwritten, but 46 * must be paged in first before replacing (i.e. r/m/w). Ideally, 47 * we should use directio. The problem is that directio can fail 48 * silently causing improper file system semantics (i.e. unflushed 49 * data). Therefore, default to mmap for now. Even so, directio 50 * _should_ be safe and can be enabled by compiling this module 51 * with -DHAS_DIRECTIO. 52 */ 53 54 #include <sys/cdefs.h> 55 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.48 2012/09/14 16:29:21 pooka Exp $"); 56 57 #include <sys/param.h> 58 #include <sys/buf.h> 59 #include <sys/conf.h> 60 #include <sys/condvar.h> 61 #include <sys/disklabel.h> 62 #include <sys/evcnt.h> 63 #include <sys/fcntl.h> 64 #include <sys/kmem.h> 65 #include <sys/malloc.h> 66 #include <sys/queue.h> 67 #include <sys/stat.h> 68 #include <sys/cprng.h> 69 70 #include <rump/rumpuser.h> 71 72 #include "rump_private.h" 73 #include "rump_vfs_private.h" 74 75 /* 76 * O_DIRECT is the fastest alternative, but since it falls back to 77 * non-direct writes silently, I am not sure it will always be 100% safe. 78 * Use it and play with it, but do that with caution. 
 */
#if 0
#define HAS_ODIRECT
#endif

/* Debug printout macro; compiled out by default. */
#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/*
 * Memory window defaults for mmap-backed regular files: 16 windows
 * of 1MB each.  Both are tunable at startup via the RUMP_BLKWINSIZE
 * and RUMP_BLKWINCOUNT environment variables (see rumpblk_init()).
 */
unsigned memwinsize = (1<<20);
unsigned memwincnt = 16;

/* File offset of the (window-aligned) window containing "off". */
#define STARTWIN(off)		((off) & ~((off_t)memwinsize-1))
/* Does window "win" currently map the byte at offset "off"? */
#define INWIN(win,off)		((win)->win_off == STARTWIN(off))
/* Effective mapping size: a full window, except near end of host file. */
#define WINSIZE(rblk, win)	(MIN((rblk->rblk_hostsize-win->win_off), \
				    memwinsize))
/* A window whose offset is -1 holds no mapping. */
#define WINVALID(win)		((win)->win_off != (off_t)-1)
#define WINVALIDATE(win)	((win)->win_off = (off_t)-1)

/*
 * One memory-mapped file window.  Windows live on a per-device LRU
 * queue (rblk_lruq) and are reference-counted while I/O is in flight;
 * a window with a non-zero refcnt is never recycled or unmapped.
 */
struct blkwin {
	off_t win_off;		/* file offset of mapping, -1 if invalid */
	void *win_mem;		/* host virtual address of the mapping */
	int win_refcnt;		/* users currently copying through window */

	TAILQ_ENTRY(blkwin) win_lru;
};

/* Static table of device minors; one slot per registered backing file. */
#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;	/* host path; NULL means the slot is free */
	int rblk_fd;		/* host fd used for regular read/write I/O */
	int rblk_mode;		/* FREAD and/or FWRITE, per host open mode */
#ifdef HAS_ODIRECT
	int rblk_dfd;		/* host fd opened with O_DIRECT */
#endif
	uint64_t rblk_size;	/* device size as exposed to the rump kernel */
	uint64_t rblk_hostoffset;	/* device start within the host file */
	uint64_t rblk_hostsize;	/* total size of the host file */
	int rblk_ftype;		/* host file type (RUMPUSER_FT_*) */

	/* for mmap */
	int rblk_mmflags;	/* mmap prot flags; non-zero iff mmap I/O */
	kmutex_t rblk_memmtx;	/* protects the window LRU and refcounts */
	kcondvar_t rblk_memcv;	/* waited on when all windows are busy */
	TAILQ_HEAD(winlru, blkwin) rblk_lruq;
	bool rblk_waiting;	/* true iff a thread sleeps on rblk_memcv */

	struct disklabel rblk_label;	/* fabricated label for this minor */
} minors[RUMPBLK_SIZE];

/* Event counters, attached in rumpblk_init() and visible via vmstat -e. */
static struct evcnt ev_io_total;
static struct evcnt ev_io_async;

static struct evcnt ev_memblk_hits;
static struct evcnt ev_memblk_busy;

static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

/* Normal block device switch. */
static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

/* Fault-injecting variant, selected when RUMP_BLKFAIL is set. */
static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;		/* failure rate; 0 disables injection */
static unsigned randstate;	/* LCG state for gimmerand() */
static kmutex_t rumpblk_lock;	/* protects the minors[] slot table */
static int sectshift = DEV_BSHIFT;	/* log2 of exposed sector size */

/*
 * Fabricate a disklabel for one minor: a single-track, single-cylinder
 * "disk" of the given size, with partition "part" covering all of it
 * and all lower partitions marked unused.
 */
static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

/*
 * Return a referenced window mapping the file at "off", establishing a
 * new mapping if necessary.  On entry *wsize is the desired I/O size;
 * on success it is clipped to what fits within the returned window.
 * On failure returns NULL with *error set.  May sleep waiting for a
 * free window.  Caller must drop the reference with putwindow().
 */
static struct blkwin *
getwindow(struct rblkdev *rblk, off_t off, int *wsize, int *error)
{
	struct blkwin *win;

	mutex_enter(&rblk->rblk_memmtx);
 retry:
	/* search for window */
	TAILQ_FOREACH(win, &rblk->rblk_lruq, win_lru) {
		if (INWIN(win, off) && WINVALID(win))
			break;
	}

	/* found?  return */
	if (win) {
		ev_memblk_hits.ev_count++;
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		goto good;
	}

	/*
	 * Else, create new window.  If the least recently used is not
	 * currently in use, reuse that.  Otherwise we need to wait.
	 */
	win = TAILQ_LAST(&rblk->rblk_lruq, winlru);
	if (win->win_refcnt == 0) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		/*
		 * Drop the mutex across the host unmap/mmap calls; the
		 * window is off the LRU so nobody else can grab it.
		 */
		mutex_exit(&rblk->rblk_memmtx);

		if (WINVALID(win)) {
			DPRINTF(("win %p, unmap mem %p, off 0x%" PRIx64 "\n",
			    win, win->win_mem, win->win_off));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
			WINVALIDATE(win);
		}

		win->win_off = STARTWIN(off);
		win->win_mem = rumpuser_filemmap(rblk->rblk_fd, win->win_off,
		    WINSIZE(rblk, win), rblk->rblk_mmflags, error);
		DPRINTF(("win %p, off 0x%" PRIx64 ", mem %p\n",
		    win, win->win_off, win->win_mem));

		mutex_enter(&rblk->rblk_memmtx);
		if (win->win_mem == NULL) {
			/* mmap failed: return window to LRU, report error */
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);
			mutex_exit(&rblk->rblk_memmtx);
			return NULL;
		}
	} else {
		/* all windows busy: sleep until one is released */
		DPRINTF(("memwin wait\n"));
		ev_memblk_busy.ev_count++;

		rblk->rblk_waiting = true;
		cv_wait(&rblk->rblk_memcv, &rblk->rblk_memmtx);
		goto retry;
	}

 good:
	KASSERT(win);
	win->win_refcnt++;
	/* most-recently-used goes to the head of the queue */
	TAILQ_INSERT_HEAD(&rblk->rblk_lruq, win, win_lru);
	mutex_exit(&rblk->rblk_memmtx);
	*wsize = MIN(*wsize, memwinsize - (off-win->win_off));
	KASSERT(*wsize);

	return win;
}

/*
 * Release a window reference obtained from getwindow() and wake any
 * thread waiting for a free window.
 */
static void
putwindow(struct rblkdev *rblk, struct blkwin *win)
{

	mutex_enter(&rblk->rblk_memmtx);
	if (--win->win_refcnt == 0 && rblk->rblk_waiting) {
		rblk->rblk_waiting = false;
		cv_broadcast(&rblk->rblk_memcv);
	}
	KASSERT(win->win_refcnt >= 0);
	mutex_exit(&rblk->rblk_memmtx);
}

/*
 * Unmap and free all windows of a device and disable mmap I/O for it.
 * Caller must ensure no window is in use.
 */
static void
wincleanup(struct rblkdev *rblk)
{
	struct blkwin *win;

	while ((win = TAILQ_FIRST(&rblk->rblk_lruq)) != NULL) {
		TAILQ_REMOVE(&rblk->rblk_lruq, win, win_lru);
		if (WINVALID(win)) {
			DPRINTF(("cleanup win %p addr %p\n",
			    win, win->win_mem));
			rumpuser_unmap(win->win_mem, WINSIZE(rblk, win));
		}
		kmem_free(win, sizeof(*win));
	}
	rblk->rblk_mmflags = 0;
}

/*
 * Module init: read tunables from the environment (RUMP_BLKFAIL,
 * RUMP_BLKFAIL_SEED, RUMP_BLKWINSIZE, RUMP_BLKWINCOUNT,
 * RUMP_BLKSECTSHIFT), initialize the minor table, attach event
 * counters and register the device switch.  Returns 0 or an errno
 * from devsw_attach().
 */
int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int error, i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
		blkfail = strtoul(buf, NULL, 10);
		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
		    &error) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = cprng_fast32();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getenv("RUMP_BLKWINSIZE", buf, sizeof(buf), &error) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		/* window size must be a power of two */
		if (tmp && !(tmp & (tmp-1)))
			memwinsize = tmp;
		else
			printf("invalid RUMP_BLKWINSIZE %d, ", tmp);
		printf("using %d for memwinsize\n", memwinsize);
	}
	if (rumpuser_getenv("RUMP_BLKWINCOUNT", buf, sizeof(buf), &error) == 0){
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp)
			memwincnt = tmp;
		else
			printf("invalid RUMP_BLKWINCOUNT %d, ", tmp);
		printf("using %d for memwincount\n", memwincnt);
	}
	if (rumpuser_getenv("RUMP_BLKSECTSHIFT", buf, sizeof(buf), &error)==0){
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
			    DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	memset(minors, 0, sizeof(minors));
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		mutex_init(&minors[i].rblk_memmtx, MUTEX_DEFAULT, IPL_NONE);
		cv_init(&minors[i].rblk_memcv, "rblkmcv");
		minors[i].rblk_fd = -1;
	}

	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "I/O reqs");
	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "async I/O");

	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes read");
	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written");
	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "bytes written async");

	evcnt_attach_dynamic(&ev_memblk_hits, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "window hits");
	evcnt_attach_dynamic(&ev_memblk_busy, EVCNT_TYPE_MISC, NULL,
	    "rumpblk", "all windows busy");

	if (blkfail) {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw_fail, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk",
		    &rumpblk_bdevsw, &rumpblkmaj,
		    &rumpblk_cdevsw, &rumpblkmaj);
	}
}

/*
 * Register a host file or device as a rumpblk minor.  "offset"/"size"
 * select a range within the host file; size RUMPBLK_SIZENOTSET means
 * "to end of file".  Registering an already-registered path returns
 * the existing minor.  On success, stores the minor in *dmin and
 * returns 0; otherwise returns an errno (EINVAL for unsupported file
 * types, EBUSY when all slots are taken, or a backend open error).
 */
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if (rumpuser_getfileinfo(path, &flen, &ftype, &error) == -1)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	    || ftype == RUMPUSER_FT_BLK
	    || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	/* claim the slot with a placeholder before dropping the lock */
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	if ((error = backend_open(rblk, path)) != 0) {
		/* undo the registration on failure, freeing the slot */
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk.  It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	/* locate the minor registered for this path */
	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	backend_close(rblk);

	/* release windows, path and label; NULL path frees the slot */
	wincleanup(rblk);
	free(rblk->rblk_path, M_TEMP);
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
	rblk->rblk_path = NULL;

	return 0;
}

/*
 * Open the host file backing a minor.  Tries read/write first and
 * falls back to read-only.  For regular files, additionally sets up
 * mmap window I/O and pre-creates the initial windows; if that fails
 * the device falls back to plain read/write I/O.  Returns 0 or errno.
 */
static int
backend_open(struct rblkdev *rblk, const char *path)
{
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	fd = rumpuser_open(path, RUMPUSER_OPEN_RDWR, &error);
	if (error) {
		/* read/write failed: retry read-only */
		fd = rumpuser_open(path, RUMPUSER_OPEN_RDONLY, &error);
		if (error)
			return error;
		rblk->rblk_mode = FREAD;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_DIRECT, &error);
		if (error) {
			close(fd);
			return error;
		}
#endif
	} else {
		rblk->rblk_mode = FREAD|FWRITE;

#ifdef HAS_ODIRECT
		rblk->rblk_dfd = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_DIRECT, &error);
		if (error) {
			close(fd);
			return error;
		}
#endif
	}

	if (rblk->rblk_ftype == RUMPUSER_FT_REG) {
		uint64_t fsize= rblk->rblk_hostsize, off= rblk->rblk_hostoffset;
		struct blkwin *win;
		int i, winsize;

		/*
		 * Use mmap to access a regular file.  Allocate and
		 * cache initial windows here.  Failure to allocate one
		 * means fallback to read/write i/o.
		 */

		rblk->rblk_mmflags = 0;
		if (rblk->rblk_mode & FREAD)
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_READ;
		if (rblk->rblk_mode & FWRITE) {
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_WRITE;
			rblk->rblk_mmflags |= RUMPUSER_FILEMMAP_SHARED;
		}

		TAILQ_INIT(&rblk->rblk_lruq);
		rblk->rblk_fd = fd;

		for (i = 0; i < memwincnt && off + i*memwinsize < fsize; i++) {
			win = kmem_zalloc(sizeof(*win), KM_SLEEP);
			WINVALIDATE(win);
			TAILQ_INSERT_TAIL(&rblk->rblk_lruq, win, win_lru);

			/*
			 * Allocate first windows.  Here we just generally
			 * make sure a) we can mmap at all b) we have the
			 * necessary VA available
			 */
			winsize = memwinsize;
			win = getwindow(rblk, off + i*memwinsize, &winsize,
			    &error);
			if (win) {
				putwindow(rblk, win);
			} else {
				/* wincleanup() zeroes rblk_mmflags */
				wincleanup(rblk);
				break;
			}
		}
	} else {
		rblk->rblk_fd = fd;
	}

	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

/*
 * Close the host backing of a minor: tear down windows (if mmap was
 * in use), flush and close the host fd(s).  Always returns 0.
 */
static int
backend_close(struct rblkdev *rblk)
{
	int dummy;

	if (rblk->rblk_mmflags)
		wincleanup(rblk);
	rumpuser_fsync(rblk->rblk_fd, &dummy);
	rumpuser_close(rblk->rblk_fd, &dummy);
	rblk->rblk_fd = -1;
#ifdef HAS_ODIRECT
	if (rblk->rblk_dfd != -1) {
		rumpuser_close(rblk->rblk_dfd, &dummy);
		rblk->rblk_dfd = -1;
	}
#endif

	return 0;
}

/*
 * Device open: succeeds only for registered minors, and only for
 * access modes permitted by the host open mode (ENXIO/EACCES).
 */
int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];

	if (rblk->rblk_fd == -1)
		return ENXIO;

	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
		return EACCES;
	}

	return 0;
}

/* Device close: nothing to do; backend lifetime is tied to register. */
int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}

/*
 * Device ioctl: serves disklabel queries (DIOCGDINFO, DIOCGPART) from
 * the fabricated label and accepts DIOCCACHESYNC as a no-op.
 */
int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPART:
		pi = addr;
		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi->disklab = &rblk->rblk_label;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

/*
 * Common raw read/write path: run physio() through whichever strategy
 * routine (normal or fault-injecting) this boot selected.
 */
static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

/*
 * Execute one buffer I/O request: validate and bound the transfer,
 * then satisfy it either through the mmap window cache (regular
 * files) or by handing it to rumpuser, asynchronously via the aio
 * thread queue when rump_threads is set, else synchronously.
 * Completion is signalled through rump_biodone().
 */
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int error;

	/* reject transfers that are not a multiple of the sector size */
	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* mmap? handle here and return */
	if (rblk->rblk_mmflags) {
		struct blkwin *win;
		int winsize, iodone;
		uint8_t *ioaddr, *bufaddr;

		/* copy window by window; a window may cover only part */
		for (iodone = 0; iodone < bp->b_bcount;
		    iodone += winsize, off += winsize) {
			winsize = bp->b_bcount - iodone;
			win = getwindow(rblk, off, &winsize, &error);
			if (win == NULL) {
				rump_biodone(bp, iodone, error);
				return;
			}

			ioaddr = (uint8_t *)win->win_mem + (off-STARTWIN(off));
			bufaddr = (uint8_t *)bp->b_data + iodone;

			DPRINTF(("strat: %p off 0x%" PRIx64
			    ", ioaddr %p (%p)/buf %p\n", win,
			    win->win_off, ioaddr, win->win_mem, bufaddr));
			if (BUF_ISREAD(bp)) {
				memcpy(bufaddr, ioaddr, winsize);
			} else {
				memcpy(ioaddr, bufaddr, winsize);
			}

			/* synchronous write, sync bits back to disk */
			if (BUF_ISWRITE(bp) && !async) {
				rumpuser_memsync(ioaddr, winsize, &error);
			}
			putwindow(rblk, win);
		}

		rump_biodone(bp, bp->b_bcount, 0);
		return;
	}

	/*
	 * Do I/O.  We have different paths for async and sync I/O.
	 * Async I/O is done by passing a request to rumpuser where
	 * it is executed.  The rumpuser routine then calls
	 * biodone() to signal any waiters in the kernel.  I/O's are
	 * executed in series.  Technically executing them in parallel
	 * would produce better results, but then we'd need either
	 * more threads or posix aio.  Maybe worth investigating
	 * this later.
	 *
	 * Using bufq here might be a good idea.
	 */

	if (rump_threads) {
		struct rumpuser_aio *rua;
		int op, fd;

		fd = rblk->rblk_fd;
		if (BUF_ISREAD(bp)) {
			op = RUA_OP_READ;
		} else {
			op = RUA_OP_WRITE;
			if (!async) {
				/* O_DIRECT not fully automatic yet */
#ifdef HAS_ODIRECT
				/*
				 * Use the O_DIRECT fd only when offset,
				 * buffer address and length are all
				 * sector-aligned.
				 */
				if ((off & ((1<<sectshift)-1)) == 0
				    && ((intptr_t)bp->b_data
				      & ((1<<sectshift)-1)) == 0
				    && (bp->b_bcount & ((1<<sectshift)-1)) == 0)
					fd = rblk->rblk_dfd;
				else
#endif
					op |= RUA_OP_SYNC;
			}
		}

		/* wait for space in the circular aio request queue */
		rumpuser_mutex_enter(&rumpuser_aio_mtx);
		while ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
			rumpuser_cv_wait(&rumpuser_aio_cv, &rumpuser_aio_mtx);
		}

		rua = &rumpuser_aios[rumpuser_aio_head];
		KASSERT(rua->rua_bp == NULL);
		rua->rua_fd = fd;
		rua->rua_data = bp->b_data;
		rua->rua_dlen = bp->b_bcount;
		rua->rua_off = off;
		rua->rua_bp = bp;
		rua->rua_op = op;

		/* insert into queue & signal */
		rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
		rumpuser_cv_signal(&rumpuser_aio_cv);
		rumpuser_mutex_exit(&rumpuser_aio_mtx);
	} else {
		if (BUF_ISREAD(bp)) {
			rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		} else {
			rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
			    bp->b_bcount, off, rump_biodone, bp);
		}
		if (BUF_ISWRITE(bp) && !async)
			rumpuser_fsync(rblk->rblk_fd, &error);
	}
}

void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * very repeatably control which blocks will fail.
863 * 864 * <mlelstv> pooka, rand() 865 * <mlelstv> [paste] 866 */ 867 static unsigned 868 gimmerand(void) 869 { 870 871 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L); 872 } 873 874 /* 875 * Block device with very simple fault injection. Fails every 876 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env 877 * variable RUMP_BLKFAIL. 878 */ 879 void 880 rumpblk_strategy_fail(struct buf *bp) 881 { 882 883 if (gimmerand() % BLKFAIL_MAX >= blkfail) { 884 dostrategy(bp); 885 } else { 886 printf("block fault injection: failing I/O on block %lld\n", 887 (long long)bp->b_blkno); 888 bp->b_error = EIO; 889 biodone(bp); 890 } 891 } 892