1 /* $NetBSD: rumpuser.c,v 1.29 2013/03/18 21:00:52 pooka Exp $ */ 2 3 /* 4 * Copyright (c) 2007-2010 Antti Kantee. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include "rumpuser_port.h" 29 30 #if !defined(lint) 31 __RCSID("$NetBSD: rumpuser.c,v 1.29 2013/03/18 21:00:52 pooka Exp $"); 32 #endif /* !lint */ 33 34 #include <sys/ioctl.h> 35 #include <sys/mman.h> 36 #include <sys/uio.h> 37 #include <sys/stat.h> 38 #include <sys/time.h> 39 40 #ifdef __NetBSD__ 41 #include <sys/disk.h> 42 #include <sys/disklabel.h> 43 #include <sys/dkio.h> 44 #include <sys/event.h> 45 #endif 46 47 #if defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__) 48 #include <sys/sysctl.h> 49 #endif 50 51 #include <assert.h> 52 #include <errno.h> 53 #include <fcntl.h> 54 #include <netdb.h> 55 #include <poll.h> 56 #include <signal.h> 57 #include <stdarg.h> 58 #include <stdint.h> 59 #include <stdio.h> 60 #include <stdlib.h> 61 #include <string.h> 62 #include <time.h> 63 #include <unistd.h> 64 65 #include <rump/rumpuser.h> 66 67 #include "rumpuser_int.h" 68 69 int 70 rumpuser_getversion(void) 71 { 72 73 return RUMPUSER_VERSION; 74 } 75 76 int 77 rumpuser_getfileinfo(const char *path, uint64_t *sizep, int *ftp, int *error) 78 { 79 struct stat sb; 80 uint64_t size; 81 int needsdev = 0, rv = 0, ft; 82 int fd = -1; 83 84 if (stat(path, &sb) == -1) { 85 seterror(errno); 86 return -1; 87 } 88 89 switch (sb.st_mode & S_IFMT) { 90 case S_IFDIR: 91 ft = RUMPUSER_FT_DIR; 92 break; 93 case S_IFREG: 94 ft = RUMPUSER_FT_REG; 95 break; 96 case S_IFBLK: 97 ft = RUMPUSER_FT_BLK; 98 needsdev = 1; 99 break; 100 case S_IFCHR: 101 ft = RUMPUSER_FT_CHR; 102 needsdev = 1; 103 break; 104 default: 105 ft = RUMPUSER_FT_OTHER; 106 break; 107 } 108 109 if (!needsdev) { 110 size = sb.st_size; 111 } else if (sizep) { 112 /* 113 * Welcome to the jungle. Of course querying the kernel 114 * for a device partition size is supposed to be far from 115 * trivial. On NetBSD we use ioctl. On $other platform 116 * we have a problem. We try "the lseek trick" and just 117 * fail if that fails. Platform specific code can later 118 * be written here if appropriate. 119 * 120 * On NetBSD we hope and pray that for block devices nobody 121 * else is holding them open, because otherwise the kernel 122 * will not permit us to open it. Thankfully, this is 123 * usually called only in bootstrap and then we can 124 * forget about it. 125 */ 126 #ifndef __NetBSD__ 127 off_t off; 128 129 fd = open(path, O_RDONLY); 130 if (fd == -1) { 131 seterror(errno); 132 rv = -1; 133 goto out; 134 } 135 136 off = lseek(fd, 0, SEEK_END); 137 if (off != 0) { 138 size = off; 139 goto out; 140 } 141 fprintf(stderr, "error: device size query not implemented on " 142 "this platform\n"); 143 seterror(EOPNOTSUPP); 144 rv = -1; 145 goto out; 146 #else 147 struct disklabel lab; 148 struct partition *parta; 149 struct dkwedge_info dkw; 150 151 fd = open(path, O_RDONLY); 152 if (fd == -1) { 153 seterror(errno); 154 rv = -1; 155 goto out; 156 } 157 158 if (ioctl(fd, DIOCGDINFO, &lab) == 0) { 159 parta = &lab.d_partitions[DISKPART(sb.st_rdev)]; 160 size = (uint64_t)lab.d_secsize * parta->p_size; 161 goto out; 162 } 163 164 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) == 0) { 165 /* 166 * XXX: should use DIOCGDISKINFO to query 167 * sector size, but that requires proplib, 168 * so just don't bother for now. it's nice 169 * that something as difficult as figuring out 170 * a partition's size has been made so easy. 171 */ 172 size = dkw.dkw_size << DEV_BSHIFT; 173 goto out; 174 } 175 176 seterror(errno); 177 rv = -1; 178 #endif /* __NetBSD__ */ 179 } 180 181 out: 182 if (rv == 0 && sizep) 183 *sizep = size; 184 if (rv == 0 && ftp) 185 *ftp = ft; 186 if (fd != -1) 187 close(fd); 188 189 return rv; 190 } 191 192 int 193 rumpuser_nanosleep(uint64_t *sec, uint64_t *nsec, int *error) 194 { 195 struct timespec rqt, rmt; 196 int rv; 197 198 /*LINTED*/ 199 rqt.tv_sec = *sec; 200 /*LINTED*/ 201 rqt.tv_nsec = *nsec; 202 203 KLOCK_WRAP(rv = nanosleep(&rqt, &rmt)); 204 if (rv == -1) 205 seterror(errno); 206 207 *sec = rmt.tv_sec; 208 *nsec = rmt.tv_nsec; 209 210 return rv; 211 } 212 213 void * 214 rumpuser_malloc(size_t howmuch, int alignment) 215 { 216 void *mem; 217 int rv; 218 219 if (alignment == 0) 220 alignment = sizeof(void *); 221 222 rv = posix_memalign(&mem, (size_t)alignment, howmuch); 223 if (__predict_false(rv != 0)) { 224 if (rv == EINVAL) { 225 printf("rumpuser_malloc: invalid alignment %d\n", 226 alignment); 227 abort(); 228 } 229 mem = NULL; 230 } 231 232 return mem; 233 } 234 235 void * 236 rumpuser_realloc(void *ptr, size_t howmuch) 237 { 238 239 return realloc(ptr, howmuch); 240 } 241 242 void 243 rumpuser_free(void *ptr) 244 { 245 246 free(ptr); 247 } 248 249 void * 250 rumpuser_anonmmap(void *prefaddr, size_t size, int alignbit, 251 int exec, int *error) 252 { 253 void *rv; 254 int prot; 255 256 #ifndef MAP_ALIGNED 257 #define MAP_ALIGNED(a) 0 258 if (alignbit) 259 fprintf(stderr, "rumpuser_anonmmap: warning, requested " 260 "alignment not supported by hypervisor\n"); 261 #endif 262 263 prot = PROT_READ|PROT_WRITE; 264 if (exec) 265 prot |= PROT_EXEC; 266 rv = mmap(prefaddr, size, prot, 267 MAP_PRIVATE | MAP_ANON | MAP_ALIGNED(alignbit), -1, 0); 268 if (rv == MAP_FAILED) { 269 seterror(errno); 270 return NULL; 271 } 272 return rv; 273 } 274 275 void 276 rumpuser_unmap(void *addr, size_t len) 277 { 278 int rv; 279 280 rv = munmap(addr, len); 281 assert(rv == 0); 282 } 283 284 void * 285 rumpuser_filemmap(int fd, off_t offset, size_t len, int flags, int *error) 286 { 287 void *rv; 288 int mmflags, prot; 289 290 if (flags & RUMPUSER_FILEMMAP_TRUNCATE) { 291 if (ftruncate(fd, offset + len) == -1) { 292 seterror(errno); 293 return NULL; 294 } 295 } 296 297 /* it's implicit */ 298 #if defined(__sun__) && !defined(MAP_FILE) 299 #define MAP_FILE 0 300 #endif 301 302 mmflags = MAP_FILE; 303 if (flags & RUMPUSER_FILEMMAP_SHARED) 304 mmflags |= MAP_SHARED; 305 else 306 mmflags |= MAP_PRIVATE; 307 308 prot = 0; 309 if (flags & RUMPUSER_FILEMMAP_READ) 310 prot |= PROT_READ; 311 if (flags & RUMPUSER_FILEMMAP_WRITE) 312 prot |= PROT_WRITE; 313 314 rv = mmap(NULL, len, PROT_READ|PROT_WRITE, mmflags, fd, offset); 315 if (rv == MAP_FAILED) { 316 seterror(errno); 317 return NULL; 318 } 319 320 seterror(0); 321 return rv; 322 } 323 324 int 325 rumpuser_memsync(void *addr, size_t len, int *error) 326 { 327 328 DOCALL_KLOCK(int, (msync(addr, len, MS_SYNC))); 329 } 330 331 int 332 rumpuser_open(const char *path, int ruflags, int *error) 333 { 334 int flags; 335 336 switch (ruflags & RUMPUSER_OPEN_ACCMODE) { 337 case RUMPUSER_OPEN_RDONLY: 338 flags = O_RDONLY; 339 break; 340 case RUMPUSER_OPEN_WRONLY: 341 flags = O_WRONLY; 342 break; 343 case RUMPUSER_OPEN_RDWR: 344 flags = O_RDWR; 345 break; 346 default: 347 *error = EINVAL; 348 return -1; 349 } 350 351 #define TESTSET(_ru_, _h_) if (ruflags & _ru_) flags |= _h_; 352 TESTSET(RUMPUSER_OPEN_CREATE, O_CREAT); 353 TESTSET(RUMPUSER_OPEN_EXCL, O_EXCL); 354 #ifdef O_DIRECT 355 TESTSET(RUMPUSER_OPEN_DIRECT, O_DIRECT); 356 #else 357 if (ruflags & RUMPUSER_OPEN_DIRECT) { 358 *error = EOPNOTSUPP; 359 return -1; 360 } 361 #endif 362 #undef TESTSET 363 364 DOCALL_KLOCK(int, (open(path, flags, 0644))); 365 } 366 367 int 368 rumpuser_ioctl(int fd, u_long cmd, void *data, int *error) 369 { 370 371 DOCALL_KLOCK(int, (ioctl(fd, cmd, data))); 372 } 373 374 int 375 rumpuser_close(int fd, int *error) 376 { 377 378 DOCALL(int, close(fd)); 379 } 380 381 int 382 rumpuser_fsync(int fd, int *error) 383 { 384 385 DOCALL_KLOCK(int, fsync(fd)); 386 } 387 388 ssize_t 389 rumpuser_read(int fd, void *data, size_t size, int *error) 390 { 391 ssize_t rv; 392 393 KLOCK_WRAP(rv = read(fd, data, size)); 394 if (rv == -1) 395 seterror(errno); 396 397 return rv; 398 } 399 400 ssize_t 401 rumpuser_pread(int fd, void *data, size_t size, off_t offset, int *error) 402 { 403 ssize_t rv; 404 405 KLOCK_WRAP(rv = pread(fd, data, size, offset)); 406 if (rv == -1) 407 seterror(errno); 408 409 return rv; 410 } 411 412 void 413 rumpuser_read_bio(int fd, void *data, size_t size, off_t offset, 414 rump_biodone_fn biodone, void *biodonecookie) 415 { 416 ssize_t rv; 417 int error = 0; 418 419 rv = rumpuser_pread(fd, data, size, offset, &error); 420 /* check against <0 instead of ==-1 to get typing below right */ 421 if (rv < 0) 422 rv = 0; 423 424 /* LINTED: see above */ 425 biodone(biodonecookie, rv, error); 426 } 427 428 ssize_t 429 rumpuser_write(int fd, const void *data, size_t size, int *error) 430 { 431 ssize_t rv; 432 433 KLOCK_WRAP(rv = write(fd, data, size)); 434 if (rv == -1) 435 seterror(errno); 436 437 return rv; 438 } 439 440 ssize_t 441 rumpuser_pwrite(int fd, const void *data, size_t size, off_t offset, int *error) 442 { 443 ssize_t rv; 444 445 KLOCK_WRAP(rv = pwrite(fd, data, size, offset)); 446 if (rv == -1) 447 seterror(errno); 448 449 return rv; 450 } 451 452 void 453 rumpuser_write_bio(int fd, const void *data, size_t size, off_t offset, 454 rump_biodone_fn biodone, void *biodonecookie) 455 { 456 ssize_t rv; 457 int error = 0; 458 459 rv = rumpuser_pwrite(fd, data, size, offset, &error); 460 /* check against <0 instead of ==-1 to get typing below right */ 461 if (rv < 0) 462 rv = 0; 463 464 /* LINTED: see above */ 465 biodone(biodonecookie, rv, error); 466 } 467 468 ssize_t 469 rumpuser_readv(int fd, const struct rumpuser_iovec *riov, int iovcnt, 470 int *error) 471 { 472 struct iovec *iovp; 473 ssize_t rv; 474 int i; 475 476 iovp = malloc(iovcnt * sizeof(struct iovec)); 477 if (iovp == NULL) { 478 seterror(ENOMEM); 479 return -1; 480 } 481 for (i = 0; i < iovcnt; i++) { 482 iovp[i].iov_base = riov[i].iov_base; 483 /*LINTED*/ 484 iovp[i].iov_len = riov[i].iov_len; 485 } 486 487 KLOCK_WRAP(rv = readv(fd, iovp, iovcnt)); 488 if (rv == -1) 489 seterror(errno); 490 free(iovp); 491 492 return rv; 493 } 494 495 ssize_t 496 rumpuser_writev(int fd, const struct rumpuser_iovec *riov, int iovcnt, 497 int *error) 498 { 499 struct iovec *iovp; 500 ssize_t rv; 501 int i; 502 503 iovp = malloc(iovcnt * sizeof(struct iovec)); 504 if (iovp == NULL) { 505 seterror(ENOMEM); 506 return -1; 507 } 508 for (i = 0; i < iovcnt; i++) { 509 iovp[i].iov_base = riov[i].iov_base; 510 /*LINTED*/ 511 iovp[i].iov_len = riov[i].iov_len; 512 } 513 514 KLOCK_WRAP(rv = writev(fd, iovp, iovcnt)); 515 if (rv == -1) 516 seterror(errno); 517 free(iovp); 518 519 return rv; 520 } 521 522 int 523 rumpuser_gettime(uint64_t *sec, uint64_t *nsec, int *error) 524 { 525 struct timeval tv; 526 int rv; 527 528 rv = gettimeofday(&tv, NULL); 529 if (rv == -1) { 530 seterror(errno); 531 return rv; 532 } 533 534 *sec = tv.tv_sec; 535 *nsec = tv.tv_usec * 1000; 536 537 return 0; 538 } 539 540 int 541 rumpuser_getenv(const char *name, char *buf, size_t blen, int *error) 542 { 543 544 DOCALL(int, getenv_r(name, buf, blen)); 545 } 546 547 int 548 rumpuser_gethostname(char *name, size_t namelen, int *error) 549 { 550 char tmp[MAXHOSTNAMELEN]; 551 552 if (gethostname(tmp, sizeof(tmp)) == -1) { 553 snprintf(name, namelen, "rump-%05d.rumpdomain", (int)getpid()); 554 } else { 555 snprintf(name, namelen, "rump-%05d.%s.rumpdomain", 556 (int)getpid(), tmp); 557 } 558 559 *error = 0; 560 return 0; 561 } 562 563 int 564 rumpuser_poll(struct pollfd *fds, int nfds, int timeout, int *error) 565 { 566 567 DOCALL_KLOCK(int, (poll(fds, (nfds_t)nfds, timeout))); 568 } 569 570 int 571 rumpuser_putchar(int c, int *error) 572 { 573 574 DOCALL(int, (putchar(c))); 575 } 576 577 void 578 rumpuser_exit(int rv) 579 { 580 581 if (rv == RUMPUSER_PANIC) 582 abort(); 583 else 584 exit(rv); 585 } 586 587 void 588 rumpuser_seterrno(int error) 589 { 590 591 errno = error; 592 } 593 594 /* 595 * On NetBSD we use kqueue, on Linux we use inotify. The underlying 596 * interface requirements aren't quite the same, but we have a very 597 * good chance of doing the fd->path mapping on Linux thanks to dcache, 598 * so just keep the existing interfaces for now. 599 */ 600 #if defined(__NetBSD__) 601 int 602 rumpuser_writewatchfile_setup(int kq, int fd, intptr_t opaque, int *error) 603 { 604 struct kevent kev; 605 606 if (kq == -1) { 607 kq = kqueue(); 608 if (kq == -1) { 609 seterror(errno); 610 return -1; 611 } 612 } 613 614 EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD|EV_ENABLE|EV_CLEAR, 615 NOTE_WRITE, 0, opaque); 616 if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) { 617 seterror(errno); 618 return -1; 619 } 620 621 return kq; 622 } 623 624 int 625 rumpuser_writewatchfile_wait(int kq, intptr_t *opaque, int *error) 626 { 627 struct kevent kev; 628 int rv; 629 630 again: 631 KLOCK_WRAP(rv = kevent(kq, NULL, 0, &kev, 1, NULL)); 632 if (rv == -1) { 633 if (errno == EINTR) 634 goto again; 635 seterror(errno); 636 return -1; 637 } 638 639 if (opaque) 640 *opaque = kev.udata; 641 return rv; 642 } 643 644 #elif defined(__linux__) 645 #include <sys/inotify.h> 646 647 int 648 rumpuser_writewatchfile_setup(int inotify, int fd, intptr_t notused, int *error) 649 { 650 char procbuf[PATH_MAX], linkbuf[PATH_MAX]; 651 ssize_t nn; 652 653 if (inotify == -1) { 654 inotify = inotify_init(); 655 if (inotify == -1) { 656 seterror(errno); 657 return -1; 658 } 659 } 660 661 /* ok, need to map fd into path for inotify */ 662 snprintf(procbuf, sizeof(procbuf), "/proc/self/fd/%d", fd); 663 nn = readlink(procbuf, linkbuf, sizeof(linkbuf)-1); 664 if (nn >= (ssize_t)sizeof(linkbuf)-1) { 665 nn = -1; 666 errno = E2BIG; /* pick something */ 667 } 668 if (nn == -1) { 669 seterror(errno); 670 close(inotify); 671 return -1; 672 } 673 674 linkbuf[nn] = '\0'; 675 if (inotify_add_watch(inotify, linkbuf, IN_MODIFY) == -1) { 676 seterror(errno); 677 close(inotify); 678 return -1; 679 } 680 681 return inotify; 682 } 683 684 int 685 rumpuser_writewatchfile_wait(int kq, intptr_t *opaque, int *error) 686 { 687 struct inotify_event iev; 688 ssize_t nn; 689 690 do { 691 KLOCK_WRAP(nn = read(kq, &iev, sizeof(iev))); 692 } while (errno == EINTR); 693 694 if (nn == -1) { 695 seterror(errno); 696 return -1; 697 } 698 return (nn/sizeof(iev)); 699 } 700 701 #else 702 703 /* a polling default implementation */ 704 int 705 rumpuser_writewatchfile_setup(int inotify, int fd, intptr_t notused, int *error) 706 { 707 static int warned = 0; 708 709 if (!warned) { 710 fprintf(stderr, "WARNING: rumpuser writewatchfile routines are " 711 "polling-only on this platform\n"); 712 warned = 1; 713 } 714 715 return 0; 716 } 717 718 int 719 rumpuser_writewatchfile_wait(int kq, intptr_t *opaque, int *error) 720 { 721 722 KLOCK_WRAP(usleep(10000)); 723 return 0; 724 } 725 #endif 726 727 /* 728 * This is meant for safe debugging prints from the kernel. 729 */ 730 int 731 rumpuser_dprintf(const char *format, ...) 732 { 733 va_list ap; 734 int rv; 735 736 va_start(ap, format); 737 rv = vfprintf(stderr, format, ap); 738 va_end(ap); 739 740 return rv; 741 } 742 743 int 744 rumpuser_kill(int64_t pid, int sig, int *error) 745 { 746 747 #ifdef __NetBSD__ 748 if (pid == RUMPUSER_PID_SELF) { 749 DOCALL(int, raise(sig)); 750 } else { 751 DOCALL(int, kill((pid_t)pid, sig)); 752 } 753 #else 754 /* XXXfixme: signal numbers may not match on non-NetBSD */ 755 seterror(EOPNOTSUPP); 756 return -1; 757 #endif 758 } 759 760 int 761 rumpuser_getnhostcpu(void) 762 { 763 int ncpu = 1; 764 765 #if defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__) 766 size_t sz = sizeof(ncpu); 767 768 sysctlbyname("hw.ncpu", &ncpu, &sz, NULL, 0); 769 #elif defined(__linux__) || defined(__CYGWIN__) 770 FILE *fp; 771 char *line = NULL; 772 size_t n = 0; 773 774 /* If anyone knows a better way, I'm all ears */ 775 if ((fp = fopen("/proc/cpuinfo", "r")) != NULL) { 776 ncpu = 0; 777 while (getline(&line, &n, fp) != -1) { 778 if (strncmp(line, 779 "processor", sizeof("processor")-1) == 0) 780 ncpu++; 781 } 782 if (ncpu == 0) 783 ncpu = 1; 784 free(line); 785 fclose(fp); 786 } 787 #elif __sun__ 788 /* XXX: this is just a rough estimate ... */ 789 ncpu = sysconf(_SC_NPROCESSORS_ONLN); 790 #endif 791 792 return ncpu; 793 } 794 795 /* XXX: this hypercall needs a better name */ 796 uint32_t 797 rumpuser_arc4random(void) 798 { 799 800 return arc4random(); 801 } 802