/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 Intel Corporation
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>

#include <rte_memory.h>

#include "vhost.h"
#include "virtio_user_dev.h"
#include "vhost_kernel_tap.h"

/* Per-device backend state: one vhost-net fd and one TAP fd per queue pair.
 * Both arrays are sized to dev->max_queue_pairs; slots not (yet) opened
 * hold -1 (see vhost_kernel_setup()).
 */
struct vhost_kernel_data {
	int *vhostfds;
	int *tapfds;
};

/* Argument layout for VHOST_SET_MEM_TABLE: a counted flexible array of
 * memory regions, matching the kernel's struct vhost_memory UAPI layout.
 */
struct vhost_memory_kernel {
	uint32_t nregions;
	uint32_t padding;
	struct vhost_memory_region regions[];
};

/* vhost kernel ioctls — mirror the kernel UAPI (include/uapi/linux/vhost.h) */
#define VHOST_VIRTIO 0xAF
#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)

/* with below features, vhost kernel does not need to do the checksum and TSO,
 * these info will be passed to virtio_user through virtio net header.
 */
#define VHOST_KERNEL_GUEST_OFFLOADS_MASK	\
	((1ULL << VIRTIO_NET_F_GUEST_CSUM) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_ECN)  |	\
	 (1ULL << VIRTIO_NET_F_GUEST_UFO))

/* with below features, when flows from virtio_user to vhost kernel
 * (1) if flows goes up through the kernel networking stack, it does not need
 * to verify checksum, which can save CPU cycles;
 * (2) if flows goes through a Linux bridge and outside from an interface
 * (kernel driver), checksum and TSO will be done by GSO in kernel or even
 * offloaded into real physical device.
 */
#define VHOST_KERNEL_HOST_OFFLOADS_MASK		\
	((1ULL << VIRTIO_NET_F_HOST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_HOST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_CSUM))

/* Upper bound on memory regions accepted by VHOST_SET_MEM_TABLE.
 * 64 is the vhost kernel module's default; may be raised at runtime from
 * sysfs by get_vhost_kernel_max_regions().
 */
static uint64_t max_regions = 64;

/* Read the vhost module's max_mem_regions parameter from sysfs and update
 * max_regions. Best-effort: on any failure the current default is kept.
 */
static void
get_vhost_kernel_max_regions(void)
{
	int fd;
	char buf[20] = {'\0'};

	fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY);
	if (fd < 0)
		return;

	if (read(fd, buf, sizeof(buf) - 1) > 0)
		max_regions = strtoull(buf, NULL, 10);

	close(fd);
}

/* Thin ioctl() wrapper that logs failures.
 * Returns 0 on success, -1 on any ioctl error.
 */
static int
vhost_kernel_ioctl(int fd, uint64_t request, void *arg)
{
	int ret;

	ret = ioctl(fd, request, arg);
	if (ret) {
		PMD_DRV_LOG(ERR, "Vhost-kernel ioctl %"PRIu64" failed (%s)",
				request, strerror(errno));
		return -1;
	}

	return 0;
}

/* Issue VHOST_SET_OWNER on every open vhost fd of the device.
 * Queue pairs whose fd is -1 (never opened) are skipped.
 * Returns 0 on success, negative on the first failing ioctl.
 */
static int
vhost_kernel_set_owner(struct virtio_user_dev *dev)
{
	int ret;
	uint32_t i;
	struct vhost_kernel_data *data = dev->backend_data;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_OWNER, NULL);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/* Query features from vhost-net (first queue pair's fd), then add back the
 * offload/MQ bits that the TAP backend actually supports but vhost-net does
 * not advertise. Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_get_features(struct virtio_user_dev *dev, uint64_t *features)
{
	struct vhost_kernel_data *data = dev->backend_data;
	unsigned int tap_flags;
	int ret;

	ret = vhost_kernel_ioctl(data->vhostfds[0], VHOST_GET_FEATURES, features);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to get features");
		return -1;
	}

	ret = tap_get_flags(data->tapfds[0], &tap_flags);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to get TAP features");
		return -1;
	}

	/* with tap as the backend, all these features are supported
	 * but not claimed by vhost-net, so we add them back when
	 * reporting to upper layer.
	 */
	if (tap_flags & IFF_VNET_HDR) {
		*features |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
		*features |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
	}

	/* vhost_kernel will not declare this feature, but it does
	 * support multi-queue.
	 */
	if (tap_flags & IFF_MULTI_QUEUE)
		*features |= (1ull << VIRTIO_NET_F_MQ);

	return 0;
}

/* Negotiate features with vhost-net on every open fd, after stripping the
 * bits vhost-net does not understand (the TAP-provided offloads and MQ added
 * back by vhost_kernel_get_features(), plus IOMMU_PLATFORM).
 * Returns 0 on success, negative on the first failing ioctl.
 */
static int
vhost_kernel_set_features(struct virtio_user_dev *dev, uint64_t features)
{
	struct vhost_kernel_data *data = dev->backend_data;
	uint32_t i;
	int ret;

	/* We don't need memory protection here */
	features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	/* VHOST kernel does not know about below flags */
	features &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK;
	features &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK;
	features &= ~(1ULL << VIRTIO_NET_F_MQ);

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_FEATURES, &features);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/* rte_memseg_list_walk callback: append one memory region per (internal)
 * memseg list to the vhost_memory_kernel table in 'arg'.
 * Note the identity mapping: guest_phys_addr == userspace_addr, since
 * virtio-user shares the process address space with vhost-net.
 * Returns 0 to continue the walk, -1 to abort when the table is full.
 */
static int
add_memseg_list(const struct rte_memseg_list *msl, void *arg)
{
	struct vhost_memory_kernel *vm = arg;
	struct vhost_memory_region *mr;
	void *start_addr;
	uint64_t len;

	if (msl->external)
		return 0;

	if (vm->nregions >= max_regions)
		return -1;

	start_addr = msl->base_va;
	/* Cover the whole VA reservation of the list, not just allocated segs */
	len = msl->page_sz * msl->memseg_arr.len;

	mr = &vm->regions[vm->nregions++];

	mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
	mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
	mr->memory_size = len;
	mr->mmap_offset = 0; /* flags_padding */

	PMD_DRV_LOG(DEBUG, "index=%u addr=%p len=%" PRIu64,
			vm->nregions - 1, start_addr, len);

	return 0;
}

/* By default, vhost kernel module allows 64 regions, but DPDK may
 * have much more memory regions. Below function will treat each
 * contiguous memory space reserved by DPDK as one region.
 *
 * Builds the region table and pushes it to every open vhost fd with
 * VHOST_SET_MEM_TABLE. Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_set_memory_table(struct virtio_user_dev *dev)
{
	uint32_t i;
	struct vhost_kernel_data *data = dev->backend_data;
	struct vhost_memory_kernel *vm;
	int ret;

	vm = malloc(sizeof(struct vhost_memory_kernel) +
			max_regions *
			sizeof(struct vhost_memory_region));
	if (!vm)
		goto err;

	vm->nregions = 0;
	vm->padding = 0;

	/*
	 * The memory lock has already been taken by memory subsystem
	 * or virtio_user_start_device().
	 */
	ret = rte_memseg_list_walk_thread_unsafe(add_memseg_list, vm);
	if (ret < 0)
		goto err_free;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_MEM_TABLE, vm);
		if (ret < 0)
			goto err_free;
	}

	free(vm);

	return 0;
err_free:
	free(vm);
err:
	PMD_DRV_LOG(ERR, "Failed to set memory table");
	return -1;
}

/* Forward a vring-state ioctl to the right vhost fd. The caller-visible
 * state->index encodes the device-wide queue index; each vhost-net fd only
 * knows two rings (0 = rx, 1 = tx per queue pair — presumably; verify
 * against vhost-net UAPI), so the index is converted before the ioctl and
 * restored afterwards.
 * NOTE(review): on ioctl failure the function returns before restoring
 * state->index, so the caller's struct is left with the converted value.
 * Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_set_vring(struct virtio_user_dev *dev, uint64_t req, struct vhost_vring_state *state)
{
	int ret, fd;
	unsigned int index = state->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from queue index to queue-pair & offset */
	fd = data->vhostfds[state->index / 2];
	state->index %= 2;

	ret = vhost_kernel_ioctl(fd, req, state);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring (request %" PRIu64 ")", req);
		return -1;
	}

	/* restore index back to queue index */
	state->index = index;

	return 0;
}

/* VHOST_SET_VRING_NUM wrapper (ring size). */
static int
vhost_kernel_set_vring_num(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_SET_VRING_NUM, state);
}

/* VHOST_SET_VRING_BASE wrapper (initial avail index). */
static int
vhost_kernel_set_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_SET_VRING_BASE, state);
}

/* VHOST_GET_VRING_BASE wrapper; despite the helper's name this reads the
 * current ring index back into state->num.
 */
static int
vhost_kernel_get_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_GET_VRING_BASE, state);
}

/* Forward a vring-file ioctl (kick/call eventfd) to the right vhost fd,
 * using the same queue-index conversion as vhost_kernel_set_vring().
 * NOTE(review): as above, file->index is not restored on the error path.
 * Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_set_vring_file(struct virtio_user_dev *dev, uint64_t req,
		struct vhost_vring_file *file)
{
	int ret, fd;
	unsigned int index = file->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from queue index to queue-pair & offset */
	fd = data->vhostfds[file->index / 2];
	file->index %= 2;

	ret = vhost_kernel_ioctl(fd, req, file);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring file (request %" PRIu64 ")", req);
		return -1;
	}

	/* restore index back to queue index */
	file->index = index;

	return 0;
}

/* VHOST_SET_VRING_KICK wrapper (guest-to-host notification eventfd). */
static int
vhost_kernel_set_vring_kick(struct virtio_user_dev *dev, struct vhost_vring_file *file)
{
	return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_KICK, file);
}

/* VHOST_SET_VRING_CALL wrapper (host-to-guest interrupt eventfd). */
static int
vhost_kernel_set_vring_call(struct virtio_user_dev *dev, struct vhost_vring_file *file)
{
	return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_CALL, file);
}

/* VHOST_SET_VRING_ADDR with the same queue-index conversion as the other
 * vring helpers (and the same non-restore on the error path).
 * Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_set_vring_addr(struct virtio_user_dev *dev, struct vhost_vring_addr *addr)
{
	int ret, fd;
	unsigned int index = addr->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from queue index to queue-pair & offset */
	fd = data->vhostfds[addr->index / 2];
	addr->index %= 2;

	ret = vhost_kernel_ioctl(fd, VHOST_SET_VRING_ADDR, addr);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring address");
		return -1;
	}

	/* restore index back to queue index */
	addr->index = index;

	return 0;
}

/* Device status registers are not supported by the vhost-kernel backend. */
static int
vhost_kernel_get_status(struct virtio_user_dev *dev __rte_unused, uint8_t *status __rte_unused)
{
	return -ENOTSUP;
}

/* Device status registers are not supported by the vhost-kernel backend. */
static int
vhost_kernel_set_status(struct virtio_user_dev *dev __rte_unused, uint8_t status __rte_unused)
{
	return -ENOTSUP;
}

/**
 * Set up environment to talk with a vhost kernel backend.
 *
 * Opens one vhost fd (dev->path) and one TAP fd per queue pair; the first
 * TAP open also resolves dev->ifname when none was given. On any failure
 * all fds opened so far are closed and all allocations freed.
 *
 * @return
 *   - (-1) if fail to set up;
 *   - (>=0) if successful.
 */
static int
vhost_kernel_setup(struct virtio_user_dev *dev)
{
	struct vhost_kernel_data *data;
	unsigned int tap_features;
	unsigned int tap_flags;
	unsigned int r_flags;
	const char *ifname;
	uint32_t q, i;
	int vhostfd;

	if (tap_support_features(&tap_features) < 0)
		return -1;

	/* The virtio net header is mandatory for this backend */
	if ((tap_features & IFF_VNET_HDR) == 0) {
		PMD_INIT_LOG(ERR, "TAP does not support IFF_VNET_HDR");
		return -1;
	}
	r_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;

	if (tap_features & IFF_NAPI)
		r_flags |= IFF_NAPI;

	data = malloc(sizeof(*data));
	if (!data) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost-kernel data", dev->path);
		return -1;
	}

	data->vhostfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->vhostfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost FDs", dev->path);
		goto err_data;
	}
	data->tapfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->tapfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate TAP FDs", dev->path);
		goto err_vhostfds;
	}

	/* Mark all fds invalid so the error path only closes opened ones */
	for (q = 0; q < dev->max_queue_pairs; ++q) {
		data->vhostfds[q] = -1;
		data->tapfds[q] = -1;
	}

	get_vhost_kernel_max_regions();

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		vhostfd = open(dev->path, O_RDWR);
		if (vhostfd < 0) {
			PMD_DRV_LOG(ERR, "fail to open %s, %s", dev->path, strerror(errno));
			goto err_tapfds;
		}
		data->vhostfds[i] = vhostfd;
	}

	/* "tap%d" lets the kernel pick the interface name */
	ifname = dev->ifname != NULL ? dev->ifname : "tap%d";
	data->tapfds[0] = tap_open(ifname, r_flags, (tap_features & IFF_MULTI_QUEUE) != 0);
	if (data->tapfds[0] < 0)
		goto err_tapfds;
	if (dev->ifname == NULL && tap_get_name(data->tapfds[0], &dev->ifname) < 0) {
		PMD_DRV_LOG(ERR, "fail to get tap name (%d)", data->tapfds[0]);
		goto err_tapfds;
	}
	if (tap_get_flags(data->tapfds[0], &tap_flags) < 0) {
		PMD_DRV_LOG(ERR, "fail to get tap flags for tap %s", dev->ifname);
		goto err_tapfds;
	}
	if ((tap_flags & IFF_MULTI_QUEUE) == 0 && dev->max_queue_pairs > 1) {
		PMD_DRV_LOG(ERR, "tap %s does not support multi queue", dev->ifname);
		goto err_tapfds;
	}

	/* Remaining queue pairs attach to the same (multi-queue) interface */
	for (i = 1; i < dev->max_queue_pairs; i++) {
		data->tapfds[i] = tap_open(dev->ifname, r_flags, true);
		if (data->tapfds[i] < 0)
			goto err_tapfds;
	}

	dev->backend_data = data;

	return 0;

err_tapfds:
	for (i = 0; i < dev->max_queue_pairs; i++) {
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);
		if (data->tapfds[i] >= 0)
			close(data->tapfds[i]);
	}

	free(data->tapfds);
err_vhostfds:
	free(data->vhostfds);
err_data:
	free(data);

	return -1;
}

/* Tear down everything vhost_kernel_setup() created: close all open fds,
 * free the fd arrays and the backend data itself. Safe to call when setup
 * never ran (backend_data == NULL). Always returns 0.
 */
static int
vhost_kernel_destroy(struct virtio_user_dev *dev)
{
	struct vhost_kernel_data *data = dev->backend_data;
	uint32_t i;

	if (!data)
		return 0;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);
		if (data->tapfds[i] >= 0)
			close(data->tapfds[i]);
	}

	free(data->vhostfds);
	free(data->tapfds);
	free(data);
	dev->backend_data = NULL;

	return 0;
}

/* Attach (tapfd >= 0) or detach (tapfd == -1) a TAP fd as backend for both
 * rings (index 0 and 1) of one vhost-net fd via VHOST_NET_SET_BACKEND.
 * Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_set_backend(int vhostfd, int tapfd)
{
	struct vhost_vring_file f;

	f.fd = tapfd;
	f.index = 0;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
				strerror(errno));
		return -1;
	}

	f.index = 1;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

/* Enable or disable one queue pair: configure the TAP fd (vnet header size,
 * offloads, MAC on first start) and attach it to / detach it from the
 * corresponding vhost fd. No-op if the pair is already in the requested
 * state. Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
		uint16_t pair_idx,
		int enable)
{
	struct vhost_kernel_data *data = dev->backend_data;
	int hdr_size;
	int vhostfd;
	int tapfd;

	if (dev->qp_enabled[pair_idx] == enable)
		return 0;

	vhostfd = data->vhostfds[pair_idx];
	tapfd = data->tapfds[pair_idx];

	if (!enable) {
		/* Detaching the backend (fd = -1) disables the pair */
		if (vhost_kernel_set_backend(vhostfd, -1) < 0) {
			PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
			return -1;
		}
		dev->qp_enabled[pair_idx] = false;
		return 0;
	}

	/* MRG_RXBUF and VERSION_1 both imply the larger mergeable header */
	if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
	    (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_size = sizeof(struct virtio_net_hdr);

	/* Set mac on tap only once when starting */
	if (!dev->started && pair_idx == 0 &&
			tap_set_mac(data->tapfds[pair_idx], dev->mac_addr) < 0)
		return -1;

	if (vhost_kernel_tap_setup(tapfd, hdr_size, dev->features) < 0) {
		PMD_DRV_LOG(ERR, "fail to setup tap for vhost kernel");
		return -1;
	}

	if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) {
		PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
		return -1;
	}

	dev->qp_enabled[pair_idx] = true;
	return 0;
}

/* This backend advertises no protocol-level backend features. */
static int
vhost_kernel_get_backend_features(uint64_t *features)
{
	*features = 0;

	return 0;
}

/* Nothing to update (Maybe get TAP interface link state?) */
static int
vhost_kernel_update_link_state(struct virtio_user_dev *dev __rte_unused)
{
	return 0;
}

/* No link state interrupt with Vhost-kernel */
static int
vhost_kernel_get_intr_fd(struct virtio_user_dev *dev __rte_unused)
{
	return -1;
}

/* Backend ops table plugged into the generic virtio-user device layer. */
struct virtio_user_backend_ops virtio_ops_kernel = {
	.setup = vhost_kernel_setup,
	.destroy = vhost_kernel_destroy,
	.get_backend_features = vhost_kernel_get_backend_features,
	.set_owner = vhost_kernel_set_owner,
	.get_features = vhost_kernel_get_features,
	.set_features = vhost_kernel_set_features,
	.set_memory_table = vhost_kernel_set_memory_table,
	.set_vring_num = vhost_kernel_set_vring_num,
	.set_vring_base = vhost_kernel_set_vring_base,
	.get_vring_base = vhost_kernel_get_vring_base,
	.set_vring_call = vhost_kernel_set_vring_call,
	.set_vring_kick = vhost_kernel_set_vring_kick,
	.set_vring_addr = vhost_kernel_set_vring_addr,
	.get_status = vhost_kernel_get_status,
	.set_status = vhost_kernel_set_status,
	.enable_qp = vhost_kernel_enable_queue_pair,
	.update_link_state = vhost_kernel_update_link_state,
	.get_intr_fd = vhost_kernel_get_intr_fd,
};