1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2016 Intel Corporation 3 */ 4 5 #include <sys/types.h> 6 #include <sys/stat.h> 7 #include <fcntl.h> 8 #include <unistd.h> 9 #include <errno.h> 10 11 #include <rte_memory.h> 12 13 #include "vhost.h" 14 #include "virtio_user_dev.h" 15 #include "vhost_kernel_tap.h" 16 17 struct vhost_kernel_data { 18 int *vhostfds; 19 int *tapfds; 20 }; 21 22 struct vhost_memory_kernel { 23 uint32_t nregions; 24 uint32_t padding; 25 struct vhost_memory_region regions[0]; 26 }; 27 28 /* vhost kernel ioctls */ 29 #define VHOST_VIRTIO 0xAF 30 #define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) 31 #define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) 32 #define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) 33 #define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) 34 #define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel) 35 #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) 36 #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) 37 #define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) 38 #define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) 39 #define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) 40 #define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) 41 #define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) 42 #define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) 43 #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) 44 #define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) 45 46 /* with below features, vhost kernel does not need to do the checksum and TSO, 47 * these info will be passed to virtio_user through virtio net header. 
48 */ 49 #define VHOST_KERNEL_GUEST_OFFLOADS_MASK \ 50 ((1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ 51 (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ 52 (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ 53 (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ 54 (1ULL << VIRTIO_NET_F_GUEST_UFO)) 55 56 /* with below features, when flows from virtio_user to vhost kernel 57 * (1) if flows goes up through the kernel networking stack, it does not need 58 * to verify checksum, which can save CPU cycles; 59 * (2) if flows goes through a Linux bridge and outside from an interface 60 * (kernel driver), checksum and TSO will be done by GSO in kernel or even 61 * offloaded into real physical device. 62 */ 63 #define VHOST_KERNEL_HOST_OFFLOADS_MASK \ 64 ((1ULL << VIRTIO_NET_F_HOST_TSO4) | \ 65 (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ 66 (1ULL << VIRTIO_NET_F_CSUM)) 67 68 static uint64_t max_regions = 64; 69 70 static void 71 get_vhost_kernel_max_regions(void) 72 { 73 int fd; 74 char buf[20] = {'\0'}; 75 76 fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY); 77 if (fd < 0) 78 return; 79 80 if (read(fd, buf, sizeof(buf) - 1) > 0) 81 max_regions = strtoull(buf, NULL, 10); 82 83 close(fd); 84 } 85 86 static int 87 vhost_kernel_ioctl(int fd, uint64_t request, void *arg) 88 { 89 int ret; 90 91 ret = ioctl(fd, request, arg); 92 if (ret) { 93 PMD_DRV_LOG(ERR, "Vhost-kernel ioctl %"PRIu64" failed (%s)", 94 request, strerror(errno)); 95 return -1; 96 } 97 98 return 0; 99 } 100 101 static int 102 vhost_kernel_set_owner(struct virtio_user_dev *dev) 103 { 104 int ret; 105 uint32_t i; 106 struct vhost_kernel_data *data = dev->backend_data; 107 108 for (i = 0; i < dev->max_queue_pairs; ++i) { 109 if (data->vhostfds[i] < 0) 110 continue; 111 112 ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_OWNER, NULL); 113 if (ret < 0) 114 return ret; 115 } 116 117 return 0; 118 } 119 120 static int 121 vhost_kernel_get_features(struct virtio_user_dev *dev, uint64_t *features) 122 { 123 struct vhost_kernel_data *data = 
dev->backend_data; 124 unsigned int tap_flags; 125 int ret; 126 127 ret = vhost_kernel_ioctl(data->vhostfds[0], VHOST_GET_FEATURES, features); 128 if (ret < 0) { 129 PMD_DRV_LOG(ERR, "Failed to get features"); 130 return -1; 131 } 132 133 ret = tap_get_flags(data->tapfds[0], &tap_flags); 134 if (ret < 0) { 135 PMD_DRV_LOG(ERR, "Failed to get TAP features"); 136 return -1; 137 } 138 139 /* with tap as the backend, all these features are supported 140 * but not claimed by vhost-net, so we add them back when 141 * reporting to upper layer. 142 */ 143 if (tap_flags & IFF_VNET_HDR) { 144 *features |= VHOST_KERNEL_GUEST_OFFLOADS_MASK; 145 *features |= VHOST_KERNEL_HOST_OFFLOADS_MASK; 146 } 147 148 /* vhost_kernel will not declare this feature, but it does 149 * support multi-queue. 150 */ 151 if (tap_flags & IFF_MULTI_QUEUE) 152 *features |= (1ull << VIRTIO_NET_F_MQ); 153 154 return 0; 155 } 156 157 static int 158 vhost_kernel_set_features(struct virtio_user_dev *dev, uint64_t features) 159 { 160 struct vhost_kernel_data *data = dev->backend_data; 161 uint32_t i; 162 int ret; 163 164 /* We don't need memory protection here */ 165 features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); 166 /* VHOST kernel does not know about below flags */ 167 features &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK; 168 features &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK; 169 features &= ~(1ULL << VIRTIO_NET_F_MQ); 170 171 for (i = 0; i < dev->max_queue_pairs; ++i) { 172 if (data->vhostfds[i] < 0) 173 continue; 174 175 ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_FEATURES, &features); 176 if (ret < 0) 177 return ret; 178 } 179 180 return 0; 181 } 182 183 static int 184 add_memseg_list(const struct rte_memseg_list *msl, void *arg) 185 { 186 struct vhost_memory_kernel *vm = arg; 187 struct vhost_memory_region *mr; 188 void *start_addr; 189 uint64_t len; 190 191 if (msl->external) 192 return 0; 193 194 if (vm->nregions >= max_regions) 195 return -1; 196 197 start_addr = msl->base_va; 198 len = msl->page_sz 
* msl->memseg_arr.len; 199 200 mr = &vm->regions[vm->nregions++]; 201 202 mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr; 203 mr->userspace_addr = (uint64_t)(uintptr_t)start_addr; 204 mr->memory_size = len; 205 mr->mmap_offset = 0; /* flags_padding */ 206 207 PMD_DRV_LOG(DEBUG, "index=%u addr=%p len=%" PRIu64, 208 vm->nregions - 1, start_addr, len); 209 210 return 0; 211 } 212 213 /* By default, vhost kernel module allows 64 regions, but DPDK may 214 * have much more memory regions. Below function will treat each 215 * contiguous memory space reserved by DPDK as one region. 216 */ 217 static int 218 vhost_kernel_set_memory_table(struct virtio_user_dev *dev) 219 { 220 uint32_t i; 221 struct vhost_kernel_data *data = dev->backend_data; 222 struct vhost_memory_kernel *vm; 223 int ret; 224 225 vm = malloc(sizeof(struct vhost_memory_kernel) + 226 max_regions * 227 sizeof(struct vhost_memory_region)); 228 if (!vm) 229 goto err; 230 231 vm->nregions = 0; 232 vm->padding = 0; 233 234 /* 235 * The memory lock has already been taken by memory subsystem 236 * or virtio_user_start_device(). 
237 */ 238 ret = rte_memseg_list_walk_thread_unsafe(add_memseg_list, vm); 239 if (ret < 0) 240 goto err_free; 241 242 for (i = 0; i < dev->max_queue_pairs; ++i) { 243 if (data->vhostfds[i] < 0) 244 continue; 245 246 ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_MEM_TABLE, vm); 247 if (ret < 0) 248 goto err_free; 249 } 250 251 free(vm); 252 253 return 0; 254 err_free: 255 free(vm); 256 err: 257 PMD_DRV_LOG(ERR, "Failed to set memory table"); 258 return -1; 259 } 260 261 static int 262 vhost_kernel_set_vring(struct virtio_user_dev *dev, uint64_t req, struct vhost_vring_state *state) 263 { 264 int ret, fd; 265 unsigned int index = state->index; 266 struct vhost_kernel_data *data = dev->backend_data; 267 268 /* Convert from queue index to queue-pair & offset */ 269 fd = data->vhostfds[state->index / 2]; 270 state->index %= 2; 271 272 ret = vhost_kernel_ioctl(fd, req, state); 273 if (ret < 0) { 274 PMD_DRV_LOG(ERR, "Failed to set vring (request %" PRIu64 ")", req); 275 return -1; 276 } 277 278 /* restore index back to queue index */ 279 state->index = index; 280 281 return 0; 282 } 283 284 static int 285 vhost_kernel_set_vring_num(struct virtio_user_dev *dev, struct vhost_vring_state *state) 286 { 287 return vhost_kernel_set_vring(dev, VHOST_SET_VRING_NUM, state); 288 } 289 290 static int 291 vhost_kernel_set_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state) 292 { 293 return vhost_kernel_set_vring(dev, VHOST_SET_VRING_BASE, state); 294 } 295 296 static int 297 vhost_kernel_get_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state) 298 { 299 return vhost_kernel_set_vring(dev, VHOST_GET_VRING_BASE, state); 300 } 301 302 static int 303 vhost_kernel_set_vring_file(struct virtio_user_dev *dev, uint64_t req, 304 struct vhost_vring_file *file) 305 { 306 int ret, fd; 307 unsigned int index = file->index; 308 struct vhost_kernel_data *data = dev->backend_data; 309 310 /* Convert from queue index to queue-pair & offset */ 311 fd = 
data->vhostfds[file->index / 2]; 312 file->index %= 2; 313 314 ret = vhost_kernel_ioctl(fd, req, file); 315 if (ret < 0) { 316 PMD_DRV_LOG(ERR, "Failed to set vring file (request %" PRIu64 ")", req); 317 return -1; 318 } 319 320 /* restore index back to queue index */ 321 file->index = index; 322 323 return 0; 324 } 325 326 static int 327 vhost_kernel_set_vring_kick(struct virtio_user_dev *dev, struct vhost_vring_file *file) 328 { 329 return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_KICK, file); 330 } 331 332 static int 333 vhost_kernel_set_vring_call(struct virtio_user_dev *dev, struct vhost_vring_file *file) 334 { 335 return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_CALL, file); 336 } 337 338 static int 339 vhost_kernel_set_vring_addr(struct virtio_user_dev *dev, struct vhost_vring_addr *addr) 340 { 341 int ret, fd; 342 unsigned int index = addr->index; 343 struct vhost_kernel_data *data = dev->backend_data; 344 345 /* Convert from queue index to queue-pair & offset */ 346 fd = data->vhostfds[addr->index / 2]; 347 addr->index %= 2; 348 349 ret = vhost_kernel_ioctl(fd, VHOST_SET_VRING_ADDR, addr); 350 if (ret < 0) { 351 PMD_DRV_LOG(ERR, "Failed to set vring address"); 352 return -1; 353 } 354 355 /* restore index back to queue index */ 356 addr->index = index; 357 358 return 0; 359 } 360 361 static int 362 vhost_kernel_get_status(struct virtio_user_dev *dev __rte_unused, uint8_t *status __rte_unused) 363 { 364 return -ENOTSUP; 365 } 366 367 static int 368 vhost_kernel_set_status(struct virtio_user_dev *dev __rte_unused, uint8_t status __rte_unused) 369 { 370 return -ENOTSUP; 371 } 372 373 /** 374 * Set up environment to talk with a vhost kernel backend. 375 * 376 * @return 377 * - (-1) if fail to set up; 378 * - (>=0) if successful. 
 */
static int
vhost_kernel_setup(struct virtio_user_dev *dev)
{
	struct vhost_kernel_data *data;
	unsigned int tap_features;
	unsigned int tap_flags;
	const char *ifname;
	uint32_t q, i;
	int vhostfd;

	/* The backend requires IFF_VNET_HDR so virtio net headers can be
	 * exchanged through the tap device.
	 */
	if (tap_support_features(&tap_features) < 0)
		return -1;

	if ((tap_features & IFF_VNET_HDR) == 0) {
		PMD_INIT_LOG(ERR, "TAP does not support IFF_VNET_HDR");
		return -1;
	}

	data = malloc(sizeof(*data));
	if (!data) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost-kernel data", dev->path);
		return -1;
	}

	data->vhostfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->vhostfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost FDs", dev->path);
		goto err_data;
	}
	data->tapfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->tapfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate TAP FDs", dev->path);
		goto err_vhostfds;
	}

	/* Pre-fill both fd arrays with -1 so the err_tapfds cleanup loop
	 * can safely close only what was actually opened.
	 */
	for (q = 0; q < dev->max_queue_pairs; ++q) {
		data->vhostfds[q] = -1;
		data->tapfds[q] = -1;
	}

	get_vhost_kernel_max_regions();

	/* One vhost-net fd per queue pair, all opened from dev->path. */
	for (i = 0; i < dev->max_queue_pairs; ++i) {
		vhostfd = open(dev->path, O_RDWR);
		if (vhostfd < 0) {
			PMD_DRV_LOG(ERR, "fail to open %s, %s", dev->path, strerror(errno));
			goto err_tapfds;
		}
		data->vhostfds[i] = vhostfd;
	}

	/* "tap%d" lets the kernel pick a free interface name when the user
	 * did not supply one.
	 */
	ifname = dev->ifname != NULL ? dev->ifname : "tap%d";
	data->tapfds[0] = tap_open(ifname, (tap_features & IFF_MULTI_QUEUE) != 0);
	if (data->tapfds[0] < 0)
		goto err_tapfds;
	/* NOTE(review): tap_get_name() presumably allocates dev->ifname;
	 * ownership/free responsibility is not visible here — confirm in
	 * vhost_kernel_tap.c.
	 */
	if (dev->ifname == NULL && tap_get_name(data->tapfds[0], &dev->ifname) < 0) {
		PMD_DRV_LOG(ERR, "fail to get tap name (%d)", data->tapfds[0]);
		goto err_tapfds;
	}
	if (tap_get_flags(data->tapfds[0], &tap_flags) < 0) {
		PMD_DRV_LOG(ERR, "fail to get tap flags for tap %s", dev->ifname);
		goto err_tapfds;
	}
	if ((tap_flags & IFF_MULTI_QUEUE) == 0 && dev->max_queue_pairs > 1) {
		PMD_DRV_LOG(ERR, "tap %s does not support multi queue", dev->ifname);
		goto err_tapfds;
	}

	/* Remaining queue pairs attach to the same (multi-queue) tap device. */
	for (i = 1; i < dev->max_queue_pairs; i++) {
		data->tapfds[i] = tap_open(dev->ifname, true);
		if (data->tapfds[i] < 0)
			goto err_tapfds;
	}

	dev->backend_data = data;

	return 0;

err_tapfds:
	/* Close every fd opened so far; untouched slots are still -1. */
	for (i = 0; i < dev->max_queue_pairs; i++) {
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);
		if (data->tapfds[i] >= 0)
			close(data->tapfds[i]);
	}

	free(data->tapfds);
err_vhostfds:
	free(data->vhostfds);
err_data:
	free(data);

	return -1;
}

/* Tear down everything vhost_kernel_setup() created: close all fds,
 * free the fd arrays and the backend data. Idempotent when no backend
 * data exists. Always returns 0.
 */
static int
vhost_kernel_destroy(struct virtio_user_dev *dev)
{
	struct vhost_kernel_data *data = dev->backend_data;
	uint32_t i;

	if (!data)
		return 0;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);
		if (data->tapfds[i] >= 0)
			close(data->tapfds[i]);
	}

	free(data->vhostfds);
	free(data->tapfds);
	free(data);
	dev->backend_data = NULL;

	return 0;
}

/* Bind (tapfd >= 0) or unbind (tapfd == -1) the tap device to both
 * rings (index 0 = rx, 1 = tx) of one vhost-net fd.
 * Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_set_backend(int vhostfd, int tapfd)
{
	struct vhost_vring_file f;

	f.fd = tapfd;
	f.index = 0;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
				strerror(errno));
		return -1;
	}

	f.index = 1;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

/* Enable or disable one queue pair: configure the tap device's vnet
 * header size (and MAC, once, on first start) and attach it to the
 * vhost fd, or detach it when disabling. No-op if the pair is already
 * in the requested state. Returns 0 on success, -1 on failure.
 */
static int
vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
		uint16_t pair_idx,
		int enable)
{
	struct vhost_kernel_data *data = dev->backend_data;
	int hdr_size;
	int vhostfd;
	int tapfd;

	if (dev->qp_enabled[pair_idx] == enable)
		return 0;

	vhostfd = data->vhostfds[pair_idx];
	tapfd = data->tapfds[pair_idx];

	if (!enable) {
		/* Detach by passing fd -1 as the backend. */
		if (vhost_kernel_set_backend(vhostfd, -1) < 0) {
			PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
			return -1;
		}
		dev->qp_enabled[pair_idx] = false;
		return 0;
	}

	/* MRG_RXBUF or VERSION_1 implies the larger mergeable-rxbuf header. */
	if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
	    (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_size = sizeof(struct virtio_net_hdr);

	/* Set mac on tap only once when starting */
	if (!dev->started && pair_idx == 0 &&
	    tap_set_mac(data->tapfds[pair_idx], dev->mac_addr) < 0)
		return -1;

	if (vhost_kernel_tap_setup(tapfd, hdr_size, dev->features) < 0) {
		PMD_DRV_LOG(ERR, "fail to setup tap for vhost kernel");
		return -1;
	}

	if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) {
		PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
		return -1;
	}

	dev->qp_enabled[pair_idx] = true;
	return 0;
}

/* Vhost-kernel exposes no extra backend features beyond the device
 * feature bits. Always succeeds.
 */
static int
vhost_kernel_get_backend_features(uint64_t *features)
{
	*features = 0;

	return 0;
}

static int
vhost_kernel_update_link_state(struct virtio_user_dev *dev __rte_unused)
{
	/* Nothing to update (Maybe get TAP interface link state?) */
	return 0;
}

static int
vhost_kernel_get_intr_fd(struct virtio_user_dev *dev __rte_unused)
{
	/* No link state interrupt with Vhost-kernel */
	return -1;
}

/* Backend ops table registered with the virtio-user framework. */
struct virtio_user_backend_ops virtio_ops_kernel = {
	.setup = vhost_kernel_setup,
	.destroy = vhost_kernel_destroy,
	.get_backend_features = vhost_kernel_get_backend_features,
	.set_owner = vhost_kernel_set_owner,
	.get_features = vhost_kernel_get_features,
	.set_features = vhost_kernel_set_features,
	.set_memory_table = vhost_kernel_set_memory_table,
	.set_vring_num = vhost_kernel_set_vring_num,
	.set_vring_base = vhost_kernel_set_vring_base,
	.get_vring_base = vhost_kernel_get_vring_base,
	.set_vring_call = vhost_kernel_set_vring_call,
	.set_vring_kick = vhost_kernel_set_vring_kick,
	.set_vring_addr = vhost_kernel_set_vring_addr,
	.get_status = vhost_kernel_get_status,
	.set_status = vhost_kernel_set_status,
	.enable_qp = vhost_kernel_enable_queue_pair,
	.update_link_state = vhost_kernel_update_link_state,
	.get_intr_fd = vhost_kernel_get_intr_fd,
};