/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 Intel Corporation
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include <rte_memory.h>

#include "vhost.h"
#include "virtio_user_dev.h"
#include "vhost_kernel_tap.h"

struct vhost_kernel_data {
	int *vhostfds;
	int *tapfds;
};

struct vhost_memory_kernel {
	uint32_t nregions;
	uint32_t padding;
	struct vhost_memory_region regions[];
};

/* vhost kernel ioctls */
#define VHOST_VIRTIO 0xAF
#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)

/* With the features below, the vhost kernel backend does not need to
 * compute checksums or do TSO itself; this information is passed to
 * virtio_user through the virtio net header.
 */
#define VHOST_KERNEL_GUEST_OFFLOADS_MASK	\
	((1ULL << VIRTIO_NET_F_GUEST_CSUM) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_ECN)  |	\
	 (1ULL << VIRTIO_NET_F_GUEST_UFO))

/* With the features below, for flows from virtio_user to the vhost kernel:
 * (1) if a flow goes up through the kernel networking stack, the kernel
 *     does not need to verify its checksum, which saves CPU cycles;
 * (2) if a flow goes through a Linux bridge and out through an interface
 *     (kernel driver), checksum and TSO are done by GSO in the kernel, or
 *     even offloaded to the physical device.
 */
#define VHOST_KERNEL_HOST_OFFLOADS_MASK		\
	((1ULL << VIRTIO_NET_F_HOST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_HOST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_CSUM))

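/*
 * Illustrative sketch of how the two masks above cooperate during feature
 * negotiation (the real logic lives in vhost_kernel_get_features() and
 * vhost_kernel_set_features() below): when the TAP device supports
 * IFF_VNET_HDR, both masks are added to what vhost-net reports upward,
 *
 *	features |= VHOST_KERNEL_GUEST_OFFLOADS_MASK |
 *		    VHOST_KERNEL_HOST_OFFLOADS_MASK;
 *
 * and both are stripped again before VHOST_SET_FEATURES, since vhost-net
 * itself does not recognize these flags.
 */
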
static uint64_t max_regions = 64;

static void
get_vhost_kernel_max_regions(void)
{
	int fd;
	char buf[20] = {'\0'};

	fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY);
	if (fd < 0)
		return;

	if (read(fd, buf, sizeof(buf) - 1) > 0)
		max_regions = strtoull(buf, NULL, 10);

	close(fd);
}

static int
vhost_kernel_ioctl(int fd, uint64_t request, void *arg)
{
	int ret;

	ret = ioctl(fd, request, arg);
	if (ret) {
		PMD_DRV_LOG(ERR, "Vhost-kernel ioctl %"PRIu64" failed (%s)",
				request, strerror(errno));
		return -1;
	}

	return 0;
}

static int
vhost_kernel_set_owner(struct virtio_user_dev *dev)
{
	int ret;
	uint32_t i;
	struct vhost_kernel_data *data = dev->backend_data;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_OWNER, NULL);
		if (ret < 0)
			return ret;
	}

	return 0;
}

static int
vhost_kernel_get_features(struct virtio_user_dev *dev, uint64_t *features)
{
	int ret;
	unsigned int tap_features;
	struct vhost_kernel_data *data = dev->backend_data;

	ret = vhost_kernel_ioctl(data->vhostfds[0], VHOST_GET_FEATURES, features);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to get features");
		return -1;
	}

	ret = tap_support_features(&tap_features);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to get TAP features");
		return -1;
	}

	/* With TAP as the backend, all these features are supported
	 * but not claimed by vhost-net, so we add them back when
	 * reporting to the upper layer.
	 */
	if (tap_features & IFF_VNET_HDR) {
		*features |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
		*features |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
	}

	/* vhost-net does not declare this feature, but it does
	 * support multi-queue.
	 */
	if (tap_features & IFF_MULTI_QUEUE)
		*features |= (1ull << VIRTIO_NET_F_MQ);

	return 0;
}

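/*
 * A usage sketch for get_vhost_kernel_max_regions() above: the limit it
 * reads is a vhost module parameter, which can be inspected (and, on
 * kernels that allow it, raised at module load time; the value 509 below
 * is only an example):
 *
 *	cat /sys/module/vhost/parameters/max_mem_regions
 *	modprobe -r vhost_net vhost && modprobe vhost max_mem_regions=509
 *
 * If the sysfs file is absent, the default of 64 regions is kept.
 */
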
static int
vhost_kernel_set_features(struct virtio_user_dev *dev, uint64_t features)
{
	struct vhost_kernel_data *data = dev->backend_data;
	uint32_t i;
	int ret;

	/* We don't need memory protection here */
	features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	/* The vhost kernel backend does not know about the flags below */
	features &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK;
	features &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK;
	features &= ~(1ULL << VIRTIO_NET_F_MQ);

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_FEATURES, &features);
		if (ret < 0)
			return ret;
	}

	return 0;
}

static int
add_memseg_list(const struct rte_memseg_list *msl, void *arg)
{
	struct vhost_memory_kernel *vm = arg;
	struct vhost_memory_region *mr;
	void *start_addr;
	uint64_t len;

	if (msl->external)
		return 0;

	if (vm->nregions >= max_regions)
		return -1;

	start_addr = msl->base_va;
	len = msl->page_sz * msl->memseg_arr.len;

	mr = &vm->regions[vm->nregions++];

	mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
	mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
	mr->memory_size = len;
	mr->mmap_offset = 0; /* flags_padding */

	PMD_DRV_LOG(DEBUG, "index=%u addr=%p len=%" PRIu64,
			vm->nregions - 1, start_addr, len);

	return 0;
}

/* By default, the vhost kernel module allows only 64 regions, but DPDK
 * may have many more memory regions. The function below treats each
 * contiguous memory space reserved by DPDK as one region.
 */
static int
vhost_kernel_set_memory_table(struct virtio_user_dev *dev)
{
	uint32_t i;
	struct vhost_kernel_data *data = dev->backend_data;
	struct vhost_memory_kernel *vm;
	int ret;

	vm = malloc(sizeof(struct vhost_memory_kernel) +
			max_regions *
			sizeof(struct vhost_memory_region));
	if (!vm)
		goto err;

	vm->nregions = 0;
	vm->padding = 0;

	/*
	 * The memory lock has already been taken by the memory subsystem
	 * or virtio_user_start_device().
	 */
	ret = rte_memseg_list_walk_thread_unsafe(add_memseg_list, vm);
	if (ret < 0)
		goto err_free;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_MEM_TABLE, vm);
		if (ret < 0)
			goto err_free;
	}

	free(vm);

	return 0;
err_free:
	free(vm);
err:
	PMD_DRV_LOG(ERR, "Failed to set memory table");
	return -1;
}

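/*
 * The vring helpers below all share one index convention: callers pass a
 * flat virtqueue index, while each vhost-net fd only knows ring 0 (rx) and
 * ring 1 (tx) of its own queue pair. Worked example: virtqueue index 5
 * selects data->vhostfds[5 / 2], i.e. the third queue pair, with ring
 * index 5 % 2 == 1 (its tx ring). The original index is restored before
 * returning so the caller's structure is left unchanged.
 */
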
static int
vhost_kernel_set_vring(struct virtio_user_dev *dev, uint64_t req, struct vhost_vring_state *state)
{
	int ret, fd;
	unsigned int index = state->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from queue index to queue-pair & offset */
	fd = data->vhostfds[state->index / 2];
	state->index %= 2;

	ret = vhost_kernel_ioctl(fd, req, state);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring (request %" PRIu64 ")", req);
		return -1;
	}

	/* Restore the index back to the queue index */
	state->index = index;

	return 0;
}

static int
vhost_kernel_set_vring_num(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_SET_VRING_NUM, state);
}

static int
vhost_kernel_set_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_SET_VRING_BASE, state);
}

static int
vhost_kernel_get_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_GET_VRING_BASE, state);
}

static int
vhost_kernel_set_vring_file(struct virtio_user_dev *dev, uint64_t req,
		struct vhost_vring_file *file)
{
	int ret, fd;
	unsigned int index = file->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from queue index to queue-pair & offset */
	fd = data->vhostfds[file->index / 2];
	file->index %= 2;

	ret = vhost_kernel_ioctl(fd, req, file);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring file (request %" PRIu64 ")", req);
		return -1;
	}

	/* Restore the index back to the queue index */
	file->index = index;

	return 0;
}

static int
vhost_kernel_set_vring_kick(struct virtio_user_dev *dev, struct vhost_vring_file *file)
{
	return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_KICK, file);
}

static int
vhost_kernel_set_vring_call(struct virtio_user_dev *dev, struct vhost_vring_file *file)
{
	return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_CALL, file);
}

static int
vhost_kernel_set_vring_addr(struct virtio_user_dev *dev, struct vhost_vring_addr *addr)
{
	int ret, fd;
	unsigned int index = addr->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from queue index to queue-pair & offset */
	fd = data->vhostfds[addr->index / 2];
	addr->index %= 2;

	ret = vhost_kernel_ioctl(fd, VHOST_SET_VRING_ADDR, addr);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring address");
		return -1;
	}

	/* Restore the index back to the queue index */
	addr->index = index;

	return 0;
}

static int
vhost_kernel_get_status(struct virtio_user_dev *dev __rte_unused, uint8_t *status __rte_unused)
{
	return -ENOTSUP;
}

static int
vhost_kernel_set_status(struct virtio_user_dev *dev __rte_unused, uint8_t status __rte_unused)
{
	return -ENOTSUP;
}

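/*
 * A minimal sketch of the fd layout built by vhost_kernel_setup() below,
 * assuming max_queue_pairs == 2 and dev->path == "/dev/vhost-net" (the
 * usual character device; the actual path comes from the user):
 *
 *	data->vhostfds[0]  <- open("/dev/vhost-net", O_RDWR)
 *	data->vhostfds[1]  <- a second, independent open()
 *	data->tapfds[0..1] <- -1 until the pair is first enabled
 *
 * One vhost-net instance drives exactly one rx/tx ring pair, hence one fd
 * per queue pair.
 */
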
/**
 * Set up the environment to talk to a vhost kernel backend.
 *
 * @return
 *   - (-1) if setup failed;
 *   - 0 if successful.
 */
static int
vhost_kernel_setup(struct virtio_user_dev *dev)
{
	int vhostfd;
	uint32_t q, i;
	struct vhost_kernel_data *data;

	data = malloc(sizeof(*data));
	if (!data) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost-kernel data", dev->path);
		return -1;
	}

	data->vhostfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->vhostfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost FDs", dev->path);
		goto err_data;
	}
	data->tapfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->tapfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate TAP FDs", dev->path);
		goto err_vhostfds;
	}

	for (q = 0; q < dev->max_queue_pairs; ++q) {
		data->vhostfds[q] = -1;
		data->tapfds[q] = -1;
	}

	get_vhost_kernel_max_regions();

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		vhostfd = open(dev->path, O_RDWR);
		if (vhostfd < 0) {
			PMD_DRV_LOG(ERR, "Failed to open %s: %s", dev->path, strerror(errno));
			goto err_tapfds;
		}

		data->vhostfds[i] = vhostfd;
	}

	dev->backend_data = data;

	return 0;

err_tapfds:
	for (i = 0; i < dev->max_queue_pairs; i++)
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);

	free(data->tapfds);
err_vhostfds:
	free(data->vhostfds);
err_data:
	free(data);

	return -1;
}

static int
vhost_kernel_destroy(struct virtio_user_dev *dev)
{
	struct vhost_kernel_data *data = dev->backend_data;
	uint32_t i;

	if (!data)
		return 0;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);
		if (data->tapfds[i] >= 0)
			close(data->tapfds[i]);
	}

	free(data->vhostfds);
	free(data->tapfds);
	free(data);
	dev->backend_data = NULL;

	return 0;
}

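/*
 * VHOST_NET_SET_BACKEND attaches a TAP fd to one ring of a vhost-net
 * instance; passing fd == -1 detaches it. vhost_kernel_set_backend()
 * below attaches the same TAP fd to both ring 0 (rx) and ring 1 (tx),
 * and the disable path in vhost_kernel_enable_queue_pair() uses the
 * detach form, e.g.:
 *
 *	vhost_kernel_set_backend(vhostfd, -1);
 */
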
static int
vhost_kernel_set_backend(int vhostfd, int tapfd)
{
	struct vhost_vring_file f;

	f.fd = tapfd;
	f.index = 0;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND failed: %s",
				strerror(errno));
		return -1;
	}

	f.index = 1;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND failed: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

static int
vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
		uint16_t pair_idx,
		int enable)
{
	int hdr_size;
	int vhostfd;
	int tapfd;
	int req_mq = (dev->max_queue_pairs > 1);
	struct vhost_kernel_data *data = dev->backend_data;

	vhostfd = data->vhostfds[pair_idx];

	if (dev->qp_enabled[pair_idx] == enable)
		return 0;

	if (!enable) {
		tapfd = data->tapfds[pair_idx];
		if (vhost_kernel_set_backend(vhostfd, -1) < 0) {
			PMD_DRV_LOG(ERR, "Failed to set backend for vhost kernel");
			return -1;
		}
		if (req_mq && vhost_kernel_tap_set_queue(tapfd, false) < 0) {
			PMD_DRV_LOG(ERR, "Failed to disable TAP for vhost kernel");
			return -1;
		}
		dev->qp_enabled[pair_idx] = false;
		return 0;
	}

	if (data->tapfds[pair_idx] >= 0) {
		tapfd = data->tapfds[pair_idx];
		if (vhost_kernel_tap_set_offload(tapfd, dev->features) == -1)
			return -1;
		if (req_mq && vhost_kernel_tap_set_queue(tapfd, true) < 0) {
			PMD_DRV_LOG(ERR, "Failed to enable TAP for vhost kernel");
			return -1;
		}
		goto set_backend;
	}

	if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
	    (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_size = sizeof(struct virtio_net_hdr);

	tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq,
			(char *)dev->mac_addr, dev->features);
	if (tapfd < 0) {
		PMD_DRV_LOG(ERR, "Failed to open TAP for vhost kernel");
		return -1;
	}

	data->tapfds[pair_idx] = tapfd;

set_backend:
	if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) {
		PMD_DRV_LOG(ERR, "Failed to set backend for vhost kernel");
		return -1;
	}

	dev->qp_enabled[pair_idx] = true;
	return 0;
}

static int
vhost_kernel_get_backend_features(uint64_t *features)
{
	*features = 0;

	return 0;
}

static int
vhost_kernel_update_link_state(struct virtio_user_dev *dev __rte_unused)
{
	/* Nothing to update (maybe get the TAP interface link state?) */
	return 0;
}

static int
vhost_kernel_get_intr_fd(struct virtio_user_dev *dev __rte_unused)
{
	/* No link state interrupt with vhost-kernel */
	return -1;
}

struct virtio_user_backend_ops virtio_ops_kernel = {
	.setup = vhost_kernel_setup,
	.destroy = vhost_kernel_destroy,
	.get_backend_features = vhost_kernel_get_backend_features,
	.set_owner = vhost_kernel_set_owner,
	.get_features = vhost_kernel_get_features,
	.set_features = vhost_kernel_set_features,
	.set_memory_table = vhost_kernel_set_memory_table,
	.set_vring_num = vhost_kernel_set_vring_num,
	.set_vring_base = vhost_kernel_set_vring_base,
	.get_vring_base = vhost_kernel_get_vring_base,
	.set_vring_call = vhost_kernel_set_vring_call,
	.set_vring_kick = vhost_kernel_set_vring_kick,
	.set_vring_addr = vhost_kernel_set_vring_addr,
	.get_status = vhost_kernel_get_status,
	.set_status = vhost_kernel_set_status,
	.enable_qp = vhost_kernel_enable_queue_pair,
	.update_link_state = vhost_kernel_update_link_state,
	.get_intr_fd = vhost_kernel_get_intr_fd,
};

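/*
 * Usage sketch (illustrative; the exact call sites live in the generic
 * virtio_user device code): the core keeps a pointer to this ops table
 * and drives the backend through it, so vhost-kernel, vhost-user and
 * vhost-vdpa all sit behind the same interface:
 *
 *	dev->ops = &virtio_ops_kernel;
 *	if (dev->ops->setup(dev) < 0)
 *		return -1;
 *	dev->ops->set_owner(dev);
 */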