/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 Intel Corporation
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include <rte_memory.h>

#include "vhost.h"
#include "virtio_user_dev.h"
#include "vhost_kernel_tap.h"

struct vhost_kernel_data {
	int *vhostfds;
	int *tapfds;
};

struct vhost_memory_kernel {
	uint32_t nregions;
	uint32_t padding;
	struct vhost_memory_region regions[];
};

/* vhost kernel ioctls */
#define VHOST_VIRTIO 0xAF
#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
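/* A minimal sketch of the call sequence this backend drives against a
 * vhost-net fd, mirroring the functions below (illustrative only, error
 * handling omitted):
 *
 *   int fd = open("/dev/vhost-net", O_RDWR);
 *   ioctl(fd, VHOST_SET_OWNER, NULL);
 *   ioctl(fd, VHOST_GET_FEATURES, &features);
 *   ioctl(fd, VHOST_SET_FEATURES, &features);
 *   ioctl(fd, VHOST_SET_MEM_TABLE, vm);
 *   ... per-vring NUM / BASE / ADDR / KICK / CALL setup ...
 *   ioctl(fd, VHOST_NET_SET_BACKEND, &file);   (bind a tap queue)
 */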
/* With the features below, the vhost kernel backend does not need to
 * perform checksum or TSO: this information is passed to virtio_user
 * through the virtio-net header.
 */
#define VHOST_KERNEL_GUEST_OFFLOADS_MASK	\
	((1ULL << VIRTIO_NET_F_GUEST_CSUM) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_ECN)  |	\
	 (1ULL << VIRTIO_NET_F_GUEST_UFO))

/* With the features below, for flows from virtio_user to the vhost kernel:
 * (1) if a flow goes up through the kernel networking stack, the stack does
 *     not need to verify the checksum, which saves CPU cycles;
 * (2) if a flow goes through a Linux bridge and out of a (kernel-driver)
 *     interface, checksum and TSO are done by GSO in the kernel, or even
 *     offloaded to the physical device.
 */
#define VHOST_KERNEL_HOST_OFFLOADS_MASK		\
	((1ULL << VIRTIO_NET_F_HOST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_HOST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_CSUM))

static uint64_t max_regions = 64;

static void
get_vhost_kernel_max_regions(void)
{
	int fd;
	char buf[20] = {'\0'};

	fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY);
	if (fd < 0)
		return;

	if (read(fd, buf, sizeof(buf) - 1) > 0)
		max_regions = strtoull(buf, NULL, 10);

	close(fd);
}

static int
vhost_kernel_ioctl(int fd, uint64_t request, void *arg)
{
	int ret;

	ret = ioctl(fd, request, arg);
	if (ret) {
		PMD_DRV_LOG(ERR, "Vhost-kernel ioctl %"PRIu64" failed (%s)",
				request, strerror(errno));
		return -1;
	}

	return 0;
}

static int
vhost_kernel_set_owner(struct virtio_user_dev *dev)
{
	int ret;
	uint32_t i;
	struct vhost_kernel_data *data = dev->backend_data;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_OWNER, NULL);
		if (ret < 0)
			return ret;
	}

	return 0;
}

static int
vhost_kernel_get_features(struct virtio_user_dev *dev, uint64_t *features)
{
	int ret;
	unsigned int tap_features;
	struct vhost_kernel_data *data = dev->backend_data;

	ret = vhost_kernel_ioctl(data->vhostfds[0], VHOST_GET_FEATURES, features);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to get features");
		return -1;
	}

	ret = tap_support_features(&tap_features);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to get TAP features");
		return -1;
	}

	/* With tap as the backend, all these features are supported but not
	 * claimed by vhost-net, so we add them back when reporting to the
	 * upper layer.
	 */
	if (tap_features & IFF_VNET_HDR) {
		*features |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
		*features |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
	}

	/* vhost-net does not advertise this feature, but multi-queue is
	 * supported when the tap device is.
	 */
	if (tap_features & IFF_MULTI_QUEUE)
		*features |= (1ull << VIRTIO_NET_F_MQ);

	return 0;
}
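/* The net effect of the negotiation (illustrative): bits in the two
 * offload masks and VIRTIO_NET_F_MQ are advertised to the virtio layer
 * based on tap capabilities, but they are stripped again before
 * VHOST_SET_FEATURES below, since vhost-net itself knows nothing about
 * them.
 */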
static int
vhost_kernel_set_features(struct virtio_user_dev *dev, uint64_t features)
{
	struct vhost_kernel_data *data = dev->backend_data;

	/* We don't need memory protection here */
	features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	/* The vhost kernel backend does not know about the flags below */
	features &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK;
	features &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK;
	features &= ~(1ULL << VIRTIO_NET_F_MQ);

	return vhost_kernel_ioctl(data->vhostfds[0], VHOST_SET_FEATURES, &features);
}

static int
add_memseg_list(const struct rte_memseg_list *msl, void *arg)
{
	struct vhost_memory_kernel *vm = arg;
	struct vhost_memory_region *mr;
	void *start_addr;
	uint64_t len;

	if (msl->external)
		return 0;

	if (vm->nregions >= max_regions)
		return -1;

	start_addr = msl->base_va;
	len = msl->page_sz * msl->memseg_arr.len;

	mr = &vm->regions[vm->nregions++];

	mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
	mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
	mr->memory_size = len;
	mr->mmap_offset = 0; /* flags_padding */

	PMD_DRV_LOG(DEBUG, "index=%u addr=%p len=%" PRIu64,
			vm->nregions - 1, start_addr, len);

	return 0;
}
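/* Shape of the table handed to VHOST_SET_MEM_TABLE (sketch): each region
 * maps a whole memseg list, with guest_phys_addr == userspace_addr ==
 * the list's base VA. virtio_user lives in the same process as DPDK, so
 * "guest physical" addresses are simply process virtual addresses.
 */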
/* By default, the vhost kernel module allows 64 regions, but DPDK may
 * have many more memory regions. The function below treats each
 * contiguous memory space reserved by DPDK as one region.
 */
static int
vhost_kernel_set_memory_table(struct virtio_user_dev *dev)
{
	uint32_t i;
	struct vhost_kernel_data *data = dev->backend_data;
	struct vhost_memory_kernel *vm;
	int ret;

	vm = malloc(sizeof(struct vhost_memory_kernel) +
			max_regions *
			sizeof(struct vhost_memory_region));
	if (!vm)
		goto err;

	vm->nregions = 0;
	vm->padding = 0;

	/*
	 * The memory lock has already been taken by the memory subsystem
	 * or virtio_user_start_device().
	 */
	ret = rte_memseg_list_walk_thread_unsafe(add_memseg_list, vm);
	if (ret < 0)
		goto err_free;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] < 0)
			continue;

		ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_MEM_TABLE, vm);
		if (ret < 0)
			goto err_free;
	}

	free(vm);

	return 0;
err_free:
	free(vm);
err:
	PMD_DRV_LOG(ERR, "Failed to set memory table");
	return -1;
}

static int
vhost_kernel_set_vring(struct virtio_user_dev *dev, uint64_t req, struct vhost_vring_state *state)
{
	int ret, fd;
	unsigned int index = state->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from the global queue index to a per-pair fd plus a local
	 * vring index: virtqueue q belongs to queue pair q / 2 and is vring
	 * q % 2 (0 = RX, 1 = TX) on that pair's vhost-net fd.
	 */
	fd = data->vhostfds[state->index / 2];
	state->index %= 2;

	ret = vhost_kernel_ioctl(fd, req, state);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring (request %" PRIu64 ")", req);
		return -1;
	}

	/* Restore the global queue index */
	state->index = index;

	return 0;
}

static int
vhost_kernel_set_vring_num(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_SET_VRING_NUM, state);
}

static int
vhost_kernel_set_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_SET_VRING_BASE, state);
}

static int
vhost_kernel_get_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
{
	return vhost_kernel_set_vring(dev, VHOST_GET_VRING_BASE, state);
}

static int
vhost_kernel_set_vring_file(struct virtio_user_dev *dev, uint64_t req,
		struct vhost_vring_file *file)
{
	int ret, fd;
	unsigned int index = file->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from the global queue index to a per-pair fd plus a local
	 * vring index (see vhost_kernel_set_vring()).
	 */
	fd = data->vhostfds[file->index / 2];
	file->index %= 2;

	ret = vhost_kernel_ioctl(fd, req, file);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring file (request %" PRIu64 ")", req);
		return -1;
	}

	/* Restore the global queue index */
	file->index = index;

	return 0;
}

static int
vhost_kernel_set_vring_kick(struct virtio_user_dev *dev, struct vhost_vring_file *file)
{
	return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_KICK, file);
}

static int
vhost_kernel_set_vring_call(struct virtio_user_dev *dev, struct vhost_vring_file *file)
{
	return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_CALL, file);
}

static int
vhost_kernel_set_vring_addr(struct virtio_user_dev *dev, struct vhost_vring_addr *addr)
{
	int ret, fd;
	unsigned int index = addr->index;
	struct vhost_kernel_data *data = dev->backend_data;

	/* Convert from the global queue index to a per-pair fd plus a local
	 * vring index (see vhost_kernel_set_vring()).
	 */
	fd = data->vhostfds[addr->index / 2];
	addr->index %= 2;

	ret = vhost_kernel_ioctl(fd, VHOST_SET_VRING_ADDR, addr);
	if (ret < 0) {
		PMD_DRV_LOG(ERR, "Failed to set vring address");
		return -1;
	}

	/* Restore the global queue index */
	addr->index = index;

	return 0;
}

static int
vhost_kernel_get_status(struct virtio_user_dev *dev __rte_unused, uint8_t *status __rte_unused)
{
	return -ENOTSUP;
}

static int
vhost_kernel_set_status(struct virtio_user_dev *dev __rte_unused, uint8_t status __rte_unused)
{
	return -ENOTSUP;
}
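/* Note: unlike the vhost-user and vhost-vdpa backends, the vhost-net
 * kernel module exposes no ioctls for reading or writing the virtio
 * device status, so the two stubs above report -ENOTSUP and leave it to
 * the caller to cope with the absence of status support.
 */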
/**
 * Set up the environment to talk with a vhost kernel backend.
 *
 * @return
 *   - (-1) if setup fails;
 *   - (0) if successful.
 */
static int
vhost_kernel_setup(struct virtio_user_dev *dev)
{
	int vhostfd;
	uint32_t q, i;
	struct vhost_kernel_data *data;

	data = malloc(sizeof(*data));
	if (!data) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost-kernel data", dev->path);
		return -1;
	}

	data->vhostfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->vhostfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost FDs", dev->path);
		goto err_data;
	}
	data->tapfds = malloc(dev->max_queue_pairs * sizeof(int));
	if (!data->tapfds) {
		PMD_INIT_LOG(ERR, "(%s) Failed to allocate TAP FDs", dev->path);
		goto err_vhostfds;
	}

	for (q = 0; q < dev->max_queue_pairs; ++q) {
		data->vhostfds[q] = -1;
		data->tapfds[q] = -1;
	}

	get_vhost_kernel_max_regions();

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		vhostfd = open(dev->path, O_RDWR);
		if (vhostfd < 0) {
			PMD_DRV_LOG(ERR, "Failed to open %s: %s", dev->path, strerror(errno));
			goto err_tapfds;
		}

		data->vhostfds[i] = vhostfd;
	}

	dev->backend_data = data;

	return 0;

err_tapfds:
	for (i = 0; i < dev->max_queue_pairs; i++)
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);

	free(data->tapfds);
err_vhostfds:
	free(data->vhostfds);
err_data:
	free(data);

	return -1;
}

static int
vhost_kernel_destroy(struct virtio_user_dev *dev)
{
	struct vhost_kernel_data *data = dev->backend_data;
	uint32_t i;

	if (!data)
		return 0;

	for (i = 0; i < dev->max_queue_pairs; ++i) {
		if (data->vhostfds[i] >= 0)
			close(data->vhostfds[i]);
		if (data->tapfds[i] >= 0)
			close(data->tapfds[i]);
	}

	free(data->vhostfds);
	free(data->tapfds);
	free(data);
	dev->backend_data = NULL;

	return 0;
}

static int
vhost_kernel_set_backend(int vhostfd, int tapfd)
{
	struct vhost_vring_file f;

	/* Bind the tap fd (or -1 to detach) to both vrings of this pair:
	 * index 0 is the RX ring and index 1 the TX ring.
	 */
	f.fd = tapfd;
	f.index = 0;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND failed: %s",
				strerror(errno));
		return -1;
	}

	f.index = 1;
	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND failed: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}
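/* Enabling a queue pair binds an existing (or freshly opened) tap queue
 * to the pair's vhost-net fd; disabling detaches the backend (fd = -1)
 * and, in multi-queue mode, parks the tap queue via
 * vhost_kernel_tap_set_queue().
 */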
static int
vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
			       uint16_t pair_idx,
			       int enable)
{
	int hdr_size;
	int vhostfd;
	int tapfd;
	int req_mq = (dev->max_queue_pairs > 1);
	struct vhost_kernel_data *data = dev->backend_data;

	vhostfd = data->vhostfds[pair_idx];

	if (dev->qp_enabled[pair_idx] == enable)
		return 0;

	if (!enable) {
		tapfd = data->tapfds[pair_idx];
		if (vhost_kernel_set_backend(vhostfd, -1) < 0) {
			PMD_DRV_LOG(ERR, "Failed to set backend for vhost kernel");
			return -1;
		}
		if (req_mq && vhost_kernel_tap_set_queue(tapfd, false) < 0) {
			PMD_DRV_LOG(ERR, "Failed to disable tap for vhost kernel");
			return -1;
		}
		dev->qp_enabled[pair_idx] = false;
		return 0;
	}

	if (data->tapfds[pair_idx] >= 0) {
		tapfd = data->tapfds[pair_idx];
		if (vhost_kernel_tap_set_offload(tapfd, dev->features) == -1)
			return -1;
		if (req_mq && vhost_kernel_tap_set_queue(tapfd, true) < 0) {
			PMD_DRV_LOG(ERR, "Failed to enable tap for vhost kernel");
			return -1;
		}
		goto set_backend;
	}

	/* With VIRTIO_NET_F_MRG_RXBUF or VIRTIO_F_VERSION_1 the vnet header
	 * carries num_buffers, so the tap must use the larger header.
	 */
	if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
	    (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		hdr_size = sizeof(struct virtio_net_hdr);

	tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq,
			(char *)dev->mac_addr, dev->features);
	if (tapfd < 0) {
		PMD_DRV_LOG(ERR, "Failed to open tap for vhost kernel");
		return -1;
	}

	data->tapfds[pair_idx] = tapfd;

set_backend:
	if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) {
		PMD_DRV_LOG(ERR, "Failed to set backend for vhost kernel");
		return -1;
	}

	dev->qp_enabled[pair_idx] = true;
	return 0;
}

static int
vhost_kernel_get_backend_features(uint64_t *features)
{
	*features = 0;

	return 0;
}

static int
vhost_kernel_update_link_state(struct virtio_user_dev *dev __rte_unused)
{
	/* Nothing to update (maybe get the TAP interface link state?) */
	return 0;
}

static int
vhost_kernel_get_intr_fd(struct virtio_user_dev *dev __rte_unused)
{
	/* No link state interrupt with Vhost-kernel */
	return -1;
}

struct virtio_user_backend_ops virtio_ops_kernel = {
	.setup = vhost_kernel_setup,
	.destroy = vhost_kernel_destroy,
	.get_backend_features = vhost_kernel_get_backend_features,
	.set_owner = vhost_kernel_set_owner,
	.get_features = vhost_kernel_get_features,
	.set_features = vhost_kernel_set_features,
	.set_memory_table = vhost_kernel_set_memory_table,
	.set_vring_num = vhost_kernel_set_vring_num,
	.set_vring_base = vhost_kernel_set_vring_base,
	.get_vring_base = vhost_kernel_get_vring_base,
	.set_vring_call = vhost_kernel_set_vring_call,
	.set_vring_kick = vhost_kernel_set_vring_kick,
	.set_vring_addr = vhost_kernel_set_vring_addr,
	.get_status = vhost_kernel_get_status,
	.set_status = vhost_kernel_set_status,
	.enable_qp = vhost_kernel_enable_queue_pair,
	.update_link_state = vhost_kernel_update_link_state,
	.get_intr_fd = vhost_kernel_get_intr_fd,
};