1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2016 Intel Corporation
3 */
4
5 #include <sys/types.h>
6 #include <sys/stat.h>
7 #include <fcntl.h>
8 #include <unistd.h>
9 #include <errno.h>
10 #include <stdlib.h>
11
12 #include <rte_memory.h>
13
14 #include "vhost.h"
15 #include "virtio_user_dev.h"
16 #include "vhost_kernel_tap.h"
17
/* Per-device private state of the vhost-kernel backend. */
struct vhost_kernel_data {
	int *vhostfds;	/* vhost device fds, one entry per queue pair (-1 if unused) */
	int *tapfds;	/* TAP fds, one entry per queue pair (-1 if unused) */
};
22
23 struct vhost_memory_kernel {
24 uint32_t nregions;
25 uint32_t padding;
26 struct vhost_memory_region regions[];
27 };
28
/* vhost kernel ioctl request codes */
#define VHOST_VIRTIO		0xAF
#define VHOST_GET_FEATURES	_IOR(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_FEATURES	_IOW(VHOST_VIRTIO, 0x00, __u64)
#define VHOST_SET_OWNER		_IO(VHOST_VIRTIO, 0x01)
#define VHOST_RESET_OWNER	_IO(VHOST_VIRTIO, 0x02)
#define VHOST_SET_MEM_TABLE	_IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
#define VHOST_SET_LOG_BASE	_IOW(VHOST_VIRTIO, 0x04, __u64)
#define VHOST_SET_LOG_FD	_IOW(VHOST_VIRTIO, 0x07, int)
#define VHOST_SET_VRING_NUM	_IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
#define VHOST_SET_VRING_ADDR	_IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
#define VHOST_SET_VRING_BASE	_IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_GET_VRING_BASE	_IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
#define VHOST_SET_VRING_KICK	_IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
#define VHOST_SET_VRING_CALL	_IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
#define VHOST_SET_VRING_ERR	_IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
#define VHOST_NET_SET_BACKEND	_IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
46
/* With the features below, vhost kernel does not need to do the checksum
 * and TSO itself; this information is passed to virtio_user through the
 * virtio net header.
 */
#define VHOST_KERNEL_GUEST_OFFLOADS_MASK	\
	((1ULL << VIRTIO_NET_F_GUEST_CSUM) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_GUEST_ECN)  |	\
	 (1ULL << VIRTIO_NET_F_GUEST_UFO))
56
/* With the features below, for flows going from virtio_user to vhost kernel:
 * (1) if a flow goes up through the kernel networking stack, the checksum
 *     does not need to be verified, which saves CPU cycles;
 * (2) if a flow goes through a Linux bridge and out of an interface
 *     (kernel driver), checksum and TSO are done by GSO in the kernel or
 *     even offloaded to the real physical device.
 */
#define VHOST_KERNEL_HOST_OFFLOADS_MASK		\
	((1ULL << VIRTIO_NET_F_HOST_TSO4) |	\
	 (1ULL << VIRTIO_NET_F_HOST_TSO6) |	\
	 (1ULL << VIRTIO_NET_F_CSUM))
68
/* Maximum number of regions put into the table sent with
 * VHOST_SET_MEM_TABLE. Starts at 64 and may be replaced by the value read
 * from sysfs in get_vhost_kernel_max_regions().
 */
static uint64_t max_regions = 64;

/*
 * Refresh max_regions from the vhost module parameter exposed in sysfs.
 *
 * The current value is kept when the file cannot be opened or read, or when
 * its content does not parse as a non-zero decimal number. A zero limit
 * would make every later memory table setup fail.
 */
static void
get_vhost_kernel_max_regions(void)
{
	char buf[20] = {'\0'};
	char *end = NULL;
	unsigned long long val;
	ssize_t nread;
	int fd;

	fd = open("/sys/module/vhost/parameters/max_mem_regions", O_RDONLY);
	if (fd < 0)
		return;

	/* Keep one byte spare so that buf always stays NUL-terminated. */
	nread = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (nread <= 0)
		return;

	errno = 0;
	val = strtoull(buf, &end, 10);
	/* Reject range errors, input without any digit, and a zero limit. */
	if (errno != 0 || end == buf || val == 0)
		return;

	max_regions = val;
}
86
87 static int
vhost_kernel_ioctl(int fd,uint64_t request,void * arg)88 vhost_kernel_ioctl(int fd, uint64_t request, void *arg)
89 {
90 int ret;
91
92 ret = ioctl(fd, request, arg);
93 if (ret) {
94 PMD_DRV_LOG(ERR, "Vhost-kernel ioctl %"PRIu64" failed (%s)",
95 request, strerror(errno));
96 return -1;
97 }
98
99 return 0;
100 }
101
102 static int
vhost_kernel_set_owner(struct virtio_user_dev * dev)103 vhost_kernel_set_owner(struct virtio_user_dev *dev)
104 {
105 int ret;
106 uint32_t i;
107 struct vhost_kernel_data *data = dev->backend_data;
108
109 for (i = 0; i < dev->max_queue_pairs; ++i) {
110 if (data->vhostfds[i] < 0)
111 continue;
112
113 ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_OWNER, NULL);
114 if (ret < 0)
115 return ret;
116 }
117
118 return 0;
119 }
120
121 static int
vhost_kernel_get_features(struct virtio_user_dev * dev,uint64_t * features)122 vhost_kernel_get_features(struct virtio_user_dev *dev, uint64_t *features)
123 {
124 struct vhost_kernel_data *data = dev->backend_data;
125 unsigned int tap_flags;
126 int ret;
127
128 ret = vhost_kernel_ioctl(data->vhostfds[0], VHOST_GET_FEATURES, features);
129 if (ret < 0) {
130 PMD_DRV_LOG(ERR, "Failed to get features");
131 return -1;
132 }
133
134 ret = tap_get_flags(data->tapfds[0], &tap_flags);
135 if (ret < 0) {
136 PMD_DRV_LOG(ERR, "Failed to get TAP features");
137 return -1;
138 }
139
140 /* with tap as the backend, all these features are supported
141 * but not claimed by vhost-net, so we add them back when
142 * reporting to upper layer.
143 */
144 if (tap_flags & IFF_VNET_HDR) {
145 *features |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
146 *features |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
147 }
148
149 /* vhost_kernel will not declare this feature, but it does
150 * support multi-queue.
151 */
152 if (tap_flags & IFF_MULTI_QUEUE)
153 *features |= (1ull << VIRTIO_NET_F_MQ);
154
155 return 0;
156 }
157
158 static int
vhost_kernel_set_features(struct virtio_user_dev * dev,uint64_t features)159 vhost_kernel_set_features(struct virtio_user_dev *dev, uint64_t features)
160 {
161 struct vhost_kernel_data *data = dev->backend_data;
162 uint32_t i;
163 int ret;
164
165 /* We don't need memory protection here */
166 features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
167 /* VHOST kernel does not know about below flags */
168 features &= ~VHOST_KERNEL_GUEST_OFFLOADS_MASK;
169 features &= ~VHOST_KERNEL_HOST_OFFLOADS_MASK;
170 features &= ~(1ULL << VIRTIO_NET_F_MQ);
171
172 for (i = 0; i < dev->max_queue_pairs; ++i) {
173 if (data->vhostfds[i] < 0)
174 continue;
175
176 ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_FEATURES, &features);
177 if (ret < 0)
178 return ret;
179 }
180
181 return 0;
182 }
183
184 static int
add_memseg_list(const struct rte_memseg_list * msl,void * arg)185 add_memseg_list(const struct rte_memseg_list *msl, void *arg)
186 {
187 struct vhost_memory_kernel *vm = arg;
188 struct vhost_memory_region *mr;
189 void *start_addr;
190 uint64_t len;
191
192 if (msl->external)
193 return 0;
194
195 if (vm->nregions >= max_regions)
196 return -1;
197
198 start_addr = msl->base_va;
199 len = msl->page_sz * msl->memseg_arr.len;
200
201 mr = &vm->regions[vm->nregions++];
202
203 mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
204 mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
205 mr->memory_size = len;
206 mr->mmap_offset = 0; /* flags_padding */
207
208 PMD_DRV_LOG(DEBUG, "index=%u addr=%p len=%" PRIu64,
209 vm->nregions - 1, start_addr, len);
210
211 return 0;
212 }
213
214 /* By default, vhost kernel module allows 64 regions, but DPDK may
215 * have much more memory regions. Below function will treat each
216 * contiguous memory space reserved by DPDK as one region.
217 */
218 static int
vhost_kernel_set_memory_table(struct virtio_user_dev * dev)219 vhost_kernel_set_memory_table(struct virtio_user_dev *dev)
220 {
221 uint32_t i;
222 struct vhost_kernel_data *data = dev->backend_data;
223 struct vhost_memory_kernel *vm;
224 int ret;
225
226 vm = malloc(sizeof(struct vhost_memory_kernel) +
227 max_regions *
228 sizeof(struct vhost_memory_region));
229 if (!vm)
230 goto err;
231
232 vm->nregions = 0;
233 vm->padding = 0;
234
235 /*
236 * The memory lock has already been taken by memory subsystem
237 * or virtio_user_start_device().
238 */
239 ret = rte_memseg_list_walk_thread_unsafe(add_memseg_list, vm);
240 if (ret < 0)
241 goto err_free;
242
243 for (i = 0; i < dev->max_queue_pairs; ++i) {
244 if (data->vhostfds[i] < 0)
245 continue;
246
247 ret = vhost_kernel_ioctl(data->vhostfds[i], VHOST_SET_MEM_TABLE, vm);
248 if (ret < 0)
249 goto err_free;
250 }
251
252 free(vm);
253
254 return 0;
255 err_free:
256 free(vm);
257 err:
258 PMD_DRV_LOG(ERR, "Failed to set memory table");
259 return -1;
260 }
261
262 static int
vhost_kernel_set_vring(struct virtio_user_dev * dev,uint64_t req,struct vhost_vring_state * state)263 vhost_kernel_set_vring(struct virtio_user_dev *dev, uint64_t req, struct vhost_vring_state *state)
264 {
265 int ret, fd;
266 unsigned int index = state->index;
267 struct vhost_kernel_data *data = dev->backend_data;
268
269 /* Convert from queue index to queue-pair & offset */
270 fd = data->vhostfds[state->index / 2];
271 state->index %= 2;
272
273 ret = vhost_kernel_ioctl(fd, req, state);
274 if (ret < 0) {
275 PMD_DRV_LOG(ERR, "Failed to set vring (request %" PRIu64 ")", req);
276 return -1;
277 }
278
279 /* restore index back to queue index */
280 state->index = index;
281
282 return 0;
283 }
284
285 static int
vhost_kernel_set_vring_num(struct virtio_user_dev * dev,struct vhost_vring_state * state)286 vhost_kernel_set_vring_num(struct virtio_user_dev *dev, struct vhost_vring_state *state)
287 {
288 return vhost_kernel_set_vring(dev, VHOST_SET_VRING_NUM, state);
289 }
290
291 static int
vhost_kernel_set_vring_base(struct virtio_user_dev * dev,struct vhost_vring_state * state)292 vhost_kernel_set_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
293 {
294 return vhost_kernel_set_vring(dev, VHOST_SET_VRING_BASE, state);
295 }
296
297 static int
vhost_kernel_get_vring_base(struct virtio_user_dev * dev,struct vhost_vring_state * state)298 vhost_kernel_get_vring_base(struct virtio_user_dev *dev, struct vhost_vring_state *state)
299 {
300 return vhost_kernel_set_vring(dev, VHOST_GET_VRING_BASE, state);
301 }
302
303 static int
vhost_kernel_set_vring_file(struct virtio_user_dev * dev,uint64_t req,struct vhost_vring_file * file)304 vhost_kernel_set_vring_file(struct virtio_user_dev *dev, uint64_t req,
305 struct vhost_vring_file *file)
306 {
307 int ret, fd;
308 unsigned int index = file->index;
309 struct vhost_kernel_data *data = dev->backend_data;
310
311 /* Convert from queue index to queue-pair & offset */
312 fd = data->vhostfds[file->index / 2];
313 file->index %= 2;
314
315 ret = vhost_kernel_ioctl(fd, req, file);
316 if (ret < 0) {
317 PMD_DRV_LOG(ERR, "Failed to set vring file (request %" PRIu64 ")", req);
318 return -1;
319 }
320
321 /* restore index back to queue index */
322 file->index = index;
323
324 return 0;
325 }
326
327 static int
vhost_kernel_set_vring_kick(struct virtio_user_dev * dev,struct vhost_vring_file * file)328 vhost_kernel_set_vring_kick(struct virtio_user_dev *dev, struct vhost_vring_file *file)
329 {
330 return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_KICK, file);
331 }
332
333 static int
vhost_kernel_set_vring_call(struct virtio_user_dev * dev,struct vhost_vring_file * file)334 vhost_kernel_set_vring_call(struct virtio_user_dev *dev, struct vhost_vring_file *file)
335 {
336 return vhost_kernel_set_vring_file(dev, VHOST_SET_VRING_CALL, file);
337 }
338
339 static int
vhost_kernel_set_vring_addr(struct virtio_user_dev * dev,struct vhost_vring_addr * addr)340 vhost_kernel_set_vring_addr(struct virtio_user_dev *dev, struct vhost_vring_addr *addr)
341 {
342 int ret, fd;
343 unsigned int index = addr->index;
344 struct vhost_kernel_data *data = dev->backend_data;
345
346 /* Convert from queue index to queue-pair & offset */
347 fd = data->vhostfds[addr->index / 2];
348 addr->index %= 2;
349
350 ret = vhost_kernel_ioctl(fd, VHOST_SET_VRING_ADDR, addr);
351 if (ret < 0) {
352 PMD_DRV_LOG(ERR, "Failed to set vring address");
353 return -1;
354 }
355
356 /* restore index back to queue index */
357 addr->index = index;
358
359 return 0;
360 }
361
362 static int
vhost_kernel_get_status(struct virtio_user_dev * dev __rte_unused,uint8_t * status __rte_unused)363 vhost_kernel_get_status(struct virtio_user_dev *dev __rte_unused, uint8_t *status __rte_unused)
364 {
365 return -ENOTSUP;
366 }
367
368 static int
vhost_kernel_set_status(struct virtio_user_dev * dev __rte_unused,uint8_t status __rte_unused)369 vhost_kernel_set_status(struct virtio_user_dev *dev __rte_unused, uint8_t status __rte_unused)
370 {
371 return -ENOTSUP;
372 }
373
374 /**
375 * Set up environment to talk with a vhost kernel backend.
376 *
377 * @return
378 * - (-1) if fail to set up;
379 * - (>=0) if successful.
380 */
381 static int
vhost_kernel_setup(struct virtio_user_dev * dev)382 vhost_kernel_setup(struct virtio_user_dev *dev)
383 {
384 struct vhost_kernel_data *data;
385 unsigned int tap_features;
386 unsigned int tap_flags;
387 unsigned int r_flags;
388 const char *ifname;
389 uint32_t q, i;
390 int vhostfd;
391
392 if (tap_support_features(&tap_features) < 0)
393 return -1;
394
395 if ((tap_features & IFF_VNET_HDR) == 0) {
396 PMD_INIT_LOG(ERR, "TAP does not support IFF_VNET_HDR");
397 return -1;
398 }
399 r_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
400
401 if (tap_features & IFF_NAPI)
402 r_flags |= IFF_NAPI;
403
404 data = malloc(sizeof(*data));
405 if (!data) {
406 PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost-kernel data", dev->path);
407 return -1;
408 }
409
410 data->vhostfds = malloc(dev->max_queue_pairs * sizeof(int));
411 if (!data->vhostfds) {
412 PMD_INIT_LOG(ERR, "(%s) Failed to allocate Vhost FDs", dev->path);
413 goto err_data;
414 }
415 data->tapfds = malloc(dev->max_queue_pairs * sizeof(int));
416 if (!data->tapfds) {
417 PMD_INIT_LOG(ERR, "(%s) Failed to allocate TAP FDs", dev->path);
418 goto err_vhostfds;
419 }
420
421 for (q = 0; q < dev->max_queue_pairs; ++q) {
422 data->vhostfds[q] = -1;
423 data->tapfds[q] = -1;
424 }
425
426 get_vhost_kernel_max_regions();
427
428 for (i = 0; i < dev->max_queue_pairs; ++i) {
429 vhostfd = open(dev->path, O_RDWR);
430 if (vhostfd < 0) {
431 PMD_DRV_LOG(ERR, "fail to open %s, %s", dev->path, strerror(errno));
432 goto err_tapfds;
433 }
434 data->vhostfds[i] = vhostfd;
435 }
436
437 ifname = dev->ifname != NULL ? dev->ifname : "tap%d";
438 data->tapfds[0] = tap_open(ifname, r_flags, (tap_features & IFF_MULTI_QUEUE) != 0);
439 if (data->tapfds[0] < 0)
440 goto err_tapfds;
441 if (dev->ifname == NULL && tap_get_name(data->tapfds[0], &dev->ifname) < 0) {
442 PMD_DRV_LOG(ERR, "fail to get tap name (%d)", data->tapfds[0]);
443 goto err_tapfds;
444 }
445 if (tap_get_flags(data->tapfds[0], &tap_flags) < 0) {
446 PMD_DRV_LOG(ERR, "fail to get tap flags for tap %s", dev->ifname);
447 goto err_tapfds;
448 }
449 if ((tap_flags & IFF_MULTI_QUEUE) == 0 && dev->max_queue_pairs > 1) {
450 PMD_DRV_LOG(ERR, "tap %s does not support multi queue", dev->ifname);
451 goto err_tapfds;
452 }
453
454 for (i = 1; i < dev->max_queue_pairs; i++) {
455 data->tapfds[i] = tap_open(dev->ifname, r_flags, true);
456 if (data->tapfds[i] < 0)
457 goto err_tapfds;
458 }
459
460 dev->backend_data = data;
461
462 return 0;
463
464 err_tapfds:
465 for (i = 0; i < dev->max_queue_pairs; i++) {
466 if (data->vhostfds[i] >= 0)
467 close(data->vhostfds[i]);
468 if (data->tapfds[i] >= 0)
469 close(data->tapfds[i]);
470 }
471
472 free(data->tapfds);
473 err_vhostfds:
474 free(data->vhostfds);
475 err_data:
476 free(data);
477
478 return -1;
479 }
480
481 static int
vhost_kernel_destroy(struct virtio_user_dev * dev)482 vhost_kernel_destroy(struct virtio_user_dev *dev)
483 {
484 struct vhost_kernel_data *data = dev->backend_data;
485 uint32_t i;
486
487 if (!data)
488 return 0;
489
490 for (i = 0; i < dev->max_queue_pairs; ++i) {
491 if (data->vhostfds[i] >= 0)
492 close(data->vhostfds[i]);
493 if (data->tapfds[i] >= 0)
494 close(data->tapfds[i]);
495 }
496
497 free(data->vhostfds);
498 free(data->tapfds);
499 free(data);
500 dev->backend_data = NULL;
501
502 return 0;
503 }
504
505 static int
vhost_kernel_set_backend(int vhostfd,int tapfd)506 vhost_kernel_set_backend(int vhostfd, int tapfd)
507 {
508 struct vhost_vring_file f;
509
510 f.fd = tapfd;
511 f.index = 0;
512 if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
513 PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
514 strerror(errno));
515 return -1;
516 }
517
518 f.index = 1;
519 if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
520 PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
521 strerror(errno));
522 return -1;
523 }
524
525 return 0;
526 }
527
528 static int
vhost_kernel_enable_queue_pair(struct virtio_user_dev * dev,uint16_t pair_idx,int enable)529 vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
530 uint16_t pair_idx,
531 int enable)
532 {
533 struct vhost_kernel_data *data = dev->backend_data;
534 int hdr_size;
535 int vhostfd;
536 int tapfd;
537
538 if (dev->qp_enabled[pair_idx] == enable)
539 return 0;
540
541 vhostfd = data->vhostfds[pair_idx];
542 tapfd = data->tapfds[pair_idx];
543
544 if (!enable) {
545 if (vhost_kernel_set_backend(vhostfd, -1) < 0) {
546 PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
547 return -1;
548 }
549 dev->qp_enabled[pair_idx] = false;
550 return 0;
551 }
552
553 if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
554 (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
555 hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
556 else
557 hdr_size = sizeof(struct virtio_net_hdr);
558
559 /* Set mac on tap only once when starting */
560 if (!dev->started && pair_idx == 0 &&
561 tap_set_mac(data->tapfds[pair_idx], dev->mac_addr) < 0)
562 return -1;
563
564 if (vhost_kernel_tap_setup(tapfd, hdr_size, dev->features) < 0) {
565 PMD_DRV_LOG(ERR, "fail to setup tap for vhost kernel");
566 return -1;
567 }
568
569 if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) {
570 PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
571 return -1;
572 }
573
574 dev->qp_enabled[pair_idx] = true;
575 return 0;
576 }
577
/*
 * Report the backend feature bits: this backend reports none, so *features
 * is set to 0. Always returns 0.
 */
static int
vhost_kernel_get_backend_features(uint64_t *features)
{
	const uint64_t none = 0;

	*features = none;
	return 0;
}
585
586 static int
vhost_kernel_update_link_state(struct virtio_user_dev * dev __rte_unused)587 vhost_kernel_update_link_state(struct virtio_user_dev *dev __rte_unused)
588 {
589 /* Nothing to update (Maybe get TAP interface link state?) */
590 return 0;
591 }
592
593 static int
vhost_kernel_get_intr_fd(struct virtio_user_dev * dev __rte_unused)594 vhost_kernel_get_intr_fd(struct virtio_user_dev *dev __rte_unused)
595 {
596 /* No link state interrupt with Vhost-kernel */
597 return -1;
598 }
599
600 struct virtio_user_backend_ops virtio_ops_kernel = {
601 .setup = vhost_kernel_setup,
602 .destroy = vhost_kernel_destroy,
603 .get_backend_features = vhost_kernel_get_backend_features,
604 .set_owner = vhost_kernel_set_owner,
605 .get_features = vhost_kernel_get_features,
606 .set_features = vhost_kernel_set_features,
607 .set_memory_table = vhost_kernel_set_memory_table,
608 .set_vring_num = vhost_kernel_set_vring_num,
609 .set_vring_base = vhost_kernel_set_vring_base,
610 .get_vring_base = vhost_kernel_get_vring_base,
611 .set_vring_call = vhost_kernel_set_vring_call,
612 .set_vring_kick = vhost_kernel_set_vring_kick,
613 .set_vring_addr = vhost_kernel_set_vring_addr,
614 .get_status = vhost_kernel_get_status,
615 .set_status = vhost_kernel_set_status,
616 .enable_qp = vhost_kernel_enable_queue_pair,
617 .update_link_state = vhost_kernel_update_link_state,
618 .get_intr_fd = vhost_kernel_get_intr_fd,
619 };
620