1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
3 */
4
5 #include <ctype.h>
6 #include <arpa/inet.h>
7 #include <getopt.h>
8 #include <linux/if_ether.h>
9 #include <linux/if_vlan.h>
10 #include <linux/virtio_net.h>
11 #include <linux/virtio_ring.h>
12 #include <signal.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <sys/eventfd.h>
16 #include <sys/param.h>
17 #include <unistd.h>
18
19 #include <rte_cycles.h>
20 #include <rte_ethdev.h>
21 #include <rte_log.h>
22 #include <rte_string_fns.h>
23 #include <rte_malloc.h>
24 #include <rte_net.h>
25 #include <rte_vhost.h>
26 #include <rte_ip.h>
27 #include <rte_tcp.h>
28 #include <rte_pause.h>
29 #include <rte_dmadev.h>
30 #include <rte_vhost_async.h>
31 #include <rte_thread.h>
32
33 #include "main.h"
34
35 #ifndef MAX_QUEUES
36 #define MAX_QUEUES 128
37 #endif
38
39 #define NUM_MBUFS_DEFAULT 0x24000
40
41 /* the maximum number of external ports supported */
42 #define MAX_SUP_PORTS 1
43
44 #define MBUF_CACHE_SIZE 128
45 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
46
47 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
48
49 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
50 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
51
52 #define JUMBO_FRAME_MAX_SIZE 0x2600
53 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
54
55 /* State of virtio device. */
56 #define DEVICE_MAC_LEARNING 0
57 #define DEVICE_RX 1
58 #define DEVICE_SAFE_REMOVE 2
59
60 /* Configurable number of RX/TX ring descriptors */
61 #define RX_DESC_DEFAULT 1024
62 #define TX_DESC_DEFAULT 512
63
64 #define INVALID_PORT_ID 0xFF
65 #define INVALID_DMA_ID -1
66
67 #define DMA_RING_SIZE 4096
68
69 #define ASYNC_ENQUEUE_VHOST 1
70 #define ASYNC_DEQUEUE_VHOST 2
71
72 /* number of mbufs in all pools - if specified on command-line. */
73 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
74
75 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
76 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
77 static int dma_count;
78
79 /* mask of enabled ports */
80 static uint32_t enabled_port_mask = 0;
81
82 /* Promiscuous mode */
83 static uint32_t promiscuous;
84
85 /* number of devices/queues to support*/
86 static uint32_t num_queues = 0;
87 static uint32_t num_devices;
88
89 static struct rte_mempool *mbuf_pool;
90 static int mergeable;
91
92 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
93 typedef enum {
94 VM2VM_DISABLED = 0,
95 VM2VM_SOFTWARE = 1,
96 VM2VM_HARDWARE = 2,
97 VM2VM_LAST
98 } vm2vm_type;
99 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
100
101 /* Enable stats. */
102 static uint32_t enable_stats = 0;
103 /* Enable retries on RX. */
104 static uint32_t enable_retry = 1;
105
106 /* Disable TX checksum offload */
107 static uint32_t enable_tx_csum;
108
109 /* Disable TSO offload */
110 static uint32_t enable_tso;
111
112 static int client_mode;
113
114 static int builtin_net_driver;
115
116 /* Specify the timeout (in microseconds) between retries on RX. */
117 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
118 /* Specify the number of retries on RX. */
119 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
120
121 /* Socket file paths. Can be set by user */
122 static char *socket_files;
123 static int nb_sockets;
124
125 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
126
127 /* empty VMDq configuration structure. Filled in programmatically */
128 static struct rte_eth_conf vmdq_conf_default = {
129 .rxmode = {
130 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
131 /*
132 * VLAN strip is necessary for 1G NICs such as the I350;
133 * it fixes a bug where IPv4 forwarding in the guest cannot
134 * forward packets from one virtio dev to another virtio dev.
135 */
136 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
137 },
138
139 .txmode = {
140 .mq_mode = RTE_ETH_MQ_TX_NONE,
141 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
142 RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
143 RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
144 RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
145 RTE_ETH_TX_OFFLOAD_TCP_TSO),
146 },
147 .rx_adv_conf = {
148 /*
149 * should be overridden separately in code with
150 * appropriate values
151 */
152 .vmdq_rx_conf = {
153 .nb_queue_pools = RTE_ETH_8_POOLS,
154 .enable_default_pool = 0,
155 .default_pool = 0,
156 .nb_pool_maps = 0,
157 .pool_map = {{0, 0},},
158 },
159 },
160 };
161
162
163 static unsigned lcore_ids[RTE_MAX_LCORE];
164 static uint16_t ports[RTE_MAX_ETHPORTS];
165 static unsigned num_ports = 0; /**< The number of ports specified in command line */
166 static uint16_t num_pf_queues, num_vmdq_queues;
167 static uint16_t vmdq_pool_base, vmdq_queue_base;
168 static uint16_t queues_per_pool;
169
170 const uint16_t vlan_tags[] = {
171 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
172 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
173 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
174 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
175 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
176 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
177 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
178 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
179 };
180
181 /* ethernet addresses of ports */
182 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
183
184 static struct vhost_dev_tailq_list vhost_dev_list =
185 TAILQ_HEAD_INITIALIZER(vhost_dev_list);
186
187 static struct lcore_info lcore_info[RTE_MAX_LCORE];
188
189 /* Used for queueing bursts of TX packets. */
190 struct mbuf_table {
191 unsigned len;
192 unsigned txq_id;
193 struct rte_mbuf *m_table[MAX_PKT_BURST];
194 };
195
196 struct vhost_bufftable {
197 uint32_t len;
198 uint64_t pre_tsc;
199 struct rte_mbuf *m_table[MAX_PKT_BURST];
200 };
201
202 /* TX queue for each data core. */
203 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
204
205 /*
206 * Vhost TX buffer for each data core.
207 * Every data core maintains a TX buffer for every vhost device,
208 * which is used for batch pkts enqueue for higher performance.
209 */
210 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
211
212 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
213 / US_PER_S * BURST_TX_DRAIN_US)
214
215 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
216
217 static inline uint32_t
218 get_async_flag_by_socketid(int socketid)
219 {
220 return dma_bind[socketid].async_flag;
221 }
222
223 static inline void
224 init_vid2socketid_array(int vid, int socketid)
225 {
226 vid2socketid[vid] = socketid;
227 }
228
229 static inline bool
230 is_dma_configured(int16_t dev_id)
231 {
232 int i;
233
234 for (i = 0; i < dma_count; i++)
235 if (dmas_id[i] == dev_id)
236 return true;
237 return false;
238 }
239
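/*
 * Parse one "--dmas" argument: each "txdN@dma_id" or "rxdN@dma_id" entry binds
 * a DMA device to the enqueue (RX) or dequeue (TX) data path of vhost socket N.
 * A DMA device is configured and started only the first time it is referenced.
 */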
240 static inline int
241 open_dma(const char *value)
242 {
243 struct dma_for_vhost *dma_info = dma_bind;
244 char *input = strndup(value, strlen(value) + 1);
245 char *addrs = input;
246 char *ptrs[2];
247 char *start, *end, *substr;
248 int64_t socketid, vring_id;
249
250 struct rte_dma_info info;
251 struct rte_dma_conf dev_config = { .nb_vchans = 1 };
252 struct rte_dma_vchan_conf qconf = {
253 .direction = RTE_DMA_DIR_MEM_TO_MEM,
254 .nb_desc = DMA_RING_SIZE
255 };
256
257 int dev_id;
258 int ret = 0;
259 uint16_t i = 0;
260 char *dma_arg[RTE_MAX_VHOST_DEVICE];
261 int args_nr;
262
263 if (input == NULL)
264 return -1;
265
266 while (isblank(*addrs))
267 addrs++;
268 if (*addrs == '\0') {
269 ret = -1;
270 goto out;
271 }
272
273 /* process DMA devices within bracket. */
274 addrs++;
275 substr = strtok(addrs, ";]");
276 if (!substr) {
277 ret = -1;
278 goto out;
279 }
280
281 args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
282 if (args_nr <= 0) {
283 ret = -1;
284 goto out;
285 }
286
287 while (i < args_nr) {
288 char *arg_temp = dma_arg[i];
289 char *txd, *rxd;
290 uint8_t sub_nr;
291 int async_flag;
292
293 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
294 if (sub_nr != 2) {
295 ret = -1;
296 goto out;
297 }
298
299 txd = strstr(ptrs[0], "txd");
300 rxd = strstr(ptrs[0], "rxd");
301 if (txd) {
302 start = txd;
303 vring_id = VIRTIO_RXQ;
304 async_flag = ASYNC_ENQUEUE_VHOST;
305 } else if (rxd) {
306 start = rxd;
307 vring_id = VIRTIO_TXQ;
308 async_flag = ASYNC_DEQUEUE_VHOST;
309 } else {
310 ret = -1;
311 goto out;
312 }
313
314 start += 3;
315 socketid = strtol(start, &end, 0);
316 if (end == start) {
317 ret = -1;
318 goto out;
319 }
320
321 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
322 if (dev_id < 0) {
323 RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
324 ret = -1;
325 goto out;
326 }
327
328 /* DMA device is already configured, so skip */
329 if (is_dma_configured(dev_id))
330 goto done;
331
332 if (rte_dma_info_get(dev_id, &info) != 0) {
333 RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
334 ret = -1;
335 goto out;
336 }
337
338 if (info.max_vchans < 1) {
339 RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
340 ret = -1;
341 goto out;
342 }
343
344 if (rte_dma_configure(dev_id, &dev_config) != 0) {
345 RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
346 ret = -1;
347 goto out;
348 }
349
350 /* Check the max desc supported by DMA device */
351 rte_dma_info_get(dev_id, &info);
352 if (info.nb_vchans != 1) {
353 RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
354 dev_id);
355 ret = -1;
356 goto out;
357 }
358
359 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
360
361 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
362 RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
363 ret = -1;
364 goto out;
365 }
366
367 if (rte_dma_start(dev_id) != 0) {
368 RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
369 ret = -1;
370 goto out;
371 }
372
373 dmas_id[dma_count++] = dev_id;
374
375 done:
376 (dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
377 (dma_info + socketid)->async_flag |= async_flag;
378 i++;
379 }
380 out:
381 free(input);
382 return ret;
383 }
384
385 /*
386 * Builds up the correct configuration for VMDQ VLAN pool map
387 * according to the pool & queue limits.
388 */
389 static inline int
390 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
391 {
392 struct rte_eth_vmdq_rx_conf conf;
393 struct rte_eth_vmdq_rx_conf *def_conf =
394 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
395 unsigned i;
396
397 memset(&conf, 0, sizeof(conf));
398 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
399 conf.nb_pool_maps = num_devices;
400 conf.enable_loop_back = def_conf->enable_loop_back;
401 conf.rx_mode = def_conf->rx_mode;
402
403 for (i = 0; i < conf.nb_pool_maps; i++) {
404 conf.pool_map[i].vlan_id = vlan_tags[ i ];
405 conf.pool_map[i].pools = (1UL << i);
406 }
407
408 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
409 (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
410 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
411 return 0;
412 }
413
414 /*
415 * Initialises a given port using global settings and with the rx buffers
416 * coming from the mbuf_pool passed as parameter
417 */
418 static inline int
419 port_init(uint16_t port)
420 {
421 struct rte_eth_dev_info dev_info;
422 struct rte_eth_conf port_conf;
423 struct rte_eth_rxconf *rxconf;
424 struct rte_eth_txconf *txconf;
425 int16_t rx_rings, tx_rings;
426 uint16_t rx_ring_size, tx_ring_size;
427 int retval;
428 uint16_t q;
429
430 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
431 retval = rte_eth_dev_info_get(port, &dev_info);
432 if (retval != 0) {
433 RTE_LOG(ERR, VHOST_PORT,
434 "Error during getting device (port %u) info: %s\n",
435 port, strerror(-retval));
436
437 return retval;
438 }
439 if (dev_info.max_vmdq_pools == 0) {
440 RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
441 return -1;
442 }
443
444 rxconf = &dev_info.default_rxconf;
445 txconf = &dev_info.default_txconf;
446 rxconf->rx_drop_en = 1;
447
448 /*configure the number of supported virtio devices based on VMDQ limits */
449 num_devices = dev_info.max_vmdq_pools;
450
451 rx_ring_size = RX_DESC_DEFAULT;
452 tx_ring_size = TX_DESC_DEFAULT;
453
454 tx_rings = (uint16_t)rte_lcore_count();
455
456 if (mergeable) {
457 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
458 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
459 else
460 vmdq_conf_default.rxmode.mtu = MAX_MTU;
461 }
462
463 /* Get port configuration. */
464 retval = get_eth_conf(&port_conf, num_devices);
465 if (retval < 0)
466 return retval;
467 /* NIC queues are divided into pf queues and vmdq queues. */
468 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
469 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
470 num_vmdq_queues = num_devices * queues_per_pool;
471 num_queues = num_pf_queues + num_vmdq_queues;
472 vmdq_queue_base = dev_info.vmdq_queue_base;
473 vmdq_pool_base = dev_info.vmdq_pool_base;
474 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
475 num_pf_queues, num_devices, queues_per_pool);
476
477 if (!rte_eth_dev_is_valid_port(port))
478 return -1;
479
480 rx_rings = (uint16_t)dev_info.max_rx_queues;
481 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
482 port_conf.txmode.offloads |=
483 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
484 /* Configure ethernet device. */
485 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
486 if (retval != 0) {
487 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
488 port, strerror(-retval));
489 return retval;
490 }
491
492 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
493 &tx_ring_size);
494 if (retval != 0) {
495 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
496 "for port %u: %s.\n", port, strerror(-retval));
497 return retval;
498 }
499 if (rx_ring_size > RX_DESC_DEFAULT) {
500 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
501 "for Rx queues on port %u.\n", port);
502 return -1;
503 }
504
505 /* Setup the queues. */
506 rxconf->offloads = port_conf.rxmode.offloads;
507 for (q = 0; q < rx_rings; q ++) {
508 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
509 rte_eth_dev_socket_id(port),
510 rxconf,
511 mbuf_pool);
512 if (retval < 0) {
513 RTE_LOG(ERR, VHOST_PORT,
514 "Failed to setup rx queue %u of port %u: %s.\n",
515 q, port, strerror(-retval));
516 return retval;
517 }
518 }
519 txconf->offloads = port_conf.txmode.offloads;
520 for (q = 0; q < tx_rings; q ++) {
521 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
522 rte_eth_dev_socket_id(port),
523 txconf);
524 if (retval < 0) {
525 RTE_LOG(ERR, VHOST_PORT,
526 "Failed to setup tx queue %u of port %u: %s.\n",
527 q, port, strerror(-retval));
528 return retval;
529 }
530 }
531
532 /* Start the device. */
533 retval = rte_eth_dev_start(port);
534 if (retval < 0) {
535 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
536 port, strerror(-retval));
537 return retval;
538 }
539
540 if (promiscuous) {
541 retval = rte_eth_promiscuous_enable(port);
542 if (retval != 0) {
543 RTE_LOG(ERR, VHOST_PORT,
544 "Failed to enable promiscuous mode on port %u: %s\n",
545 port, rte_strerror(-retval));
546 return retval;
547 }
548 }
549
550 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
551 if (retval < 0) {
552 RTE_LOG(ERR, VHOST_PORT,
553 "Failed to get MAC address on port %u: %s\n",
554 port, rte_strerror(-retval));
555 return retval;
556 }
557
558 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
559 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
560 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
561 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
562
563 return 0;
564 }
565
566 /*
567 * Set socket file path.
568 */
569 static int
570 us_vhost_parse_socket_path(const char *q_arg)
571 {
572 char *old;
573
574 /* parse number string */
575 if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
576 return -1;
577
578 old = socket_files;
579 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
580 if (socket_files == NULL) {
581 free(old);
582 return -1;
583 }
584
585 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
586 nb_sockets++;
587
588 return 0;
589 }
590
591 /*
592 * Parse the portmask provided at run time.
593 */
594 static int
595 parse_portmask(const char *portmask)
596 {
597 char *end = NULL;
598 unsigned long pm;
599
600 errno = 0;
601
602 /* parse hexadecimal string */
603 pm = strtoul(portmask, &end, 16);
604 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
605 return 0;
606
607 return pm;
608
609 }
610
611 /*
612 * Parse num options at run time.
613 */
614 static int
615 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
616 {
617 char *end = NULL;
618 unsigned long num;
619
620 errno = 0;
621
622 /* parse unsigned int string */
623 num = strtoul(q_arg, &end, 10);
624 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
625 return -1;
626
627 if (num > max_valid_value)
628 return -1;
629
630 return num;
631
632 }
633
634 /*
635 * Display usage
636 */
637 static void
638 us_vhost_usage(const char *prgname)
639 {
640 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
641 " --vm2vm [0|1|2]\n"
642 " --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
643 " --socket-file <path>\n"
644 " -p PORTMASK: Set mask for ports to be used by application\n"
645 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
646 " --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
647 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
648 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
649 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
650 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
651 " --socket-file: The path of the socket file.\n"
652 " --tx-csum [0|1]: disable/enable TX checksum offload.\n"
653 " --tso [0|1]: disable/enable TCP segment offload.\n"
654 " --client: register a vhost-user socket as client mode.\n"
655 " --dmas: register dma channel for specific vhost device.\n"
656 " --total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
657 " --builtin-net-driver: enable simple vhost-user net driver\n",
658 prgname);
659 }
660
661 enum {
662 #define OPT_VM2VM "vm2vm"
663 OPT_VM2VM_NUM = 256,
664 #define OPT_RX_RETRY "rx-retry"
665 OPT_RX_RETRY_NUM,
666 #define OPT_RX_RETRY_DELAY "rx-retry-delay"
667 OPT_RX_RETRY_DELAY_NUM,
668 #define OPT_RX_RETRY_NUMB "rx-retry-num"
669 OPT_RX_RETRY_NUMB_NUM,
670 #define OPT_MERGEABLE "mergeable"
671 OPT_MERGEABLE_NUM,
672 #define OPT_STATS "stats"
673 OPT_STATS_NUM,
674 #define OPT_SOCKET_FILE "socket-file"
675 OPT_SOCKET_FILE_NUM,
676 #define OPT_TX_CSUM "tx-csum"
677 OPT_TX_CSUM_NUM,
678 #define OPT_TSO "tso"
679 OPT_TSO_NUM,
680 #define OPT_CLIENT "client"
681 OPT_CLIENT_NUM,
682 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
683 OPT_BUILTIN_NET_DRIVER_NUM,
684 #define OPT_DMAS "dmas"
685 OPT_DMAS_NUM,
686 #define OPT_NUM_MBUFS "total-num-mbufs"
687 OPT_NUM_MBUFS_NUM,
688 };
689
690 /*
691 * Parse the arguments given in the command line of the application.
692 */
693 static int
694 us_vhost_parse_args(int argc, char **argv)
695 {
696 int opt, ret;
697 int option_index;
698 unsigned i;
699 const char *prgname = argv[0];
700 static struct option long_option[] = {
701 {OPT_VM2VM, required_argument,
702 NULL, OPT_VM2VM_NUM},
703 {OPT_RX_RETRY, required_argument,
704 NULL, OPT_RX_RETRY_NUM},
705 {OPT_RX_RETRY_DELAY, required_argument,
706 NULL, OPT_RX_RETRY_DELAY_NUM},
707 {OPT_RX_RETRY_NUMB, required_argument,
708 NULL, OPT_RX_RETRY_NUMB_NUM},
709 {OPT_MERGEABLE, required_argument,
710 NULL, OPT_MERGEABLE_NUM},
711 {OPT_STATS, required_argument,
712 NULL, OPT_STATS_NUM},
713 {OPT_SOCKET_FILE, required_argument,
714 NULL, OPT_SOCKET_FILE_NUM},
715 {OPT_TX_CSUM, required_argument,
716 NULL, OPT_TX_CSUM_NUM},
717 {OPT_TSO, required_argument,
718 NULL, OPT_TSO_NUM},
719 {OPT_CLIENT, no_argument,
720 NULL, OPT_CLIENT_NUM},
721 {OPT_BUILTIN_NET_DRIVER, no_argument,
722 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
723 {OPT_DMAS, required_argument,
724 NULL, OPT_DMAS_NUM},
725 {OPT_NUM_MBUFS, required_argument,
726 NULL, OPT_NUM_MBUFS_NUM},
727 {NULL, 0, 0, 0},
728 };
729
730 /* Parse command line */
731 while ((opt = getopt_long(argc, argv, "p:P",
732 long_option, &option_index)) != EOF) {
733 switch (opt) {
734 /* Portmask */
735 case 'p':
736 enabled_port_mask = parse_portmask(optarg);
737 if (enabled_port_mask == 0) {
738 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
739 us_vhost_usage(prgname);
740 return -1;
741 }
742 break;
743
744 case 'P':
745 promiscuous = 1;
746 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
747 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
748 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
749 break;
750
751 case OPT_VM2VM_NUM:
752 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
753 if (ret == -1) {
754 RTE_LOG(INFO, VHOST_CONFIG,
755 "Invalid argument for "
756 "vm2vm [0|1|2]\n");
757 us_vhost_usage(prgname);
758 return -1;
759 }
760 vm2vm_mode = (vm2vm_type)ret;
761 break;
762
763 case OPT_RX_RETRY_NUM:
764 ret = parse_num_opt(optarg, 1);
765 if (ret == -1) {
766 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
767 us_vhost_usage(prgname);
768 return -1;
769 }
770 enable_retry = ret;
771 break;
772
773 case OPT_TX_CSUM_NUM:
774 ret = parse_num_opt(optarg, 1);
775 if (ret == -1) {
776 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
777 us_vhost_usage(prgname);
778 return -1;
779 }
780 enable_tx_csum = ret;
781 break;
782
783 case OPT_TSO_NUM:
784 ret = parse_num_opt(optarg, 1);
785 if (ret == -1) {
786 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
787 us_vhost_usage(prgname);
788 return -1;
789 }
790 enable_tso = ret;
791 break;
792
793 case OPT_RX_RETRY_DELAY_NUM:
794 ret = parse_num_opt(optarg, INT32_MAX);
795 if (ret == -1) {
796 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
797 us_vhost_usage(prgname);
798 return -1;
799 }
800 burst_rx_delay_time = ret;
801 break;
802
803 case OPT_RX_RETRY_NUMB_NUM:
804 ret = parse_num_opt(optarg, INT32_MAX);
805 if (ret == -1) {
806 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
807 us_vhost_usage(prgname);
808 return -1;
809 }
810 burst_rx_retry_num = ret;
811 break;
812
813 case OPT_MERGEABLE_NUM:
814 ret = parse_num_opt(optarg, 1);
815 if (ret == -1) {
816 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
817 us_vhost_usage(prgname);
818 return -1;
819 }
820 mergeable = !!ret;
821 break;
822
823 case OPT_STATS_NUM:
824 ret = parse_num_opt(optarg, INT32_MAX);
825 if (ret == -1) {
826 RTE_LOG(INFO, VHOST_CONFIG,
827 "Invalid argument for stats [0..N]\n");
828 us_vhost_usage(prgname);
829 return -1;
830 }
831 enable_stats = ret;
832 break;
833
834 /* Set socket file path. */
835 case OPT_SOCKET_FILE_NUM:
836 if (us_vhost_parse_socket_path(optarg) == -1) {
837 RTE_LOG(INFO, VHOST_CONFIG,
838 "Invalid argument for socket name (Max %d characters)\n",
839 PATH_MAX);
840 us_vhost_usage(prgname);
841 return -1;
842 }
843 break;
844
845 case OPT_DMAS_NUM:
846 if (open_dma(optarg) == -1) {
847 RTE_LOG(INFO, VHOST_CONFIG,
848 "Wrong DMA args\n");
849 us_vhost_usage(prgname);
850 return -1;
851 }
852 break;
853
854 case OPT_NUM_MBUFS_NUM:
855 ret = parse_num_opt(optarg, INT32_MAX);
856 if (ret == -1) {
857 RTE_LOG(INFO, VHOST_CONFIG,
858 "Invalid argument for total-num-mbufs [0..N]\n");
859 us_vhost_usage(prgname);
860 return -1;
861 }
862
863 if (total_num_mbufs < ret)
864 total_num_mbufs = ret;
865 break;
866
867 case OPT_CLIENT_NUM:
868 client_mode = 1;
869 break;
870
871 case OPT_BUILTIN_NET_DRIVER_NUM:
872 builtin_net_driver = 1;
873 break;
874
875 /* Invalid option - print options. */
876 default:
877 us_vhost_usage(prgname);
878 return -1;
879 }
880 }
881
882 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
883 if (enabled_port_mask & (1 << i))
884 ports[num_ports++] = i;
885 }
886
887 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
888 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
889 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
890 return -1;
891 }
892
893 return 0;
894 }
895
896 /*
897 * Update the global var NUM_PORTS and array PORTS according to the number of system ports,
898 * and return the number of valid ports
899 */
900 static unsigned check_ports_num(unsigned nb_ports)
901 {
902 unsigned valid_num_ports = num_ports;
903 unsigned portid;
904
905 if (num_ports > nb_ports) {
906 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
907 num_ports, nb_ports);
908 num_ports = nb_ports;
909 }
910
911 for (portid = 0; portid < num_ports; portid ++) {
912 if (!rte_eth_dev_is_valid_port(ports[portid])) {
913 RTE_LOG(INFO, VHOST_PORT,
914 "\nSpecified port ID(%u) is not valid\n",
915 ports[portid]);
916 ports[portid] = INVALID_PORT_ID;
917 valid_num_ports--;
918 }
919 }
920 return valid_num_ports;
921 }
922
923 static __rte_always_inline struct vhost_dev *
924 find_vhost_dev(struct rte_ether_addr *mac)
925 {
926 struct vhost_dev *vdev;
927
928 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
929 if (vdev->ready == DEVICE_RX &&
930 rte_is_same_ether_addr(mac, &vdev->mac_address))
931 return vdev;
932 }
933
934 return NULL;
935 }
936
937 /*
938 * This function learns the MAC address of the device and registers this along with a
939 * vlan tag to a VMDQ.
940 */
941 static int
942 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
943 {
944 struct rte_ether_hdr *pkt_hdr;
945 int i, ret;
946
947 /* Learn MAC address of guest device from packet */
948 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
949
950 if (find_vhost_dev(&pkt_hdr->src_addr)) {
951 RTE_LOG(ERR, VHOST_DATA,
952 "(%d) device is using a registered MAC!\n",
953 vdev->vid);
954 return -1;
955 }
956
957 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
958 vdev->mac_address.addr_bytes[i] =
959 pkt_hdr->src_addr.addr_bytes[i];
960
961 /* vlan_tag currently uses the device_id. */
962 vdev->vlan_tag = vlan_tags[vdev->vid];
963
964 /* Print out VMDQ registration info. */
965 RTE_LOG(INFO, VHOST_DATA,
966 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
967 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
968 vdev->vlan_tag);
969
970 /* Register the MAC address. */
971 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
972 (uint32_t)vdev->vid + vmdq_pool_base);
973 if (ret)
974 RTE_LOG(ERR, VHOST_DATA,
975 "(%d) failed to add device MAC address to VMDQ\n",
976 vdev->vid);
977
978 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
979
980 /* Set device as ready for RX. */
981 vdev->ready = DEVICE_RX;
982
983 return 0;
984 }
985
986 /*
987 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
988 * queue before disabling RX on the device.
989 */
990 static inline void
991 unlink_vmdq(struct vhost_dev *vdev)
992 {
993 unsigned i = 0;
994 unsigned rx_count;
995 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
996
997 if (vdev->ready == DEVICE_RX) {
998 /*clear MAC and VLAN settings*/
999 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1000 for (i = 0; i < 6; i++)
1001 vdev->mac_address.addr_bytes[i] = 0;
1002
1003 vdev->vlan_tag = 0;
1004
1005 /*Clear out the receive buffers*/
1006 rx_count = rte_eth_rx_burst(ports[0],
1007 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1008
1009 while (rx_count) {
1010 for (i = 0; i < rx_count; i++)
1011 rte_pktmbuf_free(pkts_burst[i]);
1012
1013 rx_count = rte_eth_rx_burst(ports[0],
1014 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1015 }
1016
1017 vdev->ready = DEVICE_MAC_LEARNING;
1018 }
1019 }
1020
1021 static inline void
1022 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1023 {
1024 while (n--)
1025 rte_pktmbuf_free(pkts[n]);
1026 }
1027
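/* Poll for finished async enqueue copies on the RX queue and free the completed mbufs. */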
1028 static __rte_always_inline void
1029 complete_async_pkts(struct vhost_dev *vdev)
1030 {
1031 struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1032 uint16_t complete_count;
1033 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1034
1035 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1036 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1037 if (complete_count)
1038 free_pkts(p_cpl, complete_count);
1039
1040 }
1041
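/* Synchronously enqueue a single packet into another vhost device's RX ring (VM2VM path). */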
1042 static __rte_always_inline void
1043 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1044 struct rte_mbuf *m)
1045 {
1046 uint16_t ret;
1047
1048 if (builtin_net_driver) {
1049 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1050 } else {
1051 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1052 }
1053
1054 if (enable_stats) {
1055 rte_atomic_fetch_add_explicit(&dst_vdev->stats.rx_total_atomic, 1,
1056 rte_memory_order_seq_cst);
1057 rte_atomic_fetch_add_explicit(&dst_vdev->stats.rx_atomic, ret,
1058 rte_memory_order_seq_cst);
1059 src_vdev->stats.tx_total++;
1060 src_vdev->stats.tx += ret;
1061 }
1062 }
1063
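/* Flush this lcore's batched TX buffer for a vhost device by enqueueing it into the device's RX ring. */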
1064 static __rte_always_inline void
1065 drain_vhost(struct vhost_dev *vdev)
1066 {
1067 uint16_t ret;
1068 uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1069 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1070 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1071
1072 ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1073
1074 if (enable_stats) {
1075 rte_atomic_fetch_add_explicit(&vdev->stats.rx_total_atomic, nr_xmit,
1076 rte_memory_order_seq_cst);
1077 rte_atomic_fetch_add_explicit(&vdev->stats.rx_atomic, ret,
1078 rte_memory_order_seq_cst);
1079 }
1080
1081 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1082 free_pkts(m, nr_xmit);
1083 } else {
1084 uint16_t enqueue_fail = nr_xmit - ret;
1085 if (enqueue_fail > 0)
1086 free_pkts(&m[ret], enqueue_fail);
1087 }
1088 }
1089
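/* Flush every vhost TX buffer owned by this lcore whose drain timeout has expired. */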
1090 static __rte_always_inline void
1091 drain_vhost_table(void)
1092 {
1093 uint16_t lcore_id = rte_lcore_id();
1094 struct vhost_bufftable *vhost_txq;
1095 struct vhost_dev *vdev;
1096 uint64_t cur_tsc;
1097
1098 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1099 if (unlikely(vdev->remove == 1))
1100 continue;
1101
1102 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1103
1104 cur_tsc = rte_rdtsc();
1105 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1106 > MBUF_TABLE_DRAIN_TSC)) {
1107 RTE_LOG_DP(DEBUG, VHOST_DATA,
1108 "Vhost TX queue drained after timeout with burst size %u\n",
1109 vhost_txq->len);
1110 drain_vhost(vdev);
1111 vhost_txq->len = 0;
1112 vhost_txq->pre_tsc = cur_tsc;
1113 }
1114 }
1115 }
1116
1117 /*
1118 * Check if the packet destination MAC address is for a local device. If so then put
1119 * the packet on that device's RX queue. If not then return.
1120 */
1121 static __rte_always_inline int
1122 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1123 {
1124 struct rte_ether_hdr *pkt_hdr;
1125 struct vhost_dev *dst_vdev;
1126 struct vhost_bufftable *vhost_txq;
1127 uint16_t lcore_id = rte_lcore_id();
1128 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1129
1130 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1131 if (!dst_vdev)
1132 return -1;
1133
1134 if (vdev->vid == dst_vdev->vid) {
1135 RTE_LOG_DP(DEBUG, VHOST_DATA,
1136 "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1137 vdev->vid);
1138 return 0;
1139 }
1140
1141 RTE_LOG_DP(DEBUG, VHOST_DATA,
1142 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1143
1144 if (unlikely(dst_vdev->remove)) {
1145 RTE_LOG_DP(DEBUG, VHOST_DATA,
1146 "(%d) device is marked for removal\n", dst_vdev->vid);
1147 return 0;
1148 }
1149
1150 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1151 vhost_txq->m_table[vhost_txq->len++] = m;
1152
1153 if (enable_stats) {
1154 vdev->stats.tx_total++;
1155 vdev->stats.tx++;
1156 }
1157
1158 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1159 drain_vhost(dst_vdev);
1160 vhost_txq->len = 0;
1161 vhost_txq->pre_tsc = rte_rdtsc();
1162 }
1163 return 0;
1164 }
1165
1166 /*
1167 * Check if the destination MAC of a packet belongs to a local VM,
1168 * and if so get its vlan tag and offset.
1169 */
1170 static __rte_always_inline int
1171 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1172 uint32_t *offset, uint16_t *vlan_tag)
1173 {
1174 struct vhost_dev *dst_vdev;
1175 struct rte_ether_hdr *pkt_hdr =
1176 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1177
1178 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1179 if (!dst_vdev)
1180 return 0;
1181
1182 if (vdev->vid == dst_vdev->vid) {
1183 RTE_LOG_DP(DEBUG, VHOST_DATA,
1184 "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1185 vdev->vid);
1186 return -1;
1187 }
1188
1189 /*
1190 * HW vlan strip will reduce the packet length
1191 * by the length of the vlan tag, so the packet
1192 * length needs to be restored by adding it back.
1193 */
1194 *offset = RTE_VLAN_HLEN;
1195 *vlan_tag = vlan_tags[vdev->vid];
1196
1197 RTE_LOG_DP(DEBUG, VHOST_DATA,
1198 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1199 vdev->vid, dst_vdev->vid, *vlan_tag);
1200
1201 return 0;
1202 }
1203
1204 static void virtio_tx_offload(struct rte_mbuf *m)
1205 {
1206 struct rte_net_hdr_lens hdr_lens;
1207 struct rte_ipv4_hdr *ipv4_hdr;
1208 struct rte_tcp_hdr *tcp_hdr;
1209 uint32_t ptype;
1210 void *l3_hdr;
1211
1212 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1213 m->l2_len = hdr_lens.l2_len;
1214 m->l3_len = hdr_lens.l3_len;
1215 m->l4_len = hdr_lens.l4_len;
1216
1217 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1218 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1219 m->l2_len + m->l3_len);
1220
1221 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1222 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1223 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1224 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1225 ipv4_hdr = l3_hdr;
1226 ipv4_hdr->hdr_checksum = 0;
1227 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1228 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1229 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1230 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1231 }
1232 }
1233
1234 static __rte_always_inline void
1235 do_drain_mbuf_table(struct mbuf_table *tx_q)
1236 {
1237 uint16_t count;
1238
1239 count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1240 tx_q->m_table, tx_q->len);
1241 if (unlikely(count < tx_q->len))
1242 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1243
1244 tx_q->len = 0;
1245 }
1246
1247 /*
1248 * This function routes the TX packet to the correct interface. This
1249 * may be a local device or the physical port.
1250 */
1251 static __rte_always_inline void
1252 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1253 {
1254 struct mbuf_table *tx_q;
1255 unsigned offset = 0;
1256 const uint16_t lcore_id = rte_lcore_id();
1257 struct rte_ether_hdr *nh;
1258
1259
1260 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1261 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1262 struct vhost_dev *vdev2;
1263
1264 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1265 if (vdev2 != vdev)
1266 sync_virtio_xmit(vdev2, vdev, m);
1267 }
1268 goto queue2nic;
1269 }
1270
1271 /*check if destination is local VM*/
1272 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1273 return;
1274
1275 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1276 if (unlikely(find_local_dest(vdev, m, &offset,
1277 &vlan_tag) != 0)) {
1278 rte_pktmbuf_free(m);
1279 return;
1280 }
1281 }
1282
1283 RTE_LOG_DP(DEBUG, VHOST_DATA,
1284 "(%d) TX: MAC address is external\n", vdev->vid);
1285
1286 queue2nic:
1287
1288 /*Add packet to the port tx queue*/
1289 tx_q = &lcore_tx_queue[lcore_id];
1290
1291 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1292 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1293 /* Guest has inserted the vlan tag. */
1294 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1295 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1296 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1297 (vh->vlan_tci != vlan_tag_be))
1298 vh->vlan_tci = vlan_tag_be;
1299 } else {
1300 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1301
1302 /*
1303 * Find the right seg to adjust the data len when offset is
1304 * bigger than tail room size.
1305 */
1306 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1307 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1308 m->data_len += offset;
1309 else {
1310 struct rte_mbuf *seg = m;
1311
1312 while ((seg->next != NULL) &&
1313 (offset > rte_pktmbuf_tailroom(seg)))
1314 seg = seg->next;
1315
1316 seg->data_len += offset;
1317 }
1318 m->pkt_len += offset;
1319 }
1320
1321 m->vlan_tci = vlan_tag;
1322 }
1323
1324 if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1325 virtio_tx_offload(m);
1326
1327 tx_q->m_table[tx_q->len++] = m;
1328 if (enable_stats) {
1329 vdev->stats.tx_total++;
1330 vdev->stats.tx++;
1331 }
1332
1333 if (unlikely(tx_q->len == MAX_PKT_BURST))
1334 do_drain_mbuf_table(tx_q);
1335 }
1336
1337
1338 static __rte_always_inline void
1339 drain_mbuf_table(struct mbuf_table *tx_q)
1340 {
1341 static uint64_t prev_tsc;
1342 uint64_t cur_tsc;
1343
1344 if (tx_q->len == 0)
1345 return;
1346
1347 cur_tsc = rte_rdtsc();
1348 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1349 prev_tsc = cur_tsc;
1350
1351 RTE_LOG_DP(DEBUG, VHOST_DATA,
1352 "TX queue drained after timeout with burst size %u\n",
1353 tx_q->len);
1354 do_drain_mbuf_table(tx_q);
1355 }
1356 }
1357
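/* Enqueue a burst to the guest RX ring via the vhost async (DMA) data path, reclaiming completed packets first. */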
1358 uint16_t
1359 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1360 struct rte_mbuf **pkts, uint32_t rx_count)
1361 {
1362 uint16_t enqueue_count;
1363 uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1364
1365 complete_async_pkts(dev);
1366 enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1367 pkts, rx_count, dma_id, 0);
1368
1369 return enqueue_count;
1370 }
1371
1372 uint16_t
1373 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1374 struct rte_mbuf **pkts, uint32_t rx_count)
1375 {
1376 return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1377 }
1378
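/* Receive a burst from the port's VMDq queue bound to this device and enqueue it to the guest RX ring, retrying if enabled. */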
1379 static __rte_always_inline void
1380 drain_eth_rx(struct vhost_dev *vdev)
1381 {
1382 uint16_t rx_count, enqueue_count;
1383 struct rte_mbuf *pkts[MAX_PKT_BURST];
1384
1385 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1386 pkts, MAX_PKT_BURST);
1387
1388 if (!rx_count)
1389 return;
1390
1391 enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1392 VIRTIO_RXQ, pkts, rx_count);
1393
1394 /* Retry if necessary */
1395 if (enable_retry && unlikely(enqueue_count < rx_count)) {
1396 uint32_t retry = 0;
1397
1398 while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
1399 rte_delay_us(burst_rx_delay_time);
1400 enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1401 VIRTIO_RXQ, &pkts[enqueue_count],
1402 rx_count - enqueue_count);
1403 }
1404 }
1405
1406 if (enable_stats) {
1407 rte_atomic_fetch_add_explicit(&vdev->stats.rx_total_atomic, rx_count,
1408 rte_memory_order_seq_cst);
1409 rte_atomic_fetch_add_explicit(&vdev->stats.rx_atomic, enqueue_count,
1410 rte_memory_order_seq_cst);
1411 }
1412
1413 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1414 free_pkts(pkts, rx_count);
1415 } else {
1416 uint16_t enqueue_fail = rx_count - enqueue_count;
1417 if (enqueue_fail > 0)
1418 free_pkts(&pkts[enqueue_count], enqueue_fail);
1419 }
1420 }
1421
1422 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1423 struct rte_mempool *mbuf_pool,
1424 struct rte_mbuf **pkts, uint16_t count)
1425 {
1426 int nr_inflight;
1427 uint16_t dequeue_count;
1428 int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1429
1430 dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1431 mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1432
1433 return dequeue_count;
1434 }
1435
1436 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1437 struct rte_mempool *mbuf_pool,
1438 struct rte_mbuf **pkts, uint16_t count)
1439 {
1440 return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1441 }
1442
1443 static __rte_always_inline void
1444 drain_virtio_tx(struct vhost_dev *vdev)
1445 {
1446 struct rte_mbuf *pkts[MAX_PKT_BURST];
1447 uint16_t count;
1448 uint16_t i;
1449
1450 count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1451 VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1452
1453 /* setup VMDq for the first packet */
1454 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1455 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1456 free_pkts(pkts, count);
1457 }
1458
1459 for (i = 0; i < count; ++i)
1460 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1461 }
1462
1463 /*
1464 * Main function of vhost-switch. It basically does:
1465 *
1466 * for each vhost device {
1467 * - drain_eth_rx()
1468 *
1469 * Which drains the host eth Rx queue linked to the vhost device,
1470 * and delivers all of them to the guest virtio Rx ring associated with
1471 * this vhost device.
1472 *
1473 * - drain_virtio_tx()
1474 *
1475 * Which drains the guest virtio Tx queue and delivers all of them
1476 * to the target, which could be another vhost device, or the
1477 * physical eth dev. The route is done in function "virtio_tx_route".
1478 * }
1479 */
1480 static int
1481 switch_worker(void *arg __rte_unused)
1482 {
1483 unsigned i;
1484 unsigned lcore_id = rte_lcore_id();
1485 struct vhost_dev *vdev;
1486 struct mbuf_table *tx_q;
1487
1488 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1489
1490 tx_q = &lcore_tx_queue[lcore_id];
1491 for (i = 0; i < rte_lcore_count(); i++) {
1492 if (lcore_ids[i] == lcore_id) {
1493 tx_q->txq_id = i;
1494 break;
1495 }
1496 }
1497
1498 while(1) {
1499 drain_mbuf_table(tx_q);
1500 drain_vhost_table();
1501 /*
1502 * Inform the configuration core that we have exited the
1503 * linked list and that no devices are in use if requested.
1504 */
1505 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1506 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1507
1508 /*
1509 * Process vhost devices
1510 */
1511 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1512 lcore_vdev_entry) {
1513 if (unlikely(vdev->remove)) {
1514 unlink_vmdq(vdev);
1515 vdev->ready = DEVICE_SAFE_REMOVE;
1516 continue;
1517 }
1518
1519 if (likely(vdev->ready == DEVICE_RX))
1520 drain_eth_rx(vdev);
1521
1522 if (likely(!vdev->remove))
1523 drain_virtio_tx(vdev);
1524 }
1525 }
1526
1527 return 0;
1528 }
1529
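/* Drain all in-flight async packets of a vhost queue (thread-unsafe variant, used from vring_state_changed). */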
1530 static void
1531 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1532 {
1533 uint16_t n_pkt = 0;
1534 int pkts_inflight;
1535
1536 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1537 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1538
1539 struct rte_mbuf *m_cpl[pkts_inflight];
1540
1541 while (pkts_inflight) {
1542 n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1543 pkts_inflight, dma_id, 0);
1544 free_pkts(m_cpl, n_pkt);
1545 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1546 queue_id);
1547 }
1548 }
1549
1550 static void
1551 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
1552 {
1553 uint16_t n_pkt = 0;
1554 int pkts_inflight;
1555
1556 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1557 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1558
1559 struct rte_mbuf *m_cpl[pkts_inflight];
1560
1561 while (pkts_inflight) {
1562 n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
1563 pkts_inflight, dma_id, 0);
1564 free_pkts(m_cpl, n_pkt);
1565 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1566 }
1567 }
1568
1569 /*
1570 * Remove a device from the specific data core linked list and from the
1571 * main linked list. Synchronization occurs through the use of the
1572 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1573 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1574 */
1575 static void
1576 destroy_device(int vid)
1577 {
1578 struct vhost_dev *vdev = NULL;
1579 int lcore;
1580 uint16_t i;
1581
1582 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1583 if (vdev->vid == vid)
1584 break;
1585 }
1586 if (!vdev)
1587 return;
1588 /*set the remove flag. */
1589 vdev->remove = 1;
1590 while(vdev->ready != DEVICE_SAFE_REMOVE) {
1591 rte_pause();
1592 }
1593
1594 for (i = 0; i < RTE_MAX_LCORE; i++)
1595 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1596
1597 if (builtin_net_driver)
1598 vs_vhost_net_remove(vdev);
1599
1600 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1601 lcore_vdev_entry);
1602 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1603
1604
1605 /* Set the dev_removal_flag on each lcore. */
1606 RTE_LCORE_FOREACH_WORKER(lcore)
1607 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1608
1609 /*
1610 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1611 * we can be sure that they can no longer access the device removed
1612 * from the linked lists and that the devices are no longer in use.
1613 */
1614 RTE_LCORE_FOREACH_WORKER(lcore) {
1615 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1616 rte_pause();
1617 }
1618
1619 lcore_info[vdev->coreid].device_num--;
1620
1621 RTE_LOG(INFO, VHOST_DATA,
1622 "(%d) device has been removed from data core\n",
1623 vdev->vid);
1624
1625 if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1626 vhost_clear_queue(vdev, VIRTIO_RXQ);
1627 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1628 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1629 }
1630
1631 if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1632 vhost_clear_queue(vdev, VIRTIO_TXQ);
1633 rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1634 dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1635 }
1636
1637 rte_free(vdev);
1638 }
1639
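/* Map a vhost device ID to the index of its socket file, which is also the index into dma_bind[]. */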
1640 static inline int
1641 get_socketid_by_vid(int vid)
1642 {
1643 int i;
1644 char ifname[PATH_MAX];
1645 rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1646
1647 for (i = 0; i < nb_sockets; i++) {
1648 char *file = socket_files + i * PATH_MAX;
1649 if (strcmp(file, ifname) == 0)
1650 return i;
1651 }
1652
1653 return -1;
1654 }
1655
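/* Select the enqueue/dequeue callbacks for a device: builtin net driver, async (DMA) path, or sync vhost API. */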
1656 static int
1657 init_vhost_queue_ops(int vid)
1658 {
1659 if (builtin_net_driver) {
1660 vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1661 vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1662 } else {
1663 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1664 vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1665 else
1666 vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1667
1668 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1669 vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1670 else
1671 vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1672 }
1673
1674 return 0;
1675 }
1676
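/* Register async channels for the RX/TX queues that have a DMA device bound to this socket. */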
1677 static inline int
1678 vhost_async_channel_register(int vid)
1679 {
1680 int rx_ret = 0, tx_ret = 0;
1681
1682 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1683 rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1684 if (rx_ret == 0)
1685 dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1686 }
1687
1688 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1689 tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1690 if (tx_ret == 0)
1691 dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1692 }
1693
1694 return rx_ret | tx_ret;
1695 }
1696
1697
1698
1699 /*
1700 * A new device is added to a data core. First the device is added to the main linked list
1701 * and then allocated to a specific data core.
1702 */
1703 static int
1704 new_device(int vid)
1705 {
1706 int lcore, core_add = 0;
1707 uint16_t i;
1708 uint32_t device_num_min = num_devices;
1709 struct vhost_dev *vdev;
1710 int ret;
1711
1712 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1713 if (vdev == NULL) {
1714 RTE_LOG(INFO, VHOST_DATA,
1715 "(%d) couldn't allocate memory for vhost dev\n",
1716 vid);
1717 return -1;
1718 }
1719 vdev->vid = vid;
1720
1721 for (i = 0; i < RTE_MAX_LCORE; i++) {
1722 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1723 = rte_zmalloc("vhost bufftable",
1724 sizeof(struct vhost_bufftable),
1725 RTE_CACHE_LINE_SIZE);
1726
1727 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1728 RTE_LOG(INFO, VHOST_DATA,
1729 "(%d) couldn't allocate memory for vhost TX\n", vid);
1730 return -1;
1731 }
1732 }
1733
1734 int socketid = get_socketid_by_vid(vid);
1735 if (socketid == -1)
1736 return -1;
1737
1738 init_vid2socketid_array(vid, socketid);
1739
1740 ret = vhost_async_channel_register(vid);
1741
1742 if (init_vhost_queue_ops(vid) != 0)
1743 return -1;
1744
1745 if (builtin_net_driver)
1746 vs_vhost_net_setup(vdev);
1747
1748 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1749 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1750
1751 /*reset ready flag*/
1752 vdev->ready = DEVICE_MAC_LEARNING;
1753 vdev->remove = 0;
1754
1755 /* Find a suitable lcore to add the device. */
1756 RTE_LCORE_FOREACH_WORKER(lcore) {
1757 if (lcore_info[lcore].device_num < device_num_min) {
1758 device_num_min = lcore_info[lcore].device_num;
1759 core_add = lcore;
1760 }
1761 }
1762 vdev->coreid = core_add;
1763
1764 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1765 lcore_vdev_entry);
1766 lcore_info[vdev->coreid].device_num++;
1767
1768 /* Disable notifications. */
1769 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1770 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1771
1772 RTE_LOG(INFO, VHOST_DATA,
1773 "(%d) device has been added to data core %d\n",
1774 vid, vdev->coreid);
1775
1776 return ret;
1777 }
1778
1779 static int
1780 vring_state_changed(int vid, uint16_t queue_id, int enable)
1781 {
1782 struct vhost_dev *vdev = NULL;
1783
1784 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1785 if (vdev->vid == vid)
1786 break;
1787 }
1788 if (!vdev)
1789 return -1;
1790
1791 if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1792 if (!enable)
1793 vhost_clear_queue_thread_unsafe(vdev, queue_id);
1794 }
1795
1796 return 0;
1797 }
1798
1799 /*
1800 * These callbacks allow devices to be added to the data core when configuration
1801 * has fully completed.
1802 */
1803 static const struct rte_vhost_device_ops virtio_net_device_ops =
1804 {
1805 .new_device = new_device,
1806 .destroy_device = destroy_device,
1807 .vring_state_changed = vring_state_changed,
1808 };
1809
1810 /*
1811 * This thread wakes up periodically to print stats if the user has
1812 * enabled them.
1813 */
1814 static uint32_t
1815 print_stats(__rte_unused void *arg)
1816 {
1817 struct vhost_dev *vdev;
1818 uint64_t tx_dropped, rx_dropped;
1819 uint64_t tx, tx_total, rx, rx_total;
1820 const char clr[] = { 27, '[', '2', 'J', '\0' };
1821 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1822
1823 while(1) {
1824 sleep(enable_stats);
1825
1826 /* Clear screen and move to top left */
1827 printf("%s%s\n", clr, top_left);
1828 printf("Device statistics =================================\n");
1829
1830 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1831 tx_total = vdev->stats.tx_total;
1832 tx = vdev->stats.tx;
1833 tx_dropped = tx_total - tx;
1834
1835 rx_total = rte_atomic_load_explicit(&vdev->stats.rx_total_atomic,
1836 rte_memory_order_seq_cst);
1837 rx = rte_atomic_load_explicit(&vdev->stats.rx_atomic,
1838 rte_memory_order_seq_cst);
1839 rx_dropped = rx_total - rx;
1840
1841 printf("Statistics for device %d\n"
1842 "-----------------------\n"
1843 "TX total: %" PRIu64 "\n"
1844 "TX dropped: %" PRIu64 "\n"
1845 "TX successful: %" PRIu64 "\n"
1846 "RX total: %" PRIu64 "\n"
1847 "RX dropped: %" PRIu64 "\n"
1848 "RX successful: %" PRIu64 "\n",
1849 vdev->vid,
1850 tx_total, tx_dropped, tx,
1851 rx_total, rx_dropped, rx);
1852 }
1853
1854 printf("===================================================\n");
1855
1856 fflush(stdout);
1857 }
1858
1859 return 0;
1860 }
1861
1862 static void
1863 unregister_drivers(int socket_num)
1864 {
1865 int i, ret;
1866
1867 for (i = 0; i < socket_num; i++) {
1868 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1869 if (ret != 0)
1870 RTE_LOG(ERR, VHOST_CONFIG,
1871 "Fail to unregister vhost driver for %s.\n",
1872 socket_files + i * PATH_MAX);
1873 }
1874 }
1875
1876 /* When we receive an INT signal, unregister the vhost driver */
1877 static void
1878 sigint_handler(__rte_unused int signum)
1879 {
1880 /* Unregister vhost driver. */
1881 unregister_drivers(nb_sockets);
1882
1883 exit(0);
1884 }
1885
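/* Mark all DMA bindings as unset before command-line parsing. */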
1886 static void
1887 reset_dma(void)
1888 {
1889 int i;
1890
1891 for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1892 int j;
1893
1894 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1895 dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1896 dma_bind[i].dmas[j].async_enabled = false;
1897 }
1898 }
1899
1900 for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1901 dmas_id[i] = INVALID_DMA_ID;
1902 }
1903
1904 /*
1905 * Main function, does initialisation and calls the per-lcore functions.
1906 */
1907 int
1908 main(int argc, char *argv[])
1909 {
1910 unsigned lcore_id, core_id = 0;
1911 unsigned nb_ports, valid_num_ports;
1912 int ret, i;
1913 uint16_t portid;
1914 rte_thread_t tid;
1915 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1916
1917 signal(SIGINT, sigint_handler);
1918
1919 /* init EAL */
1920 ret = rte_eal_init(argc, argv);
1921 if (ret < 0)
1922 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1923 argc -= ret;
1924 argv += ret;
1925
1926 /* initialize dma structures */
1927 reset_dma();
1928
1929 /* parse app arguments */
1930 ret = us_vhost_parse_args(argc, argv);
1931 if (ret < 0)
1932 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1933
1934 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1935 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1936
1937 if (rte_lcore_is_enabled(lcore_id))
1938 lcore_ids[core_id++] = lcore_id;
1939 }
1940
1941 if (rte_lcore_count() > RTE_MAX_LCORE)
1942 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1943
1944 /* Get the number of physical ports. */
1945 nb_ports = rte_eth_dev_count_avail();
1946
1947 /*
1948 * Update the global var NUM_PORTS and global array PORTS,
1949 * and get the value of VALID_NUM_PORTS according to the number of system ports
1950 */
1951 valid_num_ports = check_ports_num(nb_ports);
1952
1953 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1954 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1955 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1956 return -1;
1957 }
1958
1959 /*
1960 * FIXME: here we are trying to allocate mbufs big enough for
1961 * @MAX_QUEUES, but the truth is we're never going to use that
1962 * many queues here. We probably should only do allocation for
1963 * those queues we are going to use.
1964 */
1965 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1966 MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1967 rte_socket_id());
1968 if (mbuf_pool == NULL)
1969 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1970
1971 if (vm2vm_mode == VM2VM_HARDWARE) {
1972 /* Enable VT loop back to let L2 switch to do it. */
1973 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1974 RTE_LOG(DEBUG, VHOST_CONFIG,
1975 "Enable loop back for L2 switch in vmdq.\n");
1976 }
1977
1978 /* initialize all ports */
1979 RTE_ETH_FOREACH_DEV(portid) {
1980 /* skip ports that are not enabled */
1981 if ((enabled_port_mask & (1 << portid)) == 0) {
1982 RTE_LOG(INFO, VHOST_PORT,
1983 "Skipping disabled port %d\n", portid);
1984 continue;
1985 }
1986 if (port_init(portid) != 0)
1987 rte_exit(EXIT_FAILURE,
1988 "Cannot initialize network ports\n");
1989 }
1990
1991 /* Enable stats if the user option is set. */
1992 if (enable_stats) {
1993 ret = rte_thread_create_control(&tid, "dpdk-vhost-stat",
1994 print_stats, NULL);
1995 if (ret < 0)
1996 rte_exit(EXIT_FAILURE,
1997 "Cannot create dpdk-vhost-stat thread\n");
1998 }
1999
2000 /* Launch all data cores. */
2001 RTE_LCORE_FOREACH_WORKER(lcore_id)
2002 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
2003
2004 if (client_mode)
2005 flags |= RTE_VHOST_USER_CLIENT;
2006
2007 for (i = 0; i < dma_count; i++) {
2008 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
2009 RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
2010 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2011 }
2012 }
2013
2014 /* Register vhost user driver to handle vhost messages. */
2015 for (i = 0; i < nb_sockets; i++) {
2016 char *file = socket_files + i * PATH_MAX;
2017
2018 if (dma_count && get_async_flag_by_socketid(i) != 0)
2019 flags = flags | RTE_VHOST_USER_ASYNC_COPY;
2020
2021 ret = rte_vhost_driver_register(file, flags);
2022 if (ret != 0) {
2023 unregister_drivers(i);
2024 rte_exit(EXIT_FAILURE,
2025 "vhost driver register failure.\n");
2026 }
2027
2028 if (builtin_net_driver)
2029 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2030
2031 if (mergeable == 0) {
2032 rte_vhost_driver_disable_features(file,
2033 1ULL << VIRTIO_NET_F_MRG_RXBUF);
2034 }
2035
2036 if (enable_tx_csum == 0) {
2037 rte_vhost_driver_disable_features(file,
2038 1ULL << VIRTIO_NET_F_CSUM);
2039 }
2040
2041 if (enable_tso == 0) {
2042 rte_vhost_driver_disable_features(file,
2043 1ULL << VIRTIO_NET_F_HOST_TSO4);
2044 rte_vhost_driver_disable_features(file,
2045 1ULL << VIRTIO_NET_F_HOST_TSO6);
2046 rte_vhost_driver_disable_features(file,
2047 1ULL << VIRTIO_NET_F_GUEST_TSO4);
2048 rte_vhost_driver_disable_features(file,
2049 1ULL << VIRTIO_NET_F_GUEST_TSO6);
2050 }
2051
2052 if (promiscuous) {
2053 rte_vhost_driver_enable_features(file,
2054 1ULL << VIRTIO_NET_F_CTRL_RX);
2055 }
2056
2057 ret = rte_vhost_driver_callback_register(file,
2058 &virtio_net_device_ops);
2059 if (ret != 0) {
2060 rte_exit(EXIT_FAILURE,
2061 "failed to register vhost driver callbacks.\n");
2062 }
2063
2064 if (rte_vhost_driver_start(file) < 0) {
2065 rte_exit(EXIT_FAILURE,
2066 "failed to start vhost driver.\n");
2067 }
2068 }
2069
2070 RTE_LCORE_FOREACH_WORKER(lcore_id)
2071 rte_eal_wait_lcore(lcore_id);
2072
2073 for (i = 0; i < dma_count; i++) {
2074 if (rte_vhost_async_dma_unconfigure(dmas_id[i], 0) < 0) {
2075 RTE_LOG(ERR, VHOST_PORT,
2076 "Failed to unconfigure DMA %d in vhost.\n", dmas_id[i]);
2077 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2078 }
2079 }
2080
2081 /* clean up the EAL */
2082 rte_eal_cleanup();
2083
2084 return 0;
2085 }
2086