xref: /dpdk/examples/vhost/main.c (revision a543dcb70c5ca87bb95dcf419743dc36f4412b63)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29 
30 #include "main.h"
31 
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35 
36 #define NUM_MBUFS_DEFAULT 0x24000
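/* 0x24000 == 147456 mbufs, which matches the default quoted in the usage text below. */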
37 
38 /* the maximum number of external ports supported */
39 #define MAX_SUP_PORTS 1
40 
41 #define MBUF_CACHE_SIZE	128
42 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
43 
44 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
45 
46 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
47 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
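/*
 * With these defaults, a full Rx retry cycle waits at most
 * BURST_RX_RETRIES * BURST_RX_WAIT_US = 4 * 15 = 60 us before giving up.
 */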
48 
49 #define JUMBO_FRAME_MAX_SIZE    0x2600
50 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
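/*
 * For example: JUMBO_FRAME_MAX_SIZE is 0x2600 (9728 bytes), so with the 14-byte
 * Ethernet header and 4-byte CRC, MAX_MTU works out to 9728 - 18 = 9710 bytes.
 */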
51 
52 /* State of virtio device. */
53 #define DEVICE_MAC_LEARNING 0
54 #define DEVICE_RX			1
55 #define DEVICE_SAFE_REMOVE	2
56 
57 /* Configurable number of RX/TX ring descriptors */
58 #define RTE_TEST_RX_DESC_DEFAULT 1024
59 #define RTE_TEST_TX_DESC_DEFAULT 512
60 
61 #define INVALID_PORT_ID 0xFF
62 #define INVALID_DMA_ID -1
63 
64 #define DMA_RING_SIZE 4096
65 
66 #define ASYNC_ENQUEUE_VHOST 1
67 #define ASYNC_DEQUEUE_VHOST 2
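/*
 * These flags are OR-ed into dma_bind[socket].async_flag by open_dma() and
 * later checked via get_async_flag_by_socketid() to decide whether the
 * corresponding socket is registered with RTE_VHOST_USER_ASYNC_COPY.
 */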
68 
69 /* number of mbufs in all pools - if specified on command-line. */
70 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
71 
72 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
73 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
74 static int dma_count;
75 
76 /* mask of enabled ports */
77 static uint32_t enabled_port_mask = 0;
78 
79 /* Promiscuous mode */
80 static uint32_t promiscuous;
81 
82 /* number of devices/queues to support */
83 static uint32_t num_queues = 0;
84 static uint32_t num_devices;
85 
86 static struct rte_mempool *mbuf_pool;
87 static int mergeable;
88 
89 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
90 typedef enum {
91 	VM2VM_DISABLED = 0,
92 	VM2VM_SOFTWARE = 1,
93 	VM2VM_HARDWARE = 2,
94 	VM2VM_LAST
95 } vm2vm_type;
96 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
97 
98 /* Enable stats. */
99 static uint32_t enable_stats = 0;
100 /* Enable retries on RX. */
101 static uint32_t enable_retry = 1;
102 
103 /* Tx checksum offload; disabled by default, enabled with --tx-csum 1 */
104 static uint32_t enable_tx_csum;
105 
106 /* TSO offload; disabled by default, enabled with --tso 1 */
107 static uint32_t enable_tso;
108 
109 static int client_mode;
110 
111 static int builtin_net_driver;
112 
113 /* Specify timeout (in microseconds) between retries on RX. */
114 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
115 /* Specify the number of retries on RX. */
116 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
117 
118 /* Socket file paths. Can be set by user */
119 static char *socket_files;
120 static int nb_sockets;
121 
122 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
123 
124 /* empty VMDq configuration structure. Filled in programmatically */
125 static struct rte_eth_conf vmdq_conf_default = {
126 	.rxmode = {
127 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
128 		.split_hdr_size = 0,
129 		/*
130 		 * VLAN strip is necessary for 1G NICs such as the I350;
131 		 * it fixes a bug where IPv4 forwarding in the guest could not
132 		 * forward packets from one virtio device to another.
133 		 */
134 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
135 	},
136 
137 	.txmode = {
138 		.mq_mode = RTE_ETH_MQ_TX_NONE,
139 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
140 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
141 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
142 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
143 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
144 	},
145 	.rx_adv_conf = {
146 		/*
147 		 * should be overridden separately in code with
148 		 * appropriate values
149 		 */
150 		.vmdq_rx_conf = {
151 			.nb_queue_pools = RTE_ETH_8_POOLS,
152 			.enable_default_pool = 0,
153 			.default_pool = 0,
154 			.nb_pool_maps = 0,
155 			.pool_map = {{0, 0},},
156 		},
157 	},
158 };
159 
160 
161 static unsigned lcore_ids[RTE_MAX_LCORE];
162 static uint16_t ports[RTE_MAX_ETHPORTS];
163 static unsigned num_ports = 0; /**< The number of ports specified in command line */
164 static uint16_t num_pf_queues, num_vmdq_queues;
165 static uint16_t vmdq_pool_base, vmdq_queue_base;
166 static uint16_t queues_per_pool;
167 
168 const uint16_t vlan_tags[] = {
169 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
170 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
171 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
172 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
173 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
174 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
175 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
176 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
177 };
178 
179 /* ethernet addresses of ports */
180 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
181 
182 static struct vhost_dev_tailq_list vhost_dev_list =
183 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
184 
185 static struct lcore_info lcore_info[RTE_MAX_LCORE];
186 
187 /* Used for queueing bursts of TX packets. */
188 struct mbuf_table {
189 	unsigned len;
190 	unsigned txq_id;
191 	struct rte_mbuf *m_table[MAX_PKT_BURST];
192 };
193 
194 struct vhost_bufftable {
195 	uint32_t len;
196 	uint64_t pre_tsc;
197 	struct rte_mbuf *m_table[MAX_PKT_BURST];
198 };
199 
200 /* TX queue for each data core. */
201 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
202 
203 /*
204  * Vhost TX buffer for each data core.
205  * Every data core maintains a TX buffer for every vhost device,
206  * which is used to batch packet enqueues for higher performance.
207  */
208 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
209 
210 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
211 				 / US_PER_S * BURST_TX_DRAIN_US)
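/*
 * For example, assuming a 2 GHz TSC: (2e9 + 1e6 - 1) / 1e6 * 100 = 200000
 * TSC cycles, i.e. roughly BURST_TX_DRAIN_US (100 us) between forced drains.
 */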
212 
213 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
214 
215 static inline uint32_t
216 get_async_flag_by_socketid(int socketid)
217 {
218 	return dma_bind[socketid].async_flag;
219 }
220 
221 static inline void
222 init_vid2socketid_array(int vid, int socketid)
223 {
224 	vid2socketid[vid] = socketid;
225 }
226 
227 static inline bool
228 is_dma_configured(int16_t dev_id)
229 {
230 	int i;
231 
232 	for (i = 0; i < dma_count; i++)
233 		if (dmas_id[i] == dev_id)
234 			return true;
235 	return false;
236 }
237 
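/*
 * Parse the --dmas argument. The expected format (illustrative example; the
 * DMA device names depend on the platform) is:
 *     --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
 * where "txd"/"rxd" selects the enqueue/dequeue direction, the trailing
 * number is the vhost socket index, and the name after '@' is resolved with
 * rte_dma_get_dev_id_by_name().
 */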
238 static inline int
239 open_dma(const char *value)
240 {
241 	struct dma_for_vhost *dma_info = dma_bind;
242 	char *input = strndup(value, strlen(value) + 1);
243 	char *addrs = input;
244 	char *ptrs[2];
245 	char *start, *end, *substr;
246 	int64_t socketid, vring_id;
247 
248 	struct rte_dma_info info;
249 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
250 	struct rte_dma_vchan_conf qconf = {
251 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
252 		.nb_desc = DMA_RING_SIZE
253 	};
254 
255 	int dev_id;
256 	int ret = 0;
257 	uint16_t i = 0;
258 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
259 	int args_nr;
260 
261 	while (isblank(*addrs))
262 		addrs++;
263 	if (*addrs == '\0') {
264 		ret = -1;
265 		goto out;
266 	}
267 
268 	/* Process the DMA devices listed within the brackets. */
269 	addrs++;
270 	substr = strtok(addrs, ";]");
271 	if (!substr) {
272 		ret = -1;
273 		goto out;
274 	}
275 
276 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
277 	if (args_nr <= 0) {
278 		ret = -1;
279 		goto out;
280 	}
281 
282 	while (i < args_nr) {
283 		char *arg_temp = dma_arg[i];
284 		char *txd, *rxd;
285 		uint8_t sub_nr;
286 		int async_flag;
287 
288 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
289 		if (sub_nr != 2) {
290 			ret = -1;
291 			goto out;
292 		}
293 
294 		txd = strstr(ptrs[0], "txd");
295 		rxd = strstr(ptrs[0], "rxd");
296 		if (txd) {
297 			start = txd;
298 			vring_id = VIRTIO_RXQ;
299 			async_flag = ASYNC_ENQUEUE_VHOST;
300 		} else if (rxd) {
301 			start = rxd;
302 			vring_id = VIRTIO_TXQ;
303 			async_flag = ASYNC_DEQUEUE_VHOST;
304 		} else {
305 			ret = -1;
306 			goto out;
307 		}
308 
309 		start += 3;
310 		socketid = strtol(start, &end, 0);
311 		if (end == start) {
312 			ret = -1;
313 			goto out;
314 		}
315 
316 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
317 		if (dev_id < 0) {
318 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
319 			ret = -1;
320 			goto out;
321 		}
322 
323 		/* DMA device is already configured, so skip */
324 		if (is_dma_configured(dev_id))
325 			goto done;
326 
327 		if (rte_dma_info_get(dev_id, &info) != 0) {
328 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
329 			ret = -1;
330 			goto out;
331 		}
332 
333 		if (info.max_vchans < 1) {
334 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
335 			ret = -1;
336 			goto out;
337 		}
338 
339 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
340 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
341 			ret = -1;
342 			goto out;
343 		}
344 
345 		/* Check the max desc supported by DMA device */
346 		rte_dma_info_get(dev_id, &info);
347 		if (info.nb_vchans != 1) {
348 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
349 					dev_id);
350 			ret = -1;
351 			goto out;
352 		}
353 
354 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
355 
356 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
357 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
358 			ret = -1;
359 			goto out;
360 		}
361 
362 		if (rte_dma_start(dev_id) != 0) {
363 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %u.\n", dev_id);
364 			ret = -1;
365 			goto out;
366 		}
367 
368 		dmas_id[dma_count++] = dev_id;
369 
370 done:
371 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
372 		(dma_info + socketid)->async_flag |= async_flag;
373 		i++;
374 	}
375 out:
376 	free(input);
377 	return ret;
378 }
379 
380 /*
381  * Builds up the correct configuration for VMDQ VLAN pool map
382  * according to the pool & queue limits.
383  */
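/*
 * For example, if num_devices is 8 this yields
 * pool_map[i] = { .vlan_id = vlan_tags[i], .pools = 1UL << i }, i.e.
 * VLAN 1000 -> pool 0, VLAN 1001 -> pool 1, ..., VLAN 1007 -> pool 7.
 */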
384 static inline int
385 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
386 {
387 	struct rte_eth_vmdq_rx_conf conf;
388 	struct rte_eth_vmdq_rx_conf *def_conf =
389 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
390 	unsigned i;
391 
392 	memset(&conf, 0, sizeof(conf));
393 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
394 	conf.nb_pool_maps = num_devices;
395 	conf.enable_loop_back = def_conf->enable_loop_back;
396 	conf.rx_mode = def_conf->rx_mode;
397 
398 	for (i = 0; i < conf.nb_pool_maps; i++) {
399 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
400 		conf.pool_map[i].pools = (1UL << i);
401 	}
402 
403 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
404 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
405 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
406 	return 0;
407 }
408 
409 /*
410  * Initialises a given port using global settings, with the Rx buffers
411  * coming from the global mbuf_pool.
412  */
413 static inline int
414 port_init(uint16_t port)
415 {
416 	struct rte_eth_dev_info dev_info;
417 	struct rte_eth_conf port_conf;
418 	struct rte_eth_rxconf *rxconf;
419 	struct rte_eth_txconf *txconf;
420 	int16_t rx_rings, tx_rings;
421 	uint16_t rx_ring_size, tx_ring_size;
422 	int retval;
423 	uint16_t q;
424 
425 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
426 	retval = rte_eth_dev_info_get(port, &dev_info);
427 	if (retval != 0) {
428 		RTE_LOG(ERR, VHOST_PORT,
429 			"Error during getting device (port %u) info: %s\n",
430 			port, strerror(-retval));
431 
432 		return retval;
433 	}
434 
435 	rxconf = &dev_info.default_rxconf;
436 	txconf = &dev_info.default_txconf;
437 	rxconf->rx_drop_en = 1;
438 
439 	/* Configure the number of supported virtio devices based on VMDQ limits. */
440 	num_devices = dev_info.max_vmdq_pools;
441 
442 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
443 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
444 
445 	tx_rings = (uint16_t)rte_lcore_count();
446 
447 	if (mergeable) {
448 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
449 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
450 		else
451 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
452 	}
453 
454 	/* Get port configuration. */
455 	retval = get_eth_conf(&port_conf, num_devices);
456 	if (retval < 0)
457 		return retval;
458 	/* NIC queues are divided into PF queues and VMDQ queues. */
459 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
460 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
461 	num_vmdq_queues = num_devices * queues_per_pool;
462 	num_queues = num_pf_queues + num_vmdq_queues;
463 	vmdq_queue_base = dev_info.vmdq_queue_base;
464 	vmdq_pool_base  = dev_info.vmdq_pool_base;
465 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
466 		num_pf_queues, num_devices, queues_per_pool);
467 
468 	if (!rte_eth_dev_is_valid_port(port))
469 		return -1;
470 
471 	rx_rings = (uint16_t)dev_info.max_rx_queues;
472 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
473 		port_conf.txmode.offloads |=
474 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
475 	/* Configure ethernet device. */
476 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
477 	if (retval != 0) {
478 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
479 			port, strerror(-retval));
480 		return retval;
481 	}
482 
483 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
484 		&tx_ring_size);
485 	if (retval != 0) {
486 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
487 			"for port %u: %s.\n", port, strerror(-retval));
488 		return retval;
489 	}
490 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
491 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
492 			"for Rx queues on port %u.\n", port);
493 		return -1;
494 	}
495 
496 	/* Setup the queues. */
497 	rxconf->offloads = port_conf.rxmode.offloads;
498 	for (q = 0; q < rx_rings; q ++) {
499 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
500 						rte_eth_dev_socket_id(port),
501 						rxconf,
502 						mbuf_pool);
503 		if (retval < 0) {
504 			RTE_LOG(ERR, VHOST_PORT,
505 				"Failed to setup rx queue %u of port %u: %s.\n",
506 				q, port, strerror(-retval));
507 			return retval;
508 		}
509 	}
510 	txconf->offloads = port_conf.txmode.offloads;
511 	for (q = 0; q < tx_rings; q ++) {
512 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
513 						rte_eth_dev_socket_id(port),
514 						txconf);
515 		if (retval < 0) {
516 			RTE_LOG(ERR, VHOST_PORT,
517 				"Failed to setup tx queue %u of port %u: %s.\n",
518 				q, port, strerror(-retval));
519 			return retval;
520 		}
521 	}
522 
523 	/* Start the device. */
524 	retval  = rte_eth_dev_start(port);
525 	if (retval < 0) {
526 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
527 			port, strerror(-retval));
528 		return retval;
529 	}
530 
531 	if (promiscuous) {
532 		retval = rte_eth_promiscuous_enable(port);
533 		if (retval != 0) {
534 			RTE_LOG(ERR, VHOST_PORT,
535 				"Failed to enable promiscuous mode on port %u: %s\n",
536 				port, rte_strerror(-retval));
537 			return retval;
538 		}
539 	}
540 
541 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
542 	if (retval < 0) {
543 		RTE_LOG(ERR, VHOST_PORT,
544 			"Failed to get MAC address on port %u: %s\n",
545 			port, rte_strerror(-retval));
546 		return retval;
547 	}
548 
549 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
550 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
551 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
552 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
553 
554 	return 0;
555 }
556 
557 /*
558  * Set socket file path.
559  */
560 static int
561 us_vhost_parse_socket_path(const char *q_arg)
562 {
563 	char *old;
564 
565 	/* Reject paths that do not fit within PATH_MAX. */
566 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
567 		return -1;
568 
569 	old = socket_files;
570 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
571 	if (socket_files == NULL) {
572 		free(old);
573 		return -1;
574 	}
575 
576 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
577 	nb_sockets++;
578 
579 	return 0;
580 }
581 
582 /*
583  * Parse the portmask provided at run time.
584  */
585 static int
586 parse_portmask(const char *portmask)
587 {
588 	char *end = NULL;
589 	unsigned long pm;
590 
591 	errno = 0;
592 
593 	/* parse hexadecimal string */
594 	pm = strtoul(portmask, &end, 16);
595 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
596 		return 0;
597 
598 	return pm;
599 
600 }
601 
602 /*
603  * Parse numeric options at run time.
604  */
605 static int
606 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
607 {
608 	char *end = NULL;
609 	unsigned long num;
610 
611 	errno = 0;
612 
613 	/* parse unsigned int string */
614 	num = strtoul(q_arg, &end, 10);
615 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
616 		return -1;
617 
618 	if (num > max_valid_value)
619 		return -1;
620 
621 	return num;
622 
623 }
624 
625 /*
626  * Display usage
627  */
628 static void
629 us_vhost_usage(const char *prgname)
630 {
631 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
632 	"		--vm2vm [0|1|2]\n"
633 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
634 	"		--socket-file <path>\n"
635 	"		--nb-devices ND\n"
636 	"		-p PORTMASK: Set mask for ports to be used by application\n"
637 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
638 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
639 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
640 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
641 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
642 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
643 	"		--socket-file: The path of the socket file.\n"
644 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
645 	"		--tso [0|1] disable/enable TCP segment offload.\n"
646 	"		--client register a vhost-user socket as client mode.\n"
647 	"		--dmas register dma channel for specific vhost device.\n"
648 	"		--total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
649 	       prgname);
650 }
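/*
 * Example invocation (illustrative only; the binary is typically built as
 * dpdk-vhost, and the core list, port mask and socket path are placeholders):
 *     ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 1
 */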
651 
652 enum {
653 #define OPT_VM2VM               "vm2vm"
654 	OPT_VM2VM_NUM = 256,
655 #define OPT_RX_RETRY            "rx-retry"
656 	OPT_RX_RETRY_NUM,
657 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
658 	OPT_RX_RETRY_DELAY_NUM,
659 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
660 	OPT_RX_RETRY_NUMB_NUM,
661 #define OPT_MERGEABLE           "mergeable"
662 	OPT_MERGEABLE_NUM,
663 #define OPT_STATS               "stats"
664 	OPT_STATS_NUM,
665 #define OPT_SOCKET_FILE         "socket-file"
666 	OPT_SOCKET_FILE_NUM,
667 #define OPT_TX_CSUM             "tx-csum"
668 	OPT_TX_CSUM_NUM,
669 #define OPT_TSO                 "tso"
670 	OPT_TSO_NUM,
671 #define OPT_CLIENT              "client"
672 	OPT_CLIENT_NUM,
673 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
674 	OPT_BUILTIN_NET_DRIVER_NUM,
675 #define OPT_DMAS                "dmas"
676 	OPT_DMAS_NUM,
677 #define OPT_NUM_MBUFS           "total-num-mbufs"
678 	OPT_NUM_MBUFS_NUM,
679 };
680 
681 /*
682  * Parse the arguments given in the command line of the application.
683  */
684 static int
685 us_vhost_parse_args(int argc, char **argv)
686 {
687 	int opt, ret;
688 	int option_index;
689 	unsigned i;
690 	const char *prgname = argv[0];
691 	static struct option long_option[] = {
692 		{OPT_VM2VM, required_argument,
693 				NULL, OPT_VM2VM_NUM},
694 		{OPT_RX_RETRY, required_argument,
695 				NULL, OPT_RX_RETRY_NUM},
696 		{OPT_RX_RETRY_DELAY, required_argument,
697 				NULL, OPT_RX_RETRY_DELAY_NUM},
698 		{OPT_RX_RETRY_NUMB, required_argument,
699 				NULL, OPT_RX_RETRY_NUMB_NUM},
700 		{OPT_MERGEABLE, required_argument,
701 				NULL, OPT_MERGEABLE_NUM},
702 		{OPT_STATS, required_argument,
703 				NULL, OPT_STATS_NUM},
704 		{OPT_SOCKET_FILE, required_argument,
705 				NULL, OPT_SOCKET_FILE_NUM},
706 		{OPT_TX_CSUM, required_argument,
707 				NULL, OPT_TX_CSUM_NUM},
708 		{OPT_TSO, required_argument,
709 				NULL, OPT_TSO_NUM},
710 		{OPT_CLIENT, no_argument,
711 				NULL, OPT_CLIENT_NUM},
712 		{OPT_BUILTIN_NET_DRIVER, no_argument,
713 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
714 		{OPT_DMAS, required_argument,
715 				NULL, OPT_DMAS_NUM},
716 		{OPT_NUM_MBUFS, required_argument,
717 				NULL, OPT_NUM_MBUFS_NUM},
718 		{NULL, 0, 0, 0},
719 	};
720 
721 	/* Parse command line */
722 	while ((opt = getopt_long(argc, argv, "p:P",
723 			long_option, &option_index)) != EOF) {
724 		switch (opt) {
725 		/* Portmask */
726 		case 'p':
727 			enabled_port_mask = parse_portmask(optarg);
728 			if (enabled_port_mask == 0) {
729 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
730 				us_vhost_usage(prgname);
731 				return -1;
732 			}
733 			break;
734 
735 		case 'P':
736 			promiscuous = 1;
737 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
738 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
739 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
740 			break;
741 
742 		case OPT_VM2VM_NUM:
743 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
744 			if (ret == -1) {
745 				RTE_LOG(INFO, VHOST_CONFIG,
746 					"Invalid argument for "
747 					"vm2vm [0|1|2]\n");
748 				us_vhost_usage(prgname);
749 				return -1;
750 			}
751 			vm2vm_mode = (vm2vm_type)ret;
752 			break;
753 
754 		case OPT_RX_RETRY_NUM:
755 			ret = parse_num_opt(optarg, 1);
756 			if (ret == -1) {
757 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
758 				us_vhost_usage(prgname);
759 				return -1;
760 			}
761 			enable_retry = ret;
762 			break;
763 
764 		case OPT_TX_CSUM_NUM:
765 			ret = parse_num_opt(optarg, 1);
766 			if (ret == -1) {
767 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
768 				us_vhost_usage(prgname);
769 				return -1;
770 			}
771 			enable_tx_csum = ret;
772 			break;
773 
774 		case OPT_TSO_NUM:
775 			ret = parse_num_opt(optarg, 1);
776 			if (ret == -1) {
777 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
778 				us_vhost_usage(prgname);
779 				return -1;
780 			}
781 			enable_tso = ret;
782 			break;
783 
784 		case OPT_RX_RETRY_DELAY_NUM:
785 			ret = parse_num_opt(optarg, INT32_MAX);
786 			if (ret == -1) {
787 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
788 				us_vhost_usage(prgname);
789 				return -1;
790 			}
791 			burst_rx_delay_time = ret;
792 			break;
793 
794 		case OPT_RX_RETRY_NUMB_NUM:
795 			ret = parse_num_opt(optarg, INT32_MAX);
796 			if (ret == -1) {
797 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
798 				us_vhost_usage(prgname);
799 				return -1;
800 			}
801 			burst_rx_retry_num = ret;
802 			break;
803 
804 		case OPT_MERGEABLE_NUM:
805 			ret = parse_num_opt(optarg, 1);
806 			if (ret == -1) {
807 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
808 				us_vhost_usage(prgname);
809 				return -1;
810 			}
811 			mergeable = !!ret;
812 			break;
813 
814 		case OPT_STATS_NUM:
815 			ret = parse_num_opt(optarg, INT32_MAX);
816 			if (ret == -1) {
817 				RTE_LOG(INFO, VHOST_CONFIG,
818 					"Invalid argument for stats [0..N]\n");
819 				us_vhost_usage(prgname);
820 				return -1;
821 			}
822 			enable_stats = ret;
823 			break;
824 
825 		/* Set socket file path. */
826 		case OPT_SOCKET_FILE_NUM:
827 			if (us_vhost_parse_socket_path(optarg) == -1) {
828 				RTE_LOG(INFO, VHOST_CONFIG,
829 				"Invalid argument for socket name (Max %d characters)\n",
830 				PATH_MAX);
831 				us_vhost_usage(prgname);
832 				return -1;
833 			}
834 			break;
835 
836 		case OPT_DMAS_NUM:
837 			if (open_dma(optarg) == -1) {
838 				RTE_LOG(INFO, VHOST_CONFIG,
839 					"Wrong DMA args\n");
840 				us_vhost_usage(prgname);
841 				return -1;
842 			}
843 			break;
844 
845 		case OPT_NUM_MBUFS_NUM:
846 			ret = parse_num_opt(optarg, INT32_MAX);
847 			if (ret == -1) {
848 				RTE_LOG(INFO, VHOST_CONFIG,
849 					"Invalid argument for total-num-mbufs [0..N]\n");
850 				us_vhost_usage(prgname);
851 				return -1;
852 			}
853 
854 			if (total_num_mbufs < ret)
855 				total_num_mbufs = ret;
856 			break;
857 
858 		case OPT_CLIENT_NUM:
859 			client_mode = 1;
860 			break;
861 
862 		case OPT_BUILTIN_NET_DRIVER_NUM:
863 			builtin_net_driver = 1;
864 			break;
865 
866 		/* Invalid option - print options. */
867 		default:
868 			us_vhost_usage(prgname);
869 			return -1;
870 		}
871 	}
872 
873 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
874 		if (enabled_port_mask & (1 << i))
875 			ports[num_ports++] = i;
876 	}
877 
878 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
879 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
880 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
881 		return -1;
882 	}
883 
884 	return 0;
885 }
886 
887 /*
888  * Update the global num_ports variable and the ports array according to the
889  * number of ports in the system, and return the number of valid ports.
890  */
891 static unsigned check_ports_num(unsigned nb_ports)
892 {
893 	unsigned valid_num_ports = num_ports;
894 	unsigned portid;
895 
896 	if (num_ports > nb_ports) {
897 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
898 			num_ports, nb_ports);
899 		num_ports = nb_ports;
900 	}
901 
902 	for (portid = 0; portid < num_ports; portid ++) {
903 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
904 			RTE_LOG(INFO, VHOST_PORT,
905 				"\nSpecified port ID(%u) is not valid\n",
906 				ports[portid]);
907 			ports[portid] = INVALID_PORT_ID;
908 			valid_num_ports--;
909 		}
910 	}
911 	return valid_num_ports;
912 }
913 
914 static __rte_always_inline struct vhost_dev *
915 find_vhost_dev(struct rte_ether_addr *mac)
916 {
917 	struct vhost_dev *vdev;
918 
919 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
920 		if (vdev->ready == DEVICE_RX &&
921 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
922 			return vdev;
923 	}
924 
925 	return NULL;
926 }
927 
928 /*
929  * This function learns the MAC address of the device and registers it, along
930  * with a VLAN tag, with a VMDQ pool.
931  */
932 static int
933 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
934 {
935 	struct rte_ether_hdr *pkt_hdr;
936 	int i, ret;
937 
938 	/* Learn MAC address of guest device from packet */
939 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
940 
941 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
942 		RTE_LOG(ERR, VHOST_DATA,
943 			"(%d) device is using a registered MAC!\n",
944 			vdev->vid);
945 		return -1;
946 	}
947 
948 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
949 		vdev->mac_address.addr_bytes[i] =
950 			pkt_hdr->src_addr.addr_bytes[i];
951 
952 	/* vlan_tag currently uses the device_id. */
953 	vdev->vlan_tag = vlan_tags[vdev->vid];
954 
955 	/* Print out VMDQ registration info. */
956 	RTE_LOG(INFO, VHOST_DATA,
957 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
958 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
959 		vdev->vlan_tag);
960 
961 	/* Register the MAC address. */
962 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
963 				(uint32_t)vdev->vid + vmdq_pool_base);
964 	if (ret)
965 		RTE_LOG(ERR, VHOST_DATA,
966 			"(%d) failed to add device MAC address to VMDQ\n",
967 			vdev->vid);
968 
969 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
970 
971 	/* Set device as ready for RX. */
972 	vdev->ready = DEVICE_RX;
973 
974 	return 0;
975 }
976 
977 /*
978  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is
979  * adding buffers to the Rx queue before disabling Rx on the device.
980  */
981 static inline void
982 unlink_vmdq(struct vhost_dev *vdev)
983 {
984 	unsigned i = 0;
985 	unsigned rx_count;
986 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
987 
988 	if (vdev->ready == DEVICE_RX) {
989 		/* Clear MAC and VLAN settings. */
990 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
991 		for (i = 0; i < 6; i++)
992 			vdev->mac_address.addr_bytes[i] = 0;
993 
994 		vdev->vlan_tag = 0;
995 
996 		/* Clear out the receive buffers. */
997 		rx_count = rte_eth_rx_burst(ports[0],
998 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
999 
1000 		while (rx_count) {
1001 			for (i = 0; i < rx_count; i++)
1002 				rte_pktmbuf_free(pkts_burst[i]);
1003 
1004 			rx_count = rte_eth_rx_burst(ports[0],
1005 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1006 		}
1007 
1008 		vdev->ready = DEVICE_MAC_LEARNING;
1009 	}
1010 }
1011 
1012 static inline void
1013 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1014 {
1015 	while (n--)
1016 		rte_pktmbuf_free(pkts[n]);
1017 }
1018 
1019 static __rte_always_inline void
1020 complete_async_pkts(struct vhost_dev *vdev)
1021 {
1022 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1023 	uint16_t complete_count;
1024 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1025 
1026 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1027 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1028 	if (complete_count)
1029 		free_pkts(p_cpl, complete_count);
1030 
1031 }
1032 
1033 static __rte_always_inline void
1034 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1035 	    struct rte_mbuf *m)
1036 {
1037 	uint16_t ret;
1038 
1039 	if (builtin_net_driver) {
1040 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1041 	} else {
1042 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1043 	}
1044 
1045 	if (enable_stats) {
1046 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1047 				__ATOMIC_SEQ_CST);
1048 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1049 				__ATOMIC_SEQ_CST);
1050 		src_vdev->stats.tx_total++;
1051 		src_vdev->stats.tx += ret;
1052 	}
1053 }
1054 
1055 static __rte_always_inline void
1056 drain_vhost(struct vhost_dev *vdev)
1057 {
1058 	uint16_t ret;
1059 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1060 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1061 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1062 
1063 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1064 
1065 	if (enable_stats) {
1066 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1067 				__ATOMIC_SEQ_CST);
1068 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1069 				__ATOMIC_SEQ_CST);
1070 	}
1071 
1072 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1073 		free_pkts(m, nr_xmit);
1074 }
1075 
1076 static __rte_always_inline void
1077 drain_vhost_table(void)
1078 {
1079 	uint16_t lcore_id = rte_lcore_id();
1080 	struct vhost_bufftable *vhost_txq;
1081 	struct vhost_dev *vdev;
1082 	uint64_t cur_tsc;
1083 
1084 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1085 		if (unlikely(vdev->remove == 1))
1086 			continue;
1087 
1088 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1089 
1090 		cur_tsc = rte_rdtsc();
1091 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1092 				> MBUF_TABLE_DRAIN_TSC)) {
1093 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1094 				"Vhost TX queue drained after timeout with burst size %u\n",
1095 				vhost_txq->len);
1096 			drain_vhost(vdev);
1097 			vhost_txq->len = 0;
1098 			vhost_txq->pre_tsc = cur_tsc;
1099 		}
1100 	}
1101 }
1102 
1103 /*
1104  * Check if the packet destination MAC address is for a local device. If so then put
1105  * the packet on that device's Rx queue. If not then return.
1106  */
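/*
 * Note: packets are not delivered immediately; they are buffered in the
 * per-lcore vhost_txbuff table and flushed either when the buffer reaches
 * MAX_PKT_BURST or when drain_vhost_table() notices the drain timeout.
 */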
1107 static __rte_always_inline int
1108 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1109 {
1110 	struct rte_ether_hdr *pkt_hdr;
1111 	struct vhost_dev *dst_vdev;
1112 	struct vhost_bufftable *vhost_txq;
1113 	uint16_t lcore_id = rte_lcore_id();
1114 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1115 
1116 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1117 	if (!dst_vdev)
1118 		return -1;
1119 
1120 	if (vdev->vid == dst_vdev->vid) {
1121 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1122 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1123 			vdev->vid);
1124 		return 0;
1125 	}
1126 
1127 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1128 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1129 
1130 	if (unlikely(dst_vdev->remove)) {
1131 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1132 			"(%d) device is marked for removal\n", dst_vdev->vid);
1133 		return 0;
1134 	}
1135 
1136 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1137 	vhost_txq->m_table[vhost_txq->len++] = m;
1138 
1139 	if (enable_stats) {
1140 		vdev->stats.tx_total++;
1141 		vdev->stats.tx++;
1142 	}
1143 
1144 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1145 		drain_vhost(dst_vdev);
1146 		vhost_txq->len = 0;
1147 		vhost_txq->pre_tsc = rte_rdtsc();
1148 	}
1149 	return 0;
1150 }
1151 
1152 /*
1153  * Check if the destination MAC of a packet belongs to a local VM; if it does,
1154  * get its VLAN tag and the length offset.
1155  */
1156 static __rte_always_inline int
1157 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1158 	uint32_t *offset, uint16_t *vlan_tag)
1159 {
1160 	struct vhost_dev *dst_vdev;
1161 	struct rte_ether_hdr *pkt_hdr =
1162 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1163 
1164 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1165 	if (!dst_vdev)
1166 		return 0;
1167 
1168 	if (vdev->vid == dst_vdev->vid) {
1169 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1170 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1171 			vdev->vid);
1172 		return -1;
1173 	}
1174 
1175 	/*
1176 	 * HW VLAN strip reduces the packet length by the size of the VLAN
1177 	 * tag (RTE_VLAN_HLEN), so the packet length needs to be restored by
1178 	 * adding it back.
1179 	 */
1180 	*offset  = RTE_VLAN_HLEN;
1181 	*vlan_tag = vlan_tags[vdev->vid];
1182 
1183 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1184 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1185 		vdev->vid, dst_vdev->vid, *vlan_tag);
1186 
1187 	return 0;
1188 }
1189 
1190 static void virtio_tx_offload(struct rte_mbuf *m)
1191 {
1192 	struct rte_net_hdr_lens hdr_lens;
1193 	struct rte_ipv4_hdr *ipv4_hdr;
1194 	struct rte_tcp_hdr *tcp_hdr;
1195 	uint32_t ptype;
1196 	void *l3_hdr;
1197 
1198 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1199 	m->l2_len = hdr_lens.l2_len;
1200 	m->l3_len = hdr_lens.l3_len;
1201 	m->l4_len = hdr_lens.l4_len;
1202 
1203 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1204 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1205 		m->l2_len + m->l3_len);
1206 
1207 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1208 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1209 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1210 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1211 		ipv4_hdr = l3_hdr;
1212 		ipv4_hdr->hdr_checksum = 0;
1213 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1214 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1215 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1216 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1217 	}
1218 }
1219 
1220 static __rte_always_inline void
1221 do_drain_mbuf_table(struct mbuf_table *tx_q)
1222 {
1223 	uint16_t count;
1224 
1225 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1226 				 tx_q->m_table, tx_q->len);
1227 	if (unlikely(count < tx_q->len))
1228 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1229 
1230 	tx_q->len = 0;
1231 }
1232 
1233 /*
1234  * This function routes the TX packet to the correct interface. This
1235  * may be a local device or the physical port.
1236  */
1237 static __rte_always_inline void
1238 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1239 {
1240 	struct mbuf_table *tx_q;
1241 	unsigned offset = 0;
1242 	const uint16_t lcore_id = rte_lcore_id();
1243 	struct rte_ether_hdr *nh;
1244 
1245 
1246 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1247 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1248 		struct vhost_dev *vdev2;
1249 
1250 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1251 			if (vdev2 != vdev)
1252 				sync_virtio_xmit(vdev2, vdev, m);
1253 		}
1254 		goto queue2nic;
1255 	}
1256 
1257 	/* Check if the destination is a local VM. */
1258 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1259 		return;
1260 
1261 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1262 		if (unlikely(find_local_dest(vdev, m, &offset,
1263 					     &vlan_tag) != 0)) {
1264 			rte_pktmbuf_free(m);
1265 			return;
1266 		}
1267 	}
1268 
1269 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1270 		"(%d) TX: MAC address is external\n", vdev->vid);
1271 
1272 queue2nic:
1273 
1274 	/* Add the packet to the port's Tx queue. */
1275 	tx_q = &lcore_tx_queue[lcore_id];
1276 
1277 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1278 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1279 		/* Guest has inserted the vlan tag. */
1280 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1281 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1282 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1283 			(vh->vlan_tci != vlan_tag_be))
1284 			vh->vlan_tci = vlan_tag_be;
1285 	} else {
1286 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1287 
1288 		/*
1289 		 * Find the right seg to adjust the data len when offset is
1290 		 * bigger than tail room size.
1291 		 */
1292 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1293 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1294 				m->data_len += offset;
1295 			else {
1296 				struct rte_mbuf *seg = m;
1297 
1298 				while ((seg->next != NULL) &&
1299 					(offset > rte_pktmbuf_tailroom(seg)))
1300 					seg = seg->next;
1301 
1302 				seg->data_len += offset;
1303 			}
1304 			m->pkt_len += offset;
1305 		}
1306 
1307 		m->vlan_tci = vlan_tag;
1308 	}
1309 
1310 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1311 		virtio_tx_offload(m);
1312 
1313 	tx_q->m_table[tx_q->len++] = m;
1314 	if (enable_stats) {
1315 		vdev->stats.tx_total++;
1316 		vdev->stats.tx++;
1317 	}
1318 
1319 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1320 		do_drain_mbuf_table(tx_q);
1321 }
1322 
1323 
1324 static __rte_always_inline void
1325 drain_mbuf_table(struct mbuf_table *tx_q)
1326 {
1327 	static uint64_t prev_tsc;
1328 	uint64_t cur_tsc;
1329 
1330 	if (tx_q->len == 0)
1331 		return;
1332 
1333 	cur_tsc = rte_rdtsc();
1334 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1335 		prev_tsc = cur_tsc;
1336 
1337 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1338 			"TX queue drained after timeout with burst size %u\n",
1339 			tx_q->len);
1340 		do_drain_mbuf_table(tx_q);
1341 	}
1342 }
1343 
1344 uint16_t
1345 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1346 		struct rte_mbuf **pkts, uint32_t rx_count)
1347 {
1348 	uint16_t enqueue_count;
1349 	uint16_t enqueue_fail = 0;
1350 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1351 
1352 	complete_async_pkts(dev);
1353 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1354 					pkts, rx_count, dma_id, 0);
1355 
1356 	enqueue_fail = rx_count - enqueue_count;
1357 	if (enqueue_fail)
1358 		free_pkts(&pkts[enqueue_count], enqueue_fail);
1359 
1360 	return enqueue_count;
1361 }
1362 
1363 uint16_t
1364 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1365 		struct rte_mbuf **pkts, uint32_t rx_count)
1366 {
1367 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1368 }
1369 
1370 static __rte_always_inline void
1371 drain_eth_rx(struct vhost_dev *vdev)
1372 {
1373 	uint16_t rx_count, enqueue_count;
1374 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1375 
1376 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1377 				    pkts, MAX_PKT_BURST);
1378 
1379 	if (!rx_count)
1380 		return;
1381 
1382 	/*
1383 	 * When "enable_retry" is set, wait and retry when there are not enough
1384 	 * free slots in the queue to hold @rx_count packets, in order to
1385 	 * reduce packet loss.
1386 	 */
1387 	if (enable_retry &&
1388 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1389 			VIRTIO_RXQ))) {
1390 		uint32_t retry;
1391 
1392 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1393 			rte_delay_us(burst_rx_delay_time);
1394 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1395 					VIRTIO_RXQ))
1396 				break;
1397 		}
1398 	}
1399 
1400 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1401 					VIRTIO_RXQ, pkts, rx_count);
1402 
1403 	if (enable_stats) {
1404 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1405 				__ATOMIC_SEQ_CST);
1406 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1407 				__ATOMIC_SEQ_CST);
1408 	}
1409 
1410 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1411 		free_pkts(pkts, rx_count);
1412 }
1413 
1414 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1415 			    struct rte_mempool *mbuf_pool,
1416 			    struct rte_mbuf **pkts, uint16_t count)
1417 {
1418 	int nr_inflight;
1419 	uint16_t dequeue_count;
1420 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1421 
1422 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1423 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1424 
1425 	return dequeue_count;
1426 }
1427 
1428 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1429 			   struct rte_mempool *mbuf_pool,
1430 			   struct rte_mbuf **pkts, uint16_t count)
1431 {
1432 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1433 }
1434 
1435 static __rte_always_inline void
1436 drain_virtio_tx(struct vhost_dev *vdev)
1437 {
1438 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1439 	uint16_t count;
1440 	uint16_t i;
1441 
1442 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1443 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1444 
1445 	/* setup VMDq for the first packet */
1446 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1447 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1448 			free_pkts(pkts, count);
1449 	}
1450 
1451 	for (i = 0; i < count; ++i)
1452 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1453 }
1454 
1455 /*
1456  * Main function of vhost-switch. It basically does:
1457  *
1458  * for each vhost device {
1459  *    - drain_eth_rx()
1460  *
1461  *      Which drains the host eth Rx queue linked to the vhost device,
1462  *      and delivers all packets to the guest virtio Rx ring associated with
1463  *      this vhost device.
1464  *
1465  *    - drain_virtio_tx()
1466  *
1467  *      Which drains the guest virtio Tx queue and delivers all packets
1468  *      to the target, which could be another vhost device, or the
1469  *      physical eth dev. The route is done in function "virtio_tx_route".
1470  * }
1471  */
1472 static int
1473 switch_worker(void *arg __rte_unused)
1474 {
1475 	unsigned i;
1476 	unsigned lcore_id = rte_lcore_id();
1477 	struct vhost_dev *vdev;
1478 	struct mbuf_table *tx_q;
1479 
1480 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1481 
1482 	tx_q = &lcore_tx_queue[lcore_id];
1483 	for (i = 0; i < rte_lcore_count(); i++) {
1484 		if (lcore_ids[i] == lcore_id) {
1485 			tx_q->txq_id = i;
1486 			break;
1487 		}
1488 	}
1489 
1490 	while(1) {
1491 		drain_mbuf_table(tx_q);
1492 		drain_vhost_table();
1493 		/*
1494 		 * Inform the configuration core that we have exited the
1495 		 * linked list and that no devices are in use if requested.
1496 		 */
1497 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1498 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1499 
1500 		/*
1501 		 * Process vhost devices
1502 		 */
1503 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1504 			      lcore_vdev_entry) {
1505 			if (unlikely(vdev->remove)) {
1506 				unlink_vmdq(vdev);
1507 				vdev->ready = DEVICE_SAFE_REMOVE;
1508 				continue;
1509 			}
1510 
1511 			if (likely(vdev->ready == DEVICE_RX))
1512 				drain_eth_rx(vdev);
1513 
1514 			if (likely(!vdev->remove))
1515 				drain_virtio_tx(vdev);
1516 		}
1517 	}
1518 
1519 	return 0;
1520 }
1521 
1522 static void
1523 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1524 {
1525 	uint16_t n_pkt = 0;
1526 	int pkts_inflight;
1527 
1528 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1529 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1530 
1531 	struct rte_mbuf *m_cpl[pkts_inflight];
1532 
1533 	while (pkts_inflight) {
1534 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1535 							pkts_inflight, dma_id, 0);
1536 		free_pkts(m_cpl, n_pkt);
1537 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1538 									queue_id);
1539 	}
1540 }
1541 
1542 /*
1543  * Remove a device from the specific data core linked list and from the
1544  * main linked list. Synchronization occurs through the use of the
1545  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1546  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1547  */
1548 static void
1549 destroy_device(int vid)
1550 {
1551 	struct vhost_dev *vdev = NULL;
1552 	int lcore;
1553 	uint16_t i;
1554 
1555 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1556 		if (vdev->vid == vid)
1557 			break;
1558 	}
1559 	if (!vdev)
1560 		return;
1561 	/* Set the remove flag. */
1562 	vdev->remove = 1;
1563 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1564 		rte_pause();
1565 	}
1566 
1567 	for (i = 0; i < RTE_MAX_LCORE; i++)
1568 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1569 
1570 	if (builtin_net_driver)
1571 		vs_vhost_net_remove(vdev);
1572 
1573 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1574 		     lcore_vdev_entry);
1575 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1576 
1577 
1578 	/* Set the dev_removal_flag on each lcore. */
1579 	RTE_LCORE_FOREACH_WORKER(lcore)
1580 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1581 
1582 	/*
1583 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1584 	 * we can be sure that they can no longer access the device removed
1585 	 * from the linked lists and that the devices are no longer in use.
1586 	 */
1587 	RTE_LCORE_FOREACH_WORKER(lcore) {
1588 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1589 			rte_pause();
1590 	}
1591 
1592 	lcore_info[vdev->coreid].device_num--;
1593 
1594 	RTE_LOG(INFO, VHOST_DATA,
1595 		"(%d) device has been removed from data core\n",
1596 		vdev->vid);
1597 
1598 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1599 		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
1600 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1601 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1602 	}
1603 
1604 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1605 		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
1606 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1607 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1608 	}
1609 
1610 	rte_free(vdev);
1611 }
1612 
1613 static inline int
1614 get_socketid_by_vid(int vid)
1615 {
1616 	int i;
1617 	char ifname[PATH_MAX];
1618 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1619 
1620 	for (i = 0; i < nb_sockets; i++) {
1621 		char *file = socket_files + i * PATH_MAX;
1622 		if (strcmp(file, ifname) == 0)
1623 			return i;
1624 	}
1625 
1626 	return -1;
1627 }
1628 
1629 static int
1630 init_vhost_queue_ops(int vid)
1631 {
1632 	if (builtin_net_driver) {
1633 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1634 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1635 	} else {
1636 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1637 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1638 		else
1639 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1640 
1641 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1642 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1643 		else
1644 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1645 	}
1646 
1647 	return 0;
1648 }
1649 
1650 static inline int
1651 vhost_async_channel_register(int vid)
1652 {
1653 	int rx_ret = 0, tx_ret = 0;
1654 
1655 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1656 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1657 		if (rx_ret == 0)
1658 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1659 	}
1660 
1661 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1662 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1663 		if (tx_ret == 0)
1664 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1665 	}
1666 
1667 	return rx_ret | tx_ret;
1668 }
1669 
1670 
1671 
1672 /*
1673  * A new device is added to a data core. First the device is added to the main linked list
1674  * and then allocated to a specific data core.
1675  */
1676 static int
1677 new_device(int vid)
1678 {
1679 	int lcore, core_add = 0;
1680 	uint16_t i;
1681 	uint32_t device_num_min = num_devices;
1682 	struct vhost_dev *vdev;
1683 	int ret;
1684 
1685 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1686 	if (vdev == NULL) {
1687 		RTE_LOG(INFO, VHOST_DATA,
1688 			"(%d) couldn't allocate memory for vhost dev\n",
1689 			vid);
1690 		return -1;
1691 	}
1692 	vdev->vid = vid;
1693 
1694 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1695 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1696 			= rte_zmalloc("vhost bufftable",
1697 				sizeof(struct vhost_bufftable),
1698 				RTE_CACHE_LINE_SIZE);
1699 
1700 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1701 			RTE_LOG(INFO, VHOST_DATA,
1702 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1703 			return -1;
1704 		}
1705 	}
1706 
1707 	int socketid = get_socketid_by_vid(vid);
1708 	if (socketid == -1)
1709 		return -1;
1710 
1711 	init_vid2socketid_array(vid, socketid);
1712 
1713 	ret =  vhost_async_channel_register(vid);
1714 
1715 	if (init_vhost_queue_ops(vid) != 0)
1716 		return -1;
1717 
1718 	if (builtin_net_driver)
1719 		vs_vhost_net_setup(vdev);
1720 
1721 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1722 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1723 
1724 	/* Reset the ready flag. */
1725 	vdev->ready = DEVICE_MAC_LEARNING;
1726 	vdev->remove = 0;
1727 
1728 	/* Find a suitable lcore to add the device. */
1729 	RTE_LCORE_FOREACH_WORKER(lcore) {
1730 		if (lcore_info[lcore].device_num < device_num_min) {
1731 			device_num_min = lcore_info[lcore].device_num;
1732 			core_add = lcore;
1733 		}
1734 	}
1735 	vdev->coreid = core_add;
1736 
1737 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1738 			  lcore_vdev_entry);
1739 	lcore_info[vdev->coreid].device_num++;
1740 
1741 	/* Disable notifications. */
1742 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1743 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1744 
1745 	RTE_LOG(INFO, VHOST_DATA,
1746 		"(%d) device has been added to data core %d\n",
1747 		vid, vdev->coreid);
1748 
1749 	return ret;
1750 }
1751 
1752 static int
1753 vring_state_changed(int vid, uint16_t queue_id, int enable)
1754 {
1755 	struct vhost_dev *vdev = NULL;
1756 
1757 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1758 		if (vdev->vid == vid)
1759 			break;
1760 	}
1761 	if (!vdev)
1762 		return -1;
1763 
1764 	if (queue_id != VIRTIO_RXQ)
1765 		return 0;
1766 
1767 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1768 		if (!enable)
1769 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1770 	}
1771 
1772 	return 0;
1773 }
1774 
1775 /*
1776  * These callbacks allow devices to be added to the data cores when
1777  * configuration has fully completed.
1778  */
1779 static const struct rte_vhost_device_ops virtio_net_device_ops =
1780 {
1781 	.new_device =  new_device,
1782 	.destroy_device = destroy_device,
1783 	.vring_state_changed = vring_state_changed,
1784 };
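/*
 * These callbacks are invoked by the vhost library once the application has
 * registered them with rte_vhost_driver_callback_register() for each socket
 * created in main(): new_device()/destroy_device() when a frontend connects
 * or disconnects, and vring_state_changed() when a queue is enabled/disabled.
 */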
1785 
1786 /*
1787  * This thread wakes up periodically to print stats if the user has
1788  * enabled them.
1789  */
1790 static void *
1791 print_stats(__rte_unused void *arg)
1792 {
1793 	struct vhost_dev *vdev;
1794 	uint64_t tx_dropped, rx_dropped;
1795 	uint64_t tx, tx_total, rx, rx_total;
1796 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1797 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1798 
1799 	while(1) {
1800 		sleep(enable_stats);
1801 
1802 		/* Clear screen and move to top left */
1803 		printf("%s%s\n", clr, top_left);
1804 		printf("Device statistics =================================\n");
1805 
1806 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1807 			tx_total   = vdev->stats.tx_total;
1808 			tx         = vdev->stats.tx;
1809 			tx_dropped = tx_total - tx;
1810 
1811 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1812 				__ATOMIC_SEQ_CST);
1813 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1814 				__ATOMIC_SEQ_CST);
1815 			rx_dropped = rx_total - rx;
1816 
1817 			printf("Statistics for device %d\n"
1818 				"-----------------------\n"
1819 				"TX total:              %" PRIu64 "\n"
1820 				"TX dropped:            %" PRIu64 "\n"
1821 				"TX successful:         %" PRIu64 "\n"
1822 				"RX total:              %" PRIu64 "\n"
1823 				"RX dropped:            %" PRIu64 "\n"
1824 				"RX successful:         %" PRIu64 "\n",
1825 				vdev->vid,
1826 				tx_total, tx_dropped, tx,
1827 				rx_total, rx_dropped, rx);
1828 		}
1829 
1830 		printf("===================================================\n");
1831 
1832 		fflush(stdout);
1833 	}
1834 
1835 	return NULL;
1836 }
1837 
1838 static void
1839 unregister_drivers(int socket_num)
1840 {
1841 	int i, ret;
1842 
1843 	for (i = 0; i < socket_num; i++) {
1844 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1845 		if (ret != 0)
1846 			RTE_LOG(ERR, VHOST_CONFIG,
1847 				"Failed to unregister vhost driver for %s.\n",
1848 				socket_files + i * PATH_MAX);
1849 	}
1850 }
1851 
1852 /* When we receive a SIGINT signal, unregister the vhost driver. */
1853 static void
1854 sigint_handler(__rte_unused int signum)
1855 {
1856 	/* Unregister vhost driver. */
1857 	unregister_drivers(nb_sockets);
1858 
1859 	exit(0);
1860 }
1861 
1862 static void
1863 reset_dma(void)
1864 {
1865 	int i;
1866 
1867 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1868 		int j;
1869 
1870 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1871 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1872 			dma_bind[i].dmas[j].async_enabled = false;
1873 		}
1874 	}
1875 
1876 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1877 		dmas_id[i] = INVALID_DMA_ID;
1878 }
1879 
1880 /*
1881  * Main function, does initialisation and calls the per-lcore functions.
1882  */
1883 int
1884 main(int argc, char *argv[])
1885 {
1886 	unsigned lcore_id, core_id = 0;
1887 	unsigned nb_ports, valid_num_ports;
1888 	int ret, i;
1889 	uint16_t portid;
1890 	static pthread_t tid;
1891 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1892 
1893 	signal(SIGINT, sigint_handler);
1894 
1895 	/* init EAL */
1896 	ret = rte_eal_init(argc, argv);
1897 	if (ret < 0)
1898 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1899 	argc -= ret;
1900 	argv += ret;
1901 
1902 	/* initialize dma structures */
1903 	reset_dma();
1904 
1905 	/* parse app arguments */
1906 	ret = us_vhost_parse_args(argc, argv);
1907 	if (ret < 0)
1908 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1909 
1910 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1911 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1912 
1913 		if (rte_lcore_is_enabled(lcore_id))
1914 			lcore_ids[core_id++] = lcore_id;
1915 	}
1916 
1917 	if (rte_lcore_count() > RTE_MAX_LCORE)
1918 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1919 
1920 	/* Get the number of physical ports. */
1921 	nb_ports = rte_eth_dev_count_avail();
1922 
1923 	/*
1924 	 * Update the global num_ports variable and the ports array, and get the
1925 	 * number of valid ports according to the number of ports in the system.
1926 	 */
1927 	valid_num_ports = check_ports_num(nb_ports);
1928 
1929 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1930 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1931 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1932 		return -1;
1933 	}
1934 
1935 	/*
1936 	 * FIXME: here we are trying to allocate mbufs big enough for
1937 	 * @MAX_QUEUES, but the truth is we're never going to use that
1938 	 * many queues here. We probably should only do allocation for
1939 	 * those queues we are going to use.
1940 	 */
1941 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1942 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1943 					    rte_socket_id());
1944 	if (mbuf_pool == NULL)
1945 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1946 
1947 	if (vm2vm_mode == VM2VM_HARDWARE) {
1948 		/* Enable VT loopback so the NIC's L2 switch can forward VM2VM traffic. */
1949 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1950 		RTE_LOG(DEBUG, VHOST_CONFIG,
1951 			"Enable loop back for L2 switch in vmdq.\n");
1952 	}
1953 
1954 	/* initialize all ports */
1955 	RTE_ETH_FOREACH_DEV(portid) {
1956 		/* skip ports that are not enabled */
1957 		if ((enabled_port_mask & (1 << portid)) == 0) {
1958 			RTE_LOG(INFO, VHOST_PORT,
1959 				"Skipping disabled port %d\n", portid);
1960 			continue;
1961 		}
1962 		if (port_init(portid) != 0)
1963 			rte_exit(EXIT_FAILURE,
1964 				"Cannot initialize network ports\n");
1965 	}
1966 
1967 	/* Enable stats if the user option is set. */
1968 	if (enable_stats) {
1969 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1970 					print_stats, NULL);
1971 		if (ret < 0)
1972 			rte_exit(EXIT_FAILURE,
1973 				"Cannot create print-stats thread\n");
1974 	}
1975 
1976 	/* Launch all data cores. */
1977 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1978 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1979 
1980 	if (client_mode)
1981 		flags |= RTE_VHOST_USER_CLIENT;
1982 
1983 	for (i = 0; i < dma_count; i++) {
1984 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1985 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1986 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1987 		}
1988 	}
1989 
1990 	/* Register vhost user driver to handle vhost messages. */
1991 	for (i = 0; i < nb_sockets; i++) {
1992 		char *file = socket_files + i * PATH_MAX;
1993 
1994 		if (dma_count && get_async_flag_by_socketid(i) != 0)
1995 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1996 
1997 		ret = rte_vhost_driver_register(file, flags);
1998 		if (ret != 0) {
1999 			unregister_drivers(i);
2000 			rte_exit(EXIT_FAILURE,
2001 				"vhost driver register failure.\n");
2002 		}
2003 
2004 		if (builtin_net_driver)
2005 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2006 
2007 		if (mergeable == 0) {
2008 			rte_vhost_driver_disable_features(file,
2009 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2010 		}
2011 
2012 		if (enable_tx_csum == 0) {
2013 			rte_vhost_driver_disable_features(file,
2014 				1ULL << VIRTIO_NET_F_CSUM);
2015 		}
2016 
2017 		if (enable_tso == 0) {
2018 			rte_vhost_driver_disable_features(file,
2019 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2020 			rte_vhost_driver_disable_features(file,
2021 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2022 			rte_vhost_driver_disable_features(file,
2023 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2024 			rte_vhost_driver_disable_features(file,
2025 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2026 		}
2027 
2028 		if (promiscuous) {
2029 			rte_vhost_driver_enable_features(file,
2030 				1ULL << VIRTIO_NET_F_CTRL_RX);
2031 		}
2032 
2033 		ret = rte_vhost_driver_callback_register(file,
2034 			&virtio_net_device_ops);
2035 		if (ret != 0) {
2036 			rte_exit(EXIT_FAILURE,
2037 				"failed to register vhost driver callbacks.\n");
2038 		}
2039 
2040 		if (rte_vhost_driver_start(file) < 0) {
2041 			rte_exit(EXIT_FAILURE,
2042 				"failed to start vhost driver.\n");
2043 		}
2044 	}
2045 
2046 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2047 		rte_eal_wait_lcore(lcore_id);
2048 
2049 	/* clean up the EAL */
2050 	rte_eal_cleanup();
2051 
2052 	return 0;
2053 }
2054