xref: /dpdk/examples/vhost/main.c (revision bc70e55948380ce57cbc079930f217c73ea59b39)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29 
30 #include "main.h"
31 
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35 
36 #define NUM_MBUFS_DEFAULT 0x24000
37 
38 /* the maximum number of external ports supported */
39 #define MAX_SUP_PORTS 1
40 
41 #define MBUF_CACHE_SIZE	128
42 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
43 
44 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
45 
46 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
47 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
48 
49 #define JUMBO_FRAME_MAX_SIZE    0x2600
50 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
51 
52 /* State of virtio device. */
53 #define DEVICE_MAC_LEARNING 0
54 #define DEVICE_RX			1
55 #define DEVICE_SAFE_REMOVE	2
56 
57 /* Configurable number of RX/TX ring descriptors */
58 #define RTE_TEST_RX_DESC_DEFAULT 1024
59 #define RTE_TEST_TX_DESC_DEFAULT 512
60 
61 #define INVALID_PORT_ID 0xFF
62 #define INVALID_DMA_ID -1
63 
64 #define DMA_RING_SIZE 4096
65 
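/*
 * Per-socket async flags: open_dma() ORs these into dma_bind[].async_flag
 * when a DMA channel is bound to the enqueue (txd) or dequeue (rxd) path.
 */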
66 #define ASYNC_ENQUEUE_VHOST 1
67 #define ASYNC_DEQUEUE_VHOST 2
68 
69 /* number of mbufs in all pools - if specified on command-line. */
70 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
71 
72 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
73 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
74 static int dma_count;
75 
76 /* mask of enabled ports */
77 static uint32_t enabled_port_mask = 0;
78 
79 /* Promiscuous mode */
80 static uint32_t promiscuous;
81 
82 /* Number of devices/queues to support */
83 static uint32_t num_queues = 0;
84 static uint32_t num_devices;
85 
86 static struct rte_mempool *mbuf_pool;
87 static int mergeable;
88 
89 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
90 typedef enum {
91 	VM2VM_DISABLED = 0,
92 	VM2VM_SOFTWARE = 1,
93 	VM2VM_HARDWARE = 2,
94 	VM2VM_LAST
95 } vm2vm_type;
96 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
97 
98 /* Enable stats. */
99 static uint32_t enable_stats = 0;
100 /* Enable retries on RX. */
101 static uint32_t enable_retry = 1;
102 
103 /* Disable TX checksum offload */
104 static uint32_t enable_tx_csum;
105 
106 /* Disable TSO offload */
107 static uint32_t enable_tso;
108 
109 static int client_mode;
110 
111 static int builtin_net_driver;
112 
113 /* Specify timeout (in microseconds) between retries on RX. */
114 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
115 /* Specify the number of retries on RX. */
116 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
117 
118 /* Socket file paths. Can be set by user */
119 static char *socket_files;
120 static int nb_sockets;
121 
122 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
123 
124 /* empty VMDq configuration structure. Filled in programmatically */
125 static struct rte_eth_conf vmdq_conf_default = {
126 	.rxmode = {
127 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
128 		.split_hdr_size = 0,
129 		/*
130 		 * VLAN strip is necessary for 1G NICs such as the I350;
131 		 * it fixes a bug where IPv4 forwarding in the guest cannot
132 		 * forward packets from one virtio device to another.
133 		 */
134 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
135 	},
136 
137 	.txmode = {
138 		.mq_mode = RTE_ETH_MQ_TX_NONE,
139 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
140 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
141 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
142 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
143 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
144 	},
145 	.rx_adv_conf = {
146 		/*
147 		 * should be overridden separately in code with
148 		 * appropriate values
149 		 */
150 		.vmdq_rx_conf = {
151 			.nb_queue_pools = RTE_ETH_8_POOLS,
152 			.enable_default_pool = 0,
153 			.default_pool = 0,
154 			.nb_pool_maps = 0,
155 			.pool_map = {{0, 0},},
156 		},
157 	},
158 };
159 
160 
161 static unsigned lcore_ids[RTE_MAX_LCORE];
162 static uint16_t ports[RTE_MAX_ETHPORTS];
163 static unsigned num_ports = 0; /**< The number of ports specified in command line */
164 static uint16_t num_pf_queues, num_vmdq_queues;
165 static uint16_t vmdq_pool_base, vmdq_queue_base;
166 static uint16_t queues_per_pool;
167 
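/*
 * VLAN tags used for the VMDq VLAN-to-pool mapping; each vhost device (vid)
 * is registered with vlan_tags[vid] in link_vmdq().
 */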
168 const uint16_t vlan_tags[] = {
169 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
170 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
171 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
172 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
173 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
174 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
175 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
176 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
177 };
178 
179 /* ethernet addresses of ports */
180 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
181 
182 static struct vhost_dev_tailq_list vhost_dev_list =
183 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
184 
185 static struct lcore_info lcore_info[RTE_MAX_LCORE];
186 
187 /* Used for queueing bursts of TX packets. */
188 struct mbuf_table {
189 	unsigned len;
190 	unsigned txq_id;
191 	struct rte_mbuf *m_table[MAX_PKT_BURST];
192 };
193 
194 struct vhost_bufftable {
195 	uint32_t len;
196 	uint64_t pre_tsc;
197 	struct rte_mbuf *m_table[MAX_PKT_BURST];
198 };
199 
200 /* TX queue for each data core. */
201 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
202 
203 /*
204  * Vhost TX buffer for each data core.
205  * Every data core maintains a TX buffer for every vhost device,
206  * which is used to batch packet enqueues for higher performance.
207  */
208 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
209 
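/* TX drain period for the mbuf table, in TSC cycles (~BURST_TX_DRAIN_US microseconds). */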
210 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
211 				 / US_PER_S * BURST_TX_DRAIN_US)
212 
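/*
 * Map from vhost device id (vid) to the index of the socket file it was
 * created from (filled in new_device()); used to look up per-socket DMA bindings.
 */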
213 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
214 
215 static inline uint32_t
216 get_async_flag_by_socketid(int socketid)
217 {
218 	return dma_bind[socketid].async_flag;
219 }
220 
221 static inline void
222 init_vid2socketid_array(int vid, int socketid)
223 {
224 	vid2socketid[vid] = socketid;
225 }
226 
227 static inline bool
228 is_dma_configured(int16_t dev_id)
229 {
230 	int i;
231 
232 	for (i = 0; i < dma_count; i++)
233 		if (dmas_id[i] == dev_id)
234 			return true;
235 	return false;
236 }
237 
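/*
 * Parse one --dmas argument and bind DMA channels to vhost data paths.
 * Illustrative format (device names are examples): "[txd0@0000:00:04.0,rxd0@0000:00:04.1]",
 * where txd<N>/rxd<N> select the enqueue/dequeue path of vhost socket N and the
 * string after '@' names a DMA device resolved by rte_dma_get_dev_id_by_name().
 */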
238 static inline int
239 open_dma(const char *value)
240 {
241 	struct dma_for_vhost *dma_info = dma_bind;
242 	char *input = strndup(value, strlen(value) + 1);
243 	char *addrs = input;
244 	char *ptrs[2];
245 	char *start, *end, *substr;
246 	int64_t socketid, vring_id;
247 
248 	struct rte_dma_info info;
249 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
250 	struct rte_dma_vchan_conf qconf = {
251 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
252 		.nb_desc = DMA_RING_SIZE
253 	};
254 
255 	int dev_id;
256 	int ret = 0;
257 	uint16_t i = 0;
258 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
259 	int args_nr;
260 
261 	while (isblank(*addrs))
262 		addrs++;
263 	if (*addrs == '\0') {
264 		ret = -1;
265 		goto out;
266 	}
267 
268 	/* process DMA devices within bracket. */
269 	addrs++;
270 	substr = strtok(addrs, ";]");
271 	if (!substr) {
272 		ret = -1;
273 		goto out;
274 	}
275 
276 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
277 	if (args_nr <= 0) {
278 		ret = -1;
279 		goto out;
280 	}
281 
282 	while (i < args_nr) {
283 		char *arg_temp = dma_arg[i];
284 		char *txd, *rxd;
285 		uint8_t sub_nr;
286 		int async_flag;
287 
288 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
289 		if (sub_nr != 2) {
290 			ret = -1;
291 			goto out;
292 		}
293 
294 		txd = strstr(ptrs[0], "txd");
295 		rxd = strstr(ptrs[0], "rxd");
296 		if (txd) {
297 			start = txd;
298 			vring_id = VIRTIO_RXQ;
299 			async_flag = ASYNC_ENQUEUE_VHOST;
300 		} else if (rxd) {
301 			start = rxd;
302 			vring_id = VIRTIO_TXQ;
303 			async_flag = ASYNC_DEQUEUE_VHOST;
304 		} else {
305 			ret = -1;
306 			goto out;
307 		}
308 
309 		start += 3;
310 		socketid = strtol(start, &end, 0);
311 		if (end == start) {
312 			ret = -1;
313 			goto out;
314 		}
315 
316 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
317 		if (dev_id < 0) {
318 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
319 			ret = -1;
320 			goto out;
321 		}
322 
323 		/* DMA device is already configured, so skip */
324 		if (is_dma_configured(dev_id))
325 			goto done;
326 
327 		if (rte_dma_info_get(dev_id, &info) != 0) {
328 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
329 			ret = -1;
330 			goto out;
331 		}
332 
333 		if (info.max_vchans < 1) {
334 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
335 			ret = -1;
336 			goto out;
337 		}
338 
339 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
340 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
341 			ret = -1;
342 			goto out;
343 		}
344 
345 		/* Check the max desc supported by DMA device */
346 		rte_dma_info_get(dev_id, &info);
347 		if (info.nb_vchans != 1) {
348 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
349 					dev_id);
350 			ret = -1;
351 			goto out;
352 		}
353 
354 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
355 
356 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
357 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
358 			ret = -1;
359 			goto out;
360 		}
361 
362 		if (rte_dma_start(dev_id) != 0) {
363 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
364 			ret = -1;
365 			goto out;
366 		}
367 
368 		dmas_id[dma_count++] = dev_id;
369 
370 done:
371 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
372 		(dma_info + socketid)->async_flag |= async_flag;
373 		i++;
374 	}
375 out:
376 	free(input);
377 	return ret;
378 }
379 
380 /*
381  * Builds up the correct configuration for VMDQ VLAN pool map
382  * according to the pool & queue limits.
383  */
384 static inline int
385 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
386 {
387 	struct rte_eth_vmdq_rx_conf conf;
388 	struct rte_eth_vmdq_rx_conf *def_conf =
389 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
390 	unsigned i;
391 
392 	memset(&conf, 0, sizeof(conf));
393 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
394 	conf.nb_pool_maps = num_devices;
395 	conf.enable_loop_back = def_conf->enable_loop_back;
396 	conf.rx_mode = def_conf->rx_mode;
397 
398 	for (i = 0; i < conf.nb_pool_maps; i++) {
399 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
400 		conf.pool_map[i].pools = (1UL << i);
401 	}
402 
403 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
404 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
405 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
406 	return 0;
407 }
408 
409 /*
410  * Initialises a given port using global settings, with the Rx buffers
411  * coming from the global mbuf_pool.
412  */
413 static inline int
414 port_init(uint16_t port)
415 {
416 	struct rte_eth_dev_info dev_info;
417 	struct rte_eth_conf port_conf;
418 	struct rte_eth_rxconf *rxconf;
419 	struct rte_eth_txconf *txconf;
420 	int16_t rx_rings, tx_rings;
421 	uint16_t rx_ring_size, tx_ring_size;
422 	int retval;
423 	uint16_t q;
424 
425 	/* The max pool number from dev_info is used to validate the pool number specified on the command line */
426 	retval = rte_eth_dev_info_get(port, &dev_info);
427 	if (retval != 0) {
428 		RTE_LOG(ERR, VHOST_PORT,
429 			"Error during getting device (port %u) info: %s\n",
430 			port, strerror(-retval));
431 
432 		return retval;
433 	}
434 	if (dev_info.max_vmdq_pools == 0) {
435 		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
436 		return -1;
437 	}
438 
439 	rxconf = &dev_info.default_rxconf;
440 	txconf = &dev_info.default_txconf;
441 	rxconf->rx_drop_en = 1;
442 
443 	/* Configure the number of supported virtio devices based on VMDq limits */
444 	num_devices = dev_info.max_vmdq_pools;
445 
446 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
447 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
448 
449 	tx_rings = (uint16_t)rte_lcore_count();
450 
451 	if (mergeable) {
452 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
453 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
454 		else
455 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
456 	}
457 
458 	/* Get port configuration. */
459 	retval = get_eth_conf(&port_conf, num_devices);
460 	if (retval < 0)
461 		return retval;
462 	/* NIC queues are divided into pf queues and vmdq queues.  */
463 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
464 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
465 	num_vmdq_queues = num_devices * queues_per_pool;
466 	num_queues = num_pf_queues + num_vmdq_queues;
467 	vmdq_queue_base = dev_info.vmdq_queue_base;
468 	vmdq_pool_base  = dev_info.vmdq_pool_base;
469 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
470 		num_pf_queues, num_devices, queues_per_pool);
471 
472 	if (!rte_eth_dev_is_valid_port(port))
473 		return -1;
474 
475 	rx_rings = (uint16_t)dev_info.max_rx_queues;
476 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
477 		port_conf.txmode.offloads |=
478 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
479 	/* Configure ethernet device. */
480 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
481 	if (retval != 0) {
482 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
483 			port, strerror(-retval));
484 		return retval;
485 	}
486 
487 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
488 		&tx_ring_size);
489 	if (retval != 0) {
490 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
491 			"for port %u: %s.\n", port, strerror(-retval));
492 		return retval;
493 	}
494 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
495 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
496 			"for Rx queues on port %u.\n", port);
497 		return -1;
498 	}
499 
500 	/* Setup the queues. */
501 	rxconf->offloads = port_conf.rxmode.offloads;
502 	for (q = 0; q < rx_rings; q ++) {
503 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
504 						rte_eth_dev_socket_id(port),
505 						rxconf,
506 						mbuf_pool);
507 		if (retval < 0) {
508 			RTE_LOG(ERR, VHOST_PORT,
509 				"Failed to setup rx queue %u of port %u: %s.\n",
510 				q, port, strerror(-retval));
511 			return retval;
512 		}
513 	}
514 	txconf->offloads = port_conf.txmode.offloads;
515 	for (q = 0; q < tx_rings; q ++) {
516 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
517 						rte_eth_dev_socket_id(port),
518 						txconf);
519 		if (retval < 0) {
520 			RTE_LOG(ERR, VHOST_PORT,
521 				"Failed to setup tx queue %u of port %u: %s.\n",
522 				q, port, strerror(-retval));
523 			return retval;
524 		}
525 	}
526 
527 	/* Start the device. */
528 	retval  = rte_eth_dev_start(port);
529 	if (retval < 0) {
530 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
531 			port, strerror(-retval));
532 		return retval;
533 	}
534 
535 	if (promiscuous) {
536 		retval = rte_eth_promiscuous_enable(port);
537 		if (retval != 0) {
538 			RTE_LOG(ERR, VHOST_PORT,
539 				"Failed to enable promiscuous mode on port %u: %s\n",
540 				port, rte_strerror(-retval));
541 			return retval;
542 		}
543 	}
544 
545 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
546 	if (retval < 0) {
547 		RTE_LOG(ERR, VHOST_PORT,
548 			"Failed to get MAC address on port %u: %s\n",
549 			port, rte_strerror(-retval));
550 		return retval;
551 	}
552 
553 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
554 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
555 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
556 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
557 
558 	return 0;
559 }
560 
561 /*
562  * Set socket file path.
563  */
564 static int
565 us_vhost_parse_socket_path(const char *q_arg)
566 {
567 	char *old;
568 
569 	/* reject paths that are too long */
570 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
571 		return -1;
572 
573 	old = socket_files;
574 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
575 	if (socket_files == NULL) {
576 		free(old);
577 		return -1;
578 	}
579 
580 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
581 	nb_sockets++;
582 
583 	return 0;
584 }
585 
586 /*
587  * Parse the portmask provided at run time.
588  */
589 static int
590 parse_portmask(const char *portmask)
591 {
592 	char *end = NULL;
593 	unsigned long pm;
594 
595 	errno = 0;
596 
597 	/* parse hexadecimal string */
598 	pm = strtoul(portmask, &end, 16);
599 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
600 		return 0;
601 
602 	return pm;
603 
604 }
605 
606 /*
607  * Parse num options at run time.
608  */
609 static int
610 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
611 {
612 	char *end = NULL;
613 	unsigned long num;
614 
615 	errno = 0;
616 
617 	/* parse unsigned int string */
618 	num = strtoul(q_arg, &end, 10);
619 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
620 		return -1;
621 
622 	if (num > max_valid_value)
623 		return -1;
624 
625 	return num;
626 
627 }
628 
629 /*
630  * Display usage
631  */
632 static void
633 us_vhost_usage(const char *prgname)
634 {
635 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
636 	"		--vm2vm [0|1|2]\n"
637 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
638 	"		--socket-file <path>\n"
640 	"		-p PORTMASK: Set mask for ports to be used by application\n"
641 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
642 	"		--rx-retry [0|1]: disable/enable (default) retries on Rx; retries are attempted when the destination queue is full\n"
643 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. This only takes effect if Rx retries are enabled\n"
644 	"		--rx-retry-num [0-N]: the number of retries on Rx. This only takes effect if Rx retries are enabled\n"
645 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
646 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
647 	"		--socket-file: The path of the socket file.\n"
648 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
649 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
650 	"		--client register a vhost-user socket in client mode.\n"
651 	"		--dmas register DMA channels for specific vhost devices.\n"
652 	"		--total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools; the default value is 147456.\n",
653 	       prgname);
654 }
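
/*
 * Illustrative invocation (socket path and DMA name are examples only):
 *   dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --dmas [txd0@0000:00:04.0] --client --stats 1
 */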
655 
656 enum {
657 #define OPT_VM2VM               "vm2vm"
658 	OPT_VM2VM_NUM = 256,
659 #define OPT_RX_RETRY            "rx-retry"
660 	OPT_RX_RETRY_NUM,
661 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
662 	OPT_RX_RETRY_DELAY_NUM,
663 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
664 	OPT_RX_RETRY_NUMB_NUM,
665 #define OPT_MERGEABLE           "mergeable"
666 	OPT_MERGEABLE_NUM,
667 #define OPT_STATS               "stats"
668 	OPT_STATS_NUM,
669 #define OPT_SOCKET_FILE         "socket-file"
670 	OPT_SOCKET_FILE_NUM,
671 #define OPT_TX_CSUM             "tx-csum"
672 	OPT_TX_CSUM_NUM,
673 #define OPT_TSO                 "tso"
674 	OPT_TSO_NUM,
675 #define OPT_CLIENT              "client"
676 	OPT_CLIENT_NUM,
677 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
678 	OPT_BUILTIN_NET_DRIVER_NUM,
679 #define OPT_DMAS                "dmas"
680 	OPT_DMAS_NUM,
681 #define OPT_NUM_MBUFS           "total-num-mbufs"
682 	OPT_NUM_MBUFS_NUM,
683 };
684 
685 /*
686  * Parse the arguments given in the command line of the application.
687  */
688 static int
689 us_vhost_parse_args(int argc, char **argv)
690 {
691 	int opt, ret;
692 	int option_index;
693 	unsigned i;
694 	const char *prgname = argv[0];
695 	static struct option long_option[] = {
696 		{OPT_VM2VM, required_argument,
697 				NULL, OPT_VM2VM_NUM},
698 		{OPT_RX_RETRY, required_argument,
699 				NULL, OPT_RX_RETRY_NUM},
700 		{OPT_RX_RETRY_DELAY, required_argument,
701 				NULL, OPT_RX_RETRY_DELAY_NUM},
702 		{OPT_RX_RETRY_NUMB, required_argument,
703 				NULL, OPT_RX_RETRY_NUMB_NUM},
704 		{OPT_MERGEABLE, required_argument,
705 				NULL, OPT_MERGEABLE_NUM},
706 		{OPT_STATS, required_argument,
707 				NULL, OPT_STATS_NUM},
708 		{OPT_SOCKET_FILE, required_argument,
709 				NULL, OPT_SOCKET_FILE_NUM},
710 		{OPT_TX_CSUM, required_argument,
711 				NULL, OPT_TX_CSUM_NUM},
712 		{OPT_TSO, required_argument,
713 				NULL, OPT_TSO_NUM},
714 		{OPT_CLIENT, no_argument,
715 				NULL, OPT_CLIENT_NUM},
716 		{OPT_BUILTIN_NET_DRIVER, no_argument,
717 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
718 		{OPT_DMAS, required_argument,
719 				NULL, OPT_DMAS_NUM},
720 		{OPT_NUM_MBUFS, required_argument,
721 				NULL, OPT_NUM_MBUFS_NUM},
722 		{NULL, 0, 0, 0},
723 	};
724 
725 	/* Parse command line */
726 	while ((opt = getopt_long(argc, argv, "p:P",
727 			long_option, &option_index)) != EOF) {
728 		switch (opt) {
729 		/* Portmask */
730 		case 'p':
731 			enabled_port_mask = parse_portmask(optarg);
732 			if (enabled_port_mask == 0) {
733 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
734 				us_vhost_usage(prgname);
735 				return -1;
736 			}
737 			break;
738 
739 		case 'P':
740 			promiscuous = 1;
741 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
742 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
743 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
744 			break;
745 
746 		case OPT_VM2VM_NUM:
747 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
748 			if (ret == -1) {
749 				RTE_LOG(INFO, VHOST_CONFIG,
750 					"Invalid argument for "
751 					"vm2vm [0|1|2]\n");
752 				us_vhost_usage(prgname);
753 				return -1;
754 			}
755 			vm2vm_mode = (vm2vm_type)ret;
756 			break;
757 
758 		case OPT_RX_RETRY_NUM:
759 			ret = parse_num_opt(optarg, 1);
760 			if (ret == -1) {
761 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
762 				us_vhost_usage(prgname);
763 				return -1;
764 			}
765 			enable_retry = ret;
766 			break;
767 
768 		case OPT_TX_CSUM_NUM:
769 			ret = parse_num_opt(optarg, 1);
770 			if (ret == -1) {
771 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
772 				us_vhost_usage(prgname);
773 				return -1;
774 			}
775 			enable_tx_csum = ret;
776 			break;
777 
778 		case OPT_TSO_NUM:
779 			ret = parse_num_opt(optarg, 1);
780 			if (ret == -1) {
781 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
782 				us_vhost_usage(prgname);
783 				return -1;
784 			}
785 			enable_tso = ret;
786 			break;
787 
788 		case OPT_RX_RETRY_DELAY_NUM:
789 			ret = parse_num_opt(optarg, INT32_MAX);
790 			if (ret == -1) {
791 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
792 				us_vhost_usage(prgname);
793 				return -1;
794 			}
795 			burst_rx_delay_time = ret;
796 			break;
797 
798 		case OPT_RX_RETRY_NUMB_NUM:
799 			ret = parse_num_opt(optarg, INT32_MAX);
800 			if (ret == -1) {
801 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
802 				us_vhost_usage(prgname);
803 				return -1;
804 			}
805 			burst_rx_retry_num = ret;
806 			break;
807 
808 		case OPT_MERGEABLE_NUM:
809 			ret = parse_num_opt(optarg, 1);
810 			if (ret == -1) {
811 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
812 				us_vhost_usage(prgname);
813 				return -1;
814 			}
815 			mergeable = !!ret;
816 			break;
817 
818 		case OPT_STATS_NUM:
819 			ret = parse_num_opt(optarg, INT32_MAX);
820 			if (ret == -1) {
821 				RTE_LOG(INFO, VHOST_CONFIG,
822 					"Invalid argument for stats [0..N]\n");
823 				us_vhost_usage(prgname);
824 				return -1;
825 			}
826 			enable_stats = ret;
827 			break;
828 
829 		/* Set socket file path. */
830 		case OPT_SOCKET_FILE_NUM:
831 			if (us_vhost_parse_socket_path(optarg) == -1) {
832 				RTE_LOG(INFO, VHOST_CONFIG,
833 				"Invalid argument for socket name (Max %d characters)\n",
834 				PATH_MAX);
835 				us_vhost_usage(prgname);
836 				return -1;
837 			}
838 			break;
839 
840 		case OPT_DMAS_NUM:
841 			if (open_dma(optarg) == -1) {
842 				RTE_LOG(INFO, VHOST_CONFIG,
843 					"Wrong DMA args\n");
844 				us_vhost_usage(prgname);
845 				return -1;
846 			}
847 			break;
848 
849 		case OPT_NUM_MBUFS_NUM:
850 			ret = parse_num_opt(optarg, INT32_MAX);
851 			if (ret == -1) {
852 				RTE_LOG(INFO, VHOST_CONFIG,
853 					"Invalid argument for total-num-mbufs [0..N]\n");
854 				us_vhost_usage(prgname);
855 				return -1;
856 			}
857 
858 			if (total_num_mbufs < ret)
859 				total_num_mbufs = ret;
860 			break;
861 
862 		case OPT_CLIENT_NUM:
863 			client_mode = 1;
864 			break;
865 
866 		case OPT_BUILTIN_NET_DRIVER_NUM:
867 			builtin_net_driver = 1;
868 			break;
869 
870 		/* Invalid option - print options. */
871 		default:
872 			us_vhost_usage(prgname);
873 			return -1;
874 		}
875 	}
876 
877 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
878 		if (enabled_port_mask & (1 << i))
879 			ports[num_ports++] = i;
880 	}
881 
882 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
883 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
884 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
885 		return -1;
886 	}
887 
888 	return 0;
889 }
890 
891 /*
892  * Update the global variable num_ports and the ports[] array according to the
893  * number of ports in the system, and return the number of valid ports.
894  */
895 static unsigned check_ports_num(unsigned nb_ports)
896 {
897 	unsigned valid_num_ports = num_ports;
898 	unsigned portid;
899 
900 	if (num_ports > nb_ports) {
901 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
902 			num_ports, nb_ports);
903 		num_ports = nb_ports;
904 	}
905 
906 	for (portid = 0; portid < num_ports; portid ++) {
907 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
908 			RTE_LOG(INFO, VHOST_PORT,
909 				"\nSpecified port ID(%u) is not valid\n",
910 				ports[portid]);
911 			ports[portid] = INVALID_PORT_ID;
912 			valid_num_ports--;
913 		}
914 	}
915 	return valid_num_ports;
916 }
917 
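/*
 * Look up a vhost device by the MAC address learned from its traffic.
 * Only devices that have completed MAC learning (ready == DEVICE_RX) match.
 */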
918 static __rte_always_inline struct vhost_dev *
919 find_vhost_dev(struct rte_ether_addr *mac)
920 {
921 	struct vhost_dev *vdev;
922 
923 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
924 		if (vdev->ready == DEVICE_RX &&
925 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
926 			return vdev;
927 	}
928 
929 	return NULL;
930 }
931 
932 /*
933  * This function learns the MAC address of the device and registers it, along
934  * with a VLAN tag, in a VMDq pool.
935  */
936 static int
937 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
938 {
939 	struct rte_ether_hdr *pkt_hdr;
940 	int i, ret;
941 
942 	/* Learn MAC address of guest device from packet */
943 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
944 
945 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
946 		RTE_LOG(ERR, VHOST_DATA,
947 			"(%d) device is using a registered MAC!\n",
948 			vdev->vid);
949 		return -1;
950 	}
951 
952 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
953 		vdev->mac_address.addr_bytes[i] =
954 			pkt_hdr->src_addr.addr_bytes[i];
955 
956 	/* vlan_tag currently uses the device_id. */
957 	vdev->vlan_tag = vlan_tags[vdev->vid];
958 
959 	/* Print out VMDQ registration info. */
960 	RTE_LOG(INFO, VHOST_DATA,
961 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
962 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
963 		vdev->vlan_tag);
964 
965 	/* Register the MAC address. */
966 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
967 				(uint32_t)vdev->vid + vmdq_pool_base);
968 	if (ret)
969 		RTE_LOG(ERR, VHOST_DATA,
970 			"(%d) failed to add device MAC address to VMDQ\n",
971 			vdev->vid);
972 
973 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
974 
975 	/* Set device as ready for RX. */
976 	vdev->ready = DEVICE_RX;
977 
978 	return 0;
979 }
980 
981 /*
982  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
983  * queue before disabling RX on the device.
984  */
985 static inline void
986 unlink_vmdq(struct vhost_dev *vdev)
987 {
988 	unsigned i = 0;
989 	unsigned rx_count;
990 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
991 
992 	if (vdev->ready == DEVICE_RX) {
993 		/* Clear MAC and VLAN settings */
994 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
995 		for (i = 0; i < 6; i++)
996 			vdev->mac_address.addr_bytes[i] = 0;
997 
998 		vdev->vlan_tag = 0;
999 
1000 		/* Clear out the receive buffers */
1001 		rx_count = rte_eth_rx_burst(ports[0],
1002 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1003 
1004 		while (rx_count) {
1005 			for (i = 0; i < rx_count; i++)
1006 				rte_pktmbuf_free(pkts_burst[i]);
1007 
1008 			rx_count = rte_eth_rx_burst(ports[0],
1009 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1010 		}
1011 
1012 		vdev->ready = DEVICE_MAC_LEARNING;
1013 	}
1014 }
1015 
1016 static inline void
1017 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1018 {
1019 	while (n--)
1020 		rte_pktmbuf_free(pkts[n]);
1021 }
1022 
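/*
 * Poll completions of in-flight async (DMA) enqueues on the guest Rx ring
 * and free the mbufs whose copies have finished.
 */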
1023 static __rte_always_inline void
1024 complete_async_pkts(struct vhost_dev *vdev)
1025 {
1026 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1027 	uint16_t complete_count;
1028 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1029 
1030 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1031 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1032 	if (complete_count)
1033 		free_pkts(p_cpl, complete_count);
1034 
1035 }
1036 
1037 static __rte_always_inline void
1038 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1039 	    struct rte_mbuf *m)
1040 {
1041 	uint16_t ret;
1042 
1043 	if (builtin_net_driver) {
1044 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1045 	} else {
1046 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1047 	}
1048 
1049 	if (enable_stats) {
1050 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1051 				__ATOMIC_SEQ_CST);
1052 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1053 				__ATOMIC_SEQ_CST);
1054 		src_vdev->stats.tx_total++;
1055 		src_vdev->stats.tx += ret;
1056 	}
1057 }
1058 
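/*
 * Flush the per-lcore TX buffer accumulated for this vhost device into its
 * Rx virtqueue. On the sync path the mbufs are freed here; on the async (DMA)
 * path they are freed later by complete_async_pkts().
 */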
1059 static __rte_always_inline void
1060 drain_vhost(struct vhost_dev *vdev)
1061 {
1062 	uint16_t ret;
1063 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1064 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1065 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1066 
1067 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1068 
1069 	if (enable_stats) {
1070 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1071 				__ATOMIC_SEQ_CST);
1072 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1073 				__ATOMIC_SEQ_CST);
1074 	}
1075 
1076 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1077 		free_pkts(m, nr_xmit);
1078 }
1079 
1080 static __rte_always_inline void
1081 drain_vhost_table(void)
1082 {
1083 	uint16_t lcore_id = rte_lcore_id();
1084 	struct vhost_bufftable *vhost_txq;
1085 	struct vhost_dev *vdev;
1086 	uint64_t cur_tsc;
1087 
1088 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1089 		if (unlikely(vdev->remove == 1))
1090 			continue;
1091 
1092 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1093 
1094 		cur_tsc = rte_rdtsc();
1095 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1096 				> MBUF_TABLE_DRAIN_TSC)) {
1097 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1098 				"Vhost TX queue drained after timeout with burst size %u\n",
1099 				vhost_txq->len);
1100 			drain_vhost(vdev);
1101 			vhost_txq->len = 0;
1102 			vhost_txq->pre_tsc = cur_tsc;
1103 		}
1104 	}
1105 }
1106 
1107 /*
1108  * Check if the packet's destination MAC address is for a local device. If so, put
1109  * the packet on that device's RX queue. If not, return.
1110  */
1111 static __rte_always_inline int
1112 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1113 {
1114 	struct rte_ether_hdr *pkt_hdr;
1115 	struct vhost_dev *dst_vdev;
1116 	struct vhost_bufftable *vhost_txq;
1117 	uint16_t lcore_id = rte_lcore_id();
1118 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1119 
1120 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1121 	if (!dst_vdev)
1122 		return -1;
1123 
1124 	if (vdev->vid == dst_vdev->vid) {
1125 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1126 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1127 			vdev->vid);
1128 		return 0;
1129 	}
1130 
1131 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1132 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1133 
1134 	if (unlikely(dst_vdev->remove)) {
1135 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1136 			"(%d) device is marked for removal\n", dst_vdev->vid);
1137 		return 0;
1138 	}
1139 
1140 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1141 	vhost_txq->m_table[vhost_txq->len++] = m;
1142 
1143 	if (enable_stats) {
1144 		vdev->stats.tx_total++;
1145 		vdev->stats.tx++;
1146 	}
1147 
1148 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1149 		drain_vhost(dst_vdev);
1150 		vhost_txq->len = 0;
1151 		vhost_txq->pre_tsc = rte_rdtsc();
1152 	}
1153 	return 0;
1154 }
1155 
1156 /*
1157  * Check if the destination MAC of a packet belongs to a local VM,
1158  * and if so, get its VLAN tag and the length offset.
1159  */
1160 static __rte_always_inline int
1161 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1162 	uint32_t *offset, uint16_t *vlan_tag)
1163 {
1164 	struct vhost_dev *dst_vdev;
1165 	struct rte_ether_hdr *pkt_hdr =
1166 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1167 
1168 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1169 	if (!dst_vdev)
1170 		return 0;
1171 
1172 	if (vdev->vid == dst_vdev->vid) {
1173 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1174 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1175 			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1176 		return -1;
1177 	}
1178 
1179 	/*
1180 	 * HW VLAN strip reduces the packet length by the
1181 	 * length of the VLAN tag, so restore the packet
1182 	 * length by adding it back.
1183 	 */
1184 	*offset  = RTE_VLAN_HLEN;
1185 	*vlan_tag = vlan_tags[vdev->vid];
1186 
1187 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1188 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1189 		vdev->vid, dst_vdev->vid, *vlan_tag);
1190 
1191 	return 0;
1192 }
1193 
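/*
 * Prepare TSO transmit offload metadata for a packet received with LRO:
 * parse the headers, fill in l2/l3/l4 lengths, set the TSO flags and
 * compute the pseudo-header checksum expected by the NIC.
 */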
1194 static void virtio_tx_offload(struct rte_mbuf *m)
1195 {
1196 	struct rte_net_hdr_lens hdr_lens;
1197 	struct rte_ipv4_hdr *ipv4_hdr;
1198 	struct rte_tcp_hdr *tcp_hdr;
1199 	uint32_t ptype;
1200 	void *l3_hdr;
1201 
1202 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1203 	m->l2_len = hdr_lens.l2_len;
1204 	m->l3_len = hdr_lens.l3_len;
1205 	m->l4_len = hdr_lens.l4_len;
1206 
1207 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1208 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1209 		m->l2_len + m->l3_len);
1210 
1211 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1212 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1213 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1214 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1215 		ipv4_hdr = l3_hdr;
1216 		ipv4_hdr->hdr_checksum = 0;
1217 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1218 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1219 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1220 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1221 	}
1222 }
1223 
1224 static __rte_always_inline void
1225 do_drain_mbuf_table(struct mbuf_table *tx_q)
1226 {
1227 	uint16_t count;
1228 
1229 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1230 				 tx_q->m_table, tx_q->len);
1231 	if (unlikely(count < tx_q->len))
1232 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1233 
1234 	tx_q->len = 0;
1235 }
1236 
1237 /*
1238  * This function routes the TX packet to the correct interface. This
1239  * may be a local device or the physical port.
1240  */
1241 static __rte_always_inline void
1242 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1243 {
1244 	struct mbuf_table *tx_q;
1245 	unsigned offset = 0;
1246 	const uint16_t lcore_id = rte_lcore_id();
1247 	struct rte_ether_hdr *nh;
1248 
1249 
1250 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1251 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1252 		struct vhost_dev *vdev2;
1253 
1254 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1255 			if (vdev2 != vdev)
1256 				sync_virtio_xmit(vdev2, vdev, m);
1257 		}
1258 		goto queue2nic;
1259 	}
1260 
1261 	/* Check if destination is a local VM */
1262 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1263 		return;
1264 
1265 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1266 		if (unlikely(find_local_dest(vdev, m, &offset,
1267 					     &vlan_tag) != 0)) {
1268 			rte_pktmbuf_free(m);
1269 			return;
1270 		}
1271 	}
1272 
1273 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1274 		"(%d) TX: MAC address is external\n", vdev->vid);
1275 
1276 queue2nic:
1277 
1278 	/* Add packet to the port TX queue */
1279 	tx_q = &lcore_tx_queue[lcore_id];
1280 
1281 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1282 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1283 		/* Guest has inserted the vlan tag. */
1284 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1285 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1286 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1287 			(vh->vlan_tci != vlan_tag_be))
1288 			vh->vlan_tci = vlan_tag_be;
1289 	} else {
1290 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1291 
1292 		/*
1293 		 * Find the right seg to adjust the data len when offset is
1294 		 * bigger than tail room size.
1295 		 */
1296 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1297 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1298 				m->data_len += offset;
1299 			else {
1300 				struct rte_mbuf *seg = m;
1301 
1302 				while ((seg->next != NULL) &&
1303 					(offset > rte_pktmbuf_tailroom(seg)))
1304 					seg = seg->next;
1305 
1306 				seg->data_len += offset;
1307 			}
1308 			m->pkt_len += offset;
1309 		}
1310 
1311 		m->vlan_tci = vlan_tag;
1312 	}
1313 
1314 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1315 		virtio_tx_offload(m);
1316 
1317 	tx_q->m_table[tx_q->len++] = m;
1318 	if (enable_stats) {
1319 		vdev->stats.tx_total++;
1320 		vdev->stats.tx++;
1321 	}
1322 
1323 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1324 		do_drain_mbuf_table(tx_q);
1325 }
1326 
1327 
1328 static __rte_always_inline void
1329 drain_mbuf_table(struct mbuf_table *tx_q)
1330 {
1331 	static uint64_t prev_tsc;
1332 	uint64_t cur_tsc;
1333 
1334 	if (tx_q->len == 0)
1335 		return;
1336 
1337 	cur_tsc = rte_rdtsc();
1338 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1339 		prev_tsc = cur_tsc;
1340 
1341 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1342 			"TX queue drained after timeout with burst size %u\n",
1343 			tx_q->len);
1344 		do_drain_mbuf_table(tx_q);
1345 	}
1346 }
1347 
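/*
 * Enqueue packets to the guest Rx ring through the DMA channel bound to this
 * socket's enqueue path. Packets that could not be submitted are freed here;
 * completed copies are reaped by complete_async_pkts().
 */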
1348 uint16_t
1349 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1350 		struct rte_mbuf **pkts, uint32_t rx_count)
1351 {
1352 	uint16_t enqueue_count;
1353 	uint16_t enqueue_fail = 0;
1354 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1355 
1356 	complete_async_pkts(dev);
1357 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1358 					pkts, rx_count, dma_id, 0);
1359 
1360 	enqueue_fail = rx_count - enqueue_count;
1361 	if (enqueue_fail)
1362 		free_pkts(&pkts[enqueue_count], enqueue_fail);
1363 
1364 	return enqueue_count;
1365 }
1366 
1367 uint16_t
1368 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1369 		struct rte_mbuf **pkts, uint32_t rx_count)
1370 {
1371 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1372 }
1373 
1374 static __rte_always_inline void
1375 drain_eth_rx(struct vhost_dev *vdev)
1376 {
1377 	uint16_t rx_count, enqueue_count;
1378 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1379 
1380 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1381 				    pkts, MAX_PKT_BURST);
1382 
1383 	if (!rx_count)
1384 		return;
1385 
1386 	/*
1387 	 * When "enable_retry" is set, wait and retry when there
1388 	 * are not enough free slots in the queue to hold @rx_count
1389 	 * packets, to reduce packet loss.
1390 	 */
1391 	if (enable_retry &&
1392 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1393 			VIRTIO_RXQ))) {
1394 		uint32_t retry;
1395 
1396 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1397 			rte_delay_us(burst_rx_delay_time);
1398 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1399 					VIRTIO_RXQ))
1400 				break;
1401 		}
1402 	}
1403 
1404 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1405 					VIRTIO_RXQ, pkts, rx_count);
1406 
1407 	if (enable_stats) {
1408 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1409 				__ATOMIC_SEQ_CST);
1410 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1411 				__ATOMIC_SEQ_CST);
1412 	}
1413 
1414 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1415 		free_pkts(pkts, rx_count);
1416 }
1417 
1418 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1419 			    struct rte_mempool *mbuf_pool,
1420 			    struct rte_mbuf **pkts, uint16_t count)
1421 {
1422 	int nr_inflight;
1423 	uint16_t dequeue_count;
1424 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1425 
1426 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1427 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1428 
1429 	return dequeue_count;
1430 }
1431 
1432 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1433 			   struct rte_mempool *mbuf_pool,
1434 			   struct rte_mbuf **pkts, uint16_t count)
1435 {
1436 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1437 }
1438 
1439 static __rte_always_inline void
1440 drain_virtio_tx(struct vhost_dev *vdev)
1441 {
1442 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1443 	uint16_t count;
1444 	uint16_t i;
1445 
1446 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1447 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1448 
1449 	/* setup VMDq for the first packet */
1450 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1451 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1452 			free_pkts(pkts, count);
1453 	}
1454 
1455 	for (i = 0; i < count; ++i)
1456 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1457 }
1458 
1459 /*
1460  * Main function of vhost-switch. It basically does:
1461  *
1462  * for each vhost device {
1463  *    - drain_eth_rx()
1464  *
1465  *      Which drains the host eth Rx queue linked to the vhost device,
1466  *      and delivers all of the packets to the guest virtio Rx ring
1467  *      associated with this vhost device.
1468  *
1469  *    - drain_virtio_tx()
1470  *
1471  *      Which drains the guest virtio Tx queue and delivers all of the
1472  *      packets to the target, which could be another vhost device or the
1473  *      physical eth dev. The routing is done in function "virtio_tx_route".
1474  * }
1475  */
1476 static int
1477 switch_worker(void *arg __rte_unused)
1478 {
1479 	unsigned i;
1480 	unsigned lcore_id = rte_lcore_id();
1481 	struct vhost_dev *vdev;
1482 	struct mbuf_table *tx_q;
1483 
1484 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1485 
1486 	tx_q = &lcore_tx_queue[lcore_id];
1487 	for (i = 0; i < rte_lcore_count(); i++) {
1488 		if (lcore_ids[i] == lcore_id) {
1489 			tx_q->txq_id = i;
1490 			break;
1491 		}
1492 	}
1493 
1494 	while(1) {
1495 		drain_mbuf_table(tx_q);
1496 		drain_vhost_table();
1497 		/*
1498 		 * Inform the configuration core, if requested, that we have
1499 		 * exited the linked list and that no devices are in use.
1500 		 */
1501 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1502 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1503 
1504 		/*
1505 		 * Process vhost devices
1506 		 */
1507 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1508 			      lcore_vdev_entry) {
1509 			if (unlikely(vdev->remove)) {
1510 				unlink_vmdq(vdev);
1511 				vdev->ready = DEVICE_SAFE_REMOVE;
1512 				continue;
1513 			}
1514 
1515 			if (likely(vdev->ready == DEVICE_RX))
1516 				drain_eth_rx(vdev);
1517 
1518 			if (likely(!vdev->remove))
1519 				drain_virtio_tx(vdev);
1520 		}
1521 	}
1522 
1523 	return 0;
1524 }
1525 
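/*
 * Drain all in-flight async (DMA) descriptors of a virtqueue and free the
 * associated mbufs; called when a virtqueue is disabled or the device is
 * being destroyed.
 */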
1526 static void
1527 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1528 {
1529 	uint16_t n_pkt = 0;
1530 	int pkts_inflight;
1531 
1532 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1533 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1534 
1535 	struct rte_mbuf *m_cpl[pkts_inflight];
1536 
1537 	while (pkts_inflight) {
1538 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1539 							pkts_inflight, dma_id, 0);
1540 		free_pkts(m_cpl, n_pkt);
1541 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1542 									queue_id);
1543 	}
1544 }
1545 
1546 /*
1547  * Remove a device from the specific data core linked list and from the
1548  * main linked list. Synchronization occurs through the use of the
1549  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1550  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1551  */
1552 static void
1553 destroy_device(int vid)
1554 {
1555 	struct vhost_dev *vdev = NULL;
1556 	int lcore;
1557 	uint16_t i;
1558 
1559 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1560 		if (vdev->vid == vid)
1561 			break;
1562 	}
1563 	if (!vdev)
1564 		return;
1565 	/* Set the remove flag. */
1566 	vdev->remove = 1;
1567 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1568 		rte_pause();
1569 	}
1570 
1571 	for (i = 0; i < RTE_MAX_LCORE; i++)
1572 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1573 
1574 	if (builtin_net_driver)
1575 		vs_vhost_net_remove(vdev);
1576 
1577 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1578 		     lcore_vdev_entry);
1579 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1580 
1581 
1582 	/* Set the dev_removal_flag on each lcore. */
1583 	RTE_LCORE_FOREACH_WORKER(lcore)
1584 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1585 
1586 	/*
1587 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1588 	 * we can be sure that they can no longer access the device removed
1589 	 * from the linked lists and that the devices are no longer in use.
1590 	 */
1591 	RTE_LCORE_FOREACH_WORKER(lcore) {
1592 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1593 			rte_pause();
1594 	}
1595 
1596 	lcore_info[vdev->coreid].device_num--;
1597 
1598 	RTE_LOG(INFO, VHOST_DATA,
1599 		"(%d) device has been removed from data core\n",
1600 		vdev->vid);
1601 
1602 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1603 		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
1604 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1605 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1606 	}
1607 
1608 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1609 		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
1610 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1611 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1612 	}
1613 
1614 	rte_free(vdev);
1615 }
1616 
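/*
 * Map a vhost device id back to the index of its socket file by comparing
 * the vhost interface name against the registered socket paths.
 */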
1617 static inline int
1618 get_socketid_by_vid(int vid)
1619 {
1620 	int i;
1621 	char ifname[PATH_MAX];
1622 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1623 
1624 	for (i = 0; i < nb_sockets; i++) {
1625 		char *file = socket_files + i * PATH_MAX;
1626 		if (strcmp(file, ifname) == 0)
1627 			return i;
1628 	}
1629 
1630 	return -1;
1631 }
1632 
1633 static int
1634 init_vhost_queue_ops(int vid)
1635 {
1636 	if (builtin_net_driver) {
1637 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1638 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1639 	} else {
1640 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1641 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1642 		else
1643 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1644 
1645 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1646 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1647 		else
1648 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1649 	}
1650 
1651 	return 0;
1652 }
1653 
1654 static inline int
1655 vhost_async_channel_register(int vid)
1656 {
1657 	int rx_ret = 0, tx_ret = 0;
1658 
1659 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1660 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1661 		if (rx_ret == 0)
1662 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1663 	}
1664 
1665 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1666 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1667 		if (tx_ret == 0)
1668 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1669 	}
1670 
1671 	return rx_ret | tx_ret;
1672 }
1673 
1674 
1675 
1676 /*
1677  * A new device is added to a data core. First the device is added to the main linked list
1678  * and then allocated to a specific data core.
1679  */
1680 static int
1681 new_device(int vid)
1682 {
1683 	int lcore, core_add = 0;
1684 	uint16_t i;
1685 	uint32_t device_num_min = num_devices;
1686 	struct vhost_dev *vdev;
1687 	int ret;
1688 
1689 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1690 	if (vdev == NULL) {
1691 		RTE_LOG(INFO, VHOST_DATA,
1692 			"(%d) couldn't allocate memory for vhost dev\n",
1693 			vid);
1694 		return -1;
1695 	}
1696 	vdev->vid = vid;
1697 
1698 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1699 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1700 			= rte_zmalloc("vhost bufftable",
1701 				sizeof(struct vhost_bufftable),
1702 				RTE_CACHE_LINE_SIZE);
1703 
1704 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1705 			RTE_LOG(INFO, VHOST_DATA,
1706 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1707 			return -1;
1708 		}
1709 	}
1710 
1711 	int socketid = get_socketid_by_vid(vid);
1712 	if (socketid == -1)
1713 		return -1;
1714 
1715 	init_vid2socketid_array(vid, socketid);
1716 
1717 	ret =  vhost_async_channel_register(vid);
1718 
1719 	if (init_vhost_queue_ops(vid) != 0)
1720 		return -1;
1721 
1722 	if (builtin_net_driver)
1723 		vs_vhost_net_setup(vdev);
1724 
1725 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1726 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1727 
1728 	/* Reset the ready flag */
1729 	vdev->ready = DEVICE_MAC_LEARNING;
1730 	vdev->remove = 0;
1731 
1732 	/* Find a suitable lcore to add the device. */
1733 	RTE_LCORE_FOREACH_WORKER(lcore) {
1734 		if (lcore_info[lcore].device_num < device_num_min) {
1735 			device_num_min = lcore_info[lcore].device_num;
1736 			core_add = lcore;
1737 		}
1738 	}
1739 	vdev->coreid = core_add;
1740 
1741 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1742 			  lcore_vdev_entry);
1743 	lcore_info[vdev->coreid].device_num++;
1744 
1745 	/* Disable notifications. */
1746 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1747 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1748 
1749 	RTE_LOG(INFO, VHOST_DATA,
1750 		"(%d) device has been added to data core %d\n",
1751 		vid, vdev->coreid);
1752 
1753 	return ret;
1754 }
1755 
1756 static int
1757 vring_state_changed(int vid, uint16_t queue_id, int enable)
1758 {
1759 	struct vhost_dev *vdev = NULL;
1760 
1761 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1762 		if (vdev->vid == vid)
1763 			break;
1764 	}
1765 	if (!vdev)
1766 		return -1;
1767 
1768 	if (queue_id != VIRTIO_RXQ)
1769 		return 0;
1770 
1771 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1772 		if (!enable)
1773 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1774 	}
1775 
1776 	return 0;
1777 }
1778 
1779 /*
1780  * These callbacks allow devices to be added to the data core when configuration
1781  * has fully completed.
1782  */
1783 static const struct rte_vhost_device_ops virtio_net_device_ops =
1784 {
1785 	.new_device =  new_device,
1786 	.destroy_device = destroy_device,
1787 	.vring_state_changed = vring_state_changed,
1788 };
1789 
1790 /*
1791  * This is a thread that wakes up periodically to print stats if the user has
1792  * enabled them.
1793  */
1794 static void *
1795 print_stats(__rte_unused void *arg)
1796 {
1797 	struct vhost_dev *vdev;
1798 	uint64_t tx_dropped, rx_dropped;
1799 	uint64_t tx, tx_total, rx, rx_total;
1800 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1801 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1802 
1803 	while(1) {
1804 		sleep(enable_stats);
1805 
1806 		/* Clear screen and move to top left */
1807 		printf("%s%s\n", clr, top_left);
1808 		printf("Device statistics =================================\n");
1809 
1810 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1811 			tx_total   = vdev->stats.tx_total;
1812 			tx         = vdev->stats.tx;
1813 			tx_dropped = tx_total - tx;
1814 
1815 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1816 				__ATOMIC_SEQ_CST);
1817 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1818 				__ATOMIC_SEQ_CST);
1819 			rx_dropped = rx_total - rx;
1820 
1821 			printf("Statistics for device %d\n"
1822 				"-----------------------\n"
1823 				"TX total:              %" PRIu64 "\n"
1824 				"TX dropped:            %" PRIu64 "\n"
1825 				"TX successful:         %" PRIu64 "\n"
1826 				"RX total:              %" PRIu64 "\n"
1827 				"RX dropped:            %" PRIu64 "\n"
1828 				"RX successful:         %" PRIu64 "\n",
1829 				vdev->vid,
1830 				tx_total, tx_dropped, tx,
1831 				rx_total, rx_dropped, rx);
1832 		}
1833 
1834 		printf("===================================================\n");
1835 
1836 		fflush(stdout);
1837 	}
1838 
1839 	return NULL;
1840 }
1841 
1842 static void
1843 unregister_drivers(int socket_num)
1844 {
1845 	int i, ret;
1846 
1847 	for (i = 0; i < socket_num; i++) {
1848 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1849 		if (ret != 0)
1850 			RTE_LOG(ERR, VHOST_CONFIG,
1851 				"Fail to unregister vhost driver for %s.\n",
1852 				socket_files + i * PATH_MAX);
1853 	}
1854 }
1855 
1856 /* When we receive an INT signal, unregister the vhost driver */
1857 static void
1858 sigint_handler(__rte_unused int signum)
1859 {
1860 	/* Unregister vhost driver. */
1861 	unregister_drivers(nb_sockets);
1862 
1863 	exit(0);
1864 }
1865 
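/*
 * Initialize all DMA bindings to "no DMA"; open_dma() later fills in the
 * entries for sockets given via --dmas.
 */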
1866 static void
1867 reset_dma(void)
1868 {
1869 	int i;
1870 
1871 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1872 		int j;
1873 
1874 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1875 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1876 			dma_bind[i].dmas[j].async_enabled = false;
1877 		}
1878 	}
1879 
1880 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1881 		dmas_id[i] = INVALID_DMA_ID;
1882 }
1883 
1884 /*
1885  * Main function, does initialisation and calls the per-lcore functions.
1886  */
1887 int
1888 main(int argc, char *argv[])
1889 {
1890 	unsigned lcore_id, core_id = 0;
1891 	unsigned nb_ports, valid_num_ports;
1892 	int ret, i;
1893 	uint16_t portid;
1894 	static pthread_t tid;
1895 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1896 
1897 	signal(SIGINT, sigint_handler);
1898 
1899 	/* init EAL */
1900 	ret = rte_eal_init(argc, argv);
1901 	if (ret < 0)
1902 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1903 	argc -= ret;
1904 	argv += ret;
1905 
1906 	/* initialize dma structures */
1907 	reset_dma();
1908 
1909 	/* parse app arguments */
1910 	ret = us_vhost_parse_args(argc, argv);
1911 	if (ret < 0)
1912 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1913 
1914 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1915 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1916 
1917 		if (rte_lcore_is_enabled(lcore_id))
1918 			lcore_ids[core_id++] = lcore_id;
1919 	}
1920 
1921 	if (rte_lcore_count() > RTE_MAX_LCORE)
1922 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1923 
1924 	/* Get the number of physical ports. */
1925 	nb_ports = rte_eth_dev_count_avail();
1926 
1927 	/*
1928 	 * Update the global variable num_ports and the global array ports[],
1929 	 * and get the number of valid ports according to the number of system ports.
1930 	 */
1931 	valid_num_ports = check_ports_num(nb_ports);
1932 
1933 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1934 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1935 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1936 		return -1;
1937 	}
1938 
1939 	/*
1940 	 * FIXME: here we are trying to allocate mbufs big enough for
1941 	 * @MAX_QUEUES, but the truth is we're never going to use that
1942 	 * many queues here. We probably should only do allocation for
1943 	 * those queues we are going to use.
1944 	 */
1945 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1946 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1947 					    rte_socket_id());
1948 	if (mbuf_pool == NULL)
1949 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1950 
1951 	if (vm2vm_mode == VM2VM_HARDWARE) {
1952 		/* Enable VT loop back to let L2 switch to do it. */
1953 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1954 		RTE_LOG(DEBUG, VHOST_CONFIG,
1955 			"Enable loop back for L2 switch in vmdq.\n");
1956 	}
1957 
1958 	/* initialize all ports */
1959 	RTE_ETH_FOREACH_DEV(portid) {
1960 		/* skip ports that are not enabled */
1961 		if ((enabled_port_mask & (1 << portid)) == 0) {
1962 			RTE_LOG(INFO, VHOST_PORT,
1963 				"Skipping disabled port %d\n", portid);
1964 			continue;
1965 		}
1966 		if (port_init(portid) != 0)
1967 			rte_exit(EXIT_FAILURE,
1968 				"Cannot initialize network ports\n");
1969 	}
1970 
1971 	/* Enable stats if the user option is set. */
1972 	if (enable_stats) {
1973 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1974 					print_stats, NULL);
1975 		if (ret < 0)
1976 			rte_exit(EXIT_FAILURE,
1977 				"Cannot create print-stats thread\n");
1978 	}
1979 
1980 	/* Launch all data cores. */
1981 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1982 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1983 
1984 	if (client_mode)
1985 		flags |= RTE_VHOST_USER_CLIENT;
1986 
1987 	for (i = 0; i < dma_count; i++) {
1988 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1989 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1990 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1991 		}
1992 	}
1993 
1994 	/* Register vhost user driver to handle vhost messages. */
1995 	for (i = 0; i < nb_sockets; i++) {
1996 		char *file = socket_files + i * PATH_MAX;
1997 
1998 		if (dma_count && get_async_flag_by_socketid(i) != 0)
1999 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
2000 
2001 		ret = rte_vhost_driver_register(file, flags);
2002 		if (ret != 0) {
2003 			unregister_drivers(i);
2004 			rte_exit(EXIT_FAILURE,
2005 				"vhost driver register failure.\n");
2006 		}
2007 
2008 		if (builtin_net_driver)
2009 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2010 
2011 		if (mergeable == 0) {
2012 			rte_vhost_driver_disable_features(file,
2013 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2014 		}
2015 
2016 		if (enable_tx_csum == 0) {
2017 			rte_vhost_driver_disable_features(file,
2018 				1ULL << VIRTIO_NET_F_CSUM);
2019 		}
2020 
2021 		if (enable_tso == 0) {
2022 			rte_vhost_driver_disable_features(file,
2023 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2024 			rte_vhost_driver_disable_features(file,
2025 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2026 			rte_vhost_driver_disable_features(file,
2027 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2028 			rte_vhost_driver_disable_features(file,
2029 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2030 		}
2031 
2032 		if (promiscuous) {
2033 			rte_vhost_driver_enable_features(file,
2034 				1ULL << VIRTIO_NET_F_CTRL_RX);
2035 		}
2036 
2037 		ret = rte_vhost_driver_callback_register(file,
2038 			&virtio_net_device_ops);
2039 		if (ret != 0) {
2040 			rte_exit(EXIT_FAILURE,
2041 				"failed to register vhost driver callbacks.\n");
2042 		}
2043 
2044 		if (rte_vhost_driver_start(file) < 0) {
2045 			rte_exit(EXIT_FAILURE,
2046 				"failed to start vhost driver.\n");
2047 		}
2048 	}
2049 
2050 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2051 		rte_eal_wait_lcore(lcore_id);
2052 
2053 	/* clean up the EAL */
2054 	rte_eal_cleanup();
2055 
2056 	return 0;
2057 }
2058