xref: /dpdk/examples/vhost/main.c (revision 30a1de105a5f40d77b344a891c4a68f79e815c43)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29 
30 #include "main.h"
31 
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35 
36 /* the maximum number of external ports supported */
37 #define MAX_SUP_PORTS 1
38 
39 #define MBUF_CACHE_SIZE	128
40 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
41 
42 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
43 
44 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
45 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
46 
47 #define JUMBO_FRAME_MAX_SIZE    0x2600
48 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
49 
50 /* State of virtio device. */
51 #define DEVICE_MAC_LEARNING 0
52 #define DEVICE_RX			1
53 #define DEVICE_SAFE_REMOVE	2
54 
55 /* Configurable number of RX/TX ring descriptors */
56 #define RTE_TEST_RX_DESC_DEFAULT 1024
57 #define RTE_TEST_TX_DESC_DEFAULT 512
58 
59 #define INVALID_PORT_ID 0xFF
60 #define INVALID_DMA_ID -1
61 
62 #define DMA_RING_SIZE 4096
63 
64 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
65 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
66 static int dma_count;
67 
68 /* mask of enabled ports */
69 static uint32_t enabled_port_mask = 0;
70 
71 /* Promiscuous mode */
72 static uint32_t promiscuous;
73 
74 /* number of devices/queues to support*/
75 static uint32_t num_queues = 0;
76 static uint32_t num_devices;
77 
78 static struct rte_mempool *mbuf_pool;
79 static int mergeable;
80 
81 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
82 typedef enum {
83 	VM2VM_DISABLED = 0,
84 	VM2VM_SOFTWARE = 1,
85 	VM2VM_HARDWARE = 2,
86 	VM2VM_LAST
87 } vm2vm_type;
88 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
89 
90 /* Enable stats. */
91 static uint32_t enable_stats = 0;
92 /* Enable retries on RX. */
93 static uint32_t enable_retry = 1;
94 
95 /* Disable TX checksum offload */
96 static uint32_t enable_tx_csum;
97 
98 /* Disable TSO offload */
99 static uint32_t enable_tso;
100 
101 static int client_mode;
102 
103 static int builtin_net_driver;
104 
105 /* Specify the timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109 
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113 
114 /* empty VMDq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116 	.rxmode = {
117 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
118 		.split_hdr_size = 0,
119 		/*
120 		 * VLAN strip is necessary for 1G NICs such as I350;
121 		 * it fixes a bug where IPv4 forwarding in the guest cannot
122 		 * forward packets from one virtio dev to another virtio dev.
123 		 */
124 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
125 	},
126 
127 	.txmode = {
128 		.mq_mode = RTE_ETH_MQ_TX_NONE,
129 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
130 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
131 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
132 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
133 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
134 	},
135 	.rx_adv_conf = {
136 		/*
137 		 * should be overridden separately in code with
138 		 * appropriate values
139 		 */
140 		.vmdq_rx_conf = {
141 			.nb_queue_pools = RTE_ETH_8_POOLS,
142 			.enable_default_pool = 0,
143 			.default_pool = 0,
144 			.nb_pool_maps = 0,
145 			.pool_map = {{0, 0},},
146 		},
147 	},
148 };
149 
150 
151 static unsigned lcore_ids[RTE_MAX_LCORE];
152 static uint16_t ports[RTE_MAX_ETHPORTS];
153 static unsigned num_ports = 0; /**< The number of ports specified in command line */
154 static uint16_t num_pf_queues, num_vmdq_queues;
155 static uint16_t vmdq_pool_base, vmdq_queue_base;
156 static uint16_t queues_per_pool;
157 
158 const uint16_t vlan_tags[] = {
159 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
160 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
161 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
162 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
163 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
164 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
165 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
166 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
167 };
168 
169 /* ethernet addresses of ports */
170 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
171 
172 static struct vhost_dev_tailq_list vhost_dev_list =
173 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
174 
175 static struct lcore_info lcore_info[RTE_MAX_LCORE];
176 
177 /* Used for queueing bursts of TX packets. */
178 struct mbuf_table {
179 	unsigned len;
180 	unsigned txq_id;
181 	struct rte_mbuf *m_table[MAX_PKT_BURST];
182 };
183 
184 struct vhost_bufftable {
185 	uint32_t len;
186 	uint64_t pre_tsc;
187 	struct rte_mbuf *m_table[MAX_PKT_BURST];
188 };
189 
190 /* TX queue for each data core. */
191 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
192 
193 /*
194  * Vhost TX buffer for each data core.
195  * Every data core maintains a TX buffer for every vhost device,
196  * which is used for batch pkts enqueue for higher performance.
197  */
198 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
199 
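/*
 * MBUF_TABLE_DRAIN_TSC converts BURST_TX_DRAIN_US into TSC cycles:
 * (TSC ticks per microsecond, rounded up) multiplied by the drain period.
 * Illustrative figure only: with a 2 GHz TSC this is roughly
 * 2000 * 100 = 200000 cycles between forced drains of a partially
 * filled Tx burst table.
 */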
200 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
201 				 / US_PER_S * BURST_TX_DRAIN_US)
202 
203 static inline bool
204 is_dma_configured(int16_t dev_id)
205 {
206 	int i;
207 
208 	for (i = 0; i < dma_count; i++)
209 		if (dmas_id[i] == dev_id)
210 			return true;
211 	return false;
212 }
213 
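/*
 * Parse the "--dmas" option. The expected format, as inferred from the
 * parsing below (an assumption, not normative), is a bracketed,
 * comma-separated list such as "[txd0@0000:00:04.0,txd1@0000:00:04.1]",
 * where each "txd<vid>@<dmadev name>" entry binds the Rx enqueue path of
 * vhost device <vid> to the named DMA device.
 */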
214 static inline int
215 open_dma(const char *value)
216 {
217 	struct dma_for_vhost *dma_info = dma_bind;
218 	char *input = strndup(value, strlen(value) + 1);
219 	char *addrs = input;
220 	char *ptrs[2];
221 	char *start, *end, *substr;
222 	int64_t vid;
223 
224 	struct rte_dma_info info;
225 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
226 	struct rte_dma_vchan_conf qconf = {
227 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
228 		.nb_desc = DMA_RING_SIZE
229 	};
230 
231 	int dev_id;
232 	int ret = 0;
233 	uint16_t i = 0;
234 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
235 	int args_nr;
236 
237 	while (isblank(*addrs))
238 		addrs++;
239 	if (*addrs == '\0') {
240 		ret = -1;
241 		goto out;
242 	}
243 
244 	/* process DMA devices within bracket. */
245 	addrs++;
246 	substr = strtok(addrs, ";]");
247 	if (!substr) {
248 		ret = -1;
249 		goto out;
250 	}
251 
252 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
253 	if (args_nr <= 0) {
254 		ret = -1;
255 		goto out;
256 	}
257 
258 	while (i < args_nr) {
259 		char *arg_temp = dma_arg[i];
260 		uint8_t sub_nr;
261 
262 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
263 		if (sub_nr != 2) {
264 			ret = -1;
265 			goto out;
266 		}
267 
268 		start = strstr(ptrs[0], "txd");
269 		if (start == NULL) {
270 			ret = -1;
271 			goto out;
272 		}
273 
274 		start += 3;
275 		vid = strtol(start, &end, 0);
276 		if (end == start) {
277 			ret = -1;
278 			goto out;
279 		}
280 
281 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
282 		if (dev_id < 0) {
283 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
284 			ret = -1;
285 			goto out;
286 		}
287 
288 		/* DMA device is already configured, so skip */
289 		if (is_dma_configured(dev_id))
290 			goto done;
291 
292 		if (rte_dma_info_get(dev_id, &info) != 0) {
293 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
294 			ret = -1;
295 			goto out;
296 		}
297 
298 		if (info.max_vchans < 1) {
299 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
300 			ret = -1;
301 			goto out;
302 		}
303 
304 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
305 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
306 			ret = -1;
307 			goto out;
308 		}
309 
310 		/* Check the max desc supported by DMA device */
311 		rte_dma_info_get(dev_id, &info);
312 		if (info.nb_vchans != 1) {
313 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
314 					dev_id);
315 			ret = -1;
316 			goto out;
317 		}
318 
319 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
320 
321 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
322 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
323 			ret = -1;
324 			goto out;
325 		}
326 
327 		if (rte_dma_start(dev_id) != 0) {
328 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
329 			ret = -1;
330 			goto out;
331 		}
332 
333 		dmas_id[dma_count++] = dev_id;
334 
335 done:
336 		(dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
337 		i++;
338 	}
339 out:
340 	free(input);
341 	return ret;
342 }
343 
344 /*
345  * Builds up the correct configuration for VMDQ VLAN pool map
346  * according to the pool & queue limits.
347  */
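/*
 * Illustrative mapping only: with num_devices == 8, pool 0 is mapped to
 * vlan_tags[0] (1000), pool 1 to vlan_tags[1] (1001), and so on, so each
 * VMDq pool receives exactly one VLAN from the table above.
 */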
348 static inline int
349 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
350 {
351 	struct rte_eth_vmdq_rx_conf conf;
352 	struct rte_eth_vmdq_rx_conf *def_conf =
353 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
354 	unsigned i;
355 
356 	memset(&conf, 0, sizeof(conf));
357 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
358 	conf.nb_pool_maps = num_devices;
359 	conf.enable_loop_back = def_conf->enable_loop_back;
360 	conf.rx_mode = def_conf->rx_mode;
361 
362 	for (i = 0; i < conf.nb_pool_maps; i++) {
363 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
364 		conf.pool_map[i].pools = (1UL << i);
365 	}
366 
367 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
368 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
369 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
370 	return 0;
371 }
372 
373 /*
374  * Initialises a given port using global settings and with the rx buffers
375  * coming from the mbuf_pool passed as parameter
376  */
377 static inline int
378 port_init(uint16_t port)
379 {
380 	struct rte_eth_dev_info dev_info;
381 	struct rte_eth_conf port_conf;
382 	struct rte_eth_rxconf *rxconf;
383 	struct rte_eth_txconf *txconf;
384 	int16_t rx_rings, tx_rings;
385 	uint16_t rx_ring_size, tx_ring_size;
386 	int retval;
387 	uint16_t q;
388 
389 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
390 	retval = rte_eth_dev_info_get(port, &dev_info);
391 	if (retval != 0) {
392 		RTE_LOG(ERR, VHOST_PORT,
393 			"Error during getting device (port %u) info: %s\n",
394 			port, strerror(-retval));
395 
396 		return retval;
397 	}
398 
399 	rxconf = &dev_info.default_rxconf;
400 	txconf = &dev_info.default_txconf;
401 	rxconf->rx_drop_en = 1;
402 
403 	/* Configure the number of supported virtio devices based on VMDQ limits. */
404 	num_devices = dev_info.max_vmdq_pools;
405 
406 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
408 
409 	tx_rings = (uint16_t)rte_lcore_count();
410 
411 	if (mergeable) {
412 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
413 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
414 		else
415 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
416 	}
417 
418 	/* Get port configuration. */
419 	retval = get_eth_conf(&port_conf, num_devices);
420 	if (retval < 0)
421 		return retval;
422 	/* NIC queues are divided into PF queues and VMDq queues. */
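	/*
	 * Illustrative numbers only: if the NIC reported 130 Rx queues, of
	 * which 128 were VMDq queues spread over 64 pools, then
	 * num_pf_queues = 2, queues_per_pool = 2 and, with num_devices == 64,
	 * num_vmdq_queues = 128, giving num_queues = 130.
	 */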
423 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
424 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
425 	num_vmdq_queues = num_devices * queues_per_pool;
426 	num_queues = num_pf_queues + num_vmdq_queues;
427 	vmdq_queue_base = dev_info.vmdq_queue_base;
428 	vmdq_pool_base  = dev_info.vmdq_pool_base;
429 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
430 		num_pf_queues, num_devices, queues_per_pool);
431 
432 	if (!rte_eth_dev_is_valid_port(port))
433 		return -1;
434 
435 	rx_rings = (uint16_t)dev_info.max_rx_queues;
436 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
437 		port_conf.txmode.offloads |=
438 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
439 	/* Configure ethernet device. */
440 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
441 	if (retval != 0) {
442 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
443 			port, strerror(-retval));
444 		return retval;
445 	}
446 
447 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
448 		&tx_ring_size);
449 	if (retval != 0) {
450 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
451 			"for port %u: %s.\n", port, strerror(-retval));
452 		return retval;
453 	}
454 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
455 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
456 			"for Rx queues on port %u.\n", port);
457 		return -1;
458 	}
459 
460 	/* Setup the queues. */
461 	rxconf->offloads = port_conf.rxmode.offloads;
462 	for (q = 0; q < rx_rings; q ++) {
463 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
464 						rte_eth_dev_socket_id(port),
465 						rxconf,
466 						mbuf_pool);
467 		if (retval < 0) {
468 			RTE_LOG(ERR, VHOST_PORT,
469 				"Failed to setup rx queue %u of port %u: %s.\n",
470 				q, port, strerror(-retval));
471 			return retval;
472 		}
473 	}
474 	txconf->offloads = port_conf.txmode.offloads;
475 	for (q = 0; q < tx_rings; q ++) {
476 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
477 						rte_eth_dev_socket_id(port),
478 						txconf);
479 		if (retval < 0) {
480 			RTE_LOG(ERR, VHOST_PORT,
481 				"Failed to setup tx queue %u of port %u: %s.\n",
482 				q, port, strerror(-retval));
483 			return retval;
484 		}
485 	}
486 
487 	/* Start the device. */
488 	retval  = rte_eth_dev_start(port);
489 	if (retval < 0) {
490 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
491 			port, strerror(-retval));
492 		return retval;
493 	}
494 
495 	if (promiscuous) {
496 		retval = rte_eth_promiscuous_enable(port);
497 		if (retval != 0) {
498 			RTE_LOG(ERR, VHOST_PORT,
499 				"Failed to enable promiscuous mode on port %u: %s\n",
500 				port, rte_strerror(-retval));
501 			return retval;
502 		}
503 	}
504 
505 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
506 	if (retval < 0) {
507 		RTE_LOG(ERR, VHOST_PORT,
508 			"Failed to get MAC address on port %u: %s\n",
509 			port, rte_strerror(-retval));
510 		return retval;
511 	}
512 
513 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
514 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
515 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
516 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
517 
518 	return 0;
519 }
520 
521 /*
522  * Set socket file path.
523  */
524 static int
525 us_vhost_parse_socket_path(const char *q_arg)
526 {
527 	char *old;
528 
529 	/* parse number string */
530 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
531 		return -1;
532 
533 	old = socket_files;
534 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
535 	if (socket_files == NULL) {
536 		free(old);
537 		return -1;
538 	}
539 
540 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
541 	nb_sockets++;
542 
543 	return 0;
544 }
545 
546 /*
547  * Parse the portmask provided at run time.
548  */
549 static int
550 parse_portmask(const char *portmask)
551 {
552 	char *end = NULL;
553 	unsigned long pm;
554 
555 	errno = 0;
556 
557 	/* parse hexadecimal string */
558 	pm = strtoul(portmask, &end, 16);
559 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
560 		return 0;
561 
562 	return pm;
563 
564 }
565 
566 /*
567  * Parse num options at run time.
568  */
569 static int
570 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
571 {
572 	char *end = NULL;
573 	unsigned long num;
574 
575 	errno = 0;
576 
577 	/* parse unsigned int string */
578 	num = strtoul(q_arg, &end, 10);
579 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
580 		return -1;
581 
582 	if (num > max_valid_value)
583 		return -1;
584 
585 	return num;
586 
587 }
588 
589 /*
590  * Display usage
591  */
592 static void
593 us_vhost_usage(const char *prgname)
594 {
595 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
596 	"		--vm2vm [0|1|2]\n"
597 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
598 	"		--socket-file <path>\n"
599 	"		--nb-devices ND\n"
600 	"		-p PORTMASK: Set mask for ports to be used by application\n"
601 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
602 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
603 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. This only takes effect if Rx retries are enabled\n"
604 	"		--rx-retry-num [0-N]: the number of retries on Rx. This only takes effect if Rx retries are enabled\n"
605 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
606 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
607 	"		--socket-file: The path of the socket file.\n"
608 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
609 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
610 	"		--client register a vhost-user socket as client mode.\n"
612 	"		--dmas register a DMA channel for a specific vhost device.\n",
613 	       prgname);
614 }
615 
616 enum {
617 #define OPT_VM2VM               "vm2vm"
618 	OPT_VM2VM_NUM = 256,
619 #define OPT_RX_RETRY            "rx-retry"
620 	OPT_RX_RETRY_NUM,
621 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
622 	OPT_RX_RETRY_DELAY_NUM,
623 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
624 	OPT_RX_RETRY_NUMB_NUM,
625 #define OPT_MERGEABLE           "mergeable"
626 	OPT_MERGEABLE_NUM,
627 #define OPT_STATS               "stats"
628 	OPT_STATS_NUM,
629 #define OPT_SOCKET_FILE         "socket-file"
630 	OPT_SOCKET_FILE_NUM,
631 #define OPT_TX_CSUM             "tx-csum"
632 	OPT_TX_CSUM_NUM,
633 #define OPT_TSO                 "tso"
634 	OPT_TSO_NUM,
635 #define OPT_CLIENT              "client"
636 	OPT_CLIENT_NUM,
637 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
638 	OPT_BUILTIN_NET_DRIVER_NUM,
639 #define OPT_DMAS                "dmas"
640 	OPT_DMAS_NUM,
641 };
642 
643 /*
644  * Parse the arguments given in the command line of the application.
645  */
646 static int
647 us_vhost_parse_args(int argc, char **argv)
648 {
649 	int opt, ret;
650 	int option_index;
651 	unsigned i;
652 	const char *prgname = argv[0];
653 	static struct option long_option[] = {
654 		{OPT_VM2VM, required_argument,
655 				NULL, OPT_VM2VM_NUM},
656 		{OPT_RX_RETRY, required_argument,
657 				NULL, OPT_RX_RETRY_NUM},
658 		{OPT_RX_RETRY_DELAY, required_argument,
659 				NULL, OPT_RX_RETRY_DELAY_NUM},
660 		{OPT_RX_RETRY_NUMB, required_argument,
661 				NULL, OPT_RX_RETRY_NUMB_NUM},
662 		{OPT_MERGEABLE, required_argument,
663 				NULL, OPT_MERGEABLE_NUM},
664 		{OPT_STATS, required_argument,
665 				NULL, OPT_STATS_NUM},
666 		{OPT_SOCKET_FILE, required_argument,
667 				NULL, OPT_SOCKET_FILE_NUM},
668 		{OPT_TX_CSUM, required_argument,
669 				NULL, OPT_TX_CSUM_NUM},
670 		{OPT_TSO, required_argument,
671 				NULL, OPT_TSO_NUM},
672 		{OPT_CLIENT, no_argument,
673 				NULL, OPT_CLIENT_NUM},
674 		{OPT_BUILTIN_NET_DRIVER, no_argument,
675 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
676 		{OPT_DMAS, required_argument,
677 				NULL, OPT_DMAS_NUM},
678 		{NULL, 0, 0, 0},
679 	};
680 
681 	/* Parse command line */
682 	while ((opt = getopt_long(argc, argv, "p:P",
683 			long_option, &option_index)) != EOF) {
684 		switch (opt) {
685 		/* Portmask */
686 		case 'p':
687 			enabled_port_mask = parse_portmask(optarg);
688 			if (enabled_port_mask == 0) {
689 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
690 				us_vhost_usage(prgname);
691 				return -1;
692 			}
693 			break;
694 
695 		case 'P':
696 			promiscuous = 1;
697 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
698 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
699 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
700 			break;
701 
702 		case OPT_VM2VM_NUM:
703 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
704 			if (ret == -1) {
705 				RTE_LOG(INFO, VHOST_CONFIG,
706 					"Invalid argument for "
707 					"vm2vm [0|1|2]\n");
708 				us_vhost_usage(prgname);
709 				return -1;
710 			}
711 			vm2vm_mode = (vm2vm_type)ret;
712 			break;
713 
714 		case OPT_RX_RETRY_NUM:
715 			ret = parse_num_opt(optarg, 1);
716 			if (ret == -1) {
717 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
718 				us_vhost_usage(prgname);
719 				return -1;
720 			}
721 			enable_retry = ret;
722 			break;
723 
724 		case OPT_TX_CSUM_NUM:
725 			ret = parse_num_opt(optarg, 1);
726 			if (ret == -1) {
727 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
728 				us_vhost_usage(prgname);
729 				return -1;
730 			}
731 			enable_tx_csum = ret;
732 			break;
733 
734 		case OPT_TSO_NUM:
735 			ret = parse_num_opt(optarg, 1);
736 			if (ret == -1) {
737 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
738 				us_vhost_usage(prgname);
739 				return -1;
740 			}
741 			enable_tso = ret;
742 			break;
743 
744 		case OPT_RX_RETRY_DELAY_NUM:
745 			ret = parse_num_opt(optarg, INT32_MAX);
746 			if (ret == -1) {
747 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
748 				us_vhost_usage(prgname);
749 				return -1;
750 			}
751 			burst_rx_delay_time = ret;
752 			break;
753 
754 		case OPT_RX_RETRY_NUMB_NUM:
755 			ret = parse_num_opt(optarg, INT32_MAX);
756 			if (ret == -1) {
757 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
758 				us_vhost_usage(prgname);
759 				return -1;
760 			}
761 			burst_rx_retry_num = ret;
762 			break;
763 
764 		case OPT_MERGEABLE_NUM:
765 			ret = parse_num_opt(optarg, 1);
766 			if (ret == -1) {
767 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
768 				us_vhost_usage(prgname);
769 				return -1;
770 			}
771 			mergeable = !!ret;
772 			break;
773 
774 		case OPT_STATS_NUM:
775 			ret = parse_num_opt(optarg, INT32_MAX);
776 			if (ret == -1) {
777 				RTE_LOG(INFO, VHOST_CONFIG,
778 					"Invalid argument for stats [0..N]\n");
779 				us_vhost_usage(prgname);
780 				return -1;
781 			}
782 			enable_stats = ret;
783 			break;
784 
785 		/* Set socket file path. */
786 		case OPT_SOCKET_FILE_NUM:
787 			if (us_vhost_parse_socket_path(optarg) == -1) {
788 				RTE_LOG(INFO, VHOST_CONFIG,
789 				"Invalid argument for socket name (Max %d characters)\n",
790 				PATH_MAX);
791 				us_vhost_usage(prgname);
792 				return -1;
793 			}
794 			break;
795 
796 		case OPT_DMAS_NUM:
797 			if (open_dma(optarg) == -1) {
798 				RTE_LOG(INFO, VHOST_CONFIG,
799 					"Wrong DMA args\n");
800 				us_vhost_usage(prgname);
801 				return -1;
802 			}
803 			break;
804 
805 		case OPT_CLIENT_NUM:
806 			client_mode = 1;
807 			break;
808 
809 		case OPT_BUILTIN_NET_DRIVER_NUM:
810 			builtin_net_driver = 1;
811 			break;
812 
813 		/* Invalid option - print options. */
814 		default:
815 			us_vhost_usage(prgname);
816 			return -1;
817 		}
818 	}
819 
820 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
821 		if (enabled_port_mask & (1 << i))
822 			ports[num_ports++] = i;
823 	}
824 
825 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
826 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
827 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
828 		return -1;
829 	}
830 
831 	return 0;
832 }
833 
834 /*
835  * Update the global variable num_ports and the array ports according to the
836  * number of ports in the system, and return the number of valid ports.
837  */
838 static unsigned check_ports_num(unsigned nb_ports)
839 {
840 	unsigned valid_num_ports = num_ports;
841 	unsigned portid;
842 
843 	if (num_ports > nb_ports) {
844 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
845 			num_ports, nb_ports);
846 		num_ports = nb_ports;
847 	}
848 
849 	for (portid = 0; portid < num_ports; portid ++) {
850 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
851 			RTE_LOG(INFO, VHOST_PORT,
852 				"\nSpecified port ID(%u) is not valid\n",
853 				ports[portid]);
854 			ports[portid] = INVALID_PORT_ID;
855 			valid_num_ports--;
856 		}
857 	}
858 	return valid_num_ports;
859 }
860 
861 static __rte_always_inline struct vhost_dev *
862 find_vhost_dev(struct rte_ether_addr *mac)
863 {
864 	struct vhost_dev *vdev;
865 
866 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
867 		if (vdev->ready == DEVICE_RX &&
868 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
869 			return vdev;
870 	}
871 
872 	return NULL;
873 }
874 
875 /*
876  * This function learns the MAC address of the device and registers this along with a
877  * vlan tag to a VMDQ.
878  */
879 static int
880 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
881 {
882 	struct rte_ether_hdr *pkt_hdr;
883 	int i, ret;
884 
885 	/* Learn MAC address of guest device from packet */
886 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
887 
888 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
889 		RTE_LOG(ERR, VHOST_DATA,
890 			"(%d) device is using a registered MAC!\n",
891 			vdev->vid);
892 		return -1;
893 	}
894 
895 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
896 		vdev->mac_address.addr_bytes[i] =
897 			pkt_hdr->src_addr.addr_bytes[i];
898 
899 	/* vlan_tag currently uses the device_id. */
900 	vdev->vlan_tag = vlan_tags[vdev->vid];
901 
902 	/* Print out VMDQ registration info. */
903 	RTE_LOG(INFO, VHOST_DATA,
904 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
905 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
906 		vdev->vlan_tag);
907 
908 	/* Register the MAC address. */
909 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
910 				(uint32_t)vdev->vid + vmdq_pool_base);
911 	if (ret)
912 		RTE_LOG(ERR, VHOST_DATA,
913 			"(%d) failed to add device MAC address to VMDQ\n",
914 			vdev->vid);
915 
916 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
917 
918 	/* Set device as ready for RX. */
919 	vdev->ready = DEVICE_RX;
920 
921 	return 0;
922 }
923 
924 /*
925  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
926  * queue before disabling RX on the device.
927  */
928 static inline void
929 unlink_vmdq(struct vhost_dev *vdev)
930 {
931 	unsigned i = 0;
932 	unsigned rx_count;
933 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
934 
935 	if (vdev->ready == DEVICE_RX) {
936 		/*clear MAC and VLAN settings*/
937 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
938 		for (i = 0; i < 6; i++)
939 			vdev->mac_address.addr_bytes[i] = 0;
940 
941 		vdev->vlan_tag = 0;
942 
943 		/*Clear out the receive buffers*/
944 		rx_count = rte_eth_rx_burst(ports[0],
945 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
946 
947 		while (rx_count) {
948 			for (i = 0; i < rx_count; i++)
949 				rte_pktmbuf_free(pkts_burst[i]);
950 
951 			rx_count = rte_eth_rx_burst(ports[0],
952 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
953 		}
954 
955 		vdev->ready = DEVICE_MAC_LEARNING;
956 	}
957 }
958 
959 static inline void
960 free_pkts(struct rte_mbuf **pkts, uint16_t n)
961 {
962 	while (n--)
963 		rte_pktmbuf_free(pkts[n]);
964 }
965 
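/*
 * Poll for async (DMA) enqueue operations that have completed on this
 * device's Rx queue, free the source mbufs and decrease the in-flight
 * packet counter accordingly.
 */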
966 static __rte_always_inline void
967 complete_async_pkts(struct vhost_dev *vdev)
968 {
969 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
970 	uint16_t complete_count;
971 	int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
972 
973 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
974 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
975 	if (complete_count) {
976 		free_pkts(p_cpl, complete_count);
977 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
978 	}
979 
980 }
981 
982 static __rte_always_inline void
983 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
984 	    struct rte_mbuf *m)
985 {
986 	uint16_t ret;
987 
988 	if (builtin_net_driver) {
989 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
990 	} else {
991 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
992 	}
993 
994 	if (enable_stats) {
995 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
996 				__ATOMIC_SEQ_CST);
997 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
998 				__ATOMIC_SEQ_CST);
999 		src_vdev->stats.tx_total++;
1000 		src_vdev->stats.tx += ret;
1001 	}
1002 }
1003 
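/*
 * Flush this lcore's Tx buffer for @vdev into the device's virtio Rx ring,
 * using the builtin net driver, the async (DMA-assisted) enqueue path or
 * the plain sync enqueue path, depending on the configuration. In the sync
 * paths the mbufs are freed here; in the async path they are freed once
 * the DMA copies complete.
 */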
1004 static __rte_always_inline void
1005 drain_vhost(struct vhost_dev *vdev)
1006 {
1007 	uint16_t ret;
1008 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1009 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1010 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1011 
1012 	if (builtin_net_driver) {
1013 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
1014 	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1015 		uint16_t enqueue_fail = 0;
1016 		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1017 
1018 		complete_async_pkts(vdev);
1019 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
1020 		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
1021 
1022 		enqueue_fail = nr_xmit - ret;
1023 		if (enqueue_fail)
1024 			free_pkts(&m[ret], nr_xmit - ret);
1025 	} else {
1026 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1027 						m, nr_xmit);
1028 	}
1029 
1030 	if (enable_stats) {
1031 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1032 				__ATOMIC_SEQ_CST);
1033 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1034 				__ATOMIC_SEQ_CST);
1035 	}
1036 
1037 	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1038 		free_pkts(m, nr_xmit);
1039 }
1040 
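/*
 * Periodically flush the per-device Tx buffers owned by this lcore: any
 * buffer that has not been drained for more than MBUF_TABLE_DRAIN_TSC
 * cycles is flushed to its vhost device.
 */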
1041 static __rte_always_inline void
1042 drain_vhost_table(void)
1043 {
1044 	uint16_t lcore_id = rte_lcore_id();
1045 	struct vhost_bufftable *vhost_txq;
1046 	struct vhost_dev *vdev;
1047 	uint64_t cur_tsc;
1048 
1049 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1050 		if (unlikely(vdev->remove == 1))
1051 			continue;
1052 
1053 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1054 
1055 		cur_tsc = rte_rdtsc();
1056 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1057 				> MBUF_TABLE_DRAIN_TSC)) {
1058 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1059 				"Vhost TX queue drained after timeout with burst size %u\n",
1060 				vhost_txq->len);
1061 			drain_vhost(vdev);
1062 			vhost_txq->len = 0;
1063 			vhost_txq->pre_tsc = cur_tsc;
1064 		}
1065 	}
1066 }
1067 
1068 /*
1069  * Check if the packet's destination MAC address is for a local device. If so,
1070  * put the packet on that device's RX queue. If not, return.
1071  */
1072 static __rte_always_inline int
1073 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1074 {
1075 	struct rte_ether_hdr *pkt_hdr;
1076 	struct vhost_dev *dst_vdev;
1077 	struct vhost_bufftable *vhost_txq;
1078 	uint16_t lcore_id = rte_lcore_id();
1079 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1080 
1081 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1082 	if (!dst_vdev)
1083 		return -1;
1084 
1085 	if (vdev->vid == dst_vdev->vid) {
1086 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1087 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1088 			vdev->vid);
1089 		return 0;
1090 	}
1091 
1092 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1093 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1094 
1095 	if (unlikely(dst_vdev->remove)) {
1096 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1097 			"(%d) device is marked for removal\n", dst_vdev->vid);
1098 		return 0;
1099 	}
1100 
1101 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1102 	vhost_txq->m_table[vhost_txq->len++] = m;
1103 
1104 	if (enable_stats) {
1105 		vdev->stats.tx_total++;
1106 		vdev->stats.tx++;
1107 	}
1108 
1109 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1110 		drain_vhost(dst_vdev);
1111 		vhost_txq->len = 0;
1112 		vhost_txq->pre_tsc = rte_rdtsc();
1113 	}
1114 	return 0;
1115 }
1116 
1117 /*
1118  * Check if the destination MAC of a packet belongs to a local VM;
1119  * if it does, get its VLAN tag and length offset.
1120  */
1121 static __rte_always_inline int
1122 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1123 	uint32_t *offset, uint16_t *vlan_tag)
1124 {
1125 	struct vhost_dev *dst_vdev;
1126 	struct rte_ether_hdr *pkt_hdr =
1127 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1128 
1129 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1130 	if (!dst_vdev)
1131 		return 0;
1132 
1133 	if (vdev->vid == dst_vdev->vid) {
1134 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1135 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1136 			vdev->vid);
1137 		return -1;
1138 	}
1139 
1140 	/*
1141 	 * HW VLAN strip reduces the packet length by the
1142 	 * length of the VLAN tag, so restore the packet
1143 	 * length by adding it back.
1144 	 */
1145 	*offset  = RTE_VLAN_HLEN;
1146 	*vlan_tag = vlan_tags[vdev->vid];
1147 
1148 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1149 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1150 		vdev->vid, dst_vdev->vid, *vlan_tag);
1151 
1152 	return 0;
1153 }
1154 
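/*
 * Prepare a TCP packet for transmission with TSO: fill in the L2/L3/L4
 * header lengths, request IPv4 checksum offload when applicable, and seed
 * the TCP checksum with the pseudo-header checksum as the hardware expects.
 */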
1155 static void virtio_tx_offload(struct rte_mbuf *m)
1156 {
1157 	struct rte_net_hdr_lens hdr_lens;
1158 	struct rte_ipv4_hdr *ipv4_hdr;
1159 	struct rte_tcp_hdr *tcp_hdr;
1160 	uint32_t ptype;
1161 	void *l3_hdr;
1162 
1163 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1164 	m->l2_len = hdr_lens.l2_len;
1165 	m->l3_len = hdr_lens.l3_len;
1166 	m->l4_len = hdr_lens.l4_len;
1167 
1168 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1169 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1170 		m->l2_len + m->l3_len);
1171 
1172 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1173 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1174 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1175 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1176 		ipv4_hdr = l3_hdr;
1177 		ipv4_hdr->hdr_checksum = 0;
1178 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1179 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1180 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1181 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1182 	}
1183 }
1184 
1185 static __rte_always_inline void
1186 do_drain_mbuf_table(struct mbuf_table *tx_q)
1187 {
1188 	uint16_t count;
1189 
1190 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1191 				 tx_q->m_table, tx_q->len);
1192 	if (unlikely(count < tx_q->len))
1193 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1194 
1195 	tx_q->len = 0;
1196 }
1197 
1198 /*
1199  * This function routes the TX packet to the correct interface. This
1200  * may be a local device or the physical port.
1201  */
1202 static __rte_always_inline void
1203 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1204 {
1205 	struct mbuf_table *tx_q;
1206 	unsigned offset = 0;
1207 	const uint16_t lcore_id = rte_lcore_id();
1208 	struct rte_ether_hdr *nh;
1209 
1210 
1211 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1212 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1213 		struct vhost_dev *vdev2;
1214 
1215 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1216 			if (vdev2 != vdev)
1217 				sync_virtio_xmit(vdev2, vdev, m);
1218 		}
1219 		goto queue2nic;
1220 	}
1221 
1222 	/*check if destination is local VM*/
1223 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1224 		return;
1225 
1226 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1227 		if (unlikely(find_local_dest(vdev, m, &offset,
1228 					     &vlan_tag) != 0)) {
1229 			rte_pktmbuf_free(m);
1230 			return;
1231 		}
1232 	}
1233 
1234 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1235 		"(%d) TX: MAC address is external\n", vdev->vid);
1236 
1237 queue2nic:
1238 
1239 	/*Add packet to the port tx queue*/
1240 	tx_q = &lcore_tx_queue[lcore_id];
1241 
1242 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1243 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1244 		/* Guest has inserted the vlan tag. */
1245 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1246 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1247 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1248 			(vh->vlan_tci != vlan_tag_be))
1249 			vh->vlan_tci = vlan_tag_be;
1250 	} else {
1251 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1252 
1253 		/*
1254 		 * Find the right segment to adjust the data length when the
1255 		 * offset is bigger than the tailroom size.
1256 		 */
1257 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1258 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1259 				m->data_len += offset;
1260 			else {
1261 				struct rte_mbuf *seg = m;
1262 
1263 				while ((seg->next != NULL) &&
1264 					(offset > rte_pktmbuf_tailroom(seg)))
1265 					seg = seg->next;
1266 
1267 				seg->data_len += offset;
1268 			}
1269 			m->pkt_len += offset;
1270 		}
1271 
1272 		m->vlan_tci = vlan_tag;
1273 	}
1274 
1275 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1276 		virtio_tx_offload(m);
1277 
1278 	tx_q->m_table[tx_q->len++] = m;
1279 	if (enable_stats) {
1280 		vdev->stats.tx_total++;
1281 		vdev->stats.tx++;
1282 	}
1283 
1284 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1285 		do_drain_mbuf_table(tx_q);
1286 }
1287 
1288 
1289 static __rte_always_inline void
1290 drain_mbuf_table(struct mbuf_table *tx_q)
1291 {
1292 	static uint64_t prev_tsc;
1293 	uint64_t cur_tsc;
1294 
1295 	if (tx_q->len == 0)
1296 		return;
1297 
1298 	cur_tsc = rte_rdtsc();
1299 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1300 		prev_tsc = cur_tsc;
1301 
1302 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1303 			"TX queue drained after timeout with burst size %u\n",
1304 			tx_q->len);
1305 		do_drain_mbuf_table(tx_q);
1306 	}
1307 }
1308 
1309 static __rte_always_inline void
1310 drain_eth_rx(struct vhost_dev *vdev)
1311 {
1312 	uint16_t rx_count, enqueue_count;
1313 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1314 
1315 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1316 				    pkts, MAX_PKT_BURST);
1317 
1318 	if (!rx_count)
1319 		return;
1320 
1321 	/*
1322 	 * When "enable_retry" is set, wait and retry when there are not
1323 	 * enough free slots in the queue to hold @rx_count packets,
1324 	 * in order to reduce packet loss.
1325 	 */
1326 	if (enable_retry &&
1327 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1328 			VIRTIO_RXQ))) {
1329 		uint32_t retry;
1330 
1331 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1332 			rte_delay_us(burst_rx_delay_time);
1333 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1334 					VIRTIO_RXQ))
1335 				break;
1336 		}
1337 	}
1338 
1339 	if (builtin_net_driver) {
1340 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1341 						pkts, rx_count);
1342 	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1343 		uint16_t enqueue_fail = 0;
1344 		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1345 
1346 		complete_async_pkts(vdev);
1347 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1348 					VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
1349 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1350 
1351 		enqueue_fail = rx_count - enqueue_count;
1352 		if (enqueue_fail)
1353 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1354 
1355 	} else {
1356 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1357 						pkts, rx_count);
1358 	}
1359 
1360 	if (enable_stats) {
1361 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1362 				__ATOMIC_SEQ_CST);
1363 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1364 				__ATOMIC_SEQ_CST);
1365 	}
1366 
1367 	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1368 		free_pkts(pkts, rx_count);
1369 }
1370 
1371 static __rte_always_inline void
1372 drain_virtio_tx(struct vhost_dev *vdev)
1373 {
1374 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1375 	uint16_t count;
1376 	uint16_t i;
1377 
1378 	if (builtin_net_driver) {
1379 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1380 					pkts, MAX_PKT_BURST);
1381 	} else {
1382 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1383 					mbuf_pool, pkts, MAX_PKT_BURST);
1384 	}
1385 
1386 	/* setup VMDq for the first packet */
1387 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1388 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1389 			free_pkts(pkts, count);
1390 	}
1391 
1392 	for (i = 0; i < count; ++i)
1393 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1394 }
1395 
1396 /*
1397  * Main function of vhost-switch. It basically does:
1398  *
1399  * for each vhost device {
1400  *    - drain_eth_rx()
1401  *
1402  *      Which drains the host eth Rx queue linked to the vhost device
1403  *      and delivers all received packets to the guest virtio Rx ring
1404  *      associated with this vhost device.
1405  *
1406  *    - drain_virtio_tx()
1407  *
1408  *      Which drains the guest virtio Tx queue and delivers every packet
1409  *      to its target, which could be another vhost device or the
1410  *      physical eth dev. The routing is done in function "virtio_tx_route".
1411  * }
1412  */
1413 static int
1414 switch_worker(void *arg __rte_unused)
1415 {
1416 	unsigned i;
1417 	unsigned lcore_id = rte_lcore_id();
1418 	struct vhost_dev *vdev;
1419 	struct mbuf_table *tx_q;
1420 
1421 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1422 
1423 	tx_q = &lcore_tx_queue[lcore_id];
1424 	for (i = 0; i < rte_lcore_count(); i++) {
1425 		if (lcore_ids[i] == lcore_id) {
1426 			tx_q->txq_id = i;
1427 			break;
1428 		}
1429 	}
1430 
1431 	while(1) {
1432 		drain_mbuf_table(tx_q);
1433 		drain_vhost_table();
1434 		/*
1435 		 * Inform the configuration core that we have exited the
1436 		 * linked list and that no devices are in use if requested.
1437 		 */
1438 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1439 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1440 
1441 		/*
1442 		 * Process vhost devices
1443 		 */
1444 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1445 			      lcore_vdev_entry) {
1446 			if (unlikely(vdev->remove)) {
1447 				unlink_vmdq(vdev);
1448 				vdev->ready = DEVICE_SAFE_REMOVE;
1449 				continue;
1450 			}
1451 
1452 			if (likely(vdev->ready == DEVICE_RX))
1453 				drain_eth_rx(vdev);
1454 
1455 			if (likely(!vdev->remove))
1456 				drain_virtio_tx(vdev);
1457 		}
1458 	}
1459 
1460 	return 0;
1461 }
1462 
1463 /*
1464  * Remove a device from the specific data core linked list and from the
1465  * main linked list. Synchronization occurs through the use of the
1466  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1467  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1468  */
1469 static void
1470 destroy_device(int vid)
1471 {
1472 	struct vhost_dev *vdev = NULL;
1473 	int lcore;
1474 	uint16_t i;
1475 
1476 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1477 		if (vdev->vid == vid)
1478 			break;
1479 	}
1480 	if (!vdev)
1481 		return;
1482 	/*set the remove flag. */
1483 	vdev->remove = 1;
1484 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1485 		rte_pause();
1486 	}
1487 
1488 	for (i = 0; i < RTE_MAX_LCORE; i++)
1489 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1490 
1491 	if (builtin_net_driver)
1492 		vs_vhost_net_remove(vdev);
1493 
1494 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1495 		     lcore_vdev_entry);
1496 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1497 
1498 
1499 	/* Set the dev_removal_flag on each lcore. */
1500 	RTE_LCORE_FOREACH_WORKER(lcore)
1501 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1502 
1503 	/*
1504 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1505 	 * we can be sure that they can no longer access the device removed
1506 	 * from the linked lists and that the devices are no longer in use.
1507 	 */
1508 	RTE_LCORE_FOREACH_WORKER(lcore) {
1509 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1510 			rte_pause();
1511 	}
1512 
1513 	lcore_info[vdev->coreid].device_num--;
1514 
1515 	RTE_LOG(INFO, VHOST_DATA,
1516 		"(%d) device has been removed from data core\n",
1517 		vdev->vid);
1518 
1519 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1520 		uint16_t n_pkt = 0;
1521 		int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1522 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1523 
1524 		while (vdev->pkts_inflight) {
1525 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1526 						m_cpl, vdev->pkts_inflight, dma_id, 0);
1527 			free_pkts(m_cpl, n_pkt);
1528 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1529 		}
1530 
1531 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1532 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1533 	}
1534 
1535 	rte_free(vdev);
1536 }
1537 
1538 /*
1539  * A new device is added to a data core. First the device is added to the main linked list
1540  * and then allocated to a specific data core.
1541  */
1542 static int
1543 new_device(int vid)
1544 {
1545 	int lcore, core_add = 0;
1546 	uint16_t i;
1547 	uint32_t device_num_min = num_devices;
1548 	struct vhost_dev *vdev;
1549 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1550 	if (vdev == NULL) {
1551 		RTE_LOG(INFO, VHOST_DATA,
1552 			"(%d) couldn't allocate memory for vhost dev\n",
1553 			vid);
1554 		return -1;
1555 	}
1556 	vdev->vid = vid;
1557 
1558 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1559 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1560 			= rte_zmalloc("vhost bufftable",
1561 				sizeof(struct vhost_bufftable),
1562 				RTE_CACHE_LINE_SIZE);
1563 
1564 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1565 			RTE_LOG(INFO, VHOST_DATA,
1566 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1567 			return -1;
1568 		}
1569 	}
1570 
1571 	if (builtin_net_driver)
1572 		vs_vhost_net_setup(vdev);
1573 
1574 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1575 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1576 
1577 	/*reset ready flag*/
1578 	vdev->ready = DEVICE_MAC_LEARNING;
1579 	vdev->remove = 0;
1580 
1581 	/* Find a suitable lcore to add the device. */
1582 	RTE_LCORE_FOREACH_WORKER(lcore) {
1583 		if (lcore_info[lcore].device_num < device_num_min) {
1584 			device_num_min = lcore_info[lcore].device_num;
1585 			core_add = lcore;
1586 		}
1587 	}
1588 	vdev->coreid = core_add;
1589 
1590 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1591 			  lcore_vdev_entry);
1592 	lcore_info[vdev->coreid].device_num++;
1593 
1594 	/* Disable notifications. */
1595 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1596 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1597 
1598 	RTE_LOG(INFO, VHOST_DATA,
1599 		"(%d) device has been added to data core %d\n",
1600 		vid, vdev->coreid);
1601 
1602 	if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1603 		int ret;
1604 
1605 		ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1606 		if (ret == 0)
1607 			dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
1608 		return ret;
1609 	}
1610 
1611 	return 0;
1612 }
1613 
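/*
 * Vring state change callback. When the Rx queue of a device using the
 * async (DMA) enqueue path is disabled, wait for all in-flight copies to
 * complete and free the corresponding mbufs before returning.
 */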
1614 static int
1615 vring_state_changed(int vid, uint16_t queue_id, int enable)
1616 {
1617 	struct vhost_dev *vdev = NULL;
1618 
1619 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1620 		if (vdev->vid == vid)
1621 			break;
1622 	}
1623 	if (!vdev)
1624 		return -1;
1625 
1626 	if (queue_id != VIRTIO_RXQ)
1627 		return 0;
1628 
1629 	if (dma_bind[vid].dmas[queue_id].async_enabled) {
1630 		if (!enable) {
1631 			uint16_t n_pkt = 0;
1632 			int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1633 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1634 
1635 			while (vdev->pkts_inflight) {
1636 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1637 							m_cpl, vdev->pkts_inflight, dma_id, 0);
1638 				free_pkts(m_cpl, n_pkt);
1639 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1640 			}
1641 		}
1642 	}
1643 
1644 	return 0;
1645 }
1646 
1647 /*
1648  * These callbacks allow devices to be added to the data core when
1649  * configuration has fully completed.
1650  */
1651 static const struct rte_vhost_device_ops virtio_net_device_ops =
1652 {
1653 	.new_device =  new_device,
1654 	.destroy_device = destroy_device,
1655 	.vring_state_changed = vring_state_changed,
1656 };
1657 
1658 /*
1659  * This thread wakes up periodically to print the stats if the user has
1660  * enabled them.
1661  */
1662 static void *
1663 print_stats(__rte_unused void *arg)
1664 {
1665 	struct vhost_dev *vdev;
1666 	uint64_t tx_dropped, rx_dropped;
1667 	uint64_t tx, tx_total, rx, rx_total;
1668 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1669 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1670 
1671 	while(1) {
1672 		sleep(enable_stats);
1673 
1674 		/* Clear screen and move to top left */
1675 		printf("%s%s\n", clr, top_left);
1676 		printf("Device statistics =================================\n");
1677 
1678 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1679 			tx_total   = vdev->stats.tx_total;
1680 			tx         = vdev->stats.tx;
1681 			tx_dropped = tx_total - tx;
1682 
1683 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1684 				__ATOMIC_SEQ_CST);
1685 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1686 				__ATOMIC_SEQ_CST);
1687 			rx_dropped = rx_total - rx;
1688 
1689 			printf("Statistics for device %d\n"
1690 				"-----------------------\n"
1691 				"TX total:              %" PRIu64 "\n"
1692 				"TX dropped:            %" PRIu64 "\n"
1693 				"TX successful:         %" PRIu64 "\n"
1694 				"RX total:              %" PRIu64 "\n"
1695 				"RX dropped:            %" PRIu64 "\n"
1696 				"RX successful:         %" PRIu64 "\n",
1697 				vdev->vid,
1698 				tx_total, tx_dropped, tx,
1699 				rx_total, rx_dropped, rx);
1700 		}
1701 
1702 		printf("===================================================\n");
1703 
1704 		fflush(stdout);
1705 	}
1706 
1707 	return NULL;
1708 }
1709 
1710 static void
1711 unregister_drivers(int socket_num)
1712 {
1713 	int i, ret;
1714 
1715 	for (i = 0; i < socket_num; i++) {
1716 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1717 		if (ret != 0)
1718 			RTE_LOG(ERR, VHOST_CONFIG,
1719 				"Fail to unregister vhost driver for %s.\n",
1720 				socket_files + i * PATH_MAX);
1721 	}
1722 }
1723 
1724 /* When we receive an INT signal, unregister the vhost driver. */
1725 static void
1726 sigint_handler(__rte_unused int signum)
1727 {
1728 	/* Unregister vhost driver. */
1729 	unregister_drivers(nb_sockets);
1730 
1731 	exit(0);
1732 }
1733 
1734 /*
1735  * While creating an mbuf pool, one key thing is to figure out how
1736  * many mbuf entries are enough for our use. FYI, here are some
1737  * guidelines:
1738  *
1739  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1740  *
1741  * - For each switch core (a CPU core that does the packet switching), we
1742  *   also need to reserve some mbufs for receiving the packets from the
1743  *   virtio Tx queue. How many is enough depends on the usage. It's
1744  *   normally a simple calculation like the following:
1745  *
1746  *       MAX_PKT_BURST * max packet size / mbuf size
1747  *
1748  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1749  *
1750  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1751  *   mbufs for receiving the packets from the physical NIC device.
1752  *
1753  * - We also need to make sure that, for each switch core, we have
1754  *   allocated enough mbufs to fill up the mbuf cache.
1755  */
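/*
 * Illustrative sizing only (values assumed, not taken from this file):
 * with mbuf_size = MBUF_DATA_SIZE (2048 bytes of data room plus 128 bytes
 * of headroom), MAX_PKT_BURST = 32 and mergeable buffers (mtu = 9000), the
 * per-core term above is (9000 + 2176) * 32 / 2048, i.e. roughly 175 mbufs,
 * plus nr_rx_desc (1024 by default) per switch core.
 */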
1756 static void
1757 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1758 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1759 {
1760 	uint32_t nr_mbufs;
1761 	uint32_t nr_mbufs_per_core;
1762 	uint32_t mtu = 1500;
1763 
1764 	if (mergeable)
1765 		mtu = 9000;
1766 	if (enable_tso)
1767 		mtu = 64 * 1024;
1768 
1769 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1770 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1771 	nr_mbufs_per_core += nr_rx_desc;
1772 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1773 
1774 	nr_mbufs  = nr_queues * nr_rx_desc;
1775 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1776 	nr_mbufs *= nr_port;
1777 
1778 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1779 					    nr_mbuf_cache, 0, mbuf_size,
1780 					    rte_socket_id());
1781 	if (mbuf_pool == NULL)
1782 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1783 }
1784 
1785 static void
1786 reset_dma(void)
1787 {
1788 	int i;
1789 
1790 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1791 		int j;
1792 
1793 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1794 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1795 			dma_bind[i].dmas[j].async_enabled = false;
1796 		}
1797 	}
1798 
1799 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1800 		dmas_id[i] = INVALID_DMA_ID;
1801 }
1802 
1803 /*
1804  * Main function, does initialisation and calls the per-lcore functions.
1805  */
1806 int
1807 main(int argc, char *argv[])
1808 {
1809 	unsigned lcore_id, core_id = 0;
1810 	unsigned nb_ports, valid_num_ports;
1811 	int ret, i;
1812 	uint16_t portid;
1813 	static pthread_t tid;
1814 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1815 
1816 	signal(SIGINT, sigint_handler);
1817 
1818 	/* init EAL */
1819 	ret = rte_eal_init(argc, argv);
1820 	if (ret < 0)
1821 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1822 	argc -= ret;
1823 	argv += ret;
1824 
1825 	/* initialize dma structures */
1826 	reset_dma();
1827 
1828 	/* parse app arguments */
1829 	ret = us_vhost_parse_args(argc, argv);
1830 	if (ret < 0)
1831 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1832 
1833 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1834 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1835 
1836 		if (rte_lcore_is_enabled(lcore_id))
1837 			lcore_ids[core_id++] = lcore_id;
1838 	}
1839 
1840 	if (rte_lcore_count() > RTE_MAX_LCORE)
1841 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1842 
1843 	/* Get the number of physical ports. */
1844 	nb_ports = rte_eth_dev_count_avail();
1845 
1846 	/*
1847 	 * Update the global variable num_ports and the global array ports,
1848 	 * and get the number of valid ports according to the system port count.
1849 	 */
1850 	valid_num_ports = check_ports_num(nb_ports);
1851 
1852 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1853 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1854 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1855 		return -1;
1856 	}
1857 
1858 	/*
1859 	 * FIXME: here we are trying to allocate mbufs big enough for
1860 	 * @MAX_QUEUES, but the truth is we're never going to use that
1861 	 * many queues here. We probably should only do allocation for
1862 	 * those queues we are going to use.
1863 	 */
1864 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1865 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1866 
1867 	if (vm2vm_mode == VM2VM_HARDWARE) {
1868 		/* Enable VT loop back to let L2 switch to do it. */
1869 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1870 		RTE_LOG(DEBUG, VHOST_CONFIG,
1871 			"Enable loop back for L2 switch in vmdq.\n");
1872 	}
1873 
1874 	/* initialize all ports */
1875 	RTE_ETH_FOREACH_DEV(portid) {
1876 		/* skip ports that are not enabled */
1877 		if ((enabled_port_mask & (1 << portid)) == 0) {
1878 			RTE_LOG(INFO, VHOST_PORT,
1879 				"Skipping disabled port %d\n", portid);
1880 			continue;
1881 		}
1882 		if (port_init(portid) != 0)
1883 			rte_exit(EXIT_FAILURE,
1884 				"Cannot initialize network ports\n");
1885 	}
1886 
1887 	/* Enable stats if the user option is set. */
1888 	if (enable_stats) {
1889 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1890 					print_stats, NULL);
1891 		if (ret < 0)
1892 			rte_exit(EXIT_FAILURE,
1893 				"Cannot create print-stats thread\n");
1894 	}
1895 
1896 	/* Launch all data cores. */
1897 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1898 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1899 
1900 	if (client_mode)
1901 		flags |= RTE_VHOST_USER_CLIENT;
1902 
1903 	for (i = 0; i < dma_count; i++) {
1904 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1905 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1906 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1907 		}
1908 	}
1909 
1910 	/* Register vhost user driver to handle vhost messages. */
1911 	for (i = 0; i < nb_sockets; i++) {
1912 		char *file = socket_files + i * PATH_MAX;
1913 
1914 		if (dma_count)
1915 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1916 
1917 		ret = rte_vhost_driver_register(file, flags);
1918 		if (ret != 0) {
1919 			unregister_drivers(i);
1920 			rte_exit(EXIT_FAILURE,
1921 				"vhost driver register failure.\n");
1922 		}
1923 
1924 		if (builtin_net_driver)
1925 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1926 
1927 		if (mergeable == 0) {
1928 			rte_vhost_driver_disable_features(file,
1929 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1930 		}
1931 
1932 		if (enable_tx_csum == 0) {
1933 			rte_vhost_driver_disable_features(file,
1934 				1ULL << VIRTIO_NET_F_CSUM);
1935 		}
1936 
1937 		if (enable_tso == 0) {
1938 			rte_vhost_driver_disable_features(file,
1939 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1940 			rte_vhost_driver_disable_features(file,
1941 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1942 			rte_vhost_driver_disable_features(file,
1943 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1944 			rte_vhost_driver_disable_features(file,
1945 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1946 		}
1947 
1948 		if (promiscuous) {
1949 			rte_vhost_driver_enable_features(file,
1950 				1ULL << VIRTIO_NET_F_CTRL_RX);
1951 		}
1952 
1953 		ret = rte_vhost_driver_callback_register(file,
1954 			&virtio_net_device_ops);
1955 		if (ret != 0) {
1956 			rte_exit(EXIT_FAILURE,
1957 				"failed to register vhost driver callbacks.\n");
1958 		}
1959 
1960 		if (rte_vhost_driver_start(file) < 0) {
1961 			rte_exit(EXIT_FAILURE,
1962 				"failed to start vhost driver.\n");
1963 		}
1964 	}
1965 
1966 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1967 		rte_eal_wait_lcore(lcore_id);
1968 
1969 	/* clean up the EAL */
1970 	rte_eal_cleanup();
1971 
1972 	return 0;
1973 }
1974