xref: /dpdk/examples/vhost/main.c (revision 8b6502f28685eb6395292470ff28e1ff3271aa32)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29 
30 #include "main.h"
31 
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35 
36 /* the maximum number of external ports supported */
37 #define MAX_SUP_PORTS 1
38 
39 #define MBUF_CACHE_SIZE	128
40 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
41 
42 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
43 
44 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
45 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
46 
47 #define JUMBO_FRAME_MAX_SIZE    0x2600
48 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
49 
50 /* State of virtio device. */
51 #define DEVICE_MAC_LEARNING 0
52 #define DEVICE_RX			1
53 #define DEVICE_SAFE_REMOVE	2
54 
55 /* Configurable number of RX/TX ring descriptors */
56 #define RTE_TEST_RX_DESC_DEFAULT 1024
57 #define RTE_TEST_TX_DESC_DEFAULT 512
58 
59 #define INVALID_PORT_ID 0xFF
60 #define INVALID_DMA_ID -1
61 
62 #define DMA_RING_SIZE 4096
63 
64 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
65 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
66 static int dma_count;
67 
68 /* mask of enabled ports */
69 static uint32_t enabled_port_mask = 0;
70 
71 /* Promiscuous mode */
72 static uint32_t promiscuous;
73 
74 /* number of devices/queues to support*/
75 static uint32_t num_queues = 0;
76 static uint32_t num_devices;
77 
78 static struct rte_mempool *mbuf_pool;
79 static int mergeable;
80 
81 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
82 typedef enum {
83 	VM2VM_DISABLED = 0,
84 	VM2VM_SOFTWARE = 1,
85 	VM2VM_HARDWARE = 2,
86 	VM2VM_LAST
87 } vm2vm_type;
88 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
89 
90 /* Enable stats. */
91 static uint32_t enable_stats = 0;
92 /* Enable retries on RX. */
93 static uint32_t enable_retry = 1;
94 
95 /* Disable TX checksum offload */
96 static uint32_t enable_tx_csum;
97 
98 /* Disable TSO offload */
99 static uint32_t enable_tso;
100 
101 static int client_mode;
102 
103 static int builtin_net_driver;
104 
105 /* Specify timeout (in useconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109 
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113 
114 /* empty VMDq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116 	.rxmode = {
117 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
118 		.split_hdr_size = 0,
119 		/*
120 		 * VLAN strip is necessary for 1G NIC such as I350,
121 		 * this fixes bug of ipv4 forwarding in guest can't
122 		 * forward packets from one virtio dev to another virtio dev.
123 		 */
124 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
125 	},
126 
127 	.txmode = {
128 		.mq_mode = RTE_ETH_MQ_TX_NONE,
129 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
130 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
131 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
132 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
133 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
134 	},
135 	.rx_adv_conf = {
136 		/*
137 		 * should be overridden separately in code with
138 		 * appropriate values
139 		 */
140 		.vmdq_rx_conf = {
141 			.nb_queue_pools = RTE_ETH_8_POOLS,
142 			.enable_default_pool = 0,
143 			.default_pool = 0,
144 			.nb_pool_maps = 0,
145 			.pool_map = {{0, 0},},
146 		},
147 	},
148 };
149 
150 
151 static unsigned lcore_ids[RTE_MAX_LCORE];
152 static uint16_t ports[RTE_MAX_ETHPORTS];
153 static unsigned num_ports = 0; /**< The number of ports specified in command line */
154 static uint16_t num_pf_queues, num_vmdq_queues;
155 static uint16_t vmdq_pool_base, vmdq_queue_base;
156 static uint16_t queues_per_pool;
157 
158 const uint16_t vlan_tags[] = {
159 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
160 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
161 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
162 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
163 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
164 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
165 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
166 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
167 };
168 
169 /* ethernet addresses of ports */
170 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
171 
172 static struct vhost_dev_tailq_list vhost_dev_list =
173 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
174 
175 static struct lcore_info lcore_info[RTE_MAX_LCORE];
176 
177 /* Used for queueing bursts of TX packets. */
178 struct mbuf_table {
179 	unsigned len;
180 	unsigned txq_id;
181 	struct rte_mbuf *m_table[MAX_PKT_BURST];
182 };
183 
184 struct vhost_bufftable {
185 	uint32_t len;
186 	uint64_t pre_tsc;
187 	struct rte_mbuf *m_table[MAX_PKT_BURST];
188 };
189 
190 /* TX queue for each data core. */
191 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
192 
193 /*
194  * Vhost TX buffer for each data core.
195  * Every data core maintains a TX buffer for every vhost device,
196  * which is used for batch pkts enqueue for higher performance.
197  */
198 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
199 
200 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
201 				 / US_PER_S * BURST_TX_DRAIN_US)
202 
203 static inline bool
204 is_dma_configured(int16_t dev_id)
205 {
206 	int i;
207 
208 	for (i = 0; i < dma_count; i++)
209 		if (dmas_id[i] == dev_id)
210 			return true;
211 	return false;
212 }
213 
214 static inline int
215 open_dma(const char *value)
216 {
217 	struct dma_for_vhost *dma_info = dma_bind;
218 	char *input = strndup(value, strlen(value) + 1);
219 	char *addrs = input;
220 	char *ptrs[2];
221 	char *start, *end, *substr;
222 	int64_t vid;
223 
224 	struct rte_dma_info info;
225 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
226 	struct rte_dma_vchan_conf qconf = {
227 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
228 		.nb_desc = DMA_RING_SIZE
229 	};
230 
231 	int dev_id;
232 	int ret = 0;
233 	uint16_t i = 0;
234 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
235 	int args_nr;
236 
237 	while (isblank(*addrs))
238 		addrs++;
239 	if (*addrs == '\0') {
240 		ret = -1;
241 		goto out;
242 	}
243 
244 	/* process DMA devices within bracket. */
245 	addrs++;
246 	substr = strtok(addrs, ";]");
247 	if (!substr) {
248 		ret = -1;
249 		goto out;
250 	}
251 
252 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
253 	if (args_nr <= 0) {
254 		ret = -1;
255 		goto out;
256 	}
257 
258 	while (i < args_nr) {
259 		char *arg_temp = dma_arg[i];
260 		uint8_t sub_nr;
261 
262 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
263 		if (sub_nr != 2) {
264 			ret = -1;
265 			goto out;
266 		}
267 
268 		start = strstr(ptrs[0], "txd");
269 		if (start == NULL) {
270 			ret = -1;
271 			goto out;
272 		}
273 
274 		start += 3;
275 		vid = strtol(start, &end, 0);
276 		if (end == start) {
277 			ret = -1;
278 			goto out;
279 		}
280 
281 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
282 		if (dev_id < 0) {
283 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
284 			ret = -1;
285 			goto out;
286 		}
287 
288 		/* DMA device is already configured, so skip */
289 		if (is_dma_configured(dev_id))
290 			goto done;
291 
292 		if (rte_dma_info_get(dev_id, &info) != 0) {
293 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
294 			ret = -1;
295 			goto out;
296 		}
297 
298 		if (info.max_vchans < 1) {
299 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
300 			ret = -1;
301 			goto out;
302 		}
303 
304 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
305 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
306 			ret = -1;
307 			goto out;
308 		}
309 
310 		/* Check the max desc supported by DMA device */
311 		rte_dma_info_get(dev_id, &info);
312 		if (info.nb_vchans != 1) {
313 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
314 					dev_id);
315 			ret = -1;
316 			goto out;
317 		}
318 
319 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
320 
321 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
322 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
323 			ret = -1;
324 			goto out;
325 		}
326 
327 		if (rte_dma_start(dev_id) != 0) {
328 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
329 			ret = -1;
330 			goto out;
331 		}
332 
333 		dmas_id[dma_count++] = dev_id;
334 
335 done:
336 		(dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
337 		i++;
338 	}
339 out:
340 	free(input);
341 	return ret;
342 }
343 
344 /*
345  * Builds up the correct configuration for VMDQ VLAN pool map
346  * according to the pool & queue limits.
347  */
348 static inline int
349 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
350 {
351 	struct rte_eth_vmdq_rx_conf conf;
352 	struct rte_eth_vmdq_rx_conf *def_conf =
353 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
354 	unsigned i;
355 
356 	memset(&conf, 0, sizeof(conf));
357 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
358 	conf.nb_pool_maps = num_devices;
359 	conf.enable_loop_back = def_conf->enable_loop_back;
360 	conf.rx_mode = def_conf->rx_mode;
361 
362 	for (i = 0; i < conf.nb_pool_maps; i++) {
363 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
364 		conf.pool_map[i].pools = (1UL << i);
365 	}
366 
367 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
368 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
369 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
370 	return 0;
371 }
372 
373 /*
374  * Initialises a given port using global settings and with the rx buffers
375  * coming from the mbuf_pool passed as parameter
376  */
377 static inline int
378 port_init(uint16_t port)
379 {
380 	struct rte_eth_dev_info dev_info;
381 	struct rte_eth_conf port_conf;
382 	struct rte_eth_rxconf *rxconf;
383 	struct rte_eth_txconf *txconf;
384 	int16_t rx_rings, tx_rings;
385 	uint16_t rx_ring_size, tx_ring_size;
386 	int retval;
387 	uint16_t q;
388 
389 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
390 	retval = rte_eth_dev_info_get(port, &dev_info);
391 	if (retval != 0) {
392 		RTE_LOG(ERR, VHOST_PORT,
393 			"Error during getting device (port %u) info: %s\n",
394 			port, strerror(-retval));
395 
396 		return retval;
397 	}
398 
399 	rxconf = &dev_info.default_rxconf;
400 	txconf = &dev_info.default_txconf;
401 	rxconf->rx_drop_en = 1;
402 
403 	/*configure the number of supported virtio devices based on VMDQ limits */
404 	num_devices = dev_info.max_vmdq_pools;
405 
406 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
408 
409 	tx_rings = (uint16_t)rte_lcore_count();
410 
411 	if (mergeable) {
412 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
413 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
414 		else
415 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
416 	}
417 
418 	/* Get port configuration. */
419 	retval = get_eth_conf(&port_conf, num_devices);
420 	if (retval < 0)
421 		return retval;
422 	/* NIC queues are divided into pf queues and vmdq queues.  */
423 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
424 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
425 	num_vmdq_queues = num_devices * queues_per_pool;
426 	num_queues = num_pf_queues + num_vmdq_queues;
427 	vmdq_queue_base = dev_info.vmdq_queue_base;
428 	vmdq_pool_base  = dev_info.vmdq_pool_base;
429 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
430 		num_pf_queues, num_devices, queues_per_pool);
431 
432 	if (!rte_eth_dev_is_valid_port(port))
433 		return -1;
434 
435 	rx_rings = (uint16_t)dev_info.max_rx_queues;
436 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
437 		port_conf.txmode.offloads |=
438 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
439 	/* Configure ethernet device. */
440 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
441 	if (retval != 0) {
442 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
443 			port, strerror(-retval));
444 		return retval;
445 	}
446 
447 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
448 		&tx_ring_size);
449 	if (retval != 0) {
450 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
451 			"for port %u: %s.\n", port, strerror(-retval));
452 		return retval;
453 	}
454 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
455 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
456 			"for Rx queues on port %u.\n", port);
457 		return -1;
458 	}
459 
460 	/* Setup the queues. */
461 	rxconf->offloads = port_conf.rxmode.offloads;
462 	for (q = 0; q < rx_rings; q ++) {
463 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
464 						rte_eth_dev_socket_id(port),
465 						rxconf,
466 						mbuf_pool);
467 		if (retval < 0) {
468 			RTE_LOG(ERR, VHOST_PORT,
469 				"Failed to setup rx queue %u of port %u: %s.\n",
470 				q, port, strerror(-retval));
471 			return retval;
472 		}
473 	}
474 	txconf->offloads = port_conf.txmode.offloads;
475 	for (q = 0; q < tx_rings; q ++) {
476 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
477 						rte_eth_dev_socket_id(port),
478 						txconf);
479 		if (retval < 0) {
480 			RTE_LOG(ERR, VHOST_PORT,
481 				"Failed to setup tx queue %u of port %u: %s.\n",
482 				q, port, strerror(-retval));
483 			return retval;
484 		}
485 	}
486 
487 	/* Start the device. */
488 	retval  = rte_eth_dev_start(port);
489 	if (retval < 0) {
490 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
491 			port, strerror(-retval));
492 		return retval;
493 	}
494 
495 	if (promiscuous) {
496 		retval = rte_eth_promiscuous_enable(port);
497 		if (retval != 0) {
498 			RTE_LOG(ERR, VHOST_PORT,
499 				"Failed to enable promiscuous mode on port %u: %s\n",
500 				port, rte_strerror(-retval));
501 			return retval;
502 		}
503 	}
504 
505 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
506 	if (retval < 0) {
507 		RTE_LOG(ERR, VHOST_PORT,
508 			"Failed to get MAC address on port %u: %s\n",
509 			port, rte_strerror(-retval));
510 		return retval;
511 	}
512 
513 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
514 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
515 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
516 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
517 
518 	return 0;
519 }
520 
521 /*
522  * Set socket file path.
523  */
524 static int
525 us_vhost_parse_socket_path(const char *q_arg)
526 {
527 	char *old;
528 
529 	/* parse number string */
530 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
531 		return -1;
532 
533 	old = socket_files;
534 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
535 	if (socket_files == NULL) {
536 		free(old);
537 		return -1;
538 	}
539 
540 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
541 	nb_sockets++;
542 
543 	return 0;
544 }
545 
546 /*
547  * Parse the portmask provided at run time.
548  */
549 static int
550 parse_portmask(const char *portmask)
551 {
552 	char *end = NULL;
553 	unsigned long pm;
554 
555 	errno = 0;
556 
557 	/* parse hexadecimal string */
558 	pm = strtoul(portmask, &end, 16);
559 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
560 		return 0;
561 
562 	return pm;
563 
564 }
565 
566 /*
567  * Parse num options at run time.
568  */
569 static int
570 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
571 {
572 	char *end = NULL;
573 	unsigned long num;
574 
575 	errno = 0;
576 
577 	/* parse unsigned int string */
578 	num = strtoul(q_arg, &end, 10);
579 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
580 		return -1;
581 
582 	if (num > max_valid_value)
583 		return -1;
584 
585 	return num;
586 
587 }
588 
589 /*
590  * Display usage
591  */
592 static void
593 us_vhost_usage(const char *prgname)
594 {
595 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
596 	"		--vm2vm [0|1|2]\n"
597 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
598 	"		--socket-file <path>\n"
599 	"		--nb-devices ND\n"
600 	"		-p PORTMASK: Set mask for ports to be used by application\n"
601 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
602 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
603 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
604 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
605 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
606 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
607 	"		--socket-file: The path of the socket file.\n"
608 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
609 	"		--tso [0|1] disable/enable TCP segment offload.\n"
610 	"		--client register a vhost-user socket as client mode.\n"
611 	"		--dmas register dma channel for specific vhost device.\n",
612 	       prgname);
613 }
614 
615 enum {
616 #define OPT_VM2VM               "vm2vm"
617 	OPT_VM2VM_NUM = 256,
618 #define OPT_RX_RETRY            "rx-retry"
619 	OPT_RX_RETRY_NUM,
620 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
621 	OPT_RX_RETRY_DELAY_NUM,
622 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
623 	OPT_RX_RETRY_NUMB_NUM,
624 #define OPT_MERGEABLE           "mergeable"
625 	OPT_MERGEABLE_NUM,
626 #define OPT_STATS               "stats"
627 	OPT_STATS_NUM,
628 #define OPT_SOCKET_FILE         "socket-file"
629 	OPT_SOCKET_FILE_NUM,
630 #define OPT_TX_CSUM             "tx-csum"
631 	OPT_TX_CSUM_NUM,
632 #define OPT_TSO                 "tso"
633 	OPT_TSO_NUM,
634 #define OPT_CLIENT              "client"
635 	OPT_CLIENT_NUM,
636 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
637 	OPT_BUILTIN_NET_DRIVER_NUM,
638 #define OPT_DMAS                "dmas"
639 	OPT_DMAS_NUM,
640 };
641 
642 /*
643  * Parse the arguments given in the command line of the application.
644  */
645 static int
646 us_vhost_parse_args(int argc, char **argv)
647 {
648 	int opt, ret;
649 	int option_index;
650 	unsigned i;
651 	const char *prgname = argv[0];
652 	static struct option long_option[] = {
653 		{OPT_VM2VM, required_argument,
654 				NULL, OPT_VM2VM_NUM},
655 		{OPT_RX_RETRY, required_argument,
656 				NULL, OPT_RX_RETRY_NUM},
657 		{OPT_RX_RETRY_DELAY, required_argument,
658 				NULL, OPT_RX_RETRY_DELAY_NUM},
659 		{OPT_RX_RETRY_NUMB, required_argument,
660 				NULL, OPT_RX_RETRY_NUMB_NUM},
661 		{OPT_MERGEABLE, required_argument,
662 				NULL, OPT_MERGEABLE_NUM},
663 		{OPT_STATS, required_argument,
664 				NULL, OPT_STATS_NUM},
665 		{OPT_SOCKET_FILE, required_argument,
666 				NULL, OPT_SOCKET_FILE_NUM},
667 		{OPT_TX_CSUM, required_argument,
668 				NULL, OPT_TX_CSUM_NUM},
669 		{OPT_TSO, required_argument,
670 				NULL, OPT_TSO_NUM},
671 		{OPT_CLIENT, no_argument,
672 				NULL, OPT_CLIENT_NUM},
673 		{OPT_BUILTIN_NET_DRIVER, no_argument,
674 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
675 		{OPT_DMAS, required_argument,
676 				NULL, OPT_DMAS_NUM},
677 		{NULL, 0, 0, 0},
678 	};
679 
680 	/* Parse command line */
681 	while ((opt = getopt_long(argc, argv, "p:P",
682 			long_option, &option_index)) != EOF) {
683 		switch (opt) {
684 		/* Portmask */
685 		case 'p':
686 			enabled_port_mask = parse_portmask(optarg);
687 			if (enabled_port_mask == 0) {
688 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
689 				us_vhost_usage(prgname);
690 				return -1;
691 			}
692 			break;
693 
694 		case 'P':
695 			promiscuous = 1;
696 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
697 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
698 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
699 			break;
700 
701 		case OPT_VM2VM_NUM:
702 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
703 			if (ret == -1) {
704 				RTE_LOG(INFO, VHOST_CONFIG,
705 					"Invalid argument for "
706 					"vm2vm [0|1|2]\n");
707 				us_vhost_usage(prgname);
708 				return -1;
709 			}
710 			vm2vm_mode = (vm2vm_type)ret;
711 			break;
712 
713 		case OPT_RX_RETRY_NUM:
714 			ret = parse_num_opt(optarg, 1);
715 			if (ret == -1) {
716 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
717 				us_vhost_usage(prgname);
718 				return -1;
719 			}
720 			enable_retry = ret;
721 			break;
722 
723 		case OPT_TX_CSUM_NUM:
724 			ret = parse_num_opt(optarg, 1);
725 			if (ret == -1) {
726 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
727 				us_vhost_usage(prgname);
728 				return -1;
729 			}
730 			enable_tx_csum = ret;
731 			break;
732 
733 		case OPT_TSO_NUM:
734 			ret = parse_num_opt(optarg, 1);
735 			if (ret == -1) {
736 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
737 				us_vhost_usage(prgname);
738 				return -1;
739 			}
740 			enable_tso = ret;
741 			break;
742 
743 		case OPT_RX_RETRY_DELAY_NUM:
744 			ret = parse_num_opt(optarg, INT32_MAX);
745 			if (ret == -1) {
746 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
747 				us_vhost_usage(prgname);
748 				return -1;
749 			}
750 			burst_rx_delay_time = ret;
751 			break;
752 
753 		case OPT_RX_RETRY_NUMB_NUM:
754 			ret = parse_num_opt(optarg, INT32_MAX);
755 			if (ret == -1) {
756 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
757 				us_vhost_usage(prgname);
758 				return -1;
759 			}
760 			burst_rx_retry_num = ret;
761 			break;
762 
763 		case OPT_MERGEABLE_NUM:
764 			ret = parse_num_opt(optarg, 1);
765 			if (ret == -1) {
766 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
767 				us_vhost_usage(prgname);
768 				return -1;
769 			}
770 			mergeable = !!ret;
771 			break;
772 
773 		case OPT_STATS_NUM:
774 			ret = parse_num_opt(optarg, INT32_MAX);
775 			if (ret == -1) {
776 				RTE_LOG(INFO, VHOST_CONFIG,
777 					"Invalid argument for stats [0..N]\n");
778 				us_vhost_usage(prgname);
779 				return -1;
780 			}
781 			enable_stats = ret;
782 			break;
783 
784 		/* Set socket file path. */
785 		case OPT_SOCKET_FILE_NUM:
786 			if (us_vhost_parse_socket_path(optarg) == -1) {
787 				RTE_LOG(INFO, VHOST_CONFIG,
788 				"Invalid argument for socket name (Max %d characters)\n",
789 				PATH_MAX);
790 				us_vhost_usage(prgname);
791 				return -1;
792 			}
793 			break;
794 
795 		case OPT_DMAS_NUM:
796 			if (open_dma(optarg) == -1) {
797 				RTE_LOG(INFO, VHOST_CONFIG,
798 					"Wrong DMA args\n");
799 				us_vhost_usage(prgname);
800 				return -1;
801 			}
802 			break;
803 
804 		case OPT_CLIENT_NUM:
805 			client_mode = 1;
806 			break;
807 
808 		case OPT_BUILTIN_NET_DRIVER_NUM:
809 			builtin_net_driver = 1;
810 			break;
811 
812 		/* Invalid option - print options. */
813 		default:
814 			us_vhost_usage(prgname);
815 			return -1;
816 		}
817 	}
818 
819 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
820 		if (enabled_port_mask & (1 << i))
821 			ports[num_ports++] = i;
822 	}
823 
824 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
825 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
826 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
827 		return -1;
828 	}
829 
830 	return 0;
831 }
832 
833 /*
834  * Update the global var NUM_PORTS and array PORTS according to system ports number
835  * and return valid ports number
836  */
837 static unsigned check_ports_num(unsigned nb_ports)
838 {
839 	unsigned valid_num_ports = num_ports;
840 	unsigned portid;
841 
842 	if (num_ports > nb_ports) {
843 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
844 			num_ports, nb_ports);
845 		num_ports = nb_ports;
846 	}
847 
848 	for (portid = 0; portid < num_ports; portid ++) {
849 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
850 			RTE_LOG(INFO, VHOST_PORT,
851 				"\nSpecified port ID(%u) is not valid\n",
852 				ports[portid]);
853 			ports[portid] = INVALID_PORT_ID;
854 			valid_num_ports--;
855 		}
856 	}
857 	return valid_num_ports;
858 }
859 
860 static __rte_always_inline struct vhost_dev *
861 find_vhost_dev(struct rte_ether_addr *mac)
862 {
863 	struct vhost_dev *vdev;
864 
865 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
866 		if (vdev->ready == DEVICE_RX &&
867 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
868 			return vdev;
869 	}
870 
871 	return NULL;
872 }
873 
874 /*
875  * This function learns the MAC address of the device and registers this along with a
876  * vlan tag to a VMDQ.
877  */
878 static int
879 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
880 {
881 	struct rte_ether_hdr *pkt_hdr;
882 	int i, ret;
883 
884 	/* Learn MAC address of guest device from packet */
885 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
886 
887 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
888 		RTE_LOG(ERR, VHOST_DATA,
889 			"(%d) device is using a registered MAC!\n",
890 			vdev->vid);
891 		return -1;
892 	}
893 
894 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
895 		vdev->mac_address.addr_bytes[i] =
896 			pkt_hdr->src_addr.addr_bytes[i];
897 
898 	/* vlan_tag currently uses the device_id. */
899 	vdev->vlan_tag = vlan_tags[vdev->vid];
900 
901 	/* Print out VMDQ registration info. */
902 	RTE_LOG(INFO, VHOST_DATA,
903 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
904 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
905 		vdev->vlan_tag);
906 
907 	/* Register the MAC address. */
908 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
909 				(uint32_t)vdev->vid + vmdq_pool_base);
910 	if (ret)
911 		RTE_LOG(ERR, VHOST_DATA,
912 			"(%d) failed to add device MAC address to VMDQ\n",
913 			vdev->vid);
914 
915 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
916 
917 	/* Set device as ready for RX. */
918 	vdev->ready = DEVICE_RX;
919 
920 	return 0;
921 }
922 
923 /*
924  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
925  * queue before disabling RX on the device.
926  */
927 static inline void
928 unlink_vmdq(struct vhost_dev *vdev)
929 {
930 	unsigned i = 0;
931 	unsigned rx_count;
932 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
933 
934 	if (vdev->ready == DEVICE_RX) {
935 		/*clear MAC and VLAN settings*/
936 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
937 		for (i = 0; i < 6; i++)
938 			vdev->mac_address.addr_bytes[i] = 0;
939 
940 		vdev->vlan_tag = 0;
941 
942 		/*Clear out the receive buffers*/
943 		rx_count = rte_eth_rx_burst(ports[0],
944 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
945 
946 		while (rx_count) {
947 			for (i = 0; i < rx_count; i++)
948 				rte_pktmbuf_free(pkts_burst[i]);
949 
950 			rx_count = rte_eth_rx_burst(ports[0],
951 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
952 		}
953 
954 		vdev->ready = DEVICE_MAC_LEARNING;
955 	}
956 }
957 
958 static inline void
959 free_pkts(struct rte_mbuf **pkts, uint16_t n)
960 {
961 	while (n--)
962 		rte_pktmbuf_free(pkts[n]);
963 }
964 
965 static __rte_always_inline void
966 complete_async_pkts(struct vhost_dev *vdev)
967 {
968 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
969 	uint16_t complete_count;
970 	int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
971 
972 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
973 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
974 	if (complete_count) {
975 		free_pkts(p_cpl, complete_count);
976 		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
977 	}
978 
979 }
980 
981 static __rte_always_inline void
982 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
983 	    struct rte_mbuf *m)
984 {
985 	uint16_t ret;
986 
987 	if (builtin_net_driver) {
988 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
989 	} else {
990 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
991 	}
992 
993 	if (enable_stats) {
994 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
995 				__ATOMIC_SEQ_CST);
996 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
997 				__ATOMIC_SEQ_CST);
998 		src_vdev->stats.tx_total++;
999 		src_vdev->stats.tx += ret;
1000 	}
1001 }
1002 
1003 static __rte_always_inline void
1004 drain_vhost(struct vhost_dev *vdev)
1005 {
1006 	uint16_t ret;
1007 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1008 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1009 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1010 
1011 	if (builtin_net_driver) {
1012 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
1013 	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1014 		uint16_t enqueue_fail = 0;
1015 		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1016 
1017 		complete_async_pkts(vdev);
1018 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
1019 		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
1020 
1021 		enqueue_fail = nr_xmit - ret;
1022 		if (enqueue_fail)
1023 			free_pkts(&m[ret], nr_xmit - ret);
1024 	} else {
1025 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1026 						m, nr_xmit);
1027 	}
1028 
1029 	if (enable_stats) {
1030 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1031 				__ATOMIC_SEQ_CST);
1032 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1033 				__ATOMIC_SEQ_CST);
1034 	}
1035 
1036 	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1037 		free_pkts(m, nr_xmit);
1038 }
1039 
1040 static __rte_always_inline void
1041 drain_vhost_table(void)
1042 {
1043 	uint16_t lcore_id = rte_lcore_id();
1044 	struct vhost_bufftable *vhost_txq;
1045 	struct vhost_dev *vdev;
1046 	uint64_t cur_tsc;
1047 
1048 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1049 		if (unlikely(vdev->remove == 1))
1050 			continue;
1051 
1052 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1053 
1054 		cur_tsc = rte_rdtsc();
1055 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1056 				> MBUF_TABLE_DRAIN_TSC)) {
1057 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1058 				"Vhost TX queue drained after timeout with burst size %u\n",
1059 				vhost_txq->len);
1060 			drain_vhost(vdev);
1061 			vhost_txq->len = 0;
1062 			vhost_txq->pre_tsc = cur_tsc;
1063 		}
1064 	}
1065 }
1066 
1067 /*
1068  * Check if the packet destination MAC address is for a local device. If so then put
1069  * the packet on that devices RX queue. If not then return.
1070  */
1071 static __rte_always_inline int
1072 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1073 {
1074 	struct rte_ether_hdr *pkt_hdr;
1075 	struct vhost_dev *dst_vdev;
1076 	struct vhost_bufftable *vhost_txq;
1077 	uint16_t lcore_id = rte_lcore_id();
1078 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1079 
1080 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1081 	if (!dst_vdev)
1082 		return -1;
1083 
1084 	if (vdev->vid == dst_vdev->vid) {
1085 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1086 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1087 			vdev->vid);
1088 		return 0;
1089 	}
1090 
1091 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1092 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1093 
1094 	if (unlikely(dst_vdev->remove)) {
1095 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1096 			"(%d) device is marked for removal\n", dst_vdev->vid);
1097 		return 0;
1098 	}
1099 
1100 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1101 	vhost_txq->m_table[vhost_txq->len++] = m;
1102 
1103 	if (enable_stats) {
1104 		vdev->stats.tx_total++;
1105 		vdev->stats.tx++;
1106 	}
1107 
1108 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1109 		drain_vhost(dst_vdev);
1110 		vhost_txq->len = 0;
1111 		vhost_txq->pre_tsc = rte_rdtsc();
1112 	}
1113 	return 0;
1114 }
1115 
1116 /*
1117  * Check if the destination MAC of a packet is one local VM,
1118  * and get its vlan tag, and offset if it is.
1119  */
1120 static __rte_always_inline int
1121 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1122 	uint32_t *offset, uint16_t *vlan_tag)
1123 {
1124 	struct vhost_dev *dst_vdev;
1125 	struct rte_ether_hdr *pkt_hdr =
1126 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1127 
1128 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1129 	if (!dst_vdev)
1130 		return 0;
1131 
1132 	if (vdev->vid == dst_vdev->vid) {
1133 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1134 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1135 			vdev->vid);
1136 		return -1;
1137 	}
1138 
1139 	/*
1140 	 * HW vlan strip will reduce the packet length
1141 	 * by minus length of vlan tag, so need restore
1142 	 * the packet length by plus it.
1143 	 */
1144 	*offset  = RTE_VLAN_HLEN;
1145 	*vlan_tag = vlan_tags[vdev->vid];
1146 
1147 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1148 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1149 		vdev->vid, dst_vdev->vid, *vlan_tag);
1150 
1151 	return 0;
1152 }
1153 
1154 static void virtio_tx_offload(struct rte_mbuf *m)
1155 {
1156 	struct rte_net_hdr_lens hdr_lens;
1157 	struct rte_ipv4_hdr *ipv4_hdr;
1158 	struct rte_tcp_hdr *tcp_hdr;
1159 	uint32_t ptype;
1160 	void *l3_hdr;
1161 
1162 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1163 	m->l2_len = hdr_lens.l2_len;
1164 	m->l3_len = hdr_lens.l3_len;
1165 	m->l4_len = hdr_lens.l4_len;
1166 
1167 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1168 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1169 		m->l2_len + m->l3_len);
1170 
1171 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1172 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1173 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1174 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1175 		ipv4_hdr = l3_hdr;
1176 		ipv4_hdr->hdr_checksum = 0;
1177 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1178 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1179 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1180 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1181 	}
1182 }
1183 
1184 static __rte_always_inline void
1185 do_drain_mbuf_table(struct mbuf_table *tx_q)
1186 {
1187 	uint16_t count;
1188 
1189 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1190 				 tx_q->m_table, tx_q->len);
1191 	if (unlikely(count < tx_q->len))
1192 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1193 
1194 	tx_q->len = 0;
1195 }
1196 
1197 /*
1198  * This function routes the TX packet to the correct interface. This
1199  * may be a local device or the physical port.
1200  */
1201 static __rte_always_inline void
1202 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1203 {
1204 	struct mbuf_table *tx_q;
1205 	unsigned offset = 0;
1206 	const uint16_t lcore_id = rte_lcore_id();
1207 	struct rte_ether_hdr *nh;
1208 
1209 
1210 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1211 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1212 		struct vhost_dev *vdev2;
1213 
1214 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1215 			if (vdev2 != vdev)
1216 				sync_virtio_xmit(vdev2, vdev, m);
1217 		}
1218 		goto queue2nic;
1219 	}
1220 
1221 	/*check if destination is local VM*/
1222 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1223 		return;
1224 
1225 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1226 		if (unlikely(find_local_dest(vdev, m, &offset,
1227 					     &vlan_tag) != 0)) {
1228 			rte_pktmbuf_free(m);
1229 			return;
1230 		}
1231 	}
1232 
1233 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1234 		"(%d) TX: MAC address is external\n", vdev->vid);
1235 
1236 queue2nic:
1237 
1238 	/*Add packet to the port tx queue*/
1239 	tx_q = &lcore_tx_queue[lcore_id];
1240 
1241 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1242 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1243 		/* Guest has inserted the vlan tag. */
1244 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1245 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1246 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1247 			(vh->vlan_tci != vlan_tag_be))
1248 			vh->vlan_tci = vlan_tag_be;
1249 	} else {
1250 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1251 
1252 		/*
1253 		 * Find the right seg to adjust the data len when offset is
1254 		 * bigger than tail room size.
1255 		 */
1256 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1257 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1258 				m->data_len += offset;
1259 			else {
1260 				struct rte_mbuf *seg = m;
1261 
1262 				while ((seg->next != NULL) &&
1263 					(offset > rte_pktmbuf_tailroom(seg)))
1264 					seg = seg->next;
1265 
1266 				seg->data_len += offset;
1267 			}
1268 			m->pkt_len += offset;
1269 		}
1270 
1271 		m->vlan_tci = vlan_tag;
1272 	}
1273 
1274 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1275 		virtio_tx_offload(m);
1276 
1277 	tx_q->m_table[tx_q->len++] = m;
1278 	if (enable_stats) {
1279 		vdev->stats.tx_total++;
1280 		vdev->stats.tx++;
1281 	}
1282 
1283 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1284 		do_drain_mbuf_table(tx_q);
1285 }
1286 
1287 
1288 static __rte_always_inline void
1289 drain_mbuf_table(struct mbuf_table *tx_q)
1290 {
1291 	static uint64_t prev_tsc;
1292 	uint64_t cur_tsc;
1293 
1294 	if (tx_q->len == 0)
1295 		return;
1296 
1297 	cur_tsc = rte_rdtsc();
1298 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1299 		prev_tsc = cur_tsc;
1300 
1301 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1302 			"TX queue drained after timeout with burst size %u\n",
1303 			tx_q->len);
1304 		do_drain_mbuf_table(tx_q);
1305 	}
1306 }
1307 
1308 static __rte_always_inline void
1309 drain_eth_rx(struct vhost_dev *vdev)
1310 {
1311 	uint16_t rx_count, enqueue_count;
1312 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1313 
1314 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1315 				    pkts, MAX_PKT_BURST);
1316 
1317 	if (!rx_count)
1318 		return;
1319 
1320 	/*
1321 	 * When "enable_retry" is set, here we wait and retry when there
1322 	 * is no enough free slots in the queue to hold @rx_count packets,
1323 	 * to diminish packet loss.
1324 	 */
1325 	if (enable_retry &&
1326 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1327 			VIRTIO_RXQ))) {
1328 		uint32_t retry;
1329 
1330 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1331 			rte_delay_us(burst_rx_delay_time);
1332 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1333 					VIRTIO_RXQ))
1334 				break;
1335 		}
1336 	}
1337 
1338 	if (builtin_net_driver) {
1339 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1340 						pkts, rx_count);
1341 	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1342 		uint16_t enqueue_fail = 0;
1343 		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1344 
1345 		complete_async_pkts(vdev);
1346 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1347 					VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
1348 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1349 
1350 		enqueue_fail = rx_count - enqueue_count;
1351 		if (enqueue_fail)
1352 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1353 
1354 	} else {
1355 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1356 						pkts, rx_count);
1357 	}
1358 
1359 	if (enable_stats) {
1360 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1361 				__ATOMIC_SEQ_CST);
1362 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1363 				__ATOMIC_SEQ_CST);
1364 	}
1365 
1366 	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1367 		free_pkts(pkts, rx_count);
1368 }
1369 
1370 static __rte_always_inline void
1371 drain_virtio_tx(struct vhost_dev *vdev)
1372 {
1373 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1374 	uint16_t count;
1375 	uint16_t i;
1376 
1377 	if (builtin_net_driver) {
1378 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1379 					pkts, MAX_PKT_BURST);
1380 	} else {
1381 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1382 					mbuf_pool, pkts, MAX_PKT_BURST);
1383 	}
1384 
1385 	/* setup VMDq for the first packet */
1386 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1387 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1388 			free_pkts(pkts, count);
1389 	}
1390 
1391 	for (i = 0; i < count; ++i)
1392 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1393 }
1394 
1395 /*
1396  * Main function of vhost-switch. It basically does:
1397  *
1398  * for each vhost device {
1399  *    - drain_eth_rx()
1400  *
1401  *      Which drains the host eth Rx queue linked to the vhost device,
1402  *      and deliver all of them to guest virito Rx ring associated with
1403  *      this vhost device.
1404  *
1405  *    - drain_virtio_tx()
1406  *
1407  *      Which drains the guest virtio Tx queue and deliver all of them
1408  *      to the target, which could be another vhost device, or the
1409  *      physical eth dev. The route is done in function "virtio_tx_route".
1410  * }
1411  */
1412 static int
1413 switch_worker(void *arg __rte_unused)
1414 {
1415 	unsigned i;
1416 	unsigned lcore_id = rte_lcore_id();
1417 	struct vhost_dev *vdev;
1418 	struct mbuf_table *tx_q;
1419 
1420 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1421 
1422 	tx_q = &lcore_tx_queue[lcore_id];
1423 	for (i = 0; i < rte_lcore_count(); i++) {
1424 		if (lcore_ids[i] == lcore_id) {
1425 			tx_q->txq_id = i;
1426 			break;
1427 		}
1428 	}
1429 
1430 	while(1) {
1431 		drain_mbuf_table(tx_q);
1432 		drain_vhost_table();
1433 		/*
1434 		 * Inform the configuration core that we have exited the
1435 		 * linked list and that no devices are in use if requested.
1436 		 */
1437 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1438 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1439 
1440 		/*
1441 		 * Process vhost devices
1442 		 */
1443 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1444 			      lcore_vdev_entry) {
1445 			if (unlikely(vdev->remove)) {
1446 				unlink_vmdq(vdev);
1447 				vdev->ready = DEVICE_SAFE_REMOVE;
1448 				continue;
1449 			}
1450 
1451 			if (likely(vdev->ready == DEVICE_RX))
1452 				drain_eth_rx(vdev);
1453 
1454 			if (likely(!vdev->remove))
1455 				drain_virtio_tx(vdev);
1456 		}
1457 	}
1458 
1459 	return 0;
1460 }
1461 
1462 /*
1463  * Remove a device from the specific data core linked list and from the
1464  * main linked list. Synchronization  occurs through the use of the
1465  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1466  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1467  */
1468 static void
1469 destroy_device(int vid)
1470 {
1471 	struct vhost_dev *vdev = NULL;
1472 	int lcore;
1473 	uint16_t i;
1474 
1475 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1476 		if (vdev->vid == vid)
1477 			break;
1478 	}
1479 	if (!vdev)
1480 		return;
1481 	/*set the remove flag. */
1482 	vdev->remove = 1;
1483 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1484 		rte_pause();
1485 	}
1486 
1487 	for (i = 0; i < RTE_MAX_LCORE; i++)
1488 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1489 
1490 	if (builtin_net_driver)
1491 		vs_vhost_net_remove(vdev);
1492 
1493 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1494 		     lcore_vdev_entry);
1495 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1496 
1497 
1498 	/* Set the dev_removal_flag on each lcore. */
1499 	RTE_LCORE_FOREACH_WORKER(lcore)
1500 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1501 
1502 	/*
1503 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1504 	 * we can be sure that they can no longer access the device removed
1505 	 * from the linked lists and that the devices are no longer in use.
1506 	 */
1507 	RTE_LCORE_FOREACH_WORKER(lcore) {
1508 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1509 			rte_pause();
1510 	}
1511 
1512 	lcore_info[vdev->coreid].device_num--;
1513 
1514 	RTE_LOG(INFO, VHOST_DATA,
1515 		"(%d) device has been removed from data core\n",
1516 		vdev->vid);
1517 
1518 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1519 		uint16_t n_pkt = 0;
1520 		int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1521 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1522 
1523 		while (vdev->pkts_inflight) {
1524 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1525 						m_cpl, vdev->pkts_inflight, dma_id, 0);
1526 			free_pkts(m_cpl, n_pkt);
1527 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1528 		}
1529 
1530 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1531 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1532 	}
1533 
1534 	rte_free(vdev);
1535 }
1536 
1537 /*
1538  * A new device is added to a data core. First the device is added to the main linked list
1539  * and then allocated to a specific data core.
1540  */
1541 static int
1542 new_device(int vid)
1543 {
1544 	int lcore, core_add = 0;
1545 	uint16_t i;
1546 	uint32_t device_num_min = num_devices;
1547 	struct vhost_dev *vdev;
1548 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1549 	if (vdev == NULL) {
1550 		RTE_LOG(INFO, VHOST_DATA,
1551 			"(%d) couldn't allocate memory for vhost dev\n",
1552 			vid);
1553 		return -1;
1554 	}
1555 	vdev->vid = vid;
1556 
1557 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1558 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1559 			= rte_zmalloc("vhost bufftable",
1560 				sizeof(struct vhost_bufftable),
1561 				RTE_CACHE_LINE_SIZE);
1562 
1563 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1564 			RTE_LOG(INFO, VHOST_DATA,
1565 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1566 			return -1;
1567 		}
1568 	}
1569 
1570 	if (builtin_net_driver)
1571 		vs_vhost_net_setup(vdev);
1572 
1573 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1574 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1575 
1576 	/*reset ready flag*/
1577 	vdev->ready = DEVICE_MAC_LEARNING;
1578 	vdev->remove = 0;
1579 
1580 	/* Find a suitable lcore to add the device. */
1581 	RTE_LCORE_FOREACH_WORKER(lcore) {
1582 		if (lcore_info[lcore].device_num < device_num_min) {
1583 			device_num_min = lcore_info[lcore].device_num;
1584 			core_add = lcore;
1585 		}
1586 	}
1587 	vdev->coreid = core_add;
1588 
1589 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1590 			  lcore_vdev_entry);
1591 	lcore_info[vdev->coreid].device_num++;
1592 
1593 	/* Disable notifications. */
1594 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1595 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1596 
1597 	RTE_LOG(INFO, VHOST_DATA,
1598 		"(%d) device has been added to data core %d\n",
1599 		vid, vdev->coreid);
1600 
1601 	if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1602 		int ret;
1603 
1604 		ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1605 		if (ret == 0)
1606 			dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
1607 		return ret;
1608 	}
1609 
1610 	return 0;
1611 }
1612 
1613 static int
1614 vring_state_changed(int vid, uint16_t queue_id, int enable)
1615 {
1616 	struct vhost_dev *vdev = NULL;
1617 
1618 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1619 		if (vdev->vid == vid)
1620 			break;
1621 	}
1622 	if (!vdev)
1623 		return -1;
1624 
1625 	if (queue_id != VIRTIO_RXQ)
1626 		return 0;
1627 
1628 	if (dma_bind[vid].dmas[queue_id].async_enabled) {
1629 		if (!enable) {
1630 			uint16_t n_pkt = 0;
1631 			int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1632 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1633 
1634 			while (vdev->pkts_inflight) {
1635 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1636 							m_cpl, vdev->pkts_inflight, dma_id, 0);
1637 				free_pkts(m_cpl, n_pkt);
1638 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1639 			}
1640 		}
1641 	}
1642 
1643 	return 0;
1644 }
1645 
1646 /*
1647  * These callback allow devices to be added to the data core when configuration
1648  * has been fully complete.
1649  */
1650 static const struct rte_vhost_device_ops virtio_net_device_ops =
1651 {
1652 	.new_device =  new_device,
1653 	.destroy_device = destroy_device,
1654 	.vring_state_changed = vring_state_changed,
1655 };
1656 
1657 /*
1658  * This is a thread will wake up after a period to print stats if the user has
1659  * enabled them.
1660  */
1661 static void *
1662 print_stats(__rte_unused void *arg)
1663 {
1664 	struct vhost_dev *vdev;
1665 	uint64_t tx_dropped, rx_dropped;
1666 	uint64_t tx, tx_total, rx, rx_total;
1667 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1668 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1669 
1670 	while(1) {
1671 		sleep(enable_stats);
1672 
1673 		/* Clear screen and move to top left */
1674 		printf("%s%s\n", clr, top_left);
1675 		printf("Device statistics =================================\n");
1676 
1677 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1678 			tx_total   = vdev->stats.tx_total;
1679 			tx         = vdev->stats.tx;
1680 			tx_dropped = tx_total - tx;
1681 
1682 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1683 				__ATOMIC_SEQ_CST);
1684 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1685 				__ATOMIC_SEQ_CST);
1686 			rx_dropped = rx_total - rx;
1687 
1688 			printf("Statistics for device %d\n"
1689 				"-----------------------\n"
1690 				"TX total:              %" PRIu64 "\n"
1691 				"TX dropped:            %" PRIu64 "\n"
1692 				"TX successful:         %" PRIu64 "\n"
1693 				"RX total:              %" PRIu64 "\n"
1694 				"RX dropped:            %" PRIu64 "\n"
1695 				"RX successful:         %" PRIu64 "\n",
1696 				vdev->vid,
1697 				tx_total, tx_dropped, tx,
1698 				rx_total, rx_dropped, rx);
1699 		}
1700 
1701 		printf("===================================================\n");
1702 
1703 		fflush(stdout);
1704 	}
1705 
1706 	return NULL;
1707 }
1708 
1709 static void
1710 unregister_drivers(int socket_num)
1711 {
1712 	int i, ret;
1713 
1714 	for (i = 0; i < socket_num; i++) {
1715 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1716 		if (ret != 0)
1717 			RTE_LOG(ERR, VHOST_CONFIG,
1718 				"Fail to unregister vhost driver for %s.\n",
1719 				socket_files + i * PATH_MAX);
1720 	}
1721 }
1722 
1723 /* When we receive a INT signal, unregister vhost driver */
1724 static void
1725 sigint_handler(__rte_unused int signum)
1726 {
1727 	/* Unregister vhost driver. */
1728 	unregister_drivers(nb_sockets);
1729 
1730 	exit(0);
1731 }
1732 
1733 /*
1734  * While creating an mbuf pool, one key thing is to figure out how
1735  * many mbuf entries is enough for our use. FYI, here are some
1736  * guidelines:
1737  *
1738  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1739  *
1740  * - For each switch core (A CPU core does the packet switch), we need
1741  *   also make some reservation for receiving the packets from virtio
1742  *   Tx queue. How many is enough depends on the usage. It's normally
1743  *   a simple calculation like following:
1744  *
1745  *       MAX_PKT_BURST * max packet size / mbuf size
1746  *
1747  *   So, we definitely need allocate more mbufs when TSO is enabled.
1748  *
1749  * - Similarly, for each switching core, we should serve @nr_rx_desc
1750  *   mbufs for receiving the packets from physical NIC device.
1751  *
1752  * - We also need make sure, for each switch core, we have allocated
1753  *   enough mbufs to fill up the mbuf cache.
1754  */
1755 static void
1756 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1757 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1758 {
1759 	uint32_t nr_mbufs;
1760 	uint32_t nr_mbufs_per_core;
1761 	uint32_t mtu = 1500;
1762 
1763 	if (mergeable)
1764 		mtu = 9000;
1765 	if (enable_tso)
1766 		mtu = 64 * 1024;
1767 
1768 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1769 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1770 	nr_mbufs_per_core += nr_rx_desc;
1771 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1772 
1773 	nr_mbufs  = nr_queues * nr_rx_desc;
1774 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1775 	nr_mbufs *= nr_port;
1776 
1777 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1778 					    nr_mbuf_cache, 0, mbuf_size,
1779 					    rte_socket_id());
1780 	if (mbuf_pool == NULL)
1781 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1782 }
1783 
1784 static void
1785 reset_dma(void)
1786 {
1787 	int i;
1788 
1789 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1790 		int j;
1791 
1792 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1793 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1794 			dma_bind[i].dmas[j].async_enabled = false;
1795 		}
1796 	}
1797 
1798 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1799 		dmas_id[i] = INVALID_DMA_ID;
1800 }
1801 
1802 /*
1803  * Main function, does initialisation and calls the per-lcore functions.
1804  */
1805 int
1806 main(int argc, char *argv[])
1807 {
1808 	unsigned lcore_id, core_id = 0;
1809 	unsigned nb_ports, valid_num_ports;
1810 	int ret, i;
1811 	uint16_t portid;
1812 	static pthread_t tid;
1813 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1814 
1815 	signal(SIGINT, sigint_handler);
1816 
1817 	/* init EAL */
1818 	ret = rte_eal_init(argc, argv);
1819 	if (ret < 0)
1820 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1821 	argc -= ret;
1822 	argv += ret;
1823 
1824 	/* initialize dma structures */
1825 	reset_dma();
1826 
1827 	/* parse app arguments */
1828 	ret = us_vhost_parse_args(argc, argv);
1829 	if (ret < 0)
1830 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1831 
1832 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1833 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1834 
1835 		if (rte_lcore_is_enabled(lcore_id))
1836 			lcore_ids[core_id++] = lcore_id;
1837 	}
1838 
1839 	if (rte_lcore_count() > RTE_MAX_LCORE)
1840 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1841 
1842 	/* Get the number of physical ports. */
1843 	nb_ports = rte_eth_dev_count_avail();
1844 
1845 	/*
1846 	 * Update the global var NUM_PORTS and global array PORTS
1847 	 * and get value of var VALID_NUM_PORTS according to system ports number
1848 	 */
1849 	valid_num_ports = check_ports_num(nb_ports);
1850 
1851 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1852 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1853 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
1854 		return -1;
1855 	}
1856 
1857 	/*
1858 	 * FIXME: here we are trying to allocate mbufs big enough for
1859 	 * @MAX_QUEUES, but the truth is we're never going to use that
1860 	 * many queues here. We probably should only do allocation for
1861 	 * those queues we are going to use.
1862 	 */
1863 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1864 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1865 
1866 	if (vm2vm_mode == VM2VM_HARDWARE) {
1867 		/* Enable VT loop back to let L2 switch to do it. */
1868 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1869 		RTE_LOG(DEBUG, VHOST_CONFIG,
1870 			"Enable loop back for L2 switch in vmdq.\n");
1871 	}
1872 
1873 	/* initialize all ports */
1874 	RTE_ETH_FOREACH_DEV(portid) {
1875 		/* skip ports that are not enabled */
1876 		if ((enabled_port_mask & (1 << portid)) == 0) {
1877 			RTE_LOG(INFO, VHOST_PORT,
1878 				"Skipping disabled port %d\n", portid);
1879 			continue;
1880 		}
1881 		if (port_init(portid) != 0)
1882 			rte_exit(EXIT_FAILURE,
1883 				"Cannot initialize network ports\n");
1884 	}
1885 
1886 	/* Enable stats if the user option is set. */
1887 	if (enable_stats) {
1888 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1889 					print_stats, NULL);
1890 		if (ret < 0)
1891 			rte_exit(EXIT_FAILURE,
1892 				"Cannot create print-stats thread\n");
1893 	}
1894 
1895 	/* Launch all data cores. */
1896 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1897 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1898 
1899 	if (client_mode)
1900 		flags |= RTE_VHOST_USER_CLIENT;
1901 
1902 	for (i = 0; i < dma_count; i++) {
1903 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1904 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1905 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1906 		}
1907 	}
1908 
1909 	/* Register vhost user driver to handle vhost messages. */
1910 	for (i = 0; i < nb_sockets; i++) {
1911 		char *file = socket_files + i * PATH_MAX;
1912 
1913 		if (dma_count)
1914 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1915 
1916 		ret = rte_vhost_driver_register(file, flags);
1917 		if (ret != 0) {
1918 			unregister_drivers(i);
1919 			rte_exit(EXIT_FAILURE,
1920 				"vhost driver register failure.\n");
1921 		}
1922 
1923 		if (builtin_net_driver)
1924 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1925 
1926 		if (mergeable == 0) {
1927 			rte_vhost_driver_disable_features(file,
1928 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1929 		}
1930 
1931 		if (enable_tx_csum == 0) {
1932 			rte_vhost_driver_disable_features(file,
1933 				1ULL << VIRTIO_NET_F_CSUM);
1934 		}
1935 
1936 		if (enable_tso == 0) {
1937 			rte_vhost_driver_disable_features(file,
1938 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1939 			rte_vhost_driver_disable_features(file,
1940 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1941 			rte_vhost_driver_disable_features(file,
1942 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1943 			rte_vhost_driver_disable_features(file,
1944 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1945 		}
1946 
1947 		if (promiscuous) {
1948 			rte_vhost_driver_enable_features(file,
1949 				1ULL << VIRTIO_NET_F_CTRL_RX);
1950 		}
1951 
1952 		ret = rte_vhost_driver_callback_register(file,
1953 			&virtio_net_device_ops);
1954 		if (ret != 0) {
1955 			rte_exit(EXIT_FAILURE,
1956 				"failed to register vhost driver callbacks.\n");
1957 		}
1958 
1959 		if (rte_vhost_driver_start(file) < 0) {
1960 			rte_exit(EXIT_FAILURE,
1961 				"failed to start vhost driver.\n");
1962 		}
1963 	}
1964 
1965 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1966 		rte_eal_wait_lcore(lcore_id);
1967 
1968 	/* clean up the EAL */
1969 	rte_eal_cleanup();
1970 
1971 	return 0;
1972 }
1973