xref: /dpdk/examples/vhost/main.c (revision c56185fc183fc0532d2f03aaf04bbf0989ea91a5)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <ctype.h>
6 #include <arpa/inet.h>
7 #include <getopt.h>
8 #include <linux/if_ether.h>
9 #include <linux/if_vlan.h>
10 #include <linux/virtio_net.h>
11 #include <linux/virtio_ring.h>
12 #include <signal.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <sys/eventfd.h>
16 #include <sys/param.h>
17 #include <unistd.h>
18 
19 #include <rte_cycles.h>
20 #include <rte_ethdev.h>
21 #include <rte_log.h>
22 #include <rte_string_fns.h>
23 #include <rte_malloc.h>
24 #include <rte_net.h>
25 #include <rte_vhost.h>
26 #include <rte_ip.h>
27 #include <rte_tcp.h>
28 #include <rte_pause.h>
29 #include <rte_dmadev.h>
30 #include <rte_vhost_async.h>
31 #include <rte_thread.h>
32 
33 #include "main.h"
34 
35 #ifndef MAX_QUEUES
36 #define MAX_QUEUES 128
37 #endif
38 
39 #define NUM_MBUFS_DEFAULT 0x24000
40 
41 /* the maximum number of external ports supported */
42 #define MAX_SUP_PORTS 1
43 
44 #define MBUF_CACHE_SIZE	128
45 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
46 
47 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
48 
49 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
50 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
51 
52 #define JUMBO_FRAME_MAX_SIZE    0x2600
53 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
54 
55 /* State of virtio device. */
56 #define DEVICE_MAC_LEARNING 0
57 #define DEVICE_RX			1
58 #define DEVICE_SAFE_REMOVE	2
59 
60 /* Configurable number of RX/TX ring descriptors */
61 #define RX_DESC_DEFAULT 1024
62 #define TX_DESC_DEFAULT 512
63 
64 #define INVALID_PORT_ID 0xFF
65 #define INVALID_DMA_ID -1
66 
67 #define DMA_RING_SIZE 4096
68 
69 #define ASYNC_ENQUEUE_VHOST 1
70 #define ASYNC_DEQUEUE_VHOST 2
71 
72 /* number of mbufs in all pools - if specified on command-line. */
73 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
74 
75 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
76 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
77 static int dma_count;
78 
79 /* mask of enabled ports */
80 static uint32_t enabled_port_mask = 0;
81 
82 /* Promiscuous mode */
83 static uint32_t promiscuous;
84 
85 /* number of devices/queues to support */
86 static uint32_t num_queues = 0;
87 static uint32_t num_devices;
88 
89 static struct rte_mempool *mbuf_pool;
90 static int mergeable;
91 
92 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
93 typedef enum {
94 	VM2VM_DISABLED = 0,
95 	VM2VM_SOFTWARE = 1,
96 	VM2VM_HARDWARE = 2,
97 	VM2VM_LAST
98 } vm2vm_type;
99 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
100 
101 /* Enable stats. */
102 static uint32_t enable_stats = 0;
103 /* Enable retries on RX. */
104 static uint32_t enable_retry = 1;
105 
106 /* Disable TX checksum offload */
107 static uint32_t enable_tx_csum;
108 
109 /* Disable TSO offload */
110 static uint32_t enable_tso;
111 
112 static int client_mode;
113 
114 static int builtin_net_driver;
115 
116 /* Specify the timeout (in microseconds) between retries on RX. */
117 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
118 /* Specify the number of retries on RX. */
119 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
120 
121 /* Socket file paths. Can be set by user */
122 static char *socket_files;
123 static int nb_sockets;
124 
125 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
126 
127 /* empty VMDq configuration structure. Filled in programmatically */
128 static struct rte_eth_conf vmdq_conf_default = {
129 	.rxmode = {
130 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
131 		/*
132 		 * VLAN strip is necessary for 1G NICs such as the I350;
133 		 * it fixes a bug where IPv4 forwarding in the guest cannot
134 		 * forward packets from one virtio device to another.
135 		 */
136 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
137 	},
138 
139 	.txmode = {
140 		.mq_mode = RTE_ETH_MQ_TX_NONE,
141 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
142 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
143 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
144 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
145 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
146 	},
147 	.rx_adv_conf = {
148 		/*
149 		 * should be overridden separately in code with
150 		 * appropriate values
151 		 */
152 		.vmdq_rx_conf = {
153 			.nb_queue_pools = RTE_ETH_8_POOLS,
154 			.enable_default_pool = 0,
155 			.default_pool = 0,
156 			.nb_pool_maps = 0,
157 			.pool_map = {{0, 0},},
158 		},
159 	},
160 };
161 
162 
163 static unsigned lcore_ids[RTE_MAX_LCORE];
164 static uint16_t ports[RTE_MAX_ETHPORTS];
165 static unsigned num_ports = 0; /**< The number of ports specified in command line */
166 static uint16_t num_pf_queues, num_vmdq_queues;
167 static uint16_t vmdq_pool_base, vmdq_queue_base;
168 static uint16_t queues_per_pool;
169 
170 const uint16_t vlan_tags[] = {
171 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
172 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
173 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
174 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
175 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
176 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
177 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
178 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
179 };
180 
181 /* ethernet addresses of ports */
182 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
183 
184 static struct vhost_dev_tailq_list vhost_dev_list =
185 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
186 
187 static struct lcore_info lcore_info[RTE_MAX_LCORE];
188 
189 /* Used for queueing bursts of TX packets. */
190 struct mbuf_table {
191 	unsigned len;
192 	unsigned txq_id;
193 	struct rte_mbuf *m_table[MAX_PKT_BURST];
194 };
195 
196 struct vhost_bufftable {
197 	uint32_t len;
198 	uint64_t pre_tsc;
199 	struct rte_mbuf *m_table[MAX_PKT_BURST];
200 };
201 
202 /* TX queue for each data core. */
203 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
204 
205 /*
206  * Vhost TX buffer for each data core.
207  * Every data core maintains a TX buffer for every vhost device,
208  * which is used to batch packet enqueues for higher performance.
209  */
210 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
211 
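/*
 * TX drain period converted from microseconds (BURST_TX_DRAIN_US) into TSC
 * cycles, rounding the per-microsecond cycle count up.
 */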
212 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
213 				 / US_PER_S * BURST_TX_DRAIN_US)
214 
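/* Maps a vhost device ID (vid) to the index of its socket file in socket_files. */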
215 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
216 
217 static inline uint32_t
218 get_async_flag_by_socketid(int socketid)
219 {
220 	return dma_bind[socketid].async_flag;
221 }
222 
223 static inline void
224 init_vid2socketid_array(int vid, int socketid)
225 {
226 	vid2socketid[vid] = socketid;
227 }
228 
229 static inline bool
230 is_dma_configured(int16_t dev_id)
231 {
232 	int i;
233 
234 	for (i = 0; i < dma_count; i++)
235 		if (dmas_id[i] == dev_id)
236 			return true;
237 	return false;
238 }
239 
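/*
 * Parse one --dmas list, e.g. "[txd0@dmadev0,rxd1@dmadev1]" (device names are
 * illustrative): "txd"/"rxd" selects the enqueue/dequeue direction, the number
 * that follows is the vhost socket index, and the name after '@' is resolved
 * via rte_dma_get_dev_id_by_name(). Each referenced DMA device is configured
 * with a single vchan and started.
 */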
240 static inline int
241 open_dma(const char *value)
242 {
243 	struct dma_for_vhost *dma_info = dma_bind;
244 	char *input = strndup(value, strlen(value) + 1);
245 	char *addrs = input;
246 	char *ptrs[2];
247 	char *start, *end, *substr;
248 	int64_t socketid, vring_id;
249 
250 	struct rte_dma_info info;
251 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
252 	struct rte_dma_vchan_conf qconf = {
253 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
254 		.nb_desc = DMA_RING_SIZE
255 	};
256 
257 	int dev_id;
258 	int ret = 0;
259 	uint16_t i = 0;
260 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
261 	int args_nr;
262 
263 	while (isblank(*addrs))
264 		addrs++;
265 	if (*addrs == '\0') {
266 		ret = -1;
267 		goto out;
268 	}
269 
270 	/* Process the DMA device entries inside the brackets. */
271 	addrs++;
272 	substr = strtok(addrs, ";]");
273 	if (!substr) {
274 		ret = -1;
275 		goto out;
276 	}
277 
278 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
279 	if (args_nr <= 0) {
280 		ret = -1;
281 		goto out;
282 	}
283 
284 	while (i < args_nr) {
285 		char *arg_temp = dma_arg[i];
286 		char *txd, *rxd;
287 		uint8_t sub_nr;
288 		int async_flag;
289 
290 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
291 		if (sub_nr != 2) {
292 			ret = -1;
293 			goto out;
294 		}
295 
296 		txd = strstr(ptrs[0], "txd");
297 		rxd = strstr(ptrs[0], "rxd");
298 		if (txd) {
299 			start = txd;
300 			vring_id = VIRTIO_RXQ;
301 			async_flag = ASYNC_ENQUEUE_VHOST;
302 		} else if (rxd) {
303 			start = rxd;
304 			vring_id = VIRTIO_TXQ;
305 			async_flag = ASYNC_DEQUEUE_VHOST;
306 		} else {
307 			ret = -1;
308 			goto out;
309 		}
310 
311 		start += 3;
312 		socketid = strtol(start, &end, 0);
313 		if (end == start) {
314 			ret = -1;
315 			goto out;
316 		}
317 
318 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
319 		if (dev_id < 0) {
320 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
321 			ret = -1;
322 			goto out;
323 		}
324 
325 		/* DMA device is already configured, so skip */
326 		if (is_dma_configured(dev_id))
327 			goto done;
328 
329 		if (rte_dma_info_get(dev_id, &info) != 0) {
330 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
331 			ret = -1;
332 			goto out;
333 		}
334 
335 		if (info.max_vchans < 1) {
336 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
337 			ret = -1;
338 			goto out;
339 		}
340 
341 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
342 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
343 			ret = -1;
344 			goto out;
345 		}
346 
347 		/* Re-read device info to check the configured vchans and the max descriptors supported. */
348 		rte_dma_info_get(dev_id, &info);
349 		if (info.nb_vchans != 1) {
350 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
351 					dev_id);
352 			ret = -1;
353 			goto out;
354 		}
355 
356 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
357 
358 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
359 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
360 			ret = -1;
361 			goto out;
362 		}
363 
364 		if (rte_dma_start(dev_id) != 0) {
365 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
366 			ret = -1;
367 			goto out;
368 		}
369 
370 		dmas_id[dma_count++] = dev_id;
371 
372 done:
373 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
374 		(dma_info + socketid)->async_flag |= async_flag;
375 		i++;
376 	}
377 out:
378 	free(input);
379 	return ret;
380 }
381 
382 /*
383  * Builds up the correct configuration for VMDQ VLAN pool map
384  * according to the pool & queue limits.
385  */
386 static inline int
387 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
388 {
389 	struct rte_eth_vmdq_rx_conf conf;
390 	struct rte_eth_vmdq_rx_conf *def_conf =
391 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
392 	unsigned i;
393 
394 	memset(&conf, 0, sizeof(conf));
395 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
396 	conf.nb_pool_maps = num_devices;
397 	conf.enable_loop_back = def_conf->enable_loop_back;
398 	conf.rx_mode = def_conf->rx_mode;
399 
400 	for (i = 0; i < conf.nb_pool_maps; i++) {
401 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
402 		conf.pool_map[i].pools = (1UL << i);
403 	}
404 
405 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
406 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
407 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
408 	return 0;
409 }
410 
411 /*
412  * Initialises a given port using global settings, with the Rx buffers
413  * coming from the global mbuf_pool.
414  */
415 static inline int
416 port_init(uint16_t port)
417 {
418 	struct rte_eth_dev_info dev_info;
419 	struct rte_eth_conf port_conf;
420 	struct rte_eth_rxconf *rxconf;
421 	struct rte_eth_txconf *txconf;
422 	int16_t rx_rings, tx_rings;
423 	uint16_t rx_ring_size, tx_ring_size;
424 	int retval;
425 	uint16_t q;
426 
427 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
428 	retval = rte_eth_dev_info_get(port, &dev_info);
429 	if (retval != 0) {
430 		RTE_LOG(ERR, VHOST_PORT,
431 			"Error during getting device (port %u) info: %s\n",
432 			port, strerror(-retval));
433 
434 		return retval;
435 	}
436 	if (dev_info.max_vmdq_pools == 0) {
437 		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
438 		return -1;
439 	}
440 
441 	rxconf = &dev_info.default_rxconf;
442 	txconf = &dev_info.default_txconf;
443 	rxconf->rx_drop_en = 1;
444 
445 	/* Configure the number of supported virtio devices based on VMDq limits. */
446 	num_devices = dev_info.max_vmdq_pools;
447 
448 	rx_ring_size = RX_DESC_DEFAULT;
449 	tx_ring_size = TX_DESC_DEFAULT;
450 
451 	tx_rings = (uint16_t)rte_lcore_count();
452 
453 	if (mergeable) {
454 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
455 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
456 		else
457 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
458 	}
459 
460 	/* Get port configuration. */
461 	retval = get_eth_conf(&port_conf, num_devices);
462 	if (retval < 0)
463 		return retval;
464 	/* NIC queues are divided into PF queues and VMDq queues. */
465 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
466 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
467 	num_vmdq_queues = num_devices * queues_per_pool;
468 	num_queues = num_pf_queues + num_vmdq_queues;
469 	vmdq_queue_base = dev_info.vmdq_queue_base;
470 	vmdq_pool_base  = dev_info.vmdq_pool_base;
471 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
472 		num_pf_queues, num_devices, queues_per_pool);
473 
474 	if (!rte_eth_dev_is_valid_port(port))
475 		return -1;
476 
477 	rx_rings = (uint16_t)dev_info.max_rx_queues;
478 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
479 		port_conf.txmode.offloads |=
480 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
481 	/* Configure ethernet device. */
482 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
483 	if (retval != 0) {
484 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
485 			port, strerror(-retval));
486 		return retval;
487 	}
488 
489 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
490 		&tx_ring_size);
491 	if (retval != 0) {
492 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
493 			"for port %u: %s.\n", port, strerror(-retval));
494 		return retval;
495 	}
496 	if (rx_ring_size > RX_DESC_DEFAULT) {
497 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
498 			"for Rx queues on port %u.\n", port);
499 		return -1;
500 	}
501 
502 	/* Setup the queues. */
503 	rxconf->offloads = port_conf.rxmode.offloads;
504 	for (q = 0; q < rx_rings; q ++) {
505 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
506 						rte_eth_dev_socket_id(port),
507 						rxconf,
508 						mbuf_pool);
509 		if (retval < 0) {
510 			RTE_LOG(ERR, VHOST_PORT,
511 				"Failed to setup rx queue %u of port %u: %s.\n",
512 				q, port, strerror(-retval));
513 			return retval;
514 		}
515 	}
516 	txconf->offloads = port_conf.txmode.offloads;
517 	for (q = 0; q < tx_rings; q ++) {
518 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
519 						rte_eth_dev_socket_id(port),
520 						txconf);
521 		if (retval < 0) {
522 			RTE_LOG(ERR, VHOST_PORT,
523 				"Failed to setup tx queue %u of port %u: %s.\n",
524 				q, port, strerror(-retval));
525 			return retval;
526 		}
527 	}
528 
529 	/* Start the device. */
530 	retval  = rte_eth_dev_start(port);
531 	if (retval < 0) {
532 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
533 			port, strerror(-retval));
534 		return retval;
535 	}
536 
537 	if (promiscuous) {
538 		retval = rte_eth_promiscuous_enable(port);
539 		if (retval != 0) {
540 			RTE_LOG(ERR, VHOST_PORT,
541 				"Failed to enable promiscuous mode on port %u: %s\n",
542 				port, rte_strerror(-retval));
543 			return retval;
544 		}
545 	}
546 
547 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
548 	if (retval < 0) {
549 		RTE_LOG(ERR, VHOST_PORT,
550 			"Failed to get MAC address on port %u: %s\n",
551 			port, rte_strerror(-retval));
552 		return retval;
553 	}
554 
555 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
556 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
557 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
558 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
559 
560 	return 0;
561 }
562 
563 /*
564  * Set socket file path.
565  */
566 static int
567 us_vhost_parse_socket_path(const char *q_arg)
568 {
569 	char *old;
570 
571 	/* Reject socket paths that do not fit in PATH_MAX. */
572 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
573 		return -1;
574 
575 	old = socket_files;
576 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
577 	if (socket_files == NULL) {
578 		free(old);
579 		return -1;
580 	}
581 
582 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
583 	nb_sockets++;
584 
585 	return 0;
586 }
587 
588 /*
589  * Parse the portmask provided at run time.
590  */
591 static int
592 parse_portmask(const char *portmask)
593 {
594 	char *end = NULL;
595 	unsigned long pm;
596 
597 	errno = 0;
598 
599 	/* parse hexadecimal string */
600 	pm = strtoul(portmask, &end, 16);
601 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
602 		return 0;
603 
604 	return pm;
605 
606 }
607 
608 /*
609  * Parse num options at run time.
610  */
611 static int
612 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
613 {
614 	char *end = NULL;
615 	unsigned long num;
616 
617 	errno = 0;
618 
619 	/* parse unsigned int string */
620 	num = strtoul(q_arg, &end, 10);
621 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
622 		return -1;
623 
624 	if (num > max_valid_value)
625 		return -1;
626 
627 	return num;
628 
629 }
630 
631 /*
632  * Display usage
633  */
634 static void
635 us_vhost_usage(const char *prgname)
636 {
637 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
638 	"		--vm2vm [0|1|2]\n"
639 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
640 	"		--socket-file <path>\n"
641 	"		-p PORTMASK: Set mask for ports to be used by application\n"
642 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
643 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
644 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if Rx retries are enabled\n"
645 	"		--rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if Rx retries are enabled\n"
646 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
647 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
648 	"		--socket-file: The path of the socket file.\n"
649 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
650 	"		--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n"
651 	"		--client: register a vhost-user socket as client mode.\n"
652 	"		--dmas: register DMA channels for specific vhost devices.\n"
653 	"		--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
654 	"		--builtin-net-driver: enable simple vhost-user net driver\n",
655 	       prgname);
656 }
657 
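/* Long option names and the values getopt_long() returns for them. */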
658 enum {
659 #define OPT_VM2VM               "vm2vm"
660 	OPT_VM2VM_NUM = 256,
661 #define OPT_RX_RETRY            "rx-retry"
662 	OPT_RX_RETRY_NUM,
663 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
664 	OPT_RX_RETRY_DELAY_NUM,
665 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
666 	OPT_RX_RETRY_NUMB_NUM,
667 #define OPT_MERGEABLE           "mergeable"
668 	OPT_MERGEABLE_NUM,
669 #define OPT_STATS               "stats"
670 	OPT_STATS_NUM,
671 #define OPT_SOCKET_FILE         "socket-file"
672 	OPT_SOCKET_FILE_NUM,
673 #define OPT_TX_CSUM             "tx-csum"
674 	OPT_TX_CSUM_NUM,
675 #define OPT_TSO                 "tso"
676 	OPT_TSO_NUM,
677 #define OPT_CLIENT              "client"
678 	OPT_CLIENT_NUM,
679 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
680 	OPT_BUILTIN_NET_DRIVER_NUM,
681 #define OPT_DMAS                "dmas"
682 	OPT_DMAS_NUM,
683 #define OPT_NUM_MBUFS           "total-num-mbufs"
684 	OPT_NUM_MBUFS_NUM,
685 };
686 
687 /*
688  * Parse the arguments given in the command line of the application.
689  */
690 static int
691 us_vhost_parse_args(int argc, char **argv)
692 {
693 	int opt, ret;
694 	int option_index;
695 	unsigned i;
696 	const char *prgname = argv[0];
697 	static struct option long_option[] = {
698 		{OPT_VM2VM, required_argument,
699 				NULL, OPT_VM2VM_NUM},
700 		{OPT_RX_RETRY, required_argument,
701 				NULL, OPT_RX_RETRY_NUM},
702 		{OPT_RX_RETRY_DELAY, required_argument,
703 				NULL, OPT_RX_RETRY_DELAY_NUM},
704 		{OPT_RX_RETRY_NUMB, required_argument,
705 				NULL, OPT_RX_RETRY_NUMB_NUM},
706 		{OPT_MERGEABLE, required_argument,
707 				NULL, OPT_MERGEABLE_NUM},
708 		{OPT_STATS, required_argument,
709 				NULL, OPT_STATS_NUM},
710 		{OPT_SOCKET_FILE, required_argument,
711 				NULL, OPT_SOCKET_FILE_NUM},
712 		{OPT_TX_CSUM, required_argument,
713 				NULL, OPT_TX_CSUM_NUM},
714 		{OPT_TSO, required_argument,
715 				NULL, OPT_TSO_NUM},
716 		{OPT_CLIENT, no_argument,
717 				NULL, OPT_CLIENT_NUM},
718 		{OPT_BUILTIN_NET_DRIVER, no_argument,
719 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
720 		{OPT_DMAS, required_argument,
721 				NULL, OPT_DMAS_NUM},
722 		{OPT_NUM_MBUFS, required_argument,
723 				NULL, OPT_NUM_MBUFS_NUM},
724 		{NULL, 0, 0, 0},
725 	};
726 
727 	/* Parse command line */
728 	while ((opt = getopt_long(argc, argv, "p:P",
729 			long_option, &option_index)) != EOF) {
730 		switch (opt) {
731 		/* Portmask */
732 		case 'p':
733 			enabled_port_mask = parse_portmask(optarg);
734 			if (enabled_port_mask == 0) {
735 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
736 				us_vhost_usage(prgname);
737 				return -1;
738 			}
739 			break;
740 
741 		case 'P':
742 			promiscuous = 1;
743 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
744 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
745 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
746 			break;
747 
748 		case OPT_VM2VM_NUM:
749 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
750 			if (ret == -1) {
751 				RTE_LOG(INFO, VHOST_CONFIG,
752 					"Invalid argument for "
753 					"vm2vm [0|1|2]\n");
754 				us_vhost_usage(prgname);
755 				return -1;
756 			}
757 			vm2vm_mode = (vm2vm_type)ret;
758 			break;
759 
760 		case OPT_RX_RETRY_NUM:
761 			ret = parse_num_opt(optarg, 1);
762 			if (ret == -1) {
763 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
764 				us_vhost_usage(prgname);
765 				return -1;
766 			}
767 			enable_retry = ret;
768 			break;
769 
770 		case OPT_TX_CSUM_NUM:
771 			ret = parse_num_opt(optarg, 1);
772 			if (ret == -1) {
773 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
774 				us_vhost_usage(prgname);
775 				return -1;
776 			}
777 			enable_tx_csum = ret;
778 			break;
779 
780 		case OPT_TSO_NUM:
781 			ret = parse_num_opt(optarg, 1);
782 			if (ret == -1) {
783 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
784 				us_vhost_usage(prgname);
785 				return -1;
786 			}
787 			enable_tso = ret;
788 			break;
789 
790 		case OPT_RX_RETRY_DELAY_NUM:
791 			ret = parse_num_opt(optarg, INT32_MAX);
792 			if (ret == -1) {
793 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
794 				us_vhost_usage(prgname);
795 				return -1;
796 			}
797 			burst_rx_delay_time = ret;
798 			break;
799 
800 		case OPT_RX_RETRY_NUMB_NUM:
801 			ret = parse_num_opt(optarg, INT32_MAX);
802 			if (ret == -1) {
803 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
804 				us_vhost_usage(prgname);
805 				return -1;
806 			}
807 			burst_rx_retry_num = ret;
808 			break;
809 
810 		case OPT_MERGEABLE_NUM:
811 			ret = parse_num_opt(optarg, 1);
812 			if (ret == -1) {
813 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
814 				us_vhost_usage(prgname);
815 				return -1;
816 			}
817 			mergeable = !!ret;
818 			break;
819 
820 		case OPT_STATS_NUM:
821 			ret = parse_num_opt(optarg, INT32_MAX);
822 			if (ret == -1) {
823 				RTE_LOG(INFO, VHOST_CONFIG,
824 					"Invalid argument for stats [0..N]\n");
825 				us_vhost_usage(prgname);
826 				return -1;
827 			}
828 			enable_stats = ret;
829 			break;
830 
831 		/* Set socket file path. */
832 		case OPT_SOCKET_FILE_NUM:
833 			if (us_vhost_parse_socket_path(optarg) == -1) {
834 				RTE_LOG(INFO, VHOST_CONFIG,
835 				"Invalid argument for socket name (Max %d characters)\n",
836 				PATH_MAX);
837 				us_vhost_usage(prgname);
838 				return -1;
839 			}
840 			break;
841 
842 		case OPT_DMAS_NUM:
843 			if (open_dma(optarg) == -1) {
844 				RTE_LOG(INFO, VHOST_CONFIG,
845 					"Wrong DMA args\n");
846 				us_vhost_usage(prgname);
847 				return -1;
848 			}
849 			break;
850 
851 		case OPT_NUM_MBUFS_NUM:
852 			ret = parse_num_opt(optarg, INT32_MAX);
853 			if (ret == -1) {
854 				RTE_LOG(INFO, VHOST_CONFIG,
855 					"Invalid argument for total-num-mbufs [0..N]\n");
856 				us_vhost_usage(prgname);
857 				return -1;
858 			}
859 
860 			if (total_num_mbufs < ret)
861 				total_num_mbufs = ret;
862 			break;
863 
864 		case OPT_CLIENT_NUM:
865 			client_mode = 1;
866 			break;
867 
868 		case OPT_BUILTIN_NET_DRIVER_NUM:
869 			builtin_net_driver = 1;
870 			break;
871 
872 		/* Invalid option - print options. */
873 		default:
874 			us_vhost_usage(prgname);
875 			return -1;
876 		}
877 	}
878 
879 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
880 		if (enabled_port_mask & (1 << i))
881 			ports[num_ports++] = i;
882 	}
883 
884 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
885 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
886 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
887 		return -1;
888 	}
889 
890 	return 0;
891 }
892 
893 /*
894  * Update the global variable num_ports and the ports array according to the
895  * number of ports in the system, and return the number of valid ports.
896  */
897 static unsigned check_ports_num(unsigned nb_ports)
898 {
899 	unsigned valid_num_ports = num_ports;
900 	unsigned portid;
901 
902 	if (num_ports > nb_ports) {
903 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
904 			num_ports, nb_ports);
905 		num_ports = nb_ports;
906 	}
907 
908 	for (portid = 0; portid < num_ports; portid ++) {
909 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
910 			RTE_LOG(INFO, VHOST_PORT,
911 				"\nSpecified port ID(%u) is not valid\n",
912 				ports[portid]);
913 			ports[portid] = INVALID_PORT_ID;
914 			valid_num_ports--;
915 		}
916 	}
917 	return valid_num_ports;
918 }
919 
920 static __rte_always_inline struct vhost_dev *
921 find_vhost_dev(struct rte_ether_addr *mac)
922 {
923 	struct vhost_dev *vdev;
924 
925 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
926 		if (vdev->ready == DEVICE_RX &&
927 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
928 			return vdev;
929 	}
930 
931 	return NULL;
932 }
933 
934 /*
935  * This function learns the MAC address of the device and registers it, along
936  * with a VLAN tag, in the VMDq pool.
937  */
938 static int
939 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
940 {
941 	struct rte_ether_hdr *pkt_hdr;
942 	int i, ret;
943 
944 	/* Learn MAC address of guest device from packet */
945 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
946 
947 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
948 		RTE_LOG(ERR, VHOST_DATA,
949 			"(%d) device is using a registered MAC!\n",
950 			vdev->vid);
951 		return -1;
952 	}
953 
954 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
955 		vdev->mac_address.addr_bytes[i] =
956 			pkt_hdr->src_addr.addr_bytes[i];
957 
958 	/* vlan_tag currently uses the device_id. */
959 	vdev->vlan_tag = vlan_tags[vdev->vid];
960 
961 	/* Print out VMDQ registration info. */
962 	RTE_LOG(INFO, VHOST_DATA,
963 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
964 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
965 		vdev->vlan_tag);
966 
967 	/* Register the MAC address. */
968 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
969 				(uint32_t)vdev->vid + vmdq_pool_base);
970 	if (ret)
971 		RTE_LOG(ERR, VHOST_DATA,
972 			"(%d) failed to add device MAC address to VMDQ\n",
973 			vdev->vid);
974 
975 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
976 
977 	/* Set device as ready for RX. */
978 	vdev->ready = DEVICE_RX;
979 
980 	return 0;
981 }
982 
983 /*
984  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
985  * queue before disabling RX on the device.
986  */
987 static inline void
988 unlink_vmdq(struct vhost_dev *vdev)
989 {
990 	unsigned i = 0;
991 	unsigned rx_count;
992 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
993 
994 	if (vdev->ready == DEVICE_RX) {
995 		/*clear MAC and VLAN settings*/
996 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
997 		for (i = 0; i < 6; i++)
998 			vdev->mac_address.addr_bytes[i] = 0;
999 
1000 		vdev->vlan_tag = 0;
1001 
1002 		/*Clear out the receive buffers*/
1003 		rx_count = rte_eth_rx_burst(ports[0],
1004 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1005 
1006 		while (rx_count) {
1007 			for (i = 0; i < rx_count; i++)
1008 				rte_pktmbuf_free(pkts_burst[i]);
1009 
1010 			rx_count = rte_eth_rx_burst(ports[0],
1011 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1012 		}
1013 
1014 		vdev->ready = DEVICE_MAC_LEARNING;
1015 	}
1016 }
1017 
1018 static inline void
1019 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1020 {
1021 	while (n--)
1022 		rte_pktmbuf_free(pkts[n]);
1023 }
1024 
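/*
 * Poll for packets whose async (DMA) enqueue to the guest RX virtqueue has
 * completed, and free them.
 */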
1025 static __rte_always_inline void
1026 complete_async_pkts(struct vhost_dev *vdev)
1027 {
1028 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1029 	uint16_t complete_count;
1030 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1031 
1032 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1033 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1034 	if (complete_count)
1035 		free_pkts(p_cpl, complete_count);
1036 
1037 }
1038 
1039 static __rte_always_inline void
1040 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1041 	    struct rte_mbuf *m)
1042 {
1043 	uint16_t ret;
1044 
1045 	if (builtin_net_driver) {
1046 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1047 	} else {
1048 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1049 	}
1050 
1051 	if (enable_stats) {
1052 		__atomic_fetch_add(&dst_vdev->stats.rx_total_atomic, 1,
1053 				__ATOMIC_SEQ_CST);
1054 		__atomic_fetch_add(&dst_vdev->stats.rx_atomic, ret,
1055 				__ATOMIC_SEQ_CST);
1056 		src_vdev->stats.tx_total++;
1057 		src_vdev->stats.tx += ret;
1058 	}
1059 }
1060 
1061 static __rte_always_inline void
1062 drain_vhost(struct vhost_dev *vdev)
1063 {
1064 	uint16_t ret;
1065 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1066 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1067 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1068 
1069 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1070 
1071 	if (enable_stats) {
1072 		__atomic_fetch_add(&vdev->stats.rx_total_atomic, nr_xmit,
1073 				__ATOMIC_SEQ_CST);
1074 		__atomic_fetch_add(&vdev->stats.rx_atomic, ret,
1075 				__ATOMIC_SEQ_CST);
1076 	}
1077 
1078 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1079 		free_pkts(m, nr_xmit);
1080 	} else {
1081 		uint16_t enqueue_fail = nr_xmit - ret;
1082 		if (enqueue_fail > 0)
1083 			free_pkts(&m[ret], enqueue_fail);
1084 	}
1085 }
1086 
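/*
 * Walk all vhost devices and flush any per-lcore TX buffer that has not been
 * drained within MBUF_TABLE_DRAIN_TSC.
 */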
1087 static __rte_always_inline void
1088 drain_vhost_table(void)
1089 {
1090 	uint16_t lcore_id = rte_lcore_id();
1091 	struct vhost_bufftable *vhost_txq;
1092 	struct vhost_dev *vdev;
1093 	uint64_t cur_tsc;
1094 
1095 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1096 		if (unlikely(vdev->remove == 1))
1097 			continue;
1098 
1099 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1100 
1101 		cur_tsc = rte_rdtsc();
1102 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1103 				> MBUF_TABLE_DRAIN_TSC)) {
1104 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1105 				"Vhost TX queue drained after timeout with burst size %u\n",
1106 				vhost_txq->len);
1107 			drain_vhost(vdev);
1108 			vhost_txq->len = 0;
1109 			vhost_txq->pre_tsc = cur_tsc;
1110 		}
1111 	}
1112 }
1113 
1114 /*
1115  * Check if the packet destination MAC address is for a local device. If so then put
1116  * the packet on that device's RX queue. If not, then return.
1117  */
1118 static __rte_always_inline int
1119 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1120 {
1121 	struct rte_ether_hdr *pkt_hdr;
1122 	struct vhost_dev *dst_vdev;
1123 	struct vhost_bufftable *vhost_txq;
1124 	uint16_t lcore_id = rte_lcore_id();
1125 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1126 
1127 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1128 	if (!dst_vdev)
1129 		return -1;
1130 
1131 	if (vdev->vid == dst_vdev->vid) {
1132 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1133 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1134 			vdev->vid);
1135 		return 0;
1136 	}
1137 
1138 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1139 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1140 
1141 	if (unlikely(dst_vdev->remove)) {
1142 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1143 			"(%d) device is marked for removal\n", dst_vdev->vid);
1144 		return 0;
1145 	}
1146 
1147 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1148 	vhost_txq->m_table[vhost_txq->len++] = m;
1149 
1150 	if (enable_stats) {
1151 		vdev->stats.tx_total++;
1152 		vdev->stats.tx++;
1153 	}
1154 
1155 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1156 		drain_vhost(dst_vdev);
1157 		vhost_txq->len = 0;
1158 		vhost_txq->pre_tsc = rte_rdtsc();
1159 	}
1160 	return 0;
1161 }
1162 
1163 /*
1164  * Check if the destination MAC of a packet belongs to a local VM;
1165  * if so, get its VLAN tag and the length offset.
1166  */
1167 static __rte_always_inline int
1168 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1169 	uint32_t *offset, uint16_t *vlan_tag)
1170 {
1171 	struct vhost_dev *dst_vdev;
1172 	struct rte_ether_hdr *pkt_hdr =
1173 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1174 
1175 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1176 	if (!dst_vdev)
1177 		return 0;
1178 
1179 	if (vdev->vid == dst_vdev->vid) {
1180 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1181 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1182 			vdev->vid);
1183 		return -1;
1184 	}
1185 
1186 	/*
1187 	 * HW VLAN strip reduces the packet length by the
1188 	 * length of the VLAN tag, so restore the packet
1189 	 * length by adding it back.
1190 	 */
1191 	*offset  = RTE_VLAN_HLEN;
1192 	*vlan_tag = vlan_tags[vdev->vid];
1193 
1194 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1195 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1196 		vdev->vid, dst_vdev->vid, *vlan_tag);
1197 
1198 	return 0;
1199 }
1200 
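/*
 * Set up TSO/checksum TX offload flags and the pseudo-header checksum for a
 * packet that was received with LRO and is being forwarded to the NIC.
 */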
1201 static void virtio_tx_offload(struct rte_mbuf *m)
1202 {
1203 	struct rte_net_hdr_lens hdr_lens;
1204 	struct rte_ipv4_hdr *ipv4_hdr;
1205 	struct rte_tcp_hdr *tcp_hdr;
1206 	uint32_t ptype;
1207 	void *l3_hdr;
1208 
1209 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1210 	m->l2_len = hdr_lens.l2_len;
1211 	m->l3_len = hdr_lens.l3_len;
1212 	m->l4_len = hdr_lens.l4_len;
1213 
1214 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1215 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1216 		m->l2_len + m->l3_len);
1217 
1218 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1219 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1220 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1221 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1222 		ipv4_hdr = l3_hdr;
1223 		ipv4_hdr->hdr_checksum = 0;
1224 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1225 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1226 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1227 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1228 	}
1229 }
1230 
1231 static __rte_always_inline void
1232 do_drain_mbuf_table(struct mbuf_table *tx_q)
1233 {
1234 	uint16_t count;
1235 
1236 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1237 				 tx_q->m_table, tx_q->len);
1238 	if (unlikely(count < tx_q->len))
1239 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1240 
1241 	tx_q->len = 0;
1242 }
1243 
1244 /*
1245  * This function routes the TX packet to the correct interface. This
1246  * may be a local device or the physical port.
1247  */
1248 static __rte_always_inline void
1249 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1250 {
1251 	struct mbuf_table *tx_q;
1252 	unsigned offset = 0;
1253 	const uint16_t lcore_id = rte_lcore_id();
1254 	struct rte_ether_hdr *nh;
1255 
1256 
1257 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1258 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1259 		struct vhost_dev *vdev2;
1260 
1261 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1262 			if (vdev2 != vdev)
1263 				sync_virtio_xmit(vdev2, vdev, m);
1264 		}
1265 		goto queue2nic;
1266 	}
1267 
1268 	/* Check if the destination is a local VM. */
1269 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1270 		return;
1271 
1272 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1273 		if (unlikely(find_local_dest(vdev, m, &offset,
1274 					     &vlan_tag) != 0)) {
1275 			rte_pktmbuf_free(m);
1276 			return;
1277 		}
1278 	}
1279 
1280 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1281 		"(%d) TX: MAC address is external\n", vdev->vid);
1282 
1283 queue2nic:
1284 
1285 	/*Add packet to the port tx queue*/
1286 	tx_q = &lcore_tx_queue[lcore_id];
1287 
1288 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1289 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1290 		/* Guest has inserted the vlan tag. */
1291 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1292 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1293 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1294 			(vh->vlan_tci != vlan_tag_be))
1295 			vh->vlan_tci = vlan_tag_be;
1296 	} else {
1297 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1298 
1299 		/*
1300 		 * Find the right seg to adjust the data len when offset is
1301 		 * bigger than tail room size.
1302 		 */
1303 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1304 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1305 				m->data_len += offset;
1306 			else {
1307 				struct rte_mbuf *seg = m;
1308 
1309 				while ((seg->next != NULL) &&
1310 					(offset > rte_pktmbuf_tailroom(seg)))
1311 					seg = seg->next;
1312 
1313 				seg->data_len += offset;
1314 			}
1315 			m->pkt_len += offset;
1316 		}
1317 
1318 		m->vlan_tci = vlan_tag;
1319 	}
1320 
1321 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1322 		virtio_tx_offload(m);
1323 
1324 	tx_q->m_table[tx_q->len++] = m;
1325 	if (enable_stats) {
1326 		vdev->stats.tx_total++;
1327 		vdev->stats.tx++;
1328 	}
1329 
1330 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1331 		do_drain_mbuf_table(tx_q);
1332 }
1333 
1334 
1335 static __rte_always_inline void
1336 drain_mbuf_table(struct mbuf_table *tx_q)
1337 {
1338 	static uint64_t prev_tsc;
1339 	uint64_t cur_tsc;
1340 
1341 	if (tx_q->len == 0)
1342 		return;
1343 
1344 	cur_tsc = rte_rdtsc();
1345 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1346 		prev_tsc = cur_tsc;
1347 
1348 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1349 			"TX queue drained after timeout with burst size %u\n",
1350 			tx_q->len);
1351 		do_drain_mbuf_table(tx_q);
1352 	}
1353 }
1354 
1355 uint16_t
1356 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1357 		struct rte_mbuf **pkts, uint32_t rx_count)
1358 {
1359 	uint16_t enqueue_count;
1360 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1361 
1362 	complete_async_pkts(dev);
1363 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1364 					pkts, rx_count, dma_id, 0);
1365 
1366 	return enqueue_count;
1367 }
1368 
1369 uint16_t
1370 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1371 		struct rte_mbuf **pkts, uint32_t rx_count)
1372 {
1373 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1374 }
1375 
1376 static __rte_always_inline void
1377 drain_eth_rx(struct vhost_dev *vdev)
1378 {
1379 	uint16_t rx_count, enqueue_count;
1380 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1381 
1382 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1383 				    pkts, MAX_PKT_BURST);
1384 
1385 	if (!rx_count)
1386 		return;
1387 
1388 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1389 						VIRTIO_RXQ, pkts, rx_count);
1390 
1391 	/* Retry if necessary */
1392 	if (enable_retry && unlikely(enqueue_count < rx_count)) {
1393 		uint32_t retry = 0;
1394 
1395 		while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
1396 			rte_delay_us(burst_rx_delay_time);
1397 			enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1398 							VIRTIO_RXQ, &pkts[enqueue_count],
1399 							rx_count - enqueue_count);
1400 		}
1401 	}
1402 
1403 	if (enable_stats) {
1404 		__atomic_fetch_add(&vdev->stats.rx_total_atomic, rx_count,
1405 				__ATOMIC_SEQ_CST);
1406 		__atomic_fetch_add(&vdev->stats.rx_atomic, enqueue_count,
1407 				__ATOMIC_SEQ_CST);
1408 	}
1409 
1410 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1411 		free_pkts(pkts, rx_count);
1412 	} else {
1413 		uint16_t enqueue_fail = rx_count - enqueue_count;
1414 		if (enqueue_fail > 0)
1415 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1416 	}
1417 }
1418 
1419 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1420 			    struct rte_mempool *mbuf_pool,
1421 			    struct rte_mbuf **pkts, uint16_t count)
1422 {
1423 	int nr_inflight;
1424 	uint16_t dequeue_count;
1425 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1426 
1427 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1428 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1429 
1430 	return dequeue_count;
1431 }
1432 
1433 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1434 			   struct rte_mempool *mbuf_pool,
1435 			   struct rte_mbuf **pkts, uint16_t count)
1436 {
1437 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1438 }
1439 
1440 static __rte_always_inline void
1441 drain_virtio_tx(struct vhost_dev *vdev)
1442 {
1443 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1444 	uint16_t count;
1445 	uint16_t i;
1446 
1447 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1448 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1449 
1450 	/* setup VMDq for the first packet */
1451 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1452 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1453 			free_pkts(pkts, count);
1454 	}
1455 
1456 	for (i = 0; i < count; ++i)
1457 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1458 }
1459 
1460 /*
1461  * Main function of vhost-switch. It basically does:
1462  *
1463  * for each vhost device {
1464  *    - drain_eth_rx()
1465  *
1466  *      Which drains the host eth Rx queue linked to the vhost device,
1467  *      and delivers all of them to the guest virtio Rx ring associated with
1468  *      this vhost device.
1469  *
1470  *    - drain_virtio_tx()
1471  *
1472  *      Which drains the guest virtio Tx queue and delivers all of them
1473  *      to the target, which could be another vhost device, or the
1474  *      physical eth dev. The route is done in function "virtio_tx_route".
1475  * }
1476  */
1477 static int
1478 switch_worker(void *arg __rte_unused)
1479 {
1480 	unsigned i;
1481 	unsigned lcore_id = rte_lcore_id();
1482 	struct vhost_dev *vdev;
1483 	struct mbuf_table *tx_q;
1484 
1485 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1486 
1487 	tx_q = &lcore_tx_queue[lcore_id];
1488 	for (i = 0; i < rte_lcore_count(); i++) {
1489 		if (lcore_ids[i] == lcore_id) {
1490 			tx_q->txq_id = i;
1491 			break;
1492 		}
1493 	}
1494 
1495 	while(1) {
1496 		drain_mbuf_table(tx_q);
1497 		drain_vhost_table();
1498 		/*
1499 		 * Inform the configuration core that we have exited the
1500 		 * linked list and that no devices are in use if requested.
1501 		 */
1502 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1503 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1504 
1505 		/*
1506 		 * Process vhost devices
1507 		 */
1508 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1509 			      lcore_vdev_entry) {
1510 			if (unlikely(vdev->remove)) {
1511 				unlink_vmdq(vdev);
1512 				vdev->ready = DEVICE_SAFE_REMOVE;
1513 				continue;
1514 			}
1515 
1516 			if (likely(vdev->ready == DEVICE_RX))
1517 				drain_eth_rx(vdev);
1518 
1519 			if (likely(!vdev->remove))
1520 				drain_virtio_tx(vdev);
1521 		}
1522 	}
1523 
1524 	return 0;
1525 }
1526 
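/*
 * Drain all in-flight async packets on the given virtqueue and free them
 * (thread-unsafe variant, used from vring_state_changed()).
 */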
1527 static void
1528 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1529 {
1530 	uint16_t n_pkt = 0;
1531 	int pkts_inflight;
1532 
1533 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1534 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1535 
1536 	struct rte_mbuf *m_cpl[pkts_inflight];
1537 
1538 	while (pkts_inflight) {
1539 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1540 							pkts_inflight, dma_id, 0);
1541 		free_pkts(m_cpl, n_pkt);
1542 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1543 									queue_id);
1544 	}
1545 }
1546 
1547 static void
1548 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
1549 {
1550 	uint16_t n_pkt = 0;
1551 	int pkts_inflight;
1552 
1553 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1554 	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1555 
1556 	struct rte_mbuf *m_cpl[pkts_inflight];
1557 
1558 	while (pkts_inflight) {
1559 		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
1560 						pkts_inflight, dma_id, 0);
1561 		free_pkts(m_cpl, n_pkt);
1562 		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1563 	}
1564 }
1565 
1566 /*
1567  * Remove a device from the specific data core linked list and from the
1568  * main linked list. Synchronization occurs through the use of the
1569  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1570  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1571  */
1572 static void
1573 destroy_device(int vid)
1574 {
1575 	struct vhost_dev *vdev = NULL;
1576 	int lcore;
1577 	uint16_t i;
1578 
1579 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1580 		if (vdev->vid == vid)
1581 			break;
1582 	}
1583 	if (!vdev)
1584 		return;
1585 	/*set the remove flag. */
1586 	vdev->remove = 1;
1587 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1588 		rte_pause();
1589 	}
1590 
1591 	for (i = 0; i < RTE_MAX_LCORE; i++)
1592 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1593 
1594 	if (builtin_net_driver)
1595 		vs_vhost_net_remove(vdev);
1596 
1597 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1598 		     lcore_vdev_entry);
1599 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1600 
1601 
1602 	/* Set the dev_removal_flag on each lcore. */
1603 	RTE_LCORE_FOREACH_WORKER(lcore)
1604 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1605 
1606 	/*
1607 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1608 	 * we can be sure that they can no longer access the device removed
1609 	 * from the linked lists and that the devices are no longer in use.
1610 	 */
1611 	RTE_LCORE_FOREACH_WORKER(lcore) {
1612 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1613 			rte_pause();
1614 	}
1615 
1616 	lcore_info[vdev->coreid].device_num--;
1617 
1618 	RTE_LOG(INFO, VHOST_DATA,
1619 		"(%d) device has been removed from data core\n",
1620 		vdev->vid);
1621 
1622 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1623 		vhost_clear_queue(vdev, VIRTIO_RXQ);
1624 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1625 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1626 	}
1627 
1628 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1629 		vhost_clear_queue(vdev, VIRTIO_TXQ);
1630 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1631 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1632 	}
1633 
1634 	rte_free(vdev);
1635 }
1636 
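/* Find the index of the socket file whose path matches this vhost device's ifname. */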
1637 static inline int
1638 get_socketid_by_vid(int vid)
1639 {
1640 	int i;
1641 	char ifname[PATH_MAX];
1642 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1643 
1644 	for (i = 0; i < nb_sockets; i++) {
1645 		char *file = socket_files + i * PATH_MAX;
1646 		if (strcmp(file, ifname) == 0)
1647 			return i;
1648 	}
1649 
1650 	return -1;
1651 }
1652 
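/*
 * Select the enqueue/dequeue callbacks for this device: built-in net driver,
 * async (DMA-accelerated) vhost API, or the synchronous vhost API.
 */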
1653 static int
1654 init_vhost_queue_ops(int vid)
1655 {
1656 	if (builtin_net_driver) {
1657 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1658 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1659 	} else {
1660 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1661 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1662 		else
1663 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1664 
1665 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1666 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1667 		else
1668 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1669 	}
1670 
1671 	return 0;
1672 }
1673 
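/*
 * Register async channels for the RX/TX virtqueues that have a DMA device
 * bound, and mark them async-enabled on success.
 */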
1674 static inline int
1675 vhost_async_channel_register(int vid)
1676 {
1677 	int rx_ret = 0, tx_ret = 0;
1678 
1679 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1680 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1681 		if (rx_ret == 0)
1682 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1683 	}
1684 
1685 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1686 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1687 		if (tx_ret == 0)
1688 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1689 	}
1690 
1691 	return rx_ret | tx_ret;
1692 }
1693 
1694 
1695 
1696 /*
1697  * A new device is added to a data core. First the device is added to the main linked list
1698  * and then allocated to a specific data core.
1699  */
1700 static int
1701 new_device(int vid)
1702 {
1703 	int lcore, core_add = 0;
1704 	uint16_t i;
1705 	uint32_t device_num_min = num_devices;
1706 	struct vhost_dev *vdev;
1707 	int ret;
1708 
1709 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1710 	if (vdev == NULL) {
1711 		RTE_LOG(INFO, VHOST_DATA,
1712 			"(%d) couldn't allocate memory for vhost dev\n",
1713 			vid);
1714 		return -1;
1715 	}
1716 	vdev->vid = vid;
1717 
1718 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1719 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1720 			= rte_zmalloc("vhost bufftable",
1721 				sizeof(struct vhost_bufftable),
1722 				RTE_CACHE_LINE_SIZE);
1723 
1724 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1725 			RTE_LOG(INFO, VHOST_DATA,
1726 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1727 			return -1;
1728 		}
1729 	}
1730 
1731 	int socketid = get_socketid_by_vid(vid);
1732 	if (socketid == -1)
1733 		return -1;
1734 
1735 	init_vid2socketid_array(vid, socketid);
1736 
1737 	ret =  vhost_async_channel_register(vid);
1738 
1739 	if (init_vhost_queue_ops(vid) != 0)
1740 		return -1;
1741 
1742 	if (builtin_net_driver)
1743 		vs_vhost_net_setup(vdev);
1744 
1745 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1746 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1747 
1748 	/*reset ready flag*/
1749 	vdev->ready = DEVICE_MAC_LEARNING;
1750 	vdev->remove = 0;
1751 
1752 	/* Find a suitable lcore to add the device. */
1753 	RTE_LCORE_FOREACH_WORKER(lcore) {
1754 		if (lcore_info[lcore].device_num < device_num_min) {
1755 			device_num_min = lcore_info[lcore].device_num;
1756 			core_add = lcore;
1757 		}
1758 	}
1759 	vdev->coreid = core_add;
1760 
1761 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1762 			  lcore_vdev_entry);
1763 	lcore_info[vdev->coreid].device_num++;
1764 
1765 	/* Disable notifications. */
1766 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1767 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1768 
1769 	RTE_LOG(INFO, VHOST_DATA,
1770 		"(%d) device has been added to data core %d\n",
1771 		vid, vdev->coreid);
1772 
1773 	return ret;
1774 }
1775 
1776 static int
1777 vring_state_changed(int vid, uint16_t queue_id, int enable)
1778 {
1779 	struct vhost_dev *vdev = NULL;
1780 
1781 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1782 		if (vdev->vid == vid)
1783 			break;
1784 	}
1785 	if (!vdev)
1786 		return -1;
1787 
1788 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1789 		if (!enable)
1790 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1791 	}
1792 
1793 	return 0;
1794 }
1795 
1796 /*
1797  * These callbacks allow devices to be added to the data core once configuration
1798  * has fully completed.
1799  */
1800 static const struct rte_vhost_device_ops virtio_net_device_ops =
1801 {
1802 	.new_device =  new_device,
1803 	.destroy_device = destroy_device,
1804 	.vring_state_changed = vring_state_changed,
1805 };
1806 
1807 /*
1808  * This thread wakes up periodically to print stats if the user has
1809  * enabled them.
1810  */
1811 static uint32_t
1812 print_stats(__rte_unused void *arg)
1813 {
1814 	struct vhost_dev *vdev;
1815 	uint64_t tx_dropped, rx_dropped;
1816 	uint64_t tx, tx_total, rx, rx_total;
1817 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1818 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1819 
1820 	while(1) {
1821 		sleep(enable_stats);
1822 
1823 		/* Clear screen and move to top left */
1824 		printf("%s%s\n", clr, top_left);
1825 		printf("Device statistics =================================\n");
1826 
1827 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1828 			tx_total   = vdev->stats.tx_total;
1829 			tx         = vdev->stats.tx;
1830 			tx_dropped = tx_total - tx;
1831 
1832 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1833 				__ATOMIC_SEQ_CST);
1834 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1835 				__ATOMIC_SEQ_CST);
1836 			rx_dropped = rx_total - rx;
1837 
1838 			printf("Statistics for device %d\n"
1839 				"-----------------------\n"
1840 				"TX total:              %" PRIu64 "\n"
1841 				"TX dropped:            %" PRIu64 "\n"
1842 				"TX successful:         %" PRIu64 "\n"
1843 				"RX total:              %" PRIu64 "\n"
1844 				"RX dropped:            %" PRIu64 "\n"
1845 				"RX successful:         %" PRIu64 "\n",
1846 				vdev->vid,
1847 				tx_total, tx_dropped, tx,
1848 				rx_total, rx_dropped, rx);
1849 		}
1850 
1851 		printf("===================================================\n");
1852 
1853 		fflush(stdout);
1854 	}
1855 
1856 	return 0;
1857 }
1858 
1859 static void
1860 unregister_drivers(int socket_num)
1861 {
1862 	int i, ret;
1863 
1864 	for (i = 0; i < socket_num; i++) {
1865 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1866 		if (ret != 0)
1867 			RTE_LOG(ERR, VHOST_CONFIG,
1868 				"Fail to unregister vhost driver for %s.\n",
1869 				socket_files + i * PATH_MAX);
1870 	}
1871 }
1872 
1873 /* When we receive a SIGINT, unregister the vhost driver. */
1874 static void
1875 sigint_handler(__rte_unused int signum)
1876 {
1877 	/* Unregister vhost driver. */
1878 	unregister_drivers(nb_sockets);
1879 
1880 	exit(0);
1881 }
1882 
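/*
 * Mark all DMA bindings as invalid and async-disabled; called before the
 * --dmas argument is parsed.
 */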
1883 static void
1884 reset_dma(void)
1885 {
1886 	int i;
1887 
1888 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1889 		int j;
1890 
1891 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1892 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1893 			dma_bind[i].dmas[j].async_enabled = false;
1894 		}
1895 	}
1896 
1897 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1898 		dmas_id[i] = INVALID_DMA_ID;
1899 }
1900 
1901 /*
1902  * Main function, does initialisation and calls the per-lcore functions.
1903  */
1904 int
1905 main(int argc, char *argv[])
1906 {
1907 	unsigned lcore_id, core_id = 0;
1908 	unsigned nb_ports, valid_num_ports;
1909 	int ret, i;
1910 	uint16_t portid;
1911 	rte_thread_t tid;
1912 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1913 
1914 	signal(SIGINT, sigint_handler);
1915 
1916 	/* init EAL */
1917 	ret = rte_eal_init(argc, argv);
1918 	if (ret < 0)
1919 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1920 	argc -= ret;
1921 	argv += ret;
1922 
1923 	/* initialize dma structures */
1924 	reset_dma();
1925 
1926 	/* parse app arguments */
1927 	ret = us_vhost_parse_args(argc, argv);
1928 	if (ret < 0)
1929 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1930 
1931 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1932 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1933 
1934 		if (rte_lcore_is_enabled(lcore_id))
1935 			lcore_ids[core_id++] = lcore_id;
1936 	}
1937 
1938 	if (rte_lcore_count() > RTE_MAX_LCORE)
1939 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1940 
1941 	/* Get the number of physical ports. */
1942 	nb_ports = rte_eth_dev_count_avail();
1943 
1944 	/*
1945 	 * Update the global variable num_ports and the global ports array,
1946 	 * and get the number of valid ports according to the system port count.
1947 	 */
1948 	valid_num_ports = check_ports_num(nb_ports);
1949 
1950 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1951 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1952 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1953 		return -1;
1954 	}
1955 
1956 	/*
1957 	 * FIXME: here we are trying to allocate mbufs big enough for
1958 	 * @MAX_QUEUES, but the truth is we're never going to use that
1959 	 * many queues here. We probably should only do allocation for
1960 	 * those queues we are going to use.
1961 	 */
1962 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1963 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1964 					    rte_socket_id());
1965 	if (mbuf_pool == NULL)
1966 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1967 
1968 	if (vm2vm_mode == VM2VM_HARDWARE) {
1969 		/* Enable VT loop back to let L2 switch to do it. */
1970 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1971 		RTE_LOG(DEBUG, VHOST_CONFIG,
1972 			"Enable loop back for L2 switch in vmdq.\n");
1973 	}
1974 
1975 	/* initialize all ports */
1976 	RTE_ETH_FOREACH_DEV(portid) {
1977 		/* skip ports that are not enabled */
1978 		if ((enabled_port_mask & (1 << portid)) == 0) {
1979 			RTE_LOG(INFO, VHOST_PORT,
1980 				"Skipping disabled port %d\n", portid);
1981 			continue;
1982 		}
1983 		if (port_init(portid) != 0)
1984 			rte_exit(EXIT_FAILURE,
1985 				"Cannot initialize network ports\n");
1986 	}
1987 
1988 	/* Enable stats if the user option is set. */
1989 	if (enable_stats) {
1990 		ret = rte_thread_create_control(&tid, "dpdk-vhost-stat",
1991 					print_stats, NULL);
1992 		if (ret < 0)
1993 			rte_exit(EXIT_FAILURE,
1994 				"Cannot create dpdk-vhost-stat thread\n");
1995 	}
1996 
1997 	/* Launch all data cores. */
1998 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1999 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
2000 
2001 	if (client_mode)
2002 		flags |= RTE_VHOST_USER_CLIENT;
2003 
2004 	for (i = 0; i < dma_count; i++) {
2005 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
2006 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
2007 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2008 		}
2009 	}
2010 
2011 	/* Register vhost user driver to handle vhost messages. */
2012 	for (i = 0; i < nb_sockets; i++) {
2013 		char *file = socket_files + i * PATH_MAX;
2014 
2015 		if (dma_count && get_async_flag_by_socketid(i) != 0)
2016 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
2017 
2018 		ret = rte_vhost_driver_register(file, flags);
2019 		if (ret != 0) {
2020 			unregister_drivers(i);
2021 			rte_exit(EXIT_FAILURE,
2022 				"vhost driver register failure.\n");
2023 		}
2024 
2025 		if (builtin_net_driver)
2026 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2027 
2028 		if (mergeable == 0) {
2029 			rte_vhost_driver_disable_features(file,
2030 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2031 		}
2032 
2033 		if (enable_tx_csum == 0) {
2034 			rte_vhost_driver_disable_features(file,
2035 				1ULL << VIRTIO_NET_F_CSUM);
2036 		}
2037 
2038 		if (enable_tso == 0) {
2039 			rte_vhost_driver_disable_features(file,
2040 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2041 			rte_vhost_driver_disable_features(file,
2042 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2043 			rte_vhost_driver_disable_features(file,
2044 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2045 			rte_vhost_driver_disable_features(file,
2046 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2047 		}
2048 
2049 		if (promiscuous) {
2050 			rte_vhost_driver_enable_features(file,
2051 				1ULL << VIRTIO_NET_F_CTRL_RX);
2052 		}
2053 
2054 		ret = rte_vhost_driver_callback_register(file,
2055 			&virtio_net_device_ops);
2056 		if (ret != 0) {
2057 			rte_exit(EXIT_FAILURE,
2058 				"failed to register vhost driver callbacks.\n");
2059 		}
2060 
2061 		if (rte_vhost_driver_start(file) < 0) {
2062 			rte_exit(EXIT_FAILURE,
2063 				"failed to start vhost driver.\n");
2064 		}
2065 	}
2066 
2067 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2068 		rte_eal_wait_lcore(lcore_id);
2069 
2070 	for (i = 0; i < dma_count; i++) {
2071 		if (rte_vhost_async_dma_unconfigure(dmas_id[i], 0) < 0) {
2072 			RTE_LOG(ERR, VHOST_PORT,
2073 				"Failed to unconfigure DMA %d in vhost.\n", dmas_id[i]);
2074 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2075 		}
2076 	}
2077 
2078 	/* clean up the EAL */
2079 	rte_eal_cleanup();
2080 
2081 	return 0;
2082 }
2083