xref: /dpdk/examples/vhost/main.c (revision 8f1d23ece06adff5eae9f1b4365bdbbd3abee2b2)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <ctype.h>
6 #include <arpa/inet.h>
7 #include <getopt.h>
8 #include <linux/if_ether.h>
9 #include <linux/if_vlan.h>
10 #include <linux/virtio_net.h>
11 #include <linux/virtio_ring.h>
12 #include <signal.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <sys/eventfd.h>
16 #include <sys/param.h>
17 #include <unistd.h>
18 
19 #include <rte_cycles.h>
20 #include <rte_ethdev.h>
21 #include <rte_log.h>
22 #include <rte_string_fns.h>
23 #include <rte_malloc.h>
24 #include <rte_net.h>
25 #include <rte_vhost.h>
26 #include <rte_ip.h>
27 #include <rte_tcp.h>
28 #include <rte_pause.h>
29 #include <rte_dmadev.h>
30 #include <rte_vhost_async.h>
31 
32 #include "main.h"
33 
34 #ifndef MAX_QUEUES
35 #define MAX_QUEUES 128
36 #endif
37 
38 #define NUM_MBUFS_DEFAULT 0x24000
39 
40 /* the maximum number of external ports supported */
41 #define MAX_SUP_PORTS 1
42 
43 #define MBUF_CACHE_SIZE	128
44 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
45 
46 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
47 
48 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
49 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
50 
51 #define JUMBO_FRAME_MAX_SIZE    0x2600
52 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
53 
54 /* State of virtio device. */
55 #define DEVICE_MAC_LEARNING 0
56 #define DEVICE_RX			1
57 #define DEVICE_SAFE_REMOVE	2
58 
59 /* Configurable number of RX/TX ring descriptors */
60 #define RTE_TEST_RX_DESC_DEFAULT 1024
61 #define RTE_TEST_TX_DESC_DEFAULT 512
62 
63 #define INVALID_PORT_ID 0xFF
64 #define INVALID_DMA_ID -1
65 
66 #define DMA_RING_SIZE 4096
67 
68 #define ASYNC_ENQUEUE_VHOST 1
69 #define ASYNC_DEQUEUE_VHOST 2
70 
71 /* number of mbufs in all pools - if specified on command-line. */
72 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
73 
74 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
75 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
76 static int dma_count;
77 
78 /* mask of enabled ports */
79 static uint32_t enabled_port_mask = 0;
80 
81 /* Promiscuous mode */
82 static uint32_t promiscuous;
83 
84 /* number of devices/queues to support */
85 static uint32_t num_queues = 0;
86 static uint32_t num_devices;
87 
88 static struct rte_mempool *mbuf_pool;
89 static int mergeable;
90 
91 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
92 typedef enum {
93 	VM2VM_DISABLED = 0,
94 	VM2VM_SOFTWARE = 1,
95 	VM2VM_HARDWARE = 2,
96 	VM2VM_LAST
97 } vm2vm_type;
98 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
99 
100 /* Enable stats. */
101 static uint32_t enable_stats = 0;
102 /* Enable retries on RX. */
103 static uint32_t enable_retry = 1;
104 
105 /* Disable TX checksum offload */
106 static uint32_t enable_tx_csum;
107 
108 /* Disable TSO offload */
109 static uint32_t enable_tso;
110 
111 static int client_mode;
112 
113 static int builtin_net_driver;
114 
115 /* Specify timeout (in microseconds) between retries on RX. */
116 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
117 /* Specify the number of retries on RX. */
118 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
119 
120 /* Socket file paths. Can be set by user */
121 static char *socket_files;
122 static int nb_sockets;
123 
124 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
125 
126 /* empty VMDq configuration structure. Filled in programmatically */
127 static struct rte_eth_conf vmdq_conf_default = {
128 	.rxmode = {
129 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
130 		.split_hdr_size = 0,
131 		/*
132 		 * VLAN strip is necessary for 1G NICs such as the I350;
133 		 * it fixes a bug where IPv4 forwarding in the guest could not
134 		 * forward packets from one virtio device to another.
135 		 */
136 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
137 	},
138 
139 	.txmode = {
140 		.mq_mode = RTE_ETH_MQ_TX_NONE,
141 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
142 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
143 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
144 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
145 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
146 	},
147 	.rx_adv_conf = {
148 		/*
149 		 * should be overridden separately in code with
150 		 * appropriate values
151 		 */
152 		.vmdq_rx_conf = {
153 			.nb_queue_pools = RTE_ETH_8_POOLS,
154 			.enable_default_pool = 0,
155 			.default_pool = 0,
156 			.nb_pool_maps = 0,
157 			.pool_map = {{0, 0},},
158 		},
159 	},
160 };
161 
162 
163 static unsigned lcore_ids[RTE_MAX_LCORE];
164 static uint16_t ports[RTE_MAX_ETHPORTS];
165 static unsigned num_ports = 0; /**< The number of ports specified in command line */
166 static uint16_t num_pf_queues, num_vmdq_queues;
167 static uint16_t vmdq_pool_base, vmdq_queue_base;
168 static uint16_t queues_per_pool;
169 
170 const uint16_t vlan_tags[] = {
171 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
172 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
173 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
174 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
175 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
176 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
177 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
178 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
179 };
180 
181 /* ethernet addresses of ports */
182 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
183 
184 static struct vhost_dev_tailq_list vhost_dev_list =
185 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
186 
187 static struct lcore_info lcore_info[RTE_MAX_LCORE];
188 
189 /* Used for queueing bursts of TX packets. */
190 struct mbuf_table {
191 	unsigned len;
192 	unsigned txq_id;
193 	struct rte_mbuf *m_table[MAX_PKT_BURST];
194 };
195 
196 struct vhost_bufftable {
197 	uint32_t len;
198 	uint64_t pre_tsc;
199 	struct rte_mbuf *m_table[MAX_PKT_BURST];
200 };
201 
202 /* TX queue for each data core. */
203 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
204 
205 /*
206  * Vhost TX buffer for each data core.
207  * Every data core maintains a TX buffer for every vhost device,
208  * which is used for batch pkts enqueue for higher performance.
209  */
210 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
211 
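/* Number of TSC cycles in BURST_TX_DRAIN_US microseconds: cycles per microsecond (rounded up) times the drain interval. */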
212 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
213 				 / US_PER_S * BURST_TX_DRAIN_US)
214 
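/* Maps a vhost device ID (vid) to the index of its socket file in socket_files. */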
215 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
216 
217 static inline uint32_t
218 get_async_flag_by_socketid(int socketid)
219 {
220 	return dma_bind[socketid].async_flag;
221 }
222 
223 static inline void
224 init_vid2socketid_array(int vid, int socketid)
225 {
226 	vid2socketid[vid] = socketid;
227 }
228 
229 static inline bool
230 is_dma_configured(int16_t dev_id)
231 {
232 	int i;
233 
234 	for (i = 0; i < dma_count; i++)
235 		if (dmas_id[i] == dev_id)
236 			return true;
237 	return false;
238 }
239 
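/*
 * Parse the --dmas argument, e.g. "[txd0@0000:00:04.0,rxd0@0000:00:04.1]":
 * "txdN" binds a DMA device to the enqueue (virtio RX) path of socket N,
 * "rxdN" binds one to the dequeue (virtio TX) path. Each DMA device is
 * configured with a single vchan and started the first time it is seen.
 */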
240 static inline int
241 open_dma(const char *value)
242 {
243 	struct dma_for_vhost *dma_info = dma_bind;
244 	char *input = strndup(value, strlen(value) + 1);
245 	char *addrs = input;
246 	char *ptrs[2];
247 	char *start, *end, *substr;
248 	int64_t socketid, vring_id;
249 
250 	struct rte_dma_info info;
251 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
252 	struct rte_dma_vchan_conf qconf = {
253 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
254 		.nb_desc = DMA_RING_SIZE
255 	};
256 
257 	int dev_id;
258 	int ret = 0;
259 	uint16_t i = 0;
260 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
261 	int args_nr;
262 
263 	while (isblank(*addrs))
264 		addrs++;
265 	if (*addrs == '\0') {
266 		ret = -1;
267 		goto out;
268 	}
269 
270 	/* process DMA devices within bracket. */
271 	addrs++;
272 	substr = strtok(addrs, ";]");
273 	if (!substr) {
274 		ret = -1;
275 		goto out;
276 	}
277 
278 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
279 	if (args_nr <= 0) {
280 		ret = -1;
281 		goto out;
282 	}
283 
284 	while (i < args_nr) {
285 		char *arg_temp = dma_arg[i];
286 		char *txd, *rxd;
287 		uint8_t sub_nr;
288 		int async_flag;
289 
290 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
291 		if (sub_nr != 2) {
292 			ret = -1;
293 			goto out;
294 		}
295 
296 		txd = strstr(ptrs[0], "txd");
297 		rxd = strstr(ptrs[0], "rxd");
298 		if (txd) {
299 			start = txd;
300 			vring_id = VIRTIO_RXQ;
301 			async_flag = ASYNC_ENQUEUE_VHOST;
302 		} else if (rxd) {
303 			start = rxd;
304 			vring_id = VIRTIO_TXQ;
305 			async_flag = ASYNC_DEQUEUE_VHOST;
306 		} else {
307 			ret = -1;
308 			goto out;
309 		}
310 
311 		start += 3;
312 		socketid = strtol(start, &end, 0);
313 		if (end == start) {
314 			ret = -1;
315 			goto out;
316 		}
317 
318 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
319 		if (dev_id < 0) {
320 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
321 			ret = -1;
322 			goto out;
323 		}
324 
325 		/* DMA device is already configured, so skip */
326 		if (is_dma_configured(dev_id))
327 			goto done;
328 
329 		if (rte_dma_info_get(dev_id, &info) != 0) {
330 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
331 			ret = -1;
332 			goto out;
333 		}
334 
335 		if (info.max_vchans < 1) {
336 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
337 			ret = -1;
338 			goto out;
339 		}
340 
341 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
342 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
343 			ret = -1;
344 			goto out;
345 		}
346 
347 		/* Check the max desc supported by DMA device */
348 		rte_dma_info_get(dev_id, &info);
349 		if (info.nb_vchans != 1) {
350 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
351 					dev_id);
352 			ret = -1;
353 			goto out;
354 		}
355 
356 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
357 
358 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
359 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
360 			ret = -1;
361 			goto out;
362 		}
363 
364 		if (rte_dma_start(dev_id) != 0) {
365 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
366 			ret = -1;
367 			goto out;
368 		}
369 
370 		dmas_id[dma_count++] = dev_id;
371 
372 done:
373 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
374 		(dma_info + socketid)->async_flag |= async_flag;
375 		i++;
376 	}
377 out:
378 	free(input);
379 	return ret;
380 }
381 
382 /*
383  * Builds up the correct configuration for VMDQ VLAN pool map
384  * according to the pool & queue limits.
385  */
386 static inline int
387 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
388 {
389 	struct rte_eth_vmdq_rx_conf conf;
390 	struct rte_eth_vmdq_rx_conf *def_conf =
391 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
392 	unsigned i;
393 
394 	memset(&conf, 0, sizeof(conf));
395 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
396 	conf.nb_pool_maps = num_devices;
397 	conf.enable_loop_back = def_conf->enable_loop_back;
398 	conf.rx_mode = def_conf->rx_mode;
399 
400 	for (i = 0; i < conf.nb_pool_maps; i++) {
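	/* Give each virtio device its own VMDq pool: pool i receives VLAN vlan_tags[i]. */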
401 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
402 		conf.pool_map[i].pools = (1UL << i);
403 	}
404 
405 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
406 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
407 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
408 	return 0;
409 }
410 
411 /*
412  * Initialises a given port using global settings, with the Rx buffers
413  * coming from the global mbuf_pool.
414  */
415 static inline int
416 port_init(uint16_t port)
417 {
418 	struct rte_eth_dev_info dev_info;
419 	struct rte_eth_conf port_conf;
420 	struct rte_eth_rxconf *rxconf;
421 	struct rte_eth_txconf *txconf;
422 	int16_t rx_rings, tx_rings;
423 	uint16_t rx_ring_size, tx_ring_size;
424 	int retval;
425 	uint16_t q;
426 
427 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
428 	retval = rte_eth_dev_info_get(port, &dev_info);
429 	if (retval != 0) {
430 		RTE_LOG(ERR, VHOST_PORT,
431 			"Error during getting device (port %u) info: %s\n",
432 			port, strerror(-retval));
433 
434 		return retval;
435 	}
436 	if (dev_info.max_vmdq_pools == 0) {
437 		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
438 		return -1;
439 	}
440 
441 	rxconf = &dev_info.default_rxconf;
442 	txconf = &dev_info.default_txconf;
443 	rxconf->rx_drop_en = 1;
444 
445 	/* Configure the number of supported virtio devices based on VMDQ limits. */
446 	num_devices = dev_info.max_vmdq_pools;
447 
448 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
449 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
450 
451 	tx_rings = (uint16_t)rte_lcore_count();
452 
453 	if (mergeable) {
454 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
455 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
456 		else
457 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
458 	}
459 
460 	/* Get port configuration. */
461 	retval = get_eth_conf(&port_conf, num_devices);
462 	if (retval < 0)
463 		return retval;
464 	/* NIC queues are divided into pf queues and vmdq queues.  */
465 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
466 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
467 	num_vmdq_queues = num_devices * queues_per_pool;
468 	num_queues = num_pf_queues + num_vmdq_queues;
469 	vmdq_queue_base = dev_info.vmdq_queue_base;
470 	vmdq_pool_base  = dev_info.vmdq_pool_base;
471 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
472 		num_pf_queues, num_devices, queues_per_pool);
473 
474 	if (!rte_eth_dev_is_valid_port(port))
475 		return -1;
476 
477 	rx_rings = (uint16_t)dev_info.max_rx_queues;
478 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
479 		port_conf.txmode.offloads |=
480 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
481 	/* Configure ethernet device. */
482 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
483 	if (retval != 0) {
484 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
485 			port, strerror(-retval));
486 		return retval;
487 	}
488 
489 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
490 		&tx_ring_size);
491 	if (retval != 0) {
492 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
493 			"for port %u: %s.\n", port, strerror(-retval));
494 		return retval;
495 	}
496 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
497 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
498 			"for Rx queues on port %u.\n", port);
499 		return -1;
500 	}
501 
502 	/* Setup the queues. */
503 	rxconf->offloads = port_conf.rxmode.offloads;
504 	for (q = 0; q < rx_rings; q ++) {
505 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
506 						rte_eth_dev_socket_id(port),
507 						rxconf,
508 						mbuf_pool);
509 		if (retval < 0) {
510 			RTE_LOG(ERR, VHOST_PORT,
511 				"Failed to setup rx queue %u of port %u: %s.\n",
512 				q, port, strerror(-retval));
513 			return retval;
514 		}
515 	}
516 	txconf->offloads = port_conf.txmode.offloads;
517 	for (q = 0; q < tx_rings; q ++) {
518 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
519 						rte_eth_dev_socket_id(port),
520 						txconf);
521 		if (retval < 0) {
522 			RTE_LOG(ERR, VHOST_PORT,
523 				"Failed to setup tx queue %u of port %u: %s.\n",
524 				q, port, strerror(-retval));
525 			return retval;
526 		}
527 	}
528 
529 	/* Start the device. */
530 	retval  = rte_eth_dev_start(port);
531 	if (retval < 0) {
532 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
533 			port, strerror(-retval));
534 		return retval;
535 	}
536 
537 	if (promiscuous) {
538 		retval = rte_eth_promiscuous_enable(port);
539 		if (retval != 0) {
540 			RTE_LOG(ERR, VHOST_PORT,
541 				"Failed to enable promiscuous mode on port %u: %s\n",
542 				port, rte_strerror(-retval));
543 			return retval;
544 		}
545 	}
546 
547 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
548 	if (retval < 0) {
549 		RTE_LOG(ERR, VHOST_PORT,
550 			"Failed to get MAC address on port %u: %s\n",
551 			port, rte_strerror(-retval));
552 		return retval;
553 	}
554 
555 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
556 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
557 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
558 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
559 
560 	return 0;
561 }
562 
563 /*
564  * Set socket file path.
565  */
566 static int
567 us_vhost_parse_socket_path(const char *q_arg)
568 {
569 	char *old;
570 
571 	/* reject paths that are too long to fit in a PATH_MAX buffer */
572 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
573 		return -1;
574 
575 	old = socket_files;
576 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
577 	if (socket_files == NULL) {
578 		free(old);
579 		return -1;
580 	}
581 
582 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
583 	nb_sockets++;
584 
585 	return 0;
586 }
587 
588 /*
589  * Parse the portmask provided at run time.
590  */
591 static int
592 parse_portmask(const char *portmask)
593 {
594 	char *end = NULL;
595 	unsigned long pm;
596 
597 	errno = 0;
598 
599 	/* parse hexadecimal string */
600 	pm = strtoul(portmask, &end, 16);
601 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
602 		return 0;
603 
604 	return pm;
605 
606 }
607 
608 /*
609  * Parse num options at run time.
610  */
611 static int
612 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
613 {
614 	char *end = NULL;
615 	unsigned long num;
616 
617 	errno = 0;
618 
619 	/* parse unsigned int string */
620 	num = strtoul(q_arg, &end, 10);
621 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
622 		return -1;
623 
624 	if (num > max_valid_value)
625 		return -1;
626 
627 	return num;
628 
629 }
630 
631 /*
632  * Display usage
633  */
634 static void
635 us_vhost_usage(const char *prgname)
636 {
637 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
638 	"		--vm2vm [0|1|2]\n"
639 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
640 	"		--socket-file <path>\n"
641 	"		-p PORTMASK: Set mask for ports to be used by application\n"
642 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
643 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
644 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
645 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
646 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
647 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
648 	"		--socket-file: The path of the socket file.\n"
649 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
650 	"		--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n"
651 	"		--client: register a vhost-user socket as client mode.\n"
652 	"		--dmas: register dma channel for specific vhost device.\n"
653 	"		--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
654 	"		--builtin-net-driver: enable simple vhost-user net driver\n",
655 	       prgname);
656 }
657 
658 enum {
659 #define OPT_VM2VM               "vm2vm"
660 	OPT_VM2VM_NUM = 256,
661 #define OPT_RX_RETRY            "rx-retry"
662 	OPT_RX_RETRY_NUM,
663 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
664 	OPT_RX_RETRY_DELAY_NUM,
665 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
666 	OPT_RX_RETRY_NUMB_NUM,
667 #define OPT_MERGEABLE           "mergeable"
668 	OPT_MERGEABLE_NUM,
669 #define OPT_STATS               "stats"
670 	OPT_STATS_NUM,
671 #define OPT_SOCKET_FILE         "socket-file"
672 	OPT_SOCKET_FILE_NUM,
673 #define OPT_TX_CSUM             "tx-csum"
674 	OPT_TX_CSUM_NUM,
675 #define OPT_TSO                 "tso"
676 	OPT_TSO_NUM,
677 #define OPT_CLIENT              "client"
678 	OPT_CLIENT_NUM,
679 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
680 	OPT_BUILTIN_NET_DRIVER_NUM,
681 #define OPT_DMAS                "dmas"
682 	OPT_DMAS_NUM,
683 #define OPT_NUM_MBUFS           "total-num-mbufs"
684 	OPT_NUM_MBUFS_NUM,
685 };
686 
687 /*
688  * Parse the arguments given in the command line of the application.
689  */
690 static int
691 us_vhost_parse_args(int argc, char **argv)
692 {
693 	int opt, ret;
694 	int option_index;
695 	unsigned i;
696 	const char *prgname = argv[0];
697 	static struct option long_option[] = {
698 		{OPT_VM2VM, required_argument,
699 				NULL, OPT_VM2VM_NUM},
700 		{OPT_RX_RETRY, required_argument,
701 				NULL, OPT_RX_RETRY_NUM},
702 		{OPT_RX_RETRY_DELAY, required_argument,
703 				NULL, OPT_RX_RETRY_DELAY_NUM},
704 		{OPT_RX_RETRY_NUMB, required_argument,
705 				NULL, OPT_RX_RETRY_NUMB_NUM},
706 		{OPT_MERGEABLE, required_argument,
707 				NULL, OPT_MERGEABLE_NUM},
708 		{OPT_STATS, required_argument,
709 				NULL, OPT_STATS_NUM},
710 		{OPT_SOCKET_FILE, required_argument,
711 				NULL, OPT_SOCKET_FILE_NUM},
712 		{OPT_TX_CSUM, required_argument,
713 				NULL, OPT_TX_CSUM_NUM},
714 		{OPT_TSO, required_argument,
715 				NULL, OPT_TSO_NUM},
716 		{OPT_CLIENT, no_argument,
717 				NULL, OPT_CLIENT_NUM},
718 		{OPT_BUILTIN_NET_DRIVER, no_argument,
719 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
720 		{OPT_DMAS, required_argument,
721 				NULL, OPT_DMAS_NUM},
722 		{OPT_NUM_MBUFS, required_argument,
723 				NULL, OPT_NUM_MBUFS_NUM},
724 		{NULL, 0, 0, 0},
725 	};
726 
727 	/* Parse command line */
728 	while ((opt = getopt_long(argc, argv, "p:P",
729 			long_option, &option_index)) != EOF) {
730 		switch (opt) {
731 		/* Portmask */
732 		case 'p':
733 			enabled_port_mask = parse_portmask(optarg);
734 			if (enabled_port_mask == 0) {
735 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
736 				us_vhost_usage(prgname);
737 				return -1;
738 			}
739 			break;
740 
741 		case 'P':
742 			promiscuous = 1;
743 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
744 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
745 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
746 			break;
747 
748 		case OPT_VM2VM_NUM:
749 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
750 			if (ret == -1) {
751 				RTE_LOG(INFO, VHOST_CONFIG,
752 					"Invalid argument for "
753 					"vm2vm [0|1|2]\n");
754 				us_vhost_usage(prgname);
755 				return -1;
756 			}
757 			vm2vm_mode = (vm2vm_type)ret;
758 			break;
759 
760 		case OPT_RX_RETRY_NUM:
761 			ret = parse_num_opt(optarg, 1);
762 			if (ret == -1) {
763 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
764 				us_vhost_usage(prgname);
765 				return -1;
766 			}
767 			enable_retry = ret;
768 			break;
769 
770 		case OPT_TX_CSUM_NUM:
771 			ret = parse_num_opt(optarg, 1);
772 			if (ret == -1) {
773 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
774 				us_vhost_usage(prgname);
775 				return -1;
776 			}
777 			enable_tx_csum = ret;
778 			break;
779 
780 		case OPT_TSO_NUM:
781 			ret = parse_num_opt(optarg, 1);
782 			if (ret == -1) {
783 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
784 				us_vhost_usage(prgname);
785 				return -1;
786 			}
787 			enable_tso = ret;
788 			break;
789 
790 		case OPT_RX_RETRY_DELAY_NUM:
791 			ret = parse_num_opt(optarg, INT32_MAX);
792 			if (ret == -1) {
793 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
794 				us_vhost_usage(prgname);
795 				return -1;
796 			}
797 			burst_rx_delay_time = ret;
798 			break;
799 
800 		case OPT_RX_RETRY_NUMB_NUM:
801 			ret = parse_num_opt(optarg, INT32_MAX);
802 			if (ret == -1) {
803 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
804 				us_vhost_usage(prgname);
805 				return -1;
806 			}
807 			burst_rx_retry_num = ret;
808 			break;
809 
810 		case OPT_MERGEABLE_NUM:
811 			ret = parse_num_opt(optarg, 1);
812 			if (ret == -1) {
813 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
814 				us_vhost_usage(prgname);
815 				return -1;
816 			}
817 			mergeable = !!ret;
818 			break;
819 
820 		case OPT_STATS_NUM:
821 			ret = parse_num_opt(optarg, INT32_MAX);
822 			if (ret == -1) {
823 				RTE_LOG(INFO, VHOST_CONFIG,
824 					"Invalid argument for stats [0..N]\n");
825 				us_vhost_usage(prgname);
826 				return -1;
827 			}
828 			enable_stats = ret;
829 			break;
830 
831 		/* Set socket file path. */
832 		case OPT_SOCKET_FILE_NUM:
833 			if (us_vhost_parse_socket_path(optarg) == -1) {
834 				RTE_LOG(INFO, VHOST_CONFIG,
835 				"Invalid argument for socket name (Max %d characters)\n",
836 				PATH_MAX);
837 				us_vhost_usage(prgname);
838 				return -1;
839 			}
840 			break;
841 
842 		case OPT_DMAS_NUM:
843 			if (open_dma(optarg) == -1) {
844 				RTE_LOG(INFO, VHOST_CONFIG,
845 					"Wrong DMA args\n");
846 				us_vhost_usage(prgname);
847 				return -1;
848 			}
849 			break;
850 
851 		case OPT_NUM_MBUFS_NUM:
852 			ret = parse_num_opt(optarg, INT32_MAX);
853 			if (ret == -1) {
854 				RTE_LOG(INFO, VHOST_CONFIG,
855 					"Invalid argument for total-num-mbufs [0..N]\n");
856 				us_vhost_usage(prgname);
857 				return -1;
858 			}
859 
860 			if (total_num_mbufs < ret)
861 				total_num_mbufs = ret;
862 			break;
863 
864 		case OPT_CLIENT_NUM:
865 			client_mode = 1;
866 			break;
867 
868 		case OPT_BUILTIN_NET_DRIVER_NUM:
869 			builtin_net_driver = 1;
870 			break;
871 
872 		/* Invalid option - print options. */
873 		default:
874 			us_vhost_usage(prgname);
875 			return -1;
876 		}
877 	}
878 
879 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
880 		if (enabled_port_mask & (1 << i))
881 			ports[num_ports++] = i;
882 	}
883 
884 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
885 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
886 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
887 		return -1;
888 	}
889 
890 	return 0;
891 }
892 
893 /*
894  * Update the global variable num_ports and the ports[] array according to
895  * the number of ports in the system, and return the number of valid ports.
896  */
897 static unsigned check_ports_num(unsigned nb_ports)
898 {
899 	unsigned valid_num_ports = num_ports;
900 	unsigned portid;
901 
902 	if (num_ports > nb_ports) {
903 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
904 			num_ports, nb_ports);
905 		num_ports = nb_ports;
906 	}
907 
908 	for (portid = 0; portid < num_ports; portid ++) {
909 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
910 			RTE_LOG(INFO, VHOST_PORT,
911 				"\nSpecified port ID(%u) is not valid\n",
912 				ports[portid]);
913 			ports[portid] = INVALID_PORT_ID;
914 			valid_num_ports--;
915 		}
916 	}
917 	return valid_num_ports;
918 }
919 
920 static __rte_always_inline struct vhost_dev *
921 find_vhost_dev(struct rte_ether_addr *mac)
922 {
923 	struct vhost_dev *vdev;
924 
925 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
926 		if (vdev->ready == DEVICE_RX &&
927 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
928 			return vdev;
929 	}
930 
931 	return NULL;
932 }
933 
934 /*
935  * This function learns the MAC address of the device and registers it,
936  * along with a VLAN tag, in VMDq.
937  */
938 static int
939 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
940 {
941 	struct rte_ether_hdr *pkt_hdr;
942 	int i, ret;
943 
944 	/* Learn MAC address of guest device from packet */
945 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
946 
947 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
948 		RTE_LOG(ERR, VHOST_DATA,
949 			"(%d) device is using a registered MAC!\n",
950 			vdev->vid);
951 		return -1;
952 	}
953 
954 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
955 		vdev->mac_address.addr_bytes[i] =
956 			pkt_hdr->src_addr.addr_bytes[i];
957 
958 	/* vlan_tag currently uses the device_id. */
959 	vdev->vlan_tag = vlan_tags[vdev->vid];
960 
961 	/* Print out VMDQ registration info. */
962 	RTE_LOG(INFO, VHOST_DATA,
963 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
964 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
965 		vdev->vlan_tag);
966 
967 	/* Register the MAC address. */
968 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
969 				(uint32_t)vdev->vid + vmdq_pool_base);
970 	if (ret)
971 		RTE_LOG(ERR, VHOST_DATA,
972 			"(%d) failed to add device MAC address to VMDQ\n",
973 			vdev->vid);
974 
975 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
976 
977 	/* Set device as ready for RX. */
978 	vdev->ready = DEVICE_RX;
979 
980 	return 0;
981 }
982 
983 /*
984  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
985  * queue before disabling RX on the device.
986  */
987 static inline void
988 unlink_vmdq(struct vhost_dev *vdev)
989 {
990 	unsigned i = 0;
991 	unsigned rx_count;
992 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
993 
994 	if (vdev->ready == DEVICE_RX) {
995 		/*clear MAC and VLAN settings*/
996 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
997 		for (i = 0; i < 6; i++)
998 			vdev->mac_address.addr_bytes[i] = 0;
999 
1000 		vdev->vlan_tag = 0;
1001 
1002 		/*Clear out the receive buffers*/
1003 		rx_count = rte_eth_rx_burst(ports[0],
1004 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1005 
1006 		while (rx_count) {
1007 			for (i = 0; i < rx_count; i++)
1008 				rte_pktmbuf_free(pkts_burst[i]);
1009 
1010 			rx_count = rte_eth_rx_burst(ports[0],
1011 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1012 		}
1013 
1014 		vdev->ready = DEVICE_MAC_LEARNING;
1015 	}
1016 }
1017 
1018 static inline void
1019 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1020 {
1021 	while (n--)
1022 		rte_pktmbuf_free(pkts[n]);
1023 }
1024 
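/* Poll for enqueue copies completed by the DMA engine and free the source mbufs. */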
1025 static __rte_always_inline void
1026 complete_async_pkts(struct vhost_dev *vdev)
1027 {
1028 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1029 	uint16_t complete_count;
1030 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1031 
1032 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1033 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1034 	if (complete_count)
1035 		free_pkts(p_cpl, complete_count);
1036 
1037 }
1038 
1039 static __rte_always_inline void
1040 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1041 	    struct rte_mbuf *m)
1042 {
1043 	uint16_t ret;
1044 
1045 	if (builtin_net_driver) {
1046 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1047 	} else {
1048 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1049 	}
1050 
1051 	if (enable_stats) {
1052 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1053 				__ATOMIC_SEQ_CST);
1054 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1055 				__ATOMIC_SEQ_CST);
1056 		src_vdev->stats.tx_total++;
1057 		src_vdev->stats.tx += ret;
1058 	}
1059 }
1060 
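/*
 * Flush this lcore's TX buffer for a vhost device into its RX ring.
 * For the sync path the mbufs are freed here; for the async path they
 * are freed once the DMA copies complete.
 */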
1061 static __rte_always_inline void
1062 drain_vhost(struct vhost_dev *vdev)
1063 {
1064 	uint16_t ret;
1065 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1066 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1067 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1068 
1069 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1070 
1071 	if (enable_stats) {
1072 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1073 				__ATOMIC_SEQ_CST);
1074 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1075 				__ATOMIC_SEQ_CST);
1076 	}
1077 
1078 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1079 		free_pkts(m, nr_xmit);
1080 }
1081 
1082 static __rte_always_inline void
1083 drain_vhost_table(void)
1084 {
1085 	uint16_t lcore_id = rte_lcore_id();
1086 	struct vhost_bufftable *vhost_txq;
1087 	struct vhost_dev *vdev;
1088 	uint64_t cur_tsc;
1089 
1090 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1091 		if (unlikely(vdev->remove == 1))
1092 			continue;
1093 
1094 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1095 
1096 		cur_tsc = rte_rdtsc();
1097 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1098 				> MBUF_TABLE_DRAIN_TSC)) {
1099 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1100 				"Vhost TX queue drained after timeout with burst size %u\n",
1101 				vhost_txq->len);
1102 			drain_vhost(vdev);
1103 			vhost_txq->len = 0;
1104 			vhost_txq->pre_tsc = cur_tsc;
1105 		}
1106 	}
1107 }
1108 
1109 /*
1110  * Check if the packet destination MAC address is for a local device. If so then put
1111  * the packet on that device's RX queue. If not then return.
1112  */
1113 static __rte_always_inline int
1114 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1115 {
1116 	struct rte_ether_hdr *pkt_hdr;
1117 	struct vhost_dev *dst_vdev;
1118 	struct vhost_bufftable *vhost_txq;
1119 	uint16_t lcore_id = rte_lcore_id();
1120 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1121 
1122 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1123 	if (!dst_vdev)
1124 		return -1;
1125 
1126 	if (vdev->vid == dst_vdev->vid) {
1127 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1128 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1129 			vdev->vid);
1130 		return 0;
1131 	}
1132 
1133 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1134 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1135 
1136 	if (unlikely(dst_vdev->remove)) {
1137 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1138 			"(%d) device is marked for removal\n", dst_vdev->vid);
1139 		return 0;
1140 	}
1141 
1142 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1143 	vhost_txq->m_table[vhost_txq->len++] = m;
1144 
1145 	if (enable_stats) {
1146 		vdev->stats.tx_total++;
1147 		vdev->stats.tx++;
1148 	}
1149 
1150 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1151 		drain_vhost(dst_vdev);
1152 		vhost_txq->len = 0;
1153 		vhost_txq->pre_tsc = rte_rdtsc();
1154 	}
1155 	return 0;
1156 }
1157 
1158 /*
1159  * Check if the destination MAC of a packet is one local VM,
1160  * and get its vlan tag, and offset if it is.
1161  */
1162 static __rte_always_inline int
1163 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1164 	uint32_t *offset, uint16_t *vlan_tag)
1165 {
1166 	struct vhost_dev *dst_vdev;
1167 	struct rte_ether_hdr *pkt_hdr =
1168 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1169 
1170 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1171 	if (!dst_vdev)
1172 		return 0;
1173 
1174 	if (vdev->vid == dst_vdev->vid) {
1175 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1176 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1177 			vdev->vid);
1178 		return -1;
1179 	}
1180 
1181 	/*
1182 	 * HW VLAN strip reduces the packet length by the
1183 	 * length of the VLAN tag, so the packet length must
1184 	 * be restored by adding it back.
1185 	 */
1186 	*offset  = RTE_VLAN_HLEN;
1187 	*vlan_tag = vlan_tags[vdev->vid];
1188 
1189 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1190 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1191 		vdev->vid, dst_vdev->vid, *vlan_tag);
1192 
1193 	return 0;
1194 }
1195 
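/*
 * Prepare TSO metadata (l2/l3/l4 lengths, offload flags and the TCP
 * pseudo-header checksum) for an LRO packet before it is sent out on
 * the physical port.
 */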
1196 static void virtio_tx_offload(struct rte_mbuf *m)
1197 {
1198 	struct rte_net_hdr_lens hdr_lens;
1199 	struct rte_ipv4_hdr *ipv4_hdr;
1200 	struct rte_tcp_hdr *tcp_hdr;
1201 	uint32_t ptype;
1202 	void *l3_hdr;
1203 
1204 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1205 	m->l2_len = hdr_lens.l2_len;
1206 	m->l3_len = hdr_lens.l3_len;
1207 	m->l4_len = hdr_lens.l4_len;
1208 
1209 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1210 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1211 		m->l2_len + m->l3_len);
1212 
1213 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1214 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1215 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1216 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1217 		ipv4_hdr = l3_hdr;
1218 		ipv4_hdr->hdr_checksum = 0;
1219 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1220 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1221 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1222 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1223 	}
1224 }
1225 
1226 static __rte_always_inline void
1227 do_drain_mbuf_table(struct mbuf_table *tx_q)
1228 {
1229 	uint16_t count;
1230 
1231 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1232 				 tx_q->m_table, tx_q->len);
1233 	if (unlikely(count < tx_q->len))
1234 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1235 
1236 	tx_q->len = 0;
1237 }
1238 
1239 /*
1240  * This function routes the TX packet to the correct interface. This
1241  * may be a local device or the physical port.
1242  */
1243 static __rte_always_inline void
1244 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1245 {
1246 	struct mbuf_table *tx_q;
1247 	unsigned offset = 0;
1248 	const uint16_t lcore_id = rte_lcore_id();
1249 	struct rte_ether_hdr *nh;
1250 
1251 
1252 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1253 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1254 		struct vhost_dev *vdev2;
1255 
1256 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1257 			if (vdev2 != vdev)
1258 				sync_virtio_xmit(vdev2, vdev, m);
1259 		}
1260 		goto queue2nic;
1261 	}
1262 
1263 	/*check if destination is local VM*/
1264 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1265 		return;
1266 
1267 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1268 		if (unlikely(find_local_dest(vdev, m, &offset,
1269 					     &vlan_tag) != 0)) {
1270 			rte_pktmbuf_free(m);
1271 			return;
1272 		}
1273 	}
1274 
1275 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1276 		"(%d) TX: MAC address is external\n", vdev->vid);
1277 
1278 queue2nic:
1279 
1280 	/*Add packet to the port tx queue*/
1281 	tx_q = &lcore_tx_queue[lcore_id];
1282 
1283 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1284 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1285 		/* Guest has inserted the vlan tag. */
1286 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1287 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1288 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1289 			(vh->vlan_tci != vlan_tag_be))
1290 			vh->vlan_tci = vlan_tag_be;
1291 	} else {
1292 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1293 
1294 		/*
1295 		 * Find the right seg to adjust the data len when offset is
1296 		 * bigger than tail room size.
1297 		 */
1298 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1299 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1300 				m->data_len += offset;
1301 			else {
1302 				struct rte_mbuf *seg = m;
1303 
1304 				while ((seg->next != NULL) &&
1305 					(offset > rte_pktmbuf_tailroom(seg)))
1306 					seg = seg->next;
1307 
1308 				seg->data_len += offset;
1309 			}
1310 			m->pkt_len += offset;
1311 		}
1312 
1313 		m->vlan_tci = vlan_tag;
1314 	}
1315 
1316 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1317 		virtio_tx_offload(m);
1318 
1319 	tx_q->m_table[tx_q->len++] = m;
1320 	if (enable_stats) {
1321 		vdev->stats.tx_total++;
1322 		vdev->stats.tx++;
1323 	}
1324 
1325 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1326 		do_drain_mbuf_table(tx_q);
1327 }
1328 
1329 
1330 static __rte_always_inline void
1331 drain_mbuf_table(struct mbuf_table *tx_q)
1332 {
1333 	static uint64_t prev_tsc;
1334 	uint64_t cur_tsc;
1335 
1336 	if (tx_q->len == 0)
1337 		return;
1338 
1339 	cur_tsc = rte_rdtsc();
1340 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1341 		prev_tsc = cur_tsc;
1342 
1343 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1344 			"TX queue drained after timeout with burst size %u\n",
1345 			tx_q->len);
1346 		do_drain_mbuf_table(tx_q);
1347 	}
1348 }
1349 
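/*
 * Enqueue packets to the vhost RX ring through the bound DMA device.
 * Packets that cannot be submitted are freed here; submitted packets
 * are freed in complete_async_pkts() once their copies complete.
 */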
1350 uint16_t
1351 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1352 		struct rte_mbuf **pkts, uint32_t rx_count)
1353 {
1354 	uint16_t enqueue_count;
1355 	uint16_t enqueue_fail = 0;
1356 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1357 
1358 	complete_async_pkts(dev);
1359 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1360 					pkts, rx_count, dma_id, 0);
1361 
1362 	enqueue_fail = rx_count - enqueue_count;
1363 	if (enqueue_fail)
1364 		free_pkts(&pkts[enqueue_count], enqueue_fail);
1365 
1366 	return enqueue_count;
1367 }
1368 
1369 uint16_t
1370 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1371 		struct rte_mbuf **pkts, uint32_t rx_count)
1372 {
1373 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1374 }
1375 
1376 static __rte_always_inline void
1377 drain_eth_rx(struct vhost_dev *vdev)
1378 {
1379 	uint16_t rx_count, enqueue_count;
1380 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1381 
1382 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1383 				    pkts, MAX_PKT_BURST);
1384 
1385 	if (!rx_count)
1386 		return;
1387 
1388 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1389 						VIRTIO_RXQ, pkts, rx_count);
1390 
1391 	/* Retry if necessary */
1392 	if (enable_retry && unlikely(enqueue_count < rx_count)) {
1393 		uint32_t retry = 0;
1394 
1395 		while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
1396 			rte_delay_us(burst_rx_delay_time);
1397 			enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1398 							VIRTIO_RXQ, &pkts[enqueue_count],
1399 							rx_count - enqueue_count);
1400 		}
1401 	}
1402 
1403 	if (enable_stats) {
1404 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1405 				__ATOMIC_SEQ_CST);
1406 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1407 				__ATOMIC_SEQ_CST);
1408 	}
1409 
1410 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1411 		free_pkts(pkts, rx_count);
1412 }
1413 
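/* Dequeue packets from the vhost TX ring through the DMA device bound to the dequeue path. */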
1414 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1415 			    struct rte_mempool *mbuf_pool,
1416 			    struct rte_mbuf **pkts, uint16_t count)
1417 {
1418 	int nr_inflight;
1419 	uint16_t dequeue_count;
1420 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1421 
1422 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1423 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1424 
1425 	return dequeue_count;
1426 }
1427 
1428 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1429 			   struct rte_mempool *mbuf_pool,
1430 			   struct rte_mbuf **pkts, uint16_t count)
1431 {
1432 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1433 }
1434 
1435 static __rte_always_inline void
1436 drain_virtio_tx(struct vhost_dev *vdev)
1437 {
1438 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1439 	uint16_t count;
1440 	uint16_t i;
1441 
1442 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1443 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1444 
1445 	/* setup VMDq for the first packet */
1446 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1447 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1448 			free_pkts(pkts, count);
1449 	}
1450 
1451 	for (i = 0; i < count; ++i)
1452 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1453 }
1454 
1455 /*
1456  * Main function of vhost-switch. It basically does:
1457  *
1458  * for each vhost device {
1459  *    - drain_eth_rx()
1460  *
1461  *      Which drains the host eth Rx queue linked to the vhost device
1462  *      and delivers all of the packets to the guest virtio Rx ring
1463  *      associated with this vhost device.
1464  *
1465  *    - drain_virtio_tx()
1466  *
1467  *      Which drains the guest virtio Tx queue and delivers all of the
1468  *      packets to the target, which could be another vhost device or the
1469  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1470  * }
1471  */
1472 static int
1473 switch_worker(void *arg __rte_unused)
1474 {
1475 	unsigned i;
1476 	unsigned lcore_id = rte_lcore_id();
1477 	struct vhost_dev *vdev;
1478 	struct mbuf_table *tx_q;
1479 
1480 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1481 
1482 	tx_q = &lcore_tx_queue[lcore_id];
1483 	for (i = 0; i < rte_lcore_count(); i++) {
1484 		if (lcore_ids[i] == lcore_id) {
1485 			tx_q->txq_id = i;
1486 			break;
1487 		}
1488 	}
1489 
1490 	while(1) {
1491 		drain_mbuf_table(tx_q);
1492 		drain_vhost_table();
1493 		/*
1494 		 * Inform the configuration core that we have exited the
1495 		 * linked list and that no devices are in use if requested.
1496 		 */
1497 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1498 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1499 
1500 		/*
1501 		 * Process vhost devices
1502 		 */
1503 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1504 			      lcore_vdev_entry) {
1505 			if (unlikely(vdev->remove)) {
1506 				unlink_vmdq(vdev);
1507 				vdev->ready = DEVICE_SAFE_REMOVE;
1508 				continue;
1509 			}
1510 
1511 			if (likely(vdev->ready == DEVICE_RX))
1512 				drain_eth_rx(vdev);
1513 
1514 			if (likely(!vdev->remove))
1515 				drain_virtio_tx(vdev);
1516 		}
1517 	}
1518 
1519 	return 0;
1520 }
1521 
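/*
 * Drain all in-flight async packets of a queue (thread-unsafe variant,
 * used from vhost callbacks) so that the queue can be safely disabled.
 */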
1522 static void
1523 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1524 {
1525 	uint16_t n_pkt = 0;
1526 	int pkts_inflight;
1527 
1528 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1529 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1530 
1531 	struct rte_mbuf *m_cpl[pkts_inflight];
1532 
1533 	while (pkts_inflight) {
1534 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1535 							pkts_inflight, dma_id, 0);
1536 		free_pkts(m_cpl, n_pkt);
1537 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1538 									queue_id);
1539 	}
1540 }
1541 
1542 static void
1543 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
1544 {
1545 	uint16_t n_pkt = 0;
1546 	int pkts_inflight;
1547 
1548 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1549 	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1550 
1551 	struct rte_mbuf *m_cpl[pkts_inflight];
1552 
1553 	while (pkts_inflight) {
1554 		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
1555 						pkts_inflight, dma_id, 0);
1556 		free_pkts(m_cpl, n_pkt);
1557 		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1558 	}
1559 }
1560 
1561 /*
1562  * Remove a device from the specific data core linked list and from the
1563  * main linked list. Synchronization  occurs through the use of the
1564  * main linked list. Synchronization occurs through the use of the
1565  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1566  */
1567 static void
1568 destroy_device(int vid)
1569 {
1570 	struct vhost_dev *vdev = NULL;
1571 	int lcore;
1572 	uint16_t i;
1573 
1574 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1575 		if (vdev->vid == vid)
1576 			break;
1577 	}
1578 	if (!vdev)
1579 		return;
1580 	/*set the remove flag. */
1581 	vdev->remove = 1;
1582 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1583 		rte_pause();
1584 	}
1585 
1586 	for (i = 0; i < RTE_MAX_LCORE; i++)
1587 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1588 
1589 	if (builtin_net_driver)
1590 		vs_vhost_net_remove(vdev);
1591 
1592 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1593 		     lcore_vdev_entry);
1594 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1595 
1596 
1597 	/* Set the dev_removal_flag on each lcore. */
1598 	RTE_LCORE_FOREACH_WORKER(lcore)
1599 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1600 
1601 	/*
1602 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1603 	 * we can be sure that they can no longer access the device removed
1604 	 * from the linked lists and that the devices are no longer in use.
1605 	 */
1606 	RTE_LCORE_FOREACH_WORKER(lcore) {
1607 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1608 			rte_pause();
1609 	}
1610 
1611 	lcore_info[vdev->coreid].device_num--;
1612 
1613 	RTE_LOG(INFO, VHOST_DATA,
1614 		"(%d) device has been removed from data core\n",
1615 		vdev->vid);
1616 
1617 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1618 		vhost_clear_queue(vdev, VIRTIO_RXQ);
1619 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1620 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1621 	}
1622 
1623 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1624 		vhost_clear_queue(vdev, VIRTIO_TXQ);
1625 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1626 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1627 	}
1628 
1629 	rte_free(vdev);
1630 }
1631 
1632 static inline int
1633 get_socketid_by_vid(int vid)
1634 {
1635 	int i;
1636 	char ifname[PATH_MAX];
1637 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1638 
1639 	for (i = 0; i < nb_sockets; i++) {
1640 		char *file = socket_files + i * PATH_MAX;
1641 		if (strcmp(file, ifname) == 0)
1642 			return i;
1643 	}
1644 
1645 	return -1;
1646 }
1647 
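/*
 * Select the enqueue/dequeue handlers for a device: builtin net driver
 * paths, or sync/async vhost library calls depending on whether a DMA
 * channel was registered for each queue.
 */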
1648 static int
1649 init_vhost_queue_ops(int vid)
1650 {
1651 	if (builtin_net_driver) {
1652 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1653 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1654 	} else {
1655 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1656 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1657 		else
1658 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1659 
1660 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1661 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1662 		else
1663 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1664 	}
1665 
1666 	return 0;
1667 }
1668 
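/*
 * Register an async channel on each queue that has a DMA device bound
 * via --dmas, and mark the queue async-enabled on success.
 */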
1669 static inline int
1670 vhost_async_channel_register(int vid)
1671 {
1672 	int rx_ret = 0, tx_ret = 0;
1673 
1674 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1675 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1676 		if (rx_ret == 0)
1677 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1678 	}
1679 
1680 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1681 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1682 		if (tx_ret == 0)
1683 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1684 	}
1685 
1686 	return rx_ret | tx_ret;
1687 }
1688 
1689 
1690 
1691 /*
1692  * A new device is added to a data core. First the device is added to the main linked list
1693  * and then allocated to a specific data core.
1694  */
1695 static int
1696 new_device(int vid)
1697 {
1698 	int lcore, core_add = 0;
1699 	uint16_t i;
1700 	uint32_t device_num_min = num_devices;
1701 	struct vhost_dev *vdev;
1702 	int ret;
1703 
1704 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1705 	if (vdev == NULL) {
1706 		RTE_LOG(INFO, VHOST_DATA,
1707 			"(%d) couldn't allocate memory for vhost dev\n",
1708 			vid);
1709 		return -1;
1710 	}
1711 	vdev->vid = vid;
1712 
1713 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1714 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1715 			= rte_zmalloc("vhost bufftable",
1716 				sizeof(struct vhost_bufftable),
1717 				RTE_CACHE_LINE_SIZE);
1718 
1719 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1720 			RTE_LOG(INFO, VHOST_DATA,
1721 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1722 			return -1;
1723 		}
1724 	}
1725 
1726 	int socketid = get_socketid_by_vid(vid);
1727 	if (socketid == -1)
1728 		return -1;
1729 
1730 	init_vid2socketid_array(vid, socketid);
1731 
1732 	ret =  vhost_async_channel_register(vid);
1733 
1734 	if (init_vhost_queue_ops(vid) != 0)
1735 		return -1;
1736 
1737 	if (builtin_net_driver)
1738 		vs_vhost_net_setup(vdev);
1739 
1740 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1741 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1742 
1743 	/*reset ready flag*/
1744 	vdev->ready = DEVICE_MAC_LEARNING;
1745 	vdev->remove = 0;
1746 
1747 	/* Find a suitable lcore to add the device. */
1748 	RTE_LCORE_FOREACH_WORKER(lcore) {
1749 		if (lcore_info[lcore].device_num < device_num_min) {
1750 			device_num_min = lcore_info[lcore].device_num;
1751 			core_add = lcore;
1752 		}
1753 	}
1754 	vdev->coreid = core_add;
1755 
1756 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1757 			  lcore_vdev_entry);
1758 	lcore_info[vdev->coreid].device_num++;
1759 
1760 	/* Disable notifications. */
1761 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1762 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1763 
1764 	RTE_LOG(INFO, VHOST_DATA,
1765 		"(%d) device has been added to data core %d\n",
1766 		vid, vdev->coreid);
1767 
1768 	return ret;
1769 }
1770 
1771 static int
1772 vring_state_changed(int vid, uint16_t queue_id, int enable)
1773 {
1774 	struct vhost_dev *vdev = NULL;
1775 
1776 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1777 		if (vdev->vid == vid)
1778 			break;
1779 	}
1780 	if (!vdev)
1781 		return -1;
1782 
1783 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1784 		if (!enable)
1785 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1786 	}
1787 
1788 	return 0;
1789 }
1790 
1791 /*
1792  * These callbacks allow devices to be added to a data core when configuration
1793  * has fully completed.
1794  */
1795 static const struct rte_vhost_device_ops virtio_net_device_ops =
1796 {
1797 	.new_device =  new_device,
1798 	.destroy_device = destroy_device,
1799 	.vring_state_changed = vring_state_changed,
1800 };
1801 
1802 /*
1803  * This thread wakes up periodically to print statistics if the user has
1804  * enabled them.
1805  */
1806 static void *
1807 print_stats(__rte_unused void *arg)
1808 {
1809 	struct vhost_dev *vdev;
1810 	uint64_t tx_dropped, rx_dropped;
1811 	uint64_t tx, tx_total, rx, rx_total;
1812 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1813 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1814 
1815 	while(1) {
1816 		sleep(enable_stats);
1817 
1818 		/* Clear screen and move to top left */
1819 		printf("%s%s\n", clr, top_left);
1820 		printf("Device statistics =================================\n");
1821 
1822 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1823 			tx_total   = vdev->stats.tx_total;
1824 			tx         = vdev->stats.tx;
1825 			tx_dropped = tx_total - tx;
1826 
1827 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1828 				__ATOMIC_SEQ_CST);
1829 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1830 				__ATOMIC_SEQ_CST);
1831 			rx_dropped = rx_total - rx;
1832 
1833 			printf("Statistics for device %d\n"
1834 				"-----------------------\n"
1835 				"TX total:              %" PRIu64 "\n"
1836 				"TX dropped:            %" PRIu64 "\n"
1837 				"TX successful:         %" PRIu64 "\n"
1838 				"RX total:              %" PRIu64 "\n"
1839 				"RX dropped:            %" PRIu64 "\n"
1840 				"RX successful:         %" PRIu64 "\n",
1841 				vdev->vid,
1842 				tx_total, tx_dropped, tx,
1843 				rx_total, rx_dropped, rx);
1844 		}
1845 
1846 		printf("===================================================\n");
1847 
1848 		fflush(stdout);
1849 	}
1850 
1851 	return NULL;
1852 }
1853 
1854 static void
1855 unregister_drivers(int socket_num)
1856 {
1857 	int i, ret;
1858 
1859 	for (i = 0; i < socket_num; i++) {
1860 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1861 		if (ret != 0)
1862 			RTE_LOG(ERR, VHOST_CONFIG,
1863 				"Fail to unregister vhost driver for %s.\n",
1864 				socket_files + i * PATH_MAX);
1865 	}
1866 }
1867 
1868 /* When we receive a SIGINT, unregister the vhost driver. */
1869 static void
1870 sigint_handler(__rte_unused int signum)
1871 {
1872 	/* Unregister vhost driver. */
1873 	unregister_drivers(nb_sockets);
1874 
1875 	exit(0);
1876 }
1877 
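/* Mark every vhost queue as having no DMA device bound and invalidate the DMA ID list before --dmas is parsed. */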
1878 static void
1879 reset_dma(void)
1880 {
1881 	int i;
1882 
1883 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1884 		int j;
1885 
1886 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1887 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1888 			dma_bind[i].dmas[j].async_enabled = false;
1889 		}
1890 	}
1891 
1892 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1893 		dmas_id[i] = INVALID_DMA_ID;
1894 }
1895 
1896 /*
1897  * Main function, does initialisation and calls the per-lcore functions.
1898  */
1899 int
1900 main(int argc, char *argv[])
1901 {
1902 	unsigned lcore_id, core_id = 0;
1903 	unsigned nb_ports, valid_num_ports;
1904 	int ret, i;
1905 	uint16_t portid;
1906 	static pthread_t tid;
1907 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1908 
1909 	signal(SIGINT, sigint_handler);
1910 
1911 	/* init EAL */
1912 	ret = rte_eal_init(argc, argv);
1913 	if (ret < 0)
1914 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1915 	argc -= ret;
1916 	argv += ret;
1917 
1918 	/* initialize dma structures */
1919 	reset_dma();
1920 
1921 	/* parse app arguments */
1922 	ret = us_vhost_parse_args(argc, argv);
1923 	if (ret < 0)
1924 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1925 
1926 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1927 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1928 
1929 		if (rte_lcore_is_enabled(lcore_id))
1930 			lcore_ids[core_id++] = lcore_id;
1931 	}
1932 
1933 	if (rte_lcore_count() > RTE_MAX_LCORE)
1934 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1935 
1936 	/* Get the number of physical ports. */
1937 	nb_ports = rte_eth_dev_count_avail();
1938 
1939 	/*
1940 	 * Update the global num_ports variable and the global ports[] array,
1941 	 * and get the number of valid ports according to the number of system ports.
1942 	 */
1943 	valid_num_ports = check_ports_num(nb_ports);
1944 
1945 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1946 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1947 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1948 		return -1;
1949 	}
1950 
1951 	/*
1952 	 * FIXME: here we are trying to allocate mbufs big enough for
1953 	 * @MAX_QUEUES, but the truth is we're never going to use that
1954 	 * many queues here. We probably should only do allocation for
1955 	 * those queues we are going to use.
1956 	 */
1957 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1958 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1959 					    rte_socket_id());
1960 	if (mbuf_pool == NULL)
1961 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1962 
1963 	if (vm2vm_mode == VM2VM_HARDWARE) {
1964 		/* Enable VT loop back to let L2 switch to do it. */
1965 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1966 		RTE_LOG(DEBUG, VHOST_CONFIG,
1967 			"Enable loop back for L2 switch in vmdq.\n");
1968 	}
1969 
1970 	/* initialize all ports */
1971 	RTE_ETH_FOREACH_DEV(portid) {
1972 		/* skip ports that are not enabled */
1973 		if ((enabled_port_mask & (1 << portid)) == 0) {
1974 			RTE_LOG(INFO, VHOST_PORT,
1975 				"Skipping disabled port %d\n", portid);
1976 			continue;
1977 		}
1978 		if (port_init(portid) != 0)
1979 			rte_exit(EXIT_FAILURE,
1980 				"Cannot initialize network ports\n");
1981 	}
1982 
1983 	/* Enable stats if the user option is set. */
1984 	if (enable_stats) {
1985 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1986 					print_stats, NULL);
1987 		if (ret < 0)
1988 			rte_exit(EXIT_FAILURE,
1989 				"Cannot create print-stats thread\n");
1990 	}
1991 
1992 	/* Launch all data cores. */
1993 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1994 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1995 
1996 	if (client_mode)
1997 		flags |= RTE_VHOST_USER_CLIENT;
1998 
1999 	for (i = 0; i < dma_count; i++) {
2000 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
2001 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
2002 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2003 		}
2004 	}
2005 
2006 	/* Register vhost user driver to handle vhost messages. */
2007 	for (i = 0; i < nb_sockets; i++) {
2008 		char *file = socket_files + i * PATH_MAX;
2009 
2010 		if (dma_count && get_async_flag_by_socketid(i) != 0)
2011 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
2012 
2013 		ret = rte_vhost_driver_register(file, flags);
2014 		if (ret != 0) {
2015 			unregister_drivers(i);
2016 			rte_exit(EXIT_FAILURE,
2017 				"vhost driver register failure.\n");
2018 		}
2019 
2020 		if (builtin_net_driver)
2021 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2022 
2023 		if (mergeable == 0) {
2024 			rte_vhost_driver_disable_features(file,
2025 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2026 		}
2027 
2028 		if (enable_tx_csum == 0) {
2029 			rte_vhost_driver_disable_features(file,
2030 				1ULL << VIRTIO_NET_F_CSUM);
2031 		}
2032 
2033 		if (enable_tso == 0) {
2034 			rte_vhost_driver_disable_features(file,
2035 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2036 			rte_vhost_driver_disable_features(file,
2037 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2038 			rte_vhost_driver_disable_features(file,
2039 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2040 			rte_vhost_driver_disable_features(file,
2041 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2042 		}
2043 
2044 		if (promiscuous) {
2045 			rte_vhost_driver_enable_features(file,
2046 				1ULL << VIRTIO_NET_F_CTRL_RX);
2047 		}
2048 
2049 		ret = rte_vhost_driver_callback_register(file,
2050 			&virtio_net_device_ops);
2051 		if (ret != 0) {
2052 			rte_exit(EXIT_FAILURE,
2053 				"failed to register vhost driver callbacks.\n");
2054 		}
2055 
2056 		if (rte_vhost_driver_start(file) < 0) {
2057 			rte_exit(EXIT_FAILURE,
2058 				"failed to start vhost driver.\n");
2059 		}
2060 	}
2061 
2062 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2063 		rte_eal_wait_lcore(lcore_id);
2064 
2065 	/* clean up the EAL */
2066 	rte_eal_cleanup();
2067 
2068 	return 0;
2069 }
2070