xref: /dpdk/examples/vhost/main.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
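/*
 * Worked example: JUMBO_FRAME_MAX_SIZE is 0x2600 = 9728 bytes, and
 * RTE_ETHER_HDR_LEN (14) plus RTE_ETHER_CRC_LEN (4) is 18 bytes, so
 * MAX_MTU evaluates to 9710 bytes.
 */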

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char *dma_type;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * it fixes a bug where IPv4 forwarding in the guest
		 * cannot forward packets from one virtio dev to another.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
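/*
 * Note: each vhost device is assigned vlan_tags[vid], so device 0 gets
 * VLAN 1000, device 1 gets VLAN 1001, and so on (see link_vmdq() below).
 */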

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used to batch packets for enqueue, for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
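/*
 * Worked example for MBUF_TABLE_DRAIN_TSC: on a machine with a 2 GHz TSC,
 * (2000000000 + 1000000 - 1) / 1000000 * 100 = 200000 cycles, i.e. the TX
 * mbuf table is drained roughly every 100 us.
 */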
#define VLAN_HLEN       4

static inline int
open_dma(const char *value)
{
	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}
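/*
 * Illustrative only: with the ioat backend, the --dmas argument binds a
 * vhost TX queue to a DMA channel, along the lines of
 *   ./dpdk-vhost ... --dma-type ioat --dmas [txd0@00:04.0,txd1@00:04.1]
 * The bus addresses here are made up; see open_ioat() in ioat.c for the
 * exact accepted syntax.
 */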

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
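/*
 * Sketch of the resulting pool map, assuming num_devices == 8: pool 0 is
 * selected by VLAN 1000 (pools mask 0x01), pool 1 by VLAN 1001 (mask 0x02),
 * ..., pool 7 by VLAN 1007 (mask 0x80).
 */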

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the global mbuf_pool.
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval  = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}
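/*
 * Illustration of the queue math above (the exact values depend on the
 * NIC): a device reporting max_rx_queues == 128, vmdq_queue_num == 128 and
 * max_vmdq_pools == 64 yields num_pf_queues == 0, queues_per_pool == 2 and
 * num_vmdq_queues == 128.
 */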

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* reject paths that do not fit in PATH_MAX */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}
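/*
 * Usage example: passing --socket-file /tmp/vhost0 --socket-file /tmp/vhost1
 * grows the flat socket_files buffer to two PATH_MAX-sized slots and leaves
 * nb_sockets at 2.
 */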

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}
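/*
 * Example: "-p 0x1" yields portmask 0x1, enabling port 0 only; this sample
 * supports at most MAX_SUP_PORTS (1) ports anyway.
 */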

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}

enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE            "dma-type"
	OPT_DMA_TYPE_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
};
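/*
 * The first long-only option value starts at 256 so that the values
 * returned by getopt_long() can never collide with a single-character
 * short option such as 'p' or 'P'.
 */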

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMA_TYPE, required_argument,
				NULL, OPT_DMA_TYPE_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			if (ret)
				vmdq_conf_default.rxmode.mtu = MAX_MTU;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMA_TYPE_NUM:
			dma_type = optarg;
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			async_vhost_driver = 1;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global num_ports variable and the ports array according to the
 * number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}
74345657a5cSYuanhan Liu 
744d19533e8SHuawei Xie /*
745d19533e8SHuawei Xie  * This function learns the MAC address of the device and registers this along with a
746d19533e8SHuawei Xie  * vlan tag to a VMDQ.
747d19533e8SHuawei Xie  */
748d19533e8SHuawei Xie static int
749e571e6b4SHuawei Xie link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
750d19533e8SHuawei Xie {
7516d13ea8eSOlivier Matz 	struct rte_ether_hdr *pkt_hdr;
752d19533e8SHuawei Xie 	int i, ret;
753d19533e8SHuawei Xie 
754d19533e8SHuawei Xie 	/* Learn MAC address of guest device from packet */
7556d13ea8eSOlivier Matz 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
756d19533e8SHuawei Xie 
75704d43857SDmitry Kozlyuk 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
75845657a5cSYuanhan Liu 		RTE_LOG(ERR, VHOST_DATA,
759c08a3490SYuanhan Liu 			"(%d) device is using a registered MAC!\n",
760e2a1dd12SYuanhan Liu 			vdev->vid);
761d19533e8SHuawei Xie 		return -1;
762d19533e8SHuawei Xie 	}
763d19533e8SHuawei Xie 
76435b2d13fSOlivier Matz 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
76504d43857SDmitry Kozlyuk 		vdev->mac_address.addr_bytes[i] =
76604d43857SDmitry Kozlyuk 			pkt_hdr->src_addr.addr_bytes[i];
767d19533e8SHuawei Xie 
768d19533e8SHuawei Xie 	/* vlan_tag currently uses the device_id. */
769e2a1dd12SYuanhan Liu 	vdev->vlan_tag = vlan_tags[vdev->vid];
770d19533e8SHuawei Xie 
771d19533e8SHuawei Xie 	/* Print out VMDQ registration info. */
772c08a3490SYuanhan Liu 	RTE_LOG(INFO, VHOST_DATA,
773c2c4f87bSAman Deep Singh 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
774a7db3afcSAman Deep Singh 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
775e571e6b4SHuawei Xie 		vdev->vlan_tag);
776d19533e8SHuawei Xie 
777d19533e8SHuawei Xie 	/* Register the MAC address. */
77884b02d16SHuawei Xie 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
779e2a1dd12SYuanhan Liu 				(uint32_t)vdev->vid + vmdq_pool_base);
780d19533e8SHuawei Xie 	if (ret)
781c08a3490SYuanhan Liu 		RTE_LOG(ERR, VHOST_DATA,
782c08a3490SYuanhan Liu 			"(%d) failed to add device MAC address to VMDQ\n",
783e2a1dd12SYuanhan Liu 			vdev->vid);
784d19533e8SHuawei Xie 
78565453928SJianfeng Tan 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
786d19533e8SHuawei Xie 
787d19533e8SHuawei Xie 	/* Set device as ready for RX. */
788e571e6b4SHuawei Xie 	vdev->ready = DEVICE_RX;
789d19533e8SHuawei Xie 
790d19533e8SHuawei Xie 	return 0;
791d19533e8SHuawei Xie }

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}
}
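/*
 * pkts_inflight thus tracks enqueues submitted to the async channel minus
 * the completions polled here; drain_vhost() below increments it by the
 * number of packets each submit call accepts.
 */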

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}
8719c5ef512SYuanhan Liu 
872a68ba8e0SCheng Jiang static __rte_always_inline void
873a68ba8e0SCheng Jiang drain_vhost(struct vhost_dev *vdev)
874a68ba8e0SCheng Jiang {
875a68ba8e0SCheng Jiang 	uint16_t ret;
876ee6e451fSCheng Jiang 	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
877a68ba8e0SCheng Jiang 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
878a68ba8e0SCheng Jiang 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
879a68ba8e0SCheng Jiang 
880a68ba8e0SCheng Jiang 	if (builtin_net_driver) {
881a68ba8e0SCheng Jiang 		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
882a68ba8e0SCheng Jiang 	} else if (async_vhost_driver) {
883a68ba8e0SCheng Jiang 		uint16_t enqueue_fail = 0;
884a68ba8e0SCheng Jiang 
885a68ba8e0SCheng Jiang 		complete_async_pkts(vdev);
886abeb8652SJiayu Hu 		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
887abeb8652SJiayu Hu 		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
888a68ba8e0SCheng Jiang 
889a68ba8e0SCheng Jiang 		enqueue_fail = nr_xmit - ret;
890a68ba8e0SCheng Jiang 		if (enqueue_fail)
891a68ba8e0SCheng Jiang 			free_pkts(&m[ret], nr_xmit - ret);
892a68ba8e0SCheng Jiang 	} else {
893a68ba8e0SCheng Jiang 		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
894a68ba8e0SCheng Jiang 						m, nr_xmit);
895a68ba8e0SCheng Jiang 	}
896a68ba8e0SCheng Jiang 
897a68ba8e0SCheng Jiang 	if (enable_stats) {
898a68ba8e0SCheng Jiang 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
899a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
900a68ba8e0SCheng Jiang 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
901a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
902a68ba8e0SCheng Jiang 	}
903a68ba8e0SCheng Jiang 
904a68ba8e0SCheng Jiang 	if (!async_vhost_driver)
905a68ba8e0SCheng Jiang 		free_pkts(m, nr_xmit);
906a68ba8e0SCheng Jiang }

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
						+ vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}
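/*
 * Example: with the default 100 us drain period, a partially filled vhost
 * TX buffer is flushed here once rte_rdtsc() advances more than
 * MBUF_TABLE_DRAIN_TSC cycles past pre_tsc, the time of the last flush.
 */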

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();

	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its VLAN tag and the offset to apply.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip reduces the packet length by the length of the
	 * VLAN tag, so the packet length must be restored by adding it
	 * back.
	 */
	*offset  = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}
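/*
 * Note on the checksum handling above: for TSO the NIC expects the TCP
 * checksum field to hold the pseudo-header checksum (computed here with
 * rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum()); the hardware then fills
 * in the final checksum of every segment it emits.
 */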

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}
1065273ecdbcSYuanhan Liu 
106672ec8d77SOuyang Changchun /*
1067273ecdbcSYuanhan Liu  * This function routes the TX packet to the correct interface. This
1068273ecdbcSYuanhan Liu  * may be a local device or the physical port.
106972ec8d77SOuyang Changchun  */
1070c0583d98SJerin Jacob static __rte_always_inline void
107172ec8d77SOuyang Changchun virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
107272ec8d77SOuyang Changchun {
107372ec8d77SOuyang Changchun 	struct mbuf_table *tx_q;
1074273ecdbcSYuanhan Liu 	unsigned offset = 0;
107572ec8d77SOuyang Changchun 	const uint16_t lcore_id = rte_lcore_id();
10766d13ea8eSOlivier Matz 	struct rte_ether_hdr *nh;
107772ec8d77SOuyang Changchun 
10789c5ef512SYuanhan Liu 
10796d13ea8eSOlivier Matz 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
108004d43857SDmitry Kozlyuk 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
10819c5ef512SYuanhan Liu 		struct vhost_dev *vdev2;
10829c5ef512SYuanhan Liu 
108397daf19eSYuanhan Liu 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1084a3fdb532SJunjie Chen 			if (vdev2 != vdev)
1085a68ba8e0SCheng Jiang 				sync_virtio_xmit(vdev2, vdev, m);
10869c5ef512SYuanhan Liu 		}
10879c5ef512SYuanhan Liu 		goto queue2nic;
10889c5ef512SYuanhan Liu 	}
10899c5ef512SYuanhan Liu 
109072ec8d77SOuyang Changchun 	/*check if destination is local VM*/
1091a68ba8e0SCheng Jiang 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
109272ec8d77SOuyang Changchun 		return;
109372ec8d77SOuyang Changchun 
1094c2ab5162SOuyang Changchun 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
10957f262239SYuanhan Liu 		if (unlikely(find_local_dest(vdev, m, &offset,
10967f262239SYuanhan Liu 					     &vlan_tag) != 0)) {
109772ec8d77SOuyang Changchun 			rte_pktmbuf_free(m);
109872ec8d77SOuyang Changchun 			return;
109972ec8d77SOuyang Changchun 		}
1100d19533e8SHuawei Xie 	}
1101d19533e8SHuawei Xie 
11025d8f0bafSOlivier Matz 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1103e2a1dd12SYuanhan Liu 		"(%d) TX: MAC address is external\n", vdev->vid);
1104d19533e8SHuawei Xie 
11059c5ef512SYuanhan Liu queue2nic:
11069c5ef512SYuanhan Liu 
1107d19533e8SHuawei Xie 	/*Add packet to the port tx queue*/
1108d19533e8SHuawei Xie 	tx_q = &lcore_tx_queue[lcore_id];
1109d19533e8SHuawei Xie 
11106d13ea8eSOlivier Matz 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
111135b2d13fSOlivier Matz 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
11128b9bb988SOuyang Changchun 		/* Guest has inserted the vlan tag. */
11136d13ea8eSOlivier Matz 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
11148b9bb988SOuyang Changchun 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
11158b9bb988SOuyang Changchun 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
11168b9bb988SOuyang Changchun 			(vh->vlan_tci != vlan_tag_be))
11178b9bb988SOuyang Changchun 			vh->vlan_tci = vlan_tag_be;
11188b9bb988SOuyang Changchun 	} else {
1119*daa02b5cSOlivier Matz 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1120e44fb8a4SOuyang Changchun 
1121c2ab5162SOuyang Changchun 		/*
1122c2ab5162SOuyang Changchun 		 * Find the right seg to adjust the data len when offset is
1123c2ab5162SOuyang Changchun 		 * bigger than tail room size.
1124c2ab5162SOuyang Changchun 		 */
1125c2ab5162SOuyang Changchun 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1126c2ab5162SOuyang Changchun 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
11274d50b6acSHuawei Xie 				m->data_len += offset;
1128c2ab5162SOuyang Changchun 			else {
1129c2ab5162SOuyang Changchun 				struct rte_mbuf *seg = m;
1130c2ab5162SOuyang Changchun 
1131c2ab5162SOuyang Changchun 				while ((seg->next != NULL) &&
1132c2ab5162SOuyang Changchun 					(offset > rte_pktmbuf_tailroom(seg)))
1133c2ab5162SOuyang Changchun 					seg = seg->next;
1134c2ab5162SOuyang Changchun 
1135c2ab5162SOuyang Changchun 				seg->data_len += offset;
1136c2ab5162SOuyang Changchun 			}
1137e44fb8a4SOuyang Changchun 			m->pkt_len += offset;
1138c2ab5162SOuyang Changchun 		}
1139e44fb8a4SOuyang Changchun 
11404d50b6acSHuawei Xie 		m->vlan_tci = vlan_tag;
11418b9bb988SOuyang Changchun 	}
1142d19533e8SHuawei Xie 
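	/*
	 * A packet coalesced by LRO on the Rx side may exceed the wire
	 * MTU, so set up its TSO/checksum offload metadata before it is
	 * handed to the NIC.
	 */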
1143*daa02b5cSOlivier Matz 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
11449fd72e3cSJijiang Liu 		virtio_tx_offload(m);
11459fd72e3cSJijiang Liu 
1146273ecdbcSYuanhan Liu 	tx_q->m_table[tx_q->len++] = m;
1147d19533e8SHuawei Xie 	if (enable_stats) {
114856fe86f8SYuanhan Liu 		vdev->stats.tx_total++;
114956fe86f8SYuanhan Liu 		vdev->stats.tx++;
1150d19533e8SHuawei Xie 	}
1151d19533e8SHuawei Xie 
1152273ecdbcSYuanhan Liu 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1153273ecdbcSYuanhan Liu 		do_drain_mbuf_table(tx_q);
1154d19533e8SHuawei Xie }
1155d19533e8SHuawei Xie 
1157c0583d98SJerin Jacob static __rte_always_inline void
1158273ecdbcSYuanhan Liu drain_mbuf_table(struct mbuf_table *tx_q)
1159273ecdbcSYuanhan Liu {
1160273ecdbcSYuanhan Liu 	static uint64_t prev_tsc;
1161273ecdbcSYuanhan Liu 	uint64_t cur_tsc;
1162273ecdbcSYuanhan Liu 
1163273ecdbcSYuanhan Liu 	if (tx_q->len == 0)
1164d19533e8SHuawei Xie 		return;
1165273ecdbcSYuanhan Liu 
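	/*
	 * Flush the queued packets to the NIC when more than
	 * MBUF_TABLE_DRAIN_TSC cycles have passed since the last flush.
	 */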
1166273ecdbcSYuanhan Liu 	cur_tsc = rte_rdtsc();
1167273ecdbcSYuanhan Liu 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1168273ecdbcSYuanhan Liu 		prev_tsc = cur_tsc;
1169273ecdbcSYuanhan Liu 
11705d8f0bafSOlivier Matz 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1171273ecdbcSYuanhan Liu 			"TX queue drained after timeout with burst size %u\n",
1172273ecdbcSYuanhan Liu 			tx_q->len);
1173273ecdbcSYuanhan Liu 		do_drain_mbuf_table(tx_q);
1174d19533e8SHuawei Xie 	}
1175273ecdbcSYuanhan Liu }
1176273ecdbcSYuanhan Liu 
1177c0583d98SJerin Jacob static __rte_always_inline void
1178273ecdbcSYuanhan Liu drain_eth_rx(struct vhost_dev *vdev)
1179273ecdbcSYuanhan Liu {
1180273ecdbcSYuanhan Liu 	uint16_t rx_count, enqueue_count;
1181a68ba8e0SCheng Jiang 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1182273ecdbcSYuanhan Liu 
1183273ecdbcSYuanhan Liu 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1184273ecdbcSYuanhan Liu 				    pkts, MAX_PKT_BURST);
1185abec60e7SCheng Jiang 
1186273ecdbcSYuanhan Liu 	if (!rx_count)
1187273ecdbcSYuanhan Liu 		return;
1188273ecdbcSYuanhan Liu 
1189d19533e8SHuawei Xie 	/*
1190273ecdbcSYuanhan Liu 	 * When "enable_retry" is set, wait and retry when there are not
1191273ecdbcSYuanhan Liu 	 * enough free slots in the queue to hold @rx_count packets, to
1192273ecdbcSYuanhan Liu 	 * diminish packet loss.
1193273ecdbcSYuanhan Liu 	 */
1194273ecdbcSYuanhan Liu 	if (enable_retry &&
11954ecf22e3SYuanhan Liu 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1196273ecdbcSYuanhan Liu 			VIRTIO_RXQ))) {
1197273ecdbcSYuanhan Liu 		uint32_t retry;
1198273ecdbcSYuanhan Liu 
1199273ecdbcSYuanhan Liu 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1200273ecdbcSYuanhan Liu 			rte_delay_us(burst_rx_delay_time);
12014ecf22e3SYuanhan Liu 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1202273ecdbcSYuanhan Liu 					VIRTIO_RXQ))
1203273ecdbcSYuanhan Liu 				break;
1204273ecdbcSYuanhan Liu 		}
1205273ecdbcSYuanhan Liu 	}
1206273ecdbcSYuanhan Liu 
1207ca059fa5SYuanhan Liu 	if (builtin_net_driver) {
1208ca059fa5SYuanhan Liu 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1209ca059fa5SYuanhan Liu 						pkts, rx_count);
1210abec60e7SCheng Jiang 	} else if (async_vhost_driver) {
1211a68ba8e0SCheng Jiang 		uint16_t enqueue_fail = 0;
1212a68ba8e0SCheng Jiang 
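		/*
		 * Async path: reap finished DMA copies first, then submit
		 * this burst. Ownership of the enqueued mbufs stays with
		 * the async channel until their copies complete.
		 */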
1213a68ba8e0SCheng Jiang 		complete_async_pkts(vdev);
1214abec60e7SCheng Jiang 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1215abeb8652SJiayu Hu 					VIRTIO_RXQ, pkts, rx_count);
1216abeb8652SJiayu Hu 		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1217a68ba8e0SCheng Jiang 
1218a68ba8e0SCheng Jiang 		enqueue_fail = rx_count - enqueue_count;
1219a68ba8e0SCheng Jiang 		if (enqueue_fail)
1220a68ba8e0SCheng Jiang 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1221a68ba8e0SCheng Jiang 
1222ca059fa5SYuanhan Liu 	} else {
12234ecf22e3SYuanhan Liu 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1224273ecdbcSYuanhan Liu 						pkts, rx_count);
1225ca059fa5SYuanhan Liu 	}
1226abec60e7SCheng Jiang 
1227273ecdbcSYuanhan Liu 	if (enable_stats) {
1228a68ba8e0SCheng Jiang 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1229a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1230a68ba8e0SCheng Jiang 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1231a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1232273ecdbcSYuanhan Liu 	}
1233273ecdbcSYuanhan Liu 
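	/*
	 * In the sync paths the packets have already been copied into the
	 * guest ring, so the host-side mbufs can be freed right away; in
	 * the async path they are freed on DMA completion instead.
	 */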
1234abec60e7SCheng Jiang 	if (!async_vhost_driver)
1235273ecdbcSYuanhan Liu 		free_pkts(pkts, rx_count);
1236273ecdbcSYuanhan Liu }
1237273ecdbcSYuanhan Liu 
1238c0583d98SJerin Jacob static __rte_always_inline void
1239273ecdbcSYuanhan Liu drain_virtio_tx(struct vhost_dev *vdev)
1240273ecdbcSYuanhan Liu {
1241273ecdbcSYuanhan Liu 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1242273ecdbcSYuanhan Liu 	uint16_t count;
1243273ecdbcSYuanhan Liu 	uint16_t i;
1244273ecdbcSYuanhan Liu 
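	/* Dequeue a burst of packets from the guest's virtio Tx ring. */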
1245ca059fa5SYuanhan Liu 	if (builtin_net_driver) {
1246ca059fa5SYuanhan Liu 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1247273ecdbcSYuanhan Liu 					pkts, MAX_PKT_BURST);
1248ca059fa5SYuanhan Liu 	} else {
1249ca059fa5SYuanhan Liu 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1250ca059fa5SYuanhan Liu 					mbuf_pool, pkts, MAX_PKT_BURST);
1251ca059fa5SYuanhan Liu 	}
1252273ecdbcSYuanhan Liu 
1253273ecdbcSYuanhan Liu 	/* Learn the MAC from the first packet and set up VMDq for this device. */
1254273ecdbcSYuanhan Liu 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1255273ecdbcSYuanhan Liu 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1256273ecdbcSYuanhan Liu 			free_pkts(pkts, count);
1257273ecdbcSYuanhan Liu 	}
1258273ecdbcSYuanhan Liu 
12597f262239SYuanhan Liu 	for (i = 0; i < count; ++i)
1260e2a1dd12SYuanhan Liu 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1261273ecdbcSYuanhan Liu }
1262273ecdbcSYuanhan Liu 
1263273ecdbcSYuanhan Liu /*
1264273ecdbcSYuanhan Liu  * Main function of vhost-switch. It basically does:
1265273ecdbcSYuanhan Liu  *
1266273ecdbcSYuanhan Liu  * for each vhost device {
1267273ecdbcSYuanhan Liu  *    - drain_eth_rx()
1268273ecdbcSYuanhan Liu  *
1269273ecdbcSYuanhan Liu  *      Which drains the host eth Rx queue linked to the vhost device,
1270273ecdbcSYuanhan Liu  *      and delivers them all to the guest virtio Rx ring associated
1271273ecdbcSYuanhan Liu  *      with this vhost device.
1272273ecdbcSYuanhan Liu  *
1273273ecdbcSYuanhan Liu  *    - drain_virtio_tx()
1274273ecdbcSYuanhan Liu  *
1275273ecdbcSYuanhan Liu  *      Which drains the guest virtio Tx queue and delivers them all
1276273ecdbcSYuanhan Liu  *      to the target, which could be another vhost device or the
1277273ecdbcSYuanhan Liu  *      physical eth dev. The routing is done in "virtio_tx_route".
1278273ecdbcSYuanhan Liu  * }
1279d19533e8SHuawei Xie  */
1280d19533e8SHuawei Xie static int
1281273ecdbcSYuanhan Liu switch_worker(void *arg __rte_unused)
1282d19533e8SHuawei Xie {
1283273ecdbcSYuanhan Liu 	unsigned i;
1284273ecdbcSYuanhan Liu 	unsigned lcore_id = rte_lcore_id();
1285273ecdbcSYuanhan Liu 	struct vhost_dev *vdev;
1286d19533e8SHuawei Xie 	struct mbuf_table *tx_q;
1287d19533e8SHuawei Xie 
1288d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1289d19533e8SHuawei Xie 
1290d19533e8SHuawei Xie 	tx_q = &lcore_tx_queue[lcore_id];
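	/*
	 * Each worker uses the physical TX queue whose index matches its
	 * position in the enabled-lcore list.
	 */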
1291273ecdbcSYuanhan Liu 	for (i = 0; i < rte_lcore_count(); i++) {
1292d19533e8SHuawei Xie 		if (lcore_ids[i] == lcore_id) {
1293d19533e8SHuawei Xie 			tx_q->txq_id = i;
1294d19533e8SHuawei Xie 			break;
1295d19533e8SHuawei Xie 		}
1296d19533e8SHuawei Xie 	}
1297d19533e8SHuawei Xie 
1298d19533e8SHuawei Xie 	while (1) {
1299273ecdbcSYuanhan Liu 		drain_mbuf_table(tx_q);
1300a68ba8e0SCheng Jiang 		drain_vhost_table();
1301d19533e8SHuawei Xie 		/*
130245657a5cSYuanhan Liu 		 * If requested, inform the configuration core that we have
130345657a5cSYuanhan Liu 		 * exited the linked list and that no devices are in use.
1304d19533e8SHuawei Xie 		 */
130545657a5cSYuanhan Liu 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
130645657a5cSYuanhan Liu 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1307d19533e8SHuawei Xie 
1308d19533e8SHuawei Xie 		/*
1309273ecdbcSYuanhan Liu 		 * Process vhost devices
1310d19533e8SHuawei Xie 		 */
131197daf19eSYuanhan Liu 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
131297daf19eSYuanhan Liu 			      lcore_vdev_entry) {
1313364dddcdSHuawei Xie 			if (unlikely(vdev->remove)) {
1314e571e6b4SHuawei Xie 				unlink_vmdq(vdev);
1315e571e6b4SHuawei Xie 				vdev->ready = DEVICE_SAFE_REMOVE;
1316d19533e8SHuawei Xie 				continue;
1317d19533e8SHuawei Xie 			}
131845657a5cSYuanhan Liu 
1319273ecdbcSYuanhan Liu 			if (likely(vdev->ready == DEVICE_RX))
1320273ecdbcSYuanhan Liu 				drain_eth_rx(vdev);
1321d19533e8SHuawei Xie 
1322273ecdbcSYuanhan Liu 			if (likely(!vdev->remove))
1323273ecdbcSYuanhan Liu 				drain_virtio_tx(vdev);
1324d19533e8SHuawei Xie 		}
1325d19533e8SHuawei Xie 	}
1326d19533e8SHuawei Xie 
1327d19533e8SHuawei Xie 	return 0;
1328d19533e8SHuawei Xie }
1329d19533e8SHuawei Xie 
1330d19533e8SHuawei Xie /*
133145657a5cSYuanhan Liu  * Remove a device from the specific data core linked list and from the
133245657a5cSYuanhan Liu  * main linked list. Synchronization occurs through the use of the
133345657a5cSYuanhan Liu  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1334d19533e8SHuawei Xie  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1335d19533e8SHuawei Xie  */
1336d19533e8SHuawei Xie static void
13374ecf22e3SYuanhan Liu destroy_device(int vid)
1338d19533e8SHuawei Xie {
133916ae8abeSYuanhan Liu 	struct vhost_dev *vdev = NULL;
1340d19533e8SHuawei Xie 	int lcore;
1341a68ba8e0SCheng Jiang 	uint16_t i;
1342d19533e8SHuawei Xie 
134316ae8abeSYuanhan Liu 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
13444ecf22e3SYuanhan Liu 		if (vdev->vid == vid)
134516ae8abeSYuanhan Liu 			break;
134616ae8abeSYuanhan Liu 	}
134716ae8abeSYuanhan Liu 	if (!vdev)
134816ae8abeSYuanhan Liu 		return;
1349d19533e8SHuawei Xie 	/* Set the remove flag. */
1350e571e6b4SHuawei Xie 	vdev->remove = 1;
1351e571e6b4SHuawei Xie 	while (vdev->ready != DEVICE_SAFE_REMOVE) {
1352d19533e8SHuawei Xie 		rte_pause();
1353d19533e8SHuawei Xie 	}
1354d19533e8SHuawei Xie 
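	/*
	 * Free the per-lcore Tx buffers that new_device() allocated for
	 * this device.
	 */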
1355a68ba8e0SCheng Jiang 	for (i = 0; i < RTE_MAX_LCORE; i++)
1356a68ba8e0SCheng Jiang 		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1357a68ba8e0SCheng Jiang 
1358ca059fa5SYuanhan Liu 	if (builtin_net_driver)
1359ca059fa5SYuanhan Liu 		vs_vhost_net_remove(vdev);
1360ca059fa5SYuanhan Liu 
136197daf19eSYuanhan Liu 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
136297daf19eSYuanhan Liu 		     lcore_vdev_entry);
136397daf19eSYuanhan Liu 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
136497daf19eSYuanhan Liu 
1366d19533e8SHuawei Xie 	/* Set the dev_removal_flag on each lcore. */
1367cb056611SStephen Hemminger 	RTE_LCORE_FOREACH_WORKER(lcore)
136845657a5cSYuanhan Liu 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1369d19533e8SHuawei Xie 
1370d19533e8SHuawei Xie 	/*
137145657a5cSYuanhan Liu 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
137245657a5cSYuanhan Liu 	 * we can be sure that they can no longer access the device removed
137345657a5cSYuanhan Liu 	 * from the linked lists and that the devices are no longer in use.
1374d19533e8SHuawei Xie 	 */
1375cb056611SStephen Hemminger 	RTE_LCORE_FOREACH_WORKER(lcore) {
137645657a5cSYuanhan Liu 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1377d19533e8SHuawei Xie 			rte_pause();
1378d19533e8SHuawei Xie 	}
1379d19533e8SHuawei Xie 
138045657a5cSYuanhan Liu 	lcore_info[vdev->coreid].device_num--;
1381d19533e8SHuawei Xie 
138245657a5cSYuanhan Liu 	RTE_LOG(INFO, VHOST_DATA,
1383c08a3490SYuanhan Liu 		"(%d) device has been removed from data core\n",
1384e2a1dd12SYuanhan Liu 		vdev->vid);
1385d19533e8SHuawei Xie 
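	/*
	 * Drain all in-flight async copies before unregistering the DMA
	 * channel, so that no completion can land on the freed device.
	 */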
1386b9f23beeSCheng Jiang 	if (async_vhost_driver) {
1387b9f23beeSCheng Jiang 		uint16_t n_pkt = 0;
1388b9f23beeSCheng Jiang 		struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1389b9f23beeSCheng Jiang 
1390b9f23beeSCheng Jiang 		while (vdev->pkts_inflight) {
1391b9f23beeSCheng Jiang 			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1392b9f23beeSCheng Jiang 						m_cpl, vdev->pkts_inflight);
1393b9f23beeSCheng Jiang 			free_pkts(m_cpl, n_pkt);
1394b9f23beeSCheng Jiang 			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1395b9f23beeSCheng Jiang 		}
1396b9f23beeSCheng Jiang 
1397abec60e7SCheng Jiang 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1398b9f23beeSCheng Jiang 	}
1399abec60e7SCheng Jiang 
1400e571e6b4SHuawei Xie 	rte_free(vdev);
1401d19533e8SHuawei Xie }
1402d19533e8SHuawei Xie 
1403d19533e8SHuawei Xie /*
1404d19533e8SHuawei Xie  * A new device is added to a data core. First the device is added to the main linked list
140510b4270fSRami Rosen  * and then allocated to a specific data core.
1406d19533e8SHuawei Xie  */
1407d19533e8SHuawei Xie static int
14084ecf22e3SYuanhan Liu new_device(int vid)
1409d19533e8SHuawei Xie {
1410d19533e8SHuawei Xie 	int lcore, core_add = 0;
1411a68ba8e0SCheng Jiang 	uint16_t i;
1412d19533e8SHuawei Xie 	uint32_t device_num_min = num_devices;
1413e571e6b4SHuawei Xie 	struct vhost_dev *vdev;
1414fdf20fa7SSergio Gonzalez Monroy 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1415e571e6b4SHuawei Xie 	if (vdev == NULL) {
1416c08a3490SYuanhan Liu 		RTE_LOG(INFO, VHOST_DATA,
14177f262239SYuanhan Liu 			"(%d) couldn't allocate memory for vhost dev\n",
1418e2a1dd12SYuanhan Liu 			vid);
1419e571e6b4SHuawei Xie 		return -1;
1420e571e6b4SHuawei Xie 	}
1421e2a1dd12SYuanhan Liu 	vdev->vid = vid;
1422d19533e8SHuawei Xie 
1423a68ba8e0SCheng Jiang 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1424a68ba8e0SCheng Jiang 		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1425a68ba8e0SCheng Jiang 			= rte_zmalloc("vhost bufftable",
1426a68ba8e0SCheng Jiang 				sizeof(struct vhost_bufftable),
1427a68ba8e0SCheng Jiang 				RTE_CACHE_LINE_SIZE);
1428a68ba8e0SCheng Jiang 
1429a68ba8e0SCheng Jiang 		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1430a68ba8e0SCheng Jiang 			RTE_LOG(INFO, VHOST_DATA,
1431a68ba8e0SCheng Jiang 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1432a68ba8e0SCheng Jiang 			return -1;
1433a68ba8e0SCheng Jiang 		}
1434a68ba8e0SCheng Jiang 	}
1435a68ba8e0SCheng Jiang 
1436ca059fa5SYuanhan Liu 	if (builtin_net_driver)
1437ca059fa5SYuanhan Liu 		vs_vhost_net_setup(vdev);
1438ca059fa5SYuanhan Liu 
143997daf19eSYuanhan Liu 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
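	/* Each device gets its own VMDq Rx queue, derived from its vid. */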
1440e2a1dd12SYuanhan Liu 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1441d19533e8SHuawei Xie 
1442d19533e8SHuawei Xie 	/* Reset the ready flag. */
1443e571e6b4SHuawei Xie 	vdev->ready = DEVICE_MAC_LEARNING;
1444e571e6b4SHuawei Xie 	vdev->remove = 0;
1445d19533e8SHuawei Xie 
1446d19533e8SHuawei Xie 	/* Find a suitable lcore to add the device. */
1447cb056611SStephen Hemminger 	RTE_LCORE_FOREACH_WORKER(lcore) {
144845657a5cSYuanhan Liu 		if (lcore_info[lcore].device_num < device_num_min) {
144945657a5cSYuanhan Liu 			device_num_min = lcore_info[lcore].device_num;
1450d19533e8SHuawei Xie 			core_add = lcore;
1451d19533e8SHuawei Xie 		}
1452d19533e8SHuawei Xie 	}
1453e571e6b4SHuawei Xie 	vdev->coreid = core_add;
1454e571e6b4SHuawei Xie 
145597daf19eSYuanhan Liu 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
145697daf19eSYuanhan Liu 			  lcore_vdev_entry);
145745657a5cSYuanhan Liu 	lcore_info[vdev->coreid].device_num++;
1458d19533e8SHuawei Xie 
1459d19533e8SHuawei Xie 	/* Disable notifications. */
14604ecf22e3SYuanhan Liu 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
14614ecf22e3SYuanhan Liu 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1462d19533e8SHuawei Xie 
1463c08a3490SYuanhan Liu 	RTE_LOG(INFO, VHOST_DATA,
1464c08a3490SYuanhan Liu 		"(%d) device has been added to data core %d\n",
1465e2a1dd12SYuanhan Liu 		vid, vdev->coreid);
1466d19533e8SHuawei Xie 
1467abec60e7SCheng Jiang 	if (async_vhost_driver) {
1468acbc3888SJiayu Hu 		struct rte_vhost_async_config config = {0};
14696e9a9d2aSCheng Jiang 		struct rte_vhost_async_channel_ops channel_ops;
1470a68ba8e0SCheng Jiang 
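		/*
		 * Hook up the DMA callbacks provided by the example's ioat
		 * support code (see ioat.h) so that enqueue copies can be
		 * offloaded to hardware.
		 */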
147147afdbbeSCheng Jiang 		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
14726e9a9d2aSCheng Jiang 			channel_ops.transfer_data = ioat_transfer_data_cb;
14736e9a9d2aSCheng Jiang 			channel_ops.check_completed_copies =
14746e9a9d2aSCheng Jiang 				ioat_check_completed_copies_cb;
1475a68ba8e0SCheng Jiang 
1476acbc3888SJiayu Hu 			config.features = RTE_VHOST_ASYNC_INORDER;
1477a68ba8e0SCheng Jiang 
1478abec60e7SCheng Jiang 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1479acbc3888SJiayu Hu 				config, &channel_ops);
1480abec60e7SCheng Jiang 		}
14816e9a9d2aSCheng Jiang 	}
1482abec60e7SCheng Jiang 
1483d19533e8SHuawei Xie 	return 0;
1484d19533e8SHuawei Xie }
1485d19533e8SHuawei Xie 
1486b9f23beeSCheng Jiang static int
1487b9f23beeSCheng Jiang vring_state_changed(int vid, uint16_t queue_id, int enable)
1488b9f23beeSCheng Jiang {
1489b9f23beeSCheng Jiang 	struct vhost_dev *vdev = NULL;
1490b9f23beeSCheng Jiang 
1491b9f23beeSCheng Jiang 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1492b9f23beeSCheng Jiang 		if (vdev->vid == vid)
1493b9f23beeSCheng Jiang 			break;
1494b9f23beeSCheng Jiang 	}
1495b9f23beeSCheng Jiang 	if (!vdev)
1496b9f23beeSCheng Jiang 		return -1;
1497b9f23beeSCheng Jiang 
1498b9f23beeSCheng Jiang 	if (queue_id != VIRTIO_RXQ)
1499b9f23beeSCheng Jiang 		return 0;
1500b9f23beeSCheng Jiang 
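	/*
	 * When an async Rx queue is disabled, wait for all of its
	 * in-flight copies to complete before the ring is handed back.
	 */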
1501b9f23beeSCheng Jiang 	if (async_vhost_driver) {
1502b9f23beeSCheng Jiang 		if (!enable) {
1503b9f23beeSCheng Jiang 			uint16_t n_pkt = 0;
1504b9f23beeSCheng Jiang 			struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1505b9f23beeSCheng Jiang 
1506b9f23beeSCheng Jiang 			while (vdev->pkts_inflight) {
1507b9f23beeSCheng Jiang 				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1508b9f23beeSCheng Jiang 							m_cpl, vdev->pkts_inflight);
1509b9f23beeSCheng Jiang 				free_pkts(m_cpl, n_pkt);
1510b9f23beeSCheng Jiang 				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1511b9f23beeSCheng Jiang 			}
1512b9f23beeSCheng Jiang 		}
1513b9f23beeSCheng Jiang 	}
1514b9f23beeSCheng Jiang 
1515b9f23beeSCheng Jiang 	return 0;
1516b9f23beeSCheng Jiang }
1517b9f23beeSCheng Jiang 
1518d19533e8SHuawei Xie /*
1519d19533e8SHuawei Xie  * These callbacks allow devices to be added to the data core when
1520d19533e8SHuawei Xie  * configuration has been fully completed.
1521d19533e8SHuawei Xie  */
15227c129037SYuanhan Liu static const struct vhost_device_ops virtio_net_device_ops =
1523d19533e8SHuawei Xie {
1524d19533e8SHuawei Xie 	.new_device =  new_device,
1525d19533e8SHuawei Xie 	.destroy_device = destroy_device,
1526b9f23beeSCheng Jiang 	.vring_state_changed = vring_state_changed,
1527d19533e8SHuawei Xie };
1528d19533e8SHuawei Xie 
1529d19533e8SHuawei Xie /*
1530d19533e8SHuawei Xie  * This is a thread will wake up after a period to print stats if the user has
1531d19533e8SHuawei Xie  * This is a thread that wakes up periodically to print stats if the user has
1532d19533e8SHuawei Xie  */
1533fa204854SOlivier Matz static void *
1534fa204854SOlivier Matz print_stats(__rte_unused void *arg)
1535d19533e8SHuawei Xie {
153645657a5cSYuanhan Liu 	struct vhost_dev *vdev;
1537d19533e8SHuawei Xie 	uint64_t tx_dropped, rx_dropped;
1538d19533e8SHuawei Xie 	uint64_t tx, tx_total, rx, rx_total;
1539d19533e8SHuawei Xie 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1540d19533e8SHuawei Xie 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1541d19533e8SHuawei Xie 
1542d19533e8SHuawei Xie 	while (1) {
1543d19533e8SHuawei Xie 		sleep(enable_stats);
1544d19533e8SHuawei Xie 
1545d19533e8SHuawei Xie 		/* Clear screen and move to top left */
154656fe86f8SYuanhan Liu 		printf("%s%s\n", clr, top_left);
154756fe86f8SYuanhan Liu 		printf("Device statistics =================================\n");
1548d19533e8SHuawei Xie 
154997daf19eSYuanhan Liu 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
155056fe86f8SYuanhan Liu 			tx_total   = vdev->stats.tx_total;
155156fe86f8SYuanhan Liu 			tx         = vdev->stats.tx;
1552d19533e8SHuawei Xie 			tx_dropped = tx_total - tx;
155356fe86f8SYuanhan Liu 
1554a68ba8e0SCheng Jiang 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1555a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1556a68ba8e0SCheng Jiang 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1557a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1558d19533e8SHuawei Xie 			rx_dropped = rx_total - rx;
1559d19533e8SHuawei Xie 
1560c08a3490SYuanhan Liu 			printf("Statistics for device %d\n"
156156fe86f8SYuanhan Liu 				"-----------------------\n"
156256fe86f8SYuanhan Liu 				"TX total:              %" PRIu64 "\n"
156356fe86f8SYuanhan Liu 				"TX dropped:            %" PRIu64 "\n"
156456fe86f8SYuanhan Liu 				"TX successful:         %" PRIu64 "\n"
156556fe86f8SYuanhan Liu 				"RX total:              %" PRIu64 "\n"
156656fe86f8SYuanhan Liu 				"RX dropped:            %" PRIu64 "\n"
156756fe86f8SYuanhan Liu 				"RX successful:         %" PRIu64 "\n",
15684ecf22e3SYuanhan Liu 				vdev->vid,
156956fe86f8SYuanhan Liu 				tx_total, tx_dropped, tx,
157056fe86f8SYuanhan Liu 				rx_total, rx_dropped, rx);
1571d19533e8SHuawei Xie 		}
157256fe86f8SYuanhan Liu 
157356fe86f8SYuanhan Liu 		printf("===================================================\n");
15743ee6f706SGeorgiy Levashov 
15753ee6f706SGeorgiy Levashov 		fflush(stdout);
1576d19533e8SHuawei Xie 	}
1577fa204854SOlivier Matz 
1578fa204854SOlivier Matz 	return NULL;
1579d19533e8SHuawei Xie }
1580d19533e8SHuawei Xie 
1581ad0eef4dSJiayu Hu static void
1582ad0eef4dSJiayu Hu unregister_drivers(int socket_num)
1583ad0eef4dSJiayu Hu {
1584ad0eef4dSJiayu Hu 	int i, ret;
1585ad0eef4dSJiayu Hu 
1586ad0eef4dSJiayu Hu 	for (i = 0; i < socket_num; i++) {
1587ad0eef4dSJiayu Hu 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1588ad0eef4dSJiayu Hu 		if (ret != 0)
1589ad0eef4dSJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG,
1590ad0eef4dSJiayu Hu 				"Fail to unregister vhost driver for %s.\n",
1591ad0eef4dSJiayu Hu 				socket_files + i * PATH_MAX);
1592ad0eef4dSJiayu Hu 	}
1593ad0eef4dSJiayu Hu }
1594ad0eef4dSJiayu Hu 
1595c83d2d00SOuyang Changchun /* When we receive a SIGINT, unregister the vhost driver. */
1596c83d2d00SOuyang Changchun static void
1597c83d2d00SOuyang Changchun sigint_handler(__rte_unused int signum)
1598c83d2d00SOuyang Changchun {
1599c83d2d00SOuyang Changchun 	/* Unregister vhost driver. */
1600ad0eef4dSJiayu Hu 	unregister_drivers(nb_sockets);
1601ad0eef4dSJiayu Hu 
1602c83d2d00SOuyang Changchun 	exit(0);
1603c83d2d00SOuyang Changchun }
1604d19533e8SHuawei Xie 
1605d19533e8SHuawei Xie /*
1606bdb19b77SYuanhan Liu  * While creating an mbuf pool, one key thing is to figure out how
1607bdb19b77SYuanhan Liu  * many mbuf entries are enough for our use. FYI, here are some
1608bdb19b77SYuanhan Liu  * guidelines:
1609bdb19b77SYuanhan Liu  *
1610bdb19b77SYuanhan Liu  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1611bdb19b77SYuanhan Liu  *
1612bdb19b77SYuanhan Liu  * - For each switch core (a CPU core that does the packet switching),
1613bdb19b77SYuanhan Liu  *   we also need to reserve some mbufs for receiving the packets from
1614bdb19b77SYuanhan Liu  *   the virtio Tx queue. How many is enough depends on the usage. It's
1615bdb19b77SYuanhan Liu  *   normally a simple calculation like the following:
1616bdb19b77SYuanhan Liu  *
1617bdb19b77SYuanhan Liu  *       MAX_PKT_BURST * max packet size / mbuf size
1618bdb19b77SYuanhan Liu  *
1619bdb19b77SYuanhan Liu  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1620bdb19b77SYuanhan Liu  *
1621bdb19b77SYuanhan Liu  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1622bdb19b77SYuanhan Liu  *   mbufs for receiving the packets from the physical NIC device.
1623bdb19b77SYuanhan Liu  *
1624bdb19b77SYuanhan Liu  * - We also need to make sure, for each switch core, we have allocated
1625bdb19b77SYuanhan Liu  *   enough mbufs to fill up the mbuf cache.
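 *
 * As a rough worked example (assuming MAX_PKT_BURST is 32 and the
 * default 2176-byte mbuf): with TSO enabled, the per-switch-core part
 * comes to about (64K + 2176) * 32 / 2048 = 1058 mbufs, on top of the
 * @nr_rx_desc and mbuf-cache reservations.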
1626bdb19b77SYuanhan Liu  */
1627bdb19b77SYuanhan Liu static void
1628bdb19b77SYuanhan Liu create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1629bdb19b77SYuanhan Liu 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1630bdb19b77SYuanhan Liu {
1631bdb19b77SYuanhan Liu 	uint32_t nr_mbufs;
1632bdb19b77SYuanhan Liu 	uint32_t nr_mbufs_per_core;
1633bdb19b77SYuanhan Liu 	uint32_t mtu = 1500;
1634bdb19b77SYuanhan Liu 
1635bdb19b77SYuanhan Liu 	if (mergeable)
1636bdb19b77SYuanhan Liu 		mtu = 9000;
1637bdb19b77SYuanhan Liu 	if (enable_tso)
1638bdb19b77SYuanhan Liu 		mtu = 64 * 1024;
1639bdb19b77SYuanhan Liu 
1640bdb19b77SYuanhan Liu 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
164112ee45a3SYong Wang 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1642bdb19b77SYuanhan Liu 	nr_mbufs_per_core += nr_rx_desc;
1643bdb19b77SYuanhan Liu 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1644bdb19b77SYuanhan Liu 
1645bdb19b77SYuanhan Liu 	nr_mbufs  = nr_queues * nr_rx_desc;
1646bdb19b77SYuanhan Liu 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1647bdb19b77SYuanhan Liu 	nr_mbufs *= nr_port;
1648bdb19b77SYuanhan Liu 
1649bdb19b77SYuanhan Liu 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1650bdb19b77SYuanhan Liu 					    nr_mbuf_cache, 0, mbuf_size,
1651bdb19b77SYuanhan Liu 					    rte_socket_id());
1652bdb19b77SYuanhan Liu 	if (mbuf_pool == NULL)
1653bdb19b77SYuanhan Liu 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1654bdb19b77SYuanhan Liu }
1655bdb19b77SYuanhan Liu 
1656bdb19b77SYuanhan Liu /*
1657164a601bSYuanhan Liu  * Main function. It does initialisation and calls the per-lcore functions.
1658d19533e8SHuawei Xie  */
1659d19533e8SHuawei Xie int
166098a16481SDavid Marchand main(int argc, char *argv[])
1661d19533e8SHuawei Xie {
1662d19533e8SHuawei Xie 	unsigned lcore_id, core_id = 0;
1663d19533e8SHuawei Xie 	unsigned nb_ports, valid_num_ports;
1664ad0eef4dSJiayu Hu 	int ret, i;
1665f8244c63SZhiyong Yang 	uint16_t portid;
1666d19533e8SHuawei Xie 	static pthread_t tid;
1667ca7036b4SDavid Marchand 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1668d19533e8SHuawei Xie 
1669c83d2d00SOuyang Changchun 	signal(SIGINT, sigint_handler);
1670c83d2d00SOuyang Changchun 
1671d19533e8SHuawei Xie 	/* init EAL */
1672d19533e8SHuawei Xie 	ret = rte_eal_init(argc, argv);
1673d19533e8SHuawei Xie 	if (ret < 0)
1674d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1675d19533e8SHuawei Xie 	argc -= ret;
1676d19533e8SHuawei Xie 	argv += ret;
1677d19533e8SHuawei Xie 
1678d19533e8SHuawei Xie 	/* parse app arguments */
1679d19533e8SHuawei Xie 	ret = us_vhost_parse_args(argc, argv);
1680d19533e8SHuawei Xie 	if (ret < 0)
1681d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1682d19533e8SHuawei Xie 
1683b3bee7d8SYong Wang 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
168445657a5cSYuanhan Liu 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
168545657a5cSYuanhan Liu 
1686d19533e8SHuawei Xie 		if (rte_lcore_is_enabled(lcore_id))
1687d19533e8SHuawei Xie 			lcore_ids[core_id++] = lcore_id;
1688b3bee7d8SYong Wang 	}
1689d19533e8SHuawei Xie 
1690d19533e8SHuawei Xie 	if (rte_lcore_count() > RTE_MAX_LCORE)
1691d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1692d19533e8SHuawei Xie 
1693d19533e8SHuawei Xie 	/* Get the number of physical ports. */
1694d9a42a69SThomas Monjalon 	nb_ports = rte_eth_dev_count_avail();
1695d19533e8SHuawei Xie 
1696d19533e8SHuawei Xie 	/*
1697d19533e8SHuawei Xie 	 * Update the global var NUM_PORTS and global array PORTS
1698d19533e8SHuawei Xie 	 * and get value of var VALID_NUM_PORTS according to system ports number
1699d19533e8SHuawei Xie 	 */
1700d19533e8SHuawei Xie 	valid_num_ports = check_ports_num(nb_ports);
1701d19533e8SHuawei Xie 
1702d19533e8SHuawei Xie 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1703d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1704d19533e8SHuawei Xie 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1705d19533e8SHuawei Xie 		return -1;
1706d19533e8SHuawei Xie 	}
1707d19533e8SHuawei Xie 
1708bdb19b77SYuanhan Liu 	/*
1709bdb19b77SYuanhan Liu 	 * FIXME: here we are trying to allocate mbufs big enough for
1710bdb19b77SYuanhan Liu 	 * @MAX_QUEUES, but the truth is we're never going to use that
1711bdb19b77SYuanhan Liu 	 * many queues here. We probably should only do allocation for
1712bdb19b77SYuanhan Liu 	 * those queues we are going to use.
1713bdb19b77SYuanhan Liu 	 */
1714bdb19b77SYuanhan Liu 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1715bdb19b77SYuanhan Liu 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1716d19533e8SHuawei Xie 
1717d19533e8SHuawei Xie 	if (vm2vm_mode == VM2VM_HARDWARE) {
1718d19533e8SHuawei Xie 		/* Enable VT loop back to let L2 switch to do it. */
1719d19533e8SHuawei Xie 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
17201f49ec15SThomas Monjalon 		RTE_LOG(DEBUG, VHOST_CONFIG,
1721d19533e8SHuawei Xie 			"Enable loop back for L2 switch in vmdq.\n");
1722d19533e8SHuawei Xie 	}
1723d19533e8SHuawei Xie 
1724d19533e8SHuawei Xie 	/* initialize all ports */
17258728ccf3SThomas Monjalon 	RTE_ETH_FOREACH_DEV(portid) {
1726d19533e8SHuawei Xie 		/* skip ports that are not enabled */
1727d19533e8SHuawei Xie 		if ((enabled_port_mask & (1 << portid)) == 0) {
1728d19533e8SHuawei Xie 			RTE_LOG(INFO, VHOST_PORT,
1729d19533e8SHuawei Xie 				"Skipping disabled port %d\n", portid);
1730d19533e8SHuawei Xie 			continue;
1731d19533e8SHuawei Xie 		}
1732d19533e8SHuawei Xie 		if (port_init(portid) != 0)
1733d19533e8SHuawei Xie 			rte_exit(EXIT_FAILURE,
1734d19533e8SHuawei Xie 				"Cannot initialize network ports\n");
1735d19533e8SHuawei Xie 	}
1736d19533e8SHuawei Xie 
1737d19533e8SHuawei Xie 	/* Enable stats if the user option is set. */
173867b6d303SRavi Kerur 	if (enable_stats) {
1739fa204854SOlivier Matz 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1740fa204854SOlivier Matz 					print_stats, NULL);
1741fa204854SOlivier Matz 		if (ret < 0)
174267b6d303SRavi Kerur 			rte_exit(EXIT_FAILURE,
174367b6d303SRavi Kerur 				"Cannot create print-stats thread\n");
174467b6d303SRavi Kerur 	}
1745d19533e8SHuawei Xie 
1746d19533e8SHuawei Xie 	/* Launch all data cores. */
1747cb056611SStephen Hemminger 	RTE_LCORE_FOREACH_WORKER(lcore_id)
174868363d85SYuanhan Liu 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1749d19533e8SHuawei Xie 
17502345e3beSYuanhan Liu 	if (client_mode)
17512345e3beSYuanhan Liu 		flags |= RTE_VHOST_USER_CLIENT;
17522345e3beSYuanhan Liu 
1753bde19a4dSJiayu Hu 	/* Register vhost user driver to handle vhost messages. */
1754ad0eef4dSJiayu Hu 	for (i = 0; i < nb_sockets; i++) {
17550917f9d1SYuanhan Liu 		char *file = socket_files + i * PATH_MAX;
1756a68ba8e0SCheng Jiang 
1757abec60e7SCheng Jiang 		if (async_vhost_driver)
1758abec60e7SCheng Jiang 			flags |= RTE_VHOST_USER_ASYNC_COPY;
1759abec60e7SCheng Jiang 
17600917f9d1SYuanhan Liu 		ret = rte_vhost_driver_register(file, flags);
1761ad0eef4dSJiayu Hu 		if (ret != 0) {
1762ad0eef4dSJiayu Hu 			unregister_drivers(i);
1763ad0eef4dSJiayu Hu 			rte_exit(EXIT_FAILURE,
1764ad0eef4dSJiayu Hu 				"vhost driver register failure.\n");
1765ad0eef4dSJiayu Hu 		}
1766ca059fa5SYuanhan Liu 
1767ca059fa5SYuanhan Liu 		if (builtin_net_driver)
1768ca059fa5SYuanhan Liu 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1769ca059fa5SYuanhan Liu 
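		/*
		 * Mask out features we do not want the vhost backend to
		 * advertise to the guest during virtio feature negotiation.
		 */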
17700917f9d1SYuanhan Liu 		if (mergeable == 0) {
17710917f9d1SYuanhan Liu 			rte_vhost_driver_disable_features(file,
17720917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
17730917f9d1SYuanhan Liu 		}
17740917f9d1SYuanhan Liu 
17750917f9d1SYuanhan Liu 		if (enable_tx_csum == 0) {
17760917f9d1SYuanhan Liu 			rte_vhost_driver_disable_features(file,
17770917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_CSUM);
17780917f9d1SYuanhan Liu 		}
17790917f9d1SYuanhan Liu 
17800917f9d1SYuanhan Liu 		if (enable_tso == 0) {
17810917f9d1SYuanhan Liu 			rte_vhost_driver_disable_features(file,
17820917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_HOST_TSO4);
17830917f9d1SYuanhan Liu 			rte_vhost_driver_disable_features(file,
17840917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_HOST_TSO6);
17850917f9d1SYuanhan Liu 			rte_vhost_driver_disable_features(file,
17860917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
17870917f9d1SYuanhan Liu 			rte_vhost_driver_disable_features(file,
17880917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
17890917f9d1SYuanhan Liu 		}
17900917f9d1SYuanhan Liu 
17910917f9d1SYuanhan Liu 		if (promiscuous) {
17920917f9d1SYuanhan Liu 			rte_vhost_driver_enable_features(file,
17930917f9d1SYuanhan Liu 				1ULL << VIRTIO_NET_F_CTRL_RX);
17940917f9d1SYuanhan Liu 		}
1795d19533e8SHuawei Xie 
179693433b63SYuanhan Liu 		ret = rte_vhost_driver_callback_register(file,
179793433b63SYuanhan Liu 			&virtio_net_device_ops);
179893433b63SYuanhan Liu 		if (ret != 0) {
179993433b63SYuanhan Liu 			rte_exit(EXIT_FAILURE,
180093433b63SYuanhan Liu 				"failed to register vhost driver callbacks.\n");
180193433b63SYuanhan Liu 		}
1802af147591SYuanhan Liu 
1803af147591SYuanhan Liu 		if (rte_vhost_driver_start(file) < 0) {
1804af147591SYuanhan Liu 			rte_exit(EXIT_FAILURE,
1805af147591SYuanhan Liu 				"failed to start vhost driver.\n");
1806af147591SYuanhan Liu 		}
180793433b63SYuanhan Liu 	}
1808d19533e8SHuawei Xie 
1809cb056611SStephen Hemminger 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1810af147591SYuanhan Liu 		rte_eal_wait_lcore(lcore_id);
1811af147591SYuanhan Liu 
181210aa3757SChengchang Tang 	/* clean up the EAL */
181310aa3757SChengchang Tang 	rte_eal_cleanup();
1814d19533e8SHuawei Xie 
181510aa3757SChengchang Tang 	return 0;
1816d19533e8SHuawei Xie }
1817