xref: /dpdk/examples/vhost/main.c (revision 917229c24e871bbc3225a0227eb3f0faaa7aaa69)
13998e2a0SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause
23998e2a0SBruce Richardson  * Copyright(c) 2010-2017 Intel Corporation
3d19533e8SHuawei Xie  */
4d19533e8SHuawei Xie 
5d19533e8SHuawei Xie #include <arpa/inet.h>
6d19533e8SHuawei Xie #include <getopt.h>
7d19533e8SHuawei Xie #include <linux/if_ether.h>
8d19533e8SHuawei Xie #include <linux/if_vlan.h>
9d19533e8SHuawei Xie #include <linux/virtio_net.h>
10d19533e8SHuawei Xie #include <linux/virtio_ring.h>
11d19533e8SHuawei Xie #include <signal.h>
12d19533e8SHuawei Xie #include <stdint.h>
13d19533e8SHuawei Xie #include <sys/eventfd.h>
14d19533e8SHuawei Xie #include <sys/param.h>
15d19533e8SHuawei Xie #include <unistd.h>
16d19533e8SHuawei Xie 
17d19533e8SHuawei Xie #include <rte_cycles.h>
18d19533e8SHuawei Xie #include <rte_ethdev.h>
19d19533e8SHuawei Xie #include <rte_log.h>
20d19533e8SHuawei Xie #include <rte_string_fns.h>
21d19533e8SHuawei Xie #include <rte_malloc.h>
22ca7036b4SDavid Marchand #include <rte_net.h>
23a798beb4SYuanhan Liu #include <rte_vhost.h>
24691693c6SJijiang Liu #include <rte_ip.h>
259fd72e3cSJijiang Liu #include <rte_tcp.h>
26577329e6SJerin Jacob #include <rte_pause.h>
2753d3f477SJiayu Hu #include <rte_dmadev.h>
2853d3f477SJiayu Hu #include <rte_vhost_async.h>
29d19533e8SHuawei Xie 
30d19533e8SHuawei Xie #include "main.h"
31d19533e8SHuawei Xie 
32f17eb179SBernard Iremonger #ifndef MAX_QUEUES
33f17eb179SBernard Iremonger #define MAX_QUEUES 128
34f17eb179SBernard Iremonger #endif
35d19533e8SHuawei Xie 
36*917229c2SWenwu Ma #define NUM_MBUFS_DEFAULT 0x24000
37*917229c2SWenwu Ma 
38d19533e8SHuawei Xie /* the maximum number of external ports supported */
39d19533e8SHuawei Xie #define MAX_SUP_PORTS 1
40d19533e8SHuawei Xie 
41d19533e8SHuawei Xie #define MBUF_CACHE_SIZE	128
42824cb29cSKonstantin Ananyev #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
43d19533e8SHuawei Xie 
44d19533e8SHuawei Xie #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
45d19533e8SHuawei Xie 
46d19533e8SHuawei Xie #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
47d19533e8SHuawei Xie #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
48d19533e8SHuawei Xie 
49d19533e8SHuawei Xie #define JUMBO_FRAME_MAX_SIZE    0x2600
501bb4a528SFerruh Yigit #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
51d19533e8SHuawei Xie 
52d19533e8SHuawei Xie /* State of virtio device. */
53d19533e8SHuawei Xie #define DEVICE_MAC_LEARNING 0
54d19533e8SHuawei Xie #define DEVICE_RX			1
55d19533e8SHuawei Xie #define DEVICE_SAFE_REMOVE	2
56d19533e8SHuawei Xie 
57d19533e8SHuawei Xie /* Configurable number of RX/TX ring descriptors */
58d19533e8SHuawei Xie #define RTE_TEST_RX_DESC_DEFAULT 1024
59d19533e8SHuawei Xie #define RTE_TEST_TX_DESC_DEFAULT 512
60d19533e8SHuawei Xie 
61d19533e8SHuawei Xie #define INVALID_PORT_ID 0xFF
6253d3f477SJiayu Hu #define INVALID_DMA_ID -1
6353d3f477SJiayu Hu 
6453d3f477SJiayu Hu #define DMA_RING_SIZE 4096
6553d3f477SJiayu Hu 
66*917229c2SWenwu Ma /* number of mbufs in all pools - if specified on command-line. */
67*917229c2SWenwu Ma static int total_num_mbufs = NUM_MBUFS_DEFAULT;
68*917229c2SWenwu Ma 
6953d3f477SJiayu Hu struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
7053d3f477SJiayu Hu int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
7153d3f477SJiayu Hu static int dma_count;
72d19533e8SHuawei Xie 
73d19533e8SHuawei Xie /* mask of enabled ports */
74d19533e8SHuawei Xie static uint32_t enabled_port_mask = 0;
75d19533e8SHuawei Xie 
7690924cafSOuyang Changchun /* Promiscuous mode */
7790924cafSOuyang Changchun static uint32_t promiscuous;
7890924cafSOuyang Changchun 
79d19533e8SHuawei Xie /* number of devices/queues to support*/
80d19533e8SHuawei Xie static uint32_t num_queues = 0;
81a981294bSHuawei Xie static uint32_t num_devices;
82d19533e8SHuawei Xie 
8368363d85SYuanhan Liu static struct rte_mempool *mbuf_pool;
8428deb020SHuawei Xie static int mergeable;
85d19533e8SHuawei Xie 
86d19533e8SHuawei Xie /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
87d19533e8SHuawei Xie typedef enum {
88d19533e8SHuawei Xie 	VM2VM_DISABLED = 0,
89d19533e8SHuawei Xie 	VM2VM_SOFTWARE = 1,
90d19533e8SHuawei Xie 	VM2VM_HARDWARE = 2,
91d19533e8SHuawei Xie 	VM2VM_LAST
92d19533e8SHuawei Xie } vm2vm_type;
93d19533e8SHuawei Xie static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
94d19533e8SHuawei Xie 
95d19533e8SHuawei Xie /* Enable stats. */
96d19533e8SHuawei Xie static uint32_t enable_stats = 0;
97d19533e8SHuawei Xie /* Enable retries on RX. */
98d19533e8SHuawei Xie static uint32_t enable_retry = 1;
999fd72e3cSJijiang Liu 
1009fd72e3cSJijiang Liu /* Disable TX checksum offload */
1019fd72e3cSJijiang Liu static uint32_t enable_tx_csum;
1029fd72e3cSJijiang Liu 
1039fd72e3cSJijiang Liu /* Disable TSO offload */
1049fd72e3cSJijiang Liu static uint32_t enable_tso;
1059fd72e3cSJijiang Liu 
1062345e3beSYuanhan Liu static int client_mode;
1072345e3beSYuanhan Liu 
108ca059fa5SYuanhan Liu static int builtin_net_driver;
109ca059fa5SYuanhan Liu 
110d19533e8SHuawei Xie /* Specify timeout (in useconds) between retries on RX. */
111d19533e8SHuawei Xie static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
112d19533e8SHuawei Xie /* Specify the number of retries on RX. */
113d19533e8SHuawei Xie static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
114d19533e8SHuawei Xie 
115ad0eef4dSJiayu Hu /* Socket file paths. Can be set by user */
116ad0eef4dSJiayu Hu static char *socket_files;
117ad0eef4dSJiayu Hu static int nb_sockets;
118d19533e8SHuawei Xie 
1197be78d02SJosh Soref /* empty VMDq configuration structure. Filled in programmatically */
/* Base port configuration; the VMDQ pool map under rx_adv_conf is
 * overridden per-port by get_eth_conf(). */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NIC such as I350,
		 * this fixes bug of ipv4 forwarding in guest can't
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};
154d19533e8SHuawei Xie 
155cc22d8caSShahaf Shuler 
156d19533e8SHuawei Xie static unsigned lcore_ids[RTE_MAX_LCORE];
157f8244c63SZhiyong Yang static uint16_t ports[RTE_MAX_ETHPORTS];
158d19533e8SHuawei Xie static unsigned num_ports = 0; /**< The number of ports specified in command line */
15984b02d16SHuawei Xie static uint16_t num_pf_queues, num_vmdq_queues;
16084b02d16SHuawei Xie static uint16_t vmdq_pool_base, vmdq_queue_base;
16184b02d16SHuawei Xie static uint16_t queues_per_pool;
162d19533e8SHuawei Xie 
/* One VLAN tag per VMDQ pool: get_eth_conf() maps vlan_tags[i] to pool i. */
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
173d19533e8SHuawei Xie 
174d19533e8SHuawei Xie /* ethernet addresses of ports */
1756d13ea8eSOlivier Matz static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
176d19533e8SHuawei Xie 
17745657a5cSYuanhan Liu static struct vhost_dev_tailq_list vhost_dev_list =
17845657a5cSYuanhan Liu 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
179d19533e8SHuawei Xie 
180d19533e8SHuawei Xie static struct lcore_info lcore_info[RTE_MAX_LCORE];
181d19533e8SHuawei Xie 
/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;		/* number of mbufs currently held in m_table */
	unsigned txq_id;	/* NIC Tx queue index this burst targets */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};
188d19533e8SHuawei Xie 
struct vhost_bufftable {
	uint32_t len;		/* number of mbufs currently held in m_table */
	/* TSC timestamp — presumably of the previous flush, compared against
	 * MBUF_TABLE_DRAIN_TSC by the drain logic; confirm in callers. */
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};
194a68ba8e0SCheng Jiang 
195d19533e8SHuawei Xie /* TX queue for each data core. */
196d19533e8SHuawei Xie struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
197d19533e8SHuawei Xie 
198a68ba8e0SCheng Jiang /*
199a68ba8e0SCheng Jiang  * Vhost TX buffer for each data core.
200a68ba8e0SCheng Jiang  * Every data core maintains a TX buffer for every vhost device,
201a68ba8e0SCheng Jiang  * which is used for batch pkts enqueue for higher performance.
202a68ba8e0SCheng Jiang  */
20353d3f477SJiayu Hu struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
204a68ba8e0SCheng Jiang 
205273ecdbcSYuanhan Liu #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
206273ecdbcSYuanhan Liu 				 / US_PER_S * BURST_TX_DRAIN_US)
207d19533e8SHuawei Xie 
20853d3f477SJiayu Hu static inline bool
20953d3f477SJiayu Hu is_dma_configured(int16_t dev_id)
21053d3f477SJiayu Hu {
21153d3f477SJiayu Hu 	int i;
21253d3f477SJiayu Hu 
21353d3f477SJiayu Hu 	for (i = 0; i < dma_count; i++)
21453d3f477SJiayu Hu 		if (dmas_id[i] == dev_id)
21553d3f477SJiayu Hu 			return true;
21653d3f477SJiayu Hu 	return false;
21753d3f477SJiayu Hu }
21853d3f477SJiayu Hu 
2193a04ecb2SCheng Jiang static inline int
2203a04ecb2SCheng Jiang open_dma(const char *value)
2213a04ecb2SCheng Jiang {
22253d3f477SJiayu Hu 	struct dma_for_vhost *dma_info = dma_bind;
22353d3f477SJiayu Hu 	char *input = strndup(value, strlen(value) + 1);
22453d3f477SJiayu Hu 	char *addrs = input;
22553d3f477SJiayu Hu 	char *ptrs[2];
22653d3f477SJiayu Hu 	char *start, *end, *substr;
22753d3f477SJiayu Hu 	int64_t vid;
2283a04ecb2SCheng Jiang 
22953d3f477SJiayu Hu 	struct rte_dma_info info;
23053d3f477SJiayu Hu 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
23153d3f477SJiayu Hu 	struct rte_dma_vchan_conf qconf = {
23253d3f477SJiayu Hu 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
23353d3f477SJiayu Hu 		.nb_desc = DMA_RING_SIZE
23453d3f477SJiayu Hu 	};
23553d3f477SJiayu Hu 
23653d3f477SJiayu Hu 	int dev_id;
23753d3f477SJiayu Hu 	int ret = 0;
23853d3f477SJiayu Hu 	uint16_t i = 0;
23953d3f477SJiayu Hu 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
24053d3f477SJiayu Hu 	int args_nr;
24153d3f477SJiayu Hu 
24253d3f477SJiayu Hu 	while (isblank(*addrs))
24353d3f477SJiayu Hu 		addrs++;
24453d3f477SJiayu Hu 	if (*addrs == '\0') {
24553d3f477SJiayu Hu 		ret = -1;
24653d3f477SJiayu Hu 		goto out;
24753d3f477SJiayu Hu 	}
24853d3f477SJiayu Hu 
24953d3f477SJiayu Hu 	/* process DMA devices within bracket. */
25053d3f477SJiayu Hu 	addrs++;
25153d3f477SJiayu Hu 	substr = strtok(addrs, ";]");
25253d3f477SJiayu Hu 	if (!substr) {
25353d3f477SJiayu Hu 		ret = -1;
25453d3f477SJiayu Hu 		goto out;
25553d3f477SJiayu Hu 	}
25653d3f477SJiayu Hu 
25753d3f477SJiayu Hu 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
25853d3f477SJiayu Hu 	if (args_nr <= 0) {
25953d3f477SJiayu Hu 		ret = -1;
26053d3f477SJiayu Hu 		goto out;
26153d3f477SJiayu Hu 	}
26253d3f477SJiayu Hu 
26353d3f477SJiayu Hu 	while (i < args_nr) {
26453d3f477SJiayu Hu 		char *arg_temp = dma_arg[i];
26553d3f477SJiayu Hu 		uint8_t sub_nr;
26653d3f477SJiayu Hu 
26753d3f477SJiayu Hu 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
26853d3f477SJiayu Hu 		if (sub_nr != 2) {
26953d3f477SJiayu Hu 			ret = -1;
27053d3f477SJiayu Hu 			goto out;
27153d3f477SJiayu Hu 		}
27253d3f477SJiayu Hu 
27353d3f477SJiayu Hu 		start = strstr(ptrs[0], "txd");
27453d3f477SJiayu Hu 		if (start == NULL) {
27553d3f477SJiayu Hu 			ret = -1;
27653d3f477SJiayu Hu 			goto out;
27753d3f477SJiayu Hu 		}
27853d3f477SJiayu Hu 
27953d3f477SJiayu Hu 		start += 3;
28053d3f477SJiayu Hu 		vid = strtol(start, &end, 0);
28153d3f477SJiayu Hu 		if (end == start) {
28253d3f477SJiayu Hu 			ret = -1;
28353d3f477SJiayu Hu 			goto out;
28453d3f477SJiayu Hu 		}
28553d3f477SJiayu Hu 
28653d3f477SJiayu Hu 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
28753d3f477SJiayu Hu 		if (dev_id < 0) {
28853d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
28953d3f477SJiayu Hu 			ret = -1;
29053d3f477SJiayu Hu 			goto out;
29153d3f477SJiayu Hu 		}
29253d3f477SJiayu Hu 
29353d3f477SJiayu Hu 		/* DMA device is already configured, so skip */
29453d3f477SJiayu Hu 		if (is_dma_configured(dev_id))
29553d3f477SJiayu Hu 			goto done;
29653d3f477SJiayu Hu 
29753d3f477SJiayu Hu 		if (rte_dma_info_get(dev_id, &info) != 0) {
29853d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
29953d3f477SJiayu Hu 			ret = -1;
30053d3f477SJiayu Hu 			goto out;
30153d3f477SJiayu Hu 		}
30253d3f477SJiayu Hu 
30353d3f477SJiayu Hu 		if (info.max_vchans < 1) {
30453d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
30553d3f477SJiayu Hu 			ret = -1;
30653d3f477SJiayu Hu 			goto out;
30753d3f477SJiayu Hu 		}
30853d3f477SJiayu Hu 
30953d3f477SJiayu Hu 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
31053d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
31153d3f477SJiayu Hu 			ret = -1;
31253d3f477SJiayu Hu 			goto out;
31353d3f477SJiayu Hu 		}
31453d3f477SJiayu Hu 
31553d3f477SJiayu Hu 		/* Check the max desc supported by DMA device */
31653d3f477SJiayu Hu 		rte_dma_info_get(dev_id, &info);
31753d3f477SJiayu Hu 		if (info.nb_vchans != 1) {
31853d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
31953d3f477SJiayu Hu 					dev_id);
32053d3f477SJiayu Hu 			ret = -1;
32153d3f477SJiayu Hu 			goto out;
32253d3f477SJiayu Hu 		}
32353d3f477SJiayu Hu 
32453d3f477SJiayu Hu 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
32553d3f477SJiayu Hu 
32653d3f477SJiayu Hu 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
32753d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
32853d3f477SJiayu Hu 			ret = -1;
32953d3f477SJiayu Hu 			goto out;
33053d3f477SJiayu Hu 		}
33153d3f477SJiayu Hu 
33253d3f477SJiayu Hu 		if (rte_dma_start(dev_id) != 0) {
33353d3f477SJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
33453d3f477SJiayu Hu 			ret = -1;
33553d3f477SJiayu Hu 			goto out;
33653d3f477SJiayu Hu 		}
33753d3f477SJiayu Hu 
33853d3f477SJiayu Hu 		dmas_id[dma_count++] = dev_id;
33953d3f477SJiayu Hu 
34053d3f477SJiayu Hu done:
34153d3f477SJiayu Hu 		(dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
34253d3f477SJiayu Hu 		i++;
34353d3f477SJiayu Hu 	}
34453d3f477SJiayu Hu out:
34553d3f477SJiayu Hu 	free(input);
34653d3f477SJiayu Hu 	return ret;
3473a04ecb2SCheng Jiang }
3483a04ecb2SCheng Jiang 
349d19533e8SHuawei Xie /*
350d19533e8SHuawei Xie  * Builds up the correct configuration for VMDQ VLAN pool map
351d19533e8SHuawei Xie  * according to the pool & queue limits.
352d19533e8SHuawei Xie  */
353d19533e8SHuawei Xie static inline int
354d19533e8SHuawei Xie get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
355d19533e8SHuawei Xie {
356d19533e8SHuawei Xie 	struct rte_eth_vmdq_rx_conf conf;
35790924cafSOuyang Changchun 	struct rte_eth_vmdq_rx_conf *def_conf =
35890924cafSOuyang Changchun 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
359d19533e8SHuawei Xie 	unsigned i;
360d19533e8SHuawei Xie 
361d19533e8SHuawei Xie 	memset(&conf, 0, sizeof(conf));
362d19533e8SHuawei Xie 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
363d19533e8SHuawei Xie 	conf.nb_pool_maps = num_devices;
36490924cafSOuyang Changchun 	conf.enable_loop_back = def_conf->enable_loop_back;
36590924cafSOuyang Changchun 	conf.rx_mode = def_conf->rx_mode;
366d19533e8SHuawei Xie 
367d19533e8SHuawei Xie 	for (i = 0; i < conf.nb_pool_maps; i++) {
368d19533e8SHuawei Xie 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
369d19533e8SHuawei Xie 		conf.pool_map[i].pools = (1UL << i);
370d19533e8SHuawei Xie 	}
371d19533e8SHuawei Xie 
372d19533e8SHuawei Xie 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
373d19533e8SHuawei Xie 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
374d19533e8SHuawei Xie 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
375d19533e8SHuawei Xie 	return 0;
376d19533e8SHuawei Xie }
377d19533e8SHuawei Xie 
378d19533e8SHuawei Xie /*
379d19533e8SHuawei Xie  * Initialises a given port using global settings and with the rx buffers
380d19533e8SHuawei Xie  * coming from the mbuf_pool passed as parameter
381d19533e8SHuawei Xie  */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	/* Start from the device's default queue configs; drop packets when
	 * no Rx descriptors are available rather than backing up the ring. */
	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/*configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/* one Tx queue per lcore */
	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		/* Mergeable Rx buffers: raise the port MTU, capped by the
		 * device's reported max_mtu when it constrains max_rx_pktlen. */
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues.  */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* enable fast mbuf free only when the port advertises it */
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	/* let the driver clamp the requested descriptor counts to its limits */
	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q ++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q ++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval  = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	/* cache the port MAC address for later use */
	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}
525d19533e8SHuawei Xie 
526d19533e8SHuawei Xie /*
527bde19a4dSJiayu Hu  * Set socket file path.
528d19533e8SHuawei Xie  */
529d19533e8SHuawei Xie static int
530bde19a4dSJiayu Hu us_vhost_parse_socket_path(const char *q_arg)
531d19533e8SHuawei Xie {
532d79035b7STiwei Bie 	char *old;
533d79035b7STiwei Bie 
534d19533e8SHuawei Xie 	/* parse number string */
535fa81d3b9SGang Jiang 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
536d19533e8SHuawei Xie 		return -1;
537ad0eef4dSJiayu Hu 
538d79035b7STiwei Bie 	old = socket_files;
539ad0eef4dSJiayu Hu 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
540d79035b7STiwei Bie 	if (socket_files == NULL) {
541d79035b7STiwei Bie 		free(old);
542d79035b7STiwei Bie 		return -1;
543d79035b7STiwei Bie 	}
544d79035b7STiwei Bie 
545f9acaf84SBruce Richardson 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
546ad0eef4dSJiayu Hu 	nb_sockets++;
547d19533e8SHuawei Xie 
548d19533e8SHuawei Xie 	return 0;
549d19533e8SHuawei Xie }
550d19533e8SHuawei Xie 
551d19533e8SHuawei Xie /*
552d19533e8SHuawei Xie  * Parse the portmask provided at run time.
553d19533e8SHuawei Xie  */
/*
 * Parse the hexadecimal port mask supplied on the command line.
 * Returns the parsed mask, or 0 when the string is empty or not a
 * valid hexadecimal number.
 */
static int
parse_portmask(const char *portmask)
{
	unsigned long mask;
	char *endptr = NULL;

	errno = 0;

	/* interpret the argument as a hexadecimal string */
	mask = strtoul(portmask, &endptr, 16);
	if (portmask[0] == '\0' || endptr == NULL || *endptr != '\0' || errno != 0)
		return 0;

	return mask;
}
570d19533e8SHuawei Xie 
571d19533e8SHuawei Xie /*
572d19533e8SHuawei Xie  * Parse num options at run time.
573d19533e8SHuawei Xie  */
/*
 * Parse an unsigned decimal option value.
 * Returns the parsed number, or -1 when the string is empty, not a
 * valid decimal number, or the value exceeds @max_valid_value.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	unsigned long val;
	char *endptr = NULL;

	errno = 0;

	/* interpret the argument as an unsigned decimal string */
	val = strtoul(q_arg, &endptr, 10);
	if (q_arg[0] == '\0' || endptr == NULL || *endptr != '\0' || errno != 0)
		return -1;

	return (val > max_valid_value) ? -1 : (int)val;
}
593d19533e8SHuawei Xie 
594d19533e8SHuawei Xie /*
595d19533e8SHuawei Xie  * Display usage
596d19533e8SHuawei Xie  */
static void
us_vhost_usage(const char *prgname)
{
	/* prgname is argv[0]; keep this text in sync with the long_option[]
	 * table in us_vhost_parse_args(). */
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segment offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dmas register dma channel for specific vhost device.\n"
	"		--total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
	       prgname);
}
620d19533e8SHuawei Xie 
/*
 * Identifiers for the long-only command-line options.  Each OPT_* macro
 * holds the option's name string and the adjacent *_NUM enumerator is the
 * value getopt_long() returns for it.  Numbering starts at 256 so the
 * values cannot collide with single-character (short) option letters.
 */
enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
#define OPT_NUM_MBUFS           "total-num-mbufs"
	OPT_NUM_MBUFS_NUM,
};
649965b06f0SIbtisam Tariq 
/*
 * Parse the arguments given in the command line of the application.
 *
 * Each recognised option updates one of the file-scope configuration
 * globals (enabled_port_mask, vm2vm_mode, retry/stats knobs, ...).
 * On any invalid option the usage text is printed and -1 is returned;
 * 0 is returned on success.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	/* Map "--long-option" names to the OPT_*_NUM IDs defined above. */
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{OPT_NUM_MBUFS, required_argument,
				NULL, OPT_NUM_MBUFS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		/* Promiscuous mode: accept broadcast/multicast on the VMDQ pools. */
		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		/* VM-to-VM forwarding mode: 0=disabled, 1=software, 2=hardware. */
		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		/* Enable/disable retries on RX when the guest ring is full. */
		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		/* Enable/disable TX checksum offload. */
		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		/* Enable/disable TCP segmentation offload. */
		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		/* Delay (in usecs) between RX retry attempts. */
		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		/* Maximum number of RX retry attempts. */
		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		/* Enable/disable mergeable RX buffers. */
		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		/* Statistics print interval; 0 disables the stats thread output. */
		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
				"Invalid argument for socket name (Max %d characters)\n",
				PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		/* Register DMA channels for async vhost devices. */
		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		/* Mbuf pool size; only ever raised above the built-in default. */
		case OPT_NUM_MBUFS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for total-num-mbufs [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}

			if (total_num_mbufs < ret)
				total_num_mbufs = ret;
			break;

		/* Run vhost-user in client mode (connect instead of listen). */
		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		/* Use the example's built-in virtio-net datapath. */
		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	/* Translate the port bitmask into the ports[] array. */
	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}
855d19533e8SHuawei Xie 
856d19533e8SHuawei Xie /*
857d19533e8SHuawei Xie  * Update the global var NUM_PORTS and array PORTS according to system ports number
858d19533e8SHuawei Xie  * and return valid ports number
859d19533e8SHuawei Xie  */
860d19533e8SHuawei Xie static unsigned check_ports_num(unsigned nb_ports)
861d19533e8SHuawei Xie {
862d19533e8SHuawei Xie 	unsigned valid_num_ports = num_ports;
863d19533e8SHuawei Xie 	unsigned portid;
864d19533e8SHuawei Xie 
865d19533e8SHuawei Xie 	if (num_ports > nb_ports) {
866d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
867d19533e8SHuawei Xie 			num_ports, nb_ports);
868d19533e8SHuawei Xie 		num_ports = nb_ports;
869d19533e8SHuawei Xie 	}
870d19533e8SHuawei Xie 
871d19533e8SHuawei Xie 	for (portid = 0; portid < num_ports; portid ++) {
872a9dbe180SThomas Monjalon 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
873a9dbe180SThomas Monjalon 			RTE_LOG(INFO, VHOST_PORT,
874a9dbe180SThomas Monjalon 				"\nSpecified port ID(%u) is not valid\n",
875a9dbe180SThomas Monjalon 				ports[portid]);
876d19533e8SHuawei Xie 			ports[portid] = INVALID_PORT_ID;
877d19533e8SHuawei Xie 			valid_num_ports--;
878d19533e8SHuawei Xie 		}
879d19533e8SHuawei Xie 	}
880d19533e8SHuawei Xie 	return valid_num_ports;
881d19533e8SHuawei Xie }
882d19533e8SHuawei Xie 
883c0583d98SJerin Jacob static __rte_always_inline struct vhost_dev *
8846d13ea8eSOlivier Matz find_vhost_dev(struct rte_ether_addr *mac)
88545657a5cSYuanhan Liu {
88645657a5cSYuanhan Liu 	struct vhost_dev *vdev;
88745657a5cSYuanhan Liu 
88897daf19eSYuanhan Liu 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
88945657a5cSYuanhan Liu 		if (vdev->ready == DEVICE_RX &&
890538da7a1SOlivier Matz 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
89145657a5cSYuanhan Liu 			return vdev;
89245657a5cSYuanhan Liu 	}
89345657a5cSYuanhan Liu 
89445657a5cSYuanhan Liu 	return NULL;
89545657a5cSYuanhan Liu }
89645657a5cSYuanhan Liu 
897d19533e8SHuawei Xie /*
898d19533e8SHuawei Xie  * This function learns the MAC address of the device and registers this along with a
899d19533e8SHuawei Xie  * vlan tag to a VMDQ.
900d19533e8SHuawei Xie  */
901d19533e8SHuawei Xie static int
902e571e6b4SHuawei Xie link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
903d19533e8SHuawei Xie {
9046d13ea8eSOlivier Matz 	struct rte_ether_hdr *pkt_hdr;
905d19533e8SHuawei Xie 	int i, ret;
906d19533e8SHuawei Xie 
907d19533e8SHuawei Xie 	/* Learn MAC address of guest device from packet */
9086d13ea8eSOlivier Matz 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
909d19533e8SHuawei Xie 
91004d43857SDmitry Kozlyuk 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
91145657a5cSYuanhan Liu 		RTE_LOG(ERR, VHOST_DATA,
912c08a3490SYuanhan Liu 			"(%d) device is using a registered MAC!\n",
913e2a1dd12SYuanhan Liu 			vdev->vid);
914d19533e8SHuawei Xie 		return -1;
915d19533e8SHuawei Xie 	}
916d19533e8SHuawei Xie 
91735b2d13fSOlivier Matz 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
91804d43857SDmitry Kozlyuk 		vdev->mac_address.addr_bytes[i] =
91904d43857SDmitry Kozlyuk 			pkt_hdr->src_addr.addr_bytes[i];
920d19533e8SHuawei Xie 
921d19533e8SHuawei Xie 	/* vlan_tag currently uses the device_id. */
922e2a1dd12SYuanhan Liu 	vdev->vlan_tag = vlan_tags[vdev->vid];
923d19533e8SHuawei Xie 
924d19533e8SHuawei Xie 	/* Print out VMDQ registration info. */
925c08a3490SYuanhan Liu 	RTE_LOG(INFO, VHOST_DATA,
926c2c4f87bSAman Deep Singh 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
927a7db3afcSAman Deep Singh 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
928e571e6b4SHuawei Xie 		vdev->vlan_tag);
929d19533e8SHuawei Xie 
930d19533e8SHuawei Xie 	/* Register the MAC address. */
93184b02d16SHuawei Xie 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
932e2a1dd12SYuanhan Liu 				(uint32_t)vdev->vid + vmdq_pool_base);
933d19533e8SHuawei Xie 	if (ret)
934c08a3490SYuanhan Liu 		RTE_LOG(ERR, VHOST_DATA,
935c08a3490SYuanhan Liu 			"(%d) failed to add device MAC address to VMDQ\n",
936e2a1dd12SYuanhan Liu 			vdev->vid);
937d19533e8SHuawei Xie 
93865453928SJianfeng Tan 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
939d19533e8SHuawei Xie 
940d19533e8SHuawei Xie 	/* Set device as ready for RX. */
941e571e6b4SHuawei Xie 	vdev->ready = DEVICE_RX;
942d19533e8SHuawei Xie 
943d19533e8SHuawei Xie 	return 0;
944d19533e8SHuawei Xie }
945d19533e8SHuawei Xie 
946d19533e8SHuawei Xie /*
947d19533e8SHuawei Xie  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
948d19533e8SHuawei Xie  * queue before disabling RX on the device.
949d19533e8SHuawei Xie  */
950d19533e8SHuawei Xie static inline void
951e571e6b4SHuawei Xie unlink_vmdq(struct vhost_dev *vdev)
952d19533e8SHuawei Xie {
953d19533e8SHuawei Xie 	unsigned i = 0;
954d19533e8SHuawei Xie 	unsigned rx_count;
955d19533e8SHuawei Xie 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
956d19533e8SHuawei Xie 
957e571e6b4SHuawei Xie 	if (vdev->ready == DEVICE_RX) {
958d19533e8SHuawei Xie 		/*clear MAC and VLAN settings*/
959e571e6b4SHuawei Xie 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
960d19533e8SHuawei Xie 		for (i = 0; i < 6; i++)
961e571e6b4SHuawei Xie 			vdev->mac_address.addr_bytes[i] = 0;
962d19533e8SHuawei Xie 
963e571e6b4SHuawei Xie 		vdev->vlan_tag = 0;
964d19533e8SHuawei Xie 
965d19533e8SHuawei Xie 		/*Clear out the receive buffers*/
966d19533e8SHuawei Xie 		rx_count = rte_eth_rx_burst(ports[0],
967e571e6b4SHuawei Xie 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
968d19533e8SHuawei Xie 
969d19533e8SHuawei Xie 		while (rx_count) {
970d19533e8SHuawei Xie 			for (i = 0; i < rx_count; i++)
971d19533e8SHuawei Xie 				rte_pktmbuf_free(pkts_burst[i]);
972d19533e8SHuawei Xie 
973d19533e8SHuawei Xie 			rx_count = rte_eth_rx_burst(ports[0],
974e571e6b4SHuawei Xie 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
975d19533e8SHuawei Xie 		}
976d19533e8SHuawei Xie 
977e571e6b4SHuawei Xie 		vdev->ready = DEVICE_MAC_LEARNING;
978d19533e8SHuawei Xie 	}
979d19533e8SHuawei Xie }
980d19533e8SHuawei Xie 
981a68ba8e0SCheng Jiang static inline void
982a68ba8e0SCheng Jiang free_pkts(struct rte_mbuf **pkts, uint16_t n)
983a68ba8e0SCheng Jiang {
984a68ba8e0SCheng Jiang 	while (n--)
985a68ba8e0SCheng Jiang 		rte_pktmbuf_free(pkts[n]);
986a68ba8e0SCheng Jiang }
987a68ba8e0SCheng Jiang 
/*
 * Reap packets whose async (DMA-assisted) enqueue to the guest has
 * finished: free the completed mbufs and decrement the device's
 * in-flight counter accordingly.
 */
static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	/* DMA channel bound to this device's RX queue. */
	int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		/* pkts_inflight is read from other lcores, hence atomic. */
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}

}
1003a68ba8e0SCheng Jiang 
1004a68ba8e0SCheng Jiang static __rte_always_inline void
1005a68ba8e0SCheng Jiang sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
10069c5ef512SYuanhan Liu 	    struct rte_mbuf *m)
10079c5ef512SYuanhan Liu {
10089c5ef512SYuanhan Liu 	uint16_t ret;
10099c5ef512SYuanhan Liu 
1010ca059fa5SYuanhan Liu 	if (builtin_net_driver) {
1011ca059fa5SYuanhan Liu 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1012ca059fa5SYuanhan Liu 	} else {
10134ecf22e3SYuanhan Liu 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1014ca059fa5SYuanhan Liu 	}
1015ca059fa5SYuanhan Liu 
10169c5ef512SYuanhan Liu 	if (enable_stats) {
1017a68ba8e0SCheng Jiang 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1018a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1019a68ba8e0SCheng Jiang 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1020a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
102156fe86f8SYuanhan Liu 		src_vdev->stats.tx_total++;
102256fe86f8SYuanhan Liu 		src_vdev->stats.tx += ret;
10239c5ef512SYuanhan Liu 	}
10249c5ef512SYuanhan Liu }
10259c5ef512SYuanhan Liu 
/*
 * Flush this lcore's buffered TX packets into vdev's RX virtqueue,
 * choosing the built-in driver, the async (DMA) path or the plain
 * synchronous enqueue depending on configuration.
 */
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	/* One TX buffer table per (lcore, device) pair. */
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t enqueue_fail = 0;
		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

		/* Reap previously completed async transfers first. */
		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);

		/* Packets the async path refused are dropped here. */
		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	/*
	 * In the async case the mbufs stay in flight until completion; in
	 * the sync cases they were copied and can be freed immediately.
	 */
	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}
1062a68ba8e0SCheng Jiang 
/*
 * Periodic flush of this lcore's per-device TX buffers: any buffer that
 * has not been drained for MBUF_TABLE_DRAIN_TSC cycles is pushed to its
 * device so partially filled bursts do not sit forever.
 */
static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		/* Skip devices that are being torn down. */
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}
1089a68ba8e0SCheng Jiang 
/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that devices RX queue. If not then return.
 *
 * Returns -1 when the destination MAC is not a local vhost device (the
 * caller must route it externally) and 0 when the packet was consumed
 * here (buffered, or dropped for same-MAC / device-removal cases).
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	/* A packet addressed back to its own device is dropped. */
	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	/* Buffer the packet in this lcore's table for the destination. */
	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	/* Flush as soon as a full burst has accumulated. */
	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}
1138d19533e8SHuawei Xie 
1139d19533e8SHuawei Xie /*
114072ec8d77SOuyang Changchun  * Check if the destination MAC of a packet is one local VM,
114172ec8d77SOuyang Changchun  * and get its vlan tag, and offset if it is.
1142d19533e8SHuawei Xie  */
1143c0583d98SJerin Jacob static __rte_always_inline int
11447f262239SYuanhan Liu find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
114572ec8d77SOuyang Changchun 	uint32_t *offset, uint16_t *vlan_tag)
1146d19533e8SHuawei Xie {
114745657a5cSYuanhan Liu 	struct vhost_dev *dst_vdev;
11486d13ea8eSOlivier Matz 	struct rte_ether_hdr *pkt_hdr =
11496d13ea8eSOlivier Matz 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1150d19533e8SHuawei Xie 
115104d43857SDmitry Kozlyuk 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
115245657a5cSYuanhan Liu 	if (!dst_vdev)
115345657a5cSYuanhan Liu 		return 0;
115445657a5cSYuanhan Liu 
1155e2a1dd12SYuanhan Liu 	if (vdev->vid == dst_vdev->vid) {
11565d8f0bafSOlivier Matz 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1157c08a3490SYuanhan Liu 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1158e2a1dd12SYuanhan Liu 			vdev->vid);
115972ec8d77SOuyang Changchun 		return -1;
1160d19533e8SHuawei Xie 	}
1161e44fb8a4SOuyang Changchun 
1162e44fb8a4SOuyang Changchun 	/*
1163e44fb8a4SOuyang Changchun 	 * HW vlan strip will reduce the packet length
1164e44fb8a4SOuyang Changchun 	 * by minus length of vlan tag, so need restore
1165e44fb8a4SOuyang Changchun 	 * the packet length by plus it.
1166e44fb8a4SOuyang Changchun 	 */
116725cf2630SFerruh Yigit 	*offset  = RTE_VLAN_HLEN;
1168e2a1dd12SYuanhan Liu 	*vlan_tag = vlan_tags[vdev->vid];
1169d19533e8SHuawei Xie 
11705d8f0bafSOlivier Matz 	RTE_LOG_DP(DEBUG, VHOST_DATA,
11717f262239SYuanhan Liu 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1172e2a1dd12SYuanhan Liu 		vdev->vid, dst_vdev->vid, *vlan_tag);
1173d19533e8SHuawei Xie 
117472ec8d77SOuyang Changchun 	return 0;
117572ec8d77SOuyang Changchun }
117672ec8d77SOuyang Changchun 
/*
 * Prepare a TSO packet for NIC offload: parse the headers to fill the
 * mbuf's l2/l3/l4 lengths, set the TX offload flags, and precompute the
 * pseudo-header TCP checksum as the checksum-offload API requires.
 */
static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	/* Let the generic parser determine the header layout. */
	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		/* HW recomputes the IP checksum; it must start from zero. */
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}
12069fd72e3cSJijiang Liu 
1207c0583d98SJerin Jacob static __rte_always_inline void
1208273ecdbcSYuanhan Liu do_drain_mbuf_table(struct mbuf_table *tx_q)
1209273ecdbcSYuanhan Liu {
1210273ecdbcSYuanhan Liu 	uint16_t count;
1211273ecdbcSYuanhan Liu 
1212273ecdbcSYuanhan Liu 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1213273ecdbcSYuanhan Liu 				 tx_q->m_table, tx_q->len);
1214273ecdbcSYuanhan Liu 	if (unlikely(count < tx_q->len))
1215273ecdbcSYuanhan Liu 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1216273ecdbcSYuanhan Liu 
1217273ecdbcSYuanhan Liu 	tx_q->len = 0;
1218273ecdbcSYuanhan Liu }
1219273ecdbcSYuanhan Liu 
/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	/* Broadcast frames are replicated to every other vhost device. */
	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/*check if destination is local VM*/
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	/* In HW vm2vm mode local traffic loops through the NIC's VMDQ. */
	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/*Add packet to the port tx queue*/
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		/* In HW vm2vm mode force the tag expected by the VMDQ pool. */
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		/* No tag present: ask the NIC to insert it on TX. */
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	/* LRO-merged packets need TSO-style offload setup before TX. */
	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	/* Flush the table once a full burst has accumulated. */
	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}
1309d19533e8SHuawei Xie 
1310d19533e8SHuawei Xie 
1311c0583d98SJerin Jacob static __rte_always_inline void
1312273ecdbcSYuanhan Liu drain_mbuf_table(struct mbuf_table *tx_q)
1313273ecdbcSYuanhan Liu {
1314273ecdbcSYuanhan Liu 	static uint64_t prev_tsc;
1315273ecdbcSYuanhan Liu 	uint64_t cur_tsc;
1316273ecdbcSYuanhan Liu 
1317273ecdbcSYuanhan Liu 	if (tx_q->len == 0)
1318d19533e8SHuawei Xie 		return;
1319273ecdbcSYuanhan Liu 
1320273ecdbcSYuanhan Liu 	cur_tsc = rte_rdtsc();
1321273ecdbcSYuanhan Liu 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1322273ecdbcSYuanhan Liu 		prev_tsc = cur_tsc;
1323273ecdbcSYuanhan Liu 
13245d8f0bafSOlivier Matz 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1325273ecdbcSYuanhan Liu 			"TX queue drained after timeout with burst size %u\n",
1326273ecdbcSYuanhan Liu 			tx_q->len);
1327273ecdbcSYuanhan Liu 		do_drain_mbuf_table(tx_q);
1328d19533e8SHuawei Xie 	}
1329273ecdbcSYuanhan Liu }
1330273ecdbcSYuanhan Liu 
/*
 * Receive a burst from the physical port's VMDq RX queue dedicated to
 * @vdev and enqueue it into the guest's virtio RX ring, via the builtin
 * net driver, an async DMA channel, or the plain sync vhost enqueue,
 * depending on configuration.
 */
static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count
	 * packets, to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t enqueue_fail = 0;
		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

		/* Reap previously submitted copies before submitting more. */
		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);

		/* Packets the async channel did not accept are dropped here. */
		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	/*
	 * Sync paths copy data during enqueue, so the mbufs can be freed
	 * right away. On the async path they remain in flight and are
	 * presumably released on completion (see complete_async_pkts) —
	 * NOTE(review): confirm against the async completion handler.
	 */
	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}
1392273ecdbcSYuanhan Liu 
1393c0583d98SJerin Jacob static __rte_always_inline void
1394273ecdbcSYuanhan Liu drain_virtio_tx(struct vhost_dev *vdev)
1395273ecdbcSYuanhan Liu {
1396273ecdbcSYuanhan Liu 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1397273ecdbcSYuanhan Liu 	uint16_t count;
1398273ecdbcSYuanhan Liu 	uint16_t i;
1399273ecdbcSYuanhan Liu 
1400ca059fa5SYuanhan Liu 	if (builtin_net_driver) {
1401ca059fa5SYuanhan Liu 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1402273ecdbcSYuanhan Liu 					pkts, MAX_PKT_BURST);
1403ca059fa5SYuanhan Liu 	} else {
1404ca059fa5SYuanhan Liu 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1405ca059fa5SYuanhan Liu 					mbuf_pool, pkts, MAX_PKT_BURST);
1406ca059fa5SYuanhan Liu 	}
1407273ecdbcSYuanhan Liu 
1408273ecdbcSYuanhan Liu 	/* setup VMDq for the first packet */
1409273ecdbcSYuanhan Liu 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1410273ecdbcSYuanhan Liu 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1411273ecdbcSYuanhan Liu 			free_pkts(pkts, count);
1412273ecdbcSYuanhan Liu 	}
1413273ecdbcSYuanhan Liu 
14147f262239SYuanhan Liu 	for (i = 0; i < count; ++i)
1415e2a1dd12SYuanhan Liu 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1416273ecdbcSYuanhan Liu }
1417273ecdbcSYuanhan Liu 
1418273ecdbcSYuanhan Liu /*
1419273ecdbcSYuanhan Liu  * Main function of vhost-switch. It basically does:
1420273ecdbcSYuanhan Liu  *
1421273ecdbcSYuanhan Liu  * for each vhost device {
1422273ecdbcSYuanhan Liu  *    - drain_eth_rx()
1423273ecdbcSYuanhan Liu  *
1424273ecdbcSYuanhan Liu  *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of them to the guest virtio Rx ring associated with
1426273ecdbcSYuanhan Liu  *      this vhost device.
1427273ecdbcSYuanhan Liu  *
1428273ecdbcSYuanhan Liu  *    - drain_virtio_tx()
1429273ecdbcSYuanhan Liu  *
 *      Which drains the guest virtio Tx queue and delivers all of them
1431273ecdbcSYuanhan Liu  *      to the target, which could be another vhost device, or the
1432273ecdbcSYuanhan Liu  *      physical eth dev. The route is done in function "virtio_tx_route".
1433273ecdbcSYuanhan Liu  * }
1434d19533e8SHuawei Xie  */
1435d19533e8SHuawei Xie static int
1436273ecdbcSYuanhan Liu switch_worker(void *arg __rte_unused)
1437d19533e8SHuawei Xie {
1438273ecdbcSYuanhan Liu 	unsigned i;
1439273ecdbcSYuanhan Liu 	unsigned lcore_id = rte_lcore_id();
1440273ecdbcSYuanhan Liu 	struct vhost_dev *vdev;
1441d19533e8SHuawei Xie 	struct mbuf_table *tx_q;
1442d19533e8SHuawei Xie 
14437be78d02SJosh Soref 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1444d19533e8SHuawei Xie 
1445d19533e8SHuawei Xie 	tx_q = &lcore_tx_queue[lcore_id];
1446273ecdbcSYuanhan Liu 	for (i = 0; i < rte_lcore_count(); i++) {
1447d19533e8SHuawei Xie 		if (lcore_ids[i] == lcore_id) {
1448d19533e8SHuawei Xie 			tx_q->txq_id = i;
1449d19533e8SHuawei Xie 			break;
1450d19533e8SHuawei Xie 		}
1451d19533e8SHuawei Xie 	}
1452d19533e8SHuawei Xie 
1453d19533e8SHuawei Xie 	while(1) {
1454273ecdbcSYuanhan Liu 		drain_mbuf_table(tx_q);
1455a68ba8e0SCheng Jiang 		drain_vhost_table();
1456d19533e8SHuawei Xie 		/*
145745657a5cSYuanhan Liu 		 * Inform the configuration core that we have exited the
145845657a5cSYuanhan Liu 		 * linked list and that no devices are in use if requested.
1459d19533e8SHuawei Xie 		 */
146045657a5cSYuanhan Liu 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
146145657a5cSYuanhan Liu 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1462d19533e8SHuawei Xie 
1463d19533e8SHuawei Xie 		/*
1464273ecdbcSYuanhan Liu 		 * Process vhost devices
1465d19533e8SHuawei Xie 		 */
146697daf19eSYuanhan Liu 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
146797daf19eSYuanhan Liu 			      lcore_vdev_entry) {
1468364dddcdSHuawei Xie 			if (unlikely(vdev->remove)) {
1469e571e6b4SHuawei Xie 				unlink_vmdq(vdev);
1470e571e6b4SHuawei Xie 				vdev->ready = DEVICE_SAFE_REMOVE;
1471d19533e8SHuawei Xie 				continue;
1472d19533e8SHuawei Xie 			}
147345657a5cSYuanhan Liu 
1474273ecdbcSYuanhan Liu 			if (likely(vdev->ready == DEVICE_RX))
1475273ecdbcSYuanhan Liu 				drain_eth_rx(vdev);
1476d19533e8SHuawei Xie 
1477273ecdbcSYuanhan Liu 			if (likely(!vdev->remove))
1478273ecdbcSYuanhan Liu 				drain_virtio_tx(vdev);
1479d19533e8SHuawei Xie 		}
1480d19533e8SHuawei Xie 	}
1481d19533e8SHuawei Xie 
1482d19533e8SHuawei Xie 	return 0;
1483d19533e8SHuawei Xie }
1484d19533e8SHuawei Xie 
1485d19533e8SHuawei Xie /*
148645657a5cSYuanhan Liu  * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
148845657a5cSYuanhan Liu  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1489d19533e8SHuawei Xie  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1490d19533e8SHuawei Xie  */
/*
 * vhost destroy-device callback: unlinks @vid from the data-core and
 * global device lists, waits for all workers to stop referencing it,
 * drains any in-flight async DMA copies, and frees its resources.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	/* Look up the vhost_dev wrapping this vid. */
	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/*set the remove flag. */
	vdev->remove = 1;
	/* Wait for the owning worker to unlink the device from VMDq. */
	while(vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	/* Release the per-lcore TX buffer tables allocated in new_device(). */
	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	/* Drain and tear down the async (DMA) channel, if one was bound. */
	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t n_pkt = 0;
		int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
		/* NOTE(review): VLA sized by pkts_inflight — a zero-length
		 * array if nothing is in flight, which C forbids; confirm
		 * pkts_inflight > 0 is guaranteed or the VLA is benign here. */
		struct rte_mbuf *m_cpl[vdev->pkts_inflight];

		while (vdev->pkts_inflight) {
			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
						m_cpl, vdev->pkts_inflight, dma_id, 0);
			free_pkts(m_cpl, n_pkt);
			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
		}

		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	rte_free(vdev);
}
1559d19533e8SHuawei Xie 
1560d19533e8SHuawei Xie /*
1561d19533e8SHuawei Xie  * A new device is added to a data core. First the device is added to the main linked list
156210b4270fSRami Rosen  * and then allocated to a specific data core.
1563d19533e8SHuawei Xie  */
1564d19533e8SHuawei Xie static int
15654ecf22e3SYuanhan Liu new_device(int vid)
1566d19533e8SHuawei Xie {
1567d19533e8SHuawei Xie 	int lcore, core_add = 0;
1568a68ba8e0SCheng Jiang 	uint16_t i;
1569d19533e8SHuawei Xie 	uint32_t device_num_min = num_devices;
1570e571e6b4SHuawei Xie 	struct vhost_dev *vdev;
1571fdf20fa7SSergio Gonzalez Monroy 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1572e571e6b4SHuawei Xie 	if (vdev == NULL) {
1573c08a3490SYuanhan Liu 		RTE_LOG(INFO, VHOST_DATA,
15747f262239SYuanhan Liu 			"(%d) couldn't allocate memory for vhost dev\n",
1575e2a1dd12SYuanhan Liu 			vid);
1576e571e6b4SHuawei Xie 		return -1;
1577e571e6b4SHuawei Xie 	}
1578e2a1dd12SYuanhan Liu 	vdev->vid = vid;
1579d19533e8SHuawei Xie 
1580a68ba8e0SCheng Jiang 	for (i = 0; i < RTE_MAX_LCORE; i++) {
158153d3f477SJiayu Hu 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1582a68ba8e0SCheng Jiang 			= rte_zmalloc("vhost bufftable",
1583a68ba8e0SCheng Jiang 				sizeof(struct vhost_bufftable),
1584a68ba8e0SCheng Jiang 				RTE_CACHE_LINE_SIZE);
1585a68ba8e0SCheng Jiang 
158653d3f477SJiayu Hu 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1587a68ba8e0SCheng Jiang 			RTE_LOG(INFO, VHOST_DATA,
1588a68ba8e0SCheng Jiang 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1589a68ba8e0SCheng Jiang 			return -1;
1590a68ba8e0SCheng Jiang 		}
1591a68ba8e0SCheng Jiang 	}
1592a68ba8e0SCheng Jiang 
1593ca059fa5SYuanhan Liu 	if (builtin_net_driver)
1594ca059fa5SYuanhan Liu 		vs_vhost_net_setup(vdev);
1595ca059fa5SYuanhan Liu 
159697daf19eSYuanhan Liu 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1597e2a1dd12SYuanhan Liu 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1598d19533e8SHuawei Xie 
1599d19533e8SHuawei Xie 	/*reset ready flag*/
1600e571e6b4SHuawei Xie 	vdev->ready = DEVICE_MAC_LEARNING;
1601e571e6b4SHuawei Xie 	vdev->remove = 0;
1602d19533e8SHuawei Xie 
1603d19533e8SHuawei Xie 	/* Find a suitable lcore to add the device. */
1604cb056611SStephen Hemminger 	RTE_LCORE_FOREACH_WORKER(lcore) {
160545657a5cSYuanhan Liu 		if (lcore_info[lcore].device_num < device_num_min) {
160645657a5cSYuanhan Liu 			device_num_min = lcore_info[lcore].device_num;
1607d19533e8SHuawei Xie 			core_add = lcore;
1608d19533e8SHuawei Xie 		}
1609d19533e8SHuawei Xie 	}
1610e571e6b4SHuawei Xie 	vdev->coreid = core_add;
1611e571e6b4SHuawei Xie 
161297daf19eSYuanhan Liu 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
161397daf19eSYuanhan Liu 			  lcore_vdev_entry);
161445657a5cSYuanhan Liu 	lcore_info[vdev->coreid].device_num++;
1615d19533e8SHuawei Xie 
1616d19533e8SHuawei Xie 	/* Disable notifications. */
16174ecf22e3SYuanhan Liu 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
16184ecf22e3SYuanhan Liu 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1619d19533e8SHuawei Xie 
1620c08a3490SYuanhan Liu 	RTE_LOG(INFO, VHOST_DATA,
1621c08a3490SYuanhan Liu 		"(%d) device has been added to data core %d\n",
1622e2a1dd12SYuanhan Liu 		vid, vdev->coreid);
1623d19533e8SHuawei Xie 
162453d3f477SJiayu Hu 	if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
162553d3f477SJiayu Hu 		int ret;
1626a68ba8e0SCheng Jiang 
162753d3f477SJiayu Hu 		ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
162853d3f477SJiayu Hu 		if (ret == 0)
162953d3f477SJiayu Hu 			dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
163053d3f477SJiayu Hu 		return ret;
16316e9a9d2aSCheng Jiang 	}
1632abec60e7SCheng Jiang 
1633d19533e8SHuawei Xie 	return 0;
1634d19533e8SHuawei Xie }
1635d19533e8SHuawei Xie 
/*
 * vhost vring-state callback: when the RX queue is disabled, drain all
 * in-flight async DMA copies so no descriptors remain outstanding while
 * the ring is stopped. Returns 0 on success, -1 if @vid is unknown.
 */
static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	/* Look up the vhost_dev wrapping this vid. */
	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	/* Only the RX queue can have an async channel to drain. */
	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (dma_bind[vid].dmas[queue_id].async_enabled) {
		if (!enable) {
			uint16_t n_pkt = 0;
			int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
			/* NOTE(review): VLA sized by pkts_inflight — zero-length
			 * if nothing is in flight, which C forbids; confirm. */
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight, dma_id, 0);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}
1668b9f23beeSCheng Jiang 
1669d19533e8SHuawei Xie /*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
1672d19533e8SHuawei Xie  */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,			/* device ready: assign to a data core */
	.destroy_device = destroy_device,		/* device gone: drain and release */
	.vring_state_changed = vring_state_changed,	/* ring enabled/disabled */
};
1679d19533e8SHuawei Xie 
1680d19533e8SHuawei Xie /*
 * This is a thread that wakes up periodically to print statistics if the user
 * has enabled them.
1683d19533e8SHuawei Xie  */
1684fa204854SOlivier Matz static void *
1685fa204854SOlivier Matz print_stats(__rte_unused void *arg)
1686d19533e8SHuawei Xie {
168745657a5cSYuanhan Liu 	struct vhost_dev *vdev;
1688d19533e8SHuawei Xie 	uint64_t tx_dropped, rx_dropped;
1689d19533e8SHuawei Xie 	uint64_t tx, tx_total, rx, rx_total;
1690d19533e8SHuawei Xie 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1691d19533e8SHuawei Xie 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1692d19533e8SHuawei Xie 
1693d19533e8SHuawei Xie 	while(1) {
1694d19533e8SHuawei Xie 		sleep(enable_stats);
1695d19533e8SHuawei Xie 
1696d19533e8SHuawei Xie 		/* Clear screen and move to top left */
169756fe86f8SYuanhan Liu 		printf("%s%s\n", clr, top_left);
169856fe86f8SYuanhan Liu 		printf("Device statistics =================================\n");
1699d19533e8SHuawei Xie 
170097daf19eSYuanhan Liu 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
170156fe86f8SYuanhan Liu 			tx_total   = vdev->stats.tx_total;
170256fe86f8SYuanhan Liu 			tx         = vdev->stats.tx;
1703d19533e8SHuawei Xie 			tx_dropped = tx_total - tx;
170456fe86f8SYuanhan Liu 
1705a68ba8e0SCheng Jiang 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1706a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1707a68ba8e0SCheng Jiang 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1708a68ba8e0SCheng Jiang 				__ATOMIC_SEQ_CST);
1709d19533e8SHuawei Xie 			rx_dropped = rx_total - rx;
1710d19533e8SHuawei Xie 
1711c08a3490SYuanhan Liu 			printf("Statistics for device %d\n"
171256fe86f8SYuanhan Liu 				"-----------------------\n"
171356fe86f8SYuanhan Liu 				"TX total:              %" PRIu64 "\n"
171456fe86f8SYuanhan Liu 				"TX dropped:            %" PRIu64 "\n"
171556fe86f8SYuanhan Liu 				"TX successful:         %" PRIu64 "\n"
171656fe86f8SYuanhan Liu 				"RX total:              %" PRIu64 "\n"
171756fe86f8SYuanhan Liu 				"RX dropped:            %" PRIu64 "\n"
171856fe86f8SYuanhan Liu 				"RX successful:         %" PRIu64 "\n",
17194ecf22e3SYuanhan Liu 				vdev->vid,
172056fe86f8SYuanhan Liu 				tx_total, tx_dropped, tx,
172156fe86f8SYuanhan Liu 				rx_total, rx_dropped, rx);
1722d19533e8SHuawei Xie 		}
172356fe86f8SYuanhan Liu 
172456fe86f8SYuanhan Liu 		printf("===================================================\n");
17253ee6f706SGeorgiy Levashov 
17263ee6f706SGeorgiy Levashov 		fflush(stdout);
1727d19533e8SHuawei Xie 	}
1728fa204854SOlivier Matz 
1729fa204854SOlivier Matz 	return NULL;
1730d19533e8SHuawei Xie }
1731d19533e8SHuawei Xie 
1732ad0eef4dSJiayu Hu static void
1733ad0eef4dSJiayu Hu unregister_drivers(int socket_num)
1734ad0eef4dSJiayu Hu {
1735ad0eef4dSJiayu Hu 	int i, ret;
1736ad0eef4dSJiayu Hu 
1737ad0eef4dSJiayu Hu 	for (i = 0; i < socket_num; i++) {
1738ad0eef4dSJiayu Hu 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1739ad0eef4dSJiayu Hu 		if (ret != 0)
1740ad0eef4dSJiayu Hu 			RTE_LOG(ERR, VHOST_CONFIG,
1741ad0eef4dSJiayu Hu 				"Fail to unregister vhost driver for %s.\n",
1742ad0eef4dSJiayu Hu 				socket_files + i * PATH_MAX);
1743ad0eef4dSJiayu Hu 	}
1744ad0eef4dSJiayu Hu }
1745ad0eef4dSJiayu Hu 
1746c83d2d00SOuyang Changchun /* When we receive a INT signal, unregister vhost driver */
1747c83d2d00SOuyang Changchun static void
1748c83d2d00SOuyang Changchun sigint_handler(__rte_unused int signum)
1749c83d2d00SOuyang Changchun {
1750c83d2d00SOuyang Changchun 	/* Unregister vhost driver. */
1751ad0eef4dSJiayu Hu 	unregister_drivers(nb_sockets);
1752ad0eef4dSJiayu Hu 
1753c83d2d00SOuyang Changchun 	exit(0);
1754c83d2d00SOuyang Changchun }
1755d19533e8SHuawei Xie 
175653d3f477SJiayu Hu static void
175753d3f477SJiayu Hu reset_dma(void)
175853d3f477SJiayu Hu {
175953d3f477SJiayu Hu 	int i;
176053d3f477SJiayu Hu 
176153d3f477SJiayu Hu 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
176253d3f477SJiayu Hu 		int j;
176353d3f477SJiayu Hu 
176453d3f477SJiayu Hu 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
176553d3f477SJiayu Hu 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
176653d3f477SJiayu Hu 			dma_bind[i].dmas[j].async_enabled = false;
176753d3f477SJiayu Hu 		}
176853d3f477SJiayu Hu 	}
176953d3f477SJiayu Hu 
177053d3f477SJiayu Hu 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
177153d3f477SJiayu Hu 		dmas_id[i] = INVALID_DMA_ID;
177253d3f477SJiayu Hu }
177353d3f477SJiayu Hu 
1774bdb19b77SYuanhan Liu /*
1775164a601bSYuanhan Liu  * Main function, does initialisation and calls the per-lcore functions.
1776d19533e8SHuawei Xie  */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	/* SIGINT unregisters every vhost socket before exiting. */
	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	/*
	 * Initialize per-lcore device lists and record the enabled lcores;
	 * lcore_ids[] maps a worker index to its lcore id.
	 */
	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE,"Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Configure every DMA device parsed from the command line. */
	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		/* Async datapath requires the ASYNC_COPY flag on the socket. */
		if (dma_count)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			/* Roll back the sockets registered so far. */
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		/* Apply the feature knobs parsed from the command line. */
		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	/* Workers loop forever; block here until the process is terminated. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}
1947d19533e8SHuawei Xie }
1948