xref: /dpdk/examples/vhost/main.c (revision d19533e86f1db2c3b4b944ab40171386ea076d54)
1*d19533e8SHuawei Xie /*-
2*d19533e8SHuawei Xie  *   BSD LICENSE
3*d19533e8SHuawei Xie  *
4*d19533e8SHuawei Xie  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5*d19533e8SHuawei Xie  *   All rights reserved.
6*d19533e8SHuawei Xie  *
7*d19533e8SHuawei Xie  *   Redistribution and use in source and binary forms, with or without
8*d19533e8SHuawei Xie  *   modification, are permitted provided that the following conditions
9*d19533e8SHuawei Xie  *   are met:
10*d19533e8SHuawei Xie  *
11*d19533e8SHuawei Xie  *     * Redistributions of source code must retain the above copyright
12*d19533e8SHuawei Xie  *       notice, this list of conditions and the following disclaimer.
13*d19533e8SHuawei Xie  *     * Redistributions in binary form must reproduce the above copyright
14*d19533e8SHuawei Xie  *       notice, this list of conditions and the following disclaimer in
15*d19533e8SHuawei Xie  *       the documentation and/or other materials provided with the
16*d19533e8SHuawei Xie  *       distribution.
17*d19533e8SHuawei Xie  *     * Neither the name of Intel Corporation nor the names of its
18*d19533e8SHuawei Xie  *       contributors may be used to endorse or promote products derived
19*d19533e8SHuawei Xie  *       from this software without specific prior written permission.
20*d19533e8SHuawei Xie  *
21*d19533e8SHuawei Xie  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22*d19533e8SHuawei Xie  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23*d19533e8SHuawei Xie  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24*d19533e8SHuawei Xie  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25*d19533e8SHuawei Xie  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26*d19533e8SHuawei Xie  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27*d19533e8SHuawei Xie  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28*d19533e8SHuawei Xie  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29*d19533e8SHuawei Xie  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30*d19533e8SHuawei Xie  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31*d19533e8SHuawei Xie  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32*d19533e8SHuawei Xie  */
33*d19533e8SHuawei Xie 
34*d19533e8SHuawei Xie #include <arpa/inet.h>
35*d19533e8SHuawei Xie #include <getopt.h>
36*d19533e8SHuawei Xie #include <linux/if_ether.h>
37*d19533e8SHuawei Xie #include <linux/if_vlan.h>
38*d19533e8SHuawei Xie #include <linux/virtio_net.h>
39*d19533e8SHuawei Xie #include <linux/virtio_ring.h>
40*d19533e8SHuawei Xie #include <signal.h>
41*d19533e8SHuawei Xie #include <stdint.h>
42*d19533e8SHuawei Xie #include <sys/eventfd.h>
43*d19533e8SHuawei Xie #include <sys/param.h>
44*d19533e8SHuawei Xie #include <unistd.h>
45*d19533e8SHuawei Xie 
46*d19533e8SHuawei Xie #include <rte_atomic.h>
47*d19533e8SHuawei Xie #include <rte_cycles.h>
48*d19533e8SHuawei Xie #include <rte_ethdev.h>
49*d19533e8SHuawei Xie #include <rte_log.h>
50*d19533e8SHuawei Xie #include <rte_string_fns.h>
51*d19533e8SHuawei Xie #include <rte_malloc.h>
52*d19533e8SHuawei Xie 
53*d19533e8SHuawei Xie #include "main.h"
54*d19533e8SHuawei Xie #include "virtio-net.h"
55*d19533e8SHuawei Xie #include "vhost-net-cdev.h"
56*d19533e8SHuawei Xie 
57*d19533e8SHuawei Xie #define MAX_QUEUES 128
58*d19533e8SHuawei Xie 
59*d19533e8SHuawei Xie /* the maximum number of external ports supported */
60*d19533e8SHuawei Xie #define MAX_SUP_PORTS 1
61*d19533e8SHuawei Xie 
62*d19533e8SHuawei Xie /*
63*d19533e8SHuawei Xie  * Calculate the number of buffers needed per port
64*d19533e8SHuawei Xie  */
65*d19533e8SHuawei Xie #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
66*d19533e8SHuawei Xie 							(num_switching_cores*MAX_PKT_BURST) +  			\
67*d19533e8SHuawei Xie 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68*d19533e8SHuawei Xie 							(num_switching_cores*MBUF_CACHE_SIZE))
69*d19533e8SHuawei Xie 
70*d19533e8SHuawei Xie #define MBUF_CACHE_SIZE 128
71*d19533e8SHuawei Xie #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72*d19533e8SHuawei Xie 
73*d19533e8SHuawei Xie /*
74*d19533e8SHuawei Xie  * No frame data buffers allocated from the host are required for the zero
75*d19533e8SHuawei Xie  * copy implementation; the guest allocates the frame data buffers and vhost
76*d19533e8SHuawei Xie  * uses them directly.
77*d19533e8SHuawei Xie  */
78*d19533e8SHuawei Xie #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79*d19533e8SHuawei Xie #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80*d19533e8SHuawei Xie 	+ RTE_PKTMBUF_HEADROOM)
81*d19533e8SHuawei Xie #define MBUF_CACHE_SIZE_ZCP 0
82*d19533e8SHuawei Xie 
83*d19533e8SHuawei Xie /*
84*d19533e8SHuawei Xie  * RX and TX Prefetch, Host, and Write-back threshold values should be
85*d19533e8SHuawei Xie  * carefully set for optimal performance. Consult the network
86*d19533e8SHuawei Xie  * controller's datasheet and supporting DPDK documentation for guidance
87*d19533e8SHuawei Xie  * on how these parameters should be set.
88*d19533e8SHuawei Xie  */
89*d19533e8SHuawei Xie #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90*d19533e8SHuawei Xie #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91*d19533e8SHuawei Xie #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92*d19533e8SHuawei Xie 
93*d19533e8SHuawei Xie /*
94*d19533e8SHuawei Xie  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95*d19533e8SHuawei Xie  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96*d19533e8SHuawei Xie  * network controllers and/or network drivers.
97*d19533e8SHuawei Xie  */
98*d19533e8SHuawei Xie #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99*d19533e8SHuawei Xie #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100*d19533e8SHuawei Xie #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101*d19533e8SHuawei Xie 
102*d19533e8SHuawei Xie #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
103*d19533e8SHuawei Xie #define MAX_MRG_PKT_BURST 16 	/* Max burst for merge buffers. Set to 1 due to performance issue. */
104*d19533e8SHuawei Xie #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
105*d19533e8SHuawei Xie 
106*d19533e8SHuawei Xie #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
107*d19533e8SHuawei Xie #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
108*d19533e8SHuawei Xie 
109*d19533e8SHuawei Xie #define JUMBO_FRAME_MAX_SIZE    0x2600
110*d19533e8SHuawei Xie 
111*d19533e8SHuawei Xie /* State of virtio device. */
112*d19533e8SHuawei Xie #define DEVICE_MAC_LEARNING 0
113*d19533e8SHuawei Xie #define DEVICE_RX			1
114*d19533e8SHuawei Xie #define DEVICE_SAFE_REMOVE	2
115*d19533e8SHuawei Xie 
116*d19533e8SHuawei Xie /* Config_core_flag status definitions. */
117*d19533e8SHuawei Xie #define REQUEST_DEV_REMOVAL 1
118*d19533e8SHuawei Xie #define ACK_DEV_REMOVAL 0
119*d19533e8SHuawei Xie 
120*d19533e8SHuawei Xie /* Configurable number of RX/TX ring descriptors */
121*d19533e8SHuawei Xie #define RTE_TEST_RX_DESC_DEFAULT 1024
122*d19533e8SHuawei Xie #define RTE_TEST_TX_DESC_DEFAULT 512
123*d19533e8SHuawei Xie 
124*d19533e8SHuawei Xie /*
125*d19533e8SHuawei Xie  * These two macros need refining for the legacy and DPDK-based front ends:
126*d19533e8SHuawei Xie  * take the max vring avail descriptors/entries from the guest, subtract
127*d19533e8SHuawei Xie  * MAX_PKT_BURST, and then adjust to a power of 2.
128*d19533e8SHuawei Xie  */
129*d19533e8SHuawei Xie /*
130*d19533e8SHuawei Xie  * For the legacy front end, 128 descriptors:
131*d19533e8SHuawei Xie  * half for the virtio header, the other half for the mbuf.
132*d19533e8SHuawei Xie  */
133*d19533e8SHuawei Xie #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
134*d19533e8SHuawei Xie #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
135*d19533e8SHuawei Xie 
136*d19533e8SHuawei Xie /* Get first 4 bytes in mbuf headroom. */
137*d19533e8SHuawei Xie #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138*d19533e8SHuawei Xie 		+ sizeof(struct rte_mbuf)))
139*d19533e8SHuawei Xie 
140*d19533e8SHuawei Xie /* true if x is a power of 2 */
141*d19533e8SHuawei Xie #define POWEROF2(x) ((((x)-1) & (x)) == 0)
142*d19533e8SHuawei Xie 
143*d19533e8SHuawei Xie #define INVALID_PORT_ID 0xFF
144*d19533e8SHuawei Xie 
145*d19533e8SHuawei Xie /* Max number of devices. Limited by vmdq. */
146*d19533e8SHuawei Xie #define MAX_DEVICES 64
147*d19533e8SHuawei Xie 
148*d19533e8SHuawei Xie /* Size of buffers used for snprintfs. */
149*d19533e8SHuawei Xie #define MAX_PRINT_BUFF 6072
150*d19533e8SHuawei Xie 
151*d19533e8SHuawei Xie /* Maximum character device basename size. */
152*d19533e8SHuawei Xie #define MAX_BASENAME_SZ 10
153*d19533e8SHuawei Xie 
154*d19533e8SHuawei Xie /* Maximum long option length for option parsing. */
155*d19533e8SHuawei Xie #define MAX_LONG_OPT_SZ 64
156*d19533e8SHuawei Xie 
157*d19533e8SHuawei Xie /* Used to compare MAC addresses. */
158*d19533e8SHuawei Xie #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
159*d19533e8SHuawei Xie 
160*d19533e8SHuawei Xie /* Number of descriptors per cacheline. */
161*d19533e8SHuawei Xie #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
162*d19533e8SHuawei Xie 
163*d19533e8SHuawei Xie /* mask of enabled ports */
164*d19533e8SHuawei Xie static uint32_t enabled_port_mask = 0;
165*d19533e8SHuawei Xie 
166*d19533e8SHuawei Xie /* Number of switching cores enabled */
167*d19533e8SHuawei Xie static uint32_t num_switching_cores = 0;
168*d19533e8SHuawei Xie 
169*d19533e8SHuawei Xie /* Number of devices/queues to support */
170*d19533e8SHuawei Xie static uint32_t num_queues = 0;
171*d19533e8SHuawei Xie uint32_t num_devices = 0;
172*d19533e8SHuawei Xie 
173*d19533e8SHuawei Xie /*
174*d19533e8SHuawei Xie  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
175*d19533e8SHuawei Xie  * descriptors. Disabled by default.
176*d19533e8SHuawei Xie  */
177*d19533e8SHuawei Xie static uint32_t zero_copy;
178*d19533e8SHuawei Xie 
179*d19533e8SHuawei Xie /* Number of descriptors to use */
180*d19533e8SHuawei Xie static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181*d19533e8SHuawei Xie static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182*d19533e8SHuawei Xie 
183*d19533e8SHuawei Xie /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
184*d19533e8SHuawei Xie #define MAX_RING_DESC 4096
185*d19533e8SHuawei Xie 
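/*
 * Per-queue mempool, mbuf return ring and buffer size; the ring and
 * buf_size are used by the zero copy path.
 */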
186*d19533e8SHuawei Xie struct vpool {
187*d19533e8SHuawei Xie 	struct rte_mempool *pool;
188*d19533e8SHuawei Xie 	struct rte_ring *ring;
189*d19533e8SHuawei Xie 	uint32_t buf_size;
190*d19533e8SHuawei Xie } vpool_array[MAX_QUEUES+MAX_QUEUES];
191*d19533e8SHuawei Xie 
192*d19533e8SHuawei Xie /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
193*d19533e8SHuawei Xie typedef enum {
194*d19533e8SHuawei Xie 	VM2VM_DISABLED = 0,
195*d19533e8SHuawei Xie 	VM2VM_SOFTWARE = 1,
196*d19533e8SHuawei Xie 	VM2VM_HARDWARE = 2,
197*d19533e8SHuawei Xie 	VM2VM_LAST
198*d19533e8SHuawei Xie } vm2vm_type;
199*d19533e8SHuawei Xie static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200*d19533e8SHuawei Xie 
201*d19533e8SHuawei Xie /* The type of host physical address translated from guest physical address. */
202*d19533e8SHuawei Xie typedef enum {
203*d19533e8SHuawei Xie 	PHYS_ADDR_CONTINUOUS = 0,
204*d19533e8SHuawei Xie 	PHYS_ADDR_CROSS_SUBREG = 1,
205*d19533e8SHuawei Xie 	PHYS_ADDR_INVALID = 2,
206*d19533e8SHuawei Xie 	PHYS_ADDR_LAST
207*d19533e8SHuawei Xie } hpa_type;
208*d19533e8SHuawei Xie 
209*d19533e8SHuawei Xie /* Enable stats. */
210*d19533e8SHuawei Xie static uint32_t enable_stats = 0;
211*d19533e8SHuawei Xie /* Enable retries on RX. */
212*d19533e8SHuawei Xie static uint32_t enable_retry = 1;
213*d19533e8SHuawei Xie /* Specify the timeout (in microseconds) between retries on RX. */
214*d19533e8SHuawei Xie static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215*d19533e8SHuawei Xie /* Specify the number of retries on RX. */
216*d19533e8SHuawei Xie static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217*d19533e8SHuawei Xie 
218*d19533e8SHuawei Xie /* Character device basename. Can be set by user. */
219*d19533e8SHuawei Xie static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220*d19533e8SHuawei Xie 
221*d19533e8SHuawei Xie /* Character device index. Can be set by user. */
222*d19533e8SHuawei Xie static uint32_t dev_index = 0;
223*d19533e8SHuawei Xie 
224*d19533e8SHuawei Xie /* This can be set by the user so it is made available here. */
225*d19533e8SHuawei Xie extern uint64_t VHOST_FEATURES;
226*d19533e8SHuawei Xie 
227*d19533e8SHuawei Xie /* Default configuration for rx and tx thresholds etc. */
228*d19533e8SHuawei Xie static struct rte_eth_rxconf rx_conf_default = {
229*d19533e8SHuawei Xie 	.rx_thresh = {
230*d19533e8SHuawei Xie 		.pthresh = RX_PTHRESH,
231*d19533e8SHuawei Xie 		.hthresh = RX_HTHRESH,
232*d19533e8SHuawei Xie 		.wthresh = RX_WTHRESH,
233*d19533e8SHuawei Xie 	},
234*d19533e8SHuawei Xie 	.rx_drop_en = 1,
235*d19533e8SHuawei Xie };
236*d19533e8SHuawei Xie 
237*d19533e8SHuawei Xie /*
238*d19533e8SHuawei Xie  * These default values are optimized for use with the Intel(R) 82599 10 GbE
239*d19533e8SHuawei Xie  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240*d19533e8SHuawei Xie  * network controllers and/or network drivers.
241*d19533e8SHuawei Xie  */
242*d19533e8SHuawei Xie static struct rte_eth_txconf tx_conf_default = {
243*d19533e8SHuawei Xie 	.tx_thresh = {
244*d19533e8SHuawei Xie 		.pthresh = TX_PTHRESH,
245*d19533e8SHuawei Xie 		.hthresh = TX_HTHRESH,
246*d19533e8SHuawei Xie 		.wthresh = TX_WTHRESH,
247*d19533e8SHuawei Xie 	},
248*d19533e8SHuawei Xie 	.tx_free_thresh = 0, /* Use PMD default values */
249*d19533e8SHuawei Xie 	.tx_rs_thresh = 0, /* Use PMD default values */
250*d19533e8SHuawei Xie };
251*d19533e8SHuawei Xie 
252*d19533e8SHuawei Xie /* Empty VMDQ configuration structure. Filled in programmatically. */
253*d19533e8SHuawei Xie static struct rte_eth_conf vmdq_conf_default = {
254*d19533e8SHuawei Xie 	.rxmode = {
255*d19533e8SHuawei Xie 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
256*d19533e8SHuawei Xie 		.split_hdr_size = 0,
257*d19533e8SHuawei Xie 		.header_split   = 0, /**< Header Split disabled */
258*d19533e8SHuawei Xie 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
259*d19533e8SHuawei Xie 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
260*d19533e8SHuawei Xie 		/*
261*d19533e8SHuawei Xie 		 * VLAN stripping is necessary for 1G NICs such as the I350;
262*d19533e8SHuawei Xie 		 * it fixes a bug where IPv4 forwarding in the guest cannot
263*d19533e8SHuawei Xie 		 * forward packets from one virtio dev to another virtio dev.
264*d19533e8SHuawei Xie 		 */
265*d19533e8SHuawei Xie 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
266*d19533e8SHuawei Xie 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
267*d19533e8SHuawei Xie 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
268*d19533e8SHuawei Xie 	},
269*d19533e8SHuawei Xie 
270*d19533e8SHuawei Xie 	.txmode = {
271*d19533e8SHuawei Xie 		.mq_mode = ETH_MQ_TX_NONE,
272*d19533e8SHuawei Xie 	},
273*d19533e8SHuawei Xie 	.rx_adv_conf = {
274*d19533e8SHuawei Xie 		/*
275*d19533e8SHuawei Xie 		 * should be overridden separately in code with
276*d19533e8SHuawei Xie 		 * appropriate values
277*d19533e8SHuawei Xie 		 */
278*d19533e8SHuawei Xie 		.vmdq_rx_conf = {
279*d19533e8SHuawei Xie 			.nb_queue_pools = ETH_8_POOLS,
280*d19533e8SHuawei Xie 			.enable_default_pool = 0,
281*d19533e8SHuawei Xie 			.default_pool = 0,
282*d19533e8SHuawei Xie 			.nb_pool_maps = 0,
283*d19533e8SHuawei Xie 			.pool_map = {{0, 0},},
284*d19533e8SHuawei Xie 		},
285*d19533e8SHuawei Xie 	},
286*d19533e8SHuawei Xie };
287*d19533e8SHuawei Xie 
288*d19533e8SHuawei Xie static unsigned lcore_ids[RTE_MAX_LCORE];
289*d19533e8SHuawei Xie static uint8_t ports[RTE_MAX_ETHPORTS];
290*d19533e8SHuawei Xie static unsigned num_ports = 0; /**< The number of ports specified in command line */
291*d19533e8SHuawei Xie 
292*d19533e8SHuawei Xie static const uint16_t external_pkt_default_vlan_tag = 2000;
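/*
 * VLAN tags assigned one per VMDQ pool (i.e. one per virtio device) by
 * get_eth_conf().
 */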
293*d19533e8SHuawei Xie const uint16_t vlan_tags[] = {
294*d19533e8SHuawei Xie 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295*d19533e8SHuawei Xie 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
296*d19533e8SHuawei Xie 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297*d19533e8SHuawei Xie 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298*d19533e8SHuawei Xie 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299*d19533e8SHuawei Xie 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300*d19533e8SHuawei Xie 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301*d19533e8SHuawei Xie 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302*d19533e8SHuawei Xie };
303*d19533e8SHuawei Xie 
304*d19533e8SHuawei Xie /* ethernet addresses of ports */
305*d19533e8SHuawei Xie static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306*d19533e8SHuawei Xie 
307*d19533e8SHuawei Xie /* heads for the main used and free linked lists for the data path. */
308*d19533e8SHuawei Xie static struct virtio_net_data_ll *ll_root_used = NULL;
309*d19533e8SHuawei Xie static struct virtio_net_data_ll *ll_root_free = NULL;
310*d19533e8SHuawei Xie 
311*d19533e8SHuawei Xie /* Array of data core structures containing information on individual core linked lists. */
312*d19533e8SHuawei Xie static struct lcore_info lcore_info[RTE_MAX_LCORE];
313*d19533e8SHuawei Xie 
314*d19533e8SHuawei Xie /* Used for queueing bursts of TX packets. */
315*d19533e8SHuawei Xie struct mbuf_table {
316*d19533e8SHuawei Xie 	unsigned len;
317*d19533e8SHuawei Xie 	unsigned txq_id;
318*d19533e8SHuawei Xie 	struct rte_mbuf *m_table[MAX_PKT_BURST];
319*d19533e8SHuawei Xie };
320*d19533e8SHuawei Xie 
321*d19533e8SHuawei Xie /* TX queue for each data core. */
322*d19533e8SHuawei Xie struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323*d19533e8SHuawei Xie 
324*d19533e8SHuawei Xie /* TX queue for each virtio device for zero copy. */
325*d19533e8SHuawei Xie struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326*d19533e8SHuawei Xie 
327*d19533e8SHuawei Xie /* Vlan header struct used to insert vlan tags on TX. */
328*d19533e8SHuawei Xie struct vlan_ethhdr {
329*d19533e8SHuawei Xie 	unsigned char   h_dest[ETH_ALEN];
330*d19533e8SHuawei Xie 	unsigned char   h_source[ETH_ALEN];
331*d19533e8SHuawei Xie 	__be16          h_vlan_proto;
332*d19533e8SHuawei Xie 	__be16          h_vlan_TCI;
333*d19533e8SHuawei Xie 	__be16          h_vlan_encapsulated_proto;
334*d19533e8SHuawei Xie };
335*d19533e8SHuawei Xie 
336*d19533e8SHuawei Xie /* IPv4 Header */
337*d19533e8SHuawei Xie struct ipv4_hdr {
338*d19533e8SHuawei Xie 	uint8_t  version_ihl;		/**< version and header length */
339*d19533e8SHuawei Xie 	uint8_t  type_of_service;	/**< type of service */
340*d19533e8SHuawei Xie 	uint16_t total_length;		/**< length of packet */
341*d19533e8SHuawei Xie 	uint16_t packet_id;		/**< packet ID */
342*d19533e8SHuawei Xie 	uint16_t fragment_offset;	/**< fragmentation offset */
343*d19533e8SHuawei Xie 	uint8_t  time_to_live;		/**< time to live */
344*d19533e8SHuawei Xie 	uint8_t  next_proto_id;		/**< protocol ID */
345*d19533e8SHuawei Xie 	uint16_t hdr_checksum;		/**< header checksum */
346*d19533e8SHuawei Xie 	uint32_t src_addr;		/**< source address */
347*d19533e8SHuawei Xie 	uint32_t dst_addr;		/**< destination address */
348*d19533e8SHuawei Xie } __attribute__((__packed__));
349*d19533e8SHuawei Xie 
350*d19533e8SHuawei Xie /* Header lengths. */
351*d19533e8SHuawei Xie #define VLAN_HLEN       4
352*d19533e8SHuawei Xie #define VLAN_ETH_HLEN   18
353*d19533e8SHuawei Xie 
354*d19533e8SHuawei Xie /* Per-device statistics struct */
355*d19533e8SHuawei Xie struct device_statistics {
356*d19533e8SHuawei Xie 	uint64_t tx_total;
357*d19533e8SHuawei Xie 	rte_atomic64_t rx_total_atomic;
358*d19533e8SHuawei Xie 	uint64_t rx_total;
359*d19533e8SHuawei Xie 	uint64_t tx;
360*d19533e8SHuawei Xie 	rte_atomic64_t rx_atomic;
361*d19533e8SHuawei Xie 	uint64_t rx;
362*d19533e8SHuawei Xie } __rte_cache_aligned;
363*d19533e8SHuawei Xie struct device_statistics dev_statistics[MAX_DEVICES];
364*d19533e8SHuawei Xie 
365*d19533e8SHuawei Xie /*
366*d19533e8SHuawei Xie  * Builds up the correct configuration for VMDQ VLAN pool map
367*d19533e8SHuawei Xie  * according to the pool & queue limits.
368*d19533e8SHuawei Xie  */
369*d19533e8SHuawei Xie static inline int
370*d19533e8SHuawei Xie get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371*d19533e8SHuawei Xie {
372*d19533e8SHuawei Xie 	struct rte_eth_vmdq_rx_conf conf;
373*d19533e8SHuawei Xie 	unsigned i;
374*d19533e8SHuawei Xie 
375*d19533e8SHuawei Xie 	memset(&conf, 0, sizeof(conf));
376*d19533e8SHuawei Xie 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377*d19533e8SHuawei Xie 	conf.nb_pool_maps = num_devices;
378*d19533e8SHuawei Xie 	conf.enable_loop_back =
379*d19533e8SHuawei Xie 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
380*d19533e8SHuawei Xie 
381*d19533e8SHuawei Xie 	for (i = 0; i < conf.nb_pool_maps; i++) {
382*d19533e8SHuawei Xie 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
383*d19533e8SHuawei Xie 		conf.pool_map[i].pools = (1UL << i);
384*d19533e8SHuawei Xie 	}
385*d19533e8SHuawei Xie 
386*d19533e8SHuawei Xie 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387*d19533e8SHuawei Xie 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388*d19533e8SHuawei Xie 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
389*d19533e8SHuawei Xie 	return 0;
390*d19533e8SHuawei Xie }
391*d19533e8SHuawei Xie 
392*d19533e8SHuawei Xie /*
393*d19533e8SHuawei Xie  * Validate the device number against the max pool number obtained from
394*d19533e8SHuawei Xie  * dev_info. If the device number is invalid, print an error message and
395*d19533e8SHuawei Xie  * return -1. Each device must have its own pool.
396*d19533e8SHuawei Xie  */
397*d19533e8SHuawei Xie static inline int
398*d19533e8SHuawei Xie validate_num_devices(uint32_t max_nb_devices)
399*d19533e8SHuawei Xie {
400*d19533e8SHuawei Xie 	if (num_devices > max_nb_devices) {
401*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402*d19533e8SHuawei Xie 		return -1;
403*d19533e8SHuawei Xie 	}
404*d19533e8SHuawei Xie 	return 0;
405*d19533e8SHuawei Xie }
406*d19533e8SHuawei Xie 
407*d19533e8SHuawei Xie /*
408*d19533e8SHuawei Xie  * Initialises a given port using global settings, with the RX buffers
409*d19533e8SHuawei Xie  * coming from the per-queue mempools in vpool_array.
410*d19533e8SHuawei Xie  */
411*d19533e8SHuawei Xie static inline int
412*d19533e8SHuawei Xie port_init(uint8_t port)
413*d19533e8SHuawei Xie {
414*d19533e8SHuawei Xie 	struct rte_eth_dev_info dev_info;
415*d19533e8SHuawei Xie 	struct rte_eth_conf port_conf;
416*d19533e8SHuawei Xie 	uint16_t rx_rings, tx_rings;
417*d19533e8SHuawei Xie 	uint16_t rx_ring_size, tx_ring_size;
418*d19533e8SHuawei Xie 	int retval;
419*d19533e8SHuawei Xie 	uint16_t q;
420*d19533e8SHuawei Xie 
421*d19533e8SHuawei Xie 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422*d19533e8SHuawei Xie 	rte_eth_dev_info_get (port, &dev_info);
423*d19533e8SHuawei Xie 
424*d19533e8SHuawei Xie 	/* Configure the number of supported virtio devices based on VMDQ limits */
425*d19533e8SHuawei Xie 	num_devices = dev_info.max_vmdq_pools;
426*d19533e8SHuawei Xie 	num_queues = dev_info.max_rx_queues;
427*d19533e8SHuawei Xie 
428*d19533e8SHuawei Xie 	if (zero_copy) {
429*d19533e8SHuawei Xie 		rx_ring_size = num_rx_descriptor;
430*d19533e8SHuawei Xie 		tx_ring_size = num_tx_descriptor;
431*d19533e8SHuawei Xie 		tx_rings = dev_info.max_tx_queues;
432*d19533e8SHuawei Xie 	} else {
433*d19533e8SHuawei Xie 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434*d19533e8SHuawei Xie 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435*d19533e8SHuawei Xie 		tx_rings = (uint16_t)rte_lcore_count();
436*d19533e8SHuawei Xie 	}
437*d19533e8SHuawei Xie 
438*d19533e8SHuawei Xie 	retval = validate_num_devices(MAX_DEVICES);
439*d19533e8SHuawei Xie 	if (retval < 0)
440*d19533e8SHuawei Xie 		return retval;
441*d19533e8SHuawei Xie 
442*d19533e8SHuawei Xie 	/* Get port configuration. */
443*d19533e8SHuawei Xie 	retval = get_eth_conf(&port_conf, num_devices);
444*d19533e8SHuawei Xie 	if (retval < 0)
445*d19533e8SHuawei Xie 		return retval;
446*d19533e8SHuawei Xie 
447*d19533e8SHuawei Xie 	if (port >= rte_eth_dev_count()) return -1;
448*d19533e8SHuawei Xie 
449*d19533e8SHuawei Xie 	rx_rings = (uint16_t)num_queues;
450*d19533e8SHuawei Xie 	/* Configure ethernet device. */
451*d19533e8SHuawei Xie 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452*d19533e8SHuawei Xie 	if (retval != 0)
453*d19533e8SHuawei Xie 		return retval;
454*d19533e8SHuawei Xie 
455*d19533e8SHuawei Xie 	/* Setup the queues. */
456*d19533e8SHuawei Xie 	for (q = 0; q < rx_rings; q ++) {
457*d19533e8SHuawei Xie 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458*d19533e8SHuawei Xie 						rte_eth_dev_socket_id(port), &rx_conf_default,
459*d19533e8SHuawei Xie 						vpool_array[q].pool);
460*d19533e8SHuawei Xie 		if (retval < 0)
461*d19533e8SHuawei Xie 			return retval;
462*d19533e8SHuawei Xie 	}
463*d19533e8SHuawei Xie 	for (q = 0; q < tx_rings; q ++) {
464*d19533e8SHuawei Xie 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465*d19533e8SHuawei Xie 						rte_eth_dev_socket_id(port), &tx_conf_default);
466*d19533e8SHuawei Xie 		if (retval < 0)
467*d19533e8SHuawei Xie 			return retval;
468*d19533e8SHuawei Xie 	}
469*d19533e8SHuawei Xie 
470*d19533e8SHuawei Xie 	/* Start the device. */
471*d19533e8SHuawei Xie 	retval  = rte_eth_dev_start(port);
472*d19533e8SHuawei Xie 	if (retval < 0) {
473*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474*d19533e8SHuawei Xie 		return retval;
475*d19533e8SHuawei Xie 	}
476*d19533e8SHuawei Xie 
477*d19533e8SHuawei Xie 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480*d19533e8SHuawei Xie 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481*d19533e8SHuawei Xie 			(unsigned)port,
482*d19533e8SHuawei Xie 			vmdq_ports_eth_addr[port].addr_bytes[0],
483*d19533e8SHuawei Xie 			vmdq_ports_eth_addr[port].addr_bytes[1],
484*d19533e8SHuawei Xie 			vmdq_ports_eth_addr[port].addr_bytes[2],
485*d19533e8SHuawei Xie 			vmdq_ports_eth_addr[port].addr_bytes[3],
486*d19533e8SHuawei Xie 			vmdq_ports_eth_addr[port].addr_bytes[4],
487*d19533e8SHuawei Xie 			vmdq_ports_eth_addr[port].addr_bytes[5]);
488*d19533e8SHuawei Xie 
489*d19533e8SHuawei Xie 	return 0;
490*d19533e8SHuawei Xie }
491*d19533e8SHuawei Xie 
492*d19533e8SHuawei Xie /*
493*d19533e8SHuawei Xie  * Set character device basename.
494*d19533e8SHuawei Xie  */
495*d19533e8SHuawei Xie static int
496*d19533e8SHuawei Xie us_vhost_parse_basename(const char *q_arg)
497*d19533e8SHuawei Xie {
498*d19533e8SHuawei Xie 	/* Validate the basename length and copy it. */
499*d19533e8SHuawei Xie 
500*d19533e8SHuawei Xie 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501*d19533e8SHuawei Xie 		return -1;
502*d19533e8SHuawei Xie 	else
503*d19533e8SHuawei Xie 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504*d19533e8SHuawei Xie 
505*d19533e8SHuawei Xie 	return 0;
506*d19533e8SHuawei Xie }
507*d19533e8SHuawei Xie 
508*d19533e8SHuawei Xie /*
509*d19533e8SHuawei Xie  * Parse the portmask provided at run time.
510*d19533e8SHuawei Xie  */
511*d19533e8SHuawei Xie static int
512*d19533e8SHuawei Xie parse_portmask(const char *portmask)
513*d19533e8SHuawei Xie {
514*d19533e8SHuawei Xie 	char *end = NULL;
515*d19533e8SHuawei Xie 	unsigned long pm;
516*d19533e8SHuawei Xie 
517*d19533e8SHuawei Xie 	errno = 0;
518*d19533e8SHuawei Xie 
519*d19533e8SHuawei Xie 	/* parse hexadecimal string */
520*d19533e8SHuawei Xie 	pm = strtoul(portmask, &end, 16);
521*d19533e8SHuawei Xie 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522*d19533e8SHuawei Xie 		return -1;
523*d19533e8SHuawei Xie 
524*d19533e8SHuawei Xie 	if (pm == 0)
525*d19533e8SHuawei Xie 		return -1;
526*d19533e8SHuawei Xie 
527*d19533e8SHuawei Xie 	return pm;
528*d19533e8SHuawei Xie 
529*d19533e8SHuawei Xie }
530*d19533e8SHuawei Xie 
531*d19533e8SHuawei Xie /*
532*d19533e8SHuawei Xie  * Parse num options at run time.
533*d19533e8SHuawei Xie  */
534*d19533e8SHuawei Xie static int
535*d19533e8SHuawei Xie parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536*d19533e8SHuawei Xie {
537*d19533e8SHuawei Xie 	char *end = NULL;
538*d19533e8SHuawei Xie 	unsigned long num;
539*d19533e8SHuawei Xie 
540*d19533e8SHuawei Xie 	errno = 0;
541*d19533e8SHuawei Xie 
542*d19533e8SHuawei Xie 	/* parse unsigned int string */
543*d19533e8SHuawei Xie 	num = strtoul(q_arg, &end, 10);
544*d19533e8SHuawei Xie 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545*d19533e8SHuawei Xie 		return -1;
546*d19533e8SHuawei Xie 
547*d19533e8SHuawei Xie 	if (num > max_valid_value)
548*d19533e8SHuawei Xie 		return -1;
549*d19533e8SHuawei Xie 
550*d19533e8SHuawei Xie 	return num;
551*d19533e8SHuawei Xie 
552*d19533e8SHuawei Xie }
553*d19533e8SHuawei Xie 
554*d19533e8SHuawei Xie /*
555*d19533e8SHuawei Xie  * Display usage
556*d19533e8SHuawei Xie  */
557*d19533e8SHuawei Xie static void
558*d19533e8SHuawei Xie us_vhost_usage(const char *prgname)
559*d19533e8SHuawei Xie {
560*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561*d19533e8SHuawei Xie 	"		--vm2vm [0|1|2]\n"
562*d19533e8SHuawei Xie 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563*d19533e8SHuawei Xie 	"		--dev-basename <name> --dev-index [0-N]\n"
564*d19533e8SHuawei Xie 	"		--nb-devices ND\n"
565*d19533e8SHuawei Xie 	"		-p PORTMASK: Set mask for ports to be used by application\n"
566*d19533e8SHuawei Xie 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567*d19533e8SHuawei Xie 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
568*d19533e8SHuawei Xie 	"		--rx-retry-delay [0-N]: timeout (in usec) between retries on RX. This takes effect only if retries on rx are enabled\n"
569*d19533e8SHuawei Xie 	"		--rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
570*d19533e8SHuawei Xie 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571*d19533e8SHuawei Xie 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572*d19533e8SHuawei Xie 	"		--dev-basename: The basename to be used for the character device.\n"
573*d19533e8SHuawei Xie 	"		--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574*d19533e8SHuawei Xie 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
575*d19533e8SHuawei Xie 			"zero copy\n"
576*d19533e8SHuawei Xie 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
577*d19533e8SHuawei Xie 			"used only when zero copy is enabled.\n"
578*d19533e8SHuawei Xie 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
579*d19533e8SHuawei Xie 			"used only when zero copy is enabled.\n",
580*d19533e8SHuawei Xie 	       prgname);
581*d19533e8SHuawei Xie }
582*d19533e8SHuawei Xie 
583*d19533e8SHuawei Xie /*
584*d19533e8SHuawei Xie  * Parse the arguments given in the command line of the application.
585*d19533e8SHuawei Xie  */
586*d19533e8SHuawei Xie static int
587*d19533e8SHuawei Xie us_vhost_parse_args(int argc, char **argv)
588*d19533e8SHuawei Xie {
589*d19533e8SHuawei Xie 	int opt, ret;
590*d19533e8SHuawei Xie 	int option_index;
591*d19533e8SHuawei Xie 	unsigned i;
592*d19533e8SHuawei Xie 	const char *prgname = argv[0];
593*d19533e8SHuawei Xie 	static struct option long_option[] = {
594*d19533e8SHuawei Xie 		{"vm2vm", required_argument, NULL, 0},
595*d19533e8SHuawei Xie 		{"rx-retry", required_argument, NULL, 0},
596*d19533e8SHuawei Xie 		{"rx-retry-delay", required_argument, NULL, 0},
597*d19533e8SHuawei Xie 		{"rx-retry-num", required_argument, NULL, 0},
598*d19533e8SHuawei Xie 		{"mergeable", required_argument, NULL, 0},
599*d19533e8SHuawei Xie 		{"stats", required_argument, NULL, 0},
600*d19533e8SHuawei Xie 		{"dev-basename", required_argument, NULL, 0},
601*d19533e8SHuawei Xie 		{"dev-index", required_argument, NULL, 0},
602*d19533e8SHuawei Xie 		{"zero-copy", required_argument, NULL, 0},
603*d19533e8SHuawei Xie 		{"rx-desc-num", required_argument, NULL, 0},
604*d19533e8SHuawei Xie 		{"tx-desc-num", required_argument, NULL, 0},
605*d19533e8SHuawei Xie 		{NULL, 0, 0, 0},
606*d19533e8SHuawei Xie 	};
607*d19533e8SHuawei Xie 
608*d19533e8SHuawei Xie 	/* Parse command line */
609*d19533e8SHuawei Xie 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
610*d19533e8SHuawei Xie 		switch (opt) {
611*d19533e8SHuawei Xie 		/* Portmask */
612*d19533e8SHuawei Xie 		case 'p':
613*d19533e8SHuawei Xie 			enabled_port_mask = parse_portmask(optarg);
614*d19533e8SHuawei Xie 			if (enabled_port_mask == 0) {
615*d19533e8SHuawei Xie 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616*d19533e8SHuawei Xie 				us_vhost_usage(prgname);
617*d19533e8SHuawei Xie 				return -1;
618*d19533e8SHuawei Xie 			}
619*d19533e8SHuawei Xie 			break;
620*d19533e8SHuawei Xie 
621*d19533e8SHuawei Xie 		case 0:
622*d19533e8SHuawei Xie 			/* Enable/disable vm2vm comms. */
623*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "vm2vm",
624*d19533e8SHuawei Xie 				MAX_LONG_OPT_SZ)) {
625*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
626*d19533e8SHuawei Xie 				if (ret == -1) {
627*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG,
628*d19533e8SHuawei Xie 						"Invalid argument for "
629*d19533e8SHuawei Xie 						"vm2vm [0|1|2]\n");
630*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
631*d19533e8SHuawei Xie 					return -1;
632*d19533e8SHuawei Xie 				} else {
633*d19533e8SHuawei Xie 					vm2vm_mode = (vm2vm_type)ret;
634*d19533e8SHuawei Xie 				}
635*d19533e8SHuawei Xie 			}
636*d19533e8SHuawei Xie 
637*d19533e8SHuawei Xie 			/* Enable/disable retries on RX. */
638*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, 1);
640*d19533e8SHuawei Xie 				if (ret == -1) {
641*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
643*d19533e8SHuawei Xie 					return -1;
644*d19533e8SHuawei Xie 				} else {
645*d19533e8SHuawei Xie 					enable_retry = ret;
646*d19533e8SHuawei Xie 				}
647*d19533e8SHuawei Xie 			}
648*d19533e8SHuawei Xie 
649*d19533e8SHuawei Xie 			/* Specify the retry delay time (in microseconds) on RX. */
650*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, INT32_MAX);
652*d19533e8SHuawei Xie 				if (ret == -1) {
653*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
655*d19533e8SHuawei Xie 					return -1;
656*d19533e8SHuawei Xie 				} else {
657*d19533e8SHuawei Xie 					burst_rx_delay_time = ret;
658*d19533e8SHuawei Xie 				}
659*d19533e8SHuawei Xie 			}
660*d19533e8SHuawei Xie 
661*d19533e8SHuawei Xie 			/* Specify the retries number on RX. */
662*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, INT32_MAX);
664*d19533e8SHuawei Xie 				if (ret == -1) {
665*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
667*d19533e8SHuawei Xie 					return -1;
668*d19533e8SHuawei Xie 				} else {
669*d19533e8SHuawei Xie 					burst_rx_retry_num = ret;
670*d19533e8SHuawei Xie 				}
671*d19533e8SHuawei Xie 			}
672*d19533e8SHuawei Xie 
673*d19533e8SHuawei Xie 			/* Enable/disable RX mergeable buffers. */
674*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, 1);
676*d19533e8SHuawei Xie 				if (ret == -1) {
677*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
679*d19533e8SHuawei Xie 					return -1;
680*d19533e8SHuawei Xie 				} else {
681*d19533e8SHuawei Xie 					if (ret) {
682*d19533e8SHuawei Xie 						vmdq_conf_default.rxmode.jumbo_frame = 1;
683*d19533e8SHuawei Xie 						vmdq_conf_default.rxmode.max_rx_pkt_len
684*d19533e8SHuawei Xie 							= JUMBO_FRAME_MAX_SIZE;
685*d19533e8SHuawei Xie 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
686*d19533e8SHuawei Xie 					}
687*d19533e8SHuawei Xie 				}
688*d19533e8SHuawei Xie 			}
689*d19533e8SHuawei Xie 
690*d19533e8SHuawei Xie 			/* Enable/disable stats. */
691*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, INT32_MAX);
693*d19533e8SHuawei Xie 				if (ret == -1) {
694*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
696*d19533e8SHuawei Xie 					return -1;
697*d19533e8SHuawei Xie 				} else {
698*d19533e8SHuawei Xie 					enable_stats = ret;
699*d19533e8SHuawei Xie 				}
700*d19533e8SHuawei Xie 			}
701*d19533e8SHuawei Xie 
702*d19533e8SHuawei Xie 			/* Set character device basename. */
703*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704*d19533e8SHuawei Xie 				if (us_vhost_parse_basename(optarg) == -1) {
705*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
707*d19533e8SHuawei Xie 					return -1;
708*d19533e8SHuawei Xie 				}
709*d19533e8SHuawei Xie 			}
710*d19533e8SHuawei Xie 
711*d19533e8SHuawei Xie 			/* Set character device index. */
712*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, INT32_MAX);
714*d19533e8SHuawei Xie 				if (ret == -1) {
715*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
717*d19533e8SHuawei Xie 					return -1;
718*d19533e8SHuawei Xie 				} else
719*d19533e8SHuawei Xie 					dev_index = ret;
720*d19533e8SHuawei Xie 			}
721*d19533e8SHuawei Xie 
722*d19533e8SHuawei Xie 			/* Enable/disable rx/tx zero copy. */
723*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name,
724*d19533e8SHuawei Xie 				"zero-copy", MAX_LONG_OPT_SZ)) {
725*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, 1);
726*d19533e8SHuawei Xie 				if (ret == -1) {
727*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG,
728*d19533e8SHuawei Xie 						"Invalid argument"
729*d19533e8SHuawei Xie 						" for zero-copy [0|1]\n");
730*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
731*d19533e8SHuawei Xie 					return -1;
732*d19533e8SHuawei Xie 				} else
733*d19533e8SHuawei Xie 					zero_copy = ret;
734*d19533e8SHuawei Xie 
735*d19533e8SHuawei Xie 				if (zero_copy) {
736*d19533e8SHuawei Xie #ifdef RTE_MBUF_REFCNT
737*d19533e8SHuawei Xie 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738*d19533e8SHuawei Xie 					"zero copy vhost APP, please "
739*d19533e8SHuawei Xie 					"disable RTE_MBUF_REFCNT\n"
740*d19533e8SHuawei Xie 					"in config file and then rebuild DPDK "
741*d19533e8SHuawei Xie 					"core lib!\n"
742*d19533e8SHuawei Xie 					"Otherwise please disable zero copy "
743*d19533e8SHuawei Xie 					"flag in command line!\n");
744*d19533e8SHuawei Xie 					return -1;
745*d19533e8SHuawei Xie #endif
746*d19533e8SHuawei Xie 				}
747*d19533e8SHuawei Xie 			}
748*d19533e8SHuawei Xie 
749*d19533e8SHuawei Xie 			/* Specify the descriptor number on RX. */
750*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name,
751*d19533e8SHuawei Xie 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
752*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, MAX_RING_DESC);
753*d19533e8SHuawei Xie 				if ((ret == -1) || (!POWEROF2(ret))) {
754*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG,
755*d19533e8SHuawei Xie 					"Invalid argument for rx-desc-num [0-N],"
756*d19533e8SHuawei Xie 					" power of 2 required.\n");
757*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
758*d19533e8SHuawei Xie 					return -1;
759*d19533e8SHuawei Xie 				} else {
760*d19533e8SHuawei Xie 					num_rx_descriptor = ret;
761*d19533e8SHuawei Xie 				}
762*d19533e8SHuawei Xie 			}
763*d19533e8SHuawei Xie 
764*d19533e8SHuawei Xie 			/* Specify the descriptor number on TX. */
765*d19533e8SHuawei Xie 			if (!strncmp(long_option[option_index].name,
766*d19533e8SHuawei Xie 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
767*d19533e8SHuawei Xie 				ret = parse_num_opt(optarg, MAX_RING_DESC);
768*d19533e8SHuawei Xie 				if ((ret == -1) || (!POWEROF2(ret))) {
769*d19533e8SHuawei Xie 					RTE_LOG(INFO, VHOST_CONFIG,
770*d19533e8SHuawei Xie 					"Invalid argument for tx-desc-num [0-N],"
771*d19533e8SHuawei Xie 					" power of 2 required.\n");
772*d19533e8SHuawei Xie 					us_vhost_usage(prgname);
773*d19533e8SHuawei Xie 					return -1;
774*d19533e8SHuawei Xie 				} else {
775*d19533e8SHuawei Xie 					num_tx_descriptor = ret;
776*d19533e8SHuawei Xie 				}
777*d19533e8SHuawei Xie 			}
778*d19533e8SHuawei Xie 
779*d19533e8SHuawei Xie 			break;
780*d19533e8SHuawei Xie 
781*d19533e8SHuawei Xie 			/* Invalid option - print options. */
782*d19533e8SHuawei Xie 		default:
783*d19533e8SHuawei Xie 			us_vhost_usage(prgname);
784*d19533e8SHuawei Xie 			return -1;
785*d19533e8SHuawei Xie 		}
786*d19533e8SHuawei Xie 	}
787*d19533e8SHuawei Xie 
788*d19533e8SHuawei Xie 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789*d19533e8SHuawei Xie 		if (enabled_port_mask & (1 << i))
790*d19533e8SHuawei Xie 			ports[num_ports++] = (uint8_t)i;
791*d19533e8SHuawei Xie 	}
792*d19533e8SHuawei Xie 
793*d19533e8SHuawei Xie 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
794*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
795*d19533e8SHuawei Xie 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
796*d19533e8SHuawei Xie 		return -1;
797*d19533e8SHuawei Xie 	}
798*d19533e8SHuawei Xie 
799*d19533e8SHuawei Xie 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT,
801*d19533e8SHuawei Xie 			"Vhost zero copy doesn't support software vm2vm,"
802*d19533e8SHuawei Xie 			" please specify 'vm2vm 2' to use hardware vm2vm.\n");
803*d19533e8SHuawei Xie 		return -1;
804*d19533e8SHuawei Xie 	}
805*d19533e8SHuawei Xie 
806*d19533e8SHuawei Xie 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT,
808*d19533e8SHuawei Xie 			"Vhost zero copy doesn't support jumbo frame,"
809*d19533e8SHuawei Xie 			" please specify '--mergeable 0' to disable the "
810*d19533e8SHuawei Xie 			"mergeable feature.\n");
811*d19533e8SHuawei Xie 		return -1;
812*d19533e8SHuawei Xie 	}
813*d19533e8SHuawei Xie 
814*d19533e8SHuawei Xie 	return 0;
815*d19533e8SHuawei Xie }
816*d19533e8SHuawei Xie 
817*d19533e8SHuawei Xie /*
818*d19533e8SHuawei Xie  * Update the global variable num_ports and the ports array according to the
819*d19533e8SHuawei Xie  * number of system ports, and return the number of valid ports.
820*d19533e8SHuawei Xie  */
821*d19533e8SHuawei Xie static unsigned check_ports_num(unsigned nb_ports)
822*d19533e8SHuawei Xie {
823*d19533e8SHuawei Xie 	unsigned valid_num_ports = num_ports;
824*d19533e8SHuawei Xie 	unsigned portid;
825*d19533e8SHuawei Xie 
826*d19533e8SHuawei Xie 	if (num_ports > nb_ports) {
827*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828*d19533e8SHuawei Xie 			num_ports, nb_ports);
829*d19533e8SHuawei Xie 		num_ports = nb_ports;
830*d19533e8SHuawei Xie 	}
831*d19533e8SHuawei Xie 
832*d19533e8SHuawei Xie 	for (portid = 0; portid < num_ports; portid ++) {
833*d19533e8SHuawei Xie 		if (ports[portid] >= nb_ports) {
834*d19533e8SHuawei Xie 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835*d19533e8SHuawei Xie 				ports[portid], (nb_ports - 1));
836*d19533e8SHuawei Xie 			ports[portid] = INVALID_PORT_ID;
837*d19533e8SHuawei Xie 			valid_num_ports--;
838*d19533e8SHuawei Xie 		}
839*d19533e8SHuawei Xie 	}
840*d19533e8SHuawei Xie 	return valid_num_ports;
841*d19533e8SHuawei Xie }
842*d19533e8SHuawei Xie 
843*d19533e8SHuawei Xie /*
844*d19533e8SHuawei Xie  * Macro to print out packet contents. Wrapped in debug define so that the
845*d19533e8SHuawei Xie  * data path is not affected when debug is disabled.
846*d19533e8SHuawei Xie  */
847*d19533e8SHuawei Xie #ifdef DEBUG
848*d19533e8SHuawei Xie #define PRINT_PACKET(device, addr, size, header) do {																\
849*d19533e8SHuawei Xie 	char *pkt_addr = (char*)(addr);																					\
850*d19533e8SHuawei Xie 	unsigned int index;																								\
851*d19533e8SHuawei Xie 	char packet[MAX_PRINT_BUFF];																					\
852*d19533e8SHuawei Xie 																													\
853*d19533e8SHuawei Xie 	if ((header))																									\
854*d19533e8SHuawei Xie 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
855*d19533e8SHuawei Xie 	else																											\
856*d19533e8SHuawei Xie 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
857*d19533e8SHuawei Xie 	for (index = 0; index < (size); index++) {																		\
858*d19533e8SHuawei Xie 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
859*d19533e8SHuawei Xie 			"%02hhx ", pkt_addr[index]);																			\
860*d19533e8SHuawei Xie 	}																												\
861*d19533e8SHuawei Xie 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
862*d19533e8SHuawei Xie 																													\
863*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
864*d19533e8SHuawei Xie } while(0)
865*d19533e8SHuawei Xie #else
866*d19533e8SHuawei Xie #define PRINT_PACKET(device, addr, size, header) do{} while(0)
867*d19533e8SHuawei Xie #endif
868*d19533e8SHuawei Xie 
869*d19533e8SHuawei Xie /*
870*d19533e8SHuawei Xie  * Function to convert guest physical addresses to vhost virtual addresses. This
871*d19533e8SHuawei Xie  * is used to convert virtio buffer addresses.
872*d19533e8SHuawei Xie  */
873*d19533e8SHuawei Xie static inline uint64_t __attribute__((always_inline))
874*d19533e8SHuawei Xie gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
875*d19533e8SHuawei Xie {
876*d19533e8SHuawei Xie 	struct virtio_memory_regions *region;
877*d19533e8SHuawei Xie 	uint32_t regionidx;
878*d19533e8SHuawei Xie 	uint64_t vhost_va = 0;
879*d19533e8SHuawei Xie 
880*d19533e8SHuawei Xie 	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
881*d19533e8SHuawei Xie 		region = &dev->mem->regions[regionidx];
882*d19533e8SHuawei Xie 		if ((guest_pa >= region->guest_phys_address) &&
883*d19533e8SHuawei Xie 			(guest_pa <= region->guest_phys_address_end)) {
884*d19533e8SHuawei Xie 			vhost_va = region->address_offset + guest_pa;
885*d19533e8SHuawei Xie 			break;
886*d19533e8SHuawei Xie 		}
887*d19533e8SHuawei Xie 	}
888*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n",
889*d19533e8SHuawei Xie 		dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
890*d19533e8SHuawei Xie 
891*d19533e8SHuawei Xie 	return vhost_va;
892*d19533e8SHuawei Xie }
893*d19533e8SHuawei Xie 
894*d19533e8SHuawei Xie /*
895*d19533e8SHuawei Xie  * Function to convert guest physical addresses to vhost physical addresses.
896*d19533e8SHuawei Xie  * This is used to convert virtio buffer addresses.
897*d19533e8SHuawei Xie  */
898*d19533e8SHuawei Xie static inline uint64_t __attribute__((always_inline))
899*d19533e8SHuawei Xie gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
900*d19533e8SHuawei Xie 	uint32_t buf_len, hpa_type *addr_type)
901*d19533e8SHuawei Xie {
902*d19533e8SHuawei Xie 	struct virtio_memory_regions_hpa *region;
903*d19533e8SHuawei Xie 	uint32_t regionidx;
904*d19533e8SHuawei Xie 	uint64_t vhost_pa = 0;
905*d19533e8SHuawei Xie 
906*d19533e8SHuawei Xie 	*addr_type = PHYS_ADDR_INVALID;
907*d19533e8SHuawei Xie 
908*d19533e8SHuawei Xie 	for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
909*d19533e8SHuawei Xie 		region = &dev->mem->regions_hpa[regionidx];
910*d19533e8SHuawei Xie 		if ((guest_pa >= region->guest_phys_address) &&
911*d19533e8SHuawei Xie 			(guest_pa <= region->guest_phys_address_end)) {
912*d19533e8SHuawei Xie 			vhost_pa = region->host_phys_addr_offset + guest_pa;
913*d19533e8SHuawei Xie 			if (likely((guest_pa + buf_len - 1)
914*d19533e8SHuawei Xie 				<= region->guest_phys_address_end))
915*d19533e8SHuawei Xie 				*addr_type = PHYS_ADDR_CONTINUOUS;
916*d19533e8SHuawei Xie 			else
917*d19533e8SHuawei Xie 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
918*d19533e8SHuawei Xie 			break;
919*d19533e8SHuawei Xie 		}
920*d19533e8SHuawei Xie 	}
921*d19533e8SHuawei Xie 
922*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
923*d19533e8SHuawei Xie 		dev->device_fh, (void *)(uintptr_t)guest_pa,
924*d19533e8SHuawei Xie 		(void *)(uintptr_t)vhost_pa);
925*d19533e8SHuawei Xie 
926*d19533e8SHuawei Xie 	return vhost_pa;
927*d19533e8SHuawei Xie }
928*d19533e8SHuawei Xie 
929*d19533e8SHuawei Xie /*
930*d19533e8SHuawei Xie  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
931*d19533e8SHuawei Xie  * be received from the physical port or from another virtio device. A packet
932*d19533e8SHuawei Xie  * count is returned to indicate the number of packets that were successfully
933*d19533e8SHuawei Xie  * added to the RX queue. This function works when mergeable is disabled.
934*d19533e8SHuawei Xie  */
935*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
936*d19533e8SHuawei Xie virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
937*d19533e8SHuawei Xie {
938*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
939*d19533e8SHuawei Xie 	struct vring_desc *desc;
940*d19533e8SHuawei Xie 	struct rte_mbuf *buff;
941*d19533e8SHuawei Xie 	/* The virtio_hdr is initialised to 0. */
942*d19533e8SHuawei Xie 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
943*d19533e8SHuawei Xie 	uint64_t buff_addr = 0;
944*d19533e8SHuawei Xie 	uint64_t buff_hdr_addr = 0;
945*d19533e8SHuawei Xie 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
946*d19533e8SHuawei Xie 	uint32_t head_idx, packet_success = 0;
947*d19533e8SHuawei Xie 	uint32_t retry = 0;
948*d19533e8SHuawei Xie 	uint16_t avail_idx, res_cur_idx;
949*d19533e8SHuawei Xie 	uint16_t res_base_idx, res_end_idx;
950*d19533e8SHuawei Xie 	uint16_t free_entries;
951*d19533e8SHuawei Xie 	uint8_t success = 0;
952*d19533e8SHuawei Xie 
953*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
954*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_RXQ];
955*d19533e8SHuawei Xie 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
956*d19533e8SHuawei Xie 
957*d19533e8SHuawei Xie 	/* Since multiple data cores may want access to the available buffers, they need to be reserved. */
958*d19533e8SHuawei Xie 	do {
959*d19533e8SHuawei Xie 		res_base_idx = vq->last_used_idx_res;
960*d19533e8SHuawei Xie 		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
961*d19533e8SHuawei Xie 
962*d19533e8SHuawei Xie 		free_entries = (avail_idx - res_base_idx);
963*d19533e8SHuawei Xie 		/* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
964*d19533e8SHuawei Xie 		if (enable_retry && unlikely(count > free_entries)) {
965*d19533e8SHuawei Xie 			for (retry = 0; retry < burst_rx_retry_num; retry++) {
966*d19533e8SHuawei Xie 				rte_delay_us(burst_rx_delay_time);
967*d19533e8SHuawei Xie 				avail_idx =
968*d19533e8SHuawei Xie 					*((volatile uint16_t *)&vq->avail->idx);
969*d19533e8SHuawei Xie 				free_entries = (avail_idx - res_base_idx);
970*d19533e8SHuawei Xie 				if (count <= free_entries)
971*d19533e8SHuawei Xie 					break;
972*d19533e8SHuawei Xie 			}
973*d19533e8SHuawei Xie 		}
974*d19533e8SHuawei Xie 
975*d19533e8SHuawei Xie 		/* Check that we have enough buffers. */
976*d19533e8SHuawei Xie 		if (unlikely(count > free_entries))
977*d19533e8SHuawei Xie 			count = free_entries;
978*d19533e8SHuawei Xie 
979*d19533e8SHuawei Xie 		if (count == 0)
980*d19533e8SHuawei Xie 			return 0;
981*d19533e8SHuawei Xie 
982*d19533e8SHuawei Xie 		res_end_idx = res_base_idx + count;
983*d19533e8SHuawei Xie 		/* vq->last_used_idx_res is atomically updated. */
984*d19533e8SHuawei Xie 		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
985*d19533e8SHuawei Xie 									res_end_idx);
986*d19533e8SHuawei Xie 	} while (unlikely(success == 0));
987*d19533e8SHuawei Xie 	res_cur_idx = res_base_idx;
988*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
989*d19533e8SHuawei Xie 
990*d19533e8SHuawei Xie 	/* Prefetch available ring to retrieve indexes. */
991*d19533e8SHuawei Xie 	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
992*d19533e8SHuawei Xie 
993*d19533e8SHuawei Xie 	/* Retrieve all of the head indexes first to avoid caching issues. */
994*d19533e8SHuawei Xie 	for (head_idx = 0; head_idx < count; head_idx++)
995*d19533e8SHuawei Xie 		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
996*d19533e8SHuawei Xie 
997*d19533e8SHuawei Xie 	/* Prefetch descriptor index. */
998*d19533e8SHuawei Xie 	rte_prefetch0(&vq->desc[head[packet_success]]);
999*d19533e8SHuawei Xie 
1000*d19533e8SHuawei Xie 	while (res_cur_idx != res_end_idx) {
1001*d19533e8SHuawei Xie 		/* Get descriptor from available ring */
1002*d19533e8SHuawei Xie 		desc = &vq->desc[head[packet_success]];
1003*d19533e8SHuawei Xie 
1004*d19533e8SHuawei Xie 		buff = pkts[packet_success];
1005*d19533e8SHuawei Xie 
1006*d19533e8SHuawei Xie 		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
1007*d19533e8SHuawei Xie 		buff_addr = gpa_to_vva(dev, desc->addr);
1008*d19533e8SHuawei Xie 		/* Prefetch buffer address. */
1009*d19533e8SHuawei Xie 		rte_prefetch0((void*)(uintptr_t)buff_addr);
1010*d19533e8SHuawei Xie 
1011*d19533e8SHuawei Xie 		/* Copy virtio_hdr to packet and increment buffer address */
1012*d19533e8SHuawei Xie 		buff_hdr_addr = buff_addr;
1013*d19533e8SHuawei Xie 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1014*d19533e8SHuawei Xie 
1015*d19533e8SHuawei Xie 		/*
1016*d19533e8SHuawei Xie 		 * If the descriptors are chained the header and data are
1017*d19533e8SHuawei Xie 		 * placed in separate buffers.
1018*d19533e8SHuawei Xie 		 */
1019*d19533e8SHuawei Xie 		if (desc->flags & VRING_DESC_F_NEXT) {
1020*d19533e8SHuawei Xie 			desc->len = vq->vhost_hlen;
1021*d19533e8SHuawei Xie 			desc = &vq->desc[desc->next];
1022*d19533e8SHuawei Xie 			/* Buffer address translation. */
1023*d19533e8SHuawei Xie 			buff_addr = gpa_to_vva(dev, desc->addr);
1024*d19533e8SHuawei Xie 			desc->len = rte_pktmbuf_data_len(buff);
1025*d19533e8SHuawei Xie 		} else {
1026*d19533e8SHuawei Xie 			buff_addr += vq->vhost_hlen;
1027*d19533e8SHuawei Xie 			desc->len = packet_len;
1028*d19533e8SHuawei Xie 		}
1029*d19533e8SHuawei Xie 
1030*d19533e8SHuawei Xie 		/* Update used ring with desc information */
1031*d19533e8SHuawei Xie 		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
1032*d19533e8SHuawei Xie 		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
1033*d19533e8SHuawei Xie 
1034*d19533e8SHuawei Xie 		/* Copy mbuf data to buffer */
1035*d19533e8SHuawei Xie 		rte_memcpy((void *)(uintptr_t)buff_addr,
1036*d19533e8SHuawei Xie 			rte_pktmbuf_mtod(buff, const void *),
1037*d19533e8SHuawei Xie 			rte_pktmbuf_data_len(buff));
1038*d19533e8SHuawei Xie 		PRINT_PACKET(dev, (uintptr_t)buff_addr,
1039*d19533e8SHuawei Xie 			rte_pktmbuf_data_len(buff), 0);
1040*d19533e8SHuawei Xie 
1041*d19533e8SHuawei Xie 		res_cur_idx++;
1042*d19533e8SHuawei Xie 		packet_success++;
1043*d19533e8SHuawei Xie 
1044*d19533e8SHuawei Xie 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1045*d19533e8SHuawei Xie 			(const void *)&virtio_hdr, vq->vhost_hlen);
1046*d19533e8SHuawei Xie 
1047*d19533e8SHuawei Xie 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1048*d19533e8SHuawei Xie 
1049*d19533e8SHuawei Xie 		if (res_cur_idx < res_end_idx) {
1050*d19533e8SHuawei Xie 			/* Prefetch descriptor index. */
1051*d19533e8SHuawei Xie 			rte_prefetch0(&vq->desc[head[packet_success]]);
1052*d19533e8SHuawei Xie 		}
1053*d19533e8SHuawei Xie 	}
1054*d19533e8SHuawei Xie 
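	/*
	 * Prevent the compiler from reordering the buffer and used ring
	 * writes above past the used ring index update below.
	 */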
1055*d19533e8SHuawei Xie 	rte_compiler_barrier();
1056*d19533e8SHuawei Xie 
1057*d19533e8SHuawei Xie 	/* Wait until it's our turn to add our buffer to the used ring. */
1058*d19533e8SHuawei Xie 	while (unlikely(vq->last_used_idx != res_base_idx))
1059*d19533e8SHuawei Xie 		rte_pause();
1060*d19533e8SHuawei Xie 
1061*d19533e8SHuawei Xie 	*(volatile uint16_t *)&vq->used->idx += count;
1062*d19533e8SHuawei Xie 	vq->last_used_idx = res_end_idx;
1063*d19533e8SHuawei Xie 
1064*d19533e8SHuawei Xie 	/* Kick the guest if necessary. */
1065*d19533e8SHuawei Xie 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1066*d19533e8SHuawei Xie 		eventfd_write((int)vq->kickfd, 1);
1067*d19533e8SHuawei Xie 	return count;
1068*d19533e8SHuawei Xie }
1069*d19533e8SHuawei Xie 
1070*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
1071*d19533e8SHuawei Xie copy_from_mbuf_to_vring(struct virtio_net *dev,
1072*d19533e8SHuawei Xie 	uint16_t res_base_idx, uint16_t res_end_idx,
1073*d19533e8SHuawei Xie 	struct rte_mbuf *pkt)
1074*d19533e8SHuawei Xie {
1075*d19533e8SHuawei Xie 	uint32_t vec_idx = 0;
1076*d19533e8SHuawei Xie 	uint32_t entry_success = 0;
1077*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
1078*d19533e8SHuawei Xie 	/* The virtio_hdr is initialised to 0. */
1079*d19533e8SHuawei Xie 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
1080*d19533e8SHuawei Xie 		{0, 0, 0, 0, 0, 0}, 0};
1081*d19533e8SHuawei Xie 	uint16_t cur_idx = res_base_idx;
1082*d19533e8SHuawei Xie 	uint64_t vb_addr = 0;
1083*d19533e8SHuawei Xie 	uint64_t vb_hdr_addr = 0;
1084*d19533e8SHuawei Xie 	uint32_t seg_offset = 0;
1085*d19533e8SHuawei Xie 	uint32_t vb_offset = 0;
1086*d19533e8SHuawei Xie 	uint32_t seg_avail;
1087*d19533e8SHuawei Xie 	uint32_t vb_avail;
1088*d19533e8SHuawei Xie 	uint32_t cpy_len, entry_len;
1089*d19533e8SHuawei Xie 
1090*d19533e8SHuawei Xie 	if (pkt == NULL)
1091*d19533e8SHuawei Xie 		return 0;
1092*d19533e8SHuawei Xie 
1093*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
1094*d19533e8SHuawei Xie 		"End Index %d\n",
1095*d19533e8SHuawei Xie 		dev->device_fh, cur_idx, res_end_idx);
1096*d19533e8SHuawei Xie 
1097*d19533e8SHuawei Xie 	/*
1098*d19533e8SHuawei Xie 	 * Convert from gpa to vva
1099*d19533e8SHuawei Xie 	 * (guest physical addr -> vhost virtual addr)
1100*d19533e8SHuawei Xie 	 */
1101*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_RXQ];
1102*d19533e8SHuawei Xie 	vb_addr =
1103*d19533e8SHuawei Xie 		gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1104*d19533e8SHuawei Xie 	vb_hdr_addr = vb_addr;
1105*d19533e8SHuawei Xie 
1106*d19533e8SHuawei Xie 	/* Prefetch buffer address. */
1107*d19533e8SHuawei Xie 	rte_prefetch0((void *)(uintptr_t)vb_addr);
1108*d19533e8SHuawei Xie 
1109*d19533e8SHuawei Xie 	virtio_hdr.num_buffers = res_end_idx - res_base_idx;
1110*d19533e8SHuawei Xie 
1111*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
1112*d19533e8SHuawei Xie 		dev->device_fh, virtio_hdr.num_buffers);
1113*d19533e8SHuawei Xie 
1114*d19533e8SHuawei Xie 	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
1115*d19533e8SHuawei Xie 		(const void *)&virtio_hdr, vq->vhost_hlen);
1116*d19533e8SHuawei Xie 
1117*d19533e8SHuawei Xie 	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
1118*d19533e8SHuawei Xie 
1119*d19533e8SHuawei Xie 	seg_avail = rte_pktmbuf_data_len(pkt);
1120*d19533e8SHuawei Xie 	vb_offset = vq->vhost_hlen;
1121*d19533e8SHuawei Xie 	vb_avail =
1122*d19533e8SHuawei Xie 		vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
1123*d19533e8SHuawei Xie 
1124*d19533e8SHuawei Xie 	entry_len = vq->vhost_hlen;
1125*d19533e8SHuawei Xie 
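	/*
	 * If the first buffer only has room for the virtio header, close out
	 * this used ring entry (when its descriptor chain ends here) and move
	 * on to the next buffer in buf_vec.
	 */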
1126*d19533e8SHuawei Xie 	if (vb_avail == 0) {
1127*d19533e8SHuawei Xie 		uint32_t desc_idx =
1128*d19533e8SHuawei Xie 			vq->buf_vec[vec_idx].desc_idx;
1129*d19533e8SHuawei Xie 		vq->desc[desc_idx].len = vq->vhost_hlen;
1130*d19533e8SHuawei Xie 
1131*d19533e8SHuawei Xie 		if ((vq->desc[desc_idx].flags
1132*d19533e8SHuawei Xie 			& VRING_DESC_F_NEXT) == 0) {
1133*d19533e8SHuawei Xie 			/* Update used ring with desc information */
1134*d19533e8SHuawei Xie 			vq->used->ring[cur_idx & (vq->size - 1)].id
1135*d19533e8SHuawei Xie 				= vq->buf_vec[vec_idx].desc_idx;
1136*d19533e8SHuawei Xie 			vq->used->ring[cur_idx & (vq->size - 1)].len
1137*d19533e8SHuawei Xie 				= entry_len;
1138*d19533e8SHuawei Xie 
1139*d19533e8SHuawei Xie 			entry_len = 0;
1140*d19533e8SHuawei Xie 			cur_idx++;
1141*d19533e8SHuawei Xie 			entry_success++;
1142*d19533e8SHuawei Xie 		}
1143*d19533e8SHuawei Xie 
1144*d19533e8SHuawei Xie 		vec_idx++;
1145*d19533e8SHuawei Xie 		vb_addr =
1146*d19533e8SHuawei Xie 			gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1147*d19533e8SHuawei Xie 
1148*d19533e8SHuawei Xie 		/* Prefetch buffer address. */
1149*d19533e8SHuawei Xie 		rte_prefetch0((void *)(uintptr_t)vb_addr);
1150*d19533e8SHuawei Xie 		vb_offset = 0;
1151*d19533e8SHuawei Xie 		vb_avail = vq->buf_vec[vec_idx].buf_len;
1152*d19533e8SHuawei Xie 	}
1153*d19533e8SHuawei Xie 
1154*d19533e8SHuawei Xie 	cpy_len = RTE_MIN(vb_avail, seg_avail);
1155*d19533e8SHuawei Xie 
1156*d19533e8SHuawei Xie 	while (cpy_len > 0) {
1157*d19533e8SHuawei Xie 		/* Copy mbuf data to vring buffer */
1158*d19533e8SHuawei Xie 		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
1159*d19533e8SHuawei Xie 			(const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
1160*d19533e8SHuawei Xie 			cpy_len);
1161*d19533e8SHuawei Xie 
1162*d19533e8SHuawei Xie 		PRINT_PACKET(dev,
1163*d19533e8SHuawei Xie 			(uintptr_t)(vb_addr + vb_offset),
1164*d19533e8SHuawei Xie 			cpy_len, 0);
1165*d19533e8SHuawei Xie 
1166*d19533e8SHuawei Xie 		seg_offset += cpy_len;
1167*d19533e8SHuawei Xie 		vb_offset += cpy_len;
1168*d19533e8SHuawei Xie 		seg_avail -= cpy_len;
1169*d19533e8SHuawei Xie 		vb_avail -= cpy_len;
1170*d19533e8SHuawei Xie 		entry_len += cpy_len;
1171*d19533e8SHuawei Xie 
1172*d19533e8SHuawei Xie 		if (seg_avail != 0) {
1173*d19533e8SHuawei Xie 			/*
1174*d19533e8SHuawei Xie 			 * The virtio buffer in this vring
1175*d19533e8SHuawei Xie 			 * entry has been filled, but the
1176*d19533e8SHuawei Xie 			 * mbuf segment is not finished yet.
1177*d19533e8SHuawei Xie 			 */
1178*d19533e8SHuawei Xie 			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
1179*d19533e8SHuawei Xie 				VRING_DESC_F_NEXT) == 0) {
1180*d19533e8SHuawei Xie 				/* Update used ring with desc information */
1181*d19533e8SHuawei Xie 				vq->used->ring[cur_idx & (vq->size - 1)].id
1182*d19533e8SHuawei Xie 					= vq->buf_vec[vec_idx].desc_idx;
1183*d19533e8SHuawei Xie 				vq->used->ring[cur_idx & (vq->size - 1)].len
1184*d19533e8SHuawei Xie 					= entry_len;
1185*d19533e8SHuawei Xie 				entry_len = 0;
1186*d19533e8SHuawei Xie 				cur_idx++;
1187*d19533e8SHuawei Xie 				entry_success++;
1188*d19533e8SHuawei Xie 			}
1189*d19533e8SHuawei Xie 
1190*d19533e8SHuawei Xie 			vec_idx++;
1191*d19533e8SHuawei Xie 			vb_addr = gpa_to_vva(dev,
1192*d19533e8SHuawei Xie 				vq->buf_vec[vec_idx].buf_addr);
1193*d19533e8SHuawei Xie 			vb_offset = 0;
1194*d19533e8SHuawei Xie 			vb_avail = vq->buf_vec[vec_idx].buf_len;
1195*d19533e8SHuawei Xie 			cpy_len = RTE_MIN(vb_avail, seg_avail);
1196*d19533e8SHuawei Xie 		} else {
1197*d19533e8SHuawei Xie 			/*
1198*d19533e8SHuawei Xie 			 * The current segment is complete; check whether
1199*d19533e8SHuawei Xie 			 * the whole packet has been copied or not.
1200*d19533e8SHuawei Xie 			 */
1201*d19533e8SHuawei Xie 			pkt = pkt->next;
1202*d19533e8SHuawei Xie 			if (pkt != NULL) {
1203*d19533e8SHuawei Xie 				/*
1204*d19533e8SHuawei Xie 				 * There are more segments.
1205*d19533e8SHuawei Xie 				 */
1206*d19533e8SHuawei Xie 				if (vb_avail == 0) {
1207*d19533e8SHuawei Xie 					/*
1208*d19533e8SHuawei Xie 					 * The current buffer from the vring
1209*d19533e8SHuawei Xie 					 * is used up; fetch the next buffer
1210*d19533e8SHuawei Xie 					 * from buf_vec.
1211*d19533e8SHuawei Xie 					 */
1212*d19533e8SHuawei Xie 					uint32_t desc_idx =
1213*d19533e8SHuawei Xie 						vq->buf_vec[vec_idx].desc_idx;
1214*d19533e8SHuawei Xie 					vq->desc[desc_idx].len = vb_offset;
1215*d19533e8SHuawei Xie 
1216*d19533e8SHuawei Xie 					if ((vq->desc[desc_idx].flags &
1217*d19533e8SHuawei Xie 						VRING_DESC_F_NEXT) == 0) {
1218*d19533e8SHuawei Xie 						uint16_t wrapped_idx =
1219*d19533e8SHuawei Xie 							cur_idx & (vq->size - 1);
1220*d19533e8SHuawei Xie 						/*
1221*d19533e8SHuawei Xie 						 * Update used ring with the
1222*d19533e8SHuawei Xie 						 * descriptor information
1223*d19533e8SHuawei Xie 						 */
1224*d19533e8SHuawei Xie 						vq->used->ring[wrapped_idx].id
1225*d19533e8SHuawei Xie 							= desc_idx;
1226*d19533e8SHuawei Xie 						vq->used->ring[wrapped_idx].len
1227*d19533e8SHuawei Xie 							= entry_len;
1228*d19533e8SHuawei Xie 						entry_success++;
1229*d19533e8SHuawei Xie 						entry_len = 0;
1230*d19533e8SHuawei Xie 						cur_idx++;
1231*d19533e8SHuawei Xie 					}
1232*d19533e8SHuawei Xie 
1233*d19533e8SHuawei Xie 					/* Get next buffer from buf_vec. */
1234*d19533e8SHuawei Xie 					vec_idx++;
1235*d19533e8SHuawei Xie 					vb_addr = gpa_to_vva(dev,
1236*d19533e8SHuawei Xie 						vq->buf_vec[vec_idx].buf_addr);
1237*d19533e8SHuawei Xie 					vb_avail =
1238*d19533e8SHuawei Xie 						vq->buf_vec[vec_idx].buf_len;
1239*d19533e8SHuawei Xie 					vb_offset = 0;
1240*d19533e8SHuawei Xie 				}
1241*d19533e8SHuawei Xie 
1242*d19533e8SHuawei Xie 				seg_offset = 0;
1243*d19533e8SHuawei Xie 				seg_avail = rte_pktmbuf_data_len(pkt);
1244*d19533e8SHuawei Xie 				cpy_len = RTE_MIN(vb_avail, seg_avail);
1245*d19533e8SHuawei Xie 			} else {
1246*d19533e8SHuawei Xie 				/*
1247*d19533e8SHuawei Xie 				 * The whole packet is complete.
1248*d19533e8SHuawei Xie 				 */
1249*d19533e8SHuawei Xie 				uint32_t desc_idx =
1250*d19533e8SHuawei Xie 					vq->buf_vec[vec_idx].desc_idx;
1251*d19533e8SHuawei Xie 				vq->desc[desc_idx].len = vb_offset;
1252*d19533e8SHuawei Xie 
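				/*
				 * Zero the length of any remaining chained
				 * descriptors; they carry no data for this
				 * packet.
				 */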
1253*d19533e8SHuawei Xie 				while (vq->desc[desc_idx].flags &
1254*d19533e8SHuawei Xie 					VRING_DESC_F_NEXT) {
1255*d19533e8SHuawei Xie 					desc_idx = vq->desc[desc_idx].next;
1256*d19533e8SHuawei Xie 					vq->desc[desc_idx].len = 0;
1257*d19533e8SHuawei Xie 				}
1258*d19533e8SHuawei Xie 
1259*d19533e8SHuawei Xie 				/* Update used ring with desc information */
1260*d19533e8SHuawei Xie 				vq->used->ring[cur_idx & (vq->size - 1)].id
1261*d19533e8SHuawei Xie 					= vq->buf_vec[vec_idx].desc_idx;
1262*d19533e8SHuawei Xie 				vq->used->ring[cur_idx & (vq->size - 1)].len
1263*d19533e8SHuawei Xie 					= entry_len;
1264*d19533e8SHuawei Xie 				entry_len = 0;
1265*d19533e8SHuawei Xie 				cur_idx++;
1266*d19533e8SHuawei Xie 				entry_success++;
1267*d19533e8SHuawei Xie 				seg_avail = 0;
1268*d19533e8SHuawei Xie 				cpy_len = RTE_MIN(vb_avail, seg_avail);
1269*d19533e8SHuawei Xie 			}
1270*d19533e8SHuawei Xie 		}
1271*d19533e8SHuawei Xie 	}
1272*d19533e8SHuawei Xie 
1273*d19533e8SHuawei Xie 	return entry_success;
1274*d19533e8SHuawei Xie }
1275*d19533e8SHuawei Xie 
1276*d19533e8SHuawei Xie /*
1277*d19533e8SHuawei Xie  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
1278*d19533e8SHuawei Xie  * be received from the physical port or from another virtio device. A packet
1279*d19533e8SHuawei Xie  * count is returned to indicate the number of packets that were successfully
1280*d19533e8SHuawei Xie  * added to the RX queue. This function works for mergeable RX.
1281*d19533e8SHuawei Xie  */
1282*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
1283*d19533e8SHuawei Xie virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
1284*d19533e8SHuawei Xie 	uint32_t count)
1285*d19533e8SHuawei Xie {
1286*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
1287*d19533e8SHuawei Xie 	uint32_t pkt_idx = 0, entry_success = 0;
1288*d19533e8SHuawei Xie 	uint32_t retry = 0;
1289*d19533e8SHuawei Xie 	uint16_t avail_idx, res_cur_idx;
1290*d19533e8SHuawei Xie 	uint16_t res_base_idx, res_end_idx;
1291*d19533e8SHuawei Xie 	uint8_t success = 0;
1292*d19533e8SHuawei Xie 
1293*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
1294*d19533e8SHuawei Xie 		dev->device_fh);
1295*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_RXQ];
1296*d19533e8SHuawei Xie 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1297*d19533e8SHuawei Xie 
1298*d19533e8SHuawei Xie 	if (count == 0)
1299*d19533e8SHuawei Xie 		return 0;
1300*d19533e8SHuawei Xie 
1301*d19533e8SHuawei Xie 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1302*d19533e8SHuawei Xie 		uint32_t secure_len = 0;
1303*d19533e8SHuawei Xie 		uint16_t need_cnt;
1304*d19533e8SHuawei Xie 		uint32_t vec_idx = 0;
1305*d19533e8SHuawei Xie 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
1306*d19533e8SHuawei Xie 		uint16_t i, id;
1307*d19533e8SHuawei Xie 
1308*d19533e8SHuawei Xie 		do {
1309*d19533e8SHuawei Xie 			/*
1310*d19533e8SHuawei Xie 			 * As many data cores may want access to available
1311*d19533e8SHuawei Xie 			 * buffers, they need to be reserved.
1312*d19533e8SHuawei Xie 			 */
1313*d19533e8SHuawei Xie 			res_base_idx = vq->last_used_idx_res;
1314*d19533e8SHuawei Xie 			res_cur_idx = res_base_idx;
1315*d19533e8SHuawei Xie 
1316*d19533e8SHuawei Xie 			do {
1317*d19533e8SHuawei Xie 				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1318*d19533e8SHuawei Xie 				if (unlikely(res_cur_idx == avail_idx)) {
1319*d19533e8SHuawei Xie 					/*
1320*d19533e8SHuawei Xie 					 * If retry is enabled and the queue is
1321*d19533e8SHuawei Xie 					 * full then we wait and retry to avoid
1322*d19533e8SHuawei Xie 					 * packet loss.
1323*d19533e8SHuawei Xie 					 */
1324*d19533e8SHuawei Xie 					if (enable_retry) {
1325*d19533e8SHuawei Xie 						uint8_t cont = 0;
1326*d19533e8SHuawei Xie 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1327*d19533e8SHuawei Xie 							rte_delay_us(burst_rx_delay_time);
1328*d19533e8SHuawei Xie 							avail_idx =
1329*d19533e8SHuawei Xie 								*((volatile uint16_t *)&vq->avail->idx);
1330*d19533e8SHuawei Xie 							if (likely(res_cur_idx != avail_idx)) {
1331*d19533e8SHuawei Xie 								cont = 1;
1332*d19533e8SHuawei Xie 								break;
1333*d19533e8SHuawei Xie 							}
1334*d19533e8SHuawei Xie 						}
1335*d19533e8SHuawei Xie 						if (cont == 1)
1336*d19533e8SHuawei Xie 							continue;
1337*d19533e8SHuawei Xie 					}
1338*d19533e8SHuawei Xie 
1339*d19533e8SHuawei Xie 					LOG_DEBUG(VHOST_DATA,
1340*d19533e8SHuawei Xie 						"(%"PRIu64") Failed "
1341*d19533e8SHuawei Xie 						"to get enough desc from "
1342*d19533e8SHuawei Xie 						"vring\n",
1343*d19533e8SHuawei Xie 						dev->device_fh);
1344*d19533e8SHuawei Xie 					return pkt_idx;
1345*d19533e8SHuawei Xie 				} else {
1346*d19533e8SHuawei Xie 					uint16_t wrapped_idx =
1347*d19533e8SHuawei Xie 						(res_cur_idx) & (vq->size - 1);
1348*d19533e8SHuawei Xie 					uint32_t idx =
1349*d19533e8SHuawei Xie 						vq->avail->ring[wrapped_idx];
1350*d19533e8SHuawei Xie 					uint8_t next_desc;
1351*d19533e8SHuawei Xie 
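					/*
					 * Walk the descriptor chain of this
					 * entry to add up the buffer space it
					 * provides.
					 */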
1352*d19533e8SHuawei Xie 					do {
1353*d19533e8SHuawei Xie 						next_desc = 0;
1354*d19533e8SHuawei Xie 						secure_len += vq->desc[idx].len;
1355*d19533e8SHuawei Xie 						if (vq->desc[idx].flags &
1356*d19533e8SHuawei Xie 							VRING_DESC_F_NEXT) {
1357*d19533e8SHuawei Xie 							idx = vq->desc[idx].next;
1358*d19533e8SHuawei Xie 							next_desc = 1;
1359*d19533e8SHuawei Xie 						}
1360*d19533e8SHuawei Xie 					} while (next_desc);
1361*d19533e8SHuawei Xie 
1362*d19533e8SHuawei Xie 					res_cur_idx++;
1363*d19533e8SHuawei Xie 				}
1364*d19533e8SHuawei Xie 			} while (pkt_len > secure_len);
1365*d19533e8SHuawei Xie 
1366*d19533e8SHuawei Xie 			/* vq->last_used_idx_res is atomically updated. */
1367*d19533e8SHuawei Xie 			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
1368*d19533e8SHuawei Xie 							res_base_idx,
1369*d19533e8SHuawei Xie 							res_cur_idx);
1370*d19533e8SHuawei Xie 		} while (success == 0);
1371*d19533e8SHuawei Xie 
1372*d19533e8SHuawei Xie 		id = res_base_idx;
1373*d19533e8SHuawei Xie 		need_cnt = res_cur_idx - res_base_idx;
1374*d19533e8SHuawei Xie 
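		/*
		 * Record the reserved descriptors in buf_vec so
		 * copy_from_mbuf_to_vring() can fill them.
		 */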
1375*d19533e8SHuawei Xie 		for (i = 0; i < need_cnt; i++, id++) {
1376*d19533e8SHuawei Xie 			uint16_t wrapped_idx = id & (vq->size - 1);
1377*d19533e8SHuawei Xie 			uint32_t idx = vq->avail->ring[wrapped_idx];
1378*d19533e8SHuawei Xie 			uint8_t next_desc;
1379*d19533e8SHuawei Xie 			do {
1380*d19533e8SHuawei Xie 				next_desc = 0;
1381*d19533e8SHuawei Xie 				vq->buf_vec[vec_idx].buf_addr =
1382*d19533e8SHuawei Xie 					vq->desc[idx].addr;
1383*d19533e8SHuawei Xie 				vq->buf_vec[vec_idx].buf_len =
1384*d19533e8SHuawei Xie 					vq->desc[idx].len;
1385*d19533e8SHuawei Xie 				vq->buf_vec[vec_idx].desc_idx = idx;
1386*d19533e8SHuawei Xie 				vec_idx++;
1387*d19533e8SHuawei Xie 
1388*d19533e8SHuawei Xie 				if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
1389*d19533e8SHuawei Xie 					idx = vq->desc[idx].next;
1390*d19533e8SHuawei Xie 					next_desc = 1;
1391*d19533e8SHuawei Xie 				}
1392*d19533e8SHuawei Xie 			} while (next_desc);
1393*d19533e8SHuawei Xie 		}
1394*d19533e8SHuawei Xie 
1395*d19533e8SHuawei Xie 		res_end_idx = res_cur_idx;
1396*d19533e8SHuawei Xie 
1397*d19533e8SHuawei Xie 		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
1398*d19533e8SHuawei Xie 			res_end_idx, pkts[pkt_idx]);
1399*d19533e8SHuawei Xie 
1400*d19533e8SHuawei Xie 		rte_compiler_barrier();
1401*d19533e8SHuawei Xie 
1402*d19533e8SHuawei Xie 		/*
1403*d19533e8SHuawei Xie 		 * Wait until it's our turn to add our buffer
1404*d19533e8SHuawei Xie 		 * to the used ring.
1405*d19533e8SHuawei Xie 		 */
1406*d19533e8SHuawei Xie 		while (unlikely(vq->last_used_idx != res_base_idx))
1407*d19533e8SHuawei Xie 			rte_pause();
1408*d19533e8SHuawei Xie 
1409*d19533e8SHuawei Xie 		*(volatile uint16_t *)&vq->used->idx += entry_success;
1410*d19533e8SHuawei Xie 		vq->last_used_idx = res_end_idx;
1411*d19533e8SHuawei Xie 
1412*d19533e8SHuawei Xie 		/* Kick the guest if necessary. */
1413*d19533e8SHuawei Xie 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1414*d19533e8SHuawei Xie 			eventfd_write((int)vq->kickfd, 1);
1415*d19533e8SHuawei Xie 	}
1416*d19533e8SHuawei Xie 
1417*d19533e8SHuawei Xie 	return count;
1418*d19533e8SHuawei Xie }
1419*d19533e8SHuawei Xie 
1420*d19533e8SHuawei Xie /*
1421*d19533e8SHuawei Xie  * Compares a packet destination MAC address to a device MAC address.
1422*d19533e8SHuawei Xie  */
1423*d19533e8SHuawei Xie static inline int __attribute__((always_inline))
1424*d19533e8SHuawei Xie ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
1425*d19533e8SHuawei Xie {
1426*d19533e8SHuawei Xie 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
1427*d19533e8SHuawei Xie }
1428*d19533e8SHuawei Xie 
1429*d19533e8SHuawei Xie /*
1430*d19533e8SHuawei Xie  * This function learns the MAC address of the device and registers it, along with a
1431*d19533e8SHuawei Xie  * VLAN tag, with a VMDQ pool.
1432*d19533e8SHuawei Xie  */
1433*d19533e8SHuawei Xie static int
1434*d19533e8SHuawei Xie link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
1435*d19533e8SHuawei Xie {
1436*d19533e8SHuawei Xie 	struct ether_hdr *pkt_hdr;
1437*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll;
1438*d19533e8SHuawei Xie 	int i, ret;
1439*d19533e8SHuawei Xie 
1440*d19533e8SHuawei Xie 	/* Learn MAC address of guest device from packet */
1441*d19533e8SHuawei Xie 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1442*d19533e8SHuawei Xie 
1443*d19533e8SHuawei Xie 	dev_ll = ll_root_used;
1444*d19533e8SHuawei Xie 
1445*d19533e8SHuawei Xie 	while (dev_ll != NULL) {
1446*d19533e8SHuawei Xie 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
1447*d19533e8SHuawei Xie 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
1448*d19533e8SHuawei Xie 			return -1;
1449*d19533e8SHuawei Xie 		}
1450*d19533e8SHuawei Xie 		dev_ll = dev_ll->next;
1451*d19533e8SHuawei Xie 	}
1452*d19533e8SHuawei Xie 
1453*d19533e8SHuawei Xie 	for (i = 0; i < ETHER_ADDR_LEN; i++)
1454*d19533e8SHuawei Xie 		dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
1455*d19533e8SHuawei Xie 
1456*d19533e8SHuawei Xie 	/* vlan_tag currently uses the device_id. */
1457*d19533e8SHuawei Xie 	dev->vlan_tag = vlan_tags[dev->device_fh];
1458*d19533e8SHuawei Xie 
1459*d19533e8SHuawei Xie 	/* Print out VMDQ registration info. */
1460*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
1461*d19533e8SHuawei Xie 		dev->device_fh,
1462*d19533e8SHuawei Xie 		dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
1463*d19533e8SHuawei Xie 		dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
1464*d19533e8SHuawei Xie 		dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
1465*d19533e8SHuawei Xie 		dev->vlan_tag);
1466*d19533e8SHuawei Xie 
1467*d19533e8SHuawei Xie 	/* Register the MAC address. */
1468*d19533e8SHuawei Xie 	ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
1469*d19533e8SHuawei Xie 	if (ret)
1470*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
1471*d19533e8SHuawei Xie 					dev->device_fh);
1472*d19533e8SHuawei Xie 
1473*d19533e8SHuawei Xie 	/* Enable stripping of the vlan tag as we handle routing. */
1474*d19533e8SHuawei Xie 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
1475*d19533e8SHuawei Xie 
1476*d19533e8SHuawei Xie 	/* Set device as ready for RX. */
1477*d19533e8SHuawei Xie 	dev->ready = DEVICE_RX;
1478*d19533e8SHuawei Xie 
1479*d19533e8SHuawei Xie 	return 0;
1480*d19533e8SHuawei Xie }
1481*d19533e8SHuawei Xie 
1482*d19533e8SHuawei Xie /*
1483*d19533e8SHuawei Xie  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1484*d19533e8SHuawei Xie  * queue before disabling RX on the device.
1485*d19533e8SHuawei Xie  */
1486*d19533e8SHuawei Xie static inline void
1487*d19533e8SHuawei Xie unlink_vmdq(struct virtio_net *dev)
1488*d19533e8SHuawei Xie {
1489*d19533e8SHuawei Xie 	unsigned i = 0;
1490*d19533e8SHuawei Xie 	unsigned rx_count;
1491*d19533e8SHuawei Xie 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1492*d19533e8SHuawei Xie 
1493*d19533e8SHuawei Xie 	if (dev->ready == DEVICE_RX) {
1494*d19533e8SHuawei Xie 		/*clear MAC and VLAN settings*/
1495*d19533e8SHuawei Xie 		rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
1496*d19533e8SHuawei Xie 		for (i = 0; i < 6; i++)
1497*d19533e8SHuawei Xie 			dev->mac_address.addr_bytes[i] = 0;
1498*d19533e8SHuawei Xie 
1499*d19533e8SHuawei Xie 		dev->vlan_tag = 0;
1500*d19533e8SHuawei Xie 
1501*d19533e8SHuawei Xie 		/*Clear out the receive buffers*/
1502*d19533e8SHuawei Xie 		rx_count = rte_eth_rx_burst(ports[0],
1503*d19533e8SHuawei Xie 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1504*d19533e8SHuawei Xie 
1505*d19533e8SHuawei Xie 		while (rx_count) {
1506*d19533e8SHuawei Xie 			for (i = 0; i < rx_count; i++)
1507*d19533e8SHuawei Xie 				rte_pktmbuf_free(pkts_burst[i]);
1508*d19533e8SHuawei Xie 
1509*d19533e8SHuawei Xie 			rx_count = rte_eth_rx_burst(ports[0],
1510*d19533e8SHuawei Xie 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1511*d19533e8SHuawei Xie 		}
1512*d19533e8SHuawei Xie 
1513*d19533e8SHuawei Xie 		dev->ready = DEVICE_MAC_LEARNING;
1514*d19533e8SHuawei Xie 	}
1515*d19533e8SHuawei Xie }
1516*d19533e8SHuawei Xie 
1517*d19533e8SHuawei Xie /*
1518*d19533e8SHuawei Xie  * Check if the packet destination MAC address is for a local device. If so then put
1519*d19533e8SHuawei Xie  * the packet on that device's RX queue. If not then return.
1520*d19533e8SHuawei Xie  */
1521*d19533e8SHuawei Xie static inline unsigned __attribute__((always_inline))
1522*d19533e8SHuawei Xie virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1523*d19533e8SHuawei Xie {
1524*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll;
1525*d19533e8SHuawei Xie 	struct ether_hdr *pkt_hdr;
1526*d19533e8SHuawei Xie 	uint64_t ret = 0;
1527*d19533e8SHuawei Xie 
1528*d19533e8SHuawei Xie 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1529*d19533e8SHuawei Xie 
1530*d19533e8SHuawei Xie 	/*get the used devices list*/
1531*d19533e8SHuawei Xie 	dev_ll = ll_root_used;
1532*d19533e8SHuawei Xie 
1533*d19533e8SHuawei Xie 	while (dev_ll != NULL) {
1534*d19533e8SHuawei Xie 		if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1535*d19533e8SHuawei Xie 				          &dev_ll->dev->mac_address)) {
1536*d19533e8SHuawei Xie 
1537*d19533e8SHuawei Xie 			/* Drop the packet if the TX packet is destined for the TX device. */
1538*d19533e8SHuawei Xie 			if (dev_ll->dev->device_fh == dev->device_fh) {
1539*d19533e8SHuawei Xie 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1540*d19533e8SHuawei Xie 							dev_ll->dev->device_fh);
1541*d19533e8SHuawei Xie 				return 0;
1542*d19533e8SHuawei Xie 			}
1543*d19533e8SHuawei Xie 
1544*d19533e8SHuawei Xie 
1545*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1546*d19533e8SHuawei Xie 
1547*d19533e8SHuawei Xie 			if (dev_ll->dev->remove) {
1548*d19533e8SHuawei Xie 				/*drop the packet if the device is marked for removal*/
1549*d19533e8SHuawei Xie 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1550*d19533e8SHuawei Xie 			} else {
1551*d19533e8SHuawei Xie 				uint32_t mergeable =
1552*d19533e8SHuawei Xie 					dev_ll->dev->features &
1553*d19533e8SHuawei Xie 					(1 << VIRTIO_NET_F_MRG_RXBUF);
1554*d19533e8SHuawei Xie 
1555*d19533e8SHuawei Xie 				/*send the packet to the local virtio device*/
1556*d19533e8SHuawei Xie 				if (likely(mergeable == 0))
1557*d19533e8SHuawei Xie 					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1558*d19533e8SHuawei Xie 				else
1559*d19533e8SHuawei Xie 					ret = virtio_dev_merge_rx(dev_ll->dev,
1560*d19533e8SHuawei Xie 						&m, 1);
1561*d19533e8SHuawei Xie 
1562*d19533e8SHuawei Xie 				if (enable_stats) {
1563*d19533e8SHuawei Xie 					rte_atomic64_add(
1564*d19533e8SHuawei Xie 					&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1565*d19533e8SHuawei Xie 					1);
1566*d19533e8SHuawei Xie 					rte_atomic64_add(
1567*d19533e8SHuawei Xie 					&dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1568*d19533e8SHuawei Xie 					ret);
1569*d19533e8SHuawei Xie 					dev_statistics[dev->device_fh].tx_total++;
1570*d19533e8SHuawei Xie 					dev_statistics[dev->device_fh].tx += ret;
1571*d19533e8SHuawei Xie 				}
1572*d19533e8SHuawei Xie 			}
1573*d19533e8SHuawei Xie 
1574*d19533e8SHuawei Xie 			return 0;
1575*d19533e8SHuawei Xie 		}
1576*d19533e8SHuawei Xie 		dev_ll = dev_ll->next;
1577*d19533e8SHuawei Xie 	}
1578*d19533e8SHuawei Xie 
1579*d19533e8SHuawei Xie 	return -1;
1580*d19533e8SHuawei Xie }
1581*d19533e8SHuawei Xie 
1582*d19533e8SHuawei Xie /*
1583*d19533e8SHuawei Xie  * This function routes the TX packet to the correct interface. This may be a local device
1584*d19533e8SHuawei Xie  * or the physical port.
1585*d19533e8SHuawei Xie  */
1586*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
1587*d19533e8SHuawei Xie virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1588*d19533e8SHuawei Xie {
1589*d19533e8SHuawei Xie 	struct mbuf_table *tx_q;
1590*d19533e8SHuawei Xie 	struct vlan_ethhdr *vlan_hdr;
1591*d19533e8SHuawei Xie 	struct rte_mbuf **m_table;
1592*d19533e8SHuawei Xie 	struct rte_mbuf *mbuf, *prev;
1593*d19533e8SHuawei Xie 	unsigned len, ret, offset = 0;
1594*d19533e8SHuawei Xie 	const uint16_t lcore_id = rte_lcore_id();
1595*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1596*d19533e8SHuawei Xie 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1597*d19533e8SHuawei Xie 
1598*d19533e8SHuawei Xie 	/*check if destination is local VM*/
1599*d19533e8SHuawei Xie 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1600*d19533e8SHuawei Xie 		return;
1601*d19533e8SHuawei Xie 
1602*d19533e8SHuawei Xie 	if (vm2vm_mode == VM2VM_HARDWARE) {
1603*d19533e8SHuawei Xie 		while (dev_ll != NULL) {
1604*d19533e8SHuawei Xie 			if ((dev_ll->dev->ready == DEVICE_RX)
1605*d19533e8SHuawei Xie 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1606*d19533e8SHuawei Xie 				&dev_ll->dev->mac_address)) {
1607*d19533e8SHuawei Xie 				/*
1608*d19533e8SHuawei Xie 				 * Drop the packet if the TX packet is
1609*d19533e8SHuawei Xie 				 * destined for the TX device.
1610*d19533e8SHuawei Xie 				 */
1611*d19533e8SHuawei Xie 				if (dev_ll->dev->device_fh == dev->device_fh) {
1612*d19533e8SHuawei Xie 					LOG_DEBUG(VHOST_DATA,
1613*d19533e8SHuawei Xie 					"(%"PRIu64") TX: Source and destination"
1614*d19533e8SHuawei Xie 					" MAC addresses are the same. Dropping "
1615*d19533e8SHuawei Xie 					"packet.\n",
1616*d19533e8SHuawei Xie 					dev_ll->dev->device_fh);
1617*d19533e8SHuawei Xie 					return;
1618*d19533e8SHuawei Xie 				}
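				/*
				 * The extra 4 bytes reserved here presumably
				 * account for the VLAN tag the NIC strips
				 * when the packet is switched back to the
				 * destination VM's RX queue.
				 */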
1619*d19533e8SHuawei Xie 				offset = 4;
1620*d19533e8SHuawei Xie 				vlan_tag =
1621*d19533e8SHuawei Xie 				(uint16_t)
1622*d19533e8SHuawei Xie 				vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1623*d19533e8SHuawei Xie 
1624*d19533e8SHuawei Xie 				LOG_DEBUG(VHOST_DATA,
1625*d19533e8SHuawei Xie 				"(%"PRIu64") TX: pkt to local VM device id:"
1626*d19533e8SHuawei Xie 				"(%"PRIu64") vlan tag: %d.\n",
1627*d19533e8SHuawei Xie 				dev->device_fh, dev_ll->dev->device_fh,
1628*d19533e8SHuawei Xie 				vlan_tag);
1629*d19533e8SHuawei Xie 
1630*d19533e8SHuawei Xie 				break;
1631*d19533e8SHuawei Xie 			}
1632*d19533e8SHuawei Xie 			dev_ll = dev_ll->next;
1633*d19533e8SHuawei Xie 		}
1634*d19533e8SHuawei Xie 	}
1635*d19533e8SHuawei Xie 
1636*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1637*d19533e8SHuawei Xie 
1638*d19533e8SHuawei Xie 	/*Add packet to the port tx queue*/
1639*d19533e8SHuawei Xie 	tx_q = &lcore_tx_queue[lcore_id];
1640*d19533e8SHuawei Xie 	len = tx_q->len;
1641*d19533e8SHuawei Xie 
1642*d19533e8SHuawei Xie 	/* Allocate an mbuf and populate the structure. */
1643*d19533e8SHuawei Xie 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1644*d19533e8SHuawei Xie 	if (unlikely(mbuf == NULL)) {
1645*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_DATA,
1646*d19533e8SHuawei Xie 			"Failed to allocate memory for mbuf.\n");
1647*d19533e8SHuawei Xie 		return;
1648*d19533e8SHuawei Xie 	}
1649*d19533e8SHuawei Xie 
1650*d19533e8SHuawei Xie 	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1651*d19533e8SHuawei Xie 	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1652*d19533e8SHuawei Xie 	mbuf->nb_segs = m->nb_segs;
1653*d19533e8SHuawei Xie 
1654*d19533e8SHuawei Xie 	/* Copy ethernet header to mbuf. */
1655*d19533e8SHuawei Xie 	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1656*d19533e8SHuawei Xie 		rte_pktmbuf_mtod(m, const void *),
1657*d19533e8SHuawei Xie 		ETH_HLEN);
1658*d19533e8SHuawei Xie 
1659*d19533e8SHuawei Xie 
1660*d19533e8SHuawei Xie 	/* Set up the VLAN header. Bytes are reordered to network byte order with htons(). */
1661*d19533e8SHuawei Xie 	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1662*d19533e8SHuawei Xie 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1663*d19533e8SHuawei Xie 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1664*d19533e8SHuawei Xie 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1665*d19533e8SHuawei Xie 
1666*d19533e8SHuawei Xie 	/* Copy the remaining packet contents to the mbuf. */
1667*d19533e8SHuawei Xie 	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1668*d19533e8SHuawei Xie 		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1669*d19533e8SHuawei Xie 		(m->data_len - ETH_HLEN));
1670*d19533e8SHuawei Xie 
1671*d19533e8SHuawei Xie 	/* Copy the remaining segments for the whole packet. */
1672*d19533e8SHuawei Xie 	prev = mbuf;
1673*d19533e8SHuawei Xie 	while (m->next) {
1674*d19533e8SHuawei Xie 		/* Allocate an mbuf and populate the structure. */
1675*d19533e8SHuawei Xie 		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1676*d19533e8SHuawei Xie 		if (unlikely(next_mbuf == NULL)) {
1677*d19533e8SHuawei Xie 			rte_pktmbuf_free(mbuf);
1678*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_DATA,
1679*d19533e8SHuawei Xie 				"Failed to allocate memory for mbuf.\n");
1680*d19533e8SHuawei Xie 			return;
1681*d19533e8SHuawei Xie 		}
1682*d19533e8SHuawei Xie 
1683*d19533e8SHuawei Xie 		m = m->next;
1684*d19533e8SHuawei Xie 		prev->next = next_mbuf;
1685*d19533e8SHuawei Xie 		prev = next_mbuf;
1686*d19533e8SHuawei Xie 		next_mbuf->data_len = m->data_len;
1687*d19533e8SHuawei Xie 
1688*d19533e8SHuawei Xie 		/* Copy data to next mbuf. */
1689*d19533e8SHuawei Xie 		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1690*d19533e8SHuawei Xie 			rte_pktmbuf_mtod(m, const void *), m->data_len);
1691*d19533e8SHuawei Xie 	}
1692*d19533e8SHuawei Xie 
1693*d19533e8SHuawei Xie 	tx_q->m_table[len] = mbuf;
1694*d19533e8SHuawei Xie 	len++;
1695*d19533e8SHuawei Xie 	if (enable_stats) {
1696*d19533e8SHuawei Xie 		dev_statistics[dev->device_fh].tx_total++;
1697*d19533e8SHuawei Xie 		dev_statistics[dev->device_fh].tx++;
1698*d19533e8SHuawei Xie 	}
1699*d19533e8SHuawei Xie 
1700*d19533e8SHuawei Xie 	if (unlikely(len == MAX_PKT_BURST)) {
1701*d19533e8SHuawei Xie 		m_table = (struct rte_mbuf **)tx_q->m_table;
1702*d19533e8SHuawei Xie 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1703*d19533e8SHuawei Xie 		/* Free any buffers not handled by TX and update the port stats. */
1704*d19533e8SHuawei Xie 		if (unlikely(ret < len)) {
1705*d19533e8SHuawei Xie 			do {
1706*d19533e8SHuawei Xie 				rte_pktmbuf_free(m_table[ret]);
1707*d19533e8SHuawei Xie 			} while (++ret < len);
1708*d19533e8SHuawei Xie 		}
1709*d19533e8SHuawei Xie 
1710*d19533e8SHuawei Xie 		len = 0;
1711*d19533e8SHuawei Xie 	}
1712*d19533e8SHuawei Xie 
1713*d19533e8SHuawei Xie 	tx_q->len = len;
1714*d19533e8SHuawei Xie 	return;
1715*d19533e8SHuawei Xie }
1716*d19533e8SHuawei Xie 
1717*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
1718*d19533e8SHuawei Xie virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
1719*d19533e8SHuawei Xie {
1720*d19533e8SHuawei Xie 	struct rte_mbuf m;
1721*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
1722*d19533e8SHuawei Xie 	struct vring_desc *desc;
1723*d19533e8SHuawei Xie 	uint64_t buff_addr = 0;
1724*d19533e8SHuawei Xie 	uint32_t head[MAX_PKT_BURST];
1725*d19533e8SHuawei Xie 	uint32_t used_idx;
1726*d19533e8SHuawei Xie 	uint32_t i;
1727*d19533e8SHuawei Xie 	uint16_t free_entries, packet_success = 0;
1728*d19533e8SHuawei Xie 	uint16_t avail_idx;
1729*d19533e8SHuawei Xie 
1730*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_TXQ];
1731*d19533e8SHuawei Xie 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1732*d19533e8SHuawei Xie 
1733*d19533e8SHuawei Xie 	/* If there are no available buffers then return. */
1734*d19533e8SHuawei Xie 	if (vq->last_used_idx == avail_idx)
1735*d19533e8SHuawei Xie 		return;
1736*d19533e8SHuawei Xie 
1737*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1738*d19533e8SHuawei Xie 
1739*d19533e8SHuawei Xie 	/* Prefetch available ring to retrieve head indexes. */
1740*d19533e8SHuawei Xie 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1741*d19533e8SHuawei Xie 
1742*d19533e8SHuawei Xie 	/*get the number of free entries in the ring*/
1743*d19533e8SHuawei Xie 	free_entries = (avail_idx - vq->last_used_idx);
1744*d19533e8SHuawei Xie 
1745*d19533e8SHuawei Xie 	/* Limit to MAX_PKT_BURST. */
1746*d19533e8SHuawei Xie 	if (free_entries > MAX_PKT_BURST)
1747*d19533e8SHuawei Xie 		free_entries = MAX_PKT_BURST;
1748*d19533e8SHuawei Xie 
1749*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
1750*d19533e8SHuawei Xie 	/* Retrieve all of the head indexes first to avoid caching issues. */
1751*d19533e8SHuawei Xie 	for (i = 0; i < free_entries; i++)
1752*d19533e8SHuawei Xie 		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1753*d19533e8SHuawei Xie 
1754*d19533e8SHuawei Xie 	/* Prefetch descriptor index. */
1755*d19533e8SHuawei Xie 	rte_prefetch0(&vq->desc[head[packet_success]]);
1756*d19533e8SHuawei Xie 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1757*d19533e8SHuawei Xie 
1758*d19533e8SHuawei Xie 	while (packet_success < free_entries) {
1759*d19533e8SHuawei Xie 		desc = &vq->desc[head[packet_success]];
1760*d19533e8SHuawei Xie 
1761*d19533e8SHuawei Xie 		/* Discard first buffer as it is the virtio header */
1762*d19533e8SHuawei Xie 		desc = &vq->desc[desc->next];
1763*d19533e8SHuawei Xie 
1764*d19533e8SHuawei Xie 		/* Buffer address translation. */
1765*d19533e8SHuawei Xie 		buff_addr = gpa_to_vva(dev, desc->addr);
1766*d19533e8SHuawei Xie 		/* Prefetch buffer address. */
1767*d19533e8SHuawei Xie 		rte_prefetch0((void*)(uintptr_t)buff_addr);
1768*d19533e8SHuawei Xie 
1769*d19533e8SHuawei Xie 		used_idx = vq->last_used_idx & (vq->size - 1);
1770*d19533e8SHuawei Xie 
1771*d19533e8SHuawei Xie 		if (packet_success < (free_entries - 1)) {
1772*d19533e8SHuawei Xie 			/* Prefetch descriptor index. */
1773*d19533e8SHuawei Xie 			rte_prefetch0(&vq->desc[head[packet_success+1]]);
1774*d19533e8SHuawei Xie 			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1775*d19533e8SHuawei Xie 		}
1776*d19533e8SHuawei Xie 
1777*d19533e8SHuawei Xie 		/* Update used index buffer information. */
1778*d19533e8SHuawei Xie 		vq->used->ring[used_idx].id = head[packet_success];
1779*d19533e8SHuawei Xie 		vq->used->ring[used_idx].len = 0;
1780*d19533e8SHuawei Xie 
1781*d19533e8SHuawei Xie 		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
1782*d19533e8SHuawei Xie 		m.data_len = desc->len;
1783*d19533e8SHuawei Xie 		m.pkt_len = desc->len;
1784*d19533e8SHuawei Xie 		m.data_off = 0;
1785*d19533e8SHuawei Xie 
1786*d19533e8SHuawei Xie 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1787*d19533e8SHuawei Xie 
1788*d19533e8SHuawei Xie 		/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1789*d19533e8SHuawei Xie 		if (dev->ready == DEVICE_MAC_LEARNING) {
1790*d19533e8SHuawei Xie 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
1791*d19533e8SHuawei Xie 				/*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
1792*d19533e8SHuawei Xie 				packet_success += free_entries;
1793*d19533e8SHuawei Xie 				vq->last_used_idx += packet_success;
1794*d19533e8SHuawei Xie 				break;
1795*d19533e8SHuawei Xie 			}
1796*d19533e8SHuawei Xie 		}
1797*d19533e8SHuawei Xie 		virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
1798*d19533e8SHuawei Xie 
1799*d19533e8SHuawei Xie 		vq->last_used_idx++;
1800*d19533e8SHuawei Xie 		packet_success++;
1801*d19533e8SHuawei Xie 	}
1802*d19533e8SHuawei Xie 
1803*d19533e8SHuawei Xie 	rte_compiler_barrier();
1804*d19533e8SHuawei Xie 	vq->used->idx += packet_success;
1805*d19533e8SHuawei Xie 	/* Kick guest if required. */
1806*d19533e8SHuawei Xie 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1807*d19533e8SHuawei Xie 		eventfd_write((int)vq->kickfd, 1);
1808*d19533e8SHuawei Xie }
1809*d19533e8SHuawei Xie 
1810*d19533e8SHuawei Xie /* This function works for TX packets with mergeable feature enabled. */
1811*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
1812*d19533e8SHuawei Xie virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
1813*d19533e8SHuawei Xie {
1814*d19533e8SHuawei Xie 	struct rte_mbuf *m, *prev;
1815*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
1816*d19533e8SHuawei Xie 	struct vring_desc *desc;
1817*d19533e8SHuawei Xie 	uint64_t vb_addr = 0;
1818*d19533e8SHuawei Xie 	uint32_t head[MAX_PKT_BURST];
1819*d19533e8SHuawei Xie 	uint32_t used_idx;
1820*d19533e8SHuawei Xie 	uint32_t i;
1821*d19533e8SHuawei Xie 	uint16_t free_entries, entry_success = 0;
1822*d19533e8SHuawei Xie 	uint16_t avail_idx;
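	/* Usable data room per mbuf segment (MBUF_SIZE minus struct and headroom). */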
1823*d19533e8SHuawei Xie 	uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
1824*d19533e8SHuawei Xie 			+ RTE_PKTMBUF_HEADROOM);
1825*d19533e8SHuawei Xie 
1826*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_TXQ];
1827*d19533e8SHuawei Xie 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1828*d19533e8SHuawei Xie 
1829*d19533e8SHuawei Xie 	/* If there are no available buffers then return. */
1830*d19533e8SHuawei Xie 	if (vq->last_used_idx == avail_idx)
1831*d19533e8SHuawei Xie 		return;
1832*d19533e8SHuawei Xie 
1833*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
1834*d19533e8SHuawei Xie 		dev->device_fh);
1835*d19533e8SHuawei Xie 
1836*d19533e8SHuawei Xie 	/* Prefetch available ring to retrieve head indexes. */
1837*d19533e8SHuawei Xie 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1838*d19533e8SHuawei Xie 
1839*d19533e8SHuawei Xie 	/*get the number of free entries in the ring*/
1840*d19533e8SHuawei Xie 	free_entries = (avail_idx - vq->last_used_idx);
1841*d19533e8SHuawei Xie 
1842*d19533e8SHuawei Xie 	/* Limit to MAX_PKT_BURST. */
1843*d19533e8SHuawei Xie 	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
1844*d19533e8SHuawei Xie 
1845*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1846*d19533e8SHuawei Xie 		dev->device_fh, free_entries);
1847*d19533e8SHuawei Xie 	/* Retrieve all of the head indexes first to avoid caching issues. */
1848*d19533e8SHuawei Xie 	for (i = 0; i < free_entries; i++)
1849*d19533e8SHuawei Xie 		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1850*d19533e8SHuawei Xie 
1851*d19533e8SHuawei Xie 	/* Prefetch descriptor index. */
1852*d19533e8SHuawei Xie 	rte_prefetch0(&vq->desc[head[entry_success]]);
1853*d19533e8SHuawei Xie 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1854*d19533e8SHuawei Xie 
1855*d19533e8SHuawei Xie 	while (entry_success < free_entries) {
1856*d19533e8SHuawei Xie 		uint32_t vb_avail, vb_offset;
1857*d19533e8SHuawei Xie 		uint32_t seg_avail, seg_offset;
1858*d19533e8SHuawei Xie 		uint32_t cpy_len;
1859*d19533e8SHuawei Xie 		uint32_t seg_num = 0;
1860*d19533e8SHuawei Xie 		struct rte_mbuf *cur;
1861*d19533e8SHuawei Xie 		uint8_t alloc_err = 0;
1862*d19533e8SHuawei Xie 
1863*d19533e8SHuawei Xie 		desc = &vq->desc[head[entry_success]];
1864*d19533e8SHuawei Xie 
1865*d19533e8SHuawei Xie 		/* Discard first buffer as it is the virtio header */
1866*d19533e8SHuawei Xie 		desc = &vq->desc[desc->next];
1867*d19533e8SHuawei Xie 
1868*d19533e8SHuawei Xie 		/* Buffer address translation. */
1869*d19533e8SHuawei Xie 		vb_addr = gpa_to_vva(dev, desc->addr);
1870*d19533e8SHuawei Xie 		/* Prefetch buffer address. */
1871*d19533e8SHuawei Xie 		rte_prefetch0((void *)(uintptr_t)vb_addr);
1872*d19533e8SHuawei Xie 
1873*d19533e8SHuawei Xie 		used_idx = vq->last_used_idx & (vq->size - 1);
1874*d19533e8SHuawei Xie 
1875*d19533e8SHuawei Xie 		if (entry_success < (free_entries - 1)) {
1876*d19533e8SHuawei Xie 			/* Prefetch descriptor index. */
1877*d19533e8SHuawei Xie 			rte_prefetch0(&vq->desc[head[entry_success+1]]);
1878*d19533e8SHuawei Xie 			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1879*d19533e8SHuawei Xie 		}
1880*d19533e8SHuawei Xie 
1881*d19533e8SHuawei Xie 		/* Update used index buffer information. */
1882*d19533e8SHuawei Xie 		vq->used->ring[used_idx].id = head[entry_success];
1883*d19533e8SHuawei Xie 		vq->used->ring[used_idx].len = 0;
1884*d19533e8SHuawei Xie 
1885*d19533e8SHuawei Xie 		vb_offset = 0;
1886*d19533e8SHuawei Xie 		vb_avail = desc->len;
1887*d19533e8SHuawei Xie 		seg_offset = 0;
1888*d19533e8SHuawei Xie 		seg_avail = buf_size;
1889*d19533e8SHuawei Xie 		cpy_len = RTE_MIN(vb_avail, seg_avail);
1890*d19533e8SHuawei Xie 
1891*d19533e8SHuawei Xie 		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
1892*d19533e8SHuawei Xie 
1893*d19533e8SHuawei Xie 		/* Allocate an mbuf and populate the structure. */
1894*d19533e8SHuawei Xie 		m = rte_pktmbuf_alloc(mbuf_pool);
1895*d19533e8SHuawei Xie 		if (unlikely(m == NULL)) {
1896*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_DATA,
1897*d19533e8SHuawei Xie 				"Failed to allocate memory for mbuf.\n");
1898*d19533e8SHuawei Xie 			return;
1899*d19533e8SHuawei Xie 		}
1900*d19533e8SHuawei Xie 
1901*d19533e8SHuawei Xie 		seg_num++;
1902*d19533e8SHuawei Xie 		cur = m;
1903*d19533e8SHuawei Xie 		prev = m;
1904*d19533e8SHuawei Xie 		while (cpy_len != 0) {
1905*d19533e8SHuawei Xie 			rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
1906*d19533e8SHuawei Xie 				(void *)((uintptr_t)(vb_addr + vb_offset)),
1907*d19533e8SHuawei Xie 				cpy_len);
1908*d19533e8SHuawei Xie 
1909*d19533e8SHuawei Xie 			seg_offset += cpy_len;
1910*d19533e8SHuawei Xie 			vb_offset += cpy_len;
1911*d19533e8SHuawei Xie 			vb_avail -= cpy_len;
1912*d19533e8SHuawei Xie 			seg_avail -= cpy_len;
1913*d19533e8SHuawei Xie 
1914*d19533e8SHuawei Xie 			if (vb_avail != 0) {
1915*d19533e8SHuawei Xie 				/*
1916*d19533e8SHuawei Xie 				 * The mbuf segment has reached its end,
1917*d19533e8SHuawei Xie 				 * while the virtio buffer in TX vring has
1918*d19533e8SHuawei Xie 				 * more data to be copied.
1919*d19533e8SHuawei Xie 				 */
1920*d19533e8SHuawei Xie 				cur->data_len = seg_offset;
1921*d19533e8SHuawei Xie 				m->pkt_len += seg_offset;
1922*d19533e8SHuawei Xie 				/* Allocate mbuf and populate the structure. */
1923*d19533e8SHuawei Xie 				cur = rte_pktmbuf_alloc(mbuf_pool);
1924*d19533e8SHuawei Xie 				if (unlikely(cur == NULL)) {
1925*d19533e8SHuawei Xie 					RTE_LOG(ERR, VHOST_DATA, "Failed to "
1926*d19533e8SHuawei Xie 						"allocate memory for mbuf.\n");
1927*d19533e8SHuawei Xie 					rte_pktmbuf_free(m);
1928*d19533e8SHuawei Xie 					alloc_err = 1;
1929*d19533e8SHuawei Xie 					break;
1930*d19533e8SHuawei Xie 				}
1931*d19533e8SHuawei Xie 
1932*d19533e8SHuawei Xie 				seg_num++;
1933*d19533e8SHuawei Xie 				prev->next = cur;
1934*d19533e8SHuawei Xie 				prev = cur;
1935*d19533e8SHuawei Xie 				seg_offset = 0;
1936*d19533e8SHuawei Xie 				seg_avail = buf_size;
1937*d19533e8SHuawei Xie 			} else {
1938*d19533e8SHuawei Xie 				if (desc->flags & VRING_DESC_F_NEXT) {
1939*d19533e8SHuawei Xie 					/*
1940*d19533e8SHuawei Xie 					 * There are more virtio buffers in
1941*d19533e8SHuawei Xie 					 * same vring entry need to be copied.
1942*d19533e8SHuawei Xie 					 */
1943*d19533e8SHuawei Xie 					if (seg_avail == 0) {
1944*d19533e8SHuawei Xie 						/*
1945*d19533e8SHuawei Xie 						 * The current segment has no
1946*d19533e8SHuawei Xie 						 * room to accommodate more
1947*d19533e8SHuawei Xie 						 * data.
1948*d19533e8SHuawei Xie 						 */
1949*d19533e8SHuawei Xie 						cur->data_len = seg_offset;
1950*d19533e8SHuawei Xie 						m->pkt_len += seg_offset;
1951*d19533e8SHuawei Xie 						/*
1952*d19533e8SHuawei Xie 						 * Allocate an mbuf and
1953*d19533e8SHuawei Xie 						 * populate the structure.
1954*d19533e8SHuawei Xie 						 */
1955*d19533e8SHuawei Xie 						cur = rte_pktmbuf_alloc(mbuf_pool);
1956*d19533e8SHuawei Xie 						if (unlikely(cur == NULL)) {
1957*d19533e8SHuawei Xie 							RTE_LOG(ERR,
1958*d19533e8SHuawei Xie 								VHOST_DATA,
1959*d19533e8SHuawei Xie 								"Failed to "
1960*d19533e8SHuawei Xie 								"allocate memory "
1961*d19533e8SHuawei Xie 								"for mbuf\n");
1962*d19533e8SHuawei Xie 							rte_pktmbuf_free(m);
1963*d19533e8SHuawei Xie 							alloc_err = 1;
1964*d19533e8SHuawei Xie 							break;
1965*d19533e8SHuawei Xie 						}
1966*d19533e8SHuawei Xie 						seg_num++;
1967*d19533e8SHuawei Xie 						prev->next = cur;
1968*d19533e8SHuawei Xie 						prev = cur;
1969*d19533e8SHuawei Xie 						seg_offset = 0;
1970*d19533e8SHuawei Xie 						seg_avail = buf_size;
1971*d19533e8SHuawei Xie 					}
1972*d19533e8SHuawei Xie 
1973*d19533e8SHuawei Xie 					desc = &vq->desc[desc->next];
1974*d19533e8SHuawei Xie 
1975*d19533e8SHuawei Xie 					/* Buffer address translation. */
1976*d19533e8SHuawei Xie 					vb_addr = gpa_to_vva(dev, desc->addr);
1977*d19533e8SHuawei Xie 					/* Prefetch buffer address. */
1978*d19533e8SHuawei Xie 					rte_prefetch0((void *)(uintptr_t)vb_addr);
1979*d19533e8SHuawei Xie 					vb_offset = 0;
1980*d19533e8SHuawei Xie 					vb_avail = desc->len;
1981*d19533e8SHuawei Xie 
1982*d19533e8SHuawei Xie 					PRINT_PACKET(dev, (uintptr_t)vb_addr,
1983*d19533e8SHuawei Xie 						desc->len, 0);
1984*d19533e8SHuawei Xie 				} else {
1985*d19533e8SHuawei Xie 					/* The whole packet is complete. */
1986*d19533e8SHuawei Xie 					cur->data_len = seg_offset;
1987*d19533e8SHuawei Xie 					m->pkt_len += seg_offset;
1988*d19533e8SHuawei Xie 					vb_avail = 0;
1989*d19533e8SHuawei Xie 				}
1990*d19533e8SHuawei Xie 			}
1991*d19533e8SHuawei Xie 
1992*d19533e8SHuawei Xie 			cpy_len = RTE_MIN(vb_avail, seg_avail);
1993*d19533e8SHuawei Xie 		}
1994*d19533e8SHuawei Xie 
1995*d19533e8SHuawei Xie 		if (unlikely(alloc_err == 1))
1996*d19533e8SHuawei Xie 			break;
1997*d19533e8SHuawei Xie 
1998*d19533e8SHuawei Xie 		m->nb_segs = seg_num;
1999*d19533e8SHuawei Xie 
2000*d19533e8SHuawei Xie 		/*
2001*d19533e8SHuawei Xie 		 * If this is the first received packet we need to learn
2002*d19533e8SHuawei Xie 		 * the MAC and setup VMDQ
2003*d19533e8SHuawei Xie 		 */
2004*d19533e8SHuawei Xie 		if (dev->ready == DEVICE_MAC_LEARNING) {
2005*d19533e8SHuawei Xie 			if (dev->remove || (link_vmdq(dev, m) == -1)) {
2006*d19533e8SHuawei Xie 				/*
2007*d19533e8SHuawei Xie 				 * Discard frame if device is scheduled for
2008*d19533e8SHuawei Xie 				 * removal or a duplicate MAC address is found.
2009*d19533e8SHuawei Xie 				 */
2010*d19533e8SHuawei Xie 				entry_success = free_entries;
2011*d19533e8SHuawei Xie 				vq->last_used_idx += entry_success;
2012*d19533e8SHuawei Xie 				rte_pktmbuf_free(m);
2013*d19533e8SHuawei Xie 				break;
2014*d19533e8SHuawei Xie 			}
2015*d19533e8SHuawei Xie 		}
2016*d19533e8SHuawei Xie 
2017*d19533e8SHuawei Xie 		virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
2018*d19533e8SHuawei Xie 		vq->last_used_idx++;
2019*d19533e8SHuawei Xie 		entry_success++;
2020*d19533e8SHuawei Xie 		rte_pktmbuf_free(m);
2021*d19533e8SHuawei Xie 	}
2022*d19533e8SHuawei Xie 
2023*d19533e8SHuawei Xie 	rte_compiler_barrier();
2024*d19533e8SHuawei Xie 	vq->used->idx += entry_success;
2025*d19533e8SHuawei Xie 	/* Kick guest if required. */
2026*d19533e8SHuawei Xie 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2027*d19533e8SHuawei Xie 		eventfd_write((int)vq->kickfd, 1);
2028*d19533e8SHuawei Xie 
2029*d19533e8SHuawei Xie }
2030*d19533e8SHuawei Xie 
2031*d19533e8SHuawei Xie /*
2032*d19533e8SHuawei Xie  * This function is called by each data core. It handles all RX/TX registered with the
2033*d19533e8SHuawei Xie  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
2034*d19533e8SHuawei Xie  * with all devices in the main linked list.
2035*d19533e8SHuawei Xie  */
2036*d19533e8SHuawei Xie static int
2037*d19533e8SHuawei Xie switch_worker(void *arg)
2038*d19533e8SHuawei Xie {
2039*d19533e8SHuawei Xie 	struct rte_mempool *mbuf_pool = arg;
2040*d19533e8SHuawei Xie 	struct virtio_net *dev = NULL;
2041*d19533e8SHuawei Xie 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2042*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll;
2043*d19533e8SHuawei Xie 	struct mbuf_table *tx_q;
2044*d19533e8SHuawei Xie 	volatile struct lcore_ll_info *lcore_ll;
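	/* TX drain period: BURST_TX_DRAIN_US microseconds expressed in TSC cycles. */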
2045*d19533e8SHuawei Xie 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
2046*d19533e8SHuawei Xie 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2047*d19533e8SHuawei Xie 	unsigned ret, i;
2048*d19533e8SHuawei Xie 	const uint16_t lcore_id = rte_lcore_id();
2049*d19533e8SHuawei Xie 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
2050*d19533e8SHuawei Xie 	uint16_t rx_count = 0;
2051*d19533e8SHuawei Xie 	uint32_t mergeable = 0;
2052*d19533e8SHuawei Xie 
2053*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2054*d19533e8SHuawei Xie 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2055*d19533e8SHuawei Xie 	prev_tsc = 0;
2056*d19533e8SHuawei Xie 
2057*d19533e8SHuawei Xie 	tx_q = &lcore_tx_queue[lcore_id];
2058*d19533e8SHuawei Xie 	for (i = 0; i < num_cores; i ++) {
2059*d19533e8SHuawei Xie 		if (lcore_ids[i] == lcore_id) {
2060*d19533e8SHuawei Xie 			tx_q->txq_id = i;
2061*d19533e8SHuawei Xie 			break;
2062*d19533e8SHuawei Xie 		}
2063*d19533e8SHuawei Xie 	}
2064*d19533e8SHuawei Xie 
2065*d19533e8SHuawei Xie 	while(1) {
2066*d19533e8SHuawei Xie 		cur_tsc = rte_rdtsc();
2067*d19533e8SHuawei Xie 		/*
2068*d19533e8SHuawei Xie 		 * TX burst queue drain
2069*d19533e8SHuawei Xie 		 */
2070*d19533e8SHuawei Xie 		diff_tsc = cur_tsc - prev_tsc;
2071*d19533e8SHuawei Xie 		if (unlikely(diff_tsc > drain_tsc)) {
2072*d19533e8SHuawei Xie 
2073*d19533e8SHuawei Xie 			if (tx_q->len) {
2074*d19533e8SHuawei Xie 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
2075*d19533e8SHuawei Xie 
2076*d19533e8SHuawei Xie 				/*Tx any packets in the queue*/
2077*d19533e8SHuawei Xie 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
2078*d19533e8SHuawei Xie 									   (struct rte_mbuf **)tx_q->m_table,
2079*d19533e8SHuawei Xie 									   (uint16_t)tx_q->len);
2080*d19533e8SHuawei Xie 				if (unlikely(ret < tx_q->len)) {
2081*d19533e8SHuawei Xie 					do {
2082*d19533e8SHuawei Xie 						rte_pktmbuf_free(tx_q->m_table[ret]);
2083*d19533e8SHuawei Xie 					} while (++ret < tx_q->len);
2084*d19533e8SHuawei Xie 				}
2085*d19533e8SHuawei Xie 
2086*d19533e8SHuawei Xie 				tx_q->len = 0;
2087*d19533e8SHuawei Xie 			}
2088*d19533e8SHuawei Xie 
2089*d19533e8SHuawei Xie 			prev_tsc = cur_tsc;
2090*d19533e8SHuawei Xie 
2091*d19533e8SHuawei Xie 		}
2092*d19533e8SHuawei Xie 
2093*d19533e8SHuawei Xie 		rte_prefetch0(lcore_ll->ll_root_used);
2094*d19533e8SHuawei Xie 		/*
2095*d19533e8SHuawei Xie 		 * If device removal has been requested, inform the configuration core that we
2096*d19533e8SHuawei Xie 		 * have exited the linked list and that no devices are in use.
2097*d19533e8SHuawei Xie 		 */
2098*d19533e8SHuawei Xie 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2099*d19533e8SHuawei Xie 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2100*d19533e8SHuawei Xie 
2101*d19533e8SHuawei Xie 		/*
2102*d19533e8SHuawei Xie 		 * Process devices
2103*d19533e8SHuawei Xie 		 */
2104*d19533e8SHuawei Xie 		dev_ll = lcore_ll->ll_root_used;
2105*d19533e8SHuawei Xie 
2106*d19533e8SHuawei Xie 		while (dev_ll != NULL) {
2107*d19533e8SHuawei Xie 			/*get virtio device ID*/
2108*d19533e8SHuawei Xie 			dev = dev_ll->dev;
2109*d19533e8SHuawei Xie 			mergeable =
2110*d19533e8SHuawei Xie 				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
2111*d19533e8SHuawei Xie 
2112*d19533e8SHuawei Xie 			if (dev->remove) {
2113*d19533e8SHuawei Xie 				dev_ll = dev_ll->next;
2114*d19533e8SHuawei Xie 				unlink_vmdq(dev);
2115*d19533e8SHuawei Xie 				dev->ready = DEVICE_SAFE_REMOVE;
2116*d19533e8SHuawei Xie 				continue;
2117*d19533e8SHuawei Xie 			}
2118*d19533e8SHuawei Xie 			if (likely(dev->ready == DEVICE_RX)) {
2119*d19533e8SHuawei Xie 				/*Handle guest RX*/
2120*d19533e8SHuawei Xie 				rx_count = rte_eth_rx_burst(ports[0],
2121*d19533e8SHuawei Xie 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
2122*d19533e8SHuawei Xie 
2123*d19533e8SHuawei Xie 				if (rx_count) {
2124*d19533e8SHuawei Xie 					if (likely(mergeable == 0))
2125*d19533e8SHuawei Xie 						ret_count =
2126*d19533e8SHuawei Xie 							virtio_dev_rx(dev,
2127*d19533e8SHuawei Xie 							pkts_burst, rx_count);
2128*d19533e8SHuawei Xie 					else
2129*d19533e8SHuawei Xie 						ret_count =
2130*d19533e8SHuawei Xie 							virtio_dev_merge_rx(dev,
2131*d19533e8SHuawei Xie 							pkts_burst, rx_count);
2132*d19533e8SHuawei Xie 
2133*d19533e8SHuawei Xie 					if (enable_stats) {
2134*d19533e8SHuawei Xie 						rte_atomic64_add(
2135*d19533e8SHuawei Xie 						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
2136*d19533e8SHuawei Xie 						rx_count);
2137*d19533e8SHuawei Xie 						rte_atomic64_add(
2138*d19533e8SHuawei Xie 						&dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
2139*d19533e8SHuawei Xie 					}
2140*d19533e8SHuawei Xie 					while (likely(rx_count)) {
2141*d19533e8SHuawei Xie 						rx_count--;
2142*d19533e8SHuawei Xie 						rte_pktmbuf_free(pkts_burst[rx_count]);
2143*d19533e8SHuawei Xie 					}
2144*d19533e8SHuawei Xie 
2145*d19533e8SHuawei Xie 				}
2146*d19533e8SHuawei Xie 			}
2147*d19533e8SHuawei Xie 
2148*d19533e8SHuawei Xie 			if (!dev->remove) {
2149*d19533e8SHuawei Xie 				/*Handle guest TX*/
2150*d19533e8SHuawei Xie 				if (likely(mergeable == 0))
2151*d19533e8SHuawei Xie 					virtio_dev_tx(dev, mbuf_pool);
2152*d19533e8SHuawei Xie 				else
2153*d19533e8SHuawei Xie 					virtio_dev_merge_tx(dev, mbuf_pool);
2154*d19533e8SHuawei Xie 			}
2155*d19533e8SHuawei Xie 
2156*d19533e8SHuawei Xie 			/*move to the next device in the list*/
2157*d19533e8SHuawei Xie 			dev_ll = dev_ll->next;
2158*d19533e8SHuawei Xie 		}
2159*d19533e8SHuawei Xie 	}
2160*d19533e8SHuawei Xie 
2161*d19533e8SHuawei Xie 	return 0;
2162*d19533e8SHuawei Xie }
2163*d19533e8SHuawei Xie 
2164*d19533e8SHuawei Xie /*
2165*d19533e8SHuawei Xie  * This function gets the number of available ring entries for zero copy RX.
2166*d19533e8SHuawei Xie  * Only one thread will call this function for a particular virtio device,
2167*d19533e8SHuawei Xie  * so it is designed as a non-thread-safe function.
2168*d19533e8SHuawei Xie  */
2169*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
2170*d19533e8SHuawei Xie get_available_ring_num_zcp(struct virtio_net *dev)
2171*d19533e8SHuawei Xie {
2172*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2173*d19533e8SHuawei Xie 	uint16_t avail_idx;
2174*d19533e8SHuawei Xie 
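	/* Volatile load of the guest-updated available index so a stale, compiler-cached value is not reused. */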
2175*d19533e8SHuawei Xie 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2176*d19533e8SHuawei Xie 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
2177*d19533e8SHuawei Xie }
2178*d19533e8SHuawei Xie 
2179*d19533e8SHuawei Xie /*
2180*d19533e8SHuawei Xie  * This function gets available ring indexes for zero copy RX;
2181*d19533e8SHuawei Xie  * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
2182*d19533e8SHuawei Xie  * Only one thread will call this function for a particular virtio device,
2183*d19533e8SHuawei Xie  * so it is designed as a non-thread-safe function.
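 *
 * A minimal, illustrative call pattern (the local names 'base', 'got' and
 * 'desc_idx' exist only in this sketch):
 *
 *   uint16_t base;
 *   uint32_t got = get_available_ring_index_zcp(dev, &base, 1);
 *   if (got == 1)
 *       desc_idx = vq->avail->ring[base & (vq->size - 1)];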
2184*d19533e8SHuawei Xie  */
2185*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
2186*d19533e8SHuawei Xie get_available_ring_index_zcp(struct virtio_net *dev,
2187*d19533e8SHuawei Xie 	uint16_t *res_base_idx, uint32_t count)
2188*d19533e8SHuawei Xie {
2189*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2190*d19533e8SHuawei Xie 	uint16_t avail_idx;
2191*d19533e8SHuawei Xie 	uint32_t retry = 0;
2192*d19533e8SHuawei Xie 	uint16_t free_entries;
2193*d19533e8SHuawei Xie 
2194*d19533e8SHuawei Xie 	*res_base_idx = vq->last_used_idx_res;
2195*d19533e8SHuawei Xie 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2196*d19533e8SHuawei Xie 	free_entries = (avail_idx - *res_base_idx);
2197*d19533e8SHuawei Xie 
2198*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
2199*d19533e8SHuawei Xie 			"avail idx: %d, "
2200*d19533e8SHuawei Xie 			"res base idx:%d, free entries:%d\n",
2201*d19533e8SHuawei Xie 			dev->device_fh, avail_idx, *res_base_idx,
2202*d19533e8SHuawei Xie 			free_entries);
2203*d19533e8SHuawei Xie 
2204*d19533e8SHuawei Xie 	/*
2205*d19533e8SHuawei Xie 	 * If retry is enabled and the queue is full then we wait
2206*d19533e8SHuawei Xie 	 * and retry to avoid packet loss.
2207*d19533e8SHuawei Xie 	 */
2208*d19533e8SHuawei Xie 	if (enable_retry && unlikely(count > free_entries)) {
2209*d19533e8SHuawei Xie 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
2210*d19533e8SHuawei Xie 			rte_delay_us(burst_rx_delay_time);
2211*d19533e8SHuawei Xie 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2212*d19533e8SHuawei Xie 			free_entries = (avail_idx - *res_base_idx);
2213*d19533e8SHuawei Xie 			if (count <= free_entries)
2214*d19533e8SHuawei Xie 				break;
2215*d19533e8SHuawei Xie 		}
2216*d19533e8SHuawei Xie 	}
2217*d19533e8SHuawei Xie 
2218*d19533e8SHuawei Xie 	/*check that we have enough buffers*/
2219*d19533e8SHuawei Xie 	if (unlikely(count > free_entries))
2220*d19533e8SHuawei Xie 		count = free_entries;
2221*d19533e8SHuawei Xie 
2222*d19533e8SHuawei Xie 	if (unlikely(count == 0)) {
2223*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_DATA,
2224*d19533e8SHuawei Xie 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
2225*d19533e8SHuawei Xie 			"avail idx: %d, res base idx:%d, free entries:%d\n",
2226*d19533e8SHuawei Xie 			dev->device_fh, avail_idx,
2227*d19533e8SHuawei Xie 			*res_base_idx, free_entries);
2228*d19533e8SHuawei Xie 		return 0;
2229*d19533e8SHuawei Xie 	}
2230*d19533e8SHuawei Xie 
2231*d19533e8SHuawei Xie 	vq->last_used_idx_res = *res_base_idx + count;
2232*d19533e8SHuawei Xie 
2233*d19533e8SHuawei Xie 	return count;
2234*d19533e8SHuawei Xie }
2235*d19533e8SHuawei Xie 
2236*d19533e8SHuawei Xie /*
2237*d19533e8SHuawei Xie  * This function puts a descriptor back on the used list.
2238*d19533e8SHuawei Xie  */
2239*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
2240*d19533e8SHuawei Xie put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
2241*d19533e8SHuawei Xie {
2242*d19533e8SHuawei Xie 	uint16_t res_cur_idx = vq->last_used_idx;
2243*d19533e8SHuawei Xie 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
2244*d19533e8SHuawei Xie 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
2245*d19533e8SHuawei Xie 	rte_compiler_barrier();
2246*d19533e8SHuawei Xie 	*(volatile uint16_t *)&vq->used->idx += 1;
2247*d19533e8SHuawei Xie 	vq->last_used_idx += 1;
2248*d19533e8SHuawei Xie 
2249*d19533e8SHuawei Xie 	/* Kick the guest if necessary. */
2250*d19533e8SHuawei Xie 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2251*d19533e8SHuawei Xie 		eventfd_write((int)vq->kickfd, 1);
2252*d19533e8SHuawei Xie }
2253*d19533e8SHuawei Xie 
2254*d19533e8SHuawei Xie /*
2255*d19533e8SHuawei Xie  * This function gets an available descriptor from the virtio vring and an
2256*d19533e8SHuawei Xie  * un-attached mbuf from vpool->ring, then attaches them together. It needs to
2257*d19533e8SHuawei Xie  * adjust the offsets for buff_addr and phys_addr according to the PMD
2258*d19533e8SHuawei Xie  * implementation, otherwise the frame data may land at the wrong place in the mbuf.
2259*d19533e8SHuawei Xie  */
2260*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
2261*d19533e8SHuawei Xie attach_rxmbuf_zcp(struct virtio_net *dev)
2262*d19533e8SHuawei Xie {
2263*d19533e8SHuawei Xie 	uint16_t res_base_idx, desc_idx;
2264*d19533e8SHuawei Xie 	uint64_t buff_addr, phys_addr;
2265*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
2266*d19533e8SHuawei Xie 	struct vring_desc *desc;
2267*d19533e8SHuawei Xie 	struct rte_mbuf *mbuf = NULL;
2268*d19533e8SHuawei Xie 	struct vpool *vpool;
2269*d19533e8SHuawei Xie 	hpa_type addr_type;
2270*d19533e8SHuawei Xie 
2271*d19533e8SHuawei Xie 	vpool = &vpool_array[dev->vmdq_rx_q];
2272*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_RXQ];
2273*d19533e8SHuawei Xie 
2274*d19533e8SHuawei Xie 	do {
2275*d19533e8SHuawei Xie 		if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
2276*d19533e8SHuawei Xie 				1) != 1))
2277*d19533e8SHuawei Xie 			return;
2278*d19533e8SHuawei Xie 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
2279*d19533e8SHuawei Xie 
2280*d19533e8SHuawei Xie 		desc = &vq->desc[desc_idx];
2281*d19533e8SHuawei Xie 		if (desc->flags & VRING_DESC_F_NEXT) {
2282*d19533e8SHuawei Xie 			desc = &vq->desc[desc->next];
2283*d19533e8SHuawei Xie 			buff_addr = gpa_to_vva(dev, desc->addr);
2284*d19533e8SHuawei Xie 			phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
2285*d19533e8SHuawei Xie 					&addr_type);
2286*d19533e8SHuawei Xie 		} else {
2287*d19533e8SHuawei Xie 			buff_addr = gpa_to_vva(dev,
2288*d19533e8SHuawei Xie 					desc->addr + vq->vhost_hlen);
2289*d19533e8SHuawei Xie 			phys_addr = gpa_to_hpa(dev,
2290*d19533e8SHuawei Xie 					desc->addr + vq->vhost_hlen,
2291*d19533e8SHuawei Xie 					desc->len, &addr_type);
2292*d19533e8SHuawei Xie 		}
2293*d19533e8SHuawei Xie 
2294*d19533e8SHuawei Xie 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2295*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
2296*d19533e8SHuawei Xie 				" address found when attaching RX frame buffer"
2297*d19533e8SHuawei Xie 				" address!\n", dev->device_fh);
2298*d19533e8SHuawei Xie 			put_desc_to_used_list_zcp(vq, desc_idx);
2299*d19533e8SHuawei Xie 			continue;
2300*d19533e8SHuawei Xie 		}
2301*d19533e8SHuawei Xie 
2302*d19533e8SHuawei Xie 		/*
2303*d19533e8SHuawei Xie 		 * Check if the frame buffer address from guest crosses
2304*d19533e8SHuawei Xie 		 * sub-region or not.
2305*d19533e8SHuawei Xie 		 */
2306*d19533e8SHuawei Xie 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2307*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_DATA,
2308*d19533e8SHuawei Xie 				"(%"PRIu64") Frame buffer address cross "
2309*d19533e8SHuawei Xie 				"sub-region found when attaching RX frame "
2310*d19533e8SHuawei Xie 				"buffer address!\n",
2311*d19533e8SHuawei Xie 				dev->device_fh);
2312*d19533e8SHuawei Xie 			put_desc_to_used_list_zcp(vq, desc_idx);
2313*d19533e8SHuawei Xie 			continue;
2314*d19533e8SHuawei Xie 		}
2315*d19533e8SHuawei Xie 	} while (unlikely(phys_addr == 0));
2316*d19533e8SHuawei Xie 
2317*d19533e8SHuawei Xie 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2318*d19533e8SHuawei Xie 	if (unlikely(mbuf == NULL)) {
2319*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_DATA,
2320*d19533e8SHuawei Xie 			"(%"PRIu64") in attach_rxmbuf_zcp: "
2321*d19533e8SHuawei Xie 			"ring_sc_dequeue fail.\n",
2322*d19533e8SHuawei Xie 			dev->device_fh);
2323*d19533e8SHuawei Xie 		put_desc_to_used_list_zcp(vq, desc_idx);
2324*d19533e8SHuawei Xie 		return;
2325*d19533e8SHuawei Xie 	}
2326*d19533e8SHuawei Xie 
2327*d19533e8SHuawei Xie 	if (unlikely(vpool->buf_size > desc->len)) {
2328*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_DATA,
2329*d19533e8SHuawei Xie 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
2330*d19533e8SHuawei Xie 			"length(%d) of descriptor idx: %d less than room "
2331*d19533e8SHuawei Xie 			"size required: %d\n",
2332*d19533e8SHuawei Xie 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
2333*d19533e8SHuawei Xie 		put_desc_to_used_list_zcp(vq, desc_idx);
2334*d19533e8SHuawei Xie 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2335*d19533e8SHuawei Xie 		return;
2336*d19533e8SHuawei Xie 	}
2337*d19533e8SHuawei Xie 
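	/*
	 * Attach the guest descriptor's buffer to the mbuf: rewind buf_addr and
	 * buf_physaddr by the headroom so the frame data starts at data_off, and
	 * stash the descriptor index in the headroom for later completion.
	 */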
2338*d19533e8SHuawei Xie 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
2339*d19533e8SHuawei Xie 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
2340*d19533e8SHuawei Xie 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
2341*d19533e8SHuawei Xie 	mbuf->data_len = desc->len;
2342*d19533e8SHuawei Xie 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2343*d19533e8SHuawei Xie 
2344*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2345*d19533e8SHuawei Xie 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
2346*d19533e8SHuawei Xie 		"descriptor idx:%d\n",
2347*d19533e8SHuawei Xie 		dev->device_fh, res_base_idx, desc_idx);
2348*d19533e8SHuawei Xie 
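	/*
	 * Return the guest-backed mbuf to its mempool; the RX queue allocates
	 * from this pool, so the NIC can DMA incoming frames directly into the
	 * guest buffer, which is what makes the RX path zero copy.
	 */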
2349*d19533e8SHuawei Xie 	__rte_mbuf_raw_free(mbuf);
2350*d19533e8SHuawei Xie 
2351*d19533e8SHuawei Xie 	return;
2352*d19533e8SHuawei Xie }
2353*d19533e8SHuawei Xie 
2354*d19533e8SHuawei Xie /*
2355*d19533e8SHuawei Xie  * Detach an attached packet mbuf -
2356*d19533e8SHuawei Xie  *  - restore original mbuf address and length values.
2357*d19533e8SHuawei Xie  *  - reset pktmbuf data and data_len to their default values.
2358*d19533e8SHuawei Xie  *  All other fields of the given packet mbuf will be left intact.
2359*d19533e8SHuawei Xie  *
2360*d19533e8SHuawei Xie  * @param m
2361*d19533e8SHuawei Xie  *   The attached packet mbuf.
2362*d19533e8SHuawei Xie  */
2363*d19533e8SHuawei Xie static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
2364*d19533e8SHuawei Xie {
2365*d19533e8SHuawei Xie 	const struct rte_mempool *mp = m->pool;
2366*d19533e8SHuawei Xie 	void *buf = RTE_MBUF_TO_BADDR(m);
2367*d19533e8SHuawei Xie 	uint32_t buf_ofs;
2368*d19533e8SHuawei Xie 	uint32_t buf_len = mp->elt_size - sizeof(*m);
2369*d19533e8SHuawei Xie 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
2370*d19533e8SHuawei Xie 
2371*d19533e8SHuawei Xie 	m->buf_addr = buf;
2372*d19533e8SHuawei Xie 	m->buf_len = (uint16_t)buf_len;
2373*d19533e8SHuawei Xie 
2374*d19533e8SHuawei Xie 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
2375*d19533e8SHuawei Xie 			RTE_PKTMBUF_HEADROOM : m->buf_len;
2376*d19533e8SHuawei Xie 	m->data_off = buf_ofs;
2377*d19533e8SHuawei Xie 
2378*d19533e8SHuawei Xie 	m->data_len = 0;
2379*d19533e8SHuawei Xie }
2380*d19533e8SHuawei Xie 
2381*d19533e8SHuawei Xie /*
2382*d19533e8SHuawei Xie  * This function is called after packets have been transmitted. It fetches mbufs
2383*d19533e8SHuawei Xie  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
2384*d19533e8SHuawei Xie  * the used index and kicks the guest if necessary.
2385*d19533e8SHuawei Xie  */
2386*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
2387*d19533e8SHuawei Xie txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
2388*d19533e8SHuawei Xie {
2389*d19533e8SHuawei Xie 	struct rte_mbuf *mbuf;
2390*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2391*d19533e8SHuawei Xie 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
2392*d19533e8SHuawei Xie 	uint32_t index = 0;
2393*d19533e8SHuawei Xie 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
2394*d19533e8SHuawei Xie 
2395*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2396*d19533e8SHuawei Xie 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
2397*d19533e8SHuawei Xie 		"clean is: %d\n",
2398*d19533e8SHuawei Xie 		dev->device_fh, mbuf_count);
2399*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2400*d19533e8SHuawei Xie 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
2401*d19533e8SHuawei Xie 		"clean  is : %d\n",
2402*d19533e8SHuawei Xie 		dev->device_fh, rte_ring_count(vpool->ring));
2403*d19533e8SHuawei Xie 
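	/*
	 * Walk every mbuf currently in the mempool: detach the guest buffer,
	 * record its saved descriptor index in the used ring, and park the bare
	 * mbuf on vpool->ring for reuse.
	 */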
2404*d19533e8SHuawei Xie 	for (index = 0; index < mbuf_count; index++) {
2405*d19533e8SHuawei Xie 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2406*d19533e8SHuawei Xie 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
2407*d19533e8SHuawei Xie 			pktmbuf_detach_zcp(mbuf);
2408*d19533e8SHuawei Xie 		rte_ring_sp_enqueue(vpool->ring, mbuf);
2409*d19533e8SHuawei Xie 
2410*d19533e8SHuawei Xie 		/* Update used index buffer information. */
2411*d19533e8SHuawei Xie 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
2412*d19533e8SHuawei Xie 		vq->used->ring[used_idx].len = 0;
2413*d19533e8SHuawei Xie 
2414*d19533e8SHuawei Xie 		used_idx = (used_idx + 1) & (vq->size - 1);
2415*d19533e8SHuawei Xie 	}
2416*d19533e8SHuawei Xie 
2417*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2418*d19533e8SHuawei Xie 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
2419*d19533e8SHuawei Xie 		"clean is: %d\n",
2420*d19533e8SHuawei Xie 		dev->device_fh, rte_mempool_count(vpool->pool));
2421*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2422*d19533e8SHuawei Xie 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
2423*d19533e8SHuawei Xie 		"clean  is : %d\n",
2424*d19533e8SHuawei Xie 		dev->device_fh, rte_ring_count(vpool->ring));
2425*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2426*d19533e8SHuawei Xie 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
2427*d19533e8SHuawei Xie 		"vq->last_used_idx:%d\n",
2428*d19533e8SHuawei Xie 		dev->device_fh, vq->last_used_idx);
2429*d19533e8SHuawei Xie 
2430*d19533e8SHuawei Xie 	vq->last_used_idx += mbuf_count;
2431*d19533e8SHuawei Xie 
2432*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2433*d19533e8SHuawei Xie 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
2434*d19533e8SHuawei Xie 		"vq->last_used_idx:%d\n",
2435*d19533e8SHuawei Xie 		dev->device_fh, vq->last_used_idx);
2436*d19533e8SHuawei Xie 
2437*d19533e8SHuawei Xie 	rte_compiler_barrier();
2438*d19533e8SHuawei Xie 
2439*d19533e8SHuawei Xie 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
2440*d19533e8SHuawei Xie 
2441*d19533e8SHuawei Xie 	/* Kick guest if required. */
2442*d19533e8SHuawei Xie 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2443*d19533e8SHuawei Xie 		eventfd_write((int)vq->kickfd, 1);
2444*d19533e8SHuawei Xie 
2445*d19533e8SHuawei Xie 	return 0;
2446*d19533e8SHuawei Xie }
2447*d19533e8SHuawei Xie 
2448*d19533e8SHuawei Xie /*
2449*d19533e8SHuawei Xie  * This function is called when a virtio device is destroyed.
2450*d19533e8SHuawei Xie  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
2451*d19533e8SHuawei Xie  */
2452*d19533e8SHuawei Xie static void mbuf_destroy_zcp(struct vpool *vpool)
2453*d19533e8SHuawei Xie {
2454*d19533e8SHuawei Xie 	struct rte_mbuf *mbuf = NULL;
2455*d19533e8SHuawei Xie 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
2456*d19533e8SHuawei Xie 
2457*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_CONFIG,
2458*d19533e8SHuawei Xie 		"in mbuf_destroy_zcp: mbuf count in mempool before "
2459*d19533e8SHuawei Xie 		"mbuf_destroy_zcp is: %d\n",
2460*d19533e8SHuawei Xie 		mbuf_count);
2461*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_CONFIG,
2462*d19533e8SHuawei Xie 		"in mbuf_destroy_zcp: mbuf count in  ring before "
2463*d19533e8SHuawei Xie 		"mbuf_destroy_zcp  is : %d\n",
2464*d19533e8SHuawei Xie 		rte_ring_count(vpool->ring));
2465*d19533e8SHuawei Xie 
2466*d19533e8SHuawei Xie 	for (index = 0; index < mbuf_count; index++) {
2467*d19533e8SHuawei Xie 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2468*d19533e8SHuawei Xie 		if (likely(mbuf != NULL)) {
2469*d19533e8SHuawei Xie 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
2470*d19533e8SHuawei Xie 				pktmbuf_detach_zcp(mbuf);
2471*d19533e8SHuawei Xie 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2472*d19533e8SHuawei Xie 		}
2473*d19533e8SHuawei Xie 	}
2474*d19533e8SHuawei Xie 
2475*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_CONFIG,
2476*d19533e8SHuawei Xie 		"in mbuf_destroy_zcp: mbuf count in mempool after "
2477*d19533e8SHuawei Xie 		"mbuf_destroy_zcp is: %d\n",
2478*d19533e8SHuawei Xie 		rte_mempool_count(vpool->pool));
2479*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_CONFIG,
2480*d19533e8SHuawei Xie 		"in mbuf_destroy_zcp: mbuf count in ring after "
2481*d19533e8SHuawei Xie 		"mbuf_destroy_zcp is : %d\n",
2482*d19533e8SHuawei Xie 		rte_ring_count(vpool->ring));
2483*d19533e8SHuawei Xie }
2484*d19533e8SHuawei Xie 
2485*d19533e8SHuawei Xie /*
2486*d19533e8SHuawei Xie  * This function updates the used ring and writes the virtio headers for zero copy RX.
2487*d19533e8SHuawei Xie  */
2488*d19533e8SHuawei Xie static inline uint32_t __attribute__((always_inline))
2489*d19533e8SHuawei Xie virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
2490*d19533e8SHuawei Xie 	uint32_t count)
2491*d19533e8SHuawei Xie {
2492*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
2493*d19533e8SHuawei Xie 	struct vring_desc *desc;
2494*d19533e8SHuawei Xie 	struct rte_mbuf *buff;
2495*d19533e8SHuawei Xie 	/* The virtio_hdr is initialised to 0. */
2496*d19533e8SHuawei Xie 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
2497*d19533e8SHuawei Xie 		= {{0, 0, 0, 0, 0, 0}, 0};
2498*d19533e8SHuawei Xie 	uint64_t buff_hdr_addr = 0;
2499*d19533e8SHuawei Xie 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
2500*d19533e8SHuawei Xie 	uint32_t head_idx, packet_success = 0;
2501*d19533e8SHuawei Xie 	uint16_t res_cur_idx;
2502*d19533e8SHuawei Xie 
2503*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
2504*d19533e8SHuawei Xie 
2505*d19533e8SHuawei Xie 	if (count == 0)
2506*d19533e8SHuawei Xie 		return 0;
2507*d19533e8SHuawei Xie 
2508*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_RXQ];
2509*d19533e8SHuawei Xie 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
2510*d19533e8SHuawei Xie 
2511*d19533e8SHuawei Xie 	res_cur_idx = vq->last_used_idx;
2512*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
2513*d19533e8SHuawei Xie 		dev->device_fh, res_cur_idx, res_cur_idx + count);
2514*d19533e8SHuawei Xie 
2515*d19533e8SHuawei Xie 	/* Retrieve all of the head indexes first to avoid caching issues. */
2516*d19533e8SHuawei Xie 	for (head_idx = 0; head_idx < count; head_idx++)
2517*d19533e8SHuawei Xie 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
2518*d19533e8SHuawei Xie 
2519*d19533e8SHuawei Xie 	/*Prefetch descriptor index. */
2520*d19533e8SHuawei Xie 	rte_prefetch0(&vq->desc[head[packet_success]]);
2521*d19533e8SHuawei Xie 
2522*d19533e8SHuawei Xie 	while (packet_success != count) {
2523*d19533e8SHuawei Xie 		/* Get descriptor from available ring */
2524*d19533e8SHuawei Xie 		desc = &vq->desc[head[packet_success]];
2525*d19533e8SHuawei Xie 
2526*d19533e8SHuawei Xie 		buff = pkts[packet_success];
2527*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_DATA,
2528*d19533e8SHuawei Xie 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
2529*d19533e8SHuawei Xie 			"pkt[%d] descriptor idx: %d\n",
2530*d19533e8SHuawei Xie 			dev->device_fh, packet_success,
2531*d19533e8SHuawei Xie 			MBUF_HEADROOM_UINT32(buff));
2532*d19533e8SHuawei Xie 
2533*d19533e8SHuawei Xie 		PRINT_PACKET(dev,
2534*d19533e8SHuawei Xie 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
2535*d19533e8SHuawei Xie 			+ RTE_PKTMBUF_HEADROOM),
2536*d19533e8SHuawei Xie 			rte_pktmbuf_data_len(buff), 0);
2537*d19533e8SHuawei Xie 
2538*d19533e8SHuawei Xie 		/* Buffer address translation for virtio header. */
2539*d19533e8SHuawei Xie 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
2540*d19533e8SHuawei Xie 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
2541*d19533e8SHuawei Xie 
2542*d19533e8SHuawei Xie 		/*
2543*d19533e8SHuawei Xie 		 * If the descriptors are chained the header and data are
2544*d19533e8SHuawei Xie 		 * placed in separate buffers.
2545*d19533e8SHuawei Xie 		 */
2546*d19533e8SHuawei Xie 		if (desc->flags & VRING_DESC_F_NEXT) {
2547*d19533e8SHuawei Xie 			desc->len = vq->vhost_hlen;
2548*d19533e8SHuawei Xie 			desc = &vq->desc[desc->next];
2549*d19533e8SHuawei Xie 			desc->len = rte_pktmbuf_data_len(buff);
2550*d19533e8SHuawei Xie 		} else {
2551*d19533e8SHuawei Xie 			desc->len = packet_len;
2552*d19533e8SHuawei Xie 		}
2553*d19533e8SHuawei Xie 
2554*d19533e8SHuawei Xie 		/* Update used ring with desc information */
2555*d19533e8SHuawei Xie 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
2556*d19533e8SHuawei Xie 			= head[packet_success];
2557*d19533e8SHuawei Xie 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
2558*d19533e8SHuawei Xie 			= packet_len;
2559*d19533e8SHuawei Xie 		res_cur_idx++;
2560*d19533e8SHuawei Xie 		packet_success++;
2561*d19533e8SHuawei Xie 
2562*d19533e8SHuawei Xie 		/* A header is required per buffer. */
2563*d19533e8SHuawei Xie 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
2564*d19533e8SHuawei Xie 			(const void *)&virtio_hdr, vq->vhost_hlen);
2565*d19533e8SHuawei Xie 
2566*d19533e8SHuawei Xie 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
2567*d19533e8SHuawei Xie 
2568*d19533e8SHuawei Xie 		if (likely(packet_success < count)) {
2569*d19533e8SHuawei Xie 			/* Prefetch descriptor index. */
2570*d19533e8SHuawei Xie 			rte_prefetch0(&vq->desc[head[packet_success]]);
2571*d19533e8SHuawei Xie 		}
2572*d19533e8SHuawei Xie 	}
2573*d19533e8SHuawei Xie 
2574*d19533e8SHuawei Xie 	rte_compiler_barrier();
2575*d19533e8SHuawei Xie 
2576*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2577*d19533e8SHuawei Xie 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
2578*d19533e8SHuawei Xie 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
2579*d19533e8SHuawei Xie 		dev->device_fh, vq->last_used_idx, vq->used->idx);
2580*d19533e8SHuawei Xie 
2581*d19533e8SHuawei Xie 	*(volatile uint16_t *)&vq->used->idx += count;
2582*d19533e8SHuawei Xie 	vq->last_used_idx += count;
2583*d19533e8SHuawei Xie 
2584*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2585*d19533e8SHuawei Xie 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
2586*d19533e8SHuawei Xie 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
2587*d19533e8SHuawei Xie 		dev->device_fh, vq->last_used_idx, vq->used->idx);
2588*d19533e8SHuawei Xie 
2589*d19533e8SHuawei Xie 	/* Kick the guest if necessary. */
2590*d19533e8SHuawei Xie 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2591*d19533e8SHuawei Xie 		eventfd_write((int)vq->kickfd, 1);
2592*d19533e8SHuawei Xie 
2593*d19533e8SHuawei Xie 	return count;
2594*d19533e8SHuawei Xie }
2595*d19533e8SHuawei Xie 
2596*d19533e8SHuawei Xie /*
2597*d19533e8SHuawei Xie  * This function routes the TX packet to the correct interface.
2598*d19533e8SHuawei Xie  * This may be a local device or the physical port.
2599*d19533e8SHuawei Xie  */
2600*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
2601*d19533e8SHuawei Xie virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
2602*d19533e8SHuawei Xie 	uint32_t desc_idx, uint8_t need_copy)
2603*d19533e8SHuawei Xie {
2604*d19533e8SHuawei Xie 	struct mbuf_table *tx_q;
2605*d19533e8SHuawei Xie 	struct rte_mbuf **m_table;
2606*d19533e8SHuawei Xie 	struct rte_mbuf *mbuf = NULL;
2607*d19533e8SHuawei Xie 	unsigned len, ret, offset = 0;
2608*d19533e8SHuawei Xie 	struct vpool *vpool;
2609*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll = ll_root_used;
2610*d19533e8SHuawei Xie 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
2611*d19533e8SHuawei Xie 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
2612*d19533e8SHuawei Xie 
2613*d19533e8SHuawei Xie 	/*Add packet to the port tx queue*/
2614*d19533e8SHuawei Xie 	tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2615*d19533e8SHuawei Xie 	len = tx_q->len;
2616*d19533e8SHuawei Xie 
2617*d19533e8SHuawei Xie 	/* Allocate an mbuf and populate the structure. */
2618*d19533e8SHuawei Xie 	vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
2619*d19533e8SHuawei Xie 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2620*d19533e8SHuawei Xie 	if (unlikely(mbuf == NULL)) {
2621*d19533e8SHuawei Xie 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2622*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_DATA,
2623*d19533e8SHuawei Xie 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
2624*d19533e8SHuawei Xie 			dev->device_fh);
2625*d19533e8SHuawei Xie 		put_desc_to_used_list_zcp(vq, desc_idx);
2626*d19533e8SHuawei Xie 		return;
2627*d19533e8SHuawei Xie 	}
2628*d19533e8SHuawei Xie 
2629*d19533e8SHuawei Xie 	if (vm2vm_mode == VM2VM_HARDWARE) {
2630*d19533e8SHuawei Xie 		/* Avoid using a VLAN tag from any VM for an external packet, such as
2631*d19533e8SHuawei Xie 		 * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
2632*d19533e8SHuawei Xie 		 * selection: the MAC address marks it as an external packet that
2633*d19533e8SHuawei Xie 		 * should go to the network, while the VLAN tag marks it as a vm2vm
2634*d19533e8SHuawei Xie 		 * packet that should be forwarded to another VM. The hardware cannot
2635*d19533e8SHuawei Xie 		 * resolve such an ambiguous situation, so the packet would be lost.
2636*d19533e8SHuawei Xie 		 */
2637*d19533e8SHuawei Xie 		vlan_tag = external_pkt_default_vlan_tag;
2638*d19533e8SHuawei Xie 		while (dev_ll != NULL) {
2639*d19533e8SHuawei Xie 			if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2640*d19533e8SHuawei Xie 				ether_addr_cmp(&(pkt_hdr->d_addr),
2641*d19533e8SHuawei Xie 				&dev_ll->dev->mac_address)) {
2642*d19533e8SHuawei Xie 
2643*d19533e8SHuawei Xie 				/*
2644*d19533e8SHuawei Xie 				 * Drop the packet if the TX packet is destined
2645*d19533e8SHuawei Xie 				 * for the TX device.
2646*d19533e8SHuawei Xie 				 */
2647*d19533e8SHuawei Xie 				if (unlikely(dev_ll->dev->device_fh
2648*d19533e8SHuawei Xie 					== dev->device_fh)) {
2649*d19533e8SHuawei Xie 					LOG_DEBUG(VHOST_DATA,
2650*d19533e8SHuawei Xie 					"(%"PRIu64") TX: Source and destination "
2651*d19533e8SHuawei Xie 					"MAC addresses are the same. Dropping "
2652*d19533e8SHuawei Xie 					"packet.\n",
2653*d19533e8SHuawei Xie 					dev_ll->dev->device_fh);
2654*d19533e8SHuawei Xie 					MBUF_HEADROOM_UINT32(mbuf)
2655*d19533e8SHuawei Xie 						= (uint32_t)desc_idx;
2656*d19533e8SHuawei Xie 					__rte_mbuf_raw_free(mbuf);
2657*d19533e8SHuawei Xie 					return;
2658*d19533e8SHuawei Xie 				}
2659*d19533e8SHuawei Xie 
2660*d19533e8SHuawei Xie 				/*
2661*d19533e8SHuawei Xie 				 * Add 4 bytes to the packet length to compensate for
2662*d19533e8SHuawei Xie 				 * the HW VLAN strip when the packet is L2-switched back.
2663*d19533e8SHuawei Xie 				 */
2664*d19533e8SHuawei Xie 				offset = 4;
2665*d19533e8SHuawei Xie 				vlan_tag =
2666*d19533e8SHuawei Xie 				(uint16_t)
2667*d19533e8SHuawei Xie 				vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2668*d19533e8SHuawei Xie 
2669*d19533e8SHuawei Xie 				LOG_DEBUG(VHOST_DATA,
2670*d19533e8SHuawei Xie 				"(%"PRIu64") TX: pkt to local VM device id:"
2671*d19533e8SHuawei Xie 				"(%"PRIu64") vlan tag: %d.\n",
2672*d19533e8SHuawei Xie 				dev->device_fh, dev_ll->dev->device_fh,
2673*d19533e8SHuawei Xie 				vlan_tag);
2674*d19533e8SHuawei Xie 
2675*d19533e8SHuawei Xie 				break;
2676*d19533e8SHuawei Xie 			}
2677*d19533e8SHuawei Xie 			dev_ll = dev_ll->next;
2678*d19533e8SHuawei Xie 		}
2679*d19533e8SHuawei Xie 	}
2680*d19533e8SHuawei Xie 
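	/*
	 * Mirror the packet geometry into the freshly dequeued mbuf; unless a
	 * copy is forced, only the buffer pointers are duplicated and the
	 * payload itself stays in guest memory.
	 */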
2681*d19533e8SHuawei Xie 	mbuf->nb_segs = m->nb_segs;
2682*d19533e8SHuawei Xie 	mbuf->next = m->next;
2683*d19533e8SHuawei Xie 	mbuf->data_len = m->data_len + offset;
2684*d19533e8SHuawei Xie 	mbuf->pkt_len = mbuf->data_len;
2685*d19533e8SHuawei Xie 	if (unlikely(need_copy)) {
2686*d19533e8SHuawei Xie 		/* Copy the packet contents to the mbuf. */
2687*d19533e8SHuawei Xie 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
2688*d19533e8SHuawei Xie 			rte_pktmbuf_mtod(m, void *),
2689*d19533e8SHuawei Xie 			m->data_len);
2690*d19533e8SHuawei Xie 	} else {
2691*d19533e8SHuawei Xie 		mbuf->data_off = m->data_off;
2692*d19533e8SHuawei Xie 		mbuf->buf_physaddr = m->buf_physaddr;
2693*d19533e8SHuawei Xie 		mbuf->buf_addr = m->buf_addr;
2694*d19533e8SHuawei Xie 	}
2695*d19533e8SHuawei Xie 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
2696*d19533e8SHuawei Xie 	mbuf->vlan_tci = vlan_tag;
2697*d19533e8SHuawei Xie 	mbuf->l2_len = sizeof(struct ether_hdr);
2698*d19533e8SHuawei Xie 	mbuf->l3_len = sizeof(struct ipv4_hdr);
2699*d19533e8SHuawei Xie 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2700*d19533e8SHuawei Xie 
2701*d19533e8SHuawei Xie 	tx_q->m_table[len] = mbuf;
2702*d19533e8SHuawei Xie 	len++;
2703*d19533e8SHuawei Xie 
2704*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA,
2705*d19533e8SHuawei Xie 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2706*d19533e8SHuawei Xie 		dev->device_fh,
2707*d19533e8SHuawei Xie 		mbuf->nb_segs,
2708*d19533e8SHuawei Xie 		(mbuf->next == NULL) ? "null" : "non-null");
2709*d19533e8SHuawei Xie 
2710*d19533e8SHuawei Xie 	if (enable_stats) {
2711*d19533e8SHuawei Xie 		dev_statistics[dev->device_fh].tx_total++;
2712*d19533e8SHuawei Xie 		dev_statistics[dev->device_fh].tx++;
2713*d19533e8SHuawei Xie 	}
2714*d19533e8SHuawei Xie 
2715*d19533e8SHuawei Xie 	if (unlikely(len == MAX_PKT_BURST)) {
2716*d19533e8SHuawei Xie 		m_table = (struct rte_mbuf **)tx_q->m_table;
2717*d19533e8SHuawei Xie 		ret = rte_eth_tx_burst(ports[0],
2718*d19533e8SHuawei Xie 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2719*d19533e8SHuawei Xie 
2720*d19533e8SHuawei Xie 		/*
2721*d19533e8SHuawei Xie 		 * Free any buffers not handled by TX and update
2722*d19533e8SHuawei Xie 		 * the port stats.
2723*d19533e8SHuawei Xie 		 */
2724*d19533e8SHuawei Xie 		if (unlikely(ret < len)) {
2725*d19533e8SHuawei Xie 			do {
2726*d19533e8SHuawei Xie 				rte_pktmbuf_free(m_table[ret]);
2727*d19533e8SHuawei Xie 			} while (++ret < len);
2728*d19533e8SHuawei Xie 		}
2729*d19533e8SHuawei Xie 
2730*d19533e8SHuawei Xie 		len = 0;
2731*d19533e8SHuawei Xie 		txmbuf_clean_zcp(dev, vpool);
2732*d19533e8SHuawei Xie 	}
2733*d19533e8SHuawei Xie 
2734*d19533e8SHuawei Xie 	tx_q->len = len;
2735*d19533e8SHuawei Xie 
2736*d19533e8SHuawei Xie 	return;
2737*d19533e8SHuawei Xie }
2738*d19533e8SHuawei Xie 
2739*d19533e8SHuawei Xie /*
2740*d19533e8SHuawei Xie  * This function transmits all available packets in the virtio TX queue for one
2741*d19533e8SHuawei Xie  * virtio-net device. If it is the first packet, it learns the MAC address and
2742*d19533e8SHuawei Xie  * sets up VMDQ.
2743*d19533e8SHuawei Xie  */
2744*d19533e8SHuawei Xie static inline void __attribute__((always_inline))
2745*d19533e8SHuawei Xie virtio_dev_tx_zcp(struct virtio_net *dev)
2746*d19533e8SHuawei Xie {
2747*d19533e8SHuawei Xie 	struct rte_mbuf m;
2748*d19533e8SHuawei Xie 	struct vhost_virtqueue *vq;
2749*d19533e8SHuawei Xie 	struct vring_desc *desc;
2750*d19533e8SHuawei Xie 	uint64_t buff_addr = 0, phys_addr;
2751*d19533e8SHuawei Xie 	uint32_t head[MAX_PKT_BURST];
2752*d19533e8SHuawei Xie 	uint32_t i;
2753*d19533e8SHuawei Xie 	uint16_t free_entries, packet_success = 0;
2754*d19533e8SHuawei Xie 	uint16_t avail_idx;
2755*d19533e8SHuawei Xie 	uint8_t need_copy = 0;
2756*d19533e8SHuawei Xie 	hpa_type addr_type;
2757*d19533e8SHuawei Xie 
2758*d19533e8SHuawei Xie 	vq = dev->virtqueue[VIRTIO_TXQ];
2759*d19533e8SHuawei Xie 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2760*d19533e8SHuawei Xie 
2761*d19533e8SHuawei Xie 	/* If there are no available buffers then return. */
2762*d19533e8SHuawei Xie 	if (vq->last_used_idx_res == avail_idx)
2763*d19533e8SHuawei Xie 		return;
2764*d19533e8SHuawei Xie 
2765*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
2766*d19533e8SHuawei Xie 
2767*d19533e8SHuawei Xie 	/* Prefetch available ring to retrieve head indexes. */
2768*d19533e8SHuawei Xie 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2769*d19533e8SHuawei Xie 
2770*d19533e8SHuawei Xie 	/* Get the number of free entries in the ring */
2771*d19533e8SHuawei Xie 	free_entries = (avail_idx - vq->last_used_idx_res);
2772*d19533e8SHuawei Xie 
2773*d19533e8SHuawei Xie 	/* Limit to MAX_PKT_BURST. */
2774*d19533e8SHuawei Xie 	free_entries
2775*d19533e8SHuawei Xie 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2776*d19533e8SHuawei Xie 
2777*d19533e8SHuawei Xie 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2778*d19533e8SHuawei Xie 		dev->device_fh, free_entries);
2779*d19533e8SHuawei Xie 
2780*d19533e8SHuawei Xie 	/* Retrieve all of the head indexes first to avoid caching issues. */
2781*d19533e8SHuawei Xie 	for (i = 0; i < free_entries; i++)
2782*d19533e8SHuawei Xie 		head[i]
2783*d19533e8SHuawei Xie 			= vq->avail->ring[(vq->last_used_idx_res + i)
2784*d19533e8SHuawei Xie 			& (vq->size - 1)];
2785*d19533e8SHuawei Xie 
2786*d19533e8SHuawei Xie 	vq->last_used_idx_res += free_entries;
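	/* Reserve the entries just read; the next poll starts after them. */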
2787*d19533e8SHuawei Xie 
2788*d19533e8SHuawei Xie 	/* Prefetch descriptor index. */
2789*d19533e8SHuawei Xie 	rte_prefetch0(&vq->desc[head[packet_success]]);
2790*d19533e8SHuawei Xie 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2791*d19533e8SHuawei Xie 
2792*d19533e8SHuawei Xie 	while (packet_success < free_entries) {
2793*d19533e8SHuawei Xie 		desc = &vq->desc[head[packet_success]];
2794*d19533e8SHuawei Xie 
2795*d19533e8SHuawei Xie 		/* Discard first buffer as it is the virtio header */
2796*d19533e8SHuawei Xie 		desc = &vq->desc[desc->next];
2797*d19533e8SHuawei Xie 
2798*d19533e8SHuawei Xie 		/* Buffer address translation. */
2799*d19533e8SHuawei Xie 		buff_addr = gpa_to_vva(dev, desc->addr);
2800*d19533e8SHuawei Xie 		phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2801*d19533e8SHuawei Xie 
2802*d19533e8SHuawei Xie 		if (likely(packet_success < (free_entries - 1)))
2803*d19533e8SHuawei Xie 			/* Prefetch descriptor index. */
2804*d19533e8SHuawei Xie 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2805*d19533e8SHuawei Xie 
2806*d19533e8SHuawei Xie 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2807*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_DATA,
2808*d19533e8SHuawei Xie 				"(%"PRIu64") Invalid frame buffer address found"
2809*d19533e8SHuawei Xie 				" when TX packets!\n",
2810*d19533e8SHuawei Xie 				dev->device_fh);
2811*d19533e8SHuawei Xie 			packet_success++;
2812*d19533e8SHuawei Xie 			continue;
2813*d19533e8SHuawei Xie 		}
2814*d19533e8SHuawei Xie 
2815*d19533e8SHuawei Xie 		/* Prefetch buffer address. */
2816*d19533e8SHuawei Xie 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2817*d19533e8SHuawei Xie 
2818*d19533e8SHuawei Xie 		/*
2819*d19533e8SHuawei Xie 		 * Setup dummy mbuf. This is copied to a real mbuf if
2820*d19533e8SHuawei Xie 		 * transmitted out the physical port.
2821*d19533e8SHuawei Xie 		 */
2822*d19533e8SHuawei Xie 		m.data_len = desc->len;
2823*d19533e8SHuawei Xie 		m.nb_segs = 1;
2824*d19533e8SHuawei Xie 		m.next = NULL;
2825*d19533e8SHuawei Xie 		m.data_off = 0;
2826*d19533e8SHuawei Xie 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2827*d19533e8SHuawei Xie 		m.buf_physaddr = phys_addr;
2828*d19533e8SHuawei Xie 
2829*d19533e8SHuawei Xie 		/*
2830*d19533e8SHuawei Xie 		 * Check if the frame buffer address from guest crosses
2831*d19533e8SHuawei Xie 		 * sub-region or not.
2832*d19533e8SHuawei Xie 		 */
2833*d19533e8SHuawei Xie 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2834*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_DATA,
2835*d19533e8SHuawei Xie 				"(%"PRIu64") Frame buffer address cross "
2836*d19533e8SHuawei Xie 				"sub-region found when attaching TX frame "
2837*d19533e8SHuawei Xie 				"buffer address!\n",
2838*d19533e8SHuawei Xie 				dev->device_fh);
2839*d19533e8SHuawei Xie 			need_copy = 1;
2840*d19533e8SHuawei Xie 		} else
2841*d19533e8SHuawei Xie 			need_copy = 0;
2842*d19533e8SHuawei Xie 
2843*d19533e8SHuawei Xie 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2844*d19533e8SHuawei Xie 
2845*d19533e8SHuawei Xie 		/*
2846*d19533e8SHuawei Xie 		 * If this is the first received packet we need to learn
2847*d19533e8SHuawei Xie 		 * the MAC and setup VMDQ
2848*d19533e8SHuawei Xie 		 */
2849*d19533e8SHuawei Xie 		if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2850*d19533e8SHuawei Xie 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2851*d19533e8SHuawei Xie 				/*
2852*d19533e8SHuawei Xie 				 * Discard frame if device is scheduled for
2853*d19533e8SHuawei Xie 				 * removal or a duplicate MAC address is found.
2854*d19533e8SHuawei Xie 				 */
2855*d19533e8SHuawei Xie 				packet_success += free_entries;
2856*d19533e8SHuawei Xie 				vq->last_used_idx += packet_success;
2857*d19533e8SHuawei Xie 				break;
2858*d19533e8SHuawei Xie 			}
2859*d19533e8SHuawei Xie 		}
2860*d19533e8SHuawei Xie 
2861*d19533e8SHuawei Xie 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2862*d19533e8SHuawei Xie 		packet_success++;
2863*d19533e8SHuawei Xie 	}
2864*d19533e8SHuawei Xie }
2865*d19533e8SHuawei Xie 
2866*d19533e8SHuawei Xie /*
2867*d19533e8SHuawei Xie  * This function is called by each data core. It handles all RX/TX registered
2868*d19533e8SHuawei Xie  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2869*d19533e8SHuawei Xie  * addresses are compared with all devices in the main linked list.
2870*d19533e8SHuawei Xie  */
2871*d19533e8SHuawei Xie static int
2872*d19533e8SHuawei Xie switch_worker_zcp(__attribute__((unused)) void *arg)
2873*d19533e8SHuawei Xie {
2874*d19533e8SHuawei Xie 	struct virtio_net *dev = NULL;
2875*d19533e8SHuawei Xie 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2876*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll;
2877*d19533e8SHuawei Xie 	struct mbuf_table *tx_q;
2878*d19533e8SHuawei Xie 	volatile struct lcore_ll_info *lcore_ll;
2879*d19533e8SHuawei Xie 	const uint64_t drain_tsc
2880*d19533e8SHuawei Xie 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2881*d19533e8SHuawei Xie 		* BURST_TX_DRAIN_US;
2882*d19533e8SHuawei Xie 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2883*d19533e8SHuawei Xie 	unsigned ret;
2884*d19533e8SHuawei Xie 	const uint16_t lcore_id = rte_lcore_id();
2885*d19533e8SHuawei Xie 	uint16_t count_in_ring, rx_count = 0;
2886*d19533e8SHuawei Xie 
2887*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2888*d19533e8SHuawei Xie 
2889*d19533e8SHuawei Xie 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2890*d19533e8SHuawei Xie 	prev_tsc = 0;
2891*d19533e8SHuawei Xie 
2892*d19533e8SHuawei Xie 	while (1) {
2893*d19533e8SHuawei Xie 		cur_tsc = rte_rdtsc();
2894*d19533e8SHuawei Xie 
2895*d19533e8SHuawei Xie 		/* TX burst queue drain */
2896*d19533e8SHuawei Xie 		diff_tsc = cur_tsc - prev_tsc;
2897*d19533e8SHuawei Xie 		if (unlikely(diff_tsc > drain_tsc)) {
2898*d19533e8SHuawei Xie 			/*
2899*d19533e8SHuawei Xie 			 * Get mbufs from vpool.pool, detach them and put
2900*d19533e8SHuawei Xie 			 * them back into vpool.ring.
2901*d19533e8SHuawei Xie 			 */
2902*d19533e8SHuawei Xie 			dev_ll = lcore_ll->ll_root_used;
2903*d19533e8SHuawei Xie 			while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2904*d19533e8SHuawei Xie 				/* Get virtio device ID */
2905*d19533e8SHuawei Xie 				dev = dev_ll->dev;
2906*d19533e8SHuawei Xie 
2907*d19533e8SHuawei Xie 				if (likely(!dev->remove)) {
2908*d19533e8SHuawei Xie 					tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2909*d19533e8SHuawei Xie 					if (tx_q->len) {
2910*d19533e8SHuawei Xie 						LOG_DEBUG(VHOST_DATA,
2911*d19533e8SHuawei Xie 						"TX queue drained after timeout"
2912*d19533e8SHuawei Xie 						" with burst size %u\n",
2913*d19533e8SHuawei Xie 						tx_q->len);
2914*d19533e8SHuawei Xie 
2915*d19533e8SHuawei Xie 						/*
2916*d19533e8SHuawei Xie 						 * Tx any packets in the queue
2917*d19533e8SHuawei Xie 						 */
2918*d19533e8SHuawei Xie 						ret = rte_eth_tx_burst(
2919*d19533e8SHuawei Xie 							ports[0],
2920*d19533e8SHuawei Xie 							(uint16_t)tx_q->txq_id,
2921*d19533e8SHuawei Xie 							(struct rte_mbuf **)
2922*d19533e8SHuawei Xie 							tx_q->m_table,
2923*d19533e8SHuawei Xie 							(uint16_t)tx_q->len);
2924*d19533e8SHuawei Xie 						if (unlikely(ret < tx_q->len)) {
2925*d19533e8SHuawei Xie 							do {
2926*d19533e8SHuawei Xie 								rte_pktmbuf_free(
2927*d19533e8SHuawei Xie 									tx_q->m_table[ret]);
2928*d19533e8SHuawei Xie 							} while (++ret < tx_q->len);
2929*d19533e8SHuawei Xie 						}
2930*d19533e8SHuawei Xie 						tx_q->len = 0;
2931*d19533e8SHuawei Xie 
2932*d19533e8SHuawei Xie 						txmbuf_clean_zcp(dev,
2933*d19533e8SHuawei Xie 							&vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2934*d19533e8SHuawei Xie 					}
2935*d19533e8SHuawei Xie 				}
2936*d19533e8SHuawei Xie 				dev_ll = dev_ll->next;
2937*d19533e8SHuawei Xie 			}
2938*d19533e8SHuawei Xie 			prev_tsc = cur_tsc;
2939*d19533e8SHuawei Xie 		}
2940*d19533e8SHuawei Xie 
2941*d19533e8SHuawei Xie 		rte_prefetch0(lcore_ll->ll_root_used);
2942*d19533e8SHuawei Xie 
2943*d19533e8SHuawei Xie 		/*
2944*d19533e8SHuawei Xie 		 * Inform the configuration core that we have exited the linked
2945*d19533e8SHuawei Xie 		 * list and that no devices are in use if requested.
2946*d19533e8SHuawei Xie 		 */
2947*d19533e8SHuawei Xie 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2948*d19533e8SHuawei Xie 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2949*d19533e8SHuawei Xie 
2950*d19533e8SHuawei Xie 		/* Process devices */
2951*d19533e8SHuawei Xie 		dev_ll = lcore_ll->ll_root_used;
2952*d19533e8SHuawei Xie 
2953*d19533e8SHuawei Xie 		while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2954*d19533e8SHuawei Xie 			dev = dev_ll->dev;
2955*d19533e8SHuawei Xie 			if (unlikely(dev->remove)) {
2956*d19533e8SHuawei Xie 				dev_ll = dev_ll->next;
2957*d19533e8SHuawei Xie 				unlink_vmdq(dev);
2958*d19533e8SHuawei Xie 				dev->ready = DEVICE_SAFE_REMOVE;
2959*d19533e8SHuawei Xie 				continue;
2960*d19533e8SHuawei Xie 			}
2961*d19533e8SHuawei Xie 
2962*d19533e8SHuawei Xie 			if (likely(dev->ready == DEVICE_RX)) {
2963*d19533e8SHuawei Xie 				uint32_t index = dev->vmdq_rx_q;
2964*d19533e8SHuawei Xie 				uint16_t i;
2965*d19533e8SHuawei Xie 				count_in_ring
2966*d19533e8SHuawei Xie 				= rte_ring_count(vpool_array[index].ring);
2967*d19533e8SHuawei Xie 				uint16_t free_entries
2968*d19533e8SHuawei Xie 				= (uint16_t)get_available_ring_num_zcp(dev);
2969*d19533e8SHuawei Xie 
2970*d19533e8SHuawei Xie 				/*
2971*d19533e8SHuawei Xie 				 * Attach all mbufs in vpool.ring and put back
2972*d19533e8SHuawei Xie 				 * into vpool.pool.
2973*d19533e8SHuawei Xie 				 */
2974*d19533e8SHuawei Xie 				for (i = 0;
2975*d19533e8SHuawei Xie 				i < RTE_MIN(free_entries,
2976*d19533e8SHuawei Xie 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2977*d19533e8SHuawei Xie 				i++)
2978*d19533e8SHuawei Xie 					attach_rxmbuf_zcp(dev);
2979*d19533e8SHuawei Xie 
2980*d19533e8SHuawei Xie 				/* Handle guest RX */
2981*d19533e8SHuawei Xie 				rx_count = rte_eth_rx_burst(ports[0],
2982*d19533e8SHuawei Xie 					(uint16_t)dev->vmdq_rx_q, pkts_burst,
2983*d19533e8SHuawei Xie 					MAX_PKT_BURST);
2984*d19533e8SHuawei Xie 
2985*d19533e8SHuawei Xie 				if (rx_count) {
2986*d19533e8SHuawei Xie 					ret_count = virtio_dev_rx_zcp(dev,
2987*d19533e8SHuawei Xie 							pkts_burst, rx_count);
2988*d19533e8SHuawei Xie 					if (enable_stats) {
2989*d19533e8SHuawei Xie 						dev_statistics[dev->device_fh].rx_total
2990*d19533e8SHuawei Xie 							+= rx_count;
2991*d19533e8SHuawei Xie 						dev_statistics[dev->device_fh].rx
2992*d19533e8SHuawei Xie 							+= ret_count;
2993*d19533e8SHuawei Xie 					}
2994*d19533e8SHuawei Xie 					while (likely(rx_count)) {
2995*d19533e8SHuawei Xie 						rx_count--;
2996*d19533e8SHuawei Xie 						pktmbuf_detach_zcp(
2997*d19533e8SHuawei Xie 							pkts_burst[rx_count]);
2998*d19533e8SHuawei Xie 						rte_ring_sp_enqueue(
2999*d19533e8SHuawei Xie 							vpool_array[index].ring,
3000*d19533e8SHuawei Xie 							(void *)pkts_burst[rx_count]);
3001*d19533e8SHuawei Xie 					}
3002*d19533e8SHuawei Xie 				}
3003*d19533e8SHuawei Xie 			}
3004*d19533e8SHuawei Xie 
3005*d19533e8SHuawei Xie 			if (likely(!dev->remove))
3006*d19533e8SHuawei Xie 				/* Handle guest TX */
3007*d19533e8SHuawei Xie 				virtio_dev_tx_zcp(dev);
3008*d19533e8SHuawei Xie 
3009*d19533e8SHuawei Xie 			/* Move to the next device in the list */
3010*d19533e8SHuawei Xie 			dev_ll = dev_ll->next;
3011*d19533e8SHuawei Xie 		}
3012*d19533e8SHuawei Xie 	}
3013*d19533e8SHuawei Xie 
3014*d19533e8SHuawei Xie 	return 0;
3015*d19533e8SHuawei Xie }
3016*d19533e8SHuawei Xie 
3017*d19533e8SHuawei Xie 
3018*d19533e8SHuawei Xie /*
3019*d19533e8SHuawei Xie  * Add an entry to a used linked list. A free entry must first be found
3020*d19533e8SHuawei Xie  * in the free linked list using get_data_ll_free_entry();
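 *
 * A minimal, illustrative pairing of the two calls (hypothetical snippet,
 * not copied from elsewhere in this file):
 *
 *   struct virtio_net_data_ll *ll_dev = get_data_ll_free_entry(&ll_root_free);
 *   if (ll_dev != NULL) {
 *       ll_dev->dev = dev;
 *       add_data_ll_entry(&ll_root_used, ll_dev);
 *   }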
3021*d19533e8SHuawei Xie  */
3022*d19533e8SHuawei Xie static void
3023*d19533e8SHuawei Xie add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3024*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_dev)
3025*d19533e8SHuawei Xie {
3026*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll = *ll_root_addr;
3027*d19533e8SHuawei Xie 
3028*d19533e8SHuawei Xie 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
3029*d19533e8SHuawei Xie 	ll_dev->next = NULL;
3030*d19533e8SHuawei Xie 	rte_compiler_barrier();
3031*d19533e8SHuawei Xie 
3032*d19533e8SHuawei Xie 	/* If ll == NULL then this is the first device. */
3033*d19533e8SHuawei Xie 	if (ll) {
3034*d19533e8SHuawei Xie 		/* Increment to the tail of the linked list. */
3035*d19533e8SHuawei Xie 		while (ll->next != NULL)
3036*d19533e8SHuawei Xie 			ll = ll->next;
3037*d19533e8SHuawei Xie 
3038*d19533e8SHuawei Xie 		ll->next = ll_dev;
3039*d19533e8SHuawei Xie 	} else {
3040*d19533e8SHuawei Xie 		*ll_root_addr = ll_dev;
3041*d19533e8SHuawei Xie 	}
3042*d19533e8SHuawei Xie }
3043*d19533e8SHuawei Xie 
3044*d19533e8SHuawei Xie /*
3045*d19533e8SHuawei Xie  * Remove an entry from a used linked list. The entry must then be added to
3046*d19533e8SHuawei Xie  * the free linked list using put_data_ll_free_entry().
3047*d19533e8SHuawei Xie  */
3048*d19533e8SHuawei Xie static void
3049*d19533e8SHuawei Xie rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3050*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_dev,
3051*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_dev_last)
3052*d19533e8SHuawei Xie {
3053*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll = *ll_root_addr;
3054*d19533e8SHuawei Xie 
3055*d19533e8SHuawei Xie 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
3056*d19533e8SHuawei Xie 		return;
3057*d19533e8SHuawei Xie 
3058*d19533e8SHuawei Xie 	if (ll_dev == ll)
3059*d19533e8SHuawei Xie 		*ll_root_addr = ll_dev->next;
3060*d19533e8SHuawei Xie 	else
3061*d19533e8SHuawei Xie 		if (likely(ll_dev_last != NULL))
3062*d19533e8SHuawei Xie 			ll_dev_last->next = ll_dev->next;
3063*d19533e8SHuawei Xie 		else
3064*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
3065*d19533e8SHuawei Xie }
3066*d19533e8SHuawei Xie 
3067*d19533e8SHuawei Xie /*
3068*d19533e8SHuawei Xie  * Find and return an entry from the free linked list.
3069*d19533e8SHuawei Xie  */
3070*d19533e8SHuawei Xie static struct virtio_net_data_ll *
3071*d19533e8SHuawei Xie get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
3072*d19533e8SHuawei Xie {
3073*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
3074*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_dev;
3075*d19533e8SHuawei Xie 
3076*d19533e8SHuawei Xie 	if (ll_free == NULL)
3077*d19533e8SHuawei Xie 		return NULL;
3078*d19533e8SHuawei Xie 
3079*d19533e8SHuawei Xie 	ll_dev = ll_free;
3080*d19533e8SHuawei Xie 	*ll_root_addr = ll_free->next;
3081*d19533e8SHuawei Xie 
3082*d19533e8SHuawei Xie 	return ll_dev;
3083*d19533e8SHuawei Xie }
3084*d19533e8SHuawei Xie 
3085*d19533e8SHuawei Xie /*
3086*d19533e8SHuawei Xie  * Place an entry back on to the free linked list.
3087*d19533e8SHuawei Xie  */
3088*d19533e8SHuawei Xie static void
3089*d19533e8SHuawei Xie put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
3090*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_dev)
3091*d19533e8SHuawei Xie {
3092*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
3093*d19533e8SHuawei Xie 
3094*d19533e8SHuawei Xie 	if (ll_dev == NULL)
3095*d19533e8SHuawei Xie 		return;
3096*d19533e8SHuawei Xie 
3097*d19533e8SHuawei Xie 	ll_dev->next = ll_free;
3098*d19533e8SHuawei Xie 	*ll_root_addr = ll_dev;
3099*d19533e8SHuawei Xie }
3100*d19533e8SHuawei Xie 
3101*d19533e8SHuawei Xie /*
3102*d19533e8SHuawei Xie  * Creates a linked list of a given size.
3103*d19533e8SHuawei Xie  */
3104*d19533e8SHuawei Xie static struct virtio_net_data_ll *
3105*d19533e8SHuawei Xie alloc_data_ll(uint32_t size)
3106*d19533e8SHuawei Xie {
3107*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_new;
3108*d19533e8SHuawei Xie 	uint32_t i;
3109*d19533e8SHuawei Xie 
3110*d19533e8SHuawei Xie 	/* Malloc and then chain the linked list. */
3111*d19533e8SHuawei Xie 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
3112*d19533e8SHuawei Xie 	if (ll_new == NULL) {
3113*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
3114*d19533e8SHuawei Xie 		return NULL;
3115*d19533e8SHuawei Xie 	}
3116*d19533e8SHuawei Xie 
3117*d19533e8SHuawei Xie 	for (i = 0; i < size - 1; i++) {
3118*d19533e8SHuawei Xie 		ll_new[i].dev = NULL;
3119*d19533e8SHuawei Xie 		ll_new[i].next = &ll_new[i+1];
3120*d19533e8SHuawei Xie 	}
3121*d19533e8SHuawei Xie 	ll_new[i].next = NULL;
3122*d19533e8SHuawei Xie 
3123*d19533e8SHuawei Xie 	return (ll_new);
3124*d19533e8SHuawei Xie }
3125*d19533e8SHuawei Xie 
3126*d19533e8SHuawei Xie /*
3127*d19533e8SHuawei Xie  * Create the main linked list along with each individual core's linked list. A used and a free list
3128*d19533e8SHuawei Xie  * are created to manage entries.
3129*d19533e8SHuawei Xie  */
3130*d19533e8SHuawei Xie static int
3131*d19533e8SHuawei Xie init_data_ll (void)
3132*d19533e8SHuawei Xie {
3133*d19533e8SHuawei Xie 	int lcore;
3134*d19533e8SHuawei Xie 
3135*d19533e8SHuawei Xie 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3136*d19533e8SHuawei Xie 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
3137*d19533e8SHuawei Xie 		if (lcore_info[lcore].lcore_ll == NULL) {
3138*d19533e8SHuawei Xie 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
3139*d19533e8SHuawei Xie 			return -1;
3140*d19533e8SHuawei Xie 		}
3141*d19533e8SHuawei Xie 
3142*d19533e8SHuawei Xie 		lcore_info[lcore].lcore_ll->device_num = 0;
3143*d19533e8SHuawei Xie 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
3144*d19533e8SHuawei Xie 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
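		/*
		 * Give each switching core an equal share of the device
		 * entries, rounding up when num_devices does not divide evenly.
		 */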
3145*d19533e8SHuawei Xie 		if (num_devices % num_switching_cores)
3146*d19533e8SHuawei Xie 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
3147*d19533e8SHuawei Xie 		else
3148*d19533e8SHuawei Xie 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
3149*d19533e8SHuawei Xie 	}
3150*d19533e8SHuawei Xie 
3151*d19533e8SHuawei Xie 	/* Allocate devices up to a maximum of MAX_DEVICES. */
3152*d19533e8SHuawei Xie 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
3153*d19533e8SHuawei Xie 
3154*d19533e8SHuawei Xie 	return 0;
3155*d19533e8SHuawei Xie }
3156*d19533e8SHuawei Xie 
3157*d19533e8SHuawei Xie /*
3158*d19533e8SHuawei Xie  * Set virtqueue flags so that we do not receive interrupts.
3159*d19533e8SHuawei Xie  */
3160*d19533e8SHuawei Xie static void
3161*d19533e8SHuawei Xie set_irq_status (struct virtio_net *dev)
3162*d19533e8SHuawei Xie {
3163*d19533e8SHuawei Xie 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3164*d19533e8SHuawei Xie 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3165*d19533e8SHuawei Xie }
3166*d19533e8SHuawei Xie 
3167*d19533e8SHuawei Xie /*
3168*d19533e8SHuawei Xie  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
3169*d19533e8SHuawei Xie  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
3170*d19533e8SHuawei Xie  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
3171*d19533e8SHuawei Xie  */
3172*d19533e8SHuawei Xie static void
3173*d19533e8SHuawei Xie destroy_device (volatile struct virtio_net *dev)
3174*d19533e8SHuawei Xie {
3175*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_lcore_dev_cur;
3176*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_main_dev_cur;
3177*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
3178*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
3179*d19533e8SHuawei Xie 	int lcore;
3180*d19533e8SHuawei Xie 
3181*d19533e8SHuawei Xie 	dev->flags &= ~VIRTIO_DEV_RUNNING;
3182*d19533e8SHuawei Xie 
3183*d19533e8SHuawei Xie 	/*set the remove flag. */
3184*d19533e8SHuawei Xie 	dev->remove = 1;
3185*d19533e8SHuawei Xie 
3186*d19533e8SHuawei Xie 	while (dev->ready != DEVICE_SAFE_REMOVE) {
3187*d19533e8SHuawei Xie 		rte_pause();
3188*d19533e8SHuawei Xie 	}
3189*d19533e8SHuawei Xie 
3190*d19533e8SHuawei Xie 	/* Search for entry to be removed from lcore ll */
3191*d19533e8SHuawei Xie 	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
3192*d19533e8SHuawei Xie 	while (ll_lcore_dev_cur != NULL) {
3193*d19533e8SHuawei Xie 		if (ll_lcore_dev_cur->dev == dev) {
3194*d19533e8SHuawei Xie 			break;
3195*d19533e8SHuawei Xie 		} else {
3196*d19533e8SHuawei Xie 			ll_lcore_dev_last = ll_lcore_dev_cur;
3197*d19533e8SHuawei Xie 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
3198*d19533e8SHuawei Xie 		}
3199*d19533e8SHuawei Xie 	}
3200*d19533e8SHuawei Xie 
3201*d19533e8SHuawei Xie 	if (ll_lcore_dev_cur == NULL) {
3202*d19533e8SHuawei Xie 		RTE_LOG(ERR, VHOST_CONFIG,
3203*d19533e8SHuawei Xie 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
3204*d19533e8SHuawei Xie 			dev->device_fh);
3205*d19533e8SHuawei Xie 		return;
3206*d19533e8SHuawei Xie 	}
3207*d19533e8SHuawei Xie 
3208*d19533e8SHuawei Xie 	/* Search for entry to be removed from main ll */
3209*d19533e8SHuawei Xie 	ll_main_dev_cur = ll_root_used;
3210*d19533e8SHuawei Xie 	ll_main_dev_last = NULL;
3211*d19533e8SHuawei Xie 	while (ll_main_dev_cur != NULL) {
3212*d19533e8SHuawei Xie 		if (ll_main_dev_cur->dev == dev) {
3213*d19533e8SHuawei Xie 			break;
3214*d19533e8SHuawei Xie 		} else {
3215*d19533e8SHuawei Xie 			ll_main_dev_last = ll_main_dev_cur;
3216*d19533e8SHuawei Xie 			ll_main_dev_cur = ll_main_dev_cur->next;
3217*d19533e8SHuawei Xie 		}
3218*d19533e8SHuawei Xie 	}
3219*d19533e8SHuawei Xie 
3220*d19533e8SHuawei Xie 	/* Remove entries from the lcore and main ll. */
3221*d19533e8SHuawei Xie 	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
3222*d19533e8SHuawei Xie 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
3223*d19533e8SHuawei Xie 
3224*d19533e8SHuawei Xie 	/* Set the dev_removal_flag on each lcore. */
3225*d19533e8SHuawei Xie 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3226*d19533e8SHuawei Xie 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
3227*d19533e8SHuawei Xie 	}
3228*d19533e8SHuawei Xie 
3229*d19533e8SHuawei Xie 	/*
3230*d19533e8SHuawei Xie 	 * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL we can be sure that
3231*d19533e8SHuawei Xie 	 * it can no longer access the device removed from the linked lists and that the device
3232*d19533e8SHuawei Xie 	 * is no longer in use.
3233*d19533e8SHuawei Xie 	 */
3234*d19533e8SHuawei Xie 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3235*d19533e8SHuawei Xie 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
3236*d19533e8SHuawei Xie 			rte_pause();
3237*d19533e8SHuawei Xie 		}
3238*d19533e8SHuawei Xie 	}
3239*d19533e8SHuawei Xie 
3240*d19533e8SHuawei Xie 	/* Add the entries back to the lcore and main free ll.*/
3241*d19533e8SHuawei Xie 	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
3242*d19533e8SHuawei Xie 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
3243*d19533e8SHuawei Xie 
3244*d19533e8SHuawei Xie 	/* Decrement the number of devices on the lcore. */
3245*d19533e8SHuawei Xie 	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
3246*d19533e8SHuawei Xie 
3247*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
3248*d19533e8SHuawei Xie 
3249*d19533e8SHuawei Xie 	if (zero_copy) {
3250*d19533e8SHuawei Xie 		struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3251*d19533e8SHuawei Xie 
3252*d19533e8SHuawei Xie 		/* Stop the RX queue. */
3253*d19533e8SHuawei Xie 		if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3254*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3255*d19533e8SHuawei Xie 				"(%"PRIu64") In destroy_device: Failed to stop "
3256*d19533e8SHuawei Xie 				"rx queue:%d\n",
3257*d19533e8SHuawei Xie 				dev->device_fh,
3258*d19533e8SHuawei Xie 				dev->vmdq_rx_q);
3259*d19533e8SHuawei Xie 		}
3260*d19533e8SHuawei Xie 
3261*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_CONFIG,
3262*d19533e8SHuawei Xie 			"(%"PRIu64") in destroy_device: start putting mbufs "
3263*d19533e8SHuawei Xie 			"from the mempool back onto the ring for RX queue: %d\n",
3264*d19533e8SHuawei Xie 			dev->device_fh, dev->vmdq_rx_q);
3265*d19533e8SHuawei Xie 
3266*d19533e8SHuawei Xie 		mbuf_destroy_zcp(vpool);
3267*d19533e8SHuawei Xie 
3268*d19533e8SHuawei Xie 		/* Stop the TX queue. */
3269*d19533e8SHuawei Xie 		if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3270*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3271*d19533e8SHuawei Xie 				"(%"PRIu64") In destroy_device: Failed to "
3272*d19533e8SHuawei Xie 				"stop tx queue:%d\n",
3273*d19533e8SHuawei Xie 				dev->device_fh, dev->vmdq_rx_q);
3274*d19533e8SHuawei Xie 		}
3275*d19533e8SHuawei Xie 
3276*d19533e8SHuawei Xie 		vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
3277*d19533e8SHuawei Xie 
3278*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_CONFIG,
3279*d19533e8SHuawei Xie 			"(%"PRIu64") destroy_device: start putting mbufs from mempool "
3280*d19533e8SHuawei Xie 			"back onto the ring for TX queue: %d, dev:(%"PRIu64")\n",
3281*d19533e8SHuawei Xie 			dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
3282*d19533e8SHuawei Xie 			dev->device_fh);
3283*d19533e8SHuawei Xie 
3284*d19533e8SHuawei Xie 		mbuf_destroy_zcp(vpool);
3285*d19533e8SHuawei Xie 	}
3286*d19533e8SHuawei Xie 
3287*d19533e8SHuawei Xie }
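
/*
 * In outline, the removal handshake used above is roughly (the data-core
 * side lives in the switch_worker()/switch_worker_zcp() loops earlier in
 * this file):
 *
 *	control thread                        data core
 *	--------------                        ---------
 *	dev->remove = 1                       notices remove, stops using the
 *	wait for dev->ready ==                device and sets dev->ready =
 *	    DEVICE_SAFE_REMOVE                DEVICE_SAFE_REMOVE
 *	unlink dev from both lists
 *	flag = REQUEST_DEV_REMOVAL            on its next loop iteration writes
 *	wait for flag == ACK_DEV_REMOVAL      flag = ACK_DEV_REMOVAL
 *	recycle the free-list entries
 */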
3288*d19533e8SHuawei Xie 
3289*d19533e8SHuawei Xie /*
3290*d19533e8SHuawei Xie  * A new device is added to a data core. First the device is added to the main linked list
3291*d19533e8SHuawei Xie  * and then allocated to a specific data core.
3292*d19533e8SHuawei Xie  */
3293*d19533e8SHuawei Xie static int
3294*d19533e8SHuawei Xie new_device (struct virtio_net *dev)
3295*d19533e8SHuawei Xie {
3296*d19533e8SHuawei Xie 	struct virtio_net_data_ll *ll_dev;
3297*d19533e8SHuawei Xie 	int lcore, core_add = 0;
3298*d19533e8SHuawei Xie 	uint32_t device_num_min = num_devices;
3299*d19533e8SHuawei Xie 
3300*d19533e8SHuawei Xie 	/* Add device to main ll */
3301*d19533e8SHuawei Xie 	ll_dev = get_data_ll_free_entry(&ll_root_free);
3302*d19533e8SHuawei Xie 	if (ll_dev == NULL) {
3303*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
3304*d19533e8SHuawei Xie 			"of %d devices per core has been reached\n",
3305*d19533e8SHuawei Xie 			dev->device_fh, num_devices);
3306*d19533e8SHuawei Xie 		return -1;
3307*d19533e8SHuawei Xie 	}
3308*d19533e8SHuawei Xie 	ll_dev->dev = dev;
3309*d19533e8SHuawei Xie 	add_data_ll_entry(&ll_root_used, ll_dev);
3310*d19533e8SHuawei Xie 	ll_dev->dev->vmdq_rx_q
3311*d19533e8SHuawei Xie 		= ll_dev->dev->device_fh * (num_queues / num_devices);
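
	/*
	 * Each device is mapped to its own VMDq RX queue with a fixed stride.
	 * With purely illustrative numbers, num_queues = 128 and
	 * num_devices = 32 give a stride of 4, so the device with
	 * device_fh = 5 would receive on queue 20.
	 */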
3312*d19533e8SHuawei Xie 
3313*d19533e8SHuawei Xie 	if (zero_copy) {
3314*d19533e8SHuawei Xie 		uint32_t index = ll_dev->dev->vmdq_rx_q;
3315*d19533e8SHuawei Xie 		uint32_t count_in_ring, i;
3316*d19533e8SHuawei Xie 		struct mbuf_table *tx_q;
3317*d19533e8SHuawei Xie 
3318*d19533e8SHuawei Xie 		count_in_ring = rte_ring_count(vpool_array[index].ring);
3319*d19533e8SHuawei Xie 
3320*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_CONFIG,
3321*d19533e8SHuawei Xie 			"(%"PRIu64") in new_device: mbuf count in mempool "
3322*d19533e8SHuawei Xie 			"before attach is: %d\n",
3323*d19533e8SHuawei Xie 			dev->device_fh,
3324*d19533e8SHuawei Xie 			rte_mempool_count(vpool_array[index].pool));
3325*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_CONFIG,
3326*d19533e8SHuawei Xie 			"(%"PRIu64") in new_device: mbuf count in ring "
3327*d19533e8SHuawei Xie 			"before attach is: %d\n",
3328*d19533e8SHuawei Xie 			dev->device_fh, count_in_ring);
3329*d19533e8SHuawei Xie 
3330*d19533e8SHuawei Xie 		/*
3331*d19533e8SHuawei Xie 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
3332*d19533e8SHuawei Xie 		 */
3333*d19533e8SHuawei Xie 		for (i = 0; i < count_in_ring; i++)
3334*d19533e8SHuawei Xie 			attach_rxmbuf_zcp(dev);
3335*d19533e8SHuawei Xie 
3336*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3337*d19533e8SHuawei Xie 			"mempool after attach is: %d\n",
3338*d19533e8SHuawei Xie 			dev->device_fh,
3339*d19533e8SHuawei Xie 			rte_mempool_count(vpool_array[index].pool));
3340*d19533e8SHuawei Xie 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3341*d19533e8SHuawei Xie 			"ring after attach is: %d\n",
3342*d19533e8SHuawei Xie 			dev->device_fh,
3343*d19533e8SHuawei Xie 			rte_ring_count(vpool_array[index].ring));
3344*d19533e8SHuawei Xie 
3345*d19533e8SHuawei Xie 		tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
3346*d19533e8SHuawei Xie 		tx_q->txq_id = dev->vmdq_rx_q;
3347*d19533e8SHuawei Xie 
3348*d19533e8SHuawei Xie 		if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3349*d19533e8SHuawei Xie 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3350*d19533e8SHuawei Xie 
3351*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3352*d19533e8SHuawei Xie 				"(%"PRIu64") In new_device: Failed to start "
3353*d19533e8SHuawei Xie 				"tx queue:%d\n",
3354*d19533e8SHuawei Xie 				dev->device_fh, dev->vmdq_rx_q);
3355*d19533e8SHuawei Xie 
3356*d19533e8SHuawei Xie 			mbuf_destroy_zcp(vpool);
3357*d19533e8SHuawei Xie 			return -1;
3358*d19533e8SHuawei Xie 		}
3359*d19533e8SHuawei Xie 
3360*d19533e8SHuawei Xie 		if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3361*d19533e8SHuawei Xie 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3362*d19533e8SHuawei Xie 
3363*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3364*d19533e8SHuawei Xie 				"(%"PRIu64") In new_device: Failed to start "
3365*d19533e8SHuawei Xie 				"rx queue:%d\n",
3366*d19533e8SHuawei Xie 				dev->device_fh, dev->vmdq_rx_q);
3367*d19533e8SHuawei Xie 
3368*d19533e8SHuawei Xie 			/* Stop the TX queue. */
3369*d19533e8SHuawei Xie 			if (rte_eth_dev_tx_queue_stop(ports[0],
3370*d19533e8SHuawei Xie 				dev->vmdq_rx_q) != 0) {
3371*d19533e8SHuawei Xie 				LOG_DEBUG(VHOST_CONFIG,
3372*d19533e8SHuawei Xie 					"(%"PRIu64") In new_device: Failed to "
3373*d19533e8SHuawei Xie 					"stop tx queue:%d\n",
3374*d19533e8SHuawei Xie 					dev->device_fh, dev->vmdq_rx_q);
3375*d19533e8SHuawei Xie 			}
3376*d19533e8SHuawei Xie 
3377*d19533e8SHuawei Xie 			mbuf_destroy_zcp(vpool);
3378*d19533e8SHuawei Xie 			return -1;
3379*d19533e8SHuawei Xie 		}
3380*d19533e8SHuawei Xie 
3381*d19533e8SHuawei Xie 	}
3382*d19533e8SHuawei Xie 
3383*d19533e8SHuawei Xie 	/* Reset the ready flag. */
3384*d19533e8SHuawei Xie 	dev->ready = DEVICE_MAC_LEARNING;
3385*d19533e8SHuawei Xie 	dev->remove = 0;
3386*d19533e8SHuawei Xie 
3387*d19533e8SHuawei Xie 	/* Find a suitable lcore to add the device. */
3388*d19533e8SHuawei Xie 	RTE_LCORE_FOREACH_SLAVE(lcore) {
3389*d19533e8SHuawei Xie 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
3390*d19533e8SHuawei Xie 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
3391*d19533e8SHuawei Xie 			core_add = lcore;
3392*d19533e8SHuawei Xie 		}
3393*d19533e8SHuawei Xie 	}
3394*d19533e8SHuawei Xie 	/* Add device to lcore ll */
3395*d19533e8SHuawei Xie 	ll_dev->dev->coreid = core_add;
3396*d19533e8SHuawei Xie 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
3397*d19533e8SHuawei Xie 	if (ll_dev == NULL) {
3398*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
3399*d19533e8SHuawei Xie 		dev->ready = DEVICE_SAFE_REMOVE;
3400*d19533e8SHuawei Xie 		destroy_device(dev);
3401*d19533e8SHuawei Xie 		return -1;
3402*d19533e8SHuawei Xie 	}
3403*d19533e8SHuawei Xie 	ll_dev->dev = dev;
3404*d19533e8SHuawei Xie 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
3405*d19533e8SHuawei Xie 
3406*d19533e8SHuawei Xie 	/* Initialize device stats */
3407*d19533e8SHuawei Xie 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
3408*d19533e8SHuawei Xie 
3409*d19533e8SHuawei Xie 	/* Disable notifications. */
3410*d19533e8SHuawei Xie 	set_irq_status(dev);
3411*d19533e8SHuawei Xie 	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
3412*d19533e8SHuawei Xie 	dev->flags |= VIRTIO_DEV_RUNNING;
3413*d19533e8SHuawei Xie 
3414*d19533e8SHuawei Xie 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
3415*d19533e8SHuawei Xie 
3416*d19533e8SHuawei Xie 	return 0;
3417*d19533e8SHuawei Xie }
3418*d19533e8SHuawei Xie 
3419*d19533e8SHuawei Xie /*
3420*d19533e8SHuawei Xie  * These callbacks allow devices to be added to the data core when configuration
3421*d19533e8SHuawei Xie  * has been fully completed.
3422*d19533e8SHuawei Xie  */
3423*d19533e8SHuawei Xie static const struct virtio_net_device_ops virtio_net_device_ops =
3424*d19533e8SHuawei Xie {
3425*d19533e8SHuawei Xie 	.new_device =  new_device,
3426*d19533e8SHuawei Xie 	.destroy_device = destroy_device,
3427*d19533e8SHuawei Xie };
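
/*
 * The ops above are handed to the vhost library in MAIN() below via
 * init_virtio_net(&virtio_net_device_ops), so new_device() and
 * destroy_device() run when a guest virtio-net device becomes usable or is
 * torn down.
 */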
3428*d19533e8SHuawei Xie 
3429*d19533e8SHuawei Xie /*
3430*d19533e8SHuawei Xie  * This is a thread that wakes up periodically to print stats if the user has
3431*d19533e8SHuawei Xie  * enabled them.
3432*d19533e8SHuawei Xie  */
3433*d19533e8SHuawei Xie static void
3434*d19533e8SHuawei Xie print_stats(void)
3435*d19533e8SHuawei Xie {
3436*d19533e8SHuawei Xie 	struct virtio_net_data_ll *dev_ll;
3437*d19533e8SHuawei Xie 	uint64_t tx_dropped, rx_dropped;
3438*d19533e8SHuawei Xie 	uint64_t tx, tx_total, rx, rx_total;
3439*d19533e8SHuawei Xie 	uint32_t device_fh;
3440*d19533e8SHuawei Xie 	const char clr[] = { 27, '[', '2', 'J', '\0' };
3441*d19533e8SHuawei Xie 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
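
	/*
	 * clr is the ANSI escape sequence ESC[2J (clear the screen) and
	 * top_left is ESC[1;1H (move the cursor to row 1, column 1); they are
	 * built as raw char arrays because 27 is the ESC character.
	 */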
3442*d19533e8SHuawei Xie 
3443*d19533e8SHuawei Xie 	while(1) {
3444*d19533e8SHuawei Xie 		sleep(enable_stats);
3445*d19533e8SHuawei Xie 
3446*d19533e8SHuawei Xie 		/* Clear screen and move to top left */
3447*d19533e8SHuawei Xie 		printf("%s%s", clr, top_left);
3448*d19533e8SHuawei Xie 
3449*d19533e8SHuawei Xie 		printf("\nDevice statistics ====================================");
3450*d19533e8SHuawei Xie 
3451*d19533e8SHuawei Xie 		dev_ll = ll_root_used;
3452*d19533e8SHuawei Xie 		while (dev_ll != NULL) {
3453*d19533e8SHuawei Xie 			device_fh = (uint32_t)dev_ll->dev->device_fh;
3454*d19533e8SHuawei Xie 			tx_total = dev_statistics[device_fh].tx_total;
3455*d19533e8SHuawei Xie 			tx = dev_statistics[device_fh].tx;
3456*d19533e8SHuawei Xie 			tx_dropped = tx_total - tx;
3457*d19533e8SHuawei Xie 			if (zero_copy == 0) {
3458*d19533e8SHuawei Xie 				rx_total = rte_atomic64_read(
3459*d19533e8SHuawei Xie 					&dev_statistics[device_fh].rx_total_atomic);
3460*d19533e8SHuawei Xie 				rx = rte_atomic64_read(
3461*d19533e8SHuawei Xie 					&dev_statistics[device_fh].rx_atomic);
3462*d19533e8SHuawei Xie 			} else {
3463*d19533e8SHuawei Xie 				rx_total = dev_statistics[device_fh].rx_total;
3464*d19533e8SHuawei Xie 				rx = dev_statistics[device_fh].rx;
3465*d19533e8SHuawei Xie 			}
3466*d19533e8SHuawei Xie 			rx_dropped = rx_total - rx;
3467*d19533e8SHuawei Xie 
3468*d19533e8SHuawei Xie 			printf("\nStatistics for device %"PRIu32" ------------------------------"
3469*d19533e8SHuawei Xie 					"\nTX total: 		%"PRIu64""
3470*d19533e8SHuawei Xie 					"\nTX dropped: 		%"PRIu64""
3471*d19533e8SHuawei Xie 					"\nTX successful: 		%"PRIu64""
3472*d19533e8SHuawei Xie 					"\nRX total: 		%"PRIu64""
3473*d19533e8SHuawei Xie 					"\nRX dropped: 		%"PRIu64""
3474*d19533e8SHuawei Xie 					"\nRX successful: 		%"PRIu64"",
3475*d19533e8SHuawei Xie 					device_fh,
3476*d19533e8SHuawei Xie 					tx_total,
3477*d19533e8SHuawei Xie 					tx_dropped,
3478*d19533e8SHuawei Xie 					tx,
3479*d19533e8SHuawei Xie 					rx_total,
3480*d19533e8SHuawei Xie 					rx_dropped,
3481*d19533e8SHuawei Xie 					rx);
3482*d19533e8SHuawei Xie 
3483*d19533e8SHuawei Xie 			dev_ll = dev_ll->next;
3484*d19533e8SHuawei Xie 		}
3485*d19533e8SHuawei Xie 		printf("\n======================================================\n");
3486*d19533e8SHuawei Xie 	}
3487*d19533e8SHuawei Xie }
3488*d19533e8SHuawei Xie 
3489*d19533e8SHuawei Xie static void
3490*d19533e8SHuawei Xie setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
3491*d19533e8SHuawei Xie 	char *ring_name, uint32_t nb_mbuf)
3492*d19533e8SHuawei Xie {
3493*d19533e8SHuawei Xie 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
3494*d19533e8SHuawei Xie 	vpool_array[index].pool
3495*d19533e8SHuawei Xie 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
3496*d19533e8SHuawei Xie 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
3497*d19533e8SHuawei Xie 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
3498*d19533e8SHuawei Xie 		rte_pktmbuf_init, NULL, socket, 0);
3499*d19533e8SHuawei Xie 	if (vpool_array[index].pool != NULL) {
3500*d19533e8SHuawei Xie 		vpool_array[index].ring
3501*d19533e8SHuawei Xie 			= rte_ring_create(ring_name,
3502*d19533e8SHuawei Xie 				rte_align32pow2(nb_mbuf + 1),
3503*d19533e8SHuawei Xie 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
3504*d19533e8SHuawei Xie 		if (likely(vpool_array[index].ring != NULL)) {
3505*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3506*d19533e8SHuawei Xie 				"in setup_mempool_tbl: mbuf count in "
3507*d19533e8SHuawei Xie 				"mempool is: %d\n",
3508*d19533e8SHuawei Xie 				rte_mempool_count(vpool_array[index].pool));
3509*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3510*d19533e8SHuawei Xie 				"in setup_mempool_tbl: mbuf count in "
3511*d19533e8SHuawei Xie 				"ring is: %d\n",
3512*d19533e8SHuawei Xie 				rte_ring_count(vpool_array[index].ring));
3513*d19533e8SHuawei Xie 		} else {
3514*d19533e8SHuawei Xie 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
3515*d19533e8SHuawei Xie 				ring_name);
3516*d19533e8SHuawei Xie 		}
3517*d19533e8SHuawei Xie 
3518*d19533e8SHuawei Xie 		/* Need to account for the headroom. */
3519*d19533e8SHuawei Xie 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
3520*d19533e8SHuawei Xie 	} else {
3521*d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
3522*d19533e8SHuawei Xie 	}
3523*d19533e8SHuawei Xie }
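
/*
 * Sizing notes for the pool/ring pair above: rte_ring sizes must be powers
 * of two and one slot always stays unused, so rte_align32pow2(nb_mbuf + 1)
 * guarantees room for at least nb_mbuf entries. The resulting buf_size is
 * roomsize - RTE_PKTMBUF_HEADROOM, i.e. VIRTIO_DESCRIPTOR_LEN_ZCP bytes of
 * usable data room per mbuf.
 */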
3524*d19533e8SHuawei Xie 
3525*d19533e8SHuawei Xie 
3526*d19533e8SHuawei Xie /*
3527*d19533e8SHuawei Xie  * Main function, does initialisation and calls the per-lcore functions. The CUSE
3528*d19533e8SHuawei Xie  * device is also registered here to handle the IOCTLs.
3529*d19533e8SHuawei Xie  */
3530*d19533e8SHuawei Xie int
3531*d19533e8SHuawei Xie MAIN(int argc, char *argv[])
3532*d19533e8SHuawei Xie {
3533*d19533e8SHuawei Xie 	struct rte_mempool *mbuf_pool = NULL;
3534*d19533e8SHuawei Xie 	unsigned lcore_id, core_id = 0;
3535*d19533e8SHuawei Xie 	unsigned nb_ports, valid_num_ports;
3536*d19533e8SHuawei Xie 	int ret;
3537*d19533e8SHuawei Xie 	uint8_t portid, queue_id = 0;
3538*d19533e8SHuawei Xie 	static pthread_t tid;
3539*d19533e8SHuawei Xie 
3540*d19533e8SHuawei Xie 	/* init EAL */
3541*d19533e8SHuawei Xie 	ret = rte_eal_init(argc, argv);
3542*d19533e8SHuawei Xie 	if (ret < 0)
3543*d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
3544*d19533e8SHuawei Xie 	argc -= ret;
3545*d19533e8SHuawei Xie 	argv += ret;
3546*d19533e8SHuawei Xie 
3547*d19533e8SHuawei Xie 	/* parse app arguments */
3548*d19533e8SHuawei Xie 	ret = us_vhost_parse_args(argc, argv);
3549*d19533e8SHuawei Xie 	if (ret < 0)
3550*d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
3551*d19533e8SHuawei Xie 
3552*d19533e8SHuawei Xie 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
3553*d19533e8SHuawei Xie 		if (rte_lcore_is_enabled(lcore_id))
3554*d19533e8SHuawei Xie 			lcore_ids[core_id ++] = lcore_id;
3555*d19533e8SHuawei Xie 
3556*d19533e8SHuawei Xie 	if (rte_lcore_count() > RTE_MAX_LCORE)
3557*d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
3558*d19533e8SHuawei Xie 
3559*d19533e8SHuawei Xie 	/* Set the number of switching cores available. */
3560*d19533e8SHuawei Xie 	num_switching_cores = rte_lcore_count()-1;
3561*d19533e8SHuawei Xie 
3562*d19533e8SHuawei Xie 	/* Get the number of physical ports. */
3563*d19533e8SHuawei Xie 	nb_ports = rte_eth_dev_count();
3564*d19533e8SHuawei Xie 	if (nb_ports > RTE_MAX_ETHPORTS)
3565*d19533e8SHuawei Xie 		nb_ports = RTE_MAX_ETHPORTS;
3566*d19533e8SHuawei Xie 
3567*d19533e8SHuawei Xie 	/*
3568*d19533e8SHuawei Xie 	 * Update the global variable num_ports and the global array ports,
3569*d19533e8SHuawei Xie 	 * and derive valid_num_ports from the number of ports in the system.
3570*d19533e8SHuawei Xie 	 */
3571*d19533e8SHuawei Xie 	valid_num_ports = check_ports_num(nb_ports);
3572*d19533e8SHuawei Xie 
3573*d19533e8SHuawei Xie 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3574*d19533e8SHuawei Xie 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3575*d19533e8SHuawei Xie 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3576*d19533e8SHuawei Xie 		return -1;
3577*d19533e8SHuawei Xie 	}
3578*d19533e8SHuawei Xie 
3579*d19533e8SHuawei Xie 	if (zero_copy == 0) {
3580*d19533e8SHuawei Xie 		/* Create the mbuf pool. */
3581*d19533e8SHuawei Xie 		mbuf_pool = rte_mempool_create(
3582*d19533e8SHuawei Xie 				"MBUF_POOL",
3583*d19533e8SHuawei Xie 				NUM_MBUFS_PER_PORT
3584*d19533e8SHuawei Xie 				* valid_num_ports,
3585*d19533e8SHuawei Xie 				MBUF_SIZE, MBUF_CACHE_SIZE,
3586*d19533e8SHuawei Xie 				sizeof(struct rte_pktmbuf_pool_private),
3587*d19533e8SHuawei Xie 				rte_pktmbuf_pool_init, NULL,
3588*d19533e8SHuawei Xie 				rte_pktmbuf_init, NULL,
3589*d19533e8SHuawei Xie 				rte_socket_id(), 0);
3590*d19533e8SHuawei Xie 		if (mbuf_pool == NULL)
3591*d19533e8SHuawei Xie 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3592*d19533e8SHuawei Xie 
3593*d19533e8SHuawei Xie 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3594*d19533e8SHuawei Xie 			vpool_array[queue_id].pool = mbuf_pool;
3595*d19533e8SHuawei Xie 
3596*d19533e8SHuawei Xie 		if (vm2vm_mode == VM2VM_HARDWARE) {
3597*d19533e8SHuawei Xie 			/* Enable VT loopback so VM-to-VM traffic is switched by the hardware L2 switch. */
3598*d19533e8SHuawei Xie 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3599*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3600*d19533e8SHuawei Xie 				"Enable loop back for L2 switch in vmdq.\n");
3601*d19533e8SHuawei Xie 		}
3602*d19533e8SHuawei Xie 	} else {
3603*d19533e8SHuawei Xie 		uint32_t nb_mbuf;
3604*d19533e8SHuawei Xie 		char pool_name[RTE_MEMPOOL_NAMESIZE];
3605*d19533e8SHuawei Xie 		char ring_name[RTE_MEMPOOL_NAMESIZE];
3606*d19533e8SHuawei Xie 
3607*d19533e8SHuawei Xie 		/*
3608*d19533e8SHuawei Xie 		 * Zero copy defers queue RX/TX start to the time when guest
3609*d19533e8SHuawei Xie 		 * finishes its startup and packet buffers from that guest are
3610*d19533e8SHuawei Xie 		 * available.
3611*d19533e8SHuawei Xie 		 */
3612*d19533e8SHuawei Xie 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
3613*d19533e8SHuawei Xie 		rx_conf_default.rx_drop_en = 0;
3614*d19533e8SHuawei Xie 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
3615*d19533e8SHuawei Xie 		nb_mbuf = num_rx_descriptor
3616*d19533e8SHuawei Xie 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3617*d19533e8SHuawei Xie 			+ num_switching_cores * MAX_PKT_BURST;
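
		/*
		 * A rough worked example with purely illustrative numbers:
		 * num_rx_descriptor = 128, num_switching_cores = 3,
		 * MBUF_CACHE_SIZE_ZCP = 32 and MAX_PKT_BURST = 32 give
		 * nb_mbuf = 128 + 3 * 32 + 3 * 32 = 320 mbufs per RX queue pool.
		 */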
3618*d19533e8SHuawei Xie 
3619*d19533e8SHuawei Xie 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3620*d19533e8SHuawei Xie 			snprintf(pool_name, sizeof(pool_name),
3621*d19533e8SHuawei Xie 				"rxmbuf_pool_%u", queue_id);
3622*d19533e8SHuawei Xie 			snprintf(ring_name, sizeof(ring_name),
3623*d19533e8SHuawei Xie 				"rxmbuf_ring_%u", queue_id);
3624*d19533e8SHuawei Xie 			setup_mempool_tbl(rte_socket_id(), queue_id,
3625*d19533e8SHuawei Xie 				pool_name, ring_name, nb_mbuf);
3626*d19533e8SHuawei Xie 		}
3627*d19533e8SHuawei Xie 
3628*d19533e8SHuawei Xie 		nb_mbuf = num_tx_descriptor
3629*d19533e8SHuawei Xie 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3630*d19533e8SHuawei Xie 				+ num_switching_cores * MAX_PKT_BURST;
3631*d19533e8SHuawei Xie 
3632*d19533e8SHuawei Xie 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3633*d19533e8SHuawei Xie 			snprintf(pool_name, sizeof(pool_name),
3634*d19533e8SHuawei Xie 				"txmbuf_pool_%u", queue_id);
3635*d19533e8SHuawei Xie 			snprintf(ring_name, sizeof(ring_name),
3636*d19533e8SHuawei Xie 				"txmbuf_ring_%u", queue_id);
3637*d19533e8SHuawei Xie 			setup_mempool_tbl(rte_socket_id(),
3638*d19533e8SHuawei Xie 				(queue_id + MAX_QUEUES),
3639*d19533e8SHuawei Xie 				pool_name, ring_name, nb_mbuf);
3640*d19533e8SHuawei Xie 		}
3641*d19533e8SHuawei Xie 
3642*d19533e8SHuawei Xie 		if (vm2vm_mode == VM2VM_HARDWARE) {
3643*d19533e8SHuawei Xie 			/* Enable VT loopback so VM-to-VM traffic is switched by the hardware L2 switch. */
3644*d19533e8SHuawei Xie 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3645*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3646*d19533e8SHuawei Xie 				"Enable loop back for L2 switch in vmdq.\n");
3647*d19533e8SHuawei Xie 		}
3648*d19533e8SHuawei Xie 	}
3649*d19533e8SHuawei Xie 	/* Set log level. */
3650*d19533e8SHuawei Xie 	rte_set_log_level(LOG_LEVEL);
3651*d19533e8SHuawei Xie 
3652*d19533e8SHuawei Xie 	/* initialize all ports */
3653*d19533e8SHuawei Xie 	for (portid = 0; portid < nb_ports; portid++) {
3654*d19533e8SHuawei Xie 		/* skip ports that are not enabled */
3655*d19533e8SHuawei Xie 		if ((enabled_port_mask & (1 << portid)) == 0) {
3656*d19533e8SHuawei Xie 			RTE_LOG(INFO, VHOST_PORT,
3657*d19533e8SHuawei Xie 				"Skipping disabled port %d\n", portid);
3658*d19533e8SHuawei Xie 			continue;
3659*d19533e8SHuawei Xie 		}
3660*d19533e8SHuawei Xie 		if (port_init(portid) != 0)
3661*d19533e8SHuawei Xie 			rte_exit(EXIT_FAILURE,
3662*d19533e8SHuawei Xie 				"Cannot initialize network ports\n");
3663*d19533e8SHuawei Xie 	}
3664*d19533e8SHuawei Xie 
3665*d19533e8SHuawei Xie 	/* Initialise all linked lists. */
3666*d19533e8SHuawei Xie 	if (init_data_ll() == -1)
3667*d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3668*d19533e8SHuawei Xie 
3669*d19533e8SHuawei Xie 	/* Initialize device stats */
3670*d19533e8SHuawei Xie 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3671*d19533e8SHuawei Xie 
3672*d19533e8SHuawei Xie 	/* Enable stats if the user option is set. */
3673*d19533e8SHuawei Xie 	if (enable_stats)
3674*d19533e8SHuawei Xie 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3675*d19533e8SHuawei Xie 
3676*d19533e8SHuawei Xie 	/* Launch all data cores. */
3677*d19533e8SHuawei Xie 	if (zero_copy == 0) {
3678*d19533e8SHuawei Xie 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3679*d19533e8SHuawei Xie 			rte_eal_remote_launch(switch_worker,
3680*d19533e8SHuawei Xie 				mbuf_pool, lcore_id);
3681*d19533e8SHuawei Xie 		}
3682*d19533e8SHuawei Xie 	} else {
3683*d19533e8SHuawei Xie 		uint32_t count_in_mempool, index, i;
3684*d19533e8SHuawei Xie 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3685*d19533e8SHuawei Xie 			/* For all RX and TX queues. */
3686*d19533e8SHuawei Xie 			count_in_mempool
3687*d19533e8SHuawei Xie 				= rte_mempool_count(vpool_array[index].pool);
3688*d19533e8SHuawei Xie 
3689*d19533e8SHuawei Xie 			/*
3690*d19533e8SHuawei Xie 			 * Transfer all un-attached mbufs from vpool.pool
3691*d19533e8SHuawei Xie 			 * to vpool.ring.
3692*d19533e8SHuawei Xie 			 */
3693*d19533e8SHuawei Xie 			for (i = 0; i < count_in_mempool; i++) {
3694*d19533e8SHuawei Xie 				struct rte_mbuf *mbuf
3695*d19533e8SHuawei Xie 					= __rte_mbuf_raw_alloc(
3696*d19533e8SHuawei Xie 						vpool_array[index].pool);
3697*d19533e8SHuawei Xie 				rte_ring_sp_enqueue(vpool_array[index].ring,
3698*d19533e8SHuawei Xie 						(void *)mbuf);
3699*d19533e8SHuawei Xie 			}
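
			/*
			 * The mbufs are pulled raw from the pool (no field
			 * reset) and parked on the ring; attach_rxmbuf_zcp()
			 * later attaches them to guest buffers once the
			 * device is added in new_device().
			 */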
3700*d19533e8SHuawei Xie 
3701*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3702*d19533e8SHuawei Xie 				"in MAIN: mbuf count in mempool at initial "
3703*d19533e8SHuawei Xie 				"is: %d\n", count_in_mempool);
3704*d19533e8SHuawei Xie 			LOG_DEBUG(VHOST_CONFIG,
3705*d19533e8SHuawei Xie 				"in MAIN: mbuf count in ring at initial is:"
3706*d19533e8SHuawei Xie 				" %d\n",
3707*d19533e8SHuawei Xie 				rte_ring_count(vpool_array[index].ring));
3708*d19533e8SHuawei Xie 		}
3709*d19533e8SHuawei Xie 
3710*d19533e8SHuawei Xie 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3711*d19533e8SHuawei Xie 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3712*d19533e8SHuawei Xie 				lcore_id);
3713*d19533e8SHuawei Xie 	}
3714*d19533e8SHuawei Xie 
3715*d19533e8SHuawei Xie 	/* Register CUSE device to handle IOCTLs. */
3716*d19533e8SHuawei Xie 	ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
3717*d19533e8SHuawei Xie 	if (ret != 0)
3718*d19533e8SHuawei Xie 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3719*d19533e8SHuawei Xie 
3720*d19533e8SHuawei Xie 	init_virtio_net(&virtio_net_device_ops);
3721*d19533e8SHuawei Xie 
3722*d19533e8SHuawei Xie 	/* Start CUSE session. */
3723*d19533e8SHuawei Xie 	start_cuse_session_loop();
3724*d19533e8SHuawei Xie 	return 0;
3725*d19533e8SHuawei Xie 
3726*d19533e8SHuawei Xie }
3727*d19533e8SHuawei Xie 
3728