xref: /dpdk/examples/vhost/main.c (revision a38dfe974b3b9ef7d961a9805a805a3ce7df9288)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 #include <rte_udp.h>
56 #include <rte_sctp.h>
57 
58 #include "main.h"
59 
60 #ifndef MAX_QUEUES
61 #define MAX_QUEUES 128
62 #endif
63 
64 /* the maximum number of external ports supported */
65 #define MAX_SUP_PORTS 1
66 
67 /*
68  * Calculate the number of buffers needed per port
69  */
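/*
 * This covers mbufs resident in the NIC RX rings, per-core bursts in flight,
 * mbufs resident in the TX rings, and the per-core mempool caches.
 */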
70 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
71 							(num_switching_cores*MAX_PKT_BURST) +  			\
72 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
73 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
74 
75 #define MBUF_CACHE_SIZE	128
76 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
77 
78 /*
79  * No frame data buffers need to be allocated on the host for the zero copy
80  * implementation; the guest allocates the frame data buffers and vhost
81  * uses them directly.
82  */
83 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
84 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
85 #define MBUF_CACHE_SIZE_ZCP 0
86 
87 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
88 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
89 
90 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
91 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
92 
93 #define JUMBO_FRAME_MAX_SIZE    0x2600
94 
95 /* State of virtio device. */
96 #define DEVICE_MAC_LEARNING 0
97 #define DEVICE_RX			1
98 #define DEVICE_SAFE_REMOVE	2
99 
100 /* Config_core_flag status definitions. */
101 #define REQUEST_DEV_REMOVAL 1
102 #define ACK_DEV_REMOVAL 0
103 
104 /* Configurable number of RX/TX ring descriptors */
105 #define RTE_TEST_RX_DESC_DEFAULT 1024
106 #define RTE_TEST_TX_DESC_DEFAULT 512
107 
108 /*
109  * These two macros need refining for the legacy and DPDK based front ends:
110  * take the max vring avail descriptors/entries from the guest, subtract
111  * MAX_PKT_BURST, and then round to a power of 2.
112  */
113 /*
114  * For the legacy front end there are 128 descriptors:
115  * half for the virtio header, the other half for the mbuf.
116  */
117 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
118 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
119 
120 /* Get first 4 bytes in mbuf headroom. */
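/* The zero copy path uses these 4 bytes to remember which vring descriptor the attached guest buffer came from. */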
121 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
122 		+ sizeof(struct rte_mbuf)))
123 
124 /* true if x is a power of 2 */
125 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
126 
127 #define INVALID_PORT_ID 0xFF
128 
129 /* Max number of devices. Limited by vmdq. */
130 #define MAX_DEVICES 64
131 
132 /* Size of buffers used for snprintfs. */
133 #define MAX_PRINT_BUFF 6072
134 
135 /* Maximum character device basename size. */
136 #define MAX_BASENAME_SZ 10
137 
138 /* Maximum long option length for option parsing. */
139 #define MAX_LONG_OPT_SZ 64
140 
141 /* Used to compare MAC addresses. */
142 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
143 
144 /* Number of descriptors per cacheline. */
145 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
146 
147 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
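/* True when the mbuf is attached to a buffer outside its own data room (a guest buffer in the zero copy path), so pktmbuf_detach_zcp() is needed before reuse. */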
148 
149 /* mask of enabled ports */
150 static uint32_t enabled_port_mask = 0;
151 
152 /* Promiscuous mode */
153 static uint32_t promiscuous;
154 
155 /* Number of switching cores enabled */
156 static uint32_t num_switching_cores = 0;
157 
158 /* number of devices/queues to support*/
159 static uint32_t num_queues = 0;
160 static uint32_t num_devices;
161 
162 /*
163  * Enable zero copy: packet buffers are DMA'd directly via the HW descriptors.
164  * Disabled by default.
165  */
166 static uint32_t zero_copy;
167 static int mergeable;
168 
169 /* Do VLAN strip on the host, enabled by default */
170 static uint32_t vlan_strip = 1;
171 
172 /* number of descriptors to apply*/
173 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
174 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
175 
176 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
177 #define MAX_RING_DESC 4096
178 
179 struct vpool {
180 	struct rte_mempool *pool;
181 	struct rte_ring *ring;
182 	uint32_t buf_size;
183 } vpool_array[MAX_QUEUES+MAX_QUEUES];
184 
185 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
186 typedef enum {
187 	VM2VM_DISABLED = 0,
188 	VM2VM_SOFTWARE = 1,
189 	VM2VM_HARDWARE = 2,
190 	VM2VM_LAST
191 } vm2vm_type;
192 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
193 
194 /* The type of host physical address translated from guest physical address. */
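/* CONTINUOUS: the whole buffer lies in one region; CROSS_SUBREG: it spans a region boundary; INVALID: no region contains it. */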
195 typedef enum {
196 	PHYS_ADDR_CONTINUOUS = 0,
197 	PHYS_ADDR_CROSS_SUBREG = 1,
198 	PHYS_ADDR_INVALID = 2,
199 	PHYS_ADDR_LAST
200 } hpa_type;
201 
202 /* Enable stats. */
203 static uint32_t enable_stats = 0;
204 /* Enable retries on RX. */
205 static uint32_t enable_retry = 1;
206 
207 /* Disable TX checksum offload */
208 static uint32_t enable_tx_csum;
209 
210 /* Disable TSO offload */
211 static uint32_t enable_tso;
212 
213 /* Specify the timeout (in microseconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217 
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220 
221 /* Empty VMDQ configuration structure. Filled in programmatically. */
222 static struct rte_eth_conf vmdq_conf_default = {
223 	.rxmode = {
224 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
225 		.split_hdr_size = 0,
226 		.header_split   = 0, /**< Header Split disabled */
227 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
228 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
229 		/*
230 		 * This is necessary for 1G NICs such as the I350;
231 		 * it fixes a bug where IPv4 forwarding in the guest could not
232 		 * forward packets from one virtio dev to another virtio dev.
233 		 */
234 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
235 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
236 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
237 	},
238 
239 	.txmode = {
240 		.mq_mode = ETH_MQ_TX_NONE,
241 	},
242 	.rx_adv_conf = {
243 		/*
244 		 * should be overridden separately in code with
245 		 * appropriate values
246 		 */
247 		.vmdq_rx_conf = {
248 			.nb_queue_pools = ETH_8_POOLS,
249 			.enable_default_pool = 0,
250 			.default_pool = 0,
251 			.nb_pool_maps = 0,
252 			.pool_map = {{0, 0},},
253 		},
254 	},
255 };
256 
257 static unsigned lcore_ids[RTE_MAX_LCORE];
258 static uint8_t ports[RTE_MAX_ETHPORTS];
259 static unsigned num_ports = 0; /**< The number of ports specified in command line */
260 static uint16_t num_pf_queues, num_vmdq_queues;
261 static uint16_t vmdq_pool_base, vmdq_queue_base;
262 static uint16_t queues_per_pool;
263 
264 static const uint16_t external_pkt_default_vlan_tag = 2000;
265 const uint16_t vlan_tags[] = {
266 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
267 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
268 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
269 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
270 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
271 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
272 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
273 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
274 };
275 
276 /* ethernet addresses of ports */
277 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
278 
279 /* heads for the main used and free linked lists for the data path. */
280 static struct virtio_net_data_ll *ll_root_used = NULL;
281 static struct virtio_net_data_ll *ll_root_free = NULL;
282 
283 /* Array of data core structures containing information on individual core linked lists. */
284 static struct lcore_info lcore_info[RTE_MAX_LCORE];
285 
286 /* Used for queueing bursts of TX packets. */
287 struct mbuf_table {
288 	unsigned len;
289 	unsigned txq_id;
290 	struct rte_mbuf *m_table[MAX_PKT_BURST];
291 };
292 
293 /* TX queue for each data core. */
294 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
295 
296 /* TX queue for each virtio device for zero copy. */
297 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
298 
299 /* Vlan header struct used to insert vlan tags on TX. */
300 struct vlan_ethhdr {
301 	unsigned char   h_dest[ETH_ALEN];
302 	unsigned char   h_source[ETH_ALEN];
303 	__be16          h_vlan_proto;
304 	__be16          h_vlan_TCI;
305 	__be16          h_vlan_encapsulated_proto;
306 };
307 
308 /* Header lengths. */
309 #define VLAN_HLEN       4
310 #define VLAN_ETH_HLEN   18
311 
312 /* Per-device statistics struct */
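/* RX counters are atomic because any core forwarding to this device may update them; TX counters are only touched by the core that owns the device. */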
313 struct device_statistics {
314 	uint64_t tx_total;
315 	rte_atomic64_t rx_total_atomic;
316 	uint64_t rx_total;
317 	uint64_t tx;
318 	rte_atomic64_t rx_atomic;
319 	uint64_t rx;
320 } __rte_cache_aligned;
321 struct device_statistics dev_statistics[MAX_DEVICES];
322 
323 /*
324  * Builds up the correct configuration for VMDQ VLAN pool map
325  * according to the pool & queue limits.
326  */
327 static inline int
328 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
329 {
330 	struct rte_eth_vmdq_rx_conf conf;
331 	struct rte_eth_vmdq_rx_conf *def_conf =
332 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
333 	unsigned i;
334 
335 	memset(&conf, 0, sizeof(conf));
336 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
337 	conf.nb_pool_maps = num_devices;
338 	conf.enable_loop_back = def_conf->enable_loop_back;
339 	conf.rx_mode = def_conf->rx_mode;
340 
341 	for (i = 0; i < conf.nb_pool_maps; i++) {
342 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
343 		conf.pool_map[i].pools = (1UL << i);
344 	}
345 
346 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
347 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
348 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
349 	return 0;
350 }
351 
352 /*
353  * Validate the device number against the max pool number obtained from
354  * dev_info. If the device number is invalid, print an error message and
355  * return -1. Each device must have its own pool.
356  */
357 static inline int
358 validate_num_devices(uint32_t max_nb_devices)
359 {
360 	if (num_devices > max_nb_devices) {
361 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
362 		return -1;
363 	}
364 	return 0;
365 }
366 
367 /*
368  * Initialises a given port using global settings and with the RX buffers
369  * coming from the mbuf_pool passed as a parameter.
370  */
371 static inline int
372 port_init(uint8_t port)
373 {
374 	struct rte_eth_dev_info dev_info;
375 	struct rte_eth_conf port_conf;
376 	struct rte_eth_rxconf *rxconf;
377 	struct rte_eth_txconf *txconf;
378 	int16_t rx_rings, tx_rings;
379 	uint16_t rx_ring_size, tx_ring_size;
380 	int retval;
381 	uint16_t q;
382 
383 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
384 	rte_eth_dev_info_get (port, &dev_info);
385 
386 	if (dev_info.max_rx_queues > MAX_QUEUES) {
387 		rte_exit(EXIT_FAILURE,
388 			"please define MAX_QUEUES no less than %u in %s\n",
389 			dev_info.max_rx_queues, __FILE__);
390 	}
391 
392 	rxconf = &dev_info.default_rxconf;
393 	txconf = &dev_info.default_txconf;
394 	rxconf->rx_drop_en = 1;
395 
396 	/* Enable vlan offload */
397 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
398 
399 	/*
400 	 * Zero copy defers queue RX/TX start to the time when guest
401 	 * finishes its startup and packet buffers from that guest are
402 	 * available.
403 	 */
404 	if (zero_copy) {
405 		rxconf->rx_deferred_start = 1;
406 		rxconf->rx_drop_en = 0;
407 		txconf->tx_deferred_start = 1;
408 	}
409 
410 	/* Configure the number of supported virtio devices based on VMDQ limits. */
411 	num_devices = dev_info.max_vmdq_pools;
412 
413 	if (zero_copy) {
414 		rx_ring_size = num_rx_descriptor;
415 		tx_ring_size = num_tx_descriptor;
416 		tx_rings = dev_info.max_tx_queues;
417 	} else {
418 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
419 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
420 		tx_rings = (uint16_t)rte_lcore_count();
421 	}
422 
423 	retval = validate_num_devices(MAX_DEVICES);
424 	if (retval < 0)
425 		return retval;
426 
427 	/* Get port configuration. */
428 	retval = get_eth_conf(&port_conf, num_devices);
429 	if (retval < 0)
430 		return retval;
431 	/* NIC queues are divided into pf queues and vmdq queues.  */
432 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
433 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
434 	num_vmdq_queues = num_devices * queues_per_pool;
435 	num_queues = num_pf_queues + num_vmdq_queues;
436 	vmdq_queue_base = dev_info.vmdq_queue_base;
437 	vmdq_pool_base  = dev_info.vmdq_pool_base;
438 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
439 		num_pf_queues, num_devices, queues_per_pool);
440 
441 	if (port >= rte_eth_dev_count()) return -1;
442 
443 	if (enable_tx_csum == 0)
444 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
445 
446 	if (enable_tso == 0) {
447 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
448 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
449 	}
450 
451 	rx_rings = (uint16_t)dev_info.max_rx_queues;
452 	/* Configure ethernet device. */
453 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
454 	if (retval != 0)
455 		return retval;
456 
457 	/* Setup the queues. */
458 	for (q = 0; q < rx_rings; q ++) {
459 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
460 						rte_eth_dev_socket_id(port),
461 						rxconf,
462 						vpool_array[q].pool);
463 		if (retval < 0)
464 			return retval;
465 	}
466 	for (q = 0; q < tx_rings; q ++) {
467 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
468 						rte_eth_dev_socket_id(port),
469 						txconf);
470 		if (retval < 0)
471 			return retval;
472 	}
473 
474 	/* Start the device. */
475 	retval  = rte_eth_dev_start(port);
476 	if (retval < 0) {
477 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
478 		return retval;
479 	}
480 
481 	if (promiscuous)
482 		rte_eth_promiscuous_enable(port);
483 
484 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
485 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
486 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
487 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
488 			(unsigned)port,
489 			vmdq_ports_eth_addr[port].addr_bytes[0],
490 			vmdq_ports_eth_addr[port].addr_bytes[1],
491 			vmdq_ports_eth_addr[port].addr_bytes[2],
492 			vmdq_ports_eth_addr[port].addr_bytes[3],
493 			vmdq_ports_eth_addr[port].addr_bytes[4],
494 			vmdq_ports_eth_addr[port].addr_bytes[5]);
495 
496 	return 0;
497 }
498 
499 /*
500  * Set character device basename.
501  */
502 static int
503 us_vhost_parse_basename(const char *q_arg)
504 {
505 	/* Copy the basename, rejecting names that do not fit (including the NUL). */
506 
507 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
508 		return -1;
509 	else
510 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
511 
512 	return 0;
513 }
514 
515 /*
516  * Parse the portmask provided at run time.
517  */
518 static int
519 parse_portmask(const char *portmask)
520 {
521 	char *end = NULL;
522 	unsigned long pm;
523 
524 	errno = 0;
525 
526 	/* parse hexadecimal string */
527 	pm = strtoul(portmask, &end, 16);
528 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
529 		return -1;
530 
531 	if (pm == 0)
532 		return -1;
533 
534 	return pm;
535 
536 }
537 
538 /*
539  * Parse num options at run time.
540  */
541 static int
542 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
543 {
544 	char *end = NULL;
545 	unsigned long num;
546 
547 	errno = 0;
548 
549 	/* parse unsigned int string */
550 	num = strtoul(q_arg, &end, 10);
551 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
552 		return -1;
553 
554 	if (num > max_valid_value)
555 		return -1;
556 
557 	return num;
558 
559 }
560 
561 /*
562  * Display usage
563  */
564 static void
565 us_vhost_usage(const char *prgname)
566 {
567 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
568 	"		--vm2vm [0|1|2]\n"
569 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
570 	"		--dev-basename <name>\n"
571 	"		--nb-devices ND\n"
572 	"		-p PORTMASK: Set mask for ports to be used by application\n"
573 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
574 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
575 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on RX are enabled\n"
576 	"		--rx-retry-num [0-N]: the number of retries on RX. This takes effect only if retries on RX are enabled\n"
577 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
578 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
579 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
580 	"		--dev-basename: The basename to be used for the character device.\n"
581 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
582 			"zero copy\n"
583 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
584 			"used only when zero copy is enabled.\n"
585 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
586 			"used only when zero copy is enabled.\n"
587 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
588 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n",
589 	       prgname);
590 }
591 
592 /*
593  * Parse the arguments given in the command line of the application.
594  */
595 static int
596 us_vhost_parse_args(int argc, char **argv)
597 {
598 	int opt, ret;
599 	int option_index;
600 	unsigned i;
601 	const char *prgname = argv[0];
602 	static struct option long_option[] = {
603 		{"vm2vm", required_argument, NULL, 0},
604 		{"rx-retry", required_argument, NULL, 0},
605 		{"rx-retry-delay", required_argument, NULL, 0},
606 		{"rx-retry-num", required_argument, NULL, 0},
607 		{"mergeable", required_argument, NULL, 0},
608 		{"vlan-strip", required_argument, NULL, 0},
609 		{"stats", required_argument, NULL, 0},
610 		{"dev-basename", required_argument, NULL, 0},
611 		{"zero-copy", required_argument, NULL, 0},
612 		{"rx-desc-num", required_argument, NULL, 0},
613 		{"tx-desc-num", required_argument, NULL, 0},
614 		{"tx-csum", required_argument, NULL, 0},
615 		{"tso", required_argument, NULL, 0},
616 		{NULL, 0, 0, 0},
617 	};
618 
619 	/* Parse command line */
620 	while ((opt = getopt_long(argc, argv, "p:P",
621 			long_option, &option_index)) != EOF) {
622 		switch (opt) {
623 		/* Portmask */
624 		case 'p':
625 			enabled_port_mask = parse_portmask(optarg);
626 			if (enabled_port_mask == 0) {
627 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
628 				us_vhost_usage(prgname);
629 				return -1;
630 			}
631 			break;
632 
633 		case 'P':
634 			promiscuous = 1;
635 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
636 				ETH_VMDQ_ACCEPT_BROADCAST |
637 				ETH_VMDQ_ACCEPT_MULTICAST;
638 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
639 
640 			break;
641 
642 		case 0:
643 			/* Enable/disable vm2vm comms. */
644 			if (!strncmp(long_option[option_index].name, "vm2vm",
645 				MAX_LONG_OPT_SZ)) {
646 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
647 				if (ret == -1) {
648 					RTE_LOG(INFO, VHOST_CONFIG,
649 						"Invalid argument for "
650 						"vm2vm [0|1|2]\n");
651 					us_vhost_usage(prgname);
652 					return -1;
653 				} else {
654 					vm2vm_mode = (vm2vm_type)ret;
655 				}
656 			}
657 
658 			/* Enable/disable retries on RX. */
659 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
660 				ret = parse_num_opt(optarg, 1);
661 				if (ret == -1) {
662 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
663 					us_vhost_usage(prgname);
664 					return -1;
665 				} else {
666 					enable_retry = ret;
667 				}
668 			}
669 
670 			/* Enable/disable TX checksum offload. */
671 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
672 				ret = parse_num_opt(optarg, 1);
673 				if (ret == -1) {
674 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
675 					us_vhost_usage(prgname);
676 					return -1;
677 				} else
678 					enable_tx_csum = ret;
679 			}
680 
681 			/* Enable/disable TSO offload. */
682 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
683 				ret = parse_num_opt(optarg, 1);
684 				if (ret == -1) {
685 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
686 					us_vhost_usage(prgname);
687 					return -1;
688 				} else
689 					enable_tso = ret;
690 			}
691 
692 			/* Specify the retry delay time (in microseconds) on RX. */
693 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
694 				ret = parse_num_opt(optarg, INT32_MAX);
695 				if (ret == -1) {
696 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
697 					us_vhost_usage(prgname);
698 					return -1;
699 				} else {
700 					burst_rx_delay_time = ret;
701 				}
702 			}
703 
704 			/* Specify the retries number on RX. */
705 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
706 				ret = parse_num_opt(optarg, INT32_MAX);
707 				if (ret == -1) {
708 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
709 					us_vhost_usage(prgname);
710 					return -1;
711 				} else {
712 					burst_rx_retry_num = ret;
713 				}
714 			}
715 
716 			/* Enable/disable RX mergeable buffers. */
717 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
718 				ret = parse_num_opt(optarg, 1);
719 				if (ret == -1) {
720 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
721 					us_vhost_usage(prgname);
722 					return -1;
723 				} else {
724 					mergeable = !!ret;
725 					if (ret) {
726 						vmdq_conf_default.rxmode.jumbo_frame = 1;
727 						vmdq_conf_default.rxmode.max_rx_pkt_len
728 							= JUMBO_FRAME_MAX_SIZE;
729 					}
730 				}
731 			}
732 
733 			/* Enable/disable RX VLAN strip on host. */
734 			if (!strncmp(long_option[option_index].name,
735 				"vlan-strip", MAX_LONG_OPT_SZ)) {
736 				ret = parse_num_opt(optarg, 1);
737 				if (ret == -1) {
738 					RTE_LOG(INFO, VHOST_CONFIG,
739 						"Invalid argument for VLAN strip [0|1]\n");
740 					us_vhost_usage(prgname);
741 					return -1;
742 				} else {
743 					vlan_strip = !!ret;
744 					vmdq_conf_default.rxmode.hw_vlan_strip =
745 						vlan_strip;
746 				}
747 			}
748 
749 			/* Enable/disable stats. */
750 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
751 				ret = parse_num_opt(optarg, INT32_MAX);
752 				if (ret == -1) {
753 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
754 					us_vhost_usage(prgname);
755 					return -1;
756 				} else {
757 					enable_stats = ret;
758 				}
759 			}
760 
761 			/* Set character device basename. */
762 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
763 				if (us_vhost_parse_basename(optarg) == -1) {
764 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
765 					us_vhost_usage(prgname);
766 					return -1;
767 				}
768 			}
769 
770 			/* Enable/disable rx/tx zero copy. */
771 			if (!strncmp(long_option[option_index].name,
772 				"zero-copy", MAX_LONG_OPT_SZ)) {
773 				ret = parse_num_opt(optarg, 1);
774 				if (ret == -1) {
775 					RTE_LOG(INFO, VHOST_CONFIG,
776 						"Invalid argument"
777 						" for zero-copy [0|1]\n");
778 					us_vhost_usage(prgname);
779 					return -1;
780 				} else
781 					zero_copy = ret;
782 			}
783 
784 			/* Specify the descriptor number on RX. */
785 			if (!strncmp(long_option[option_index].name,
786 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
787 				ret = parse_num_opt(optarg, MAX_RING_DESC);
788 				if ((ret == -1) || (!POWEROF2(ret))) {
789 					RTE_LOG(INFO, VHOST_CONFIG,
790 					"Invalid argument for rx-desc-num[0-N],"
791 					"power of 2 required.\n");
792 					us_vhost_usage(prgname);
793 					return -1;
794 				} else {
795 					num_rx_descriptor = ret;
796 				}
797 			}
798 
799 			/* Specify the descriptor number on TX. */
800 			if (!strncmp(long_option[option_index].name,
801 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
802 				ret = parse_num_opt(optarg, MAX_RING_DESC);
803 				if ((ret == -1) || (!POWEROF2(ret))) {
804 					RTE_LOG(INFO, VHOST_CONFIG,
805 					"Invalid argument for tx-desc-num [0-N],"
806 					"power of 2 required.\n");
807 					us_vhost_usage(prgname);
808 					return -1;
809 				} else {
810 					num_tx_descriptor = ret;
811 				}
812 			}
813 
814 			break;
815 
816 			/* Invalid option - print options. */
817 		default:
818 			us_vhost_usage(prgname);
819 			return -1;
820 		}
821 	}
822 
823 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
824 		if (enabled_port_mask & (1 << i))
825 			ports[num_ports++] = (uint8_t)i;
826 	}
827 
828 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
829 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
830 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
831 		return -1;
832 	}
833 
834 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
835 		RTE_LOG(INFO, VHOST_PORT,
836 			"Vhost zero copy doesn't support software vm2vm, "
837 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
838 		return -1;
839 	}
840 
841 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
842 		RTE_LOG(INFO, VHOST_PORT,
843 			"Vhost zero copy doesn't support jumbo frames, "
844 			"please specify '--mergeable 0' to disable the "
845 			"mergeable feature.\n");
846 		return -1;
847 	}
848 
849 	return 0;
850 }
851 
852 /*
853  * Update the global variable num_ports and the array ports according to the
854  * number of system ports, and return the number of valid ports.
855  */
856 static unsigned check_ports_num(unsigned nb_ports)
857 {
858 	unsigned valid_num_ports = num_ports;
859 	unsigned portid;
860 
861 	if (num_ports > nb_ports) {
862 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
863 			num_ports, nb_ports);
864 		num_ports = nb_ports;
865 	}
866 
867 	for (portid = 0; portid < num_ports; portid ++) {
868 		if (ports[portid] >= nb_ports) {
869 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
870 				ports[portid], (nb_ports - 1));
871 			ports[portid] = INVALID_PORT_ID;
872 			valid_num_ports--;
873 		}
874 	}
875 	return valid_num_ports;
876 }
877 
878 /*
879  * Macro to print out packet contents. Wrapped in a debug define so that the
880  * data path is not affected when debug is disabled.
881  */
882 #ifdef DEBUG
883 #define PRINT_PACKET(device, addr, size, header) do {																\
884 	char *pkt_addr = (char*)(addr);																					\
885 	unsigned int index;																								\
886 	char packet[MAX_PRINT_BUFF];																					\
887 																													\
888 	if ((header))																									\
889 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
890 	else																											\
891 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
892 	for (index = 0; index < (size); index++) {																		\
893 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
894 			"%02hhx ", pkt_addr[index]);																			\
895 	}																												\
896 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
897 																													\
898 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
899 } while(0)
900 #else
901 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
902 #endif
903 
904 /*
905  * Function to convert guest physical addresses to vhost physical addresses.
906  * This is used to convert virtio buffer addresses.
907  */
908 static inline uint64_t __attribute__((always_inline))
909 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
910 	uint32_t buf_len, hpa_type *addr_type)
911 {
912 	struct virtio_memory_regions_hpa *region;
913 	uint32_t regionidx;
914 	uint64_t vhost_pa = 0;
915 
916 	*addr_type = PHYS_ADDR_INVALID;
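	/* Stays INVALID unless a region containing guest_pa is found below. */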
917 
918 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
919 		region = &vdev->regions_hpa[regionidx];
920 		if ((guest_pa >= region->guest_phys_address) &&
921 			(guest_pa <= region->guest_phys_address_end)) {
922 			vhost_pa = region->host_phys_addr_offset + guest_pa;
923 			if (likely((guest_pa + buf_len - 1)
924 				<= region->guest_phys_address_end))
925 				*addr_type = PHYS_ADDR_CONTINUOUS;
926 			else
927 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
928 			break;
929 		}
930 	}
931 
932 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
933 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
934 		(void *)(uintptr_t)vhost_pa);
935 
936 	return vhost_pa;
937 }
938 
939 /*
940  * Compares a packet destination MAC address to a device MAC address.
941  */
942 static inline int __attribute__((always_inline))
943 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
944 {
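	/* Only the low 48 bits of the 64-bit loads (the 6 MAC bytes on little-endian) take part in the comparison. */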
945 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
946 }
947 
948 /*
949  * This function learns the MAC address of the device and registers this along with a
950  * vlan tag to a VMDQ.
951  */
952 static int
953 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
954 {
955 	struct ether_hdr *pkt_hdr;
956 	struct virtio_net_data_ll *dev_ll;
957 	struct virtio_net *dev = vdev->dev;
958 	int i, ret;
959 
960 	/* Learn MAC address of guest device from packet */
961 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
962 
963 	dev_ll = ll_root_used;
964 
965 	while (dev_ll != NULL) {
966 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
967 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
968 			return -1;
969 		}
970 		dev_ll = dev_ll->next;
971 	}
972 
973 	for (i = 0; i < ETHER_ADDR_LEN; i++)
974 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
975 
976 	/* vlan_tag currently uses the device_id. */
977 	vdev->vlan_tag = vlan_tags[dev->device_fh];
978 
979 	/* Print out VMDQ registration info. */
980 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
981 		dev->device_fh,
982 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
983 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
984 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
985 		vdev->vlan_tag);
986 
987 	/* Register the MAC address. */
988 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
989 				(uint32_t)dev->device_fh + vmdq_pool_base);
990 	if (ret)
991 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
992 					dev->device_fh);
993 
994 	/* Enable stripping of the vlan tag as we handle routing. */
995 	if (vlan_strip)
996 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
997 			(uint16_t)vdev->vmdq_rx_q, 1);
998 
999 	/* Set device as ready for RX. */
1000 	vdev->ready = DEVICE_RX;
1001 
1002 	return 0;
1003 }
1004 
1005 /*
1006  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1007  * queue before disabling RX on the device.
1008  */
1009 static inline void
1010 unlink_vmdq(struct vhost_dev *vdev)
1011 {
1012 	unsigned i = 0;
1013 	unsigned rx_count;
1014 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1015 
1016 	if (vdev->ready == DEVICE_RX) {
1017 		/*clear MAC and VLAN settings*/
1018 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1019 		for (i = 0; i < 6; i++)
1020 			vdev->mac_address.addr_bytes[i] = 0;
1021 
1022 		vdev->vlan_tag = 0;
1023 
1024 		/*Clear out the receive buffers*/
1025 		rx_count = rte_eth_rx_burst(ports[0],
1026 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1027 
1028 		while (rx_count) {
1029 			for (i = 0; i < rx_count; i++)
1030 				rte_pktmbuf_free(pkts_burst[i]);
1031 
1032 			rx_count = rte_eth_rx_burst(ports[0],
1033 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1034 		}
1035 
1036 		vdev->ready = DEVICE_MAC_LEARNING;
1037 	}
1038 }
1039 
1040 /*
1041  * Check if the packet destination MAC address is for a local device. If so then put
1042  * the packet on that device's RX queue. If not then return.
1043  */
1044 static inline int __attribute__((always_inline))
1045 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1046 {
1047 	struct virtio_net_data_ll *dev_ll;
1048 	struct ether_hdr *pkt_hdr;
1049 	uint64_t ret = 0;
1050 	struct virtio_net *dev = vdev->dev;
1051 	struct virtio_net *tdev; /* destination virtio device */
1052 
1053 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1054 
1055 	/*get the used devices list*/
1056 	dev_ll = ll_root_used;
1057 
1058 	while (dev_ll != NULL) {
1059 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1060 				          &dev_ll->vdev->mac_address)) {
1061 
1062 			/* Drop the packet if the TX packet is destined for the TX device. */
1063 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1064 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1065 							dev->device_fh);
1066 				return 0;
1067 			}
1068 			tdev = dev_ll->vdev->dev;
1069 
1070 
1071 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1072 
1073 			if (unlikely(dev_ll->vdev->remove)) {
1074 				/*drop the packet if the device is marked for removal*/
1075 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1076 			} else {
1077 				/*send the packet to the local virtio device*/
1078 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1079 				if (enable_stats) {
1080 					rte_atomic64_add(
1081 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1082 					1);
1083 					rte_atomic64_add(
1084 					&dev_statistics[tdev->device_fh].rx_atomic,
1085 					ret);
1086 					dev_statistics[dev->device_fh].tx_total++;
1087 					dev_statistics[dev->device_fh].tx += ret;
1088 				}
1089 			}
1090 
1091 			return 0;
1092 		}
1093 		dev_ll = dev_ll->next;
1094 	}
1095 
1096 	return -1;
1097 }
1098 
1099 /*
1100  * Check if the destination MAC of a packet belongs to a local VM,
1101  * and if so get its VLAN tag and the length offset.
1102  */
1103 static inline int __attribute__((always_inline))
1104 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1105 	uint32_t *offset, uint16_t *vlan_tag)
1106 {
1107 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1108 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1109 
1110 	while (dev_ll != NULL) {
1111 		if ((dev_ll->vdev->ready == DEVICE_RX)
1112 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1113 		&dev_ll->vdev->mac_address)) {
1114 			/*
1115 			 * Drop the packet if the TX packet is
1116 			 * destined for the TX device.
1117 			 */
1118 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1119 				LOG_DEBUG(VHOST_DATA,
1120 				"(%"PRIu64") TX: Source and destination"
1121 				" MAC addresses are the same. Dropping "
1122 				"packet.\n",
1123 				dev_ll->vdev->dev->device_fh);
1124 				return -1;
1125 			}
1126 
1127 			/*
1128 			 * HW VLAN strip will reduce the packet length
1129 			 * by the length of the VLAN tag, so restore
1130 			 * the packet length by adding it back.
1131 			 */
1132 			*offset = VLAN_HLEN;
1133 			*vlan_tag =
1134 			(uint16_t)
1135 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1136 
1137 			LOG_DEBUG(VHOST_DATA,
1138 			"(%"PRIu64") TX: pkt to local VM device id:"
1139 			"(%"PRIu64") vlan tag: %d.\n",
1140 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1141 			(int)*vlan_tag);
1142 
1143 			break;
1144 		}
1145 		dev_ll = dev_ll->next;
1146 	}
1147 	return 0;
1148 }
1149 
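/* Pseudo-header checksum that HW L4 checksum offload expects to find pre-seeded in the checksum field. */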
1150 static uint16_t
1151 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1152 {
1153 	if (ol_flags & PKT_TX_IPV4)
1154 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1155 	else /* assume ethertype == ETHER_TYPE_IPv6 */
1156 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1157 }
1158 
1159 static void virtio_tx_offload(struct rte_mbuf *m)
1160 {
1161 	void *l3_hdr;
1162 	struct ipv4_hdr *ipv4_hdr = NULL;
1163 	struct tcp_hdr *tcp_hdr = NULL;
1164 	struct udp_hdr *udp_hdr = NULL;
1165 	struct sctp_hdr *sctp_hdr = NULL;
1166 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1167 
1168 	l3_hdr = (char *)eth_hdr + m->l2_len;
1169 
1170 	if (m->tso_segsz != 0) {
1171 		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
1172 		tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1173 		m->ol_flags |= PKT_TX_IP_CKSUM;
1174 		ipv4_hdr->hdr_checksum = 0;
1175 		tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1176 		return;
1177 	}
1178 
1179 	if (m->ol_flags & PKT_TX_L4_MASK) {
1180 		switch (m->ol_flags & PKT_TX_L4_MASK) {
1181 		case PKT_TX_TCP_CKSUM:
1182 			tcp_hdr = (struct tcp_hdr *)
1183 					((char *)l3_hdr + m->l3_len);
1184 			tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1185 			break;
1186 		case PKT_TX_UDP_CKSUM:
1187 			udp_hdr = (struct udp_hdr *)
1188 					((char *)l3_hdr + m->l3_len);
1189 			udp_hdr->dgram_cksum = get_psd_sum(l3_hdr, m->ol_flags);
1190 			break;
1191 		case PKT_TX_SCTP_CKSUM:
1192 			sctp_hdr = (struct sctp_hdr *)
1193 					((char *)l3_hdr + m->l3_len);
1194 			sctp_hdr->cksum = 0;
1195 			break;
1196 		default:
1197 			break;
1198 		}
1199 	}
1200 }
1201 
1202 /*
1203  * This function routes the TX packet to the correct interface. This may be a local device
1204  * or the physical port.
1205  */
1206 static inline void __attribute__((always_inline))
1207 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1208 {
1209 	struct mbuf_table *tx_q;
1210 	struct rte_mbuf **m_table;
1211 	unsigned len, ret, offset = 0;
1212 	const uint16_t lcore_id = rte_lcore_id();
1213 	struct virtio_net *dev = vdev->dev;
1214 	struct ether_hdr *nh;
1215 
1216 	/*check if destination is local VM*/
1217 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1218 		rte_pktmbuf_free(m);
1219 		return;
1220 	}
1221 
1222 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1223 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1224 			rte_pktmbuf_free(m);
1225 			return;
1226 		}
1227 	}
1228 
1229 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1230 
1231 	/*Add packet to the port tx queue*/
1232 	tx_q = &lcore_tx_queue[lcore_id];
1233 	len = tx_q->len;
1234 
1235 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1236 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1237 		/* Guest has inserted the vlan tag. */
1238 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1239 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1240 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1241 			(vh->vlan_tci != vlan_tag_be))
1242 			vh->vlan_tci = vlan_tag_be;
1243 	} else {
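		/* No VLAN header from the guest: request HW insertion of vlan_tci on transmit. */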
1244 		m->ol_flags |= PKT_TX_VLAN_PKT;
1245 
1246 		/*
1247 		 * Find the right seg to adjust the data len when offset is
1248 		 * bigger than tail room size.
1249 		 */
1250 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1251 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1252 				m->data_len += offset;
1253 			else {
1254 				struct rte_mbuf *seg = m;
1255 
1256 				while ((seg->next != NULL) &&
1257 					(offset > rte_pktmbuf_tailroom(seg)))
1258 					seg = seg->next;
1259 
1260 				seg->data_len += offset;
1261 			}
1262 			m->pkt_len += offset;
1263 		}
1264 
1265 		m->vlan_tci = vlan_tag;
1266 	}
1267 
1268 	if ((m->ol_flags & PKT_TX_L4_MASK) || (m->ol_flags & PKT_TX_TCP_SEG))
1269 		virtio_tx_offload(m);
1270 
1271 	tx_q->m_table[len] = m;
1272 	len++;
1273 	if (enable_stats) {
1274 		dev_statistics[dev->device_fh].tx_total++;
1275 		dev_statistics[dev->device_fh].tx++;
1276 	}
1277 
1278 	if (unlikely(len == MAX_PKT_BURST)) {
1279 		m_table = (struct rte_mbuf **)tx_q->m_table;
1280 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1281 		/* Free any buffers not handled by TX and update the port stats. */
1282 		if (unlikely(ret < len)) {
1283 			do {
1284 				rte_pktmbuf_free(m_table[ret]);
1285 			} while (++ret < len);
1286 		}
1287 
1288 		len = 0;
1289 	}
1290 
1291 	tx_q->len = len;
1292 	return;
1293 }
1294 /*
1295  * This function is called by each data core. It handles all RX/TX registered with the
1296  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1297  * with all devices in the main linked list.
1298  */
1299 static int
1300 switch_worker(__attribute__((unused)) void *arg)
1301 {
1302 	struct rte_mempool *mbuf_pool = arg;
1303 	struct virtio_net *dev = NULL;
1304 	struct vhost_dev *vdev = NULL;
1305 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1306 	struct virtio_net_data_ll *dev_ll;
1307 	struct mbuf_table *tx_q;
1308 	volatile struct lcore_ll_info *lcore_ll;
1309 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1310 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1311 	unsigned ret, i;
1312 	const uint16_t lcore_id = rte_lcore_id();
1313 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1314 	uint16_t rx_count = 0;
1315 	uint16_t tx_count;
1316 	uint32_t retry = 0;
1317 
1318 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1319 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1320 	prev_tsc = 0;
1321 
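	/* Each lcore transmits on its own physical TX queue (its index in lcore_ids), so no locking is needed. */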
1322 	tx_q = &lcore_tx_queue[lcore_id];
1323 	for (i = 0; i < num_cores; i ++) {
1324 		if (lcore_ids[i] == lcore_id) {
1325 			tx_q->txq_id = i;
1326 			break;
1327 		}
1328 	}
1329 
1330 	while(1) {
1331 		cur_tsc = rte_rdtsc();
1332 		/*
1333 		 * TX burst queue drain
1334 		 */
1335 		diff_tsc = cur_tsc - prev_tsc;
1336 		if (unlikely(diff_tsc > drain_tsc)) {
1337 
1338 			if (tx_q->len) {
1339 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1340 
1341 				/*Tx any packets in the queue*/
1342 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1343 									   (struct rte_mbuf **)tx_q->m_table,
1344 									   (uint16_t)tx_q->len);
1345 				if (unlikely(ret < tx_q->len)) {
1346 					do {
1347 						rte_pktmbuf_free(tx_q->m_table[ret]);
1348 					} while (++ret < tx_q->len);
1349 				}
1350 
1351 				tx_q->len = 0;
1352 			}
1353 
1354 			prev_tsc = cur_tsc;
1355 
1356 		}
1357 
1358 		rte_prefetch0(lcore_ll->ll_root_used);
1359 		/*
1360 		 * Inform the configuration core that we have exited the linked list and that no devices are
1361 		 * in use if requested.
1362 		 */
1363 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1364 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1365 
1366 		/*
1367 		 * Process devices
1368 		 */
1369 		dev_ll = lcore_ll->ll_root_used;
1370 
1371 		while (dev_ll != NULL) {
1372 			/*get virtio device ID*/
1373 			vdev = dev_ll->vdev;
1374 			dev = vdev->dev;
1375 
1376 			if (unlikely(vdev->remove)) {
1377 				dev_ll = dev_ll->next;
1378 				unlink_vmdq(vdev);
1379 				vdev->ready = DEVICE_SAFE_REMOVE;
1380 				continue;
1381 			}
1382 			if (likely(vdev->ready == DEVICE_RX)) {
1383 				/*Handle guest RX*/
1384 				rx_count = rte_eth_rx_burst(ports[0],
1385 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1386 
1387 				if (rx_count) {
1388 					/*
1389 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1390 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1391 					*/
1392 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1393 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1394 							rte_delay_us(burst_rx_delay_time);
1395 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1396 								break;
1397 						}
1398 					}
1399 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1400 					if (enable_stats) {
1401 						rte_atomic64_add(
1402 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1403 						rx_count);
1404 						rte_atomic64_add(
1405 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1406 					}
1407 					while (likely(rx_count)) {
1408 						rx_count--;
1409 						rte_pktmbuf_free(pkts_burst[rx_count]);
1410 					}
1411 
1412 				}
1413 			}
1414 
1415 			if (likely(!vdev->remove)) {
1416 				/* Handle guest TX*/
1417 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1418 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1419 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1420 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1421 						while (tx_count)
1422 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1423 					}
1424 				}
1425 				for (i = 0; i < tx_count; ++i)
1426 					virtio_tx_route(vdev, pkts_burst[i], (uint16_t)dev->device_fh);
1427 			}
1428 
1429 			/*move to the next device in the list*/
1430 			dev_ll = dev_ll->next;
1431 		}
1432 	}
1433 
1434 	return 0;
1435 }
1436 
1437 /*
1438  * This function gets the number of available ring entries for zero copy RX.
1439  * Only one thread will call this function for a particular virtio device,
1440  * so it is designed as a non-thread-safe function.
1441  */
1442 static inline uint32_t __attribute__((always_inline))
1443 get_available_ring_num_zcp(struct virtio_net *dev)
1444 {
1445 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1446 	uint16_t avail_idx;
1447 
1448 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1449 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1450 }
1451 
1452 /*
1453  * This function gets available ring indexes for zero copy RX;
1454  * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
1455  * Only one thread will call this function for a particular virtio device,
1456  * so it is designed as a non-thread-safe function.
1457  */
1458 static inline uint32_t __attribute__((always_inline))
1459 get_available_ring_index_zcp(struct virtio_net *dev,
1460 	uint16_t *res_base_idx, uint32_t count)
1461 {
1462 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1463 	uint16_t avail_idx;
1464 	uint32_t retry = 0;
1465 	uint16_t free_entries;
1466 
1467 	*res_base_idx = vq->last_used_idx_res;
1468 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1469 	free_entries = (avail_idx - *res_base_idx);
1470 
1471 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1472 			"avail idx: %d, "
1473 			"res base idx:%d, free entries:%d\n",
1474 			dev->device_fh, avail_idx, *res_base_idx,
1475 			free_entries);
1476 
1477 	/*
1478 	 * If retry is enabled and the queue is full then we wait
1479 	 * and retry to avoid packet loss.
1480 	 */
1481 	if (enable_retry && unlikely(count > free_entries)) {
1482 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1483 			rte_delay_us(burst_rx_delay_time);
1484 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1485 			free_entries = (avail_idx - *res_base_idx);
1486 			if (count <= free_entries)
1487 				break;
1488 		}
1489 	}
1490 
1491 	/*check that we have enough buffers*/
1492 	if (unlikely(count > free_entries))
1493 		count = free_entries;
1494 
1495 	if (unlikely(count == 0)) {
1496 		LOG_DEBUG(VHOST_DATA,
1497 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1498 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1499 			dev->device_fh, avail_idx,
1500 			*res_base_idx, free_entries);
1501 		return 0;
1502 	}
1503 
1504 	vq->last_used_idx_res = *res_base_idx + count;
1505 
1506 	return count;
1507 }
1508 
1509 /*
1510  * This function puts a descriptor back on the used list.
1511  */
1512 static inline void __attribute__((always_inline))
1513 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1514 {
1515 	uint16_t res_cur_idx = vq->last_used_idx;
1516 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1517 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
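	/* Make the used-ring entry visible before the guest observes the updated used->idx. */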
1518 	rte_compiler_barrier();
1519 	*(volatile uint16_t *)&vq->used->idx += 1;
1520 	vq->last_used_idx += 1;
1521 
1522 	/* Kick the guest if necessary. */
1523 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1524 		eventfd_write(vq->callfd, (eventfd_t)1);
1525 }
1526 
1527 /*
1528  * This function gets an available descriptor from the virtio vring and an
1529  * unattached mbuf from vpool->ring, and then attaches them together. It must
1530  * adjust the offsets of buff_addr and phys_addr according to the PMD
1531  * implementation, otherwise the frame data may land at the wrong place in the mbuf.
1532  */
1533 static inline void __attribute__((always_inline))
1534 attach_rxmbuf_zcp(struct virtio_net *dev)
1535 {
1536 	uint16_t res_base_idx, desc_idx;
1537 	uint64_t buff_addr, phys_addr;
1538 	struct vhost_virtqueue *vq;
1539 	struct vring_desc *desc;
1540 	void *obj = NULL;
1541 	struct rte_mbuf *mbuf;
1542 	struct vpool *vpool;
1543 	hpa_type addr_type;
1544 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1545 
1546 	vpool = &vpool_array[vdev->vmdq_rx_q];
1547 	vq = dev->virtqueue[VIRTIO_RXQ];
1548 
1549 	do {
1550 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1551 				1) != 1))
1552 			return;
1553 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1554 
1555 		desc = &vq->desc[desc_idx];
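		/* With chained descriptors the frame data lives in the next descriptor; otherwise it follows the virtio-net header in this one. */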
1556 		if (desc->flags & VRING_DESC_F_NEXT) {
1557 			desc = &vq->desc[desc->next];
1558 			buff_addr = gpa_to_vva(dev, desc->addr);
1559 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1560 					&addr_type);
1561 		} else {
1562 			buff_addr = gpa_to_vva(dev,
1563 					desc->addr + vq->vhost_hlen);
1564 			phys_addr = gpa_to_hpa(vdev,
1565 					desc->addr + vq->vhost_hlen,
1566 					desc->len, &addr_type);
1567 		}
1568 
1569 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1570 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1571 				" address found when attaching RX frame buffer"
1572 				" address!\n", dev->device_fh);
1573 			put_desc_to_used_list_zcp(vq, desc_idx);
1574 			continue;
1575 		}
1576 
1577 		/*
1578 		 * Check if the frame buffer address from guest crosses
1579 		 * sub-region or not.
1580 		 */
1581 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1582 			RTE_LOG(ERR, VHOST_DATA,
1583 				"(%"PRIu64") Frame buffer address crossing a "
1584 				"sub-region found when attaching RX frame "
1585 				"buffer address!\n",
1586 				dev->device_fh);
1587 			put_desc_to_used_list_zcp(vq, desc_idx);
1588 			continue;
1589 		}
1590 	} while (unlikely(phys_addr == 0));
1591 
1592 	rte_ring_sc_dequeue(vpool->ring, &obj);
1593 	mbuf = obj;
1594 	if (unlikely(mbuf == NULL)) {
1595 		LOG_DEBUG(VHOST_DATA,
1596 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1597 			"ring_sc_dequeue fail.\n",
1598 			dev->device_fh);
1599 		put_desc_to_used_list_zcp(vq, desc_idx);
1600 		return;
1601 	}
1602 
1603 	if (unlikely(vpool->buf_size > desc->len)) {
1604 		LOG_DEBUG(VHOST_DATA,
1605 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1606 			"length(%d) of descriptor idx: %d less than room "
1607 			"size required: %d\n",
1608 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1609 		put_desc_to_used_list_zcp(vq, desc_idx);
1610 		rte_ring_sp_enqueue(vpool->ring, obj);
1611 		return;
1612 	}
1613 
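	/* Re-point the mbuf at the guest buffer, rewinding by the headroom so data_off lands on the frame data. */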
1614 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1615 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1616 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1617 	mbuf->data_len = desc->len;
1618 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1619 
1620 	LOG_DEBUG(VHOST_DATA,
1621 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1622 		"descriptor idx:%d\n",
1623 		dev->device_fh, res_base_idx, desc_idx);
1624 
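	/* Return the guest-backed mbuf to the RX mempool so the NIC refills its RX ring with it and DMAs straight into the guest buffer. */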
1625 	__rte_mbuf_raw_free(mbuf);
1626 
1627 	return;
1628 }
1629 
1630 /*
1631  * Detach an attached packet mbuf -
1632  *  - restore original mbuf address and length values.
1633  *  - reset pktmbuf data and data_len to their default values.
1634  *  All other fields of the given packet mbuf will be left intact.
1635  *
1636  * @param m
1637  *   The attached packet mbuf.
1638  */
1639 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1640 {
1641 	const struct rte_mempool *mp = m->pool;
1642 	void *buf = rte_mbuf_to_baddr(m);
1643 	uint32_t buf_ofs;
1644 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1645 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1646 
1647 	m->buf_addr = buf;
1648 	m->buf_len = (uint16_t)buf_len;
1649 
1650 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1651 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1652 	m->data_off = buf_ofs;
1653 
1654 	m->data_len = 0;
1655 }
1656 
1657 /*
1658  * This function is called after packets have been transmitted. It fetches mbufs
1659  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1660  * the used index and kicks the guest if necessary.
1661  */
1662 static inline uint32_t __attribute__((always_inline))
1663 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1664 {
1665 	struct rte_mbuf *mbuf;
1666 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1667 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1668 	uint32_t index = 0;
1669 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1670 
1671 	LOG_DEBUG(VHOST_DATA,
1672 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1673 		"clean is: %d\n",
1674 		dev->device_fh, mbuf_count);
1675 	LOG_DEBUG(VHOST_DATA,
1676 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1677 		"clean  is : %d\n",
1678 		dev->device_fh, rte_ring_count(vpool->ring));
1679 
1680 	for (index = 0; index < mbuf_count; index++) {
1681 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1682 		if (likely(MBUF_EXT_MEM(mbuf)))
1683 			pktmbuf_detach_zcp(mbuf);
1684 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1685 
1686 		/* Update used index buffer information. */
1687 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1688 		vq->used->ring[used_idx].len = 0;
1689 
1690 		used_idx = (used_idx + 1) & (vq->size - 1);
1691 	}
1692 
1693 	LOG_DEBUG(VHOST_DATA,
1694 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1695 		"clean is: %d\n",
1696 		dev->device_fh, rte_mempool_count(vpool->pool));
1697 	LOG_DEBUG(VHOST_DATA,
1698 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1699 		"clean  is : %d\n",
1700 		dev->device_fh, rte_ring_count(vpool->ring));
1701 	LOG_DEBUG(VHOST_DATA,
1702 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1703 		"vq->last_used_idx:%d\n",
1704 		dev->device_fh, vq->last_used_idx);
1705 
1706 	vq->last_used_idx += mbuf_count;
1707 
1708 	LOG_DEBUG(VHOST_DATA,
1709 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1710 		"vq->last_used_idx:%d\n",
1711 		dev->device_fh, vq->last_used_idx);
1712 
1713 	rte_compiler_barrier();
1714 
1715 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1716 
1717 	/* Kick guest if required. */
1718 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1719 		eventfd_write(vq->callfd, (eventfd_t)1);
1720 
1721 	return 0;
1722 }
1723 
1724 /*
1725  * This function is called when a virtio device is destroyed.
1726  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1727  */
1728 static void mbuf_destroy_zcp(struct vpool *vpool)
1729 {
1730 	struct rte_mbuf *mbuf = NULL;
1731 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1732 
1733 	LOG_DEBUG(VHOST_CONFIG,
1734 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1735 		"mbuf_destroy_zcp is: %d\n",
1736 		mbuf_count);
1737 	LOG_DEBUG(VHOST_CONFIG,
1738 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1739 		"mbuf_destroy_zcp  is : %d\n",
1740 		rte_ring_count(vpool->ring));
1741 
1742 	for (index = 0; index < mbuf_count; index++) {
1743 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1744 		if (likely(mbuf != NULL)) {
1745 			if (likely(MBUF_EXT_MEM(mbuf)))
1746 				pktmbuf_detach_zcp(mbuf);
1747 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1748 		}
1749 	}
1750 
1751 	LOG_DEBUG(VHOST_CONFIG,
1752 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1753 		"mbuf_destroy_zcp is: %d\n",
1754 		rte_mempool_count(vpool->pool));
1755 	LOG_DEBUG(VHOST_CONFIG,
1756 		"in mbuf_destroy_zcp: mbuf count in ring after "
1757 		"mbuf_destroy_zcp is : %d\n",
1758 		rte_ring_count(vpool->ring));
1759 }
1760 
1761 /*
1762  * This function completes zero copy RX: it writes the virtio headers, updates the used ring and kicks the guest if necessary.
1763  */
1764 static inline uint32_t __attribute__((always_inline))
1765 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1766 	uint32_t count)
1767 {
1768 	struct vhost_virtqueue *vq;
1769 	struct vring_desc *desc;
1770 	struct rte_mbuf *buff;
1771 	/* The virtio_hdr is initialised to 0. */
1772 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1773 		= {{0, 0, 0, 0, 0, 0}, 0};
1774 	uint64_t buff_hdr_addr = 0;
1775 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1776 	uint32_t head_idx, packet_success = 0;
1777 	uint16_t res_cur_idx;
1778 
1779 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1780 
1781 	if (count == 0)
1782 		return 0;
1783 
1784 	vq = dev->virtqueue[VIRTIO_RXQ];
1785 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1786 
1787 	res_cur_idx = vq->last_used_idx;
1788 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1789 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1790 
1791 	/* Retrieve all of the head indexes first to avoid caching issues. */
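	/*
	 * In zero copy mode the head indexes come from each mbuf's headroom
	 * (stashed there by attach_rxmbuf_zcp) rather than from the avail
	 * ring, since the descriptors were already reserved at attach time.
	 */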
1792 	for (head_idx = 0; head_idx < count; head_idx++)
1793 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1794 
1795 	/*Prefetch descriptor index. */
1796 	rte_prefetch0(&vq->desc[head[packet_success]]);
1797 
1798 	while (packet_success != count) {
1799 		/* Get descriptor from available ring */
1800 		desc = &vq->desc[head[packet_success]];
1801 
1802 		buff = pkts[packet_success];
1803 		LOG_DEBUG(VHOST_DATA,
1804 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1805 			"pkt[%d] descriptor idx: %d\n",
1806 			dev->device_fh, packet_success,
1807 			MBUF_HEADROOM_UINT32(buff));
1808 
1809 		PRINT_PACKET(dev,
1810 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1811 			+ RTE_PKTMBUF_HEADROOM),
1812 			rte_pktmbuf_data_len(buff), 0);
1813 
1814 		/* Buffer address translation for virtio header. */
1815 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1816 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1817 
1818 		/*
1819 		 * If the descriptors are chained the header and data are
1820 		 * placed in separate buffers.
1821 		 */
1822 		if (desc->flags & VRING_DESC_F_NEXT) {
1823 			desc->len = vq->vhost_hlen;
1824 			desc = &vq->desc[desc->next];
1825 			desc->len = rte_pktmbuf_data_len(buff);
1826 		} else {
1827 			desc->len = packet_len;
1828 		}
1829 
1830 		/* Update used ring with desc information */
1831 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1832 			= head[packet_success];
1833 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1834 			= packet_len;
1835 		res_cur_idx++;
1836 		packet_success++;
1837 
1838 		/* A header is required per buffer. */
1839 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1840 			(const void *)&virtio_hdr, vq->vhost_hlen);
1841 
1842 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1843 
1844 		if (likely(packet_success < count)) {
1845 			/* Prefetch descriptor index. */
1846 			rte_prefetch0(&vq->desc[head[packet_success]]);
1847 		}
1848 	}
1849 
1850 	rte_compiler_barrier();
1851 
1852 	LOG_DEBUG(VHOST_DATA,
1853 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1854 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1855 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1856 
1857 	*(volatile uint16_t *)&vq->used->idx += count;
1858 	vq->last_used_idx += count;
1859 
1860 	LOG_DEBUG(VHOST_DATA,
1861 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1862 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1863 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1864 
1865 	/* Kick the guest if necessary. */
1866 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1867 		eventfd_write(vq->callfd, (eventfd_t)1);
1868 
1869 	return count;
1870 }
1871 
1872 /*
1873  * This function routes the TX packet to the correct interface.
1874  * This may be a local device or the physical port.
1875  */
1876 static inline void __attribute__((always_inline))
1877 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1878 	uint32_t desc_idx, uint8_t need_copy)
1879 {
1880 	struct mbuf_table *tx_q;
1881 	struct rte_mbuf **m_table;
1882 	void *obj = NULL;
1883 	struct rte_mbuf *mbuf;
1884 	unsigned len, ret, offset = 0;
1885 	struct vpool *vpool;
1886 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1887 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1888 
1889 	/*Add packet to the port tx queue*/
1890 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1891 	len = tx_q->len;
1892 
1893 	/* Allocate an mbuf and populate the structure. */
1894 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1895 	rte_ring_sc_dequeue(vpool->ring, &obj);
1896 	mbuf = obj;
1897 	if (unlikely(mbuf == NULL)) {
1898 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1899 		RTE_LOG(ERR, VHOST_DATA,
1900 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1901 			dev->device_fh);
1902 		put_desc_to_used_list_zcp(vq, desc_idx);
1903 		return;
1904 	}
1905 
1906 	if (vm2vm_mode == VM2VM_HARDWARE) {
1907 		/* Avoid using a VLAN tag from any VM for an external packet,
1908 		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1909 		 * during pool selection: the MAC address identifies it as an
1910 		 * external packet that should go out to the network, while the
1911 		 * VLAN tag identifies it as a VM2VM packet to forward to another
1912 		 * VM. The hardware cannot resolve the ambiguity, so the packet is lost.
1913 		 */
1914 		vlan_tag = external_pkt_default_vlan_tag;
1915 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1916 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1917 			__rte_mbuf_raw_free(mbuf);
1918 			return;
1919 		}
1920 	}
1921 
1922 	mbuf->nb_segs = m->nb_segs;
1923 	mbuf->next = m->next;
1924 	mbuf->data_len = m->data_len + offset;
1925 	mbuf->pkt_len = mbuf->data_len;
1926 	if (unlikely(need_copy)) {
1927 		/* Copy the packet contents to the mbuf. */
1928 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1929 			rte_pktmbuf_mtod(m, void *),
1930 			m->data_len);
1931 	} else {
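		/*
		 * Zero copy: the new mbuf simply aliases the guest buffer
		 * described by the dummy mbuf, so the NIC transmits directly
		 * from guest memory.
		 */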
1932 		mbuf->data_off = m->data_off;
1933 		mbuf->buf_physaddr = m->buf_physaddr;
1934 		mbuf->buf_addr = m->buf_addr;
1935 	}
1936 	mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1937 	mbuf->vlan_tci = vlan_tag;
1938 	mbuf->l2_len = sizeof(struct ether_hdr);
1939 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1940 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1941 
1942 	tx_q->m_table[len] = mbuf;
1943 	len++;
1944 
1945 	LOG_DEBUG(VHOST_DATA,
1946 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1947 		dev->device_fh,
1948 		mbuf->nb_segs,
1949 		(mbuf->next == NULL) ? "null" : "non-null");
1950 
1951 	if (enable_stats) {
1952 		dev_statistics[dev->device_fh].tx_total++;
1953 		dev_statistics[dev->device_fh].tx++;
1954 	}
1955 
1956 	if (unlikely(len == MAX_PKT_BURST)) {
1957 		m_table = (struct rte_mbuf **)tx_q->m_table;
1958 		ret = rte_eth_tx_burst(ports[0],
1959 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1960 
1961 		/*
1962 		 * Free any buffers not handled by TX and update
1963 		 * the port stats.
1964 		 */
1965 		if (unlikely(ret < len)) {
1966 			do {
1967 				rte_pktmbuf_free(m_table[ret]);
1968 			} while (++ret < len);
1969 		}
1970 
1971 		len = 0;
1972 		txmbuf_clean_zcp(dev, vpool);
1973 	}
1974 
1975 	tx_q->len = len;
1976 
1977 	return;
1978 }
1979 
1980 /*
1981  * This function transmits all available packets in the virtio TX queue for
1982  * one virtio-net device. If it is the first packet, it learns the MAC
1983  * address and sets up the VMDQ queue.
1984  */
1985 static inline void __attribute__((always_inline))
1986 virtio_dev_tx_zcp(struct virtio_net *dev)
1987 {
1988 	struct rte_mbuf m;
1989 	struct vhost_virtqueue *vq;
1990 	struct vring_desc *desc;
1991 	uint64_t buff_addr = 0, phys_addr;
1992 	uint32_t head[MAX_PKT_BURST];
1993 	uint32_t i;
1994 	uint16_t free_entries, packet_success = 0;
1995 	uint16_t avail_idx;
1996 	uint8_t need_copy = 0;
1997 	hpa_type addr_type;
1998 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1999 
2000 	vq = dev->virtqueue[VIRTIO_TXQ];
2001 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2002 
2003 	/* If there are no available buffers then return. */
2004 	if (vq->last_used_idx_res == avail_idx)
2005 		return;
2006 
2007 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
2008 
2009 	/* Prefetch available ring to retrieve head indexes. */
2010 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2011 
2012 	/* Get the number of free entries in the ring */
2013 	free_entries = (avail_idx - vq->last_used_idx_res);
2014 
2015 	/* Limit to MAX_PKT_BURST. */
2016 	free_entries
2017 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2018 
2019 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2020 		dev->device_fh, free_entries);
2021 
2022 	/* Retrieve all of the head indexes first to avoid caching issues. */
2023 	for (i = 0; i < free_entries; i++)
2024 		head[i]
2025 			= vq->avail->ring[(vq->last_used_idx_res + i)
2026 			& (vq->size - 1)];
2027 
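	/*
	 * Reserve these avail entries now; the used ring entries themselves
	 * are normally filled in later, by txmbuf_clean_zcp() once the
	 * packets have been transmitted.
	 */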
2028 	vq->last_used_idx_res += free_entries;
2029 
2030 	/* Prefetch descriptor index. */
2031 	rte_prefetch0(&vq->desc[head[packet_success]]);
2032 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2033 
2034 	while (packet_success < free_entries) {
2035 		desc = &vq->desc[head[packet_success]];
2036 
2037 		/* Discard first buffer as it is the virtio header */
2038 		desc = &vq->desc[desc->next];
2039 
2040 		/* Buffer address translation. */
2041 		buff_addr = gpa_to_vva(dev, desc->addr);
2042 		/* Need to account for the extra VLAN_HLEN bytes for VLAN tag insertion */
2043 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2044 			&addr_type);
2045 
2046 		if (likely(packet_success < (free_entries - 1)))
2047 			/* Prefetch descriptor index. */
2048 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2049 
2050 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2051 			RTE_LOG(ERR, VHOST_DATA,
2052 				"(%"PRIu64") Invalid frame buffer address found "
2053 				"when transmitting packets!\n",
2054 				dev->device_fh);
2055 			packet_success++;
2056 			continue;
2057 		}
2058 
2059 		/* Prefetch buffer address. */
2060 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2061 
2062 		/*
2063 		 * Setup dummy mbuf. This is copied to a real mbuf if
2064 		 * transmitted out the physical port.
2065 		 */
2066 		m.data_len = desc->len;
2067 		m.nb_segs = 1;
2068 		m.next = NULL;
2069 		m.data_off = 0;
2070 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2071 		m.buf_physaddr = phys_addr;
2072 
2073 		/*
2074 		 * Check if the frame buffer address from guest crosses
2075 		 * Check whether the frame buffer address from the guest crosses
2076 		 * a sub-region boundary.
2077 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2078 			RTE_LOG(ERR, VHOST_DATA,
2079 				"(%"PRIu64") Frame buffer address crossing "
2080 				"sub-region found when attaching TX frame "
2081 				"buffer address!\n",
2082 				dev->device_fh);
2083 			need_copy = 1;
2084 		} else
2085 			need_copy = 0;
2086 
2087 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2088 
2089 		/*
2090 		 * If this is the first received packet we need to learn
2091 		 * the MAC address and set up VMDQ
2092 		 */
2093 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2094 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2095 				/*
2096 				 * Discard frame if device is scheduled for
2097 				 * removal or a duplicate MAC address is found.
2098 				 */
2099 				packet_success += free_entries;
2100 				vq->last_used_idx += packet_success;
2101 				break;
2102 			}
2103 		}
2104 
2105 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2106 		packet_success++;
2107 	}
2108 }
2109 
2110 /*
2111  * This function is called by each data core. It handles all RX/TX registered
2112  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2113  * addresses are compared with all devices in the main linked list.
2114  */
2115 static int
2116 switch_worker_zcp(__attribute__((unused)) void *arg)
2117 {
2118 	struct virtio_net *dev = NULL;
2119 	struct vhost_dev  *vdev = NULL;
2120 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2121 	struct virtio_net_data_ll *dev_ll;
2122 	struct mbuf_table *tx_q;
2123 	volatile struct lcore_ll_info *lcore_ll;
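	/* Convert the TX drain interval (BURST_TX_DRAIN_US) into TSC cycles. */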
2124 	const uint64_t drain_tsc
2125 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2126 		* BURST_TX_DRAIN_US;
2127 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2128 	unsigned ret;
2129 	const uint16_t lcore_id = rte_lcore_id();
2130 	uint16_t count_in_ring, rx_count = 0;
2131 
2132 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2133 
2134 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2135 	prev_tsc = 0;
2136 
2137 	while (1) {
2138 		cur_tsc = rte_rdtsc();
2139 
2140 		/* TX burst queue drain */
2141 		diff_tsc = cur_tsc - prev_tsc;
2142 		if (unlikely(diff_tsc > drain_tsc)) {
2143 			/*
2144 			 * Get mbufs from vpool.pool, detach them and
2145 			 * put them back into vpool.ring.
2146 			 */
2147 			dev_ll = lcore_ll->ll_root_used;
2148 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2149 				/* Get virtio device ID */
2150 				vdev = dev_ll->vdev;
2151 				dev = vdev->dev;
2152 
2153 				if (likely(!vdev->remove)) {
2154 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2155 					if (tx_q->len) {
2156 						LOG_DEBUG(VHOST_DATA,
2157 						"TX queue drained after timeout"
2158 						" with burst size %u\n",
2159 						tx_q->len);
2160 
2161 						/*
2162 						 * Tx any packets in the queue
2163 						 */
2164 						ret = rte_eth_tx_burst(
2165 							ports[0],
2166 							(uint16_t)tx_q->txq_id,
2167 							(struct rte_mbuf **)
2168 							tx_q->m_table,
2169 							(uint16_t)tx_q->len);
2170 						if (unlikely(ret < tx_q->len)) {
2171 							do {
2172 								rte_pktmbuf_free(
2173 									tx_q->m_table[ret]);
2174 							} while (++ret < tx_q->len);
2175 						}
2176 						tx_q->len = 0;
2177 
2178 						txmbuf_clean_zcp(dev,
2179 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2180 					}
2181 				}
2182 				dev_ll = dev_ll->next;
2183 			}
2184 			prev_tsc = cur_tsc;
2185 		}
2186 
2187 		rte_prefetch0(lcore_ll->ll_root_used);
2188 
2189 		/*
2190 		 * Inform the configuration core that we have exited the linked
2191 		 * list and that no devices are in use if requested.
2192 		 */
2193 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2194 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2195 
2196 		/* Process devices */
2197 		dev_ll = lcore_ll->ll_root_used;
2198 
2199 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2200 			vdev = dev_ll->vdev;
2201 			dev  = vdev->dev;
2202 			if (unlikely(vdev->remove)) {
2203 				dev_ll = dev_ll->next;
2204 				unlink_vmdq(vdev);
2205 				vdev->ready = DEVICE_SAFE_REMOVE;
2206 				continue;
2207 			}
2208 
2209 			if (likely(vdev->ready == DEVICE_RX)) {
2210 				uint32_t index = vdev->vmdq_rx_q;
2211 				uint16_t i;
2212 				count_in_ring =
2213 					rte_ring_count(vpool_array[index].ring);
2214 				uint16_t free_entries = (uint16_t)
2215 					get_available_ring_num_zcp(dev);
2216 
2217 				/*
2218 				 * Attach all mbufs in vpool.ring and put back
2219 				 * into vpool.pool.
2220 				 */
2221 				for (i = 0;
2222 				i < RTE_MIN(free_entries,
2223 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2224 				i++)
2225 					attach_rxmbuf_zcp(dev);
2226 
2227 				/* Handle guest RX */
2228 				rx_count = rte_eth_rx_burst(ports[0],
2229 					vdev->vmdq_rx_q, pkts_burst,
2230 					MAX_PKT_BURST);
2231 
2232 				if (rx_count) {
2233 					ret_count = virtio_dev_rx_zcp(dev,
2234 							pkts_burst, rx_count);
2235 					if (enable_stats) {
2236 						dev_statistics[dev->device_fh].rx_total
2237 							+= rx_count;
2238 						dev_statistics[dev->device_fh].rx
2239 							+= ret_count;
2240 					}
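					/*
					 * Hand the guest buffers back: detach
					 * each mbuf and recycle it to
					 * vpool.ring so it can be re-attached
					 * to a new descriptor.
					 */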
2241 					while (likely(rx_count)) {
2242 						rx_count--;
2243 						pktmbuf_detach_zcp(
2244 							pkts_burst[rx_count]);
2245 						rte_ring_sp_enqueue(
2246 							vpool_array[index].ring,
2247 							(void *)pkts_burst[rx_count]);
2248 					}
2249 				}
2250 			}
2251 
2252 			if (likely(!vdev->remove))
2253 				/* Handle guest TX */
2254 				virtio_dev_tx_zcp(dev);
2255 
2256 			/* Move to the next device in the list */
2257 			dev_ll = dev_ll->next;
2258 		}
2259 	}
2260 
2261 	return 0;
2262 }
2263 
2264 
2265 /*
2266  * Add an entry to a used linked list. A free entry must first be found
2267  * in the free linked list using get_data_ll_free_entry();
2268  */
2269 static void
2270 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2271 	struct virtio_net_data_ll *ll_dev)
2272 {
2273 	struct virtio_net_data_ll *ll = *ll_root_addr;
2274 
2275 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2276 	ll_dev->next = NULL;
2277 	rte_compiler_barrier();
2278 
2279 	/* If ll == NULL then this is the first device. */
2280 	if (ll) {
2281 		/* Increment to the tail of the linked list. */
2282 		while ((ll->next != NULL) )
2283 			ll = ll->next;
2284 
2285 		ll->next = ll_dev;
2286 	} else {
2287 		*ll_root_addr = ll_dev;
2288 	}
2289 }
2290 
2291 /*
2292  * Remove an entry from a used linked list. The entry must then be added to
2293  * the free linked list using put_data_ll_free_entry().
2294  */
2295 static void
2296 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2297 	struct virtio_net_data_ll *ll_dev,
2298 	struct virtio_net_data_ll *ll_dev_last)
2299 {
2300 	struct virtio_net_data_ll *ll = *ll_root_addr;
2301 
2302 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2303 		return;
2304 
2305 	if (ll_dev == ll)
2306 		*ll_root_addr = ll_dev->next;
2307 	else
2308 		if (likely(ll_dev_last != NULL))
2309 			ll_dev_last->next = ll_dev->next;
2310 		else
2311 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2312 }
2313 
2314 /*
2315  * Find and return an entry from the free linked list.
2316  */
2317 static struct virtio_net_data_ll *
2318 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2319 {
2320 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2321 	struct virtio_net_data_ll *ll_dev;
2322 
2323 	if (ll_free == NULL)
2324 		return NULL;
2325 
2326 	ll_dev = ll_free;
2327 	*ll_root_addr = ll_free->next;
2328 
2329 	return ll_dev;
2330 }
2331 
2332 /*
2333  * Place an entry back on to the free linked list.
2334  */
2335 static void
2336 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2337 	struct virtio_net_data_ll *ll_dev)
2338 {
2339 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2340 
2341 	if (ll_dev == NULL)
2342 		return;
2343 
2344 	ll_dev->next = ll_free;
2345 	*ll_root_addr = ll_dev;
2346 }
2347 
2348 /*
2349  * Creates a linked list of a given size.
2350  */
2351 static struct virtio_net_data_ll *
2352 alloc_data_ll(uint32_t size)
2353 {
2354 	struct virtio_net_data_ll *ll_new;
2355 	uint32_t i;
2356 
2357 	/* Malloc and then chain the linked list. */
2358 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2359 	if (ll_new == NULL) {
2360 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2361 		return NULL;
2362 	}
2363 
2364 	for (i = 0; i < size - 1; i++) {
2365 		ll_new[i].vdev = NULL;
2366 		ll_new[i].next = &ll_new[i+1];
2367 	}
2368 	ll_new[i].next = NULL;
2369 
2370 	return ll_new;
2371 }
2372 
2373 /*
2374  * Create the main linked list along with each individual core's linked list. A used and a free list
2375  * are created to manage entries.
2376  */
2377 static int
2378 init_data_ll (void)
2379 {
2380 	int lcore;
2381 
2382 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2383 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2384 		if (lcore_info[lcore].lcore_ll == NULL) {
2385 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2386 			return -1;
2387 		}
2388 
2389 		lcore_info[lcore].lcore_ll->device_num = 0;
2390 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2391 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
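		/*
		 * Size each core's free list as the ceiling of
		 * num_devices / num_switching_cores.
		 */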
2392 		if (num_devices % num_switching_cores)
2393 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2394 		else
2395 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2396 	}
2397 
2398 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2399 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2400 
2401 	return 0;
2402 }
2403 
2404 /*
2405  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2406  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2407  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2408  */
2409 static void
2410 destroy_device (volatile struct virtio_net *dev)
2411 {
2412 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2413 	struct virtio_net_data_ll *ll_main_dev_cur;
2414 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2415 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2416 	struct vhost_dev *vdev;
2417 	int lcore;
2418 
2419 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2420 
2421 	vdev = (struct vhost_dev *)dev->priv;
2422 	/*set the remove flag. */
2423 	vdev->remove = 1;
2424 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2425 		rte_pause();
2426 	}
2427 
2428 	/* Search for entry to be removed from lcore ll */
2429 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2430 	while (ll_lcore_dev_cur != NULL) {
2431 		if (ll_lcore_dev_cur->vdev == vdev) {
2432 			break;
2433 		} else {
2434 			ll_lcore_dev_last = ll_lcore_dev_cur;
2435 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2436 		}
2437 	}
2438 
2439 	if (ll_lcore_dev_cur == NULL) {
2440 		RTE_LOG(ERR, VHOST_CONFIG,
2441 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2442 			dev->device_fh);
2443 		return;
2444 	}
2445 
2446 	/* Search for entry to be removed from main ll */
2447 	ll_main_dev_cur = ll_root_used;
2448 	ll_main_dev_last = NULL;
2449 	while (ll_main_dev_cur != NULL) {
2450 		if (ll_main_dev_cur->vdev == vdev) {
2451 			break;
2452 		} else {
2453 			ll_main_dev_last = ll_main_dev_cur;
2454 			ll_main_dev_cur = ll_main_dev_cur->next;
2455 		}
2456 	}
2457 
2458 	/* Remove entries from the lcore and main ll. */
2459 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2460 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2461 
2462 	/* Set the dev_removal_flag on each lcore. */
2463 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2464 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2465 	}
2466 
2467 	/*
2468 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2469 	 * they can no longer access the device removed from the linked lists and that the devices
2470 	 * are no longer in use.
2471 	 */
2472 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2473 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2474 			rte_pause();
2475 		}
2476 	}
2477 
2478 	/* Add the entries back to the lcore and main free ll.*/
2479 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2480 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2481 
2482 	/* Decrement number of device on the lcore. */
2483 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2484 
2485 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2486 
2487 	if (zero_copy) {
2488 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2489 
2490 		/* Stop the RX queue. */
2491 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2492 			LOG_DEBUG(VHOST_CONFIG,
2493 				"(%"PRIu64") In destroy_device: Failed to stop "
2494 				"rx queue:%d\n",
2495 				dev->device_fh,
2496 				vdev->vmdq_rx_q);
2497 		}
2498 
2499 		LOG_DEBUG(VHOST_CONFIG,
2500 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2501 			"mempool back to ring for RX queue: %d\n",
2502 			dev->device_fh, vdev->vmdq_rx_q);
2503 
2504 		mbuf_destroy_zcp(vpool);
2505 
2506 		/* Stop the TX queue. */
2507 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2508 			LOG_DEBUG(VHOST_CONFIG,
2509 				"(%"PRIu64") In destroy_device: Failed to "
2510 				"stop tx queue:%d\n",
2511 				dev->device_fh, vdev->vmdq_rx_q);
2512 		}
2513 
2514 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2515 
2516 		LOG_DEBUG(VHOST_CONFIG,
2517 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2518 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2519 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2520 			dev->device_fh);
2521 
2522 		mbuf_destroy_zcp(vpool);
2523 		rte_free(vdev->regions_hpa);
2524 	}
2525 	rte_free(vdev);
2526 
2527 }
2528 
2529 /*
2530  * Calculate the number of physically contiguous sub-regions within one
2531  * particular region whose vhost virtual address range is contiguous. The
2532  * region starts at vva_start and spans 'size' bytes.
2533  */
2534 static uint32_t
2535 check_hpa_regions(uint64_t vva_start, uint64_t size)
2536 {
2537 	uint32_t i, nregions = 0, page_size = getpagesize();
2538 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2539 	if (vva_start % page_size) {
2540 		LOG_DEBUG(VHOST_CONFIG,
2541 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2542 			"has remainder\n",
2543 			(void *)(uintptr_t)vva_start, page_size);
2544 		return 0;
2545 	}
2546 	if (size % page_size) {
2547 		LOG_DEBUG(VHOST_CONFIG,
2548 			"in check_hpa_regions: "
2549 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2550 			size, page_size);
2551 		return 0;
2552 	}
2553 	for (i = 0; i < size - page_size; i = i + page_size) {
2554 		cur_phys_addr
2555 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2556 		next_phys_addr = rte_mem_virt2phy(
2557 			(void *)(uintptr_t)(vva_start + i + page_size));
2558 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2559 			++nregions;
2560 			LOG_DEBUG(VHOST_CONFIG,
2561 				"in check_hpa_regions: hva addr:(%p) is not "
2562 				"contiguous with hva addr:(%p), diff:%d\n",
2563 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2564 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2565 				+ page_size), page_size);
2566 			LOG_DEBUG(VHOST_CONFIG,
2567 				"in check_hpa_regions: hpa addr:(%p) is not "
2568 				"contiguous with hpa addr:(%p), "
2569 				"diff:(%"PRIu64")\n",
2570 				(void *)(uintptr_t)cur_phys_addr,
2571 				(void *)(uintptr_t)next_phys_addr,
2572 				(next_phys_addr-cur_phys_addr));
2573 		}
2574 	}
2575 	return nregions;
2576 }
2577 
2578 /*
2579  * Divide each region whose vhost virtual address range is contiguous into
2580  * sub-regions such that the physical addresses within each sub-region are
2581  * contiguous, and fill the offset (to GPA), size and other information of
2582  * each sub-region into regions_hpa.
2583  */
2584 static uint32_t
2585 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2586 {
2587 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2588 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2589 
2590 	if (mem_region_hpa == NULL)
2591 		return 0;
2592 
2593 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2594 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2595 			virtio_memory->regions[regionidx].address_offset;
2596 		mem_region_hpa[regionidx_hpa].guest_phys_address
2597 			= virtio_memory->regions[regionidx].guest_phys_address;
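		/*
		 * host_phys_addr_offset records HPA minus GPA for this
		 * sub-region, so a guest physical address can later be
		 * translated by simply adding the offset.
		 */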
2598 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2599 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2600 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2601 		LOG_DEBUG(VHOST_CONFIG,
2602 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2603 			regionidx_hpa,
2604 			(void *)(uintptr_t)
2605 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2606 		LOG_DEBUG(VHOST_CONFIG,
2607 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2608 			regionidx_hpa,
2609 			(void *)(uintptr_t)
2610 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2611 		for (i = 0, k = 0;
2612 			i < virtio_memory->regions[regionidx].memory_size -
2613 				page_size;
2614 			i += page_size) {
2615 			cur_phys_addr = rte_mem_virt2phy(
2616 					(void *)(uintptr_t)(vva_start + i));
2617 			next_phys_addr = rte_mem_virt2phy(
2618 					(void *)(uintptr_t)(vva_start +
2619 					i + page_size));
2620 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2621 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2622 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2623 					k + page_size;
2624 				mem_region_hpa[regionidx_hpa].memory_size
2625 					= k + page_size;
2626 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2627 					"phys addr end  [%d]:(%p)\n",
2628 					regionidx_hpa,
2629 					(void *)(uintptr_t)
2630 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2631 				LOG_DEBUG(VHOST_CONFIG,
2632 					"in fill_hpa_regions: guest phys addr "
2633 					"size [%d]:(%p)\n",
2634 					regionidx_hpa,
2635 					(void *)(uintptr_t)
2636 					(mem_region_hpa[regionidx_hpa].memory_size));
2637 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2638 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2639 				++regionidx_hpa;
2640 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2641 					next_phys_addr -
2642 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2643 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2644 					" phys addr start[%d]:(%p)\n",
2645 					regionidx_hpa,
2646 					(void *)(uintptr_t)
2647 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2648 				LOG_DEBUG(VHOST_CONFIG,
2649 					"in fill_hpa_regions: host  phys addr "
2650 					"start[%d]:(%p)\n",
2651 					regionidx_hpa,
2652 					(void *)(uintptr_t)
2653 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2654 				k = 0;
2655 			} else {
2656 				k += page_size;
2657 			}
2658 		}
2659 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2660 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2661 			+ k + page_size;
2662 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2663 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2664 			"[%d]:(%p)\n", regionidx_hpa,
2665 			(void *)(uintptr_t)
2666 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2667 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2668 			"[%d]:(%p)\n", regionidx_hpa,
2669 			(void *)(uintptr_t)
2670 			(mem_region_hpa[regionidx_hpa].memory_size));
2671 		++regionidx_hpa;
2672 	}
2673 	return regionidx_hpa;
2674 }
2675 
2676 /*
2677  * A new device is added to a data core. First the device is added to the main linked list
2678  * and then allocated to a specific data core.
2679  */
2680 static int
2681 new_device (struct virtio_net *dev)
2682 {
2683 	struct virtio_net_data_ll *ll_dev;
2684 	int lcore, core_add = 0;
2685 	uint32_t device_num_min = num_devices;
2686 	struct vhost_dev *vdev;
2687 	uint32_t regionidx;
2688 
2689 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2690 	if (vdev == NULL) {
2691 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2692 			dev->device_fh);
2693 		return -1;
2694 	}
2695 	vdev->dev = dev;
2696 	dev->priv = vdev;
2697 
2698 	if (zero_copy) {
2699 		vdev->nregions_hpa = dev->mem->nregions;
2700 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2701 			vdev->nregions_hpa
2702 				+= check_hpa_regions(
2703 					dev->mem->regions[regionidx].guest_phys_address
2704 					+ dev->mem->regions[regionidx].address_offset,
2705 					dev->mem->regions[regionidx].memory_size);
2706 
2707 		}
2708 
2709 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2710 					       vdev->nregions_hpa,
2711 					       sizeof(struct virtio_memory_regions_hpa),
2712 					       RTE_CACHE_LINE_SIZE);
2713 		if (vdev->regions_hpa == NULL) {
2714 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2715 			rte_free(vdev);
2716 			return -1;
2717 		}
2718 
2719 
2720 		if (fill_hpa_memory_regions(
2721 			vdev->regions_hpa, dev->mem
2722 			) != vdev->nregions_hpa) {
2723 
2724 			RTE_LOG(ERR, VHOST_CONFIG,
2725 				"hpa memory regions number mismatch: "
2726 				"[%d]\n", vdev->nregions_hpa);
2727 			rte_free(vdev->regions_hpa);
2728 			rte_free(vdev);
2729 			return -1;
2730 		}
2731 	}
2732 
2733 
2734 	/* Add device to main ll */
2735 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2736 	if (ll_dev == NULL) {
2737 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2738 			"of %d devices per core has been reached\n",
2739 			dev->device_fh, num_devices);
2740 		if (vdev->regions_hpa)
2741 			rte_free(vdev->regions_hpa);
2742 		rte_free(vdev);
2743 		return -1;
2744 	}
2745 	ll_dev->vdev = vdev;
2746 	add_data_ll_entry(&ll_root_used, ll_dev);
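	/* Each device gets a dedicated VMDq RX queue derived from its device_fh. */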
2747 	vdev->vmdq_rx_q
2748 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2749 
2750 	if (zero_copy) {
2751 		uint32_t index = vdev->vmdq_rx_q;
2752 		uint32_t count_in_ring, i;
2753 		struct mbuf_table *tx_q;
2754 
2755 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2756 
2757 		LOG_DEBUG(VHOST_CONFIG,
2758 			"(%"PRIu64") in new_device: mbuf count in mempool "
2759 			"before attach is: %d\n",
2760 			dev->device_fh,
2761 			rte_mempool_count(vpool_array[index].pool));
2762 		LOG_DEBUG(VHOST_CONFIG,
2763 			"(%"PRIu64") in new_device: mbuf count in  ring "
2764 			"before attach  is : %d\n",
2765 			dev->device_fh, count_in_ring);
2766 
2767 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2768 		 * Attach all mbufs in vpool.ring and put back intovpool.pool.
2769 		 */
2770 		for (i = 0; i < count_in_ring; i++)
2771 			attach_rxmbuf_zcp(dev);
2772 
2773 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2774 			"mempool after attach is: %d\n",
2775 			dev->device_fh,
2776 			rte_mempool_count(vpool_array[index].pool));
2777 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2778 			"ring after attach  is : %d\n",
2779 			dev->device_fh,
2780 			rte_ring_count(vpool_array[index].ring));
2781 
2782 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2783 		tx_q->txq_id = vdev->vmdq_rx_q;
2784 
2785 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2786 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2787 
2788 			LOG_DEBUG(VHOST_CONFIG,
2789 				"(%"PRIu64") In new_device: Failed to start "
2790 				"tx queue:%d\n",
2791 				dev->device_fh, vdev->vmdq_rx_q);
2792 
2793 			mbuf_destroy_zcp(vpool);
2794 			rte_free(vdev->regions_hpa);
2795 			rte_free(vdev);
2796 			return -1;
2797 		}
2798 
2799 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2800 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2801 
2802 			LOG_DEBUG(VHOST_CONFIG,
2803 				"(%"PRIu64") In new_device: Failed to start "
2804 				"rx queue:%d\n",
2805 				dev->device_fh, vdev->vmdq_rx_q);
2806 
2807 			/* Stop the TX queue. */
2808 			if (rte_eth_dev_tx_queue_stop(ports[0],
2809 				vdev->vmdq_rx_q) != 0) {
2810 				LOG_DEBUG(VHOST_CONFIG,
2811 					"(%"PRIu64") In new_device: Failed to "
2812 					"stop tx queue:%d\n",
2813 					dev->device_fh, vdev->vmdq_rx_q);
2814 			}
2815 
2816 			mbuf_destroy_zcp(vpool);
2817 			rte_free(vdev->regions_hpa);
2818 			rte_free(vdev);
2819 			return -1;
2820 		}
2821 
2822 	}
2823 
2824 	/*reset ready flag*/
2825 	vdev->ready = DEVICE_MAC_LEARNING;
2826 	vdev->remove = 0;
2827 
2828 	/* Find a suitable lcore to add the device. */
2829 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2830 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2831 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2832 			core_add = lcore;
2833 		}
2834 	}
2835 	/* Add device to lcore ll */
2836 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2837 	if (ll_dev == NULL) {
2838 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2839 		vdev->ready = DEVICE_SAFE_REMOVE;
2840 		destroy_device(dev);
2841 		rte_free(vdev->regions_hpa);
2842 		rte_free(vdev);
2843 		return -1;
2844 	}
2845 	ll_dev->vdev = vdev;
2846 	vdev->coreid = core_add;
2847 
2848 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2849 
2850 	/* Initialize device stats */
2851 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2852 
2853 	/* Disable notifications. */
2854 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2855 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2856 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2857 	dev->flags |= VIRTIO_DEV_RUNNING;
2858 
2859 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2860 
2861 	return 0;
2862 }
2863 
2864 /*
2865  * These callbacks allow devices to be added to the data core when
2866  * configuration is fully complete.
2867  */
2868 static const struct virtio_net_device_ops virtio_net_device_ops =
2869 {
2870 	.new_device =  new_device,
2871 	.destroy_device = destroy_device,
2872 };
2873 
2874 /*
2875  * This thread wakes up periodically to print stats if the user has
2876  * enabled them.
2877  */
2878 static void
2879 print_stats(void)
2880 {
2881 	struct virtio_net_data_ll *dev_ll;
2882 	uint64_t tx_dropped, rx_dropped;
2883 	uint64_t tx, tx_total, rx, rx_total;
2884 	uint32_t device_fh;
2885 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2886 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2887 
2888 	while(1) {
2889 		sleep(enable_stats);
2890 
2891 		/* Clear screen and move to top left */
2892 		printf("%s%s", clr, top_left);
2893 
2894 		printf("\nDevice statistics ====================================");
2895 
2896 		dev_ll = ll_root_used;
2897 		while (dev_ll != NULL) {
2898 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2899 			tx_total = dev_statistics[device_fh].tx_total;
2900 			tx = dev_statistics[device_fh].tx;
2901 			tx_dropped = tx_total - tx;
2902 			if (zero_copy == 0) {
2903 				rx_total = rte_atomic64_read(
2904 					&dev_statistics[device_fh].rx_total_atomic);
2905 				rx = rte_atomic64_read(
2906 					&dev_statistics[device_fh].rx_atomic);
2907 			} else {
2908 				rx_total = dev_statistics[device_fh].rx_total;
2909 				rx = dev_statistics[device_fh].rx;
2910 			}
2911 			rx_dropped = rx_total - rx;
2912 
2913 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2914 					"\nTX total: 		%"PRIu64""
2915 					"\nTX dropped: 		%"PRIu64""
2916 					"\nTX successful: 		%"PRIu64""
2917 					"\nRX total: 		%"PRIu64""
2918 					"\nRX dropped: 		%"PRIu64""
2919 					"\nRX successful: 		%"PRIu64"",
2920 					device_fh,
2921 					tx_total,
2922 					tx_dropped,
2923 					tx,
2924 					rx_total,
2925 					rx_dropped,
2926 					rx);
2927 
2928 			dev_ll = dev_ll->next;
2929 		}
2930 		printf("\n======================================================\n");
2931 	}
2932 }
2933 
2934 static void
2935 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2936 	char *ring_name, uint32_t nb_mbuf)
2937 {
2938 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2939 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2940 	if (vpool_array[index].pool != NULL) {
2941 		vpool_array[index].ring
2942 			= rte_ring_create(ring_name,
2943 				rte_align32pow2(nb_mbuf + 1),
2944 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2945 		if (likely(vpool_array[index].ring != NULL)) {
2946 			LOG_DEBUG(VHOST_CONFIG,
2947 				"in setup_mempool_tbl: mbuf count in "
2948 				"mempool is: %d\n",
2949 				rte_mempool_count(vpool_array[index].pool));
2950 			LOG_DEBUG(VHOST_CONFIG,
2951 				"in setup_mempool_tbl: mbuf count in "
2952 				"ring   is: %d\n",
2953 				rte_ring_count(vpool_array[index].ring));
2954 		} else {
2955 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2956 				ring_name);
2957 		}
2958 
2959 		/* Need to take the mbuf headroom into account. */
2960 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2961 	} else {
2962 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2963 	}
2964 }
2965 
2966 /* When we receive a SIGINT, unregister the vhost driver */
2967 static void
2968 sigint_handler(__rte_unused int signum)
2969 {
2970 	/* Unregister vhost driver. */
2971 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2972 	if (ret != 0)
2973 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2974 	exit(0);
2975 }
2976 
2977 /*
2978  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2979  * device is also registered here to handle the IOCTLs.
2980  */
2981 int
2982 main(int argc, char *argv[])
2983 {
2984 	struct rte_mempool *mbuf_pool = NULL;
2985 	unsigned lcore_id, core_id = 0;
2986 	unsigned nb_ports, valid_num_ports;
2987 	int ret;
2988 	uint8_t portid;
2989 	uint16_t queue_id;
2990 	static pthread_t tid;
2991 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2992 
2993 	signal(SIGINT, sigint_handler);
2994 
2995 	/* init EAL */
2996 	ret = rte_eal_init(argc, argv);
2997 	if (ret < 0)
2998 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2999 	argc -= ret;
3000 	argv += ret;
3001 
3002 	/* parse app arguments */
3003 	ret = us_vhost_parse_args(argc, argv);
3004 	if (ret < 0)
3005 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
3006 
3007 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
3008 		if (rte_lcore_is_enabled(lcore_id))
3009 			lcore_ids[core_id ++] = lcore_id;
3010 
3011 	if (rte_lcore_count() > RTE_MAX_LCORE)
3012 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
3013 
3014 	/* set the number of switching cores available */
3015 	num_switching_cores = rte_lcore_count()-1;
3016 
3017 	/* Get the number of physical ports. */
3018 	nb_ports = rte_eth_dev_count();
3019 	if (nb_ports > RTE_MAX_ETHPORTS)
3020 		nb_ports = RTE_MAX_ETHPORTS;
3021 
3022 	/*
3023 	 * Update the global var NUM_PORTS and global array PORTS
3024 	 * and get the value of VALID_NUM_PORTS according to the number of system ports
3025 	 */
3026 	valid_num_ports = check_ports_num(nb_ports);
3027 
3028 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3029 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
3030 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3031 		return -1;
3032 	}
3033 
3034 	if (zero_copy == 0) {
3035 		/* Create the mbuf pool. */
3036 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3037 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3038 			0, MBUF_DATA_SIZE, rte_socket_id());
3039 		if (mbuf_pool == NULL)
3040 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3041 
3042 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3043 			vpool_array[queue_id].pool = mbuf_pool;
3044 
3045 		if (vm2vm_mode == VM2VM_HARDWARE) {
3046 			/* Enable VT loop back to let L2 switch to do it. */
3047 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3048 			LOG_DEBUG(VHOST_CONFIG,
3049 				"Enable loop back for L2 switch in vmdq.\n");
3050 		}
3051 	} else {
3052 		uint32_t nb_mbuf;
3053 		char pool_name[RTE_MEMPOOL_NAMESIZE];
3054 		char ring_name[RTE_MEMPOOL_NAMESIZE];
3055 
3056 		nb_mbuf = num_rx_descriptor
3057 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3058 			+ num_switching_cores * MAX_PKT_BURST;
3059 
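		/*
		 * One mempool/ring pair is created per RX queue, sized to
		 * cover the RX descriptors plus per-core cache and burst
		 * slack.
		 */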
3060 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3061 			snprintf(pool_name, sizeof(pool_name),
3062 				"rxmbuf_pool_%u", queue_id);
3063 			snprintf(ring_name, sizeof(ring_name),
3064 				"rxmbuf_ring_%u", queue_id);
3065 			setup_mempool_tbl(rte_socket_id(), queue_id,
3066 				pool_name, ring_name, nb_mbuf);
3067 		}
3068 
3069 		nb_mbuf = num_tx_descriptor
3070 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3071 				+ num_switching_cores * MAX_PKT_BURST;
3072 
3073 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3074 			snprintf(pool_name, sizeof(pool_name),
3075 				"txmbuf_pool_%u", queue_id);
3076 			snprintf(ring_name, sizeof(ring_name),
3077 				"txmbuf_ring_%u", queue_id);
3078 			setup_mempool_tbl(rte_socket_id(),
3079 				(queue_id + MAX_QUEUES),
3080 				pool_name, ring_name, nb_mbuf);
3081 		}
3082 
3083 		if (vm2vm_mode == VM2VM_HARDWARE) {
3084 			/* Enable VT loop back to let L2 switch to do it. */
3085 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3086 			LOG_DEBUG(VHOST_CONFIG,
3087 				"Enable loop back for L2 switch in vmdq.\n");
3088 		}
3089 	}
3090 	/* Set log level. */
3091 	rte_set_log_level(LOG_LEVEL);
3092 
3093 	/* initialize all ports */
3094 	for (portid = 0; portid < nb_ports; portid++) {
3095 		/* skip ports that are not enabled */
3096 		if ((enabled_port_mask & (1 << portid)) == 0) {
3097 			RTE_LOG(INFO, VHOST_PORT,
3098 				"Skipping disabled port %d\n", portid);
3099 			continue;
3100 		}
3101 		if (port_init(portid) != 0)
3102 			rte_exit(EXIT_FAILURE,
3103 				"Cannot initialize network ports\n");
3104 	}
3105 
3106 	/* Initialise all linked lists. */
3107 	if (init_data_ll() == -1)
3108 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3109 
3110 	/* Initialize device stats */
3111 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3112 
3113 	/* Enable stats if the user option is set. */
3114 	if (enable_stats) {
3115 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3116 		if (ret != 0)
3117 			rte_exit(EXIT_FAILURE,
3118 				"Cannot create print-stats thread\n");
3119 
3120 		/* Set thread_name for aid in debugging.  */
3121 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3122 		ret = rte_thread_setname(tid, thread_name);
3123 		if (ret != 0)
3124 			RTE_LOG(ERR, VHOST_CONFIG,
3125 				"Cannot set print-stats name\n");
3126 	}
3127 
3128 	/* Launch all data cores. */
3129 	if (zero_copy == 0) {
3130 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3131 			rte_eal_remote_launch(switch_worker,
3132 				mbuf_pool, lcore_id);
3133 		}
3134 	} else {
3135 		uint32_t count_in_mempool, index, i;
3136 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3137 			/* For all RX and TX queues. */
3138 			count_in_mempool
3139 				= rte_mempool_count(vpool_array[index].pool);
3140 
3141 			/*
3142 			 * Transfer all un-attached mbufs from vpool.pool
3143 			 * to vpool.ring.
3144 			 */
3145 			for (i = 0; i < count_in_mempool; i++) {
3146 				struct rte_mbuf *mbuf
3147 					= __rte_mbuf_raw_alloc(
3148 						vpool_array[index].pool);
3149 				rte_ring_sp_enqueue(vpool_array[index].ring,
3150 						(void *)mbuf);
3151 			}
3152 
3153 			LOG_DEBUG(VHOST_CONFIG,
3154 				"in main: mbuf count in mempool at initial "
3155 				"is: %d\n", count_in_mempool);
3156 			LOG_DEBUG(VHOST_CONFIG,
3157 				"in main: mbuf count in  ring at initial  is :"
3158 				" %d\n",
3159 				rte_ring_count(vpool_array[index].ring));
3160 		}
3161 
3162 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3163 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3164 				lcore_id);
3165 	}
3166 
3167 	if (mergeable == 0)
3168 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3169 
3170 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3171 	ret = rte_vhost_driver_register((char *)&dev_basename);
3172 	if (ret != 0)
3173 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3174 
3175 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3176 
3177 	/* Start CUSE session. */
3178 	rte_vhost_driver_session_start();
3179 	return 0;
3180 
3181 }
3182