xref: /dpdk/examples/vhost/main.c (revision db4014f2b65cb31bf209cadd5bcec778ca137fe2)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 256
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
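/*
 * Note: NUM_MBUFS_PER_PORT is expanded at its point of use, so it picks up
 * the runtime value of num_switching_cores set during argument parsing.
 */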
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These two macros need refinement for legacy and DPDK based front ends:
105  * take the max vring avail descriptors/entries from the guest, subtract
106  * MAX_PKT_BURST, and then round to a power of 2.
107  */
108 /*
109  * For legacy front end, 128 descriptors,
110  * half for virtio header, another half for mbuf.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
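/* For example POWEROF2(64) is true and POWEROF2(48) is false; note that 0 also passes this test. */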
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
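/* With a typical 64-byte cache line and a 16-byte vring_desc this works out to 4 descriptors per cache line. */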
141 
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144 
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147 
148 /*Number of switching cores enabled*/
149 static uint32_t num_switching_cores = 0;
150 
151 /* number of devices/queues to support*/
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154 
155 /*
156  * Enable zero copy: guest packet buffers are attached directly to the
157  * hardware descriptors for DMA. Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161 
162 /* Number of RX/TX descriptors to use (zero copy mode only). */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165 
166 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
167 #define MAX_RING_DESC 4096
168 
169 struct vpool {
170 	struct rte_mempool *pool;
171 	struct rte_ring *ring;
172 	uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174 
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177 	VM2VM_DISABLED = 0,
178 	VM2VM_SOFTWARE = 1,
179 	VM2VM_HARDWARE = 2,
180 	VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
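/*
 * VM2VM_SOFTWARE forwards guest-to-guest traffic on the switching core by
 * comparing destination MACs against the known local devices; VM2VM_HARDWARE
 * instead tags the packet with the destination VM's VLAN and relies on the
 * NIC's VMDQ loopback to deliver it to the destination pool.
 */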
183 
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186 	PHYS_ADDR_CONTINUOUS = 0,
187 	PHYS_ADDR_CROSS_SUBREG = 1,
188 	PHYS_ADDR_INVALID = 2,
189 	PHYS_ADDR_LAST
190 } hpa_type;
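/*
 * PHYS_ADDR_CONTINUOUS means the whole guest buffer fits inside one host
 * physical sub-region; PHYS_ADDR_CROSS_SUBREG means it spans a sub-region
 * boundary, which the zero copy paths reject. See gpa_to_hpa().
 */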
191 
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify timeout (in microseconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200 
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203 
204 /* Empty VMDQ configuration structure. Filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206 	.rxmode = {
207 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208 		.split_hdr_size = 0,
209 		.header_split   = 0, /**< Header Split disabled */
210 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
211 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
212 		/*
213 		 * This is necessary for 1G NICs such as the I350; it fixes a bug
214 		 * where IPv4 forwarding in the guest could not forward packets
215 		 * from one virtio device to another.
216 		 */
217 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
220 	},
221 
222 	.txmode = {
223 		.mq_mode = ETH_MQ_TX_NONE,
224 	},
225 	.rx_adv_conf = {
226 		/*
227 		 * should be overridden separately in code with
228 		 * appropriate values
229 		 */
230 		.vmdq_rx_conf = {
231 			.nb_queue_pools = ETH_8_POOLS,
232 			.enable_default_pool = 0,
233 			.default_pool = 0,
234 			.nb_pool_maps = 0,
235 			.pool_map = {{0, 0},},
236 		},
237 	},
238 };
239 
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246 
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
251 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
258 
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261 
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265 
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268 
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271 	unsigned len;
272 	unsigned txq_id;
273 	struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275 
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278 
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281 
282 /* Vlan header struct used to insert vlan tags on TX. */
283 struct vlan_ethhdr {
284 	unsigned char   h_dest[ETH_ALEN];
285 	unsigned char   h_source[ETH_ALEN];
286 	__be16          h_vlan_proto;
287 	__be16          h_vlan_TCI;
288 	__be16          h_vlan_encapsulated_proto;
289 };
290 
291 /* IPv4 Header */
292 struct ipv4_hdr {
293 	uint8_t  version_ihl;		/**< version and header length */
294 	uint8_t  type_of_service;	/**< type of service */
295 	uint16_t total_length;		/**< length of packet */
296 	uint16_t packet_id;		/**< packet ID */
297 	uint16_t fragment_offset;	/**< fragmentation offset */
298 	uint8_t  time_to_live;		/**< time to live */
299 	uint8_t  next_proto_id;		/**< protocol ID */
300 	uint16_t hdr_checksum;		/**< header checksum */
301 	uint32_t src_addr;		/**< source address */
302 	uint32_t dst_addr;		/**< destination address */
303 } __attribute__((__packed__));
304 
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
308 
309 /* Per-device statistics struct */
310 struct device_statistics {
311 	uint64_t tx_total;
312 	rte_atomic64_t rx_total_atomic;
313 	uint64_t rx_total;
314 	uint64_t tx;
315 	rte_atomic64_t rx_atomic;
316 	uint64_t rx;
317 } __rte_cache_aligned;
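/*
 * The rx counters are atomic because any data core may enqueue packets to a
 * device's RX queue when doing VM2VM forwarding.
 */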
318 struct device_statistics dev_statistics[MAX_DEVICES];
319 
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327 	struct rte_eth_vmdq_rx_conf conf;
328 	struct rte_eth_vmdq_rx_conf *def_conf =
329 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330 	unsigned i;
331 
332 	memset(&conf, 0, sizeof(conf));
333 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334 	conf.nb_pool_maps = num_devices;
335 	conf.enable_loop_back = def_conf->enable_loop_back;
336 	conf.rx_mode = def_conf->rx_mode;
337 
338 	for (i = 0; i < conf.nb_pool_maps; i++) {
339 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
340 		conf.pool_map[i].pools = (1UL << i);
341 	}
342 
343 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346 	return 0;
347 }
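/*
 * For example, with num_devices == 8 the resulting map assigns VLAN ID
 * 1000 + i to pool i, giving each virtio device its own VMDQ pool.
 */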
348 
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, print an error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357 	if (num_devices > max_nb_devices) {
358 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359 		return -1;
360 	}
361 	return 0;
362 }
363 
364 /*
365  * Initialises a given port using global settings and with the rx buffers
366  * coming from the mbuf_pool passed as a parameter.
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371 	struct rte_eth_dev_info dev_info;
372 	struct rte_eth_conf port_conf;
373 	struct rte_eth_rxconf *rxconf;
374 	struct rte_eth_txconf *txconf;
375 	int16_t rx_rings, tx_rings;
376 	uint16_t rx_ring_size, tx_ring_size;
377 	int retval;
378 	uint16_t q;
379 
380 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381 	rte_eth_dev_info_get (port, &dev_info);
382 
383 	rxconf = &dev_info.default_rxconf;
384 	txconf = &dev_info.default_txconf;
385 	rxconf->rx_drop_en = 1;
386 
387 	/*
388 	 * Zero copy defers queue RX/TX start to the time when guest
389 	 * finishes its startup and packet buffers from that guest are
390 	 * available.
391 	 */
392 	if (zero_copy) {
393 		rxconf->rx_deferred_start = 1;
394 		rxconf->rx_drop_en = 0;
395 		txconf->tx_deferred_start = 1;
396 	}
397 
398 	/*configure the number of supported virtio devices based on VMDQ limits */
399 	num_devices = dev_info.max_vmdq_pools;
400 
401 	if (zero_copy) {
402 		rx_ring_size = num_rx_descriptor;
403 		tx_ring_size = num_tx_descriptor;
404 		tx_rings = dev_info.max_tx_queues;
405 	} else {
406 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
408 		tx_rings = (uint16_t)rte_lcore_count();
409 	}
410 
411 	retval = validate_num_devices(MAX_DEVICES);
412 	if (retval < 0)
413 		return retval;
414 
415 	/* Get port configuration. */
416 	retval = get_eth_conf(&port_conf, num_devices);
417 	if (retval < 0)
418 		return retval;
419 	/* NIC queues are divided into pf queues and vmdq queues.  */
420 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
421 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
422 	num_vmdq_queues = num_devices * queues_per_pool;
423 	num_queues = num_pf_queues + num_vmdq_queues;
424 	vmdq_queue_base = dev_info.vmdq_queue_base;
425 	vmdq_pool_base  = dev_info.vmdq_pool_base;
426 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
427 		num_pf_queues, num_devices, queues_per_pool);
428 
429 	if (port >= rte_eth_dev_count()) return -1;
430 
431 	rx_rings = (uint16_t)dev_info.max_rx_queues;
432 	/* Configure ethernet device. */
433 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
434 	if (retval != 0)
435 		return retval;
436 
437 	/* Setup the queues. */
438 	for (q = 0; q < rx_rings; q ++) {
439 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
440 						rte_eth_dev_socket_id(port),
441 						rxconf,
442 						vpool_array[q].pool);
443 		if (retval < 0)
444 			return retval;
445 	}
446 	for (q = 0; q < tx_rings; q ++) {
447 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
448 						rte_eth_dev_socket_id(port),
449 						txconf);
450 		if (retval < 0)
451 			return retval;
452 	}
453 
454 	/* Start the device. */
455 	retval  = rte_eth_dev_start(port);
456 	if (retval < 0) {
457 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
458 		return retval;
459 	}
460 
461 	if (promiscuous)
462 		rte_eth_promiscuous_enable(port);
463 
464 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
465 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
466 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
467 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
468 			(unsigned)port,
469 			vmdq_ports_eth_addr[port].addr_bytes[0],
470 			vmdq_ports_eth_addr[port].addr_bytes[1],
471 			vmdq_ports_eth_addr[port].addr_bytes[2],
472 			vmdq_ports_eth_addr[port].addr_bytes[3],
473 			vmdq_ports_eth_addr[port].addr_bytes[4],
474 			vmdq_ports_eth_addr[port].addr_bytes[5]);
475 
476 	return 0;
477 }
478 
479 /*
480  * Set character device basename.
481  */
482 static int
483 us_vhost_parse_basename(const char *q_arg)
484 {
485 	/* parse number string */
486 
487 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
488 		return -1;
489 	else
490 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
491 
492 	return 0;
493 }
494 
495 /*
496  * Parse the portmask provided at run time.
497  */
498 static int
499 parse_portmask(const char *portmask)
500 {
501 	char *end = NULL;
502 	unsigned long pm;
503 
504 	errno = 0;
505 
506 	/* parse hexadecimal string */
507 	pm = strtoul(portmask, &end, 16);
508 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
509 		return -1;
510 
511 	if (pm == 0)
512 		return -1;
513 
514 	return pm;
515 
516 }
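/* For example "-p 0x1" selects port 0 only; note that at most MAX_SUP_PORTS ports may be enabled. */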
517 
518 /*
519  * Parse num options at run time.
520  */
521 static int
522 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
523 {
524 	char *end = NULL;
525 	unsigned long num;
526 
527 	errno = 0;
528 
529 	/* parse unsigned int string */
530 	num = strtoul(q_arg, &end, 10);
531 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
532 		return -1;
533 
534 	if (num > max_valid_value)
535 		return -1;
536 
537 	return num;
538 
539 }
540 
541 /*
542  * Display usage
543  */
544 static void
545 us_vhost_usage(const char *prgname)
546 {
547 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
548 	"		--vm2vm [0|1|2]\n"
549 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
550 	"		--dev-basename <name>\n"
551 	"		--nb-devices ND\n"
552 	"		-p PORTMASK: Set mask for ports to be used by application\n"
553 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
554 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
555 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
556 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
557 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
558 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
559 	"		--dev-basename: The basename to be used for the character device.\n"
560 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
561 			"zero copy\n"
562 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
563 			"used only when zero copy is enabled.\n"
564 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
565 			"used only when zero copy is enabled.\n",
566 	       prgname);
567 }
568 
569 /*
570  * Parse the arguments given in the command line of the application.
571  */
572 static int
573 us_vhost_parse_args(int argc, char **argv)
574 {
575 	int opt, ret;
576 	int option_index;
577 	unsigned i;
578 	const char *prgname = argv[0];
579 	static struct option long_option[] = {
580 		{"vm2vm", required_argument, NULL, 0},
581 		{"rx-retry", required_argument, NULL, 0},
582 		{"rx-retry-delay", required_argument, NULL, 0},
583 		{"rx-retry-num", required_argument, NULL, 0},
584 		{"mergeable", required_argument, NULL, 0},
585 		{"stats", required_argument, NULL, 0},
586 		{"dev-basename", required_argument, NULL, 0},
587 		{"zero-copy", required_argument, NULL, 0},
588 		{"rx-desc-num", required_argument, NULL, 0},
589 		{"tx-desc-num", required_argument, NULL, 0},
590 		{NULL, 0, 0, 0},
591 	};
592 
593 	/* Parse command line */
594 	while ((opt = getopt_long(argc, argv, "p:P",
595 			long_option, &option_index)) != EOF) {
596 		switch (opt) {
597 		/* Portmask */
598 		case 'p':
599 			enabled_port_mask = parse_portmask(optarg);
600 			if (enabled_port_mask == 0) {
601 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
602 				us_vhost_usage(prgname);
603 				return -1;
604 			}
605 			break;
606 
607 		case 'P':
608 			promiscuous = 1;
609 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
610 				ETH_VMDQ_ACCEPT_BROADCAST |
611 				ETH_VMDQ_ACCEPT_MULTICAST;
612 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
613 
614 			break;
615 
616 		case 0:
617 			/* Enable/disable vm2vm comms. */
618 			if (!strncmp(long_option[option_index].name, "vm2vm",
619 				MAX_LONG_OPT_SZ)) {
620 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
621 				if (ret == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 						"Invalid argument for "
624 						"vm2vm [0|1|2]\n");
625 					us_vhost_usage(prgname);
626 					return -1;
627 				} else {
628 					vm2vm_mode = (vm2vm_type)ret;
629 				}
630 			}
631 
632 			/* Enable/disable retries on RX. */
633 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
634 				ret = parse_num_opt(optarg, 1);
635 				if (ret == -1) {
636 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
637 					us_vhost_usage(prgname);
638 					return -1;
639 				} else {
640 					enable_retry = ret;
641 				}
642 			}
643 
644 			/* Specify the retry delay time (in microseconds) on RX. */
645 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
646 				ret = parse_num_opt(optarg, INT32_MAX);
647 				if (ret == -1) {
648 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					burst_rx_delay_time = ret;
653 				}
654 			}
655 
656 			/* Specify the retries number on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, INT32_MAX);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					burst_rx_retry_num = ret;
665 				}
666 			}
667 
668 			/* Enable/disable RX mergeable buffers. */
669 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else {
676 					mergeable = !!ret;
677 					if (ret) {
678 						vmdq_conf_default.rxmode.jumbo_frame = 1;
679 						vmdq_conf_default.rxmode.max_rx_pkt_len
680 							= JUMBO_FRAME_MAX_SIZE;
681 					}
682 				}
683 			}
684 
685 			/* Enable/disable stats. */
686 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
687 				ret = parse_num_opt(optarg, INT32_MAX);
688 				if (ret == -1) {
689 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
690 					us_vhost_usage(prgname);
691 					return -1;
692 				} else {
693 					enable_stats = ret;
694 				}
695 			}
696 
697 			/* Set character device basename. */
698 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
699 				if (us_vhost_parse_basename(optarg) == -1) {
700 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
701 					us_vhost_usage(prgname);
702 					return -1;
703 				}
704 			}
705 
706 			/* Enable/disable rx/tx zero copy. */
707 			if (!strncmp(long_option[option_index].name,
708 				"zero-copy", MAX_LONG_OPT_SZ)) {
709 				ret = parse_num_opt(optarg, 1);
710 				if (ret == -1) {
711 					RTE_LOG(INFO, VHOST_CONFIG,
712 						"Invalid argument"
713 						" for zero-copy [0|1]\n");
714 					us_vhost_usage(prgname);
715 					return -1;
716 				} else
717 					zero_copy = ret;
718 
719 				if (zero_copy) {
720 #ifdef RTE_MBUF_REFCNT
721 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
722 					"zero copy vhost APP, please "
723 					"disable RTE_MBUF_REFCNT\n"
724 					"in config file and then rebuild DPDK "
725 					"core lib!\n"
726 					"Otherwise please disable zero copy "
727 					"flag in command line!\n");
728 					return -1;
729 #endif
730 				}
731 			}
732 
733 			/* Specify the descriptor number on RX. */
734 			if (!strncmp(long_option[option_index].name,
735 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
736 				ret = parse_num_opt(optarg, MAX_RING_DESC);
737 				if ((ret == -1) || (!POWEROF2(ret))) {
738 					RTE_LOG(INFO, VHOST_CONFIG,
739 					"Invalid argument for rx-desc-num[0-N],"
740 					"power of 2 required.\n");
741 					us_vhost_usage(prgname);
742 					return -1;
743 				} else {
744 					num_rx_descriptor = ret;
745 				}
746 			}
747 
748 			/* Specify the descriptor number on TX. */
749 			if (!strncmp(long_option[option_index].name,
750 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
751 				ret = parse_num_opt(optarg, MAX_RING_DESC);
752 				if ((ret == -1) || (!POWEROF2(ret))) {
753 					RTE_LOG(INFO, VHOST_CONFIG,
754 					"Invalid argument for tx-desc-num [0-N],"
755 					"power of 2 required.\n");
756 					us_vhost_usage(prgname);
757 					return -1;
758 				} else {
759 					num_tx_descriptor = ret;
760 				}
761 			}
762 
763 			break;
764 
765 			/* Invalid option - print options. */
766 		default:
767 			us_vhost_usage(prgname);
768 			return -1;
769 		}
770 	}
771 
772 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
773 		if (enabled_port_mask & (1 << i))
774 			ports[num_ports++] = (uint8_t)i;
775 	}
776 
777 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
778 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
779 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
780 		return -1;
781 	}
782 
783 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
784 		RTE_LOG(INFO, VHOST_PORT,
785 			"Vhost zero copy doesn't support software vm2vm, "
786 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
787 		return -1;
788 	}
789 
790 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
791 		RTE_LOG(INFO, VHOST_PORT,
792 			"Vhost zero copy doesn't support jumbo frames, "
793 			"please specify '--mergeable 0' to disable the "
794 			"mergeable feature.\n");
795 		return -1;
796 	}
797 
798 	return 0;
799 }
800 
801 /*
802  * Update the global variable num_ports and the ports array according to the
803  * number of system ports, and return the number of valid ports.
804  */
805 static unsigned check_ports_num(unsigned nb_ports)
806 {
807 	unsigned valid_num_ports = num_ports;
808 	unsigned portid;
809 
810 	if (num_ports > nb_ports) {
811 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
812 			num_ports, nb_ports);
813 		num_ports = nb_ports;
814 	}
815 
816 	for (portid = 0; portid < num_ports; portid ++) {
817 		if (ports[portid] >= nb_ports) {
818 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
819 				ports[portid], (nb_ports - 1));
820 			ports[portid] = INVALID_PORT_ID;
821 			valid_num_ports--;
822 		}
823 	}
824 	return valid_num_ports;
825 }
826 
827 /*
828  * Macro to print out packet contents. Wrapped in debug define so that the
829  * data path is not affected when debug is disabled.
830  */
831 #ifdef DEBUG
832 #define PRINT_PACKET(device, addr, size, header) do {																\
833 	char *pkt_addr = (char*)(addr);																					\
834 	unsigned int index;																								\
835 	char packet[MAX_PRINT_BUFF];																					\
836 																													\
837 	if ((header))																									\
838 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
839 	else																											\
840 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
841 	for (index = 0; index < (size); index++) {																		\
842 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
843 			"%02hhx ", pkt_addr[index]);																			\
844 	}																												\
845 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
846 																													\
847 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
848 } while(0)
849 #else
850 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
851 #endif
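/*
 * Without DEBUG defined, PRINT_PACKET expands to an empty statement, so the
 * per-byte hex dump adds no cost to the data path in normal builds.
 */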
852 
853 /*
854  * Function to convert guest physical addresses to vhost physical addresses.
855  * This is used to convert virtio buffer addresses.
856  */
857 static inline uint64_t __attribute__((always_inline))
858 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
859 	uint32_t buf_len, hpa_type *addr_type)
860 {
861 	struct virtio_memory_regions_hpa *region;
862 	uint32_t regionidx;
863 	uint64_t vhost_pa = 0;
864 
865 	*addr_type = PHYS_ADDR_INVALID;
866 
867 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
868 		region = &vdev->regions_hpa[regionidx];
869 		if ((guest_pa >= region->guest_phys_address) &&
870 			(guest_pa <= region->guest_phys_address_end)) {
871 			vhost_pa = region->host_phys_addr_offset + guest_pa;
872 			if (likely((guest_pa + buf_len - 1)
873 				<= region->guest_phys_address_end))
874 				*addr_type = PHYS_ADDR_CONTINUOUS;
875 			else
876 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
877 			break;
878 		}
879 	}
880 
881 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
882 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
883 		(void *)(uintptr_t)vhost_pa);
884 
885 	return vhost_pa;
886 }
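/*
 * If the guest physical address does not fall inside any registered region,
 * 0 is returned and *addr_type is left as PHYS_ADDR_INVALID, which callers
 * check before using the address.
 */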
887 
888 /*
889  * Compares a packet destination MAC address to a device MAC address.
890  */
891 static inline int __attribute__((always_inline))
892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
893 {
894 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
895 }
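/*
 * The comparison loads 8 bytes starting at each address and uses MAC_ADDR_CMP
 * to mask off the 2 bytes beyond the 6-byte MAC (assuming a little-endian host).
 */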
896 
897 /*
898  * This function learns the MAC address of the device and registers it along
899  * with a VLAN tag in a VMDQ pool.
900  */
901 static int
902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
903 {
904 	struct ether_hdr *pkt_hdr;
905 	struct virtio_net_data_ll *dev_ll;
906 	struct virtio_net *dev = vdev->dev;
907 	int i, ret;
908 
909 	/* Learn MAC address of guest device from packet */
910 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
911 
912 	dev_ll = ll_root_used;
913 
914 	while (dev_ll != NULL) {
915 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
916 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
917 			return -1;
918 		}
919 		dev_ll = dev_ll->next;
920 	}
921 
922 	for (i = 0; i < ETHER_ADDR_LEN; i++)
923 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
924 
925 	/* vlan_tag currently uses the device_id. */
926 	vdev->vlan_tag = vlan_tags[dev->device_fh];
927 
928 	/* Print out VMDQ registration info. */
929 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
930 		dev->device_fh,
931 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
932 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
933 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
934 		vdev->vlan_tag);
935 
936 	/* Register the MAC address. */
937 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
938 				(uint32_t)dev->device_fh + vmdq_pool_base);
939 	if (ret)
940 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
941 					dev->device_fh);
942 
943 	/* Enable stripping of the vlan tag as we handle routing. */
944 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
945 
946 	/* Set device as ready for RX. */
947 	vdev->ready = DEVICE_RX;
948 
949 	return 0;
950 }
951 
952 /*
953  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
954  * queue before disabling RX on the device.
955  */
956 static inline void
957 unlink_vmdq(struct vhost_dev *vdev)
958 {
959 	unsigned i = 0;
960 	unsigned rx_count;
961 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
962 
963 	if (vdev->ready == DEVICE_RX) {
964 		/*clear MAC and VLAN settings*/
965 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
966 		for (i = 0; i < 6; i++)
967 			vdev->mac_address.addr_bytes[i] = 0;
968 
969 		vdev->vlan_tag = 0;
970 
971 		/*Clear out the receive buffers*/
972 		rx_count = rte_eth_rx_burst(ports[0],
973 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
974 
975 		while (rx_count) {
976 			for (i = 0; i < rx_count; i++)
977 				rte_pktmbuf_free(pkts_burst[i]);
978 
979 			rx_count = rte_eth_rx_burst(ports[0],
980 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
981 		}
982 
983 		vdev->ready = DEVICE_MAC_LEARNING;
984 	}
985 }
986 
987 /*
988  * Check if the packet destination MAC address is for a local device. If so then put
989  * the packet on that device's RX queue. If not then return.
990  */
991 static inline int __attribute__((always_inline))
992 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
993 {
994 	struct virtio_net_data_ll *dev_ll;
995 	struct ether_hdr *pkt_hdr;
996 	uint64_t ret = 0;
997 	struct virtio_net *dev = vdev->dev;
998 	struct virtio_net *tdev; /* destination virtio device */
999 
1000 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1001 
1002 	/*get the used devices list*/
1003 	dev_ll = ll_root_used;
1004 
1005 	while (dev_ll != NULL) {
1006 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1007 				          &dev_ll->vdev->mac_address)) {
1008 
1009 			/* Drop the packet if the TX packet is destined for the TX device. */
1010 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1011 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1012 							dev->device_fh);
1013 				return 0;
1014 			}
1015 			tdev = dev_ll->vdev->dev;
1016 
1017 
1018 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1019 
1020 			if (unlikely(dev_ll->vdev->remove)) {
1021 				/*drop the packet if the device is marked for removal*/
1022 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1023 			} else {
1024 				/*send the packet to the local virtio device*/
1025 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1026 				if (enable_stats) {
1027 					rte_atomic64_add(
1028 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1029 					1);
1030 					rte_atomic64_add(
1031 					&dev_statistics[tdev->device_fh].rx_atomic,
1032 					ret);
1033 					dev_statistics[tdev->device_fh].tx_total++;
1034 					dev_statistics[tdev->device_fh].tx += ret;
1035 				}
1036 			}
1037 
1038 			return 0;
1039 		}
1040 		dev_ll = dev_ll->next;
1041 	}
1042 
1043 	return -1;
1044 }
1045 
1046 /*
1047  * Check if the destination MAC of a packet belongs to a local VM,
1048  * and if so get its vlan tag and the length offset.
1049  */
1050 static inline int __attribute__((always_inline))
1051 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1052 	uint32_t *offset, uint16_t *vlan_tag)
1053 {
1054 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1055 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1056 
1057 	while (dev_ll != NULL) {
1058 		if ((dev_ll->vdev->ready == DEVICE_RX)
1059 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1060 		&dev_ll->vdev->mac_address)) {
1061 			/*
1062 			 * Drop the packet if the TX packet is
1063 			 * destined for the TX device.
1064 			 */
1065 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1066 				LOG_DEBUG(VHOST_DATA,
1067 				"(%"PRIu64") TX: Source and destination"
1068 				" MAC addresses are the same. Dropping "
1069 				"packet.\n",
1070 				dev_ll->vdev->dev->device_fh);
1071 				return -1;
1072 			}
1073 
1074 			/*
1075 			 * HW VLAN stripping reduces the packet length by
1076 			 * the length of the VLAN tag, so the packet length
1077 			 * needs to be restored by adding it back.
1078 			 */
1079 			*offset = VLAN_HLEN;
1080 			*vlan_tag =
1081 			(uint16_t)
1082 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1083 
1084 			LOG_DEBUG(VHOST_DATA,
1085 			"(%"PRIu64") TX: pkt to local VM device id:"
1086 			"(%"PRIu64") vlan tag: %d.\n",
1087 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1088 			vlan_tag);
1089 
1090 			break;
1091 		}
1092 		dev_ll = dev_ll->next;
1093 	}
1094 	return 0;
1095 }
1096 
1097 /*
1098  * This function routes the TX packet to the correct interface. This may be a local device
1099  * or the physical port.
1100  */
1101 static inline void __attribute__((always_inline))
1102 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1103 {
1104 	struct mbuf_table *tx_q;
1105 	struct rte_mbuf **m_table;
1106 	unsigned len, ret, offset = 0;
1107 	const uint16_t lcore_id = rte_lcore_id();
1108 	struct virtio_net *dev = vdev->dev;
1109 
1110 	/*check if destination is local VM*/
1111 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1112 		rte_pktmbuf_free(m);
1113 		return;
1114 	}
1115 
1116 	if (vm2vm_mode == VM2VM_HARDWARE) {
1117 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1118 			offset > rte_pktmbuf_tailroom(m)) {
1119 			rte_pktmbuf_free(m);
1120 			return;
1121 		}
1122 	}
1123 
1124 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1125 
1126 	/*Add packet to the port tx queue*/
1127 	tx_q = &lcore_tx_queue[lcore_id];
1128 	len = tx_q->len;
1129 
1130 	m->ol_flags = PKT_TX_VLAN_PKT;
1131 
1132 	m->data_len += offset;
1133 	m->pkt_len += offset;
1134 
1135 	m->vlan_tci = vlan_tag;
1136 
1137 	tx_q->m_table[len] = m;
1138 	len++;
1139 	if (enable_stats) {
1140 		dev_statistics[dev->device_fh].tx_total++;
1141 		dev_statistics[dev->device_fh].tx++;
1142 	}
1143 
1144 	if (unlikely(len == MAX_PKT_BURST)) {
1145 		m_table = (struct rte_mbuf **)tx_q->m_table;
1146 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1147 		/* Free any buffers not handled by TX and update the port stats. */
1148 		if (unlikely(ret < len)) {
1149 			do {
1150 				rte_pktmbuf_free(m_table[ret]);
1151 			} while (++ret < len);
1152 		}
1153 
1154 		len = 0;
1155 	}
1156 
1157 	tx_q->len = len;
1158 	return;
1159 }
1160 /*
1161  * This function is called by each data core. It handles all RX/TX registered with the
1162  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1163  * with all devices in the main linked list.
1164  */
1165 static int
1166 switch_worker(void *arg)
1167 {
1168 	struct rte_mempool *mbuf_pool = arg;
1169 	struct virtio_net *dev = NULL;
1170 	struct vhost_dev *vdev = NULL;
1171 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1172 	struct virtio_net_data_ll *dev_ll;
1173 	struct mbuf_table *tx_q;
1174 	volatile struct lcore_ll_info *lcore_ll;
1175 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1176 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1177 	unsigned ret, i;
1178 	const uint16_t lcore_id = rte_lcore_id();
1179 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1180 	uint16_t rx_count = 0;
1181 	uint16_t tx_count;
1182 	uint32_t retry = 0;
1183 
1184 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1185 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1186 	prev_tsc = 0;
1187 
1188 	tx_q = &lcore_tx_queue[lcore_id];
1189 	for (i = 0; i < num_cores; i ++) {
1190 		if (lcore_ids[i] == lcore_id) {
1191 			tx_q->txq_id = i;
1192 			break;
1193 		}
1194 	}
1195 
1196 	while(1) {
1197 		cur_tsc = rte_rdtsc();
1198 		/*
1199 		 * TX burst queue drain
1200 		 */
1201 		diff_tsc = cur_tsc - prev_tsc;
1202 		if (unlikely(diff_tsc > drain_tsc)) {
1203 
1204 			if (tx_q->len) {
1205 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1206 
1207 				/*Tx any packets in the queue*/
1208 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1209 									   (struct rte_mbuf **)tx_q->m_table,
1210 									   (uint16_t)tx_q->len);
1211 				if (unlikely(ret < tx_q->len)) {
1212 					do {
1213 						rte_pktmbuf_free(tx_q->m_table[ret]);
1214 					} while (++ret < tx_q->len);
1215 				}
1216 
1217 				tx_q->len = 0;
1218 			}
1219 
1220 			prev_tsc = cur_tsc;
1221 
1222 		}
1223 
1224 		rte_prefetch0(lcore_ll->ll_root_used);
1225 		/*
1226 		 * Inform the configuration core that we have exited the linked list and that no devices are
1227 		 * in use if requested.
1228 		 */
1229 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1230 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1231 
1232 		/*
1233 		 * Process devices
1234 		 */
1235 		dev_ll = lcore_ll->ll_root_used;
1236 
1237 		while (dev_ll != NULL) {
1238 			/*get virtio device ID*/
1239 			vdev = dev_ll->vdev;
1240 			dev = vdev->dev;
1241 
1242 			if (unlikely(vdev->remove)) {
1243 				dev_ll = dev_ll->next;
1244 				unlink_vmdq(vdev);
1245 				vdev->ready = DEVICE_SAFE_REMOVE;
1246 				continue;
1247 			}
1248 			if (likely(vdev->ready == DEVICE_RX)) {
1249 				/*Handle guest RX*/
1250 				rx_count = rte_eth_rx_burst(ports[0],
1251 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1252 
1253 				if (rx_count) {
1254 					/*
1255 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1256 					* Note that MAX_PKT_BURST must be less than the virtio queue size.
1257 					*/
1258 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1259 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1260 							rte_delay_us(burst_rx_delay_time);
1261 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1262 								break;
1263 						}
1264 					}
1265 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1266 					if (enable_stats) {
1267 						rte_atomic64_add(
1268 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1269 						rx_count);
1270 						rte_atomic64_add(
1271 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1272 					}
1273 					while (likely(rx_count)) {
1274 						rx_count--;
1275 						rte_pktmbuf_free(pkts_burst[rx_count]);
1276 					}
1277 
1278 				}
1279 			}
1280 
1281 			if (likely(!vdev->remove)) {
1282 				/* Handle guest TX*/
1283 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1284 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1285 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1286 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1287 						while (tx_count--)
1288 							rte_pktmbuf_free(pkts_burst[tx_count]);
1289 					}
1290 				}
1291 				while (tx_count)
1292 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1293 			}
1294 
1295 			/*move to the next device in the list*/
1296 			dev_ll = dev_ll->next;
1297 		}
1298 	}
1299 
1300 	return 0;
1301 }
1302 
1303 /*
1304  * This function gets the number of available ring entries for zero copy RX.
1305  * Only one thread will call this function for a particular virtio device,
1306  * so it is designed as a non-thread-safe function.
1307  */
1308 static inline uint32_t __attribute__((always_inline))
1309 get_available_ring_num_zcp(struct virtio_net *dev)
1310 {
1311 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1312 	uint16_t avail_idx;
1313 
1314 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1315 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1316 }
1317 
1318 /*
1319  * This function gets an available ring index for zero copy RX;
1320  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1321  * Only one thread will call this function for a particular virtio device,
1322  * so it is designed as a non-thread-safe function.
1323  */
1324 static inline uint32_t __attribute__((always_inline))
1325 get_available_ring_index_zcp(struct virtio_net *dev,
1326 	uint16_t *res_base_idx, uint32_t count)
1327 {
1328 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1329 	uint16_t avail_idx;
1330 	uint32_t retry = 0;
1331 	uint16_t free_entries;
1332 
1333 	*res_base_idx = vq->last_used_idx_res;
1334 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1335 	free_entries = (avail_idx - *res_base_idx);
1336 
1337 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1338 			"avail idx: %d, "
1339 			"res base idx:%d, free entries:%d\n",
1340 			dev->device_fh, avail_idx, *res_base_idx,
1341 			free_entries);
1342 
1343 	/*
1344 	 * If retry is enabled and the queue is full then we wait
1345 	 * and retry to avoid packet loss.
1346 	 */
1347 	if (enable_retry && unlikely(count > free_entries)) {
1348 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1349 			rte_delay_us(burst_rx_delay_time);
1350 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1351 			free_entries = (avail_idx - *res_base_idx);
1352 			if (count <= free_entries)
1353 				break;
1354 		}
1355 	}
1356 
1357 	/*check that we have enough buffers*/
1358 	if (unlikely(count > free_entries))
1359 		count = free_entries;
1360 
1361 	if (unlikely(count == 0)) {
1362 		LOG_DEBUG(VHOST_DATA,
1363 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1364 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1365 			dev->device_fh, avail_idx,
1366 			*res_base_idx, free_entries);
1367 		return 0;
1368 	}
1369 
1370 	vq->last_used_idx_res = *res_base_idx + count;
1371 
1372 	return count;
1373 }
1374 
1375 /*
1376  * This function puts a descriptor back onto the used list.
1377  */
1378 static inline void __attribute__((always_inline))
1379 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1380 {
1381 	uint16_t res_cur_idx = vq->last_used_idx;
1382 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1383 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1384 	rte_compiler_barrier();
1385 	*(volatile uint16_t *)&vq->used->idx += 1;
1386 	vq->last_used_idx += 1;
1387 
1388 	/* Kick the guest if necessary. */
1389 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1390 		eventfd_write((int)vq->kickfd, 1);
1391 }
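/*
 * Note the compiler barrier in put_desc_to_used_list_zcp(): the used ring
 * entry must be written before used->idx is advanced so that the guest never
 * reads a stale entry.
 */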
1392 
1393 /*
1394  * This function gets an available descriptor from the virtio vring and an
1395  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1396  * to adjust the offset for buff_addr and phys_addr according to the PMD
1397  * implementation, otherwise the frame data may end up at the wrong location in the mbuf.
1398  */
1399 static inline void __attribute__((always_inline))
1400 attach_rxmbuf_zcp(struct virtio_net *dev)
1401 {
1402 	uint16_t res_base_idx, desc_idx;
1403 	uint64_t buff_addr, phys_addr;
1404 	struct vhost_virtqueue *vq;
1405 	struct vring_desc *desc;
1406 	struct rte_mbuf *mbuf = NULL;
1407 	struct vpool *vpool;
1408 	hpa_type addr_type;
1409 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1410 
1411 	vpool = &vpool_array[vdev->vmdq_rx_q];
1412 	vq = dev->virtqueue[VIRTIO_RXQ];
1413 
1414 	do {
1415 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1416 				1) != 1))
1417 			return;
1418 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1419 
1420 		desc = &vq->desc[desc_idx];
1421 		if (desc->flags & VRING_DESC_F_NEXT) {
1422 			desc = &vq->desc[desc->next];
1423 			buff_addr = gpa_to_vva(dev, desc->addr);
1424 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1425 					&addr_type);
1426 		} else {
1427 			buff_addr = gpa_to_vva(dev,
1428 					desc->addr + vq->vhost_hlen);
1429 			phys_addr = gpa_to_hpa(vdev,
1430 					desc->addr + vq->vhost_hlen,
1431 					desc->len, &addr_type);
1432 		}
1433 
1434 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1435 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1436 				" address found when attaching RX frame buffer"
1437 				" address!\n", dev->device_fh);
1438 			put_desc_to_used_list_zcp(vq, desc_idx);
1439 			continue;
1440 		}
1441 
1442 		/*
1443 		 * Check if the frame buffer address from guest crosses
1444 		 * sub-region or not.
1445 		 */
1446 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1447 			RTE_LOG(ERR, VHOST_DATA,
1448 				"(%"PRIu64") Frame buffer address cross "
1449 				"sub-region found when attaching RX frame "
1450 				"buffer address!\n",
1451 				dev->device_fh);
1452 			put_desc_to_used_list_zcp(vq, desc_idx);
1453 			continue;
1454 		}
1455 	} while (unlikely(phys_addr == 0));
1456 
1457 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1458 	if (unlikely(mbuf == NULL)) {
1459 		LOG_DEBUG(VHOST_DATA,
1460 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1461 			"ring_sc_dequeue fail.\n",
1462 			dev->device_fh);
1463 		put_desc_to_used_list_zcp(vq, desc_idx);
1464 		return;
1465 	}
1466 
1467 	if (unlikely(vpool->buf_size > desc->len)) {
1468 		LOG_DEBUG(VHOST_DATA,
1469 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1470 			"length(%d) of descriptor idx: %d less than room "
1471 			"size required: %d\n",
1472 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1473 		put_desc_to_used_list_zcp(vq, desc_idx);
1474 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1475 		return;
1476 	}
1477 
1478 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1479 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1480 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1481 	mbuf->data_len = desc->len;
1482 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1483 
1484 	LOG_DEBUG(VHOST_DATA,
1485 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1486 		"descriptor idx:%d\n",
1487 		dev->device_fh, res_base_idx, desc_idx);
1488 
1489 	__rte_mbuf_raw_free(mbuf);
1490 
1491 	return;
1492 }
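/*
 * The final __rte_mbuf_raw_free() returns the now guest-backed mbuf to the
 * mempool so that the PMD RX path can allocate it for a descriptor and DMA
 * the frame straight into the guest buffer; this is the core of the zero
 * copy receive scheme.
 */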
1493 
1494 /*
1495  * Detach an attached packet mbuf -
1496  *  - restore original mbuf address and length values.
1497  *  - reset pktmbuf data and data_len to their default values.
1498  *  All other fields of the given packet mbuf will be left intact.
1499  *
1500  * @param m
1501  *   The attached packet mbuf.
1502  */
1503 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1504 {
1505 	const struct rte_mempool *mp = m->pool;
1506 	void *buf = RTE_MBUF_TO_BADDR(m);
1507 	uint32_t buf_ofs;
1508 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1509 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1510 
1511 	m->buf_addr = buf;
1512 	m->buf_len = (uint16_t)buf_len;
1513 
1514 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1515 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1516 	m->data_off = buf_ofs;
1517 
1518 	m->data_len = 0;
1519 }
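/*
 * RTE_MBUF_TO_BADDR() recovers the mbuf's own data buffer, which sits directly
 * after the mbuf header in the mempool element, undoing the redirection to
 * guest memory done in attach_rxmbuf_zcp().
 */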
1520 
1521 /*
1522  * This function is called after packets have been transmitted. It fetches mbufs
1523  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1524  * the used index and kicks the guest if necessary.
1525  */
1526 static inline uint32_t __attribute__((always_inline))
1527 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1528 {
1529 	struct rte_mbuf *mbuf;
1530 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1531 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1532 	uint32_t index = 0;
1533 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1534 
1535 	LOG_DEBUG(VHOST_DATA,
1536 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1537 		"clean is: %d\n",
1538 		dev->device_fh, mbuf_count);
1539 	LOG_DEBUG(VHOST_DATA,
1540 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1541 		"clean  is : %d\n",
1542 		dev->device_fh, rte_ring_count(vpool->ring));
1543 
1544 	for (index = 0; index < mbuf_count; index++) {
1545 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1546 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1547 			pktmbuf_detach_zcp(mbuf);
1548 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1549 
1550 		/* Update used index buffer information. */
1551 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1552 		vq->used->ring[used_idx].len = 0;
1553 
1554 		used_idx = (used_idx + 1) & (vq->size - 1);
1555 	}
1556 
1557 	LOG_DEBUG(VHOST_DATA,
1558 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1559 		"clean is: %d\n",
1560 		dev->device_fh, rte_mempool_count(vpool->pool));
1561 	LOG_DEBUG(VHOST_DATA,
1562 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1563 		"clean  is : %d\n",
1564 		dev->device_fh, rte_ring_count(vpool->ring));
1565 	LOG_DEBUG(VHOST_DATA,
1566 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1567 		"vq->last_used_idx:%d\n",
1568 		dev->device_fh, vq->last_used_idx);
1569 
1570 	vq->last_used_idx += mbuf_count;
1571 
1572 	LOG_DEBUG(VHOST_DATA,
1573 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1574 		"vq->last_used_idx:%d\n",
1575 		dev->device_fh, vq->last_used_idx);
1576 
1577 	rte_compiler_barrier();
1578 
1579 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1580 
1581 	/* Kick guest if required. */
1582 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1583 		eventfd_write((int)vq->kickfd, 1);
1584 
1585 	return 0;
1586 }
1587 
1588 /*
1589  * This function is called when a virtio device is destroyed.
1590  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1591  */
1592 static void mbuf_destroy_zcp(struct vpool *vpool)
1593 {
1594 	struct rte_mbuf *mbuf = NULL;
1595 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1596 
1597 	LOG_DEBUG(VHOST_CONFIG,
1598 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1599 		"mbuf_destroy_zcp is: %d\n",
1600 		mbuf_count);
1601 	LOG_DEBUG(VHOST_CONFIG,
1602 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1603 		"mbuf_destroy_zcp  is : %d\n",
1604 		rte_ring_count(vpool->ring));
1605 
1606 	for (index = 0; index < mbuf_count; index++) {
1607 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1608 		if (likely(mbuf != NULL)) {
1609 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1610 				pktmbuf_detach_zcp(mbuf);
1611 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1612 		}
1613 	}
1614 
1615 	LOG_DEBUG(VHOST_CONFIG,
1616 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1617 		"mbuf_destroy_zcp is: %d\n",
1618 		rte_mempool_count(vpool->pool));
1619 	LOG_DEBUG(VHOST_CONFIG,
1620 		"in mbuf_destroy_zcp: mbuf count in ring after "
1621 		"mbuf_destroy_zcp is : %d\n",
1622 		rte_ring_count(vpool->ring));
1623 }
1624 
1625 /*
1626  * This function updates the used ring for zero copy RX and kicks the guest if necessary.
1627  */
1628 static inline uint32_t __attribute__((always_inline))
1629 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1630 	uint32_t count)
1631 {
1632 	struct vhost_virtqueue *vq;
1633 	struct vring_desc *desc;
1634 	struct rte_mbuf *buff;
1635 	/* The virtio_hdr is initialised to 0. */
1636 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1637 		= {{0, 0, 0, 0, 0, 0}, 0};
1638 	uint64_t buff_hdr_addr = 0;
1639 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1640 	uint32_t head_idx, packet_success = 0;
1641 	uint16_t res_cur_idx;
1642 
1643 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1644 
1645 	if (count == 0)
1646 		return 0;
1647 
1648 	vq = dev->virtqueue[VIRTIO_RXQ];
1649 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1650 
1651 	res_cur_idx = vq->last_used_idx;
1652 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1653 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1654 
1655 	/* Retrieve all of the head indexes first to avoid caching issues. */
1656 	for (head_idx = 0; head_idx < count; head_idx++)
1657 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1658 
1659 	/*Prefetch descriptor index. */
1660 	rte_prefetch0(&vq->desc[head[packet_success]]);
1661 
1662 	while (packet_success != count) {
1663 		/* Get descriptor from available ring */
1664 		desc = &vq->desc[head[packet_success]];
1665 
1666 		buff = pkts[packet_success];
1667 		LOG_DEBUG(VHOST_DATA,
1668 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1669 			"pkt[%d] descriptor idx: %d\n",
1670 			dev->device_fh, packet_success,
1671 			MBUF_HEADROOM_UINT32(buff));
1672 
1673 		PRINT_PACKET(dev,
1674 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1675 			+ RTE_PKTMBUF_HEADROOM),
1676 			rte_pktmbuf_data_len(buff), 0);
1677 
1678 		/* Buffer address translation for virtio header. */
1679 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1680 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1681 
1682 		/*
1683 		 * If the descriptors are chained the header and data are
1684 		 * placed in separate buffers.
1685 		 */
1686 		if (desc->flags & VRING_DESC_F_NEXT) {
1687 			desc->len = vq->vhost_hlen;
1688 			desc = &vq->desc[desc->next];
1689 			desc->len = rte_pktmbuf_data_len(buff);
1690 		} else {
1691 			desc->len = packet_len;
1692 		}
1693 
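		/*
		 * vq->size is a power of two (as required by virtio), so the
		 * free-running used index is wrapped into the ring with a
		 * simple mask of (vq->size - 1).
		 */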
1694 		/* Update used ring with desc information */
1695 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1696 			= head[packet_success];
1697 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1698 			= packet_len;
1699 		res_cur_idx++;
1700 		packet_success++;
1701 
1702 		/* A header is required per buffer. */
1703 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1704 			(const void *)&virtio_hdr, vq->vhost_hlen);
1705 
1706 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1707 
1708 		if (likely(packet_success < count)) {
1709 			/* Prefetch descriptor index. */
1710 			rte_prefetch0(&vq->desc[head[packet_success]]);
1711 		}
1712 	}
1713 
1714 	rte_compiler_barrier();
1715 
1716 	LOG_DEBUG(VHOST_DATA,
1717 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1718 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1719 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1720 
1721 	*(volatile uint16_t *)&vq->used->idx += count;
1722 	vq->last_used_idx += count;
1723 
1724 	LOG_DEBUG(VHOST_DATA,
1725 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1726 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1727 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1728 
1729 	/* Kick the guest if necessary. */
1730 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1731 		eventfd_write((int)vq->kickfd, 1);
1732 
1733 	return count;
1734 }
1735 
1736 /*
1737  * This function routes the TX packet to the correct interface.
1738  * This may be a local device or the physical port.
1739  */
1740 static inline void __attribute__((always_inline))
1741 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1742 	uint32_t desc_idx, uint8_t need_copy)
1743 {
1744 	struct mbuf_table *tx_q;
1745 	struct rte_mbuf **m_table;
1746 	struct rte_mbuf *mbuf = NULL;
1747 	unsigned len, ret, offset = 0;
1748 	struct vpool *vpool;
1749 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1750 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1751 
1752 	/* Add packet to the port TX queue. */
1753 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1754 	len = tx_q->len;
1755 
1756 	/* Allocate an mbuf and populate the structure. */
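	/*
	 * TX vpools occupy the upper half of vpool_array (index offset by
	 * MAX_QUEUES); the lower half holds the per-queue RX vpools created
	 * in setup_mempool_tbl().
	 */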
1757 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1758 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1759 	if (unlikely(mbuf == NULL)) {
1760 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1761 		RTE_LOG(ERR, VHOST_DATA,
1762 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1763 			dev->device_fh);
1764 		put_desc_to_used_list_zcp(vq, desc_idx);
1765 		return;
1766 	}
1767 
1768 	if (vm2vm_mode == VM2VM_HARDWARE) {
1769 		/* Avoid using a VLAN tag from any VM for an external packet, such
1770 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1771 		 * selection: the MAC address identifies it as an external packet
1772 		 * that should go to the network, while the VLAN tag identifies it
1773 		 * as a VM-to-VM packet to be forwarded to another VM. The hardware
1774 		 * cannot resolve this ambiguity, so the packet would be lost.
1775 		 */
1776 		vlan_tag = external_pkt_default_vlan_tag;
1777 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1778 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1779 			__rte_mbuf_raw_free(mbuf);
1780 			return;
1781 		}
1782 	}
1783 
1784 	mbuf->nb_segs = m->nb_segs;
1785 	mbuf->next = m->next;
1786 	mbuf->data_len = m->data_len + offset;
1787 	mbuf->pkt_len = mbuf->data_len;
1788 	if (unlikely(need_copy)) {
1789 		/* Copy the packet contents to the mbuf. */
1790 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1791 			rte_pktmbuf_mtod(m, void *),
1792 			m->data_len);
1793 	} else {
1794 		mbuf->data_off = m->data_off;
1795 		mbuf->buf_physaddr = m->buf_physaddr;
1796 		mbuf->buf_addr = m->buf_addr;
1797 	}
1798 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1799 	mbuf->vlan_tci = vlan_tag;
1800 	mbuf->l2_len = sizeof(struct ether_hdr);
1801 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1802 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1803 
1804 	tx_q->m_table[len] = mbuf;
1805 	len++;
1806 
1807 	LOG_DEBUG(VHOST_DATA,
1808 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1809 		dev->device_fh,
1810 		mbuf->nb_segs,
1811 		(mbuf->next == NULL) ? "null" : "non-null");
1812 
1813 	if (enable_stats) {
1814 		dev_statistics[dev->device_fh].tx_total++;
1815 		dev_statistics[dev->device_fh].tx++;
1816 	}
1817 
1818 	if (unlikely(len == MAX_PKT_BURST)) {
1819 		m_table = (struct rte_mbuf **)tx_q->m_table;
1820 		ret = rte_eth_tx_burst(ports[0],
1821 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1822 
1823 		/*
1824 		 * Free any buffers not handled by TX and update
1825 		 * the port stats.
1826 		 */
1827 		if (unlikely(ret < len)) {
1828 			do {
1829 				rte_pktmbuf_free(m_table[ret]);
1830 			} while (++ret < len);
1831 		}
1832 
1833 		len = 0;
1834 		txmbuf_clean_zcp(dev, vpool);
1835 	}
1836 
1837 	tx_q->len = len;
1838 
1839 	return;
1840 }
1841 
1842 /*
1843  * This function transmits all available packets in the virtio TX queue for one
1844  * virtio-net device. For the first packet it learns the MAC address and
1845  * sets up the VMDQ queue.
1846  */
1847 static inline void __attribute__((always_inline))
1848 virtio_dev_tx_zcp(struct virtio_net *dev)
1849 {
1850 	struct rte_mbuf m;
1851 	struct vhost_virtqueue *vq;
1852 	struct vring_desc *desc;
1853 	uint64_t buff_addr = 0, phys_addr;
1854 	uint32_t head[MAX_PKT_BURST];
1855 	uint32_t i;
1856 	uint16_t free_entries, packet_success = 0;
1857 	uint16_t avail_idx;
1858 	uint8_t need_copy = 0;
1859 	hpa_type addr_type;
1860 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1861 
1862 	vq = dev->virtqueue[VIRTIO_TXQ];
1863 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1864 
1865 	/* If there are no available buffers then return. */
1866 	if (vq->last_used_idx_res == avail_idx)
1867 		return;
1868 
1869 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1870 
1871 	/* Prefetch available ring to retrieve head indexes. */
1872 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1873 
1874 	/* Get the number of free entries in the ring */
1875 	free_entries = (avail_idx - vq->last_used_idx_res);
1876 
1877 	/* Limit to MAX_PKT_BURST. */
1878 	free_entries
1879 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1880 
1881 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1882 		dev->device_fh, free_entries);
1883 
1884 	/* Retrieve all of the head indexes first to avoid caching issues. */
1885 	for (i = 0; i < free_entries; i++)
1886 		head[i]
1887 			= vq->avail->ring[(vq->last_used_idx_res + i)
1888 			& (vq->size - 1)];
1889 
1890 	vq->last_used_idx_res += free_entries;
1891 
1892 	/* Prefetch descriptor index. */
1893 	rte_prefetch0(&vq->desc[head[packet_success]]);
1894 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1895 
1896 	while (packet_success < free_entries) {
1897 		desc = &vq->desc[head[packet_success]];
1898 
1899 		/* Discard first buffer as it is the virtio header */
1900 		desc = &vq->desc[desc->next];
1901 
1902 		/* Buffer address translation. */
1903 		buff_addr = gpa_to_vva(dev, desc->addr);
1904 		/* Need to check extra VLAN_HLEN bytes for inserting the VLAN tag. */
1905 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1906 			&addr_type);
1907 
1908 		if (likely(packet_success < (free_entries - 1)))
1909 			/* Prefetch descriptor index. */
1910 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1911 
1912 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1913 			RTE_LOG(ERR, VHOST_DATA,
1914 				"(%"PRIu64") Invalid frame buffer address found"
1915 				"when TX packets!\n",
1916 				dev->device_fh);
1917 			packet_success++;
1918 			continue;
1919 		}
1920 
1921 		/* Prefetch buffer address. */
1922 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1923 
1924 		/*
1925 		 * Setup dummy mbuf. This is copied to a real mbuf if
1926 		 * transmitted out the physical port.
1927 		 */
1928 		m.data_len = desc->len;
1929 		m.nb_segs = 1;
1930 		m.next = NULL;
1931 		m.data_off = 0;
1932 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1933 		m.buf_physaddr = phys_addr;
1934 
1935 		/*
1936 		 * Check if the frame buffer address from guest crosses
1937 		 * sub-region or not.
1938 		 */
1939 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1940 			RTE_LOG(ERR, VHOST_DATA,
1941 				"(%"PRIu64") Frame buffer address cross "
1942 				"sub-regioin found when attaching TX frame "
1943 				"buffer address!\n",
1944 				dev->device_fh);
1945 			need_copy = 1;
1946 		} else
1947 			need_copy = 0;
1948 
1949 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1950 
1951 		/*
1952 		 * If this is the first received packet we need to learn
1953 		 * the MAC and setup VMDQ
1954 		 */
1955 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1956 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1957 				/*
1958 				 * Discard frame if device is scheduled for
1959 				 * removal or a duplicate MAC address is found.
1960 				 */
1961 				packet_success += free_entries;
1962 				vq->last_used_idx += packet_success;
1963 				break;
1964 			}
1965 		}
1966 
1967 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1968 		packet_success++;
1969 	}
1970 }
1971 
1972 /*
1973  * This function is called by each data core. It handles all RX/TX registered
1974  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1975  * addresses are compared with all devices in the main linked list.
1976  */
1977 static int
1978 switch_worker_zcp(__attribute__((unused)) void *arg)
1979 {
1980 	struct virtio_net *dev = NULL;
1981 	struct vhost_dev  *vdev = NULL;
1982 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1983 	struct virtio_net_data_ll *dev_ll;
1984 	struct mbuf_table *tx_q;
1985 	volatile struct lcore_ll_info *lcore_ll;
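	/*
	 * drain_tsc converts BURST_TX_DRAIN_US into TSC cycles:
	 * cycles-per-microsecond (rounded up) times the drain interval.
	 * For example, with a hypothetical 2.4 GHz TSC this is roughly
	 * 2400 * 100 = 240000 cycles between forced TX drains.
	 */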
1986 	const uint64_t drain_tsc
1987 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1988 		* BURST_TX_DRAIN_US;
1989 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1990 	unsigned ret;
1991 	const uint16_t lcore_id = rte_lcore_id();
1992 	uint16_t count_in_ring, rx_count = 0;
1993 
1994 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1995 
1996 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1997 	prev_tsc = 0;
1998 
1999 	while (1) {
2000 		cur_tsc = rte_rdtsc();
2001 
2002 		/* TX burst queue drain */
2003 		diff_tsc = cur_tsc - prev_tsc;
2004 		if (unlikely(diff_tsc > drain_tsc)) {
2005 			/*
2006 			 * Get mbuf from vpool.pool and detach mbuf and
2007 			 * put back into vpool.ring.
2008 			 */
2009 			dev_ll = lcore_ll->ll_root_used;
2010 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2011 				/* Get virtio device ID */
2012 				vdev = dev_ll->vdev;
2013 				dev = vdev->dev;
2014 
2015 				if (likely(!vdev->remove)) {
2016 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2017 					if (tx_q->len) {
2018 						LOG_DEBUG(VHOST_DATA,
2019 						"TX queue drained after timeout"
2020 						" with burst size %u\n",
2021 						tx_q->len);
2022 
2023 						/*
2024 						 * Tx any packets in the queue
2025 						 */
2026 						ret = rte_eth_tx_burst(
2027 							ports[0],
2028 							(uint16_t)tx_q->txq_id,
2029 							(struct rte_mbuf **)
2030 							tx_q->m_table,
2031 							(uint16_t)tx_q->len);
2032 						if (unlikely(ret < tx_q->len)) {
2033 							do {
2034 								rte_pktmbuf_free(
2035 									tx_q->m_table[ret]);
2036 							} while (++ret < tx_q->len);
2037 						}
2038 						tx_q->len = 0;
2039 
2040 						txmbuf_clean_zcp(dev,
2041 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2042 					}
2043 				}
2044 				dev_ll = dev_ll->next;
2045 			}
2046 			prev_tsc = cur_tsc;
2047 		}
2048 
2049 		rte_prefetch0(lcore_ll->ll_root_used);
2050 
2051 		/*
2052 		 * Inform the configuration core that we have exited the linked
2053 		 * list and that no devices are in use if requested.
2054 		 */
2055 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2056 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2057 
2058 		/* Process devices */
2059 		dev_ll = lcore_ll->ll_root_used;
2060 
2061 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2062 			vdev = dev_ll->vdev;
2063 			dev  = vdev->dev;
2064 			if (unlikely(vdev->remove)) {
2065 				dev_ll = dev_ll->next;
2066 				unlink_vmdq(vdev);
2067 				vdev->ready = DEVICE_SAFE_REMOVE;
2068 				continue;
2069 			}
2070 
2071 			if (likely(vdev->ready == DEVICE_RX)) {
2072 				uint32_t index = vdev->vmdq_rx_q;
2073 				uint16_t i;
2074 				count_in_ring
2075 				= rte_ring_count(vpool_array[index].ring);
2076 				uint16_t free_entries
2077 				= (uint16_t)get_available_ring_num_zcp(dev);
2078 
2079 				/*
2080 				 * Attach all mbufs in vpool.ring and put back
2081 				 * into vpool.pool.
2082 				 */
2083 				for (i = 0;
2084 				i < RTE_MIN(free_entries,
2085 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2086 				i++)
2087 					attach_rxmbuf_zcp(dev);
2088 
2089 				/* Handle guest RX */
2090 				rx_count = rte_eth_rx_burst(ports[0],
2091 					vdev->vmdq_rx_q, pkts_burst,
2092 					MAX_PKT_BURST);
2093 
2094 				if (rx_count) {
2095 					ret_count = virtio_dev_rx_zcp(dev,
2096 							pkts_burst, rx_count);
2097 					if (enable_stats) {
2098 						dev_statistics[dev->device_fh].rx_total
2099 							+= rx_count;
2100 						dev_statistics[dev->device_fh].rx
2101 							+= ret_count;
2102 					}
2103 					while (likely(rx_count)) {
2104 						rx_count--;
2105 						pktmbuf_detach_zcp(
2106 							pkts_burst[rx_count]);
2107 						rte_ring_sp_enqueue(
2108 							vpool_array[index].ring,
2109 							(void *)pkts_burst[rx_count]);
2110 					}
2111 				}
2112 			}
2113 
2114 			if (likely(!vdev->remove))
2115 				/* Handle guest TX */
2116 				virtio_dev_tx_zcp(dev);
2117 
2118 			/* Move to the next device in the list */
2119 			dev_ll = dev_ll->next;
2120 		}
2121 	}
2122 
2123 	return 0;
2124 }
2125 
2126 
2127 /*
2128  * Add an entry to a used linked list. A free entry must first be found
2129  * in the free linked list using get_data_ll_free_entry();
2130  */
2131 static void
2132 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2133 	struct virtio_net_data_ll *ll_dev)
2134 {
2135 	struct virtio_net_data_ll *ll = *ll_root_addr;
2136 
2137 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2138 	ll_dev->next = NULL;
2139 	rte_compiler_barrier();
2140 
2141 	/* If ll == NULL then this is the first device. */
2142 	if (ll) {
2143 		/* Increment to the tail of the linked list. */
2144 		while ((ll->next != NULL) )
2145 			ll = ll->next;
2146 
2147 		ll->next = ll_dev;
2148 	} else {
2149 		*ll_root_addr = ll_dev;
2150 	}
2151 }
2152 
2153 /*
2154  * Remove an entry from a used linked list. The entry must then be added to
2155  * the free linked list using put_data_ll_free_entry().
2156  */
2157 static void
2158 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2159 	struct virtio_net_data_ll *ll_dev,
2160 	struct virtio_net_data_ll *ll_dev_last)
2161 {
2162 	struct virtio_net_data_ll *ll = *ll_root_addr;
2163 
2164 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2165 		return;
2166 
2167 	if (ll_dev == ll)
2168 		*ll_root_addr = ll_dev->next;
2169 	else
2170 		if (likely(ll_dev_last != NULL))
2171 			ll_dev_last->next = ll_dev->next;
2172 		else
2173 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2174 }
2175 
2176 /*
2177  * Find and return an entry from the free linked list.
2178  */
2179 static struct virtio_net_data_ll *
2180 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2181 {
2182 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2183 	struct virtio_net_data_ll *ll_dev;
2184 
2185 	if (ll_free == NULL)
2186 		return NULL;
2187 
2188 	ll_dev = ll_free;
2189 	*ll_root_addr = ll_free->next;
2190 
2191 	return ll_dev;
2192 }
2193 
2194 /*
2195  * Place an entry back on to the free linked list.
2196  */
2197 static void
2198 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2199 	struct virtio_net_data_ll *ll_dev)
2200 {
2201 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2202 
2203 	if (ll_dev == NULL)
2204 		return;
2205 
2206 	ll_dev->next = ll_free;
2207 	*ll_root_addr = ll_dev;
2208 }
2209 
2210 /*
2211  * Creates a linked list of a given size.
2212  */
2213 static struct virtio_net_data_ll *
2214 alloc_data_ll(uint32_t size)
2215 {
2216 	struct virtio_net_data_ll *ll_new;
2217 	uint32_t i;
2218 
2219 	/* Malloc and then chain the linked list. */
2220 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2221 	if (ll_new == NULL) {
2222 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2223 		return NULL;
2224 	}
2225 
2226 	for (i = 0; i < size - 1; i++) {
2227 		ll_new[i].vdev = NULL;
2228 		ll_new[i].next = &ll_new[i+1];
2229 	}
2230 	ll_new[i].next = NULL;
2231 
2232 	return (ll_new);
2233 }
2234 
2235 /*
2236  * Create the main linked list along with each individual core's linked list. A used and a free list
2237  * are created to manage entries.
2238  */
2239 static int
2240 init_data_ll (void)
2241 {
2242 	int lcore;
2243 
2244 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2245 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2246 		if (lcore_info[lcore].lcore_ll == NULL) {
2247 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2248 			return -1;
2249 		}
2250 
2251 		lcore_info[lcore].lcore_ll->device_num = 0;
2252 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2253 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2254 		if (num_devices % num_switching_cores)
2255 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2256 		else
2257 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2258 	}
2259 
2260 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2261 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2262 
2263 	return 0;
2264 }
2265 
2266 /*
2267  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2268  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2269  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2270  */
2271 static void
2272 destroy_device (volatile struct virtio_net *dev)
2273 {
2274 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2275 	struct virtio_net_data_ll *ll_main_dev_cur;
2276 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2277 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2278 	struct vhost_dev *vdev;
2279 	int lcore;
2280 
2281 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2282 
2283 	vdev = (struct vhost_dev *)dev->priv;
2284 	/* Set the remove flag. */
2285 	vdev->remove = 1;
2286 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2287 		rte_pause();
2288 	}
2289 
2290 	/* Search for entry to be removed from lcore ll */
2291 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2292 	while (ll_lcore_dev_cur != NULL) {
2293 		if (ll_lcore_dev_cur->vdev == vdev) {
2294 			break;
2295 		} else {
2296 			ll_lcore_dev_last = ll_lcore_dev_cur;
2297 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2298 		}
2299 	}
2300 
2301 	if (ll_lcore_dev_cur == NULL) {
2302 		RTE_LOG(ERR, VHOST_CONFIG,
2303 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
2304 			dev->device_fh);
2305 		return;
2306 	}
2307 
2308 	/* Search for entry to be removed from main ll */
2309 	ll_main_dev_cur = ll_root_used;
2310 	ll_main_dev_last = NULL;
2311 	while (ll_main_dev_cur != NULL) {
2312 		if (ll_main_dev_cur->vdev == vdev) {
2313 			break;
2314 		} else {
2315 			ll_main_dev_last = ll_main_dev_cur;
2316 			ll_main_dev_cur = ll_main_dev_cur->next;
2317 		}
2318 	}
2319 
2320 	/* Remove entries from the lcore and main ll. */
2321 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2322 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2323 
2324 	/* Set the dev_removal_flag on each lcore. */
2325 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2326 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2327 	}
2328 
2329 	/*
2330 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2331 	 * they can no longer access the device removed from the linked lists and that the devices
2332 	 * are no longer in use.
2333 	 */
2334 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2335 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2336 			rte_pause();
2337 		}
2338 	}
2339 
2340 	/* Add the entries back to the lcore and main free ll.*/
2341 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2342 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2343 
2344 	/* Decrement number of device on the lcore. */
2345 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2346 
2347 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2348 
2349 	if (zero_copy) {
2350 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2351 
2352 		/* Stop the RX queue. */
2353 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2354 			LOG_DEBUG(VHOST_CONFIG,
2355 				"(%"PRIu64") In destroy_device: Failed to stop "
2356 				"rx queue:%d\n",
2357 				dev->device_fh,
2358 				vdev->vmdq_rx_q);
2359 		}
2360 
2361 		LOG_DEBUG(VHOST_CONFIG,
2362 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2363 			"mempool back to ring for RX queue: %d\n",
2364 			dev->device_fh, vdev->vmdq_rx_q);
2365 
2366 		mbuf_destroy_zcp(vpool);
2367 
2368 		/* Stop the TX queue. */
2369 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2370 			LOG_DEBUG(VHOST_CONFIG,
2371 				"(%"PRIu64") In destroy_device: Failed to "
2372 				"stop tx queue:%d\n",
2373 				dev->device_fh, vdev->vmdq_rx_q);
2374 		}
2375 
2376 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2377 
2378 		LOG_DEBUG(VHOST_CONFIG,
2379 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2380 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2381 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2382 			dev->device_fh);
2383 
2384 		mbuf_destroy_zcp(vpool);
2385 		rte_free(vdev->regions_hpa);
2386 	}
2387 	rte_free(vdev);
2388 
2389 }
2390 
2391 /*
2392  * Count the physical-address discontinuities within one particular region
2393  * whose vhost virtual address range is contiguous, i.e. the number of extra
2394  * sub-regions needed. The region starts at vva_start and spans 'size' bytes.
2395  */
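/*
 * Example (hypothetical layout): a 16 KB region backed by 4 KB pages whose
 * host physical addresses are contiguous except for one gap returns 1,
 * i.e. one extra sub-region on top of the base region count.
 */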
2396 static uint32_t
2397 check_hpa_regions(uint64_t vva_start, uint64_t size)
2398 {
2399 	uint32_t i, nregions = 0, page_size = getpagesize();
2400 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2401 	if (vva_start % page_size) {
2402 		LOG_DEBUG(VHOST_CONFIG,
2403 			"in check_countinous: vva start(%p) mod page_size(%d) "
2404 			"has remainder\n",
2405 			(void *)(uintptr_t)vva_start, page_size);
2406 		return 0;
2407 	}
2408 	if (size % page_size) {
2409 		LOG_DEBUG(VHOST_CONFIG,
2410 			"in check_countinous: "
2411 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2412 			size, page_size);
2413 		return 0;
2414 	}
2415 	for (i = 0; i < size - page_size; i = i + page_size) {
2416 		cur_phys_addr
2417 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2418 		next_phys_addr = rte_mem_virt2phy(
2419 			(void *)(uintptr_t)(vva_start + i + page_size));
2420 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2421 			++nregions;
2422 			LOG_DEBUG(VHOST_CONFIG,
2423 				"in check_continuous: hva addr:(%p) is not "
2424 				"continuous with hva addr:(%p), diff:%d\n",
2425 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2426 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2427 				+ page_size), page_size);
2428 			LOG_DEBUG(VHOST_CONFIG,
2429 				"in check_continuous: hpa addr:(%p) is not "
2430 				"continuous with hpa addr:(%p), "
2431 				"diff:(%"PRIu64")\n",
2432 				(void *)(uintptr_t)cur_phys_addr,
2433 				(void *)(uintptr_t)next_phys_addr,
2434 				(next_phys_addr-cur_phys_addr));
2435 		}
2436 	}
2437 	return nregions;
2438 }
2439 
2440 /*
2441  * Divide each region whose vhost virtual address range is contiguous into
2442  * sub-regions such that the physical addresses within each sub-region are
2443  * contiguous, and fill the offset (to GPA), size and other information of
2444  * each sub-region into regions_hpa.
2445  */
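/*
 * The walk below compares the host physical address of each page with that of
 * the next page; whenever they are not adjacent, the current sub-region is
 * closed (its end address and size recorded) and a new one is opened starting
 * at the next page.
 */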
2446 static uint32_t
2447 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2448 {
2449 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2450 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2451 
2452 	if (mem_region_hpa == NULL)
2453 		return 0;
2454 
2455 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2456 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2457 			virtio_memory->regions[regionidx].address_offset;
2458 		mem_region_hpa[regionidx_hpa].guest_phys_address
2459 			= virtio_memory->regions[regionidx].guest_phys_address;
2460 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2461 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2462 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2463 		LOG_DEBUG(VHOST_CONFIG,
2464 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2465 			regionidx_hpa,
2466 			(void *)(uintptr_t)
2467 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2468 		LOG_DEBUG(VHOST_CONFIG,
2469 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2470 			regionidx_hpa,
2471 			(void *)(uintptr_t)
2472 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2473 		for (i = 0, k = 0;
2474 			i < virtio_memory->regions[regionidx].memory_size -
2475 				page_size;
2476 			i += page_size) {
2477 			cur_phys_addr = rte_mem_virt2phy(
2478 					(void *)(uintptr_t)(vva_start + i));
2479 			next_phys_addr = rte_mem_virt2phy(
2480 					(void *)(uintptr_t)(vva_start +
2481 					i + page_size));
2482 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2483 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2484 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2485 					k + page_size;
2486 				mem_region_hpa[regionidx_hpa].memory_size
2487 					= k + page_size;
2488 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2489 					"phys addr end  [%d]:(%p)\n",
2490 					regionidx_hpa,
2491 					(void *)(uintptr_t)
2492 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2493 				LOG_DEBUG(VHOST_CONFIG,
2494 					"in fill_hpa_regions: guest phys addr "
2495 					"size [%d]:(%p)\n",
2496 					regionidx_hpa,
2497 					(void *)(uintptr_t)
2498 					(mem_region_hpa[regionidx_hpa].memory_size));
2499 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2500 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2501 				++regionidx_hpa;
2502 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2503 					next_phys_addr -
2504 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2505 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2506 					" phys addr start[%d]:(%p)\n",
2507 					regionidx_hpa,
2508 					(void *)(uintptr_t)
2509 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2510 				LOG_DEBUG(VHOST_CONFIG,
2511 					"in fill_hpa_regions: host  phys addr "
2512 					"start[%d]:(%p)\n",
2513 					regionidx_hpa,
2514 					(void *)(uintptr_t)
2515 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2516 				k = 0;
2517 			} else {
2518 				k += page_size;
2519 			}
2520 		}
2521 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2522 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2523 			+ k + page_size;
2524 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2525 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2526 			"[%d]:(%p)\n", regionidx_hpa,
2527 			(void *)(uintptr_t)
2528 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2529 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2530 			"[%d]:(%p)\n", regionidx_hpa,
2531 			(void *)(uintptr_t)
2532 			(mem_region_hpa[regionidx_hpa].memory_size));
2533 		++regionidx_hpa;
2534 	}
2535 	return regionidx_hpa;
2536 }
2537 
2538 /*
2539  * A new device is added to a data core. First the device is added to the main linked list
2540  * and then allocated to a specific data core.
2541  */
2542 static int
2543 new_device (struct virtio_net *dev)
2544 {
2545 	struct virtio_net_data_ll *ll_dev;
2546 	int lcore, core_add = 0;
2547 	uint32_t device_num_min = num_devices;
2548 	struct vhost_dev *vdev;
2549 	uint32_t regionidx;
2550 
2551 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2552 	if (vdev == NULL) {
2553 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2554 			dev->device_fh);
2555 		return -1;
2556 	}
2557 	vdev->dev = dev;
2558 	dev->priv = vdev;
2559 
2560 	if (zero_copy) {
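		/*
		 * Start from the guest's region count and add one extra
		 * sub-region for every host-physical discontinuity reported
		 * by check_hpa_regions().
		 */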
2561 		vdev->nregions_hpa = dev->mem->nregions;
2562 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2563 			vdev->nregions_hpa
2564 				+= check_hpa_regions(
2565 					dev->mem->regions[regionidx].guest_phys_address
2566 					+ dev->mem->regions[regionidx].address_offset,
2567 					dev->mem->regions[regionidx].memory_size);
2568 
2569 		}
2570 
2571 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2572 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2573 			RTE_CACHE_LINE_SIZE);
2574 		if (vdev->regions_hpa == NULL) {
2575 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2576 			rte_free(vdev);
2577 			return -1;
2578 		}
2579 
2580 
2581 		if (fill_hpa_memory_regions(
2582 			vdev->regions_hpa, dev->mem
2583 			) != vdev->nregions_hpa) {
2584 
2585 			RTE_LOG(ERR, VHOST_CONFIG,
2586 				"hpa memory regions number mismatch: "
2587 				"[%d]\n", vdev->nregions_hpa);
2588 			rte_free(vdev->regions_hpa);
2589 			rte_free(vdev);
2590 			return -1;
2591 		}
2592 	}
2593 
2594 
2595 	/* Add device to main ll */
2596 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2597 	if (ll_dev == NULL) {
2598 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2599 			"of %d devices per core has been reached\n",
2600 			dev->device_fh, num_devices);
2601 		if (vdev->regions_hpa)
2602 			rte_free(vdev->regions_hpa);
2603 		rte_free(vdev);
2604 		return -1;
2605 	}
2606 	ll_dev->vdev = vdev;
2607 	add_data_ll_entry(&ll_root_used, ll_dev);
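	/*
	 * Map the device to the base RX queue of its own VMDq pool: the pool
	 * is selected by the device fh, scaled by queues_per_pool and offset
	 * by vmdq_queue_base.
	 */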
2608 	vdev->vmdq_rx_q
2609 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2610 
2611 	if (zero_copy) {
2612 		uint32_t index = vdev->vmdq_rx_q;
2613 		uint32_t count_in_ring, i;
2614 		struct mbuf_table *tx_q;
2615 
2616 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2617 
2618 		LOG_DEBUG(VHOST_CONFIG,
2619 			"(%"PRIu64") in new_device: mbuf count in mempool "
2620 			"before attach is: %d\n",
2621 			dev->device_fh,
2622 			rte_mempool_count(vpool_array[index].pool));
2623 		LOG_DEBUG(VHOST_CONFIG,
2624 			"(%"PRIu64") in new_device: mbuf count in  ring "
2625 			"before attach  is : %d\n",
2626 			dev->device_fh, count_in_ring);
2627 
2628 		/*
2629 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2630 		 */
2631 		for (i = 0; i < count_in_ring; i++)
2632 			attach_rxmbuf_zcp(dev);
2633 
2634 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2635 			"mempool after attach is: %d\n",
2636 			dev->device_fh,
2637 			rte_mempool_count(vpool_array[index].pool));
2638 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2639 			"ring after attach  is : %d\n",
2640 			dev->device_fh,
2641 			rte_ring_count(vpool_array[index].ring));
2642 
2643 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2644 		tx_q->txq_id = vdev->vmdq_rx_q;
2645 
2646 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2647 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2648 
2649 			LOG_DEBUG(VHOST_CONFIG,
2650 				"(%"PRIu64") In new_device: Failed to start "
2651 				"tx queue:%d\n",
2652 				dev->device_fh, vdev->vmdq_rx_q);
2653 
2654 			mbuf_destroy_zcp(vpool);
2655 			rte_free(vdev->regions_hpa);
2656 			rte_free(vdev);
2657 			return -1;
2658 		}
2659 
2660 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2661 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2662 
2663 			LOG_DEBUG(VHOST_CONFIG,
2664 				"(%"PRIu64") In new_device: Failed to start "
2665 				"rx queue:%d\n",
2666 				dev->device_fh, vdev->vmdq_rx_q);
2667 
2668 			/* Stop the TX queue. */
2669 			if (rte_eth_dev_tx_queue_stop(ports[0],
2670 				vdev->vmdq_rx_q) != 0) {
2671 				LOG_DEBUG(VHOST_CONFIG,
2672 					"(%"PRIu64") In new_device: Failed to "
2673 					"stop tx queue:%d\n",
2674 					dev->device_fh, vdev->vmdq_rx_q);
2675 			}
2676 
2677 			mbuf_destroy_zcp(vpool);
2678 			rte_free(vdev->regions_hpa);
2679 			rte_free(vdev);
2680 			return -1;
2681 		}
2682 
2683 	}
2684 
2685 	/* Reset the ready flag. */
2686 	vdev->ready = DEVICE_MAC_LEARNING;
2687 	vdev->remove = 0;
2688 
2689 	/* Find a suitable lcore to add the device. */
2690 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2691 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2692 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2693 			core_add = lcore;
2694 		}
2695 	}
2696 	/* Add device to lcore ll */
2697 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2698 	if (ll_dev == NULL) {
2699 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2700 		vdev->ready = DEVICE_SAFE_REMOVE;
2701 		destroy_device(dev);
2702 		if (vdev->regions_hpa)
2703 			rte_free(vdev->regions_hpa);
2704 		rte_free(vdev);
2705 		return -1;
2706 	}
2707 	ll_dev->vdev = vdev;
2708 	vdev->coreid = core_add;
2709 
2710 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2711 
2712 	/* Initialize device stats */
2713 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2714 
2715 	/* Disable notifications. */
2716 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2717 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2718 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2719 	dev->flags |= VIRTIO_DEV_RUNNING;
2720 
2721 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2722 
2723 	return 0;
2724 }
2725 
2726 /*
2727  * These callbacks allow devices to be added to the data core when configuration
2728  * has been fully completed.
2729  */
2730 static const struct virtio_net_device_ops virtio_net_device_ops =
2731 {
2732 	.new_device =  new_device,
2733 	.destroy_device = destroy_device,
2734 };
2735 
2736 /*
2737  * This thread wakes up periodically to print statistics if the user has
2738  * enabled them.
2739  */
2740 static void
2741 print_stats(void)
2742 {
2743 	struct virtio_net_data_ll *dev_ll;
2744 	uint64_t tx_dropped, rx_dropped;
2745 	uint64_t tx, tx_total, rx, rx_total;
2746 	uint32_t device_fh;
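	/*
	 * clr and top_left below are the ANSI escape sequences ESC[2J
	 * (clear screen) and ESC[1;1H (move cursor to the top-left corner).
	 */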
2747 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2748 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2749 
2750 	while(1) {
2751 		sleep(enable_stats);
2752 
2753 		/* Clear screen and move to top left */
2754 		printf("%s%s", clr, top_left);
2755 
2756 		printf("\nDevice statistics ====================================");
2757 
2758 		dev_ll = ll_root_used;
2759 		while (dev_ll != NULL) {
2760 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2761 			tx_total = dev_statistics[device_fh].tx_total;
2762 			tx = dev_statistics[device_fh].tx;
2763 			tx_dropped = tx_total - tx;
2764 			if (zero_copy == 0) {
2765 				rx_total = rte_atomic64_read(
2766 					&dev_statistics[device_fh].rx_total_atomic);
2767 				rx = rte_atomic64_read(
2768 					&dev_statistics[device_fh].rx_atomic);
2769 			} else {
2770 				rx_total = dev_statistics[device_fh].rx_total;
2771 				rx = dev_statistics[device_fh].rx;
2772 			}
2773 			rx_dropped = rx_total - rx;
2774 
2775 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2776 					"\nTX total: 		%"PRIu64""
2777 					"\nTX dropped: 		%"PRIu64""
2778 					"\nTX successful: 		%"PRIu64""
2779 					"\nRX total: 		%"PRIu64""
2780 					"\nRX dropped: 		%"PRIu64""
2781 					"\nRX successful: 		%"PRIu64"",
2782 					device_fh,
2783 					tx_total,
2784 					tx_dropped,
2785 					tx,
2786 					rx_total,
2787 					rx_dropped,
2788 					rx);
2789 
2790 			dev_ll = dev_ll->next;
2791 		}
2792 		printf("\n======================================================\n");
2793 	}
2794 }
2795 
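/*
 * Create the mbuf mempool and the companion ring used by one zero-copy queue.
 * The ring holds the mbufs that are currently detached from guest buffers;
 * buf_size records the usable room per descriptor (headroom excluded).
 */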
2796 static void
2797 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2798 	char *ring_name, uint32_t nb_mbuf)
2799 {
2800 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2801 	vpool_array[index].pool
2802 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2803 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2804 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2805 		rte_pktmbuf_init, NULL, socket, 0);
2806 	if (vpool_array[index].pool != NULL) {
2807 		vpool_array[index].ring
2808 			= rte_ring_create(ring_name,
2809 				rte_align32pow2(nb_mbuf + 1),
2810 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2811 		if (likely(vpool_array[index].ring != NULL)) {
2812 			LOG_DEBUG(VHOST_CONFIG,
2813 				"in setup_mempool_tbl: mbuf count in "
2814 				"mempool is: %d\n",
2815 				rte_mempool_count(vpool_array[index].pool));
2816 			LOG_DEBUG(VHOST_CONFIG,
2817 				"in setup_mempool_tbl: mbuf count in "
2818 				"ring   is: %d\n",
2819 				rte_ring_count(vpool_array[index].ring));
2820 		} else {
2821 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2822 				ring_name);
2823 		}
2824 
2825 		/* Need to account for the headroom. */
2826 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2827 	} else {
2828 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2829 	}
2830 }
2831 
2832 
2833 /*
2834  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2835  * device is also registered here to handle the IOCTLs.
2836  */
2837 int
2838 main(int argc, char *argv[])
2839 {
2840 	struct rte_mempool *mbuf_pool = NULL;
2841 	unsigned lcore_id, core_id = 0;
2842 	unsigned nb_ports, valid_num_ports;
2843 	int ret;
2844 	uint8_t portid;
2845 	uint16_t queue_id;
2846 	static pthread_t tid;
2847 
2848 	/* init EAL */
2849 	ret = rte_eal_init(argc, argv);
2850 	if (ret < 0)
2851 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2852 	argc -= ret;
2853 	argv += ret;
2854 
2855 	/* parse app arguments */
2856 	ret = us_vhost_parse_args(argc, argv);
2857 	if (ret < 0)
2858 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2859 
2860 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2861 		if (rte_lcore_is_enabled(lcore_id))
2862 			lcore_ids[core_id ++] = lcore_id;
2863 
2864 	if (rte_lcore_count() > RTE_MAX_LCORE)
2865 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2866 
2867 	/* Set the number of switching cores available. */
2868 	num_switching_cores = rte_lcore_count()-1;
2869 
2870 	/* Get the number of physical ports. */
2871 	nb_ports = rte_eth_dev_count();
2872 	if (nb_ports > RTE_MAX_ETHPORTS)
2873 		nb_ports = RTE_MAX_ETHPORTS;
2874 
2875 	/*
2876 	 * Update the global var NUM_PORTS and global array PORTS,
2877 	 * and get the value of var VALID_NUM_PORTS according to the system port count.
2878 	 */
2879 	valid_num_ports = check_ports_num(nb_ports);
2880 
2881 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2882 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2883 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2884 		return -1;
2885 	}
2886 
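	/*
	 * Copy mode uses a single shared mbuf pool; zero-copy mode instead
	 * creates one mempool plus one ring per RX queue and per TX queue
	 * (TX entries are offset by MAX_QUEUES in vpool_array).
	 */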
2887 	if (zero_copy == 0) {
2888 		/* Create the mbuf pool. */
2889 		mbuf_pool = rte_mempool_create(
2890 				"MBUF_POOL",
2891 				NUM_MBUFS_PER_PORT
2892 				* valid_num_ports,
2893 				MBUF_SIZE, MBUF_CACHE_SIZE,
2894 				sizeof(struct rte_pktmbuf_pool_private),
2895 				rte_pktmbuf_pool_init, NULL,
2896 				rte_pktmbuf_init, NULL,
2897 				rte_socket_id(), 0);
2898 		if (mbuf_pool == NULL)
2899 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2900 
2901 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2902 			vpool_array[queue_id].pool = mbuf_pool;
2903 
2904 		if (vm2vm_mode == VM2VM_HARDWARE) {
2905 			/* Enable VT loopback so the L2 switch handles VM2VM forwarding. */
2906 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2907 			LOG_DEBUG(VHOST_CONFIG,
2908 				"Enable loop back for L2 switch in vmdq.\n");
2909 		}
2910 	} else {
2911 		uint32_t nb_mbuf;
2912 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2913 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2914 
2915 		nb_mbuf = num_rx_descriptor
2916 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2917 			+ num_switching_cores * MAX_PKT_BURST;
2918 
2919 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2920 			snprintf(pool_name, sizeof(pool_name),
2921 				"rxmbuf_pool_%u", queue_id);
2922 			snprintf(ring_name, sizeof(ring_name),
2923 				"rxmbuf_ring_%u", queue_id);
2924 			setup_mempool_tbl(rte_socket_id(), queue_id,
2925 				pool_name, ring_name, nb_mbuf);
2926 		}
2927 
2928 		nb_mbuf = num_tx_descriptor
2929 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2930 				+ num_switching_cores * MAX_PKT_BURST;
2931 
2932 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2933 			snprintf(pool_name, sizeof(pool_name),
2934 				"txmbuf_pool_%u", queue_id);
2935 			snprintf(ring_name, sizeof(ring_name),
2936 				"txmbuf_ring_%u", queue_id);
2937 			setup_mempool_tbl(rte_socket_id(),
2938 				(queue_id + MAX_QUEUES),
2939 				pool_name, ring_name, nb_mbuf);
2940 		}
2941 
2942 		if (vm2vm_mode == VM2VM_HARDWARE) {
2943 			/* Enable VT loopback so the L2 switch handles VM2VM forwarding. */
2944 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2945 			LOG_DEBUG(VHOST_CONFIG,
2946 				"Enable loop back for L2 switch in vmdq.\n");
2947 		}
2948 	}
2949 	/* Set log level. */
2950 	rte_set_log_level(LOG_LEVEL);
2951 
2952 	/* initialize all ports */
2953 	for (portid = 0; portid < nb_ports; portid++) {
2954 		/* skip ports that are not enabled */
2955 		if ((enabled_port_mask & (1 << portid)) == 0) {
2956 			RTE_LOG(INFO, VHOST_PORT,
2957 				"Skipping disabled port %d\n", portid);
2958 			continue;
2959 		}
2960 		if (port_init(portid) != 0)
2961 			rte_exit(EXIT_FAILURE,
2962 				"Cannot initialize network ports\n");
2963 	}
2964 
2965 	/* Initialise all linked lists. */
2966 	if (init_data_ll() == -1)
2967 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2968 
2969 	/* Initialize device stats */
2970 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2971 
2972 	/* Enable stats if the user option is set. */
2973 	if (enable_stats)
2974 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2975 
2976 	/* Launch all data cores. */
2977 	if (zero_copy == 0) {
2978 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2979 			rte_eal_remote_launch(switch_worker,
2980 				mbuf_pool, lcore_id);
2981 		}
2982 	} else {
2983 		uint32_t count_in_mempool, index, i;
2984 		for (index = 0; index < 2*MAX_QUEUES; index++) {
2985 			/* For all RX and TX queues. */
2986 			count_in_mempool
2987 				= rte_mempool_count(vpool_array[index].pool);
2988 
2989 			/*
2990 			 * Transfer all un-attached mbufs from vpool.pool
2991 			 * to vpool.ring so the zero-copy data path can dequeue them later.
2992 			 */
2993 			for (i = 0; i < count_in_mempool; i++) {
2994 				struct rte_mbuf *mbuf
2995 					= __rte_mbuf_raw_alloc(
2996 						vpool_array[index].pool);
2997 				rte_ring_sp_enqueue(vpool_array[index].ring,
2998 						(void *)mbuf);
2999 			}
3000 
3001 			LOG_DEBUG(VHOST_CONFIG,
3002 				"in main: mbuf count in mempool at initial "
3003 				"is: %d\n", count_in_mempool);
3004 			LOG_DEBUG(VHOST_CONFIG,
3005 				"in main: mbuf count in  ring at initial  is :"
3006 				" %d\n",
3007 				rte_ring_count(vpool_array[index].ring));
3008 		}
3009 
3010 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3011 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3012 				lcore_id);
3013 	}
3014 
3015 	if (mergeable == 0)
3016 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3017 
3018 	/* Register CUSE device to handle IOCTLs. */
3019 	ret = rte_vhost_driver_register((char *)&dev_basename);
3020 	if (ret != 0)
3021 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3022 
3023 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3024 
3025 	/* Start CUSE session. */
3026 	rte_vhost_driver_session_start();
3027 	return 0;
3028 
3029 }
3030 
3031