xref: /dpdk/examples/vhost/main.c (revision 8bd6c395a568ef7a70c64fbef0968c456bed6d29)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
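
/*
 * Note: num_switching_cores, MAX_PKT_BURST and MBUF_CACHE_SIZE are defined
 * further down in this file; that is fine because the macro above is only
 * expanded at its points of use, after those definitions are visible.
 */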
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * For the zero copy implementation no frame data buffers are allocated
74  * from the host; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These two macros need refining for the legacy and DPDK based front ends:
105  * take the max vring avail descriptors/entries from the guest minus
106  * MAX_PKT_BURST, and then round to a power of 2.
107  */
108 /*
109  * For the legacy front end, 128 descriptors:
110  * half for virtio headers, the other half for mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
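/* Note: this also evaluates true for x == 0, e.g. POWEROF2(64) and POWEROF2(0) are both true. */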
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
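/*
 * ether_addr_cmp() below loads each address as a 64-bit value; this mask
 * keeps only the low 48 bits so the two bytes beyond the 6-byte MAC are
 * ignored in the comparison.
 */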
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
141 
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144 
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147 
148 /* Number of switching cores enabled. */
149 static uint32_t num_switching_cores = 0;
150 
151 /* Number of devices/queues to support. */
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154 
155 /*
156  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
157  * descriptors. Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161 
162 /* Number of descriptors to use. */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165 
166 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
167 #define MAX_RING_DESC 4096
168 
169 struct vpool {
170 	struct rte_mempool *pool;
171 	struct rte_ring *ring;
172 	uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174 
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177 	VM2VM_DISABLED = 0,
178 	VM2VM_SOFTWARE = 1,
179 	VM2VM_HARDWARE = 2,
180 	VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
183 
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186 	PHYS_ADDR_CONTINUOUS = 0,
187 	PHYS_ADDR_CROSS_SUBREG = 1,
188 	PHYS_ADDR_INVALID = 2,
189 	PHYS_ADDR_LAST
190 } hpa_type;
191 
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify the timeout (in microseconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200 
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203 
204 /* Empty VMDQ configuration structure. Filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206 	.rxmode = {
207 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208 		.split_hdr_size = 0,
209 		.header_split   = 0, /**< Header Split disabled */
210 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
211 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
212 		/*
213 		 * This is necessary for 1G NICs such as the I350;
214 		 * it fixes a bug where IPv4 forwarding in the guest cannot
215 		 * forward packets from one virtio device to another.
216 		 */
217 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
220 	},
221 
222 	.txmode = {
223 		.mq_mode = ETH_MQ_TX_NONE,
224 	},
225 	.rx_adv_conf = {
226 		/*
227 		 * should be overridden separately in code with
228 		 * appropriate values
229 		 */
230 		.vmdq_rx_conf = {
231 			.nb_queue_pools = ETH_8_POOLS,
232 			.enable_default_pool = 0,
233 			.default_pool = 0,
234 			.nb_pool_maps = 0,
235 			.pool_map = {{0, 0},},
236 		},
237 	},
238 };
239 
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246 
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
251 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
258 
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261 
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265 
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268 
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271 	unsigned len;
272 	unsigned txq_id;
273 	struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275 
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278 
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281 
282 /* Vlan header struct used to insert vlan tags on TX. */
283 struct vlan_ethhdr {
284 	unsigned char   h_dest[ETH_ALEN];
285 	unsigned char   h_source[ETH_ALEN];
286 	__be16          h_vlan_proto;
287 	__be16          h_vlan_TCI;
288 	__be16          h_vlan_encapsulated_proto;
289 };
290 
291 /* IPv4 Header */
292 struct ipv4_hdr {
293 	uint8_t  version_ihl;		/**< version and header length */
294 	uint8_t  type_of_service;	/**< type of service */
295 	uint16_t total_length;		/**< length of packet */
296 	uint16_t packet_id;		/**< packet ID */
297 	uint16_t fragment_offset;	/**< fragmentation offset */
298 	uint8_t  time_to_live;		/**< time to live */
299 	uint8_t  next_proto_id;		/**< protocol ID */
300 	uint16_t hdr_checksum;		/**< header checksum */
301 	uint32_t src_addr;		/**< source address */
302 	uint32_t dst_addr;		/**< destination address */
303 } __attribute__((__packed__));
304 
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
308 
309 /* Per-device statistics struct */
310 struct device_statistics {
311 	uint64_t tx_total;
312 	rte_atomic64_t rx_total_atomic;
313 	uint64_t rx_total;
314 	uint64_t tx;
315 	rte_atomic64_t rx_atomic;
316 	uint64_t rx;
317 } __rte_cache_aligned;
318 struct device_statistics dev_statistics[MAX_DEVICES];
319 
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327 	struct rte_eth_vmdq_rx_conf conf;
328 	struct rte_eth_vmdq_rx_conf *def_conf =
329 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330 	unsigned i;
331 
332 	memset(&conf, 0, sizeof(conf));
333 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334 	conf.nb_pool_maps = num_devices;
335 	conf.enable_loop_back = def_conf->enable_loop_back;
336 	conf.rx_mode = def_conf->rx_mode;
337 
338 	for (i = 0; i < conf.nb_pool_maps; i++) {
339 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
340 		conf.pool_map[i].pools = (1UL << i);
341 	}
342 
343 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346 	return 0;
347 }
348 
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, print an error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357 	if (num_devices > max_nb_devices) {
358 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359 		return -1;
360 	}
361 	return 0;
362 }
363 
364 /*
365  * Initialises a given port using global settings and with the RX buffers
366  * coming from the mbuf_pool passed as a parameter.
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371 	struct rte_eth_dev_info dev_info;
372 	struct rte_eth_conf port_conf;
373 	struct rte_eth_rxconf *rxconf;
374 	struct rte_eth_txconf *txconf;
375 	int16_t rx_rings, tx_rings;
376 	uint16_t rx_ring_size, tx_ring_size;
377 	int retval;
378 	uint16_t q;
379 
380 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381 	rte_eth_dev_info_get (port, &dev_info);
382 
383 	if (dev_info.max_rx_queues > MAX_QUEUES) {
384 		rte_exit(EXIT_FAILURE,
385 			"please define MAX_QUEUES no less than %u in %s\n",
386 			dev_info.max_rx_queues, __FILE__);
387 	}
388 
389 	rxconf = &dev_info.default_rxconf;
390 	txconf = &dev_info.default_txconf;
391 	rxconf->rx_drop_en = 1;
392 
393 	/*
394 	 * Zero copy defers queue RX/TX start to the time when guest
395 	 * finishes its startup and packet buffers from that guest are
396 	 * available.
397 	 */
398 	if (zero_copy) {
399 		rxconf->rx_deferred_start = 1;
400 		rxconf->rx_drop_en = 0;
401 		txconf->tx_deferred_start = 1;
402 	}
403 
404 	/*configure the number of supported virtio devices based on VMDQ limits */
405 	num_devices = dev_info.max_vmdq_pools;
406 
407 	if (zero_copy) {
408 		rx_ring_size = num_rx_descriptor;
409 		tx_ring_size = num_tx_descriptor;
410 		tx_rings = dev_info.max_tx_queues;
411 	} else {
412 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
413 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
414 		tx_rings = (uint16_t)rte_lcore_count();
415 	}
416 
417 	retval = validate_num_devices(MAX_DEVICES);
418 	if (retval < 0)
419 		return retval;
420 
421 	/* Get port configuration. */
422 	retval = get_eth_conf(&port_conf, num_devices);
423 	if (retval < 0)
424 		return retval;
425 	/* NIC queues are divided into pf queues and vmdq queues.  */
426 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
427 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
428 	num_vmdq_queues = num_devices * queues_per_pool;
429 	num_queues = num_pf_queues + num_vmdq_queues;
430 	vmdq_queue_base = dev_info.vmdq_queue_base;
431 	vmdq_pool_base  = dev_info.vmdq_pool_base;
432 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
433 		num_pf_queues, num_devices, queues_per_pool);
434 
435 	if (port >= rte_eth_dev_count()) return -1;
436 
437 	rx_rings = (uint16_t)dev_info.max_rx_queues;
438 	/* Configure ethernet device. */
439 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
440 	if (retval != 0)
441 		return retval;
442 
443 	/* Setup the queues. */
444 	for (q = 0; q < rx_rings; q ++) {
445 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
446 						rte_eth_dev_socket_id(port),
447 						rxconf,
448 						vpool_array[q].pool);
449 		if (retval < 0)
450 			return retval;
451 	}
452 	for (q = 0; q < tx_rings; q ++) {
453 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
454 						rte_eth_dev_socket_id(port),
455 						txconf);
456 		if (retval < 0)
457 			return retval;
458 	}
459 
460 	/* Start the device. */
461 	retval  = rte_eth_dev_start(port);
462 	if (retval < 0) {
463 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
464 		return retval;
465 	}
466 
467 	if (promiscuous)
468 		rte_eth_promiscuous_enable(port);
469 
470 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
471 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
472 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
473 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
474 			(unsigned)port,
475 			vmdq_ports_eth_addr[port].addr_bytes[0],
476 			vmdq_ports_eth_addr[port].addr_bytes[1],
477 			vmdq_ports_eth_addr[port].addr_bytes[2],
478 			vmdq_ports_eth_addr[port].addr_bytes[3],
479 			vmdq_ports_eth_addr[port].addr_bytes[4],
480 			vmdq_ports_eth_addr[port].addr_bytes[5]);
481 
482 	return 0;
483 }
484 
485 /*
486  * Set character device basename.
487  */
488 static int
489 us_vhost_parse_basename(const char *q_arg)
490 {
491 	/* parse number string */
492 
493 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
494 		return -1;
495 	else
496 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
497 
498 	return 0;
499 }
500 
501 /*
502  * Parse the portmask provided at run time.
503  */
504 static int
505 parse_portmask(const char *portmask)
506 {
507 	char *end = NULL;
508 	unsigned long pm;
509 
510 	errno = 0;
511 
512 	/* parse hexadecimal string */
513 	pm = strtoul(portmask, &end, 16);
514 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
515 		return -1;
516 
517 	if (pm == 0)
518 		return -1;
519 
520 	return pm;
521 
522 }
523 
524 /*
525  * Parse num options at run time.
526  */
527 static int
528 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
529 {
530 	char *end = NULL;
531 	unsigned long num;
532 
533 	errno = 0;
534 
535 	/* parse unsigned int string */
536 	num = strtoul(q_arg, &end, 10);
537 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
538 		return -1;
539 
540 	if (num > max_valid_value)
541 		return -1;
542 
543 	return num;
544 
545 }
546 
547 /*
548  * Display usage
549  */
550 static void
551 us_vhost_usage(const char *prgname)
552 {
553 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
554 	"		--vm2vm [0|1|2]\n"
555 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
556 	"		--dev-basename <name>\n"
557 	"		--nb-devices ND\n"
558 	"		-p PORTMASK: Set mask for ports to be used by application\n"
559 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
560 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
561 	"		--rx-retry-delay [0-N]: timeout(in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
562 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
563 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
564 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
565 	"		--dev-basename: The basename to be used for the character device.\n"
566 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
567 			"zero copy\n"
568 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
569 			"used only when zero copy is enabled.\n"
570 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
571 			"used only when zero copy is enabled.\n",
572 	       prgname);
573 }
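
/*
 * Illustrative invocation only (the binary name and the EAL core/memory
 * options are assumptions and depend on the build and target system):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 --stats 2
 */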
574 
575 /*
576  * Parse the arguments given in the command line of the application.
577  */
578 static int
579 us_vhost_parse_args(int argc, char **argv)
580 {
581 	int opt, ret;
582 	int option_index;
583 	unsigned i;
584 	const char *prgname = argv[0];
585 	static struct option long_option[] = {
586 		{"vm2vm", required_argument, NULL, 0},
587 		{"rx-retry", required_argument, NULL, 0},
588 		{"rx-retry-delay", required_argument, NULL, 0},
589 		{"rx-retry-num", required_argument, NULL, 0},
590 		{"mergeable", required_argument, NULL, 0},
591 		{"stats", required_argument, NULL, 0},
592 		{"dev-basename", required_argument, NULL, 0},
593 		{"zero-copy", required_argument, NULL, 0},
594 		{"rx-desc-num", required_argument, NULL, 0},
595 		{"tx-desc-num", required_argument, NULL, 0},
596 		{NULL, 0, 0, 0},
597 	};
598 
599 	/* Parse command line */
600 	while ((opt = getopt_long(argc, argv, "p:P",
601 			long_option, &option_index)) != EOF) {
602 		switch (opt) {
603 		/* Portmask */
604 		case 'p':
605 			enabled_port_mask = parse_portmask(optarg);
606 			if (enabled_port_mask == 0) {
607 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 				us_vhost_usage(prgname);
609 				return -1;
610 			}
611 			break;
612 
613 		case 'P':
614 			promiscuous = 1;
615 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
616 				ETH_VMDQ_ACCEPT_BROADCAST |
617 				ETH_VMDQ_ACCEPT_MULTICAST;
618 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
619 
620 			break;
621 
622 		case 0:
623 			/* Enable/disable vm2vm comms. */
624 			if (!strncmp(long_option[option_index].name, "vm2vm",
625 				MAX_LONG_OPT_SZ)) {
626 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
627 				if (ret == -1) {
628 					RTE_LOG(INFO, VHOST_CONFIG,
629 						"Invalid argument for "
630 						"vm2vm [0|1|2]\n");
631 					us_vhost_usage(prgname);
632 					return -1;
633 				} else {
634 					vm2vm_mode = (vm2vm_type)ret;
635 				}
636 			}
637 
638 			/* Enable/disable retries on RX. */
639 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
640 				ret = parse_num_opt(optarg, 1);
641 				if (ret == -1) {
642 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
643 					us_vhost_usage(prgname);
644 					return -1;
645 				} else {
646 					enable_retry = ret;
647 				}
648 			}
649 
650 			/* Specify the retry delay time (in microseconds) on RX. */
651 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
652 				ret = parse_num_opt(optarg, INT32_MAX);
653 				if (ret == -1) {
654 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
655 					us_vhost_usage(prgname);
656 					return -1;
657 				} else {
658 					burst_rx_delay_time = ret;
659 				}
660 			}
661 
662 			/* Specify the number of retries on RX. */
663 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
664 				ret = parse_num_opt(optarg, INT32_MAX);
665 				if (ret == -1) {
666 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
667 					us_vhost_usage(prgname);
668 					return -1;
669 				} else {
670 					burst_rx_retry_num = ret;
671 				}
672 			}
673 
674 			/* Enable/disable RX mergeable buffers. */
675 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
676 				ret = parse_num_opt(optarg, 1);
677 				if (ret == -1) {
678 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
679 					us_vhost_usage(prgname);
680 					return -1;
681 				} else {
682 					mergeable = !!ret;
683 					if (ret) {
684 						vmdq_conf_default.rxmode.jumbo_frame = 1;
685 						vmdq_conf_default.rxmode.max_rx_pkt_len
686 							= JUMBO_FRAME_MAX_SIZE;
687 					}
688 				}
689 			}
690 
691 			/* Enable/disable stats. */
692 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
693 				ret = parse_num_opt(optarg, INT32_MAX);
694 				if (ret == -1) {
695 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
696 					us_vhost_usage(prgname);
697 					return -1;
698 				} else {
699 					enable_stats = ret;
700 				}
701 			}
702 
703 			/* Set character device basename. */
704 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
705 				if (us_vhost_parse_basename(optarg) == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
707 					us_vhost_usage(prgname);
708 					return -1;
709 				}
710 			}
711 
712 			/* Enable/disable rx/tx zero copy. */
713 			if (!strncmp(long_option[option_index].name,
714 				"zero-copy", MAX_LONG_OPT_SZ)) {
715 				ret = parse_num_opt(optarg, 1);
716 				if (ret == -1) {
717 					RTE_LOG(INFO, VHOST_CONFIG,
718 						"Invalid argument"
719 						" for zero-copy [0|1]\n");
720 					us_vhost_usage(prgname);
721 					return -1;
722 				} else
723 					zero_copy = ret;
724 
725 				if (zero_copy) {
726 #ifdef RTE_MBUF_REFCNT
727 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
728 					"zero copy vhost APP, please "
729 					"disable RTE_MBUF_REFCNT\n"
730 					"in config file and then rebuild DPDK "
731 					"core lib!\n"
732 					"Otherwise please disable zero copy "
733 					"flag in command line!\n");
734 					return -1;
735 #endif
736 				}
737 			}
738 
739 			/* Specify the descriptor number on RX. */
740 			if (!strncmp(long_option[option_index].name,
741 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
742 				ret = parse_num_opt(optarg, MAX_RING_DESC);
743 				if ((ret == -1) || (!POWEROF2(ret))) {
744 					RTE_LOG(INFO, VHOST_CONFIG,
745 					"Invalid argument for rx-desc-num [0-N], "
746 					"power of 2 required.\n");
747 					us_vhost_usage(prgname);
748 					return -1;
749 				} else {
750 					num_rx_descriptor = ret;
751 				}
752 			}
753 
754 			/* Specify the descriptor number on TX. */
755 			if (!strncmp(long_option[option_index].name,
756 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
757 				ret = parse_num_opt(optarg, MAX_RING_DESC);
758 				if ((ret == -1) || (!POWEROF2(ret))) {
759 					RTE_LOG(INFO, VHOST_CONFIG,
760 					"Invalid argument for tx-desc-num [0-N], "
761 					"power of 2 required.\n");
762 					us_vhost_usage(prgname);
763 					return -1;
764 				} else {
765 					num_tx_descriptor = ret;
766 				}
767 			}
768 
769 			break;
770 
771 			/* Invalid option - print options. */
772 		default:
773 			us_vhost_usage(prgname);
774 			return -1;
775 		}
776 	}
777 
778 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
779 		if (enabled_port_mask & (1 << i))
780 			ports[num_ports++] = (uint8_t)i;
781 	}
782 
783 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
784 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
785 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
786 		return -1;
787 	}
788 
789 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
790 		RTE_LOG(INFO, VHOST_PORT,
791 			"Vhost zero copy doesn't support software vm2vm, "
792 			"please specify '--vm2vm 2' to use hardware vm2vm.\n");
793 		return -1;
794 	}
795 
796 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
797 		RTE_LOG(INFO, VHOST_PORT,
798 			"Vhost zero copy doesn't support jumbo frames, "
799 			"please specify '--mergeable 0' to disable the "
800 			"mergeable feature.\n");
801 		return -1;
802 	}
803 
804 	return 0;
805 }
806 
807 /*
808  * Update the global variable num_ports and the array ports according to the
809  * number of system ports, and return the number of valid ports.
810  */
811 static unsigned check_ports_num(unsigned nb_ports)
812 {
813 	unsigned valid_num_ports = num_ports;
814 	unsigned portid;
815 
816 	if (num_ports > nb_ports) {
817 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
818 			num_ports, nb_ports);
819 		num_ports = nb_ports;
820 	}
821 
822 	for (portid = 0; portid < num_ports; portid ++) {
823 		if (ports[portid] >= nb_ports) {
824 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
825 				ports[portid], (nb_ports - 1));
826 			ports[portid] = INVALID_PORT_ID;
827 			valid_num_ports--;
828 		}
829 	}
830 	return valid_num_ports;
831 }
832 
833 /*
834  * Macro to print out packet contents. Wrapped in a debug define so that the
835  * data path is not affected when debug is disabled.
836  */
837 #ifdef DEBUG
838 #define PRINT_PACKET(device, addr, size, header) do {																\
839 	char *pkt_addr = (char*)(addr);																					\
840 	unsigned int index;																								\
841 	char packet[MAX_PRINT_BUFF];																					\
842 																													\
843 	if ((header))																									\
844 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
845 	else																											\
846 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
847 	for (index = 0; index < (size); index++) {																		\
848 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
849 			"%02hhx ", pkt_addr[index]);																			\
850 	}																												\
851 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
852 																													\
853 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
854 } while(0)
855 #else
856 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
857 #endif
858 
859 /*
860  * Function to convert guest physical addresses to vhost physical addresses.
861  * This is used to convert virtio buffer addresses.
862  */
863 static inline uint64_t __attribute__((always_inline))
864 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
865 	uint32_t buf_len, hpa_type *addr_type)
866 {
867 	struct virtio_memory_regions_hpa *region;
868 	uint32_t regionidx;
869 	uint64_t vhost_pa = 0;
870 
871 	*addr_type = PHYS_ADDR_INVALID;
872 
873 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
874 		region = &vdev->regions_hpa[regionidx];
875 		if ((guest_pa >= region->guest_phys_address) &&
876 			(guest_pa <= region->guest_phys_address_end)) {
877 			vhost_pa = region->host_phys_addr_offset + guest_pa;
878 			if (likely((guest_pa + buf_len - 1)
879 				<= region->guest_phys_address_end))
880 				*addr_type = PHYS_ADDR_CONTINUOUS;
881 			else
882 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
883 			break;
884 		}
885 	}
886 
887 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
888 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
889 		(void *)(uintptr_t)vhost_pa);
890 
891 	return vhost_pa;
892 }
893 
894 /*
895  * Compares a packet destination MAC address to a device MAC address.
896  */
897 static inline int __attribute__((always_inline))
898 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
899 {
900 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
901 }
902 
903 /*
904  * This function learns the MAC address of the device and registers it along
905  * with a vlan tag to a VMDQ pool.
906  */
907 static int
908 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
909 {
910 	struct ether_hdr *pkt_hdr;
911 	struct virtio_net_data_ll *dev_ll;
912 	struct virtio_net *dev = vdev->dev;
913 	int i, ret;
914 
915 	/* Learn MAC address of guest device from packet */
916 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
917 
918 	dev_ll = ll_root_used;
919 
920 	while (dev_ll != NULL) {
921 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
922 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
923 			return -1;
924 		}
925 		dev_ll = dev_ll->next;
926 	}
927 
928 	for (i = 0; i < ETHER_ADDR_LEN; i++)
929 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
930 
931 	/* vlan_tag currently uses the device_id. */
932 	vdev->vlan_tag = vlan_tags[dev->device_fh];
933 
934 	/* Print out VMDQ registration info. */
935 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
936 		dev->device_fh,
937 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
938 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
939 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
940 		vdev->vlan_tag);
941 
942 	/* Register the MAC address. */
943 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
944 				(uint32_t)dev->device_fh + vmdq_pool_base);
945 	if (ret)
946 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
947 					dev->device_fh);
948 
949 	/* Enable stripping of the vlan tag as we handle routing. */
950 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
951 
952 	/* Set device as ready for RX. */
953 	vdev->ready = DEVICE_RX;
954 
955 	return 0;
956 }
957 
958 /*
959  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
960  * queue before disabling RX on the device.
961  */
962 static inline void
963 unlink_vmdq(struct vhost_dev *vdev)
964 {
965 	unsigned i = 0;
966 	unsigned rx_count;
967 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
968 
969 	if (vdev->ready == DEVICE_RX) {
970 		/*clear MAC and VLAN settings*/
971 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
972 		for (i = 0; i < 6; i++)
973 			vdev->mac_address.addr_bytes[i] = 0;
974 
975 		vdev->vlan_tag = 0;
976 
977 		/*Clear out the receive buffers*/
978 		rx_count = rte_eth_rx_burst(ports[0],
979 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980 
981 		while (rx_count) {
982 			for (i = 0; i < rx_count; i++)
983 				rte_pktmbuf_free(pkts_burst[i]);
984 
985 			rx_count = rte_eth_rx_burst(ports[0],
986 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
987 		}
988 
989 		vdev->ready = DEVICE_MAC_LEARNING;
990 	}
991 }
992 
993 /*
994  * Check if the packet destination MAC address is for a local device. If so then put
995  * the packet on that device's RX queue. If not then return.
996  */
997 static inline int __attribute__((always_inline))
998 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
999 {
1000 	struct virtio_net_data_ll *dev_ll;
1001 	struct ether_hdr *pkt_hdr;
1002 	uint64_t ret = 0;
1003 	struct virtio_net *dev = vdev->dev;
1004 	struct virtio_net *tdev; /* destination virtio device */
1005 
1006 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1007 
1008 	/*get the used devices list*/
1009 	dev_ll = ll_root_used;
1010 
1011 	while (dev_ll != NULL) {
1012 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1013 				          &dev_ll->vdev->mac_address)) {
1014 
1015 			/* Drop the packet if the TX packet is destined for the TX device. */
1016 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1017 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1018 							dev->device_fh);
1019 				return 0;
1020 			}
1021 			tdev = dev_ll->vdev->dev;
1022 
1023 
1024 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1025 
1026 			if (unlikely(dev_ll->vdev->remove)) {
1027 				/*drop the packet if the device is marked for removal*/
1028 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1029 			} else {
1030 				/*send the packet to the local virtio device*/
1031 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1032 				if (enable_stats) {
1033 					rte_atomic64_add(
1034 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1035 					1);
1036 					rte_atomic64_add(
1037 					&dev_statistics[tdev->device_fh].rx_atomic,
1038 					ret);
1039 					dev_statistics[tdev->device_fh].tx_total++;
1040 					dev_statistics[tdev->device_fh].tx += ret;
1041 				}
1042 			}
1043 
1044 			return 0;
1045 		}
1046 		dev_ll = dev_ll->next;
1047 	}
1048 
1049 	return -1;
1050 }
1051 
1052 /*
1053  * Check if the destination MAC of a packet belongs to a local VM,
1054  * and if it does, get its vlan tag and the required length offset.
1055  */
1056 static inline int __attribute__((always_inline))
1057 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1058 	uint32_t *offset, uint16_t *vlan_tag)
1059 {
1060 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1061 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1062 
1063 	while (dev_ll != NULL) {
1064 		if ((dev_ll->vdev->ready == DEVICE_RX)
1065 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1066 		&dev_ll->vdev->mac_address)) {
1067 			/*
1068 			 * Drop the packet if the TX packet is
1069 			 * destined for the TX device.
1070 			 */
1071 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1072 				LOG_DEBUG(VHOST_DATA,
1073 				"(%"PRIu64") TX: Source and destination"
1074 				" MAC addresses are the same. Dropping "
1075 				"packet.\n",
1076 				dev_ll->vdev->dev->device_fh);
1077 				return -1;
1078 			}
1079 
1080 			/*
1081 			 * HW vlan strip will reduce the packet length
1082 			 * by the length of the vlan tag, so restore
1083 			 * the packet length by adding it back.
1084 			 */
1085 			*offset = VLAN_HLEN;
1086 			*vlan_tag =
1087 			(uint16_t)
1088 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1089 
1090 			LOG_DEBUG(VHOST_DATA,
1091 			"(%"PRIu64") TX: pkt to local VM device id:"
1092 			"(%"PRIu64") vlan tag: %d.\n",
1093 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1094 			vlan_tag);
1095 
1096 			break;
1097 		}
1098 		dev_ll = dev_ll->next;
1099 	}
1100 	return 0;
1101 }
1102 
1103 /*
1104  * This function routes the TX packet to the correct interface. This may be a local device
1105  * or the physical port.
1106  */
1107 static inline void __attribute__((always_inline))
1108 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1109 {
1110 	struct mbuf_table *tx_q;
1111 	struct rte_mbuf **m_table;
1112 	unsigned len, ret, offset = 0;
1113 	const uint16_t lcore_id = rte_lcore_id();
1114 	struct virtio_net *dev = vdev->dev;
1115 
1116 	/*check if destination is local VM*/
1117 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1118 		rte_pktmbuf_free(m);
1119 		return;
1120 	}
1121 
1122 	if (vm2vm_mode == VM2VM_HARDWARE) {
1123 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1124 			offset > rte_pktmbuf_tailroom(m)) {
1125 			rte_pktmbuf_free(m);
1126 			return;
1127 		}
1128 	}
1129 
1130 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1131 
1132 	/*Add packet to the port tx queue*/
1133 	tx_q = &lcore_tx_queue[lcore_id];
1134 	len = tx_q->len;
1135 
1136 	m->ol_flags = PKT_TX_VLAN_PKT;
1137 
1138 	m->data_len += offset;
1139 	m->pkt_len += offset;
1140 
1141 	m->vlan_tci = vlan_tag;
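	/*
	 * With PKT_TX_VLAN_PKT set above, the NIC inserts the VLAN tag from
	 * m->vlan_tci on transmit, so the tag is not written into the frame here.
	 */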
1142 
1143 	tx_q->m_table[len] = m;
1144 	len++;
1145 	if (enable_stats) {
1146 		dev_statistics[dev->device_fh].tx_total++;
1147 		dev_statistics[dev->device_fh].tx++;
1148 	}
1149 
1150 	if (unlikely(len == MAX_PKT_BURST)) {
1151 		m_table = (struct rte_mbuf **)tx_q->m_table;
1152 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1153 		/* Free any buffers not handled by TX and update the port stats. */
1154 		if (unlikely(ret < len)) {
1155 			do {
1156 				rte_pktmbuf_free(m_table[ret]);
1157 			} while (++ret < len);
1158 		}
1159 
1160 		len = 0;
1161 	}
1162 
1163 	tx_q->len = len;
1164 	return;
1165 }
1166 /*
1167  * This function is called by each data core. It handles all RX/TX registered with the
1168  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1169  * with all devices in the main linked list.
1170  */
1171 static int
1172 switch_worker(__attribute__((unused)) void *arg)
1173 {
1174 	struct rte_mempool *mbuf_pool = arg;
1175 	struct virtio_net *dev = NULL;
1176 	struct vhost_dev *vdev = NULL;
1177 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1178 	struct virtio_net_data_ll *dev_ll;
1179 	struct mbuf_table *tx_q;
1180 	volatile struct lcore_ll_info *lcore_ll;
1181 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
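	/*
	 * drain_tsc converts BURST_TX_DRAIN_US into TSC cycles: TSC cycles per
	 * microsecond (rounded up) multiplied by the drain interval in microseconds.
	 */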
1182 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1183 	unsigned ret, i;
1184 	const uint16_t lcore_id = rte_lcore_id();
1185 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1186 	uint16_t rx_count = 0;
1187 	uint16_t tx_count;
1188 	uint32_t retry = 0;
1189 
1190 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1191 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1192 	prev_tsc = 0;
1193 
1194 	tx_q = &lcore_tx_queue[lcore_id];
1195 	for (i = 0; i < num_cores; i ++) {
1196 		if (lcore_ids[i] == lcore_id) {
1197 			tx_q->txq_id = i;
1198 			break;
1199 		}
1200 	}
1201 
1202 	while(1) {
1203 		cur_tsc = rte_rdtsc();
1204 		/*
1205 		 * TX burst queue drain
1206 		 */
1207 		diff_tsc = cur_tsc - prev_tsc;
1208 		if (unlikely(diff_tsc > drain_tsc)) {
1209 
1210 			if (tx_q->len) {
1211 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1212 
1213 				/*Tx any packets in the queue*/
1214 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1215 									   (struct rte_mbuf **)tx_q->m_table,
1216 									   (uint16_t)tx_q->len);
1217 				if (unlikely(ret < tx_q->len)) {
1218 					do {
1219 						rte_pktmbuf_free(tx_q->m_table[ret]);
1220 					} while (++ret < tx_q->len);
1221 				}
1222 
1223 				tx_q->len = 0;
1224 			}
1225 
1226 			prev_tsc = cur_tsc;
1227 
1228 		}
1229 
1230 		rte_prefetch0(lcore_ll->ll_root_used);
1231 		/*
1232 		 * Inform the configuration core that we have exited the linked list and that no devices are
1233 		 * in use if requested.
1234 		 */
1235 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1236 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1237 
1238 		/*
1239 		 * Process devices
1240 		 */
1241 		dev_ll = lcore_ll->ll_root_used;
1242 
1243 		while (dev_ll != NULL) {
1244 			/*get virtio device ID*/
1245 			vdev = dev_ll->vdev;
1246 			dev = vdev->dev;
1247 
1248 			if (unlikely(vdev->remove)) {
1249 				dev_ll = dev_ll->next;
1250 				unlink_vmdq(vdev);
1251 				vdev->ready = DEVICE_SAFE_REMOVE;
1252 				continue;
1253 			}
1254 			if (likely(vdev->ready == DEVICE_RX)) {
1255 				/*Handle guest RX*/
1256 				rx_count = rte_eth_rx_burst(ports[0],
1257 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1258 
1259 				if (rx_count) {
1260 					/*
1261 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1262 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1263 					*/
1264 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1265 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1266 							rte_delay_us(burst_rx_delay_time);
1267 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1268 								break;
1269 						}
1270 					}
1271 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1272 					if (enable_stats) {
1273 						rte_atomic64_add(
1274 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1275 						rx_count);
1276 						rte_atomic64_add(
1277 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1278 					}
1279 					while (likely(rx_count)) {
1280 						rx_count--;
1281 						rte_pktmbuf_free(pkts_burst[rx_count]);
1282 					}
1283 
1284 				}
1285 			}
1286 
1287 			if (likely(!vdev->remove)) {
1288 				/* Handle guest TX*/
1289 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1290 				/* If this is the first received packet we need to learn the MAC and set up VMDQ */
1291 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1292 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1293 						while (tx_count--)
1294 							rte_pktmbuf_free(pkts_burst[tx_count]);
1295 					}
1296 				}
1297 				while (tx_count)
1298 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1299 			}
1300 
1301 			/*move to the next device in the list*/
1302 			dev_ll = dev_ll->next;
1303 		}
1304 	}
1305 
1306 	return 0;
1307 }
1308 
1309 /*
1310  * This function gets the number of available ring entries for zero copy rx.
1311  * Only one thread will call this function for a particular virtio device,
1312  * so it is designed as a non-thread-safe function.
1313  */
1314 static inline uint32_t __attribute__((always_inline))
1315 get_available_ring_num_zcp(struct virtio_net *dev)
1316 {
1317 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1318 	uint16_t avail_idx;
1319 
1320 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1321 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1322 }
1323 
1324 /*
1325  * This function gets the available ring index for zero copy rx;
1326  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1327  * Only one thread will call this function for a particular virtio device,
1328  * so it is designed as a non-thread-safe function.
1329  */
1330 static inline uint32_t __attribute__((always_inline))
1331 get_available_ring_index_zcp(struct virtio_net *dev,
1332 	uint16_t *res_base_idx, uint32_t count)
1333 {
1334 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1335 	uint16_t avail_idx;
1336 	uint32_t retry = 0;
1337 	uint16_t free_entries;
1338 
1339 	*res_base_idx = vq->last_used_idx_res;
1340 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1341 	free_entries = (avail_idx - *res_base_idx);
1342 
1343 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1344 			"avail idx: %d, "
1345 			"res base idx:%d, free entries:%d\n",
1346 			dev->device_fh, avail_idx, *res_base_idx,
1347 			free_entries);
1348 
1349 	/*
1350 	 * If retry is enabled and the queue is full then we wait
1351 	 * and retry to avoid packet loss.
1352 	 */
1353 	if (enable_retry && unlikely(count > free_entries)) {
1354 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1355 			rte_delay_us(burst_rx_delay_time);
1356 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1357 			free_entries = (avail_idx - *res_base_idx);
1358 			if (count <= free_entries)
1359 				break;
1360 		}
1361 	}
1362 
1363 	/*check that we have enough buffers*/
1364 	if (unlikely(count > free_entries))
1365 		count = free_entries;
1366 
1367 	if (unlikely(count == 0)) {
1368 		LOG_DEBUG(VHOST_DATA,
1369 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1370 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1371 			dev->device_fh, avail_idx,
1372 			*res_base_idx, free_entries);
1373 		return 0;
1374 	}
1375 
1376 	vq->last_used_idx_res = *res_base_idx + count;
1377 
1378 	return count;
1379 }
1380 
1381 /*
1382  * This function puts a descriptor back on the used list.
1383  */
1384 static inline void __attribute__((always_inline))
1385 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1386 {
1387 	uint16_t res_cur_idx = vq->last_used_idx;
1388 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1389 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1390 	rte_compiler_barrier();
1391 	*(volatile uint16_t *)&vq->used->idx += 1;
1392 	vq->last_used_idx += 1;
1393 
1394 	/* Kick the guest if necessary. */
1395 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1396 		eventfd_write((int)vq->kickfd, 1);
1397 }
1398 
1399 /*
1400  * This function gets an available descriptor from the virtio vring and an
1401  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1402  * to adjust the offset for buff_addr and phys_addr according to the PMD
1403  * implementation, otherwise the frame data may be put at the wrong location in the mbuf.
1404  */
1405 static inline void __attribute__((always_inline))
1406 attach_rxmbuf_zcp(struct virtio_net *dev)
1407 {
1408 	uint16_t res_base_idx, desc_idx;
1409 	uint64_t buff_addr, phys_addr;
1410 	struct vhost_virtqueue *vq;
1411 	struct vring_desc *desc;
1412 	struct rte_mbuf *mbuf = NULL;
1413 	struct vpool *vpool;
1414 	hpa_type addr_type;
1415 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1416 
1417 	vpool = &vpool_array[vdev->vmdq_rx_q];
1418 	vq = dev->virtqueue[VIRTIO_RXQ];
1419 
1420 	do {
1421 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1422 				1) != 1))
1423 			return;
1424 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1425 
1426 		desc = &vq->desc[desc_idx];
1427 		if (desc->flags & VRING_DESC_F_NEXT) {
1428 			desc = &vq->desc[desc->next];
1429 			buff_addr = gpa_to_vva(dev, desc->addr);
1430 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1431 					&addr_type);
1432 		} else {
1433 			buff_addr = gpa_to_vva(dev,
1434 					desc->addr + vq->vhost_hlen);
1435 			phys_addr = gpa_to_hpa(vdev,
1436 					desc->addr + vq->vhost_hlen,
1437 					desc->len, &addr_type);
1438 		}
1439 
1440 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1441 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1442 				" address found when attaching RX frame buffer"
1443 				" address!\n", dev->device_fh);
1444 			put_desc_to_used_list_zcp(vq, desc_idx);
1445 			continue;
1446 		}
1447 
1448 		/*
1449 		 * Check whether the frame buffer address from the guest crosses
1450 		 * a sub-region or not.
1451 		 */
1452 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1453 			RTE_LOG(ERR, VHOST_DATA,
1454 				"(%"PRIu64") Frame buffer address crossing "
1455 				"sub-region found when attaching RX frame "
1456 				"buffer address!\n",
1457 				dev->device_fh);
1458 			put_desc_to_used_list_zcp(vq, desc_idx);
1459 			continue;
1460 		}
1461 	} while (unlikely(phys_addr == 0));
1462 
1463 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1464 	if (unlikely(mbuf == NULL)) {
1465 		LOG_DEBUG(VHOST_DATA,
1466 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1467 			"ring_sc_dequeue fail.\n",
1468 			dev->device_fh);
1469 		put_desc_to_used_list_zcp(vq, desc_idx);
1470 		return;
1471 	}
1472 
1473 	if (unlikely(vpool->buf_size > desc->len)) {
1474 		LOG_DEBUG(VHOST_DATA,
1475 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1476 			"length(%d) of descriptor idx: %d less than room "
1477 			"size required: %d\n",
1478 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1479 		put_desc_to_used_list_zcp(vq, desc_idx);
1480 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1481 		return;
1482 	}
1483 
1484 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1485 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1486 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1487 	mbuf->data_len = desc->len;
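	/*
	 * Stash the guest descriptor index in the mbuf headroom so it can be
	 * written back to the used ring once the buffer has been processed.
	 */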
1488 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1489 
1490 	LOG_DEBUG(VHOST_DATA,
1491 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1492 		"descriptor idx:%d\n",
1493 		dev->device_fh, res_base_idx, desc_idx);
1494 
1495 	__rte_mbuf_raw_free(mbuf);
1496 
1497 	return;
1498 }
1499 
1500 /*
1501  * Detach an attached packet mbuf -
1502  *  - restore original mbuf address and length values.
1503  *  - reset pktmbuf data and data_len to their default values.
1504  *  All other fields of the given packet mbuf will be left intact.
1505  *
1506  * @param m
1507  *   The attached packet mbuf.
1508  */
1509 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1510 {
1511 	const struct rte_mempool *mp = m->pool;
1512 	void *buf = RTE_MBUF_TO_BADDR(m);
1513 	uint32_t buf_ofs;
1514 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1515 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1516 
1517 	m->buf_addr = buf;
1518 	m->buf_len = (uint16_t)buf_len;
1519 
1520 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1521 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1522 	m->data_off = buf_ofs;
1523 
1524 	m->data_len = 0;
1525 }
1526 
1527 /*
1528  * This function is called after packets have been transmitted. It fetches each mbuf
1529  * from vpool->pool, detaches it and puts it into vpool->ring. It also updates the
1530  * used index and kicks the guest if necessary.
1531  */
1532 static inline uint32_t __attribute__((always_inline))
1533 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1534 {
1535 	struct rte_mbuf *mbuf;
1536 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1537 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1538 	uint32_t index = 0;
1539 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1540 
1541 	LOG_DEBUG(VHOST_DATA,
1542 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1543 		"clean is: %d\n",
1544 		dev->device_fh, mbuf_count);
1545 	LOG_DEBUG(VHOST_DATA,
1546 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1547 		"clean  is : %d\n",
1548 		dev->device_fh, rte_ring_count(vpool->ring));
1549 
1550 	for (index = 0; index < mbuf_count; index++) {
1551 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1552 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1553 			pktmbuf_detach_zcp(mbuf);
1554 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1555 
1556 		/* Update used index buffer information. */
1557 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1558 		vq->used->ring[used_idx].len = 0;
1559 
1560 		used_idx = (used_idx + 1) & (vq->size - 1);
1561 	}
1562 
1563 	LOG_DEBUG(VHOST_DATA,
1564 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1565 		"clean is: %d\n",
1566 		dev->device_fh, rte_mempool_count(vpool->pool));
1567 	LOG_DEBUG(VHOST_DATA,
1568 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1569 		"clean  is : %d\n",
1570 		dev->device_fh, rte_ring_count(vpool->ring));
1571 	LOG_DEBUG(VHOST_DATA,
1572 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1573 		"vq->last_used_idx:%d\n",
1574 		dev->device_fh, vq->last_used_idx);
1575 
1576 	vq->last_used_idx += mbuf_count;
1577 
1578 	LOG_DEBUG(VHOST_DATA,
1579 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1580 		"vq->last_used_idx:%d\n",
1581 		dev->device_fh, vq->last_used_idx);
1582 
1583 	rte_compiler_barrier();
1584 
1585 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1586 
1587 	/* Kick guest if required. */
1588 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1589 		eventfd_write((int)vq->kickfd, 1);
1590 
1591 	return 0;
1592 }
1593 
1594 /*
1595  * This function is called when a virtio device is destroyed.
1596  * It fetches each mbuf from vpool->pool, detaches it, and puts it into vpool->ring.
1597  */
1598 static void mbuf_destroy_zcp(struct vpool *vpool)
1599 {
1600 	struct rte_mbuf *mbuf = NULL;
1601 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1602 
1603 	LOG_DEBUG(VHOST_CONFIG,
1604 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1605 		"mbuf_destroy_zcp is: %d\n",
1606 		mbuf_count);
1607 	LOG_DEBUG(VHOST_CONFIG,
1608 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1609 		"mbuf_destroy_zcp  is : %d\n",
1610 		rte_ring_count(vpool->ring));
1611 
1612 	for (index = 0; index < mbuf_count; index++) {
1613 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1614 		if (likely(mbuf != NULL)) {
1615 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1616 				pktmbuf_detach_zcp(mbuf);
1617 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1618 		}
1619 	}
1620 
1621 	LOG_DEBUG(VHOST_CONFIG,
1622 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1623 		"mbuf_destroy_zcp is: %d\n",
1624 		rte_mempool_count(vpool->pool));
1625 	LOG_DEBUG(VHOST_CONFIG,
1626 		"in mbuf_destroy_zcp: mbuf count in ring after "
1627 		"mbuf_destroy_zcp is : %d\n",
1628 		rte_ring_count(vpool->ring));
1629 }
1630 
1631 /*
1632  * This function writes the virtio headers and updates the used ring for zero copy RX.
1633  */
1634 static inline uint32_t __attribute__((always_inline))
1635 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1636 	uint32_t count)
1637 {
1638 	struct vhost_virtqueue *vq;
1639 	struct vring_desc *desc;
1640 	struct rte_mbuf *buff;
1641 	/* The virtio_hdr is initialised to 0. */
1642 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1643 		= {{0, 0, 0, 0, 0, 0}, 0};
1644 	uint64_t buff_hdr_addr = 0;
1645 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1646 	uint32_t head_idx, packet_success = 0;
1647 	uint16_t res_cur_idx;
1648 
1649 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1650 
1651 	if (count == 0)
1652 		return 0;
1653 
1654 	vq = dev->virtqueue[VIRTIO_RXQ];
1655 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1656 
1657 	res_cur_idx = vq->last_used_idx;
1658 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1659 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1660 
1661 	/* Retrieve all of the head indexes first to avoid caching issues. */
1662 	for (head_idx = 0; head_idx < count; head_idx++)
1663 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1664 
1665 	/*Prefetch descriptor index. */
1666 	rte_prefetch0(&vq->desc[head[packet_success]]);
1667 
1668 	while (packet_success != count) {
1669 		/* Get descriptor from available ring */
1670 		desc = &vq->desc[head[packet_success]];
1671 
1672 		buff = pkts[packet_success];
1673 		LOG_DEBUG(VHOST_DATA,
1674 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1675 			"pkt[%d] descriptor idx: %d\n",
1676 			dev->device_fh, packet_success,
1677 			MBUF_HEADROOM_UINT32(buff));
1678 
1679 		PRINT_PACKET(dev,
1680 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1681 			+ RTE_PKTMBUF_HEADROOM),
1682 			rte_pktmbuf_data_len(buff), 0);
1683 
1684 		/* Buffer address translation for virtio header. */
1685 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1686 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1687 
1688 		/*
1689 		 * If the descriptors are chained the header and data are
1690 		 * placed in separate buffers.
1691 		 */
1692 		if (desc->flags & VRING_DESC_F_NEXT) {
1693 			desc->len = vq->vhost_hlen;
1694 			desc = &vq->desc[desc->next];
1695 			desc->len = rte_pktmbuf_data_len(buff);
1696 		} else {
1697 			desc->len = packet_len;
1698 		}
1699 
1700 		/* Update used ring with desc information */
1701 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1702 			= head[packet_success];
1703 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1704 			= packet_len;
1705 		res_cur_idx++;
1706 		packet_success++;
1707 
1708 		/* A header is required per buffer. */
1709 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1710 			(const void *)&virtio_hdr, vq->vhost_hlen);
1711 
1712 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1713 
1714 		if (likely(packet_success < count)) {
1715 			/* Prefetch descriptor index. */
1716 			rte_prefetch0(&vq->desc[head[packet_success]]);
1717 		}
1718 	}
1719 
1720 	rte_compiler_barrier();
1721 
1722 	LOG_DEBUG(VHOST_DATA,
1723 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1724 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1725 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1726 
1727 	*(volatile uint16_t *)&vq->used->idx += count;
1728 	vq->last_used_idx += count;
1729 
1730 	LOG_DEBUG(VHOST_DATA,
1731 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1732 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1733 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1734 
1735 	/* Kick the guest if necessary. */
1736 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1737 		eventfd_write((int)vq->kickfd, 1);
1738 
1739 	return count;
1740 }
1741 
1742 /*
1743  * This function routes the TX packet to the correct interface.
1744  * This may be a local device or the physical port.
1745  */
1746 static inline void __attribute__((always_inline))
1747 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1748 	uint32_t desc_idx, uint8_t need_copy)
1749 {
1750 	struct mbuf_table *tx_q;
1751 	struct rte_mbuf **m_table;
1752 	struct rte_mbuf *mbuf = NULL;
1753 	unsigned len, ret, offset = 0;
1754 	struct vpool *vpool;
1755 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1756 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1757 
1758 	/* Add packet to the port tx queue. */
1759 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1760 	len = tx_q->len;
1761 
1762 	/* Allocate an mbuf and populate the structure. */
1763 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1764 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1765 	if (unlikely(mbuf == NULL)) {
1766 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1767 		RTE_LOG(ERR, VHOST_DATA,
1768 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1769 			dev->device_fh);
1770 		put_desc_to_used_list_zcp(vq, desc_idx);
1771 		return;
1772 	}
1773 
1774 	if (vm2vm_mode == VM2VM_HARDWARE) {
1775 		/* Avoid using a vlan tag from any vm for an external pkt, such as
1776 		 * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1777 		 * selection: the MAC address identifies it as an external pkt that
1778 		 * should go out to the network, while the vlan tag identifies it as
1779 		 * a vm2vm pkt that should be forwarded to another vm. The hardware
1780 		 * cannot resolve such an ambiguous situation, so the pkt would be lost.
1781 		 */
1782 		vlan_tag = external_pkt_default_vlan_tag;
1783 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1784 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1785 			__rte_mbuf_raw_free(mbuf);
1786 			return;
1787 		}
1788 	}
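	/*
	 * In the common zero copy case the new mbuf simply aliases the guest
	 * buffer (same buf_addr/buf_physaddr); a copy is only made below when
	 * need_copy is set, i.e. the guest buffer crosses a host physical
	 * sub-region boundary.
	 */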
1789 
1790 	mbuf->nb_segs = m->nb_segs;
1791 	mbuf->next = m->next;
1792 	mbuf->data_len = m->data_len + offset;
1793 	mbuf->pkt_len = mbuf->data_len;
1794 	if (unlikely(need_copy)) {
1795 		/* Copy the packet contents to the mbuf. */
1796 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1797 			rte_pktmbuf_mtod(m, void *),
1798 			m->data_len);
1799 	} else {
1800 		mbuf->data_off = m->data_off;
1801 		mbuf->buf_physaddr = m->buf_physaddr;
1802 		mbuf->buf_addr = m->buf_addr;
1803 	}
1804 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1805 	mbuf->vlan_tci = vlan_tag;
1806 	mbuf->l2_len = sizeof(struct ether_hdr);
1807 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1808 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1809 
1810 	tx_q->m_table[len] = mbuf;
1811 	len++;
1812 
1813 	LOG_DEBUG(VHOST_DATA,
1814 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1815 		dev->device_fh,
1816 		mbuf->nb_segs,
1817 		(mbuf->next == NULL) ? "null" : "non-null");
1818 
1819 	if (enable_stats) {
1820 		dev_statistics[dev->device_fh].tx_total++;
1821 		dev_statistics[dev->device_fh].tx++;
1822 	}
1823 
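	/*
	 * Flush once a full burst has accumulated; mbufs the NIC did not
	 * accept are freed and the TX used ring is cleaned afterwards.
	 */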
1824 	if (unlikely(len == MAX_PKT_BURST)) {
1825 		m_table = (struct rte_mbuf **)tx_q->m_table;
1826 		ret = rte_eth_tx_burst(ports[0],
1827 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1828 
1829 		/*
1830 		 * Free any buffers not handled by TX and update
1831 		 * the port stats.
1832 		 */
1833 		if (unlikely(ret < len)) {
1834 			do {
1835 				rte_pktmbuf_free(m_table[ret]);
1836 			} while (++ret < len);
1837 		}
1838 
1839 		len = 0;
1840 		txmbuf_clean_zcp(dev, vpool);
1841 	}
1842 
1843 	tx_q->len = len;
1844 
1845 	return;
1846 }
1847 
1848 /*
1849  * This function transmits all available packets in the virtio TX queue for
1850  * one virtio-net device. If it is the first packet, it learns the MAC
1851  * address and sets up VMDQ.
1852  */
1853 static inline void __attribute__((always_inline))
1854 virtio_dev_tx_zcp(struct virtio_net *dev)
1855 {
1856 	struct rte_mbuf m;
1857 	struct vhost_virtqueue *vq;
1858 	struct vring_desc *desc;
1859 	uint64_t buff_addr = 0, phys_addr;
1860 	uint32_t head[MAX_PKT_BURST];
1861 	uint32_t i;
1862 	uint16_t free_entries, packet_success = 0;
1863 	uint16_t avail_idx;
1864 	uint8_t need_copy = 0;
1865 	hpa_type addr_type;
1866 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1867 
1868 	vq = dev->virtqueue[VIRTIO_TXQ];
1869 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1870 
1871 	/* If there are no available buffers then return. */
1872 	if (vq->last_used_idx_res == avail_idx)
1873 		return;
1874 
1875 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1876 
1877 	/* Prefetch available ring to retrieve head indexes. */
1878 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1879 
1880 	/* Get the number of free entries in the ring */
1881 	free_entries = (avail_idx - vq->last_used_idx_res);
1882 
1883 	/* Limit to MAX_PKT_BURST. */
1884 	free_entries
1885 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1886 
1887 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1888 		dev->device_fh, free_entries);
1889 
1890 	/* Retrieve all of the head indexes first to avoid caching issues. */
1891 	for (i = 0; i < free_entries; i++)
1892 		head[i]
1893 			= vq->avail->ring[(vq->last_used_idx_res + i)
1894 			& (vq->size - 1)];
1895 
1896 	vq->last_used_idx_res += free_entries;
1897 
1898 	/* Prefetch descriptor index. */
1899 	rte_prefetch0(&vq->desc[head[packet_success]]);
1900 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1901 
1902 	while (packet_success < free_entries) {
1903 		desc = &vq->desc[head[packet_success]];
1904 
1905 		/* Discard first buffer as it is the virtio header */
1906 		desc = &vq->desc[desc->next];
1907 
1908 		/* Buffer address translation. */
1909 		buff_addr = gpa_to_vva(dev, desc->addr);
1910 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1911 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1912 			&addr_type);
1913 
1914 		if (likely(packet_success < (free_entries - 1)))
1915 			/* Prefetch descriptor index. */
1916 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1917 
1918 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1919 			RTE_LOG(ERR, VHOST_DATA,
1920 				"(%"PRIu64") Invalid frame buffer address found "
1921 				"when TX packets!\n",
1922 				dev->device_fh);
1923 			packet_success++;
1924 			continue;
1925 		}
1926 
1927 		/* Prefetch buffer address. */
1928 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1929 
1930 		/*
1931 		 * Setup dummy mbuf. This is copied to a real mbuf if
1932 		 * transmitted out the physical port.
1933 		 */
1934 		m.data_len = desc->len;
1935 		m.nb_segs = 1;
1936 		m.next = NULL;
1937 		m.data_off = 0;
1938 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1939 		m.buf_physaddr = phys_addr;
1940 
1941 		/*
1942 		 * Check if the frame buffer address from guest crosses
1943 		 * sub-region or not.
1944 		 */
1945 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1946 			RTE_LOG(ERR, VHOST_DATA,
1947 				"(%"PRIu64") Frame buffer address crossing a "
1948 				"sub-region found when attaching TX frame "
1949 				"buffer address!\n",
1950 				dev->device_fh);
1951 			need_copy = 1;
1952 		} else
1953 			need_copy = 0;
1954 
1955 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1956 
1957 		/*
1958 		 * If this is the first received packet we need to learn
1959 		 * the MAC and setup VMDQ
1960 		 */
1961 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1962 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1963 				/*
1964 				 * Discard frame if device is scheduled for
1965 				 * removal or a duplicate MAC address is found.
1966 				 */
1967 				packet_success += free_entries;
1968 				vq->last_used_idx += packet_success;
1969 				break;
1970 			}
1971 		}
1972 
1973 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1974 		packet_success++;
1975 	}
1976 }
1977 
1978 /*
1979  * This function is called by each data core. It handles all RX/TX registered
1980  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1981  * addresses are compared with all devices in the main linked list.
1982  */
1983 static int
1984 switch_worker_zcp(__attribute__((unused)) void *arg)
1985 {
1986 	struct virtio_net *dev = NULL;
1987 	struct vhost_dev  *vdev = NULL;
1988 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1989 	struct virtio_net_data_ll *dev_ll;
1990 	struct mbuf_table *tx_q;
1991 	volatile struct lcore_ll_info *lcore_ll;
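	/*
	 * Convert BURST_TX_DRAIN_US (~100us) into TSC cycles, rounding the
	 * cycles-per-microsecond factor up so the drain period is never short.
	 */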
1992 	const uint64_t drain_tsc
1993 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1994 		* BURST_TX_DRAIN_US;
1995 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1996 	unsigned ret;
1997 	const uint16_t lcore_id = rte_lcore_id();
1998 	uint16_t count_in_ring, rx_count = 0;
1999 
2000 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2001 
2002 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2003 	prev_tsc = 0;
2004 
2005 	while (1) {
2006 		cur_tsc = rte_rdtsc();
2007 
2008 		/* TX burst queue drain */
2009 		diff_tsc = cur_tsc - prev_tsc;
2010 		if (unlikely(diff_tsc > drain_tsc)) {
2011 			/*
2012 			 * Get mbufs from vpool.pool, detach them and
2013 			 * put them back into vpool.ring.
2014 			 */
2015 			dev_ll = lcore_ll->ll_root_used;
2016 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2017 				/* Get virtio device ID */
2018 				vdev = dev_ll->vdev;
2019 				dev = vdev->dev;
2020 
2021 				if (likely(!vdev->remove)) {
2022 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2023 					if (tx_q->len) {
2024 						LOG_DEBUG(VHOST_DATA,
2025 						"TX queue drained after timeout"
2026 						" with burst size %u\n",
2027 						tx_q->len);
2028 
2029 						/*
2030 						 * Tx any packets in the queue
2031 						 */
2032 						ret = rte_eth_tx_burst(
2033 							ports[0],
2034 							(uint16_t)tx_q->txq_id,
2035 							(struct rte_mbuf **)
2036 							tx_q->m_table,
2037 							(uint16_t)tx_q->len);
2038 						if (unlikely(ret < tx_q->len)) {
2039 							do {
2040 								rte_pktmbuf_free(
2041 									tx_q->m_table[ret]);
2042 							} while (++ret < tx_q->len);
2043 						}
2044 						tx_q->len = 0;
2045 
2046 						txmbuf_clean_zcp(dev,
2047 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2048 					}
2049 				}
2050 				dev_ll = dev_ll->next;
2051 			}
2052 			prev_tsc = cur_tsc;
2053 		}
2054 
2055 		rte_prefetch0(lcore_ll->ll_root_used);
2056 
2057 		/*
2058 		 * Inform the configuration core that we have exited the linked
2059 		 * list and that no devices are in use if requested.
2060 		 */
2061 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2062 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2063 
2064 		/* Process devices */
2065 		dev_ll = lcore_ll->ll_root_used;
2066 
2067 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2068 			vdev = dev_ll->vdev;
2069 			dev  = vdev->dev;
2070 			if (unlikely(vdev->remove)) {
2071 				dev_ll = dev_ll->next;
2072 				unlink_vmdq(vdev);
2073 				vdev->ready = DEVICE_SAFE_REMOVE;
2074 				continue;
2075 			}
2076 
2077 			if (likely(vdev->ready == DEVICE_RX)) {
2078 				uint32_t index = vdev->vmdq_rx_q;
2079 				uint16_t i;
2080 				count_in_ring
2081 				= rte_ring_count(vpool_array[index].ring);
2082 				uint16_t free_entries
2083 				= (uint16_t)get_available_ring_num_zcp(dev);
2084 
2085 				/*
2086 				 * Attach all mbufs in vpool.ring and put back
2087 				 * into vpool.pool.
2088 				 */
2089 				for (i = 0;
2090 				i < RTE_MIN(free_entries,
2091 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2092 				i++)
2093 					attach_rxmbuf_zcp(dev);
2094 
2095 				/* Handle guest RX */
2096 				rx_count = rte_eth_rx_burst(ports[0],
2097 					vdev->vmdq_rx_q, pkts_burst,
2098 					MAX_PKT_BURST);
2099 
2100 				if (rx_count) {
2101 					ret_count = virtio_dev_rx_zcp(dev,
2102 							pkts_burst, rx_count);
2103 					if (enable_stats) {
2104 						dev_statistics[dev->device_fh].rx_total
2105 							+= rx_count;
2106 						dev_statistics[dev->device_fh].rx
2107 							+= ret_count;
2108 					}
2109 					while (likely(rx_count)) {
2110 						rx_count--;
2111 						pktmbuf_detach_zcp(
2112 							pkts_burst[rx_count]);
2113 						rte_ring_sp_enqueue(
2114 							vpool_array[index].ring,
2115 							(void *)pkts_burst[rx_count]);
2116 					}
2117 				}
2118 			}
2119 
2120 			if (likely(!vdev->remove))
2121 				/* Handle guest TX */
2122 				virtio_dev_tx_zcp(dev);
2123 
2124 			/* Move to the next device in the list */
2125 			dev_ll = dev_ll->next;
2126 		}
2127 	}
2128 
2129 	return 0;
2130 }
2131 
2132 
2133 /*
2134  * Add an entry to a used linked list. A free entry must first be found
2135  * in the free linked list using get_data_ll_free_entry();
2136  */
2137 static void
2138 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2139 	struct virtio_net_data_ll *ll_dev)
2140 {
2141 	struct virtio_net_data_ll *ll = *ll_root_addr;
2142 
2143 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2144 	ll_dev->next = NULL;
2145 	rte_compiler_barrier();
2146 
2147 	/* If ll == NULL then this is the first device. */
2148 	if (ll) {
2149 		/* Increment to the tail of the linked list. */
2150 		while (ll->next != NULL)
2151 			ll = ll->next;
2152 
2153 		ll->next = ll_dev;
2154 	} else {
2155 		*ll_root_addr = ll_dev;
2156 	}
2157 }
2158 
2159 /*
2160  * Remove an entry from a used linked list. The entry must then be added to
2161  * the free linked list using put_data_ll_free_entry().
2162  */
2163 static void
2164 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2165 	struct virtio_net_data_ll *ll_dev,
2166 	struct virtio_net_data_ll *ll_dev_last)
2167 {
2168 	struct virtio_net_data_ll *ll = *ll_root_addr;
2169 
2170 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2171 		return;
2172 
2173 	if (ll_dev == ll)
2174 		*ll_root_addr = ll_dev->next;
2175 	else
2176 		if (likely(ll_dev_last != NULL))
2177 			ll_dev_last->next = ll_dev->next;
2178 		else
2179 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2180 }
2181 
2182 /*
2183  * Find and return an entry from the free linked list.
2184  */
2185 static struct virtio_net_data_ll *
2186 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2187 {
2188 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2189 	struct virtio_net_data_ll *ll_dev;
2190 
2191 	if (ll_free == NULL)
2192 		return NULL;
2193 
2194 	ll_dev = ll_free;
2195 	*ll_root_addr = ll_free->next;
2196 
2197 	return ll_dev;
2198 }
2199 
2200 /*
2201  * Place an entry back on to the free linked list.
2202  */
2203 static void
2204 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2205 	struct virtio_net_data_ll *ll_dev)
2206 {
2207 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2208 
2209 	if (ll_dev == NULL)
2210 		return;
2211 
2212 	ll_dev->next = ll_free;
2213 	*ll_root_addr = ll_dev;
2214 }
2215 
2216 /*
2217  * Creates a linked list of a given size.
2218  */
2219 static struct virtio_net_data_ll *
2220 alloc_data_ll(uint32_t size)
2221 {
2222 	struct virtio_net_data_ll *ll_new;
2223 	uint32_t i;
2224 
2225 	/* Malloc and then chain the linked list. */
2226 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2227 	if (ll_new == NULL) {
2228 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2229 		return NULL;
2230 	}
2231 
2232 	for (i = 0; i < size - 1; i++) {
2233 		ll_new[i].vdev = NULL;
2234 		ll_new[i].next = &ll_new[i+1];
2235 	}
2236 	ll_new[i].next = NULL;
2237 
2238 	return (ll_new);
2239 }
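/*
 * Typical lifecycle of these linked list helpers, as used by new_device()
 * and destroy_device() below (illustrative sketch only):
 *
 *	struct virtio_net_data_ll *entry;
 *
 *	entry = get_data_ll_free_entry(&ll_root_free);	 // take a free slot
 *	if (entry != NULL) {
 *		entry->vdev = vdev;			 // fill it in
 *		add_data_ll_entry(&ll_root_used, entry); // publish on used list
 *	}
 *	...
 *	rm_data_ll_entry(&ll_root_used, entry, prev);	 // unlink on removal
 *	put_data_ll_free_entry(&ll_root_free, entry);	 // recycle the slot
 */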
2240 
2241 /*
2242  * Create the main linked list along with each individual core's linked list. A used and a free list
2243  * are created to manage entries.
2244  */
2245 static int
2246 init_data_ll (void)
2247 {
2248 	int lcore;
2249 
2250 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2251 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2252 		if (lcore_info[lcore].lcore_ll == NULL) {
2253 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2254 			return -1;
2255 		}
2256 
2257 		lcore_info[lcore].lcore_ll->device_num = 0;
2258 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2259 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2260 		if (num_devices % num_switching_cores)
2261 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2262 		else
2263 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2264 	}
2265 
2266 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2267 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2268 
2269 	return 0;
2270 }
2271 
2272 /*
2273  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2274  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2275  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2276  */
2277 static void
2278 destroy_device (volatile struct virtio_net *dev)
2279 {
2280 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2281 	struct virtio_net_data_ll *ll_main_dev_cur;
2282 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2283 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2284 	struct vhost_dev *vdev;
2285 	int lcore;
2286 
2287 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2288 
2289 	vdev = (struct vhost_dev *)dev->priv;
2290 	/* Set the remove flag. */
2291 	vdev->remove = 1;
2292 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2293 		rte_pause();
2294 	}
2295 
2296 	/* Search for entry to be removed from lcore ll */
2297 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2298 	while (ll_lcore_dev_cur != NULL) {
2299 		if (ll_lcore_dev_cur->vdev == vdev) {
2300 			break;
2301 		} else {
2302 			ll_lcore_dev_last = ll_lcore_dev_cur;
2303 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2304 		}
2305 	}
2306 
2307 	if (ll_lcore_dev_cur == NULL) {
2308 		RTE_LOG(ERR, VHOST_CONFIG,
2309 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2310 			dev->device_fh);
2311 		return;
2312 	}
2313 
2314 	/* Search for entry to be removed from main ll */
2315 	ll_main_dev_cur = ll_root_used;
2316 	ll_main_dev_last = NULL;
2317 	while (ll_main_dev_cur != NULL) {
2318 		if (ll_main_dev_cur->vdev == vdev) {
2319 			break;
2320 		} else {
2321 			ll_main_dev_last = ll_main_dev_cur;
2322 			ll_main_dev_cur = ll_main_dev_cur->next;
2323 		}
2324 	}
2325 
2326 	/* Remove entries from the lcore and main ll. */
2327 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2328 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2329 
2330 	/* Set the dev_removal_flag on each lcore. */
2331 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2332 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2333 	}
2334 
2335 	/*
2336 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2337 	 * they can no longer access the device removed from the linked lists and that the devices
2338 	 * are no longer in use.
2339 	 */
2340 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2341 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2342 			rte_pause();
2343 		}
2344 	}
2345 
2346 	/* Add the entries back to the lcore and main free ll.*/
2347 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2348 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2349 
2350 	/* Decrement number of device on the lcore. */
2351 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2352 
2353 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2354 
2355 	if (zero_copy) {
2356 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2357 
2358 		/* Stop the RX queue. */
2359 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2360 			LOG_DEBUG(VHOST_CONFIG,
2361 				"(%"PRIu64") In destroy_device: Failed to stop "
2362 				"rx queue:%d\n",
2363 				dev->device_fh,
2364 				vdev->vmdq_rx_q);
2365 		}
2366 
2367 		LOG_DEBUG(VHOST_CONFIG,
2368 			"(%"PRIu64") in destroy_device: Start putting mbufs in "
2369 			"mempool back to ring for RX queue: %d\n",
2370 			dev->device_fh, vdev->vmdq_rx_q);
2371 
2372 		mbuf_destroy_zcp(vpool);
2373 
2374 		/* Stop the TX queue. */
2375 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2376 			LOG_DEBUG(VHOST_CONFIG,
2377 				"(%"PRIu64") In destroy_device: Failed to "
2378 				"stop tx queue:%d\n",
2379 				dev->device_fh, vdev->vmdq_rx_q);
2380 		}
2381 
2382 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2383 
2384 		LOG_DEBUG(VHOST_CONFIG,
2385 			"(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2386 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2387 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2388 			dev->device_fh);
2389 
2390 		mbuf_destroy_zcp(vpool);
2391 		rte_free(vdev->regions_hpa);
2392 	}
2393 	rte_free(vdev);
2394 
2395 }
2396 
2397 /*
2398  * Calculate the number of extra physically contiguous sub-regions (i.e. host
2399  * physical address breaks) within one particular region whose vhost virtual
2400  * address is contiguous. The region starts at vva_start, with size 'size'.
2401  */
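/*
 * Example (illustrative, assuming 4 KiB pages): a 16 KiB contiguous VA range
 * backed by host physical pages P, P+4K, X, X+4K has one discontinuity
 * (between the 2nd and 3rd page), so this function returns 1 and the caller
 * reserves one extra sub-region for it.
 */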
2402 static uint32_t
2403 check_hpa_regions(uint64_t vva_start, uint64_t size)
2404 {
2405 	uint32_t i, nregions = 0, page_size = getpagesize();
2406 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2407 	if (vva_start % page_size) {
2408 		LOG_DEBUG(VHOST_CONFIG,
2409 			"in check_continuous: vva start(%p) mod page_size(%d) "
2410 			"has remainder\n",
2411 			(void *)(uintptr_t)vva_start, page_size);
2412 		return 0;
2413 	}
2414 	if (size % page_size) {
2415 		LOG_DEBUG(VHOST_CONFIG,
2416 			"in check_continuous: "
2417 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2418 			size, page_size);
2419 		return 0;
2420 	}
2421 	for (i = 0; i < size - page_size; i = i + page_size) {
2422 		cur_phys_addr
2423 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2424 		next_phys_addr = rte_mem_virt2phy(
2425 			(void *)(uintptr_t)(vva_start + i + page_size));
2426 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2427 			++nregions;
2428 			LOG_DEBUG(VHOST_CONFIG,
2429 				"in check_continuous: hva addr:(%p) is not "
2430 				"continuous with hva addr:(%p), diff:%d\n",
2431 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2432 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2433 				+ page_size), page_size);
2434 			LOG_DEBUG(VHOST_CONFIG,
2435 				"in check_continuous: hpa addr:(%p) is not "
2436 				"continuous with hpa addr:(%p), "
2437 				"diff:(%"PRIu64")\n",
2438 				(void *)(uintptr_t)cur_phys_addr,
2439 				(void *)(uintptr_t)next_phys_addr,
2440 				(next_phys_addr-cur_phys_addr));
2441 		}
2442 	}
2443 	return nregions;
2444 }
2445 
2446 /*
2447  * Divide each region whose vhost virtual address is contiguous into several
2448  * sub-regions, making sure the host physical addresses within each sub-region
2449  * are contiguous, and fill the offset (to GPA), size and other information of
2450  * each sub-region into regions_hpa.
2451  */
2452 static uint32_t
2453 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2454 {
2455 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2456 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2457 
2458 	if (mem_region_hpa == NULL)
2459 		return 0;
2460 
2461 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2462 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2463 			virtio_memory->regions[regionidx].address_offset;
2464 		mem_region_hpa[regionidx_hpa].guest_phys_address
2465 			= virtio_memory->regions[regionidx].guest_phys_address;
2466 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2467 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2468 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2469 		LOG_DEBUG(VHOST_CONFIG,
2470 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2471 			regionidx_hpa,
2472 			(void *)(uintptr_t)
2473 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2474 		LOG_DEBUG(VHOST_CONFIG,
2475 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2476 			regionidx_hpa,
2477 			(void *)(uintptr_t)
2478 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2479 		for (i = 0, k = 0;
2480 			i < virtio_memory->regions[regionidx].memory_size -
2481 				page_size;
2482 			i += page_size) {
2483 			cur_phys_addr = rte_mem_virt2phy(
2484 					(void *)(uintptr_t)(vva_start + i));
2485 			next_phys_addr = rte_mem_virt2phy(
2486 					(void *)(uintptr_t)(vva_start +
2487 					i + page_size));
2488 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2489 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2490 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2491 					k + page_size;
2492 				mem_region_hpa[regionidx_hpa].memory_size
2493 					= k + page_size;
2494 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2495 					"phys addr end  [%d]:(%p)\n",
2496 					regionidx_hpa,
2497 					(void *)(uintptr_t)
2498 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2499 				LOG_DEBUG(VHOST_CONFIG,
2500 					"in fill_hpa_regions: guest phys addr "
2501 					"size [%d]:(%p)\n",
2502 					regionidx_hpa,
2503 					(void *)(uintptr_t)
2504 					(mem_region_hpa[regionidx_hpa].memory_size));
2505 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2506 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2507 				++regionidx_hpa;
2508 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2509 					next_phys_addr -
2510 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2511 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2512 					" phys addr start[%d]:(%p)\n",
2513 					regionidx_hpa,
2514 					(void *)(uintptr_t)
2515 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2516 				LOG_DEBUG(VHOST_CONFIG,
2517 					"in fill_hpa_regions: host  phys addr "
2518 					"start[%d]:(%p)\n",
2519 					regionidx_hpa,
2520 					(void *)(uintptr_t)
2521 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2522 				k = 0;
2523 			} else {
2524 				k += page_size;
2525 			}
2526 		}
2527 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2528 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2529 			+ k + page_size;
2530 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2531 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2532 			"[%d]:(%p)\n", regionidx_hpa,
2533 			(void *)(uintptr_t)
2534 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2535 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2536 			"[%d]:(%p)\n", regionidx_hpa,
2537 			(void *)(uintptr_t)
2538 			(mem_region_hpa[regionidx_hpa].memory_size));
2539 		++regionidx_hpa;
2540 	}
2541 	return regionidx_hpa;
2542 }
2543 
2544 /*
2545  * A new device is added to a data core. First the device is added to the main linked list
2546  * and then allocated to a specific data core.
2547  */
2548 static int
2549 new_device (struct virtio_net *dev)
2550 {
2551 	struct virtio_net_data_ll *ll_dev;
2552 	int lcore, core_add = 0;
2553 	uint32_t device_num_min = num_devices;
2554 	struct vhost_dev *vdev;
2555 	uint32_t regionidx;
2556 
2557 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2558 	if (vdev == NULL) {
2559 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2560 			dev->device_fh);
2561 		return -1;
2562 	}
2563 	vdev->dev = dev;
2564 	dev->priv = vdev;
2565 
2566 	if (zero_copy) {
2567 		vdev->nregions_hpa = dev->mem->nregions;
2568 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2569 			vdev->nregions_hpa
2570 				+= check_hpa_regions(
2571 					dev->mem->regions[regionidx].guest_phys_address
2572 					+ dev->mem->regions[regionidx].address_offset,
2573 					dev->mem->regions[regionidx].memory_size);
2574 
2575 		}
2576 
2577 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2578 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2579 			RTE_CACHE_LINE_SIZE);
2580 		if (vdev->regions_hpa == NULL) {
2581 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2582 			rte_free(vdev);
2583 			return -1;
2584 		}
2585 
2586 
2587 		if (fill_hpa_memory_regions(
2588 			vdev->regions_hpa, dev->mem
2589 			) != vdev->nregions_hpa) {
2590 
2591 			RTE_LOG(ERR, VHOST_CONFIG,
2592 				"hpa memory regions number mismatch: "
2593 				"[%d]\n", vdev->nregions_hpa);
2594 			rte_free(vdev->regions_hpa);
2595 			rte_free(vdev);
2596 			return -1;
2597 		}
2598 	}
2599 
2600 
2601 	/* Add device to main ll */
2602 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2603 	if (ll_dev == NULL) {
2604 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2605 			"of %d devices per core has been reached\n",
2606 			dev->device_fh, num_devices);
2607 		if (vdev->regions_hpa)
2608 			rte_free(vdev->regions_hpa);
2609 		rte_free(vdev);
2610 		return -1;
2611 	}
2612 	ll_dev->vdev = vdev;
2613 	add_data_ll_entry(&ll_root_used, ll_dev);
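	/*
	 * Each vhost device is given a dedicated VMDq RX queue, derived from
	 * its device_fh.
	 */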
2614 	vdev->vmdq_rx_q
2615 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2616 
2617 	if (zero_copy) {
2618 		uint32_t index = vdev->vmdq_rx_q;
2619 		uint32_t count_in_ring, i;
2620 		struct mbuf_table *tx_q;
2621 
2622 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2623 
2624 		LOG_DEBUG(VHOST_CONFIG,
2625 			"(%"PRIu64") in new_device: mbuf count in mempool "
2626 			"before attach is: %d\n",
2627 			dev->device_fh,
2628 			rte_mempool_count(vpool_array[index].pool));
2629 		LOG_DEBUG(VHOST_CONFIG,
2630 			"(%"PRIu64") in new_device: mbuf count in  ring "
2631 			"before attach  is : %d\n",
2632 			dev->device_fh, count_in_ring);
2633 
2634 		/*
2635 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2636 		 */
2637 		for (i = 0; i < count_in_ring; i++)
2638 			attach_rxmbuf_zcp(dev);
2639 
2640 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2641 			"mempool after attach is: %d\n",
2642 			dev->device_fh,
2643 			rte_mempool_count(vpool_array[index].pool));
2644 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2645 			"ring after attach  is : %d\n",
2646 			dev->device_fh,
2647 			rte_ring_count(vpool_array[index].ring));
2648 
2649 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2650 		tx_q->txq_id = vdev->vmdq_rx_q;
2651 
2652 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2653 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2654 
2655 			LOG_DEBUG(VHOST_CONFIG,
2656 				"(%"PRIu64") In new_device: Failed to start "
2657 				"tx queue:%d\n",
2658 				dev->device_fh, vdev->vmdq_rx_q);
2659 
2660 			mbuf_destroy_zcp(vpool);
2661 			rte_free(vdev->regions_hpa);
2662 			rte_free(vdev);
2663 			return -1;
2664 		}
2665 
2666 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2667 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2668 
2669 			LOG_DEBUG(VHOST_CONFIG,
2670 				"(%"PRIu64") In new_device: Failed to start "
2671 				"rx queue:%d\n",
2672 				dev->device_fh, vdev->vmdq_rx_q);
2673 
2674 			/* Stop the TX queue. */
2675 			if (rte_eth_dev_tx_queue_stop(ports[0],
2676 				vdev->vmdq_rx_q) != 0) {
2677 				LOG_DEBUG(VHOST_CONFIG,
2678 					"(%"PRIu64") In new_device: Failed to "
2679 					"stop tx queue:%d\n",
2680 					dev->device_fh, vdev->vmdq_rx_q);
2681 			}
2682 
2683 			mbuf_destroy_zcp(vpool);
2684 			rte_free(vdev->regions_hpa);
2685 			rte_free(vdev);
2686 			return -1;
2687 		}
2688 
2689 	}
2690 
2691 	/* Reset the ready flag. */
2692 	vdev->ready = DEVICE_MAC_LEARNING;
2693 	vdev->remove = 0;
2694 
2695 	/* Find a suitable lcore to add the device. */
2696 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2697 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2698 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2699 			core_add = lcore;
2700 		}
2701 	}
2702 	/* Add device to lcore ll */
2703 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2704 	if (ll_dev == NULL) {
2705 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2706 		vdev->ready = DEVICE_SAFE_REMOVE;
2707 		destroy_device(dev);
2708 		if (vdev->regions_hpa)
2709 			rte_free(vdev->regions_hpa);
2710 		rte_free(vdev);
2711 		return -1;
2712 	}
2713 	ll_dev->vdev = vdev;
2714 	vdev->coreid = core_add;
2715 
2716 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2717 
2718 	/* Initialize device stats */
2719 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2720 
2721 	/* Disable notifications. */
2722 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2723 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2724 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2725 	dev->flags |= VIRTIO_DEV_RUNNING;
2726 
2727 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2728 
2729 	return 0;
2730 }
2731 
2732 /*
2733  * These callbacks allow devices to be added to the data core when configuration
2734  * has been fully completed.
2735  */
2736 static const struct virtio_net_device_ops virtio_net_device_ops =
2737 {
2738 	.new_device =  new_device,
2739 	.destroy_device = destroy_device,
2740 };
2741 
2742 /*
2743  * This thread wakes up periodically to print stats if the user has
2744  * enabled them.
2745  */
2746 static void
2747 print_stats(void)
2748 {
2749 	struct virtio_net_data_ll *dev_ll;
2750 	uint64_t tx_dropped, rx_dropped;
2751 	uint64_t tx, tx_total, rx, rx_total;
2752 	uint32_t device_fh;
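	/* ANSI escape sequences: clear the screen and move the cursor to the
	 * top-left corner.
	 */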
2753 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2754 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2755 
2756 	while(1) {
2757 		sleep(enable_stats);
2758 
2759 		/* Clear screen and move to top left */
2760 		printf("%s%s", clr, top_left);
2761 
2762 		printf("\nDevice statistics ====================================");
2763 
2764 		dev_ll = ll_root_used;
2765 		while (dev_ll != NULL) {
2766 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2767 			tx_total = dev_statistics[device_fh].tx_total;
2768 			tx = dev_statistics[device_fh].tx;
2769 			tx_dropped = tx_total - tx;
2770 			if (zero_copy == 0) {
2771 				rx_total = rte_atomic64_read(
2772 					&dev_statistics[device_fh].rx_total_atomic);
2773 				rx = rte_atomic64_read(
2774 					&dev_statistics[device_fh].rx_atomic);
2775 			} else {
2776 				rx_total = dev_statistics[device_fh].rx_total;
2777 				rx = dev_statistics[device_fh].rx;
2778 			}
2779 			rx_dropped = rx_total - rx;
2780 
2781 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2782 					"\nTX total: 		%"PRIu64""
2783 					"\nTX dropped: 		%"PRIu64""
2784 					"\nTX successful: 		%"PRIu64""
2785 					"\nRX total: 		%"PRIu64""
2786 					"\nRX dropped: 		%"PRIu64""
2787 					"\nRX successful: 		%"PRIu64"",
2788 					device_fh,
2789 					tx_total,
2790 					tx_dropped,
2791 					tx,
2792 					rx_total,
2793 					rx_dropped,
2794 					rx);
2795 
2796 			dev_ll = dev_ll->next;
2797 		}
2798 		printf("\n======================================================\n");
2799 	}
2800 }
2801 
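/*
 * Create the mempool and companion ring used by one zero copy queue. The
 * mempool provides mbuf headers with room for a single descriptor-sized
 * frame; the ring holds the mbufs that are currently not attached to a
 * guest buffer.
 */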
2802 static void
2803 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2804 	char *ring_name, uint32_t nb_mbuf)
2805 {
2806 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2807 	vpool_array[index].pool
2808 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2809 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2810 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2811 		rte_pktmbuf_init, NULL, socket, 0);
2812 	if (vpool_array[index].pool != NULL) {
2813 		vpool_array[index].ring
2814 			= rte_ring_create(ring_name,
2815 				rte_align32pow2(nb_mbuf + 1),
2816 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2817 		if (likely(vpool_array[index].ring != NULL)) {
2818 			LOG_DEBUG(VHOST_CONFIG,
2819 				"in setup_mempool_tbl: mbuf count in "
2820 				"mempool is: %d\n",
2821 				rte_mempool_count(vpool_array[index].pool));
2822 			LOG_DEBUG(VHOST_CONFIG,
2823 				"in setup_mempool_tbl: mbuf count in "
2824 				"ring   is: %d\n",
2825 				rte_ring_count(vpool_array[index].ring));
2826 		} else {
2827 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2828 				ring_name);
2829 		}
2830 
2831 		/* Need to take the headroom into account. */
2832 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2833 	} else {
2834 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2835 	}
2836 }
2837 
2838 
2839 /*
2840  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2841  * device is also registered here to handle the IOCTLs.
2842  */
2843 int
2844 main(int argc, char *argv[])
2845 {
2846 	struct rte_mempool *mbuf_pool = NULL;
2847 	unsigned lcore_id, core_id = 0;
2848 	unsigned nb_ports, valid_num_ports;
2849 	int ret;
2850 	uint8_t portid;
2851 	uint16_t queue_id;
2852 	static pthread_t tid;
2853 
2854 	/* init EAL */
2855 	ret = rte_eal_init(argc, argv);
2856 	if (ret < 0)
2857 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2858 	argc -= ret;
2859 	argv += ret;
2860 
2861 	/* parse app arguments */
2862 	ret = us_vhost_parse_args(argc, argv);
2863 	if (ret < 0)
2864 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2865 
2866 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2867 		if (rte_lcore_is_enabled(lcore_id))
2868 			lcore_ids[core_id ++] = lcore_id;
2869 
2870 	if (rte_lcore_count() > RTE_MAX_LCORE)
2871 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2872 
2873 	/* Set the number of switching cores available. */
2874 	num_switching_cores = rte_lcore_count()-1;
2875 
2876 	/* Get the number of physical ports. */
2877 	nb_ports = rte_eth_dev_count();
2878 	if (nb_ports > RTE_MAX_ETHPORTS)
2879 		nb_ports = RTE_MAX_ETHPORTS;
2880 
2881 	/*
2882 	 * Update the global var num_ports and global array ports,
2883 	 * and get the value of valid_num_ports according to the number of ports in the system
2884 	 */
2885 	valid_num_ports = check_ports_num(nb_ports);
2886 
2887 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2888 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2889 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2890 		return -1;
2891 	}
2892 
2893 	if (zero_copy == 0) {
2894 		/* Create the mbuf pool. */
2895 		mbuf_pool = rte_mempool_create(
2896 				"MBUF_POOL",
2897 				NUM_MBUFS_PER_PORT
2898 				* valid_num_ports,
2899 				MBUF_SIZE, MBUF_CACHE_SIZE,
2900 				sizeof(struct rte_pktmbuf_pool_private),
2901 				rte_pktmbuf_pool_init, NULL,
2902 				rte_pktmbuf_init, NULL,
2903 				rte_socket_id(), 0);
2904 		if (mbuf_pool == NULL)
2905 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2906 
2907 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2908 			vpool_array[queue_id].pool = mbuf_pool;
2909 
2910 		if (vm2vm_mode == VM2VM_HARDWARE) {
2911 			/* Enable VT loop back to let L2 switch to do it. */
2912 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2913 			LOG_DEBUG(VHOST_CONFIG,
2914 				"Enable loop back for L2 switch in vmdq.\n");
2915 		}
2916 	} else {
2917 		uint32_t nb_mbuf;
2918 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2919 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2920 
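		/*
		 * Size each per-queue zero copy pool to cover the descriptor
		 * ring plus per-core cache and burst headroom.
		 */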
2921 		nb_mbuf = num_rx_descriptor
2922 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2923 			+ num_switching_cores * MAX_PKT_BURST;
2924 
2925 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2926 			snprintf(pool_name, sizeof(pool_name),
2927 				"rxmbuf_pool_%u", queue_id);
2928 			snprintf(ring_name, sizeof(ring_name),
2929 				"rxmbuf_ring_%u", queue_id);
2930 			setup_mempool_tbl(rte_socket_id(), queue_id,
2931 				pool_name, ring_name, nb_mbuf);
2932 		}
2933 
2934 		nb_mbuf = num_tx_descriptor
2935 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2936 				+ num_switching_cores * MAX_PKT_BURST;
2937 
2938 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2939 			snprintf(pool_name, sizeof(pool_name),
2940 				"txmbuf_pool_%u", queue_id);
2941 			snprintf(ring_name, sizeof(ring_name),
2942 				"txmbuf_ring_%u", queue_id);
2943 			setup_mempool_tbl(rte_socket_id(),
2944 				(queue_id + MAX_QUEUES),
2945 				pool_name, ring_name, nb_mbuf);
2946 		}
2947 
2948 		if (vm2vm_mode == VM2VM_HARDWARE) {
2949 			/* Enable VT loop back to let L2 switch to do it. */
2950 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2951 			LOG_DEBUG(VHOST_CONFIG,
2952 				"Enable loop back for L2 switch in vmdq.\n");
2953 		}
2954 	}
2955 	/* Set log level. */
2956 	rte_set_log_level(LOG_LEVEL);
2957 
2958 	/* initialize all ports */
2959 	for (portid = 0; portid < nb_ports; portid++) {
2960 		/* skip ports that are not enabled */
2961 		if ((enabled_port_mask & (1 << portid)) == 0) {
2962 			RTE_LOG(INFO, VHOST_PORT,
2963 				"Skipping disabled port %d\n", portid);
2964 			continue;
2965 		}
2966 		if (port_init(portid) != 0)
2967 			rte_exit(EXIT_FAILURE,
2968 				"Cannot initialize network ports\n");
2969 	}
2970 
2971 	/* Initialise all linked lists. */
2972 	if (init_data_ll() == -1)
2973 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2974 
2975 	/* Initialize device stats */
2976 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2977 
2978 	/* Enable stats if the user option is set. */
2979 	if (enable_stats)
2980 		pthread_create(&tid, NULL, (void *)print_stats, NULL);
2981 
2982 	/* Launch all data cores. */
2983 	if (zero_copy == 0) {
2984 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2985 			rte_eal_remote_launch(switch_worker,
2986 				mbuf_pool, lcore_id);
2987 		}
2988 	} else {
2989 		uint32_t count_in_mempool, index, i;
2990 		for (index = 0; index < 2*MAX_QUEUES; index++) {
2991 			/* For all RX and TX queues. */
2992 			count_in_mempool
2993 				= rte_mempool_count(vpool_array[index].pool);
2994 
2995 			/*
2996 			 * Transfer all un-attached mbufs from vpool.pool
2997 			 * to vpool.ring.
2998 			 */
2999 			for (i = 0; i < count_in_mempool; i++) {
3000 				struct rte_mbuf *mbuf
3001 					= __rte_mbuf_raw_alloc(
3002 						vpool_array[index].pool);
3003 				rte_ring_sp_enqueue(vpool_array[index].ring,
3004 						(void *)mbuf);
3005 			}
3006 
3007 			LOG_DEBUG(VHOST_CONFIG,
3008 				"in main: mbuf count in mempool at initial "
3009 				"is: %d\n", count_in_mempool);
3010 			LOG_DEBUG(VHOST_CONFIG,
3011 				"in main: mbuf count in  ring at initial  is :"
3012 				" %d\n",
3013 				rte_ring_count(vpool_array[index].ring));
3014 		}
3015 
3016 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3017 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3018 				lcore_id);
3019 	}
3020 
3021 	if (mergeable == 0)
3022 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3023 
3024 	/* Register CUSE device to handle IOCTLs. */
3025 	ret = rte_vhost_driver_register((char *)&dev_basename);
3026 	if (ret != 0)
3027 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3028 
3029 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3030 
3031 	/* Start CUSE session. */
3032 	rte_vhost_driver_session_start();
3033 	return 0;
3034 
3035 }
3036 
3037