xref: /dpdk/examples/vhost/main.c (revision e44fb8a4306dd4b62f46323d751f67e91ee5a31a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffer allocated from host are required for zero copy
74  * implementation, guest will allocate the frame data buffer, and vhost
75  * directly use it.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
103 
104 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
106 
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108 
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX			1
112 #define DEVICE_SAFE_REMOVE	2
113 
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117 
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121 
122 /*
123  * Need refine these 2 macros for legacy and DPDK based front end:
124  * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
125  * And then adjust power 2.
126  */
127 /*
128  * For legacy front end, 128 descriptors,
129  * half for virtio header, another half for mbuf.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133 
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 		+ sizeof(struct rte_mbuf)))
137 
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140 
141 #define INVALID_PORT_ID 0xFF
142 
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145 
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148 
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151 
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154 
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157 
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
160 
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163 
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166 
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170 
171 /*
172  * Enable zero copy, pkts buffer will directly dma to hw descriptor,
173  * disabled on default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177 
178 /* number of descriptors to apply*/
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in useconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223 	.rx_thresh = {
224 		.pthresh = RX_PTHRESH,
225 		.hthresh = RX_HTHRESH,
226 		.wthresh = RX_WTHRESH,
227 	},
228 	.rx_drop_en = 1,
229 };
230 
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237 	.tx_thresh = {
238 		.pthresh = TX_PTHRESH,
239 		.hthresh = TX_HTHRESH,
240 		.wthresh = TX_WTHRESH,
241 	},
242 	.tx_free_thresh = 0, /* Use PMD default values */
243 	.tx_rs_thresh = 0, /* Use PMD default values */
244 };
245 
246 /* empty vmdq configuration structure. Filled in programatically */
247 static struct rte_eth_conf vmdq_conf_default = {
248 	.rxmode = {
249 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250 		.split_hdr_size = 0,
251 		.header_split   = 0, /**< Header Split disabled */
252 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
253 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
254 		/*
255 		 * It is necessary for 1G NIC such as I350,
256 		 * this fixes bug of ipv4 forwarding in guest can't
257 		 * forward pakets from one virtio dev to another virtio dev.
258 		 */
259 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
262 	},
263 
264 	.txmode = {
265 		.mq_mode = ETH_MQ_TX_NONE,
266 	},
267 	.rx_adv_conf = {
268 		/*
269 		 * should be overridden separately in code with
270 		 * appropriate values
271 		 */
272 		.vmdq_rx_conf = {
273 			.nb_queue_pools = ETH_8_POOLS,
274 			.enable_default_pool = 0,
275 			.default_pool = 0,
276 			.nb_pool_maps = 0,
277 			.pool_map = {{0, 0},},
278 		},
279 	},
280 };
281 
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
285 
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
290 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
297 
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300 
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304 
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307 
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310 	unsigned len;
311 	unsigned txq_id;
312 	struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314 
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317 
318 /* TX queue fori each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320 
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323 	unsigned char   h_dest[ETH_ALEN];
324 	unsigned char   h_source[ETH_ALEN];
325 	__be16          h_vlan_proto;
326 	__be16          h_vlan_TCI;
327 	__be16          h_vlan_encapsulated_proto;
328 };
329 
330 /* IPv4 Header */
331 struct ipv4_hdr {
332 	uint8_t  version_ihl;		/**< version and header length */
333 	uint8_t  type_of_service;	/**< type of service */
334 	uint16_t total_length;		/**< length of packet */
335 	uint16_t packet_id;		/**< packet ID */
336 	uint16_t fragment_offset;	/**< fragmentation offset */
337 	uint8_t  time_to_live;		/**< time to live */
338 	uint8_t  next_proto_id;		/**< protocol ID */
339 	uint16_t hdr_checksum;		/**< header checksum */
340 	uint32_t src_addr;		/**< source address */
341 	uint32_t dst_addr;		/**< destination address */
342 } __attribute__((__packed__));
343 
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347 
348 /* Per-device statistics struct */
349 struct device_statistics {
350 	uint64_t tx_total;
351 	rte_atomic64_t rx_total_atomic;
352 	uint64_t rx_total;
353 	uint64_t tx;
354 	rte_atomic64_t rx_atomic;
355 	uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358 
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366 	struct rte_eth_vmdq_rx_conf conf;
367 	unsigned i;
368 
369 	memset(&conf, 0, sizeof(conf));
370 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371 	conf.nb_pool_maps = num_devices;
372 	conf.enable_loop_back =
373 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374 
375 	for (i = 0; i < conf.nb_pool_maps; i++) {
376 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
377 		conf.pool_map[i].pools = (1UL << i);
378 	}
379 
380 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383 	return 0;
384 }
385 
386 /*
387  * Validate the device number according to the max pool number gotten form
388  * dev_info. If the device number is invalid, give the error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394 	if (num_devices > max_nb_devices) {
395 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396 		return -1;
397 	}
398 	return 0;
399 }
400 
401 /*
402  * Initialises a given port using global settings and with the rx buffers
403  * coming from the mbuf_pool passed as parameter
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408 	struct rte_eth_dev_info dev_info;
409 	struct rte_eth_conf port_conf;
410 	uint16_t rx_rings, tx_rings;
411 	uint16_t rx_ring_size, tx_ring_size;
412 	int retval;
413 	uint16_t q;
414 
415 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
416 	rte_eth_dev_info_get (port, &dev_info);
417 
418 	/*configure the number of supported virtio devices based on VMDQ limits */
419 	num_devices = dev_info.max_vmdq_pools;
420 	num_queues = dev_info.max_rx_queues;
421 
422 	if (zero_copy) {
423 		rx_ring_size = num_rx_descriptor;
424 		tx_ring_size = num_tx_descriptor;
425 		tx_rings = dev_info.max_tx_queues;
426 	} else {
427 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429 		tx_rings = (uint16_t)rte_lcore_count();
430 	}
431 
432 	retval = validate_num_devices(MAX_DEVICES);
433 	if (retval < 0)
434 		return retval;
435 
436 	/* Get port configuration. */
437 	retval = get_eth_conf(&port_conf, num_devices);
438 	if (retval < 0)
439 		return retval;
440 
441 	if (port >= rte_eth_dev_count()) return -1;
442 
443 	rx_rings = (uint16_t)num_queues,
444 	/* Configure ethernet device. */
445 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446 	if (retval != 0)
447 		return retval;
448 
449 	/* Setup the queues. */
450 	for (q = 0; q < rx_rings; q ++) {
451 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 						rte_eth_dev_socket_id(port), &rx_conf_default,
453 						vpool_array[q].pool);
454 		if (retval < 0)
455 			return retval;
456 	}
457 	for (q = 0; q < tx_rings; q ++) {
458 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459 						rte_eth_dev_socket_id(port), &tx_conf_default);
460 		if (retval < 0)
461 			return retval;
462 	}
463 
464 	/* Start the device. */
465 	retval  = rte_eth_dev_start(port);
466 	if (retval < 0) {
467 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468 		return retval;
469 	}
470 
471 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475 			(unsigned)port,
476 			vmdq_ports_eth_addr[port].addr_bytes[0],
477 			vmdq_ports_eth_addr[port].addr_bytes[1],
478 			vmdq_ports_eth_addr[port].addr_bytes[2],
479 			vmdq_ports_eth_addr[port].addr_bytes[3],
480 			vmdq_ports_eth_addr[port].addr_bytes[4],
481 			vmdq_ports_eth_addr[port].addr_bytes[5]);
482 
483 	return 0;
484 }
485 
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492 	/* parse number string */
493 
494 	if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
495 		return -1;
496 	else
497 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498 
499 	return 0;
500 }
501 
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508 	char *end = NULL;
509 	unsigned long pm;
510 
511 	errno = 0;
512 
513 	/* parse hexadecimal string */
514 	pm = strtoul(portmask, &end, 16);
515 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516 		return -1;
517 
518 	if (pm == 0)
519 		return -1;
520 
521 	return pm;
522 
523 }
524 
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531 	char *end = NULL;
532 	unsigned long num;
533 
534 	errno = 0;
535 
536 	/* parse unsigned int string */
537 	num = strtoul(q_arg, &end, 10);
538 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539 		return -1;
540 
541 	if (num > max_valid_value)
542 		return -1;
543 
544 	return num;
545 
546 }
547 
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555 	"		--vm2vm [0|1|2]\n"
556 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557 	"		--dev-basename <name>\n"
558 	"		--nb-devices ND\n"
559 	"		-p PORTMASK: Set mask for ports to be used by application\n"
560 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
562 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
563 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
564 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566 	"		--dev-basename: The basename to be used for the character device.\n"
567 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
568 			"zero copy\n"
569 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
570 			"used only when zero copy is enabled.\n"
571 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
572 			"used only when zero copy is enabled.\n",
573 	       prgname);
574 }
575 
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582 	int opt, ret;
583 	int option_index;
584 	unsigned i;
585 	const char *prgname = argv[0];
586 	static struct option long_option[] = {
587 		{"vm2vm", required_argument, NULL, 0},
588 		{"rx-retry", required_argument, NULL, 0},
589 		{"rx-retry-delay", required_argument, NULL, 0},
590 		{"rx-retry-num", required_argument, NULL, 0},
591 		{"mergeable", required_argument, NULL, 0},
592 		{"stats", required_argument, NULL, 0},
593 		{"dev-basename", required_argument, NULL, 0},
594 		{"zero-copy", required_argument, NULL, 0},
595 		{"rx-desc-num", required_argument, NULL, 0},
596 		{"tx-desc-num", required_argument, NULL, 0},
597 		{NULL, 0, 0, 0},
598 	};
599 
600 	/* Parse command line */
601 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
602 		switch (opt) {
603 		/* Portmask */
604 		case 'p':
605 			enabled_port_mask = parse_portmask(optarg);
606 			if (enabled_port_mask == 0) {
607 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 				us_vhost_usage(prgname);
609 				return -1;
610 			}
611 			break;
612 
613 		case 0:
614 			/* Enable/disable vm2vm comms. */
615 			if (!strncmp(long_option[option_index].name, "vm2vm",
616 				MAX_LONG_OPT_SZ)) {
617 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618 				if (ret == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG,
620 						"Invalid argument for "
621 						"vm2vm [0|1|2]\n");
622 					us_vhost_usage(prgname);
623 					return -1;
624 				} else {
625 					vm2vm_mode = (vm2vm_type)ret;
626 				}
627 			}
628 
629 			/* Enable/disable retries on RX. */
630 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631 				ret = parse_num_opt(optarg, 1);
632 				if (ret == -1) {
633 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					enable_retry = ret;
638 				}
639 			}
640 
641 			/* Specify the retries delay time (in useconds) on RX. */
642 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643 				ret = parse_num_opt(optarg, INT32_MAX);
644 				if (ret == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646 					us_vhost_usage(prgname);
647 					return -1;
648 				} else {
649 					burst_rx_delay_time = ret;
650 				}
651 			}
652 
653 			/* Specify the retries number on RX. */
654 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655 				ret = parse_num_opt(optarg, INT32_MAX);
656 				if (ret == -1) {
657 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658 					us_vhost_usage(prgname);
659 					return -1;
660 				} else {
661 					burst_rx_retry_num = ret;
662 				}
663 			}
664 
665 			/* Enable/disable RX mergeable buffers. */
666 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667 				ret = parse_num_opt(optarg, 1);
668 				if (ret == -1) {
669 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670 					us_vhost_usage(prgname);
671 					return -1;
672 				} else {
673 					mergeable = !!ret;
674 					if (ret) {
675 						vmdq_conf_default.rxmode.jumbo_frame = 1;
676 						vmdq_conf_default.rxmode.max_rx_pkt_len
677 							= JUMBO_FRAME_MAX_SIZE;
678 					}
679 				}
680 			}
681 
682 			/* Enable/disable stats. */
683 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684 				ret = parse_num_opt(optarg, INT32_MAX);
685 				if (ret == -1) {
686 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687 					us_vhost_usage(prgname);
688 					return -1;
689 				} else {
690 					enable_stats = ret;
691 				}
692 			}
693 
694 			/* Set character device basename. */
695 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696 				if (us_vhost_parse_basename(optarg) == -1) {
697 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698 					us_vhost_usage(prgname);
699 					return -1;
700 				}
701 			}
702 
703 			/* Enable/disable rx/tx zero copy. */
704 			if (!strncmp(long_option[option_index].name,
705 				"zero-copy", MAX_LONG_OPT_SZ)) {
706 				ret = parse_num_opt(optarg, 1);
707 				if (ret == -1) {
708 					RTE_LOG(INFO, VHOST_CONFIG,
709 						"Invalid argument"
710 						" for zero-copy [0|1]\n");
711 					us_vhost_usage(prgname);
712 					return -1;
713 				} else
714 					zero_copy = ret;
715 
716 				if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
719 					"zero copy vhost APP, please "
720 					"disable RTE_MBUF_REFCNT\n"
721 					"in config file and then rebuild DPDK "
722 					"core lib!\n"
723 					"Otherwise please disable zero copy "
724 					"flag in command line!\n");
725 					return -1;
726 #endif
727 				}
728 			}
729 
730 			/* Specify the descriptor number on RX. */
731 			if (!strncmp(long_option[option_index].name,
732 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
733 				ret = parse_num_opt(optarg, MAX_RING_DESC);
734 				if ((ret == -1) || (!POWEROF2(ret))) {
735 					RTE_LOG(INFO, VHOST_CONFIG,
736 					"Invalid argument for rx-desc-num[0-N],"
737 					"power of 2 required.\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					num_rx_descriptor = ret;
742 				}
743 			}
744 
745 			/* Specify the descriptor number on TX. */
746 			if (!strncmp(long_option[option_index].name,
747 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
748 				ret = parse_num_opt(optarg, MAX_RING_DESC);
749 				if ((ret == -1) || (!POWEROF2(ret))) {
750 					RTE_LOG(INFO, VHOST_CONFIG,
751 					"Invalid argument for tx-desc-num [0-N],"
752 					"power of 2 required.\n");
753 					us_vhost_usage(prgname);
754 					return -1;
755 				} else {
756 					num_tx_descriptor = ret;
757 				}
758 			}
759 
760 			break;
761 
762 			/* Invalid option - print options. */
763 		default:
764 			us_vhost_usage(prgname);
765 			return -1;
766 		}
767 	}
768 
769 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770 		if (enabled_port_mask & (1 << i))
771 			ports[num_ports++] = (uint8_t)i;
772 	}
773 
774 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
776 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
777 		return -1;
778 	}
779 
780 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781 		RTE_LOG(INFO, VHOST_PORT,
782 			"Vhost zero copy doesn't support software vm2vm,"
783 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
784 		return -1;
785 	}
786 
787 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788 		RTE_LOG(INFO, VHOST_PORT,
789 			"Vhost zero copy doesn't support jumbo frame,"
790 			"please specify '--mergeable 0' to disable the "
791 			"mergeable feature.\n");
792 		return -1;
793 	}
794 
795 	return 0;
796 }
797 
798 /*
799  * Update the global var NUM_PORTS and array PORTS according to system ports number
800  * and return valid ports number
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804 	unsigned valid_num_ports = num_ports;
805 	unsigned portid;
806 
807 	if (num_ports > nb_ports) {
808 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809 			num_ports, nb_ports);
810 		num_ports = nb_ports;
811 	}
812 
813 	for (portid = 0; portid < num_ports; portid ++) {
814 		if (ports[portid] >= nb_ports) {
815 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816 				ports[portid], (nb_ports - 1));
817 			ports[portid] = INVALID_PORT_ID;
818 			valid_num_ports--;
819 		}
820 	}
821 	return valid_num_ports;
822 }
823 
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not effected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {																\
830 	char *pkt_addr = (char*)(addr);																					\
831 	unsigned int index;																								\
832 	char packet[MAX_PRINT_BUFF];																					\
833 																													\
834 	if ((header))																									\
835 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
836 	else																											\
837 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
838 	for (index = 0; index < (size); index++) {																		\
839 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
840 			"%02hhx ", pkt_addr[index]);																			\
841 	}																												\
842 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
843 																													\
844 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
845 } while(0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
848 #endif
849 
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856 	uint32_t buf_len, hpa_type *addr_type)
857 {
858 	struct virtio_memory_regions_hpa *region;
859 	uint32_t regionidx;
860 	uint64_t vhost_pa = 0;
861 
862 	*addr_type = PHYS_ADDR_INVALID;
863 
864 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865 		region = &vdev->regions_hpa[regionidx];
866 		if ((guest_pa >= region->guest_phys_address) &&
867 			(guest_pa <= region->guest_phys_address_end)) {
868 			vhost_pa = region->host_phys_addr_offset + guest_pa;
869 			if (likely((guest_pa + buf_len - 1)
870 				<= region->guest_phys_address_end))
871 				*addr_type = PHYS_ADDR_CONTINUOUS;
872 			else
873 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
874 			break;
875 		}
876 	}
877 
878 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880 		(void *)(uintptr_t)vhost_pa);
881 
882 	return vhost_pa;
883 }
884 
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
891 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
893 
894 /*
895  * This function learns the MAC address of the device and registers this along with a
896  * vlan tag to a VMDQ.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901 	struct ether_hdr *pkt_hdr;
902 	struct virtio_net_data_ll *dev_ll;
903 	struct virtio_net *dev = vdev->dev;
904 	int i, ret;
905 
906 	/* Learn MAC address of guest device from packet */
907 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908 
909 	dev_ll = ll_root_used;
910 
911 	while (dev_ll != NULL) {
912 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914 			return -1;
915 		}
916 		dev_ll = dev_ll->next;
917 	}
918 
919 	for (i = 0; i < ETHER_ADDR_LEN; i++)
920 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921 
922 	/* vlan_tag currently uses the device_id. */
923 	vdev->vlan_tag = vlan_tags[dev->device_fh];
924 
925 	/* Print out VMDQ registration info. */
926 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927 		dev->device_fh,
928 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931 		vdev->vlan_tag);
932 
933 	/* Register the MAC address. */
934 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935 	if (ret)
936 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937 					dev->device_fh);
938 
939 	/* Enable stripping of the vlan tag as we handle routing. */
940 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941 
942 	/* Set device as ready for RX. */
943 	vdev->ready = DEVICE_RX;
944 
945 	return 0;
946 }
947 
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955 	unsigned i = 0;
956 	unsigned rx_count;
957 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958 
959 	if (vdev->ready == DEVICE_RX) {
960 		/*clear MAC and VLAN settings*/
961 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962 		for (i = 0; i < 6; i++)
963 			vdev->mac_address.addr_bytes[i] = 0;
964 
965 		vdev->vlan_tag = 0;
966 
967 		/*Clear out the receive buffers*/
968 		rx_count = rte_eth_rx_burst(ports[0],
969 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970 
971 		while (rx_count) {
972 			for (i = 0; i < rx_count; i++)
973 				rte_pktmbuf_free(pkts_burst[i]);
974 
975 			rx_count = rte_eth_rx_burst(ports[0],
976 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977 		}
978 
979 		vdev->ready = DEVICE_MAC_LEARNING;
980 	}
981 }
982 
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that devices RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990 	struct virtio_net_data_ll *dev_ll;
991 	struct ether_hdr *pkt_hdr;
992 	uint64_t ret = 0;
993 	struct virtio_net *dev = vdev->dev;
994 	struct virtio_net *tdev; /* destination virito device */
995 
996 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997 
998 	/*get the used devices list*/
999 	dev_ll = ll_root_used;
1000 
1001 	while (dev_ll != NULL) {
1002 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003 				          &dev_ll->vdev->mac_address)) {
1004 
1005 			/* Drop the packet if the TX packet is destined for the TX device. */
1006 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008 							dev->device_fh);
1009 				return 0;
1010 			}
1011 			tdev = dev_ll->vdev->dev;
1012 
1013 
1014 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015 
1016 			if (unlikely(dev_ll->vdev->remove)) {
1017 				/*drop the packet if the device is marked for removal*/
1018 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019 			} else {
1020 				/*send the packet to the local virtio device*/
1021 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022 				if (enable_stats) {
1023 					rte_atomic64_add(
1024 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1025 					1);
1026 					rte_atomic64_add(
1027 					&dev_statistics[tdev->device_fh].rx_atomic,
1028 					ret);
1029 					dev_statistics[tdev->device_fh].tx_total++;
1030 					dev_statistics[tdev->device_fh].tx += ret;
1031 				}
1032 			}
1033 
1034 			return 0;
1035 		}
1036 		dev_ll = dev_ll->next;
1037 	}
1038 
1039 	return -1;
1040 }
1041 
1042 /*
1043  * This function routes the TX packet to the correct interface. This may be a local device
1044  * or the physical port.
1045  */
1046 static inline void __attribute__((always_inline))
1047 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1048 {
1049 	struct mbuf_table *tx_q;
1050 	struct rte_mbuf **m_table;
1051 	unsigned len, ret, offset = 0;
1052 	const uint16_t lcore_id = rte_lcore_id();
1053 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1054 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1055 	struct virtio_net *dev = vdev->dev;
1056 
1057 	/*check if destination is local VM*/
1058 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1059 		rte_pktmbuf_free(m);
1060 		return;
1061 	}
1062 
1063 	if (vm2vm_mode == VM2VM_HARDWARE) {
1064 		while (dev_ll != NULL) {
1065 			if ((dev_ll->vdev->ready == DEVICE_RX)
1066 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1067 				&dev_ll->vdev->mac_address)) {
1068 				/*
1069 				 * Drop the packet if the TX packet is
1070 				 * destined for the TX device.
1071 				 */
1072 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1073 					LOG_DEBUG(VHOST_DATA,
1074 					"(%"PRIu64") TX: Source and destination"
1075 					" MAC addresses are the same. Dropping "
1076 					"packet.\n",
1077 					dev_ll->vdev->dev->device_fh);
1078 					rte_pktmbuf_free(m);
1079 					return;
1080 				}
1081 
1082 				/*
1083 				 * HW vlan strip will reduce the packet length
1084 				 * by minus length of vlan tag, so need restore
1085 				 * the packet length by plus it.
1086 				 */
1087 				offset = VLAN_HLEN;
1088 				vlan_tag =
1089 				(uint16_t)
1090 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1091 
1092 				LOG_DEBUG(VHOST_DATA,
1093 				"(%"PRIu64") TX: pkt to local VM device id:"
1094 				"(%"PRIu64") vlan tag: %d.\n",
1095 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1096 				vlan_tag);
1097 
1098 				break;
1099 			}
1100 			dev_ll = dev_ll->next;
1101 		}
1102 	}
1103 
1104 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1105 
1106 	/*Add packet to the port tx queue*/
1107 	tx_q = &lcore_tx_queue[lcore_id];
1108 	len = tx_q->len;
1109 
1110 	m->ol_flags = PKT_TX_VLAN_PKT;
1111 
1112 	m->data_len += offset;
1113 	m->pkt_len += offset;
1114 
1115 	m->vlan_tci = vlan_tag;
1116 
1117 	tx_q->m_table[len] = m;
1118 	len++;
1119 	if (enable_stats) {
1120 		dev_statistics[dev->device_fh].tx_total++;
1121 		dev_statistics[dev->device_fh].tx++;
1122 	}
1123 
1124 	if (unlikely(len == MAX_PKT_BURST)) {
1125 		m_table = (struct rte_mbuf **)tx_q->m_table;
1126 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1127 		/* Free any buffers not handled by TX and update the port stats. */
1128 		if (unlikely(ret < len)) {
1129 			do {
1130 				rte_pktmbuf_free(m_table[ret]);
1131 			} while (++ret < len);
1132 		}
1133 
1134 		len = 0;
1135 	}
1136 
1137 	tx_q->len = len;
1138 	return;
1139 }
1140 /*
1141  * This function is called by each data core. It handles all RX/TX registered with the
1142  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1143  * with all devices in the main linked list.
1144  */
1145 static int
1146 switch_worker(__attribute__((unused)) void *arg)
1147 {
1148 	struct rte_mempool *mbuf_pool = arg;
1149 	struct virtio_net *dev = NULL;
1150 	struct vhost_dev *vdev = NULL;
1151 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1152 	struct virtio_net_data_ll *dev_ll;
1153 	struct mbuf_table *tx_q;
1154 	volatile struct lcore_ll_info *lcore_ll;
1155 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1156 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1157 	unsigned ret, i;
1158 	const uint16_t lcore_id = rte_lcore_id();
1159 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1160 	uint16_t rx_count = 0;
1161 	uint16_t tx_count;
1162 	uint32_t retry = 0;
1163 
1164 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1165 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1166 	prev_tsc = 0;
1167 
1168 	tx_q = &lcore_tx_queue[lcore_id];
1169 	for (i = 0; i < num_cores; i ++) {
1170 		if (lcore_ids[i] == lcore_id) {
1171 			tx_q->txq_id = i;
1172 			break;
1173 		}
1174 	}
1175 
1176 	while(1) {
1177 		cur_tsc = rte_rdtsc();
1178 		/*
1179 		 * TX burst queue drain
1180 		 */
1181 		diff_tsc = cur_tsc - prev_tsc;
1182 		if (unlikely(diff_tsc > drain_tsc)) {
1183 
1184 			if (tx_q->len) {
1185 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1186 
1187 				/*Tx any packets in the queue*/
1188 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1189 									   (struct rte_mbuf **)tx_q->m_table,
1190 									   (uint16_t)tx_q->len);
1191 				if (unlikely(ret < tx_q->len)) {
1192 					do {
1193 						rte_pktmbuf_free(tx_q->m_table[ret]);
1194 					} while (++ret < tx_q->len);
1195 				}
1196 
1197 				tx_q->len = 0;
1198 			}
1199 
1200 			prev_tsc = cur_tsc;
1201 
1202 		}
1203 
1204 		rte_prefetch0(lcore_ll->ll_root_used);
1205 		/*
1206 		 * Inform the configuration core that we have exited the linked list and that no devices are
1207 		 * in use if requested.
1208 		 */
1209 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1210 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1211 
1212 		/*
1213 		 * Process devices
1214 		 */
1215 		dev_ll = lcore_ll->ll_root_used;
1216 
1217 		while (dev_ll != NULL) {
1218 			/*get virtio device ID*/
1219 			vdev = dev_ll->vdev;
1220 			dev = vdev->dev;
1221 
1222 			if (unlikely(vdev->remove)) {
1223 				dev_ll = dev_ll->next;
1224 				unlink_vmdq(vdev);
1225 				vdev->ready = DEVICE_SAFE_REMOVE;
1226 				continue;
1227 			}
1228 			if (likely(vdev->ready == DEVICE_RX)) {
1229 				/*Handle guest RX*/
1230 				rx_count = rte_eth_rx_burst(ports[0],
1231 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1232 
1233 				if (rx_count) {
1234 					/*
1235 					* Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1236 					* Here MAX_PKT_BURST must be less than virtio queue size
1237 					*/
1238 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1239 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1240 							rte_delay_us(burst_rx_delay_time);
1241 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1242 								break;
1243 						}
1244 					}
1245 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1246 					if (enable_stats) {
1247 						rte_atomic64_add(
1248 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1249 						rx_count);
1250 						rte_atomic64_add(
1251 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1252 					}
1253 					while (likely(rx_count)) {
1254 						rx_count--;
1255 						rte_pktmbuf_free(pkts_burst[rx_count]);
1256 					}
1257 
1258 				}
1259 			}
1260 
1261 			if (likely(!vdev->remove)) {
1262 				/* Handle guest TX*/
1263 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1264 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1265 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1266 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1267 						while (tx_count--)
1268 							rte_pktmbuf_free(pkts_burst[tx_count]);
1269 					}
1270 				}
1271 				while (tx_count)
1272 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1273 			}
1274 
1275 			/*move to the next device in the list*/
1276 			dev_ll = dev_ll->next;
1277 		}
1278 	}
1279 
1280 	return 0;
1281 }
1282 
1283 /*
1284  * This function gets available ring number for zero copy rx.
1285  * Only one thread will call this funciton for a paticular virtio device,
1286  * so, it is designed as non-thread-safe function.
1287  */
1288 static inline uint32_t __attribute__((always_inline))
1289 get_available_ring_num_zcp(struct virtio_net *dev)
1290 {
1291 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1292 	uint16_t avail_idx;
1293 
1294 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1295 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1296 }
1297 
1298 /*
1299  * This function gets available ring index for zero copy rx,
1300  * it will retry 'burst_rx_retry_num' times till it get enough ring index.
1301  * Only one thread will call this funciton for a paticular virtio device,
1302  * so, it is designed as non-thread-safe function.
1303  */
1304 static inline uint32_t __attribute__((always_inline))
1305 get_available_ring_index_zcp(struct virtio_net *dev,
1306 	uint16_t *res_base_idx, uint32_t count)
1307 {
1308 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1309 	uint16_t avail_idx;
1310 	uint32_t retry = 0;
1311 	uint16_t free_entries;
1312 
1313 	*res_base_idx = vq->last_used_idx_res;
1314 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1315 	free_entries = (avail_idx - *res_base_idx);
1316 
1317 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1318 			"avail idx: %d, "
1319 			"res base idx:%d, free entries:%d\n",
1320 			dev->device_fh, avail_idx, *res_base_idx,
1321 			free_entries);
1322 
1323 	/*
1324 	 * If retry is enabled and the queue is full then we wait
1325 	 * and retry to avoid packet loss.
1326 	 */
1327 	if (enable_retry && unlikely(count > free_entries)) {
1328 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1329 			rte_delay_us(burst_rx_delay_time);
1330 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1331 			free_entries = (avail_idx - *res_base_idx);
1332 			if (count <= free_entries)
1333 				break;
1334 		}
1335 	}
1336 
1337 	/*check that we have enough buffers*/
1338 	if (unlikely(count > free_entries))
1339 		count = free_entries;
1340 
1341 	if (unlikely(count == 0)) {
1342 		LOG_DEBUG(VHOST_DATA,
1343 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1344 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1345 			dev->device_fh, avail_idx,
1346 			*res_base_idx, free_entries);
1347 		return 0;
1348 	}
1349 
1350 	vq->last_used_idx_res = *res_base_idx + count;
1351 
1352 	return count;
1353 }
1354 
1355 /*
1356  * This function put descriptor back to used list.
1357  */
1358 static inline void __attribute__((always_inline))
1359 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1360 {
1361 	uint16_t res_cur_idx = vq->last_used_idx;
1362 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1363 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1364 	rte_compiler_barrier();
1365 	*(volatile uint16_t *)&vq->used->idx += 1;
1366 	vq->last_used_idx += 1;
1367 
1368 	/* Kick the guest if necessary. */
1369 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1370 		eventfd_write((int)vq->kickfd, 1);
1371 }
1372 
1373 /*
1374  * This function get available descriptor from vitio vring and un-attached mbuf
1375  * from vpool->ring, and then attach them together. It needs adjust the offset
1376  * for buff_addr and phys_addr accroding to PMD implementation, otherwise the
1377  * frame data may be put to wrong location in mbuf.
1378  */
1379 static inline void __attribute__((always_inline))
1380 attach_rxmbuf_zcp(struct virtio_net *dev)
1381 {
1382 	uint16_t res_base_idx, desc_idx;
1383 	uint64_t buff_addr, phys_addr;
1384 	struct vhost_virtqueue *vq;
1385 	struct vring_desc *desc;
1386 	struct rte_mbuf *mbuf = NULL;
1387 	struct vpool *vpool;
1388 	hpa_type addr_type;
1389 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1390 
1391 	vpool = &vpool_array[vdev->vmdq_rx_q];
1392 	vq = dev->virtqueue[VIRTIO_RXQ];
1393 
1394 	do {
1395 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1396 				1) != 1))
1397 			return;
1398 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1399 
1400 		desc = &vq->desc[desc_idx];
1401 		if (desc->flags & VRING_DESC_F_NEXT) {
1402 			desc = &vq->desc[desc->next];
1403 			buff_addr = gpa_to_vva(dev, desc->addr);
1404 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1405 					&addr_type);
1406 		} else {
1407 			buff_addr = gpa_to_vva(dev,
1408 					desc->addr + vq->vhost_hlen);
1409 			phys_addr = gpa_to_hpa(vdev,
1410 					desc->addr + vq->vhost_hlen,
1411 					desc->len, &addr_type);
1412 		}
1413 
1414 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1415 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1416 				" address found when attaching RX frame buffer"
1417 				" address!\n", dev->device_fh);
1418 			put_desc_to_used_list_zcp(vq, desc_idx);
1419 			continue;
1420 		}
1421 
1422 		/*
1423 		 * Check if the frame buffer address from guest crosses
1424 		 * sub-region or not.
1425 		 */
1426 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1427 			RTE_LOG(ERR, VHOST_DATA,
1428 				"(%"PRIu64") Frame buffer address cross "
1429 				"sub-regioin found when attaching RX frame "
1430 				"buffer address!\n",
1431 				dev->device_fh);
1432 			put_desc_to_used_list_zcp(vq, desc_idx);
1433 			continue;
1434 		}
1435 	} while (unlikely(phys_addr == 0));
1436 
1437 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1438 	if (unlikely(mbuf == NULL)) {
1439 		LOG_DEBUG(VHOST_DATA,
1440 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1441 			"ring_sc_dequeue fail.\n",
1442 			dev->device_fh);
1443 		put_desc_to_used_list_zcp(vq, desc_idx);
1444 		return;
1445 	}
1446 
1447 	if (unlikely(vpool->buf_size > desc->len)) {
1448 		LOG_DEBUG(VHOST_DATA,
1449 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1450 			"length(%d) of descriptor idx: %d less than room "
1451 			"size required: %d\n",
1452 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1453 		put_desc_to_used_list_zcp(vq, desc_idx);
1454 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1455 		return;
1456 	}
1457 
1458 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1459 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1460 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1461 	mbuf->data_len = desc->len;
1462 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1463 
1464 	LOG_DEBUG(VHOST_DATA,
1465 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1466 		"descriptor idx:%d\n",
1467 		dev->device_fh, res_base_idx, desc_idx);
1468 
1469 	__rte_mbuf_raw_free(mbuf);
1470 
1471 	return;
1472 }
1473 
1474 /*
1475  * Detach an attched packet mbuf -
1476  *  - restore original mbuf address and length values.
1477  *  - reset pktmbuf data and data_len to their default values.
1478  *  All other fields of the given packet mbuf will be left intact.
1479  *
1480  * @param m
1481  *   The attached packet mbuf.
1482  */
1483 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1484 {
1485 	const struct rte_mempool *mp = m->pool;
1486 	void *buf = RTE_MBUF_TO_BADDR(m);
1487 	uint32_t buf_ofs;
1488 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1489 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1490 
1491 	m->buf_addr = buf;
1492 	m->buf_len = (uint16_t)buf_len;
1493 
1494 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1495 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1496 	m->data_off = buf_ofs;
1497 
1498 	m->data_len = 0;
1499 }
1500 
1501 /*
1502  * This function is called after packets have been transimited. It fetchs mbuf
1503  * from vpool->pool, detached it and put into vpool->ring. It also update the
1504  * used index and kick the guest if necessary.
1505  */
1506 static inline uint32_t __attribute__((always_inline))
1507 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1508 {
1509 	struct rte_mbuf *mbuf;
1510 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1511 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1512 	uint32_t index = 0;
1513 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1514 
1515 	LOG_DEBUG(VHOST_DATA,
1516 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1517 		"clean is: %d\n",
1518 		dev->device_fh, mbuf_count);
1519 	LOG_DEBUG(VHOST_DATA,
1520 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1521 		"clean  is : %d\n",
1522 		dev->device_fh, rte_ring_count(vpool->ring));
1523 
1524 	for (index = 0; index < mbuf_count; index++) {
1525 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1526 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1527 			pktmbuf_detach_zcp(mbuf);
1528 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1529 
1530 		/* Update used index buffer information. */
1531 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1532 		vq->used->ring[used_idx].len = 0;
1533 
1534 		used_idx = (used_idx + 1) & (vq->size - 1);
1535 	}
1536 
1537 	LOG_DEBUG(VHOST_DATA,
1538 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1539 		"clean is: %d\n",
1540 		dev->device_fh, rte_mempool_count(vpool->pool));
1541 	LOG_DEBUG(VHOST_DATA,
1542 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1543 		"clean  is : %d\n",
1544 		dev->device_fh, rte_ring_count(vpool->ring));
1545 	LOG_DEBUG(VHOST_DATA,
1546 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1547 		"vq->last_used_idx:%d\n",
1548 		dev->device_fh, vq->last_used_idx);
1549 
1550 	vq->last_used_idx += mbuf_count;
1551 
1552 	LOG_DEBUG(VHOST_DATA,
1553 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1554 		"vq->last_used_idx:%d\n",
1555 		dev->device_fh, vq->last_used_idx);
1556 
1557 	rte_compiler_barrier();
1558 
1559 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1560 
1561 	/* Kick guest if required. */
1562 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1563 		eventfd_write((int)vq->kickfd, 1);
1564 
1565 	return 0;
1566 }
1567 
1568 /*
1569  * This function is called when a virtio device is destroy.
1570  * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring.
1571  */
1572 static void mbuf_destroy_zcp(struct vpool *vpool)
1573 {
1574 	struct rte_mbuf *mbuf = NULL;
1575 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1576 
1577 	LOG_DEBUG(VHOST_CONFIG,
1578 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1579 		"mbuf_destroy_zcp is: %d\n",
1580 		mbuf_count);
1581 	LOG_DEBUG(VHOST_CONFIG,
1582 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1583 		"mbuf_destroy_zcp  is : %d\n",
1584 		rte_ring_count(vpool->ring));
1585 
1586 	for (index = 0; index < mbuf_count; index++) {
1587 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1588 		if (likely(mbuf != NULL)) {
1589 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1590 				pktmbuf_detach_zcp(mbuf);
1591 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1592 		}
1593 	}
1594 
1595 	LOG_DEBUG(VHOST_CONFIG,
1596 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1597 		"mbuf_destroy_zcp is: %d\n",
1598 		rte_mempool_count(vpool->pool));
1599 	LOG_DEBUG(VHOST_CONFIG,
1600 		"in mbuf_destroy_zcp: mbuf count in ring after "
1601 		"mbuf_destroy_zcp is : %d\n",
1602 		rte_ring_count(vpool->ring));
1603 }
1604 
1605 /*
1606  * This function update the use flag and counter.
1607  */
1608 static inline uint32_t __attribute__((always_inline))
1609 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1610 	uint32_t count)
1611 {
1612 	struct vhost_virtqueue *vq;
1613 	struct vring_desc *desc;
1614 	struct rte_mbuf *buff;
1615 	/* The virtio_hdr is initialised to 0. */
1616 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1617 		= {{0, 0, 0, 0, 0, 0}, 0};
1618 	uint64_t buff_hdr_addr = 0;
1619 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1620 	uint32_t head_idx, packet_success = 0;
1621 	uint16_t res_cur_idx;
1622 
1623 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1624 
1625 	if (count == 0)
1626 		return 0;
1627 
1628 	vq = dev->virtqueue[VIRTIO_RXQ];
1629 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1630 
1631 	res_cur_idx = vq->last_used_idx;
1632 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1633 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1634 
1635 	/* Retrieve all of the head indexes first to avoid caching issues. */
1636 	for (head_idx = 0; head_idx < count; head_idx++)
1637 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1638 
1639 	/*Prefetch descriptor index. */
1640 	rte_prefetch0(&vq->desc[head[packet_success]]);
1641 
1642 	while (packet_success != count) {
1643 		/* Get descriptor from available ring */
1644 		desc = &vq->desc[head[packet_success]];
1645 
1646 		buff = pkts[packet_success];
1647 		LOG_DEBUG(VHOST_DATA,
1648 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1649 			"pkt[%d] descriptor idx: %d\n",
1650 			dev->device_fh, packet_success,
1651 			MBUF_HEADROOM_UINT32(buff));
1652 
1653 		PRINT_PACKET(dev,
1654 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1655 			+ RTE_PKTMBUF_HEADROOM),
1656 			rte_pktmbuf_data_len(buff), 0);
1657 
1658 		/* Buffer address translation for virtio header. */
1659 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1660 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1661 
1662 		/*
1663 		 * If the descriptors are chained the header and data are
1664 		 * placed in separate buffers.
1665 		 */
1666 		if (desc->flags & VRING_DESC_F_NEXT) {
1667 			desc->len = vq->vhost_hlen;
1668 			desc = &vq->desc[desc->next];
1669 			desc->len = rte_pktmbuf_data_len(buff);
1670 		} else {
1671 			desc->len = packet_len;
1672 		}
1673 
1674 		/* Update used ring with desc information */
1675 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1676 			= head[packet_success];
1677 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1678 			= packet_len;
1679 		res_cur_idx++;
1680 		packet_success++;
1681 
1682 		/* A header is required per buffer. */
1683 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1684 			(const void *)&virtio_hdr, vq->vhost_hlen);
1685 
1686 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1687 
1688 		if (likely(packet_success < count)) {
1689 			/* Prefetch descriptor index. */
1690 			rte_prefetch0(&vq->desc[head[packet_success]]);
1691 		}
1692 	}
1693 
1694 	rte_compiler_barrier();
1695 
1696 	LOG_DEBUG(VHOST_DATA,
1697 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1698 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1699 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1700 
1701 	*(volatile uint16_t *)&vq->used->idx += count;
1702 	vq->last_used_idx += count;
1703 
1704 	LOG_DEBUG(VHOST_DATA,
1705 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1706 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1707 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1708 
1709 	/* Kick the guest if necessary. */
1710 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1711 		eventfd_write((int)vq->kickfd, 1);
1712 
1713 	return count;
1714 }
1715 
1716 /*
1717  * This function routes the TX packet to the correct interface.
1718  * This may be a local device or the physical port.
1719  */
1720 static inline void __attribute__((always_inline))
1721 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1722 	uint32_t desc_idx, uint8_t need_copy)
1723 {
1724 	struct mbuf_table *tx_q;
1725 	struct rte_mbuf **m_table;
1726 	struct rte_mbuf *mbuf = NULL;
1727 	unsigned len, ret, offset = 0;
1728 	struct vpool *vpool;
1729 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1730 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1731 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1732 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1733 
1734 	/*Add packet to the port tx queue*/
1735 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1736 	len = tx_q->len;
1737 
1738 	/* Allocate an mbuf and populate the structure. */
1739 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1740 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1741 	if (unlikely(mbuf == NULL)) {
1742 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1743 		RTE_LOG(ERR, VHOST_DATA,
1744 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1745 			dev->device_fh);
1746 		put_desc_to_used_list_zcp(vq, desc_idx);
1747 		return;
1748 	}
1749 
1750 	if (vm2vm_mode == VM2VM_HARDWARE) {
1751 		/* Avoid using a vlan tag from any vm for external pkt, such as
1752 		 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool
1753 		 * selection, MAC address determines it as an external pkt
1754 		 * which should go to network, while vlan tag determine it as
1755 		 * a vm2vm pkt should forward to another vm. Hardware confuse
1756 		 * such a ambiguous situation, so pkt will lost.
1757 		 */
1758 		vlan_tag = external_pkt_default_vlan_tag;
1759 		while (dev_ll != NULL) {
1760 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1761 				ether_addr_cmp(&(pkt_hdr->d_addr),
1762 				&dev_ll->vdev->mac_address)) {
1763 
1764 				/*
1765 				 * Drop the packet if the TX packet is destined
1766 				 * for the TX device.
1767 				 */
1768 				if (unlikely(dev_ll->vdev->dev->device_fh
1769 					== dev->device_fh)) {
1770 					LOG_DEBUG(VHOST_DATA,
1771 					"(%"PRIu64") TX: Source and destination"
1772 					"MAC addresses are the same. Dropping "
1773 					"packet.\n",
1774 					dev_ll->vdev->dev->device_fh);
1775 					MBUF_HEADROOM_UINT32(mbuf)
1776 						= (uint32_t)desc_idx;
1777 					__rte_mbuf_raw_free(mbuf);
1778 					return;
1779 				}
1780 
1781 				/*
1782 				 * Packet length offset 4 bytes for HW vlan
1783 				 * strip when L2 switch back.
1784 				 */
1785 				offset = 4;
1786 				vlan_tag =
1787 				(uint16_t)
1788 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1789 
1790 				LOG_DEBUG(VHOST_DATA,
1791 				"(%"PRIu64") TX: pkt to local VM device id:"
1792 				"(%"PRIu64") vlan tag: %d.\n",
1793 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1794 				vlan_tag);
1795 
1796 				break;
1797 			}
1798 			dev_ll = dev_ll->next;
1799 		}
1800 	}
1801 
1802 	mbuf->nb_segs = m->nb_segs;
1803 	mbuf->next = m->next;
1804 	mbuf->data_len = m->data_len + offset;
1805 	mbuf->pkt_len = mbuf->data_len;
1806 	if (unlikely(need_copy)) {
1807 		/* Copy the packet contents to the mbuf. */
1808 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1809 			rte_pktmbuf_mtod(m, void *),
1810 			m->data_len);
1811 	} else {
1812 		mbuf->data_off = m->data_off;
1813 		mbuf->buf_physaddr = m->buf_physaddr;
1814 		mbuf->buf_addr = m->buf_addr;
1815 	}
1816 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1817 	mbuf->vlan_tci = vlan_tag;
1818 	mbuf->l2_len = sizeof(struct ether_hdr);
1819 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1820 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1821 
1822 	tx_q->m_table[len] = mbuf;
1823 	len++;
1824 
1825 	LOG_DEBUG(VHOST_DATA,
1826 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1827 		dev->device_fh,
1828 		mbuf->nb_segs,
1829 		(mbuf->next == NULL) ? "null" : "non-null");
1830 
1831 	if (enable_stats) {
1832 		dev_statistics[dev->device_fh].tx_total++;
1833 		dev_statistics[dev->device_fh].tx++;
1834 	}
1835 
1836 	if (unlikely(len == MAX_PKT_BURST)) {
1837 		m_table = (struct rte_mbuf **)tx_q->m_table;
1838 		ret = rte_eth_tx_burst(ports[0],
1839 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1840 
1841 		/*
1842 		 * Free any buffers not handled by TX and update
1843 		 * the port stats.
1844 		 */
1845 		if (unlikely(ret < len)) {
1846 			do {
1847 				rte_pktmbuf_free(m_table[ret]);
1848 			} while (++ret < len);
1849 		}
1850 
1851 		len = 0;
1852 		txmbuf_clean_zcp(dev, vpool);
1853 	}
1854 
1855 	tx_q->len = len;
1856 
1857 	return;
1858 }
1859 
1860 /*
1861  * This function TX all available packets in virtio TX queue for one
1862  * virtio-net device. If it is first packet, it learns MAC address and
1863  * setup VMDQ.
1864  */
1865 static inline void __attribute__((always_inline))
1866 virtio_dev_tx_zcp(struct virtio_net *dev)
1867 {
1868 	struct rte_mbuf m;
1869 	struct vhost_virtqueue *vq;
1870 	struct vring_desc *desc;
1871 	uint64_t buff_addr = 0, phys_addr;
1872 	uint32_t head[MAX_PKT_BURST];
1873 	uint32_t i;
1874 	uint16_t free_entries, packet_success = 0;
1875 	uint16_t avail_idx;
1876 	uint8_t need_copy = 0;
1877 	hpa_type addr_type;
1878 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1879 
1880 	vq = dev->virtqueue[VIRTIO_TXQ];
1881 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1882 
1883 	/* If there are no available buffers then return. */
1884 	if (vq->last_used_idx_res == avail_idx)
1885 		return;
1886 
1887 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1888 
1889 	/* Prefetch available ring to retrieve head indexes. */
1890 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1891 
1892 	/* Get the number of free entries in the ring */
1893 	free_entries = (avail_idx - vq->last_used_idx_res);
1894 
1895 	/* Limit to MAX_PKT_BURST. */
1896 	free_entries
1897 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1898 
1899 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1900 		dev->device_fh, free_entries);
1901 
1902 	/* Retrieve all of the head indexes first to avoid caching issues. */
1903 	for (i = 0; i < free_entries; i++)
1904 		head[i]
1905 			= vq->avail->ring[(vq->last_used_idx_res + i)
1906 			& (vq->size - 1)];
1907 
1908 	vq->last_used_idx_res += free_entries;
1909 
1910 	/* Prefetch descriptor index. */
1911 	rte_prefetch0(&vq->desc[head[packet_success]]);
1912 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1913 
1914 	while (packet_success < free_entries) {
1915 		desc = &vq->desc[head[packet_success]];
1916 
1917 		/* Discard first buffer as it is the virtio header */
1918 		desc = &vq->desc[desc->next];
1919 
1920 		/* Buffer address translation. */
1921 		buff_addr = gpa_to_vva(dev, desc->addr);
1922 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1923 
1924 		if (likely(packet_success < (free_entries - 1)))
1925 			/* Prefetch descriptor index. */
1926 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1927 
1928 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1929 			RTE_LOG(ERR, VHOST_DATA,
1930 				"(%"PRIu64") Invalid frame buffer address found"
1931 				"when TX packets!\n",
1932 				dev->device_fh);
1933 			packet_success++;
1934 			continue;
1935 		}
1936 
1937 		/* Prefetch buffer address. */
1938 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1939 
1940 		/*
1941 		 * Setup dummy mbuf. This is copied to a real mbuf if
1942 		 * transmitted out the physical port.
1943 		 */
1944 		m.data_len = desc->len;
1945 		m.nb_segs = 1;
1946 		m.next = NULL;
1947 		m.data_off = 0;
1948 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1949 		m.buf_physaddr = phys_addr;
1950 
1951 		/*
1952 		 * Check if the frame buffer address from guest crosses
1953 		 * sub-region or not.
1954 		 */
1955 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1956 			RTE_LOG(ERR, VHOST_DATA,
1957 				"(%"PRIu64") Frame buffer address cross "
1958 				"sub-regioin found when attaching TX frame "
1959 				"buffer address!\n",
1960 				dev->device_fh);
1961 			need_copy = 1;
1962 		} else
1963 			need_copy = 0;
1964 
1965 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1966 
1967 		/*
1968 		 * If this is the first received packet we need to learn
1969 		 * the MAC and setup VMDQ
1970 		 */
1971 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1972 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1973 				/*
1974 				 * Discard frame if device is scheduled for
1975 				 * removal or a duplicate MAC address is found.
1976 				 */
1977 				packet_success += free_entries;
1978 				vq->last_used_idx += packet_success;
1979 				break;
1980 			}
1981 		}
1982 
1983 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1984 		packet_success++;
1985 	}
1986 }
1987 
1988 /*
1989  * This function is called by each data core. It handles all RX/TX registered
1990  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1991  * addresses are compared with all devices in the main linked list.
1992  */
1993 static int
1994 switch_worker_zcp(__attribute__((unused)) void *arg)
1995 {
1996 	struct virtio_net *dev = NULL;
1997 	struct vhost_dev  *vdev = NULL;
1998 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1999 	struct virtio_net_data_ll *dev_ll;
2000 	struct mbuf_table *tx_q;
2001 	volatile struct lcore_ll_info *lcore_ll;
2002 	const uint64_t drain_tsc
2003 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2004 		* BURST_TX_DRAIN_US;
2005 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2006 	unsigned ret;
2007 	const uint16_t lcore_id = rte_lcore_id();
2008 	uint16_t count_in_ring, rx_count = 0;
2009 
2010 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2011 
2012 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2013 	prev_tsc = 0;
2014 
2015 	while (1) {
2016 		cur_tsc = rte_rdtsc();
2017 
2018 		/* TX burst queue drain */
2019 		diff_tsc = cur_tsc - prev_tsc;
2020 		if (unlikely(diff_tsc > drain_tsc)) {
2021 			/*
2022 			 * Get mbuf from vpool.pool and detach mbuf and
2023 			 * put back into vpool.ring.
2024 			 */
2025 			dev_ll = lcore_ll->ll_root_used;
2026 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2027 				/* Get virtio device ID */
2028 				vdev = dev_ll->vdev;
2029 				dev = vdev->dev;
2030 
2031 				if (likely(!vdev->remove)) {
2032 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2033 					if (tx_q->len) {
2034 						LOG_DEBUG(VHOST_DATA,
2035 						"TX queue drained after timeout"
2036 						" with burst size %u\n",
2037 						tx_q->len);
2038 
2039 						/*
2040 						 * Tx any packets in the queue
2041 						 */
2042 						ret = rte_eth_tx_burst(
2043 							ports[0],
2044 							(uint16_t)tx_q->txq_id,
2045 							(struct rte_mbuf **)
2046 							tx_q->m_table,
2047 							(uint16_t)tx_q->len);
2048 						if (unlikely(ret < tx_q->len)) {
2049 							do {
2050 								rte_pktmbuf_free(
2051 									tx_q->m_table[ret]);
2052 							} while (++ret < tx_q->len);
2053 						}
2054 						tx_q->len = 0;
2055 
2056 						txmbuf_clean_zcp(dev,
2057 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2058 					}
2059 				}
2060 				dev_ll = dev_ll->next;
2061 			}
2062 			prev_tsc = cur_tsc;
2063 		}
2064 
2065 		rte_prefetch0(lcore_ll->ll_root_used);
2066 
2067 		/*
2068 		 * Inform the configuration core that we have exited the linked
2069 		 * list and that no devices are in use if requested.
2070 		 */
2071 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2072 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2073 
2074 		/* Process devices */
2075 		dev_ll = lcore_ll->ll_root_used;
2076 
2077 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2078 			vdev = dev_ll->vdev;
2079 			dev  = vdev->dev;
2080 			if (unlikely(vdev->remove)) {
2081 				dev_ll = dev_ll->next;
2082 				unlink_vmdq(vdev);
2083 				vdev->ready = DEVICE_SAFE_REMOVE;
2084 				continue;
2085 			}
2086 
2087 			if (likely(vdev->ready == DEVICE_RX)) {
2088 				uint32_t index = vdev->vmdq_rx_q;
2089 				uint16_t i;
2090 				count_in_ring
2091 				= rte_ring_count(vpool_array[index].ring);
2092 				uint16_t free_entries
2093 				= (uint16_t)get_available_ring_num_zcp(dev);
2094 
2095 				/*
2096 				 * Attach all mbufs in vpool.ring and put back
2097 				 * into vpool.pool.
2098 				 */
2099 				for (i = 0;
2100 				i < RTE_MIN(free_entries,
2101 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2102 				i++)
2103 					attach_rxmbuf_zcp(dev);
2104 
2105 				/* Handle guest RX */
2106 				rx_count = rte_eth_rx_burst(ports[0],
2107 					vdev->vmdq_rx_q, pkts_burst,
2108 					MAX_PKT_BURST);
2109 
2110 				if (rx_count) {
2111 					ret_count = virtio_dev_rx_zcp(dev,
2112 							pkts_burst, rx_count);
2113 					if (enable_stats) {
2114 						dev_statistics[dev->device_fh].rx_total
2115 							+= rx_count;
2116 						dev_statistics[dev->device_fh].rx
2117 							+= ret_count;
2118 					}
2119 					while (likely(rx_count)) {
2120 						rx_count--;
2121 						pktmbuf_detach_zcp(
2122 							pkts_burst[rx_count]);
2123 						rte_ring_sp_enqueue(
2124 							vpool_array[index].ring,
2125 							(void *)pkts_burst[rx_count]);
2126 					}
2127 				}
2128 			}
2129 
2130 			if (likely(!vdev->remove))
2131 				/* Handle guest TX */
2132 				virtio_dev_tx_zcp(dev);
2133 
2134 			/* Move to the next device in the list */
2135 			dev_ll = dev_ll->next;
2136 		}
2137 	}
2138 
2139 	return 0;
2140 }
2141 
2142 
2143 /*
2144  * Add an entry to a used linked list. A free entry must first be found
2145  * in the free linked list using get_data_ll_free_entry();
2146  */
2147 static void
2148 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2149 	struct virtio_net_data_ll *ll_dev)
2150 {
2151 	struct virtio_net_data_ll *ll = *ll_root_addr;
2152 
2153 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2154 	ll_dev->next = NULL;
2155 	rte_compiler_barrier();
2156 
2157 	/* If ll == NULL then this is the first device. */
2158 	if (ll) {
2159 		/* Increment to the tail of the linked list. */
2160 		while ((ll->next != NULL) )
2161 			ll = ll->next;
2162 
2163 		ll->next = ll_dev;
2164 	} else {
2165 		*ll_root_addr = ll_dev;
2166 	}
2167 }
2168 
2169 /*
2170  * Remove an entry from a used linked list. The entry must then be added to
2171  * the free linked list using put_data_ll_free_entry().
2172  */
2173 static void
2174 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2175 	struct virtio_net_data_ll *ll_dev,
2176 	struct virtio_net_data_ll *ll_dev_last)
2177 {
2178 	struct virtio_net_data_ll *ll = *ll_root_addr;
2179 
2180 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2181 		return;
2182 
2183 	if (ll_dev == ll)
2184 		*ll_root_addr = ll_dev->next;
2185 	else
2186 		if (likely(ll_dev_last != NULL))
2187 			ll_dev_last->next = ll_dev->next;
2188 		else
2189 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2190 }
2191 
2192 /*
2193  * Find and return an entry from the free linked list.
2194  */
2195 static struct virtio_net_data_ll *
2196 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2197 {
2198 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2199 	struct virtio_net_data_ll *ll_dev;
2200 
2201 	if (ll_free == NULL)
2202 		return NULL;
2203 
2204 	ll_dev = ll_free;
2205 	*ll_root_addr = ll_free->next;
2206 
2207 	return ll_dev;
2208 }
2209 
2210 /*
2211  * Place an entry back on to the free linked list.
2212  */
2213 static void
2214 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2215 	struct virtio_net_data_ll *ll_dev)
2216 {
2217 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2218 
2219 	if (ll_dev == NULL)
2220 		return;
2221 
2222 	ll_dev->next = ll_free;
2223 	*ll_root_addr = ll_dev;
2224 }
2225 
2226 /*
2227  * Creates a linked list of a given size.
2228  */
2229 static struct virtio_net_data_ll *
2230 alloc_data_ll(uint32_t size)
2231 {
2232 	struct virtio_net_data_ll *ll_new;
2233 	uint32_t i;
2234 
2235 	/* Malloc and then chain the linked list. */
2236 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2237 	if (ll_new == NULL) {
2238 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2239 		return NULL;
2240 	}
2241 
2242 	for (i = 0; i < size - 1; i++) {
2243 		ll_new[i].vdev = NULL;
2244 		ll_new[i].next = &ll_new[i+1];
2245 	}
2246 	ll_new[i].next = NULL;
2247 
2248 	return (ll_new);
2249 }
2250 
2251 /*
2252  * Create the main linked list along with each individual cores linked list. A used and a free list
2253  * are created to manage entries.
2254  */
2255 static int
2256 init_data_ll (void)
2257 {
2258 	int lcore;
2259 
2260 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2261 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2262 		if (lcore_info[lcore].lcore_ll == NULL) {
2263 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2264 			return -1;
2265 		}
2266 
2267 		lcore_info[lcore].lcore_ll->device_num = 0;
2268 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2269 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2270 		if (num_devices % num_switching_cores)
2271 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2272 		else
2273 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2274 	}
2275 
2276 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2277 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2278 
2279 	return 0;
2280 }
2281 
2282 /*
2283  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2284  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2285  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2286  */
2287 static void
2288 destroy_device (volatile struct virtio_net *dev)
2289 {
2290 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2291 	struct virtio_net_data_ll *ll_main_dev_cur;
2292 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2293 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2294 	struct vhost_dev *vdev;
2295 	int lcore;
2296 
2297 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2298 
2299 	vdev = (struct vhost_dev *)dev->priv;
2300 	/*set the remove flag. */
2301 	vdev->remove = 1;
2302 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2303 		rte_pause();
2304 	}
2305 
2306 	/* Search for entry to be removed from lcore ll */
2307 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2308 	while (ll_lcore_dev_cur != NULL) {
2309 		if (ll_lcore_dev_cur->vdev == vdev) {
2310 			break;
2311 		} else {
2312 			ll_lcore_dev_last = ll_lcore_dev_cur;
2313 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2314 		}
2315 	}
2316 
2317 	if (ll_lcore_dev_cur == NULL) {
2318 		RTE_LOG(ERR, VHOST_CONFIG,
2319 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
2320 			dev->device_fh);
2321 		return;
2322 	}
2323 
2324 	/* Search for entry to be removed from main ll */
2325 	ll_main_dev_cur = ll_root_used;
2326 	ll_main_dev_last = NULL;
2327 	while (ll_main_dev_cur != NULL) {
2328 		if (ll_main_dev_cur->vdev == vdev) {
2329 			break;
2330 		} else {
2331 			ll_main_dev_last = ll_main_dev_cur;
2332 			ll_main_dev_cur = ll_main_dev_cur->next;
2333 		}
2334 	}
2335 
2336 	/* Remove entries from the lcore and main ll. */
2337 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2338 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2339 
2340 	/* Set the dev_removal_flag on each lcore. */
2341 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2342 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2343 	}
2344 
2345 	/*
2346 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2347 	 * they can no longer access the device removed from the linked lists and that the devices
2348 	 * are no longer in use.
2349 	 */
2350 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2351 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2352 			rte_pause();
2353 		}
2354 	}
2355 
2356 	/* Add the entries back to the lcore and main free ll.*/
2357 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2358 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2359 
2360 	/* Decrement number of device on the lcore. */
2361 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2362 
2363 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2364 
2365 	if (zero_copy) {
2366 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2367 
2368 		/* Stop the RX queue. */
2369 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2370 			LOG_DEBUG(VHOST_CONFIG,
2371 				"(%"PRIu64") In destroy_device: Failed to stop "
2372 				"rx queue:%d\n",
2373 				dev->device_fh,
2374 				vdev->vmdq_rx_q);
2375 		}
2376 
2377 		LOG_DEBUG(VHOST_CONFIG,
2378 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2379 			"mempool back to ring for RX queue: %d\n",
2380 			dev->device_fh, vdev->vmdq_rx_q);
2381 
2382 		mbuf_destroy_zcp(vpool);
2383 
2384 		/* Stop the TX queue. */
2385 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2386 			LOG_DEBUG(VHOST_CONFIG,
2387 				"(%"PRIu64") In destroy_device: Failed to "
2388 				"stop tx queue:%d\n",
2389 				dev->device_fh, vdev->vmdq_rx_q);
2390 		}
2391 
2392 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2393 
2394 		LOG_DEBUG(VHOST_CONFIG,
2395 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2396 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2397 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2398 			dev->device_fh);
2399 
2400 		mbuf_destroy_zcp(vpool);
2401 		rte_free(vdev->regions_hpa);
2402 	}
2403 	rte_free(vdev);
2404 
2405 }
2406 
2407 /*
2408  * Calculate the region count of physical continous regions for one particular
2409  * region of whose vhost virtual address is continous. The particular region
2410  * start from vva_start, with size of 'size' in argument.
2411  */
2412 static uint32_t
2413 check_hpa_regions(uint64_t vva_start, uint64_t size)
2414 {
2415 	uint32_t i, nregions = 0, page_size = getpagesize();
2416 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2417 	if (vva_start % page_size) {
2418 		LOG_DEBUG(VHOST_CONFIG,
2419 			"in check_countinous: vva start(%p) mod page_size(%d) "
2420 			"has remainder\n",
2421 			(void *)(uintptr_t)vva_start, page_size);
2422 		return 0;
2423 	}
2424 	if (size % page_size) {
2425 		LOG_DEBUG(VHOST_CONFIG,
2426 			"in check_countinous: "
2427 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2428 			size, page_size);
2429 		return 0;
2430 	}
2431 	for (i = 0; i < size - page_size; i = i + page_size) {
2432 		cur_phys_addr
2433 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2434 		next_phys_addr = rte_mem_virt2phy(
2435 			(void *)(uintptr_t)(vva_start + i + page_size));
2436 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2437 			++nregions;
2438 			LOG_DEBUG(VHOST_CONFIG,
2439 				"in check_continuous: hva addr:(%p) is not "
2440 				"continuous with hva addr:(%p), diff:%d\n",
2441 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2442 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2443 				+ page_size), page_size);
2444 			LOG_DEBUG(VHOST_CONFIG,
2445 				"in check_continuous: hpa addr:(%p) is not "
2446 				"continuous with hpa addr:(%p), "
2447 				"diff:(%"PRIu64")\n",
2448 				(void *)(uintptr_t)cur_phys_addr,
2449 				(void *)(uintptr_t)next_phys_addr,
2450 				(next_phys_addr-cur_phys_addr));
2451 		}
2452 	}
2453 	return nregions;
2454 }
2455 
2456 /*
2457  * Divide each region whose vhost virtual address is continous into a few
2458  * sub-regions, make sure the physical address within each sub-region are
2459  * continous. And fill offset(to GPA) and size etc. information of each
2460  * sub-region into regions_hpa.
2461  */
2462 static uint32_t
2463 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2464 {
2465 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2466 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2467 
2468 	if (mem_region_hpa == NULL)
2469 		return 0;
2470 
2471 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2472 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2473 			virtio_memory->regions[regionidx].address_offset;
2474 		mem_region_hpa[regionidx_hpa].guest_phys_address
2475 			= virtio_memory->regions[regionidx].guest_phys_address;
2476 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2477 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2478 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2479 		LOG_DEBUG(VHOST_CONFIG,
2480 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2481 			regionidx_hpa,
2482 			(void *)(uintptr_t)
2483 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2484 		LOG_DEBUG(VHOST_CONFIG,
2485 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2486 			regionidx_hpa,
2487 			(void *)(uintptr_t)
2488 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2489 		for (i = 0, k = 0;
2490 			i < virtio_memory->regions[regionidx].memory_size -
2491 				page_size;
2492 			i += page_size) {
2493 			cur_phys_addr = rte_mem_virt2phy(
2494 					(void *)(uintptr_t)(vva_start + i));
2495 			next_phys_addr = rte_mem_virt2phy(
2496 					(void *)(uintptr_t)(vva_start +
2497 					i + page_size));
2498 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2499 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2500 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2501 					k + page_size;
2502 				mem_region_hpa[regionidx_hpa].memory_size
2503 					= k + page_size;
2504 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2505 					"phys addr end  [%d]:(%p)\n",
2506 					regionidx_hpa,
2507 					(void *)(uintptr_t)
2508 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2509 				LOG_DEBUG(VHOST_CONFIG,
2510 					"in fill_hpa_regions: guest phys addr "
2511 					"size [%d]:(%p)\n",
2512 					regionidx_hpa,
2513 					(void *)(uintptr_t)
2514 					(mem_region_hpa[regionidx_hpa].memory_size));
2515 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2516 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2517 				++regionidx_hpa;
2518 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2519 					next_phys_addr -
2520 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2521 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2522 					" phys addr start[%d]:(%p)\n",
2523 					regionidx_hpa,
2524 					(void *)(uintptr_t)
2525 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2526 				LOG_DEBUG(VHOST_CONFIG,
2527 					"in fill_hpa_regions: host  phys addr "
2528 					"start[%d]:(%p)\n",
2529 					regionidx_hpa,
2530 					(void *)(uintptr_t)
2531 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2532 				k = 0;
2533 			} else {
2534 				k += page_size;
2535 			}
2536 		}
2537 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2538 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2539 			+ k + page_size;
2540 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2541 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2542 			"[%d]:(%p)\n", regionidx_hpa,
2543 			(void *)(uintptr_t)
2544 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2545 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2546 			"[%d]:(%p)\n", regionidx_hpa,
2547 			(void *)(uintptr_t)
2548 			(mem_region_hpa[regionidx_hpa].memory_size));
2549 		++regionidx_hpa;
2550 	}
2551 	return regionidx_hpa;
2552 }
2553 
2554 /*
2555  * A new device is added to a data core. First the device is added to the main linked list
2556  * and the allocated to a specific data core.
2557  */
2558 static int
2559 new_device (struct virtio_net *dev)
2560 {
2561 	struct virtio_net_data_ll *ll_dev;
2562 	int lcore, core_add = 0;
2563 	uint32_t device_num_min = num_devices;
2564 	struct vhost_dev *vdev;
2565 	uint32_t regionidx;
2566 
2567 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2568 	if (vdev == NULL) {
2569 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2570 			dev->device_fh);
2571 		return -1;
2572 	}
2573 	vdev->dev = dev;
2574 	dev->priv = vdev;
2575 
2576 	if (zero_copy) {
2577 		vdev->nregions_hpa = dev->mem->nregions;
2578 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2579 			vdev->nregions_hpa
2580 				+= check_hpa_regions(
2581 					dev->mem->regions[regionidx].guest_phys_address
2582 					+ dev->mem->regions[regionidx].address_offset,
2583 					dev->mem->regions[regionidx].memory_size);
2584 
2585 		}
2586 
2587 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2588 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2589 			CACHE_LINE_SIZE);
2590 		if (vdev->regions_hpa == NULL) {
2591 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2592 			rte_free(vdev);
2593 			return -1;
2594 		}
2595 
2596 
2597 		if (fill_hpa_memory_regions(
2598 			vdev->regions_hpa, dev->mem
2599 			) != vdev->nregions_hpa) {
2600 
2601 			RTE_LOG(ERR, VHOST_CONFIG,
2602 				"hpa memory regions number mismatch: "
2603 				"[%d]\n", vdev->nregions_hpa);
2604 			rte_free(vdev->regions_hpa);
2605 			rte_free(vdev);
2606 			return -1;
2607 		}
2608 	}
2609 
2610 
2611 	/* Add device to main ll */
2612 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2613 	if (ll_dev == NULL) {
2614 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2615 			"of %d devices per core has been reached\n",
2616 			dev->device_fh, num_devices);
2617 		if (vdev->regions_hpa)
2618 			rte_free(vdev->regions_hpa);
2619 		rte_free(vdev);
2620 		return -1;
2621 	}
2622 	ll_dev->vdev = vdev;
2623 	add_data_ll_entry(&ll_root_used, ll_dev);
2624 	vdev->vmdq_rx_q
2625 		= dev->device_fh * (num_queues / num_devices);
2626 
2627 	if (zero_copy) {
2628 		uint32_t index = vdev->vmdq_rx_q;
2629 		uint32_t count_in_ring, i;
2630 		struct mbuf_table *tx_q;
2631 
2632 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2633 
2634 		LOG_DEBUG(VHOST_CONFIG,
2635 			"(%"PRIu64") in new_device: mbuf count in mempool "
2636 			"before attach is: %d\n",
2637 			dev->device_fh,
2638 			rte_mempool_count(vpool_array[index].pool));
2639 		LOG_DEBUG(VHOST_CONFIG,
2640 			"(%"PRIu64") in new_device: mbuf count in  ring "
2641 			"before attach  is : %d\n",
2642 			dev->device_fh, count_in_ring);
2643 
2644 		/*
2645 		 * Attach all mbufs in vpool.ring and put back intovpool.pool.
2646 		 */
2647 		for (i = 0; i < count_in_ring; i++)
2648 			attach_rxmbuf_zcp(dev);
2649 
2650 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2651 			"mempool after attach is: %d\n",
2652 			dev->device_fh,
2653 			rte_mempool_count(vpool_array[index].pool));
2654 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2655 			"ring after attach  is : %d\n",
2656 			dev->device_fh,
2657 			rte_ring_count(vpool_array[index].ring));
2658 
2659 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2660 		tx_q->txq_id = vdev->vmdq_rx_q;
2661 
2662 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2663 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2664 
2665 			LOG_DEBUG(VHOST_CONFIG,
2666 				"(%"PRIu64") In new_device: Failed to start "
2667 				"tx queue:%d\n",
2668 				dev->device_fh, vdev->vmdq_rx_q);
2669 
2670 			mbuf_destroy_zcp(vpool);
2671 			rte_free(vdev->regions_hpa);
2672 			rte_free(vdev);
2673 			return -1;
2674 		}
2675 
2676 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2677 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2678 
2679 			LOG_DEBUG(VHOST_CONFIG,
2680 				"(%"PRIu64") In new_device: Failed to start "
2681 				"rx queue:%d\n",
2682 				dev->device_fh, vdev->vmdq_rx_q);
2683 
2684 			/* Stop the TX queue. */
2685 			if (rte_eth_dev_tx_queue_stop(ports[0],
2686 				vdev->vmdq_rx_q) != 0) {
2687 				LOG_DEBUG(VHOST_CONFIG,
2688 					"(%"PRIu64") In new_device: Failed to "
2689 					"stop tx queue:%d\n",
2690 					dev->device_fh, vdev->vmdq_rx_q);
2691 			}
2692 
2693 			mbuf_destroy_zcp(vpool);
2694 			rte_free(vdev->regions_hpa);
2695 			rte_free(vdev);
2696 			return -1;
2697 		}
2698 
2699 	}
2700 
2701 	/*reset ready flag*/
2702 	vdev->ready = DEVICE_MAC_LEARNING;
2703 	vdev->remove = 0;
2704 
2705 	/* Find a suitable lcore to add the device. */
2706 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2707 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2708 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2709 			core_add = lcore;
2710 		}
2711 	}
2712 	/* Add device to lcore ll */
2713 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2714 	if (ll_dev == NULL) {
2715 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2716 		vdev->ready = DEVICE_SAFE_REMOVE;
2717 		destroy_device(dev);
2718 		if (vdev->regions_hpa)
2719 			rte_free(vdev->regions_hpa);
2720 		rte_free(vdev);
2721 		return -1;
2722 	}
2723 	ll_dev->vdev = vdev;
2724 	vdev->coreid = core_add;
2725 
2726 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2727 
2728 	/* Initialize device stats */
2729 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2730 
2731 	/* Disable notifications. */
2732 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2733 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2734 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2735 	dev->flags |= VIRTIO_DEV_RUNNING;
2736 
2737 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2738 
2739 	return 0;
2740 }
2741 
2742 /*
2743  * These callback allow devices to be added to the data core when configuration
2744  * has been fully complete.
2745  */
2746 static const struct virtio_net_device_ops virtio_net_device_ops =
2747 {
2748 	.new_device =  new_device,
2749 	.destroy_device = destroy_device,
2750 };
2751 
2752 /*
2753  * This is a thread will wake up after a period to print stats if the user has
2754  * enabled them.
2755  */
2756 static void
2757 print_stats(void)
2758 {
2759 	struct virtio_net_data_ll *dev_ll;
2760 	uint64_t tx_dropped, rx_dropped;
2761 	uint64_t tx, tx_total, rx, rx_total;
2762 	uint32_t device_fh;
2763 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2764 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2765 
2766 	while(1) {
2767 		sleep(enable_stats);
2768 
2769 		/* Clear screen and move to top left */
2770 		printf("%s%s", clr, top_left);
2771 
2772 		printf("\nDevice statistics ====================================");
2773 
2774 		dev_ll = ll_root_used;
2775 		while (dev_ll != NULL) {
2776 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2777 			tx_total = dev_statistics[device_fh].tx_total;
2778 			tx = dev_statistics[device_fh].tx;
2779 			tx_dropped = tx_total - tx;
2780 			if (zero_copy == 0) {
2781 				rx_total = rte_atomic64_read(
2782 					&dev_statistics[device_fh].rx_total_atomic);
2783 				rx = rte_atomic64_read(
2784 					&dev_statistics[device_fh].rx_atomic);
2785 			} else {
2786 				rx_total = dev_statistics[device_fh].rx_total;
2787 				rx = dev_statistics[device_fh].rx;
2788 			}
2789 			rx_dropped = rx_total - rx;
2790 
2791 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2792 					"\nTX total: 		%"PRIu64""
2793 					"\nTX dropped: 		%"PRIu64""
2794 					"\nTX successful: 		%"PRIu64""
2795 					"\nRX total: 		%"PRIu64""
2796 					"\nRX dropped: 		%"PRIu64""
2797 					"\nRX successful: 		%"PRIu64"",
2798 					device_fh,
2799 					tx_total,
2800 					tx_dropped,
2801 					tx,
2802 					rx_total,
2803 					rx_dropped,
2804 					rx);
2805 
2806 			dev_ll = dev_ll->next;
2807 		}
2808 		printf("\n======================================================\n");
2809 	}
2810 }
2811 
2812 static void
2813 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2814 	char *ring_name, uint32_t nb_mbuf)
2815 {
2816 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2817 	vpool_array[index].pool
2818 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2819 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2820 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2821 		rte_pktmbuf_init, NULL, socket, 0);
2822 	if (vpool_array[index].pool != NULL) {
2823 		vpool_array[index].ring
2824 			= rte_ring_create(ring_name,
2825 				rte_align32pow2(nb_mbuf + 1),
2826 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2827 		if (likely(vpool_array[index].ring != NULL)) {
2828 			LOG_DEBUG(VHOST_CONFIG,
2829 				"in setup_mempool_tbl: mbuf count in "
2830 				"mempool is: %d\n",
2831 				rte_mempool_count(vpool_array[index].pool));
2832 			LOG_DEBUG(VHOST_CONFIG,
2833 				"in setup_mempool_tbl: mbuf count in "
2834 				"ring   is: %d\n",
2835 				rte_ring_count(vpool_array[index].ring));
2836 		} else {
2837 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2838 				ring_name);
2839 		}
2840 
2841 		/* Need consider head room. */
2842 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2843 	} else {
2844 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2845 	}
2846 }
2847 
2848 
2849 /*
2850  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2851  * device is also registered here to handle the IOCTLs.
2852  */
2853 int
2854 MAIN(int argc, char *argv[])
2855 {
2856 	struct rte_mempool *mbuf_pool = NULL;
2857 	unsigned lcore_id, core_id = 0;
2858 	unsigned nb_ports, valid_num_ports;
2859 	int ret;
2860 	uint8_t portid, queue_id = 0;
2861 	static pthread_t tid;
2862 
2863 	/* init EAL */
2864 	ret = rte_eal_init(argc, argv);
2865 	if (ret < 0)
2866 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2867 	argc -= ret;
2868 	argv += ret;
2869 
2870 	/* parse app arguments */
2871 	ret = us_vhost_parse_args(argc, argv);
2872 	if (ret < 0)
2873 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2874 
2875 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2876 		if (rte_lcore_is_enabled(lcore_id))
2877 			lcore_ids[core_id ++] = lcore_id;
2878 
2879 	if (rte_lcore_count() > RTE_MAX_LCORE)
2880 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2881 
2882 	/*set the number of swithcing cores available*/
2883 	num_switching_cores = rte_lcore_count()-1;
2884 
2885 	/* Get the number of physical ports. */
2886 	nb_ports = rte_eth_dev_count();
2887 	if (nb_ports > RTE_MAX_ETHPORTS)
2888 		nb_ports = RTE_MAX_ETHPORTS;
2889 
2890 	/*
2891 	 * Update the global var NUM_PORTS and global array PORTS
2892 	 * and get value of var VALID_NUM_PORTS according to system ports number
2893 	 */
2894 	valid_num_ports = check_ports_num(nb_ports);
2895 
2896 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2897 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2898 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2899 		return -1;
2900 	}
2901 
2902 	if (zero_copy == 0) {
2903 		/* Create the mbuf pool. */
2904 		mbuf_pool = rte_mempool_create(
2905 				"MBUF_POOL",
2906 				NUM_MBUFS_PER_PORT
2907 				* valid_num_ports,
2908 				MBUF_SIZE, MBUF_CACHE_SIZE,
2909 				sizeof(struct rte_pktmbuf_pool_private),
2910 				rte_pktmbuf_pool_init, NULL,
2911 				rte_pktmbuf_init, NULL,
2912 				rte_socket_id(), 0);
2913 		if (mbuf_pool == NULL)
2914 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2915 
2916 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2917 			vpool_array[queue_id].pool = mbuf_pool;
2918 
2919 		if (vm2vm_mode == VM2VM_HARDWARE) {
2920 			/* Enable VT loop back to let L2 switch to do it. */
2921 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2922 			LOG_DEBUG(VHOST_CONFIG,
2923 				"Enable loop back for L2 switch in vmdq.\n");
2924 		}
2925 	} else {
2926 		uint32_t nb_mbuf;
2927 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2928 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2929 
2930 		/*
2931 		 * Zero copy defers queue RX/TX start to the time when guest
2932 		 * finishes its startup and packet buffers from that guest are
2933 		 * available.
2934 		 */
2935 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2936 		rx_conf_default.rx_drop_en = 0;
2937 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2938 		nb_mbuf = num_rx_descriptor
2939 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2940 			+ num_switching_cores * MAX_PKT_BURST;
2941 
2942 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2943 			snprintf(pool_name, sizeof(pool_name),
2944 				"rxmbuf_pool_%u", queue_id);
2945 			snprintf(ring_name, sizeof(ring_name),
2946 				"rxmbuf_ring_%u", queue_id);
2947 			setup_mempool_tbl(rte_socket_id(), queue_id,
2948 				pool_name, ring_name, nb_mbuf);
2949 		}
2950 
2951 		nb_mbuf = num_tx_descriptor
2952 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2953 				+ num_switching_cores * MAX_PKT_BURST;
2954 
2955 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2956 			snprintf(pool_name, sizeof(pool_name),
2957 				"txmbuf_pool_%u", queue_id);
2958 			snprintf(ring_name, sizeof(ring_name),
2959 				"txmbuf_ring_%u", queue_id);
2960 			setup_mempool_tbl(rte_socket_id(),
2961 				(queue_id + MAX_QUEUES),
2962 				pool_name, ring_name, nb_mbuf);
2963 		}
2964 
2965 		if (vm2vm_mode == VM2VM_HARDWARE) {
2966 			/* Enable VT loop back to let L2 switch to do it. */
2967 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2968 			LOG_DEBUG(VHOST_CONFIG,
2969 				"Enable loop back for L2 switch in vmdq.\n");
2970 		}
2971 	}
2972 	/* Set log level. */
2973 	rte_set_log_level(LOG_LEVEL);
2974 
2975 	/* initialize all ports */
2976 	for (portid = 0; portid < nb_ports; portid++) {
2977 		/* skip ports that are not enabled */
2978 		if ((enabled_port_mask & (1 << portid)) == 0) {
2979 			RTE_LOG(INFO, VHOST_PORT,
2980 				"Skipping disabled port %d\n", portid);
2981 			continue;
2982 		}
2983 		if (port_init(portid) != 0)
2984 			rte_exit(EXIT_FAILURE,
2985 				"Cannot initialize network ports\n");
2986 	}
2987 
2988 	/* Initialise all linked lists. */
2989 	if (init_data_ll() == -1)
2990 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2991 
2992 	/* Initialize device stats */
2993 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2994 
2995 	/* Enable stats if the user option is set. */
2996 	if (enable_stats)
2997 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2998 
2999 	/* Launch all data cores. */
3000 	if (zero_copy == 0) {
3001 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3002 			rte_eal_remote_launch(switch_worker,
3003 				mbuf_pool, lcore_id);
3004 		}
3005 	} else {
3006 		uint32_t count_in_mempool, index, i;
3007 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3008 			/* For all RX and TX queues. */
3009 			count_in_mempool
3010 				= rte_mempool_count(vpool_array[index].pool);
3011 
3012 			/*
3013 			 * Transfer all un-attached mbufs from vpool.pool
3014 			 * to vpoo.ring.
3015 			 */
3016 			for (i = 0; i < count_in_mempool; i++) {
3017 				struct rte_mbuf *mbuf
3018 					= __rte_mbuf_raw_alloc(
3019 						vpool_array[index].pool);
3020 				rte_ring_sp_enqueue(vpool_array[index].ring,
3021 						(void *)mbuf);
3022 			}
3023 
3024 			LOG_DEBUG(VHOST_CONFIG,
3025 				"in MAIN: mbuf count in mempool at initial "
3026 				"is: %d\n", count_in_mempool);
3027 			LOG_DEBUG(VHOST_CONFIG,
3028 				"in MAIN: mbuf count in  ring at initial  is :"
3029 				" %d\n",
3030 				rte_ring_count(vpool_array[index].ring));
3031 		}
3032 
3033 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3034 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3035 				lcore_id);
3036 	}
3037 
3038 	if (mergeable == 0)
3039 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3040 
3041 	/* Register CUSE device to handle IOCTLs. */
3042 	ret = rte_vhost_driver_register((char *)&dev_basename);
3043 	if (ret != 0)
3044 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3045 
3046 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3047 
3048 	/* Start CUSE session. */
3049 	rte_vhost_driver_session_start();
3050 	return 0;
3051 
3052 }
3053 
3054