xref: /dpdk/examples/vhost/main.c (revision be800696c26efe452a664d4eefebe8da23472139)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffer allocated from host are required for zero copy
74  * implementation, guest will allocate the frame data buffer, and vhost
75  * directly use it.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define MAX_MRG_PKT_BURST 16 	/* Max burst for merge buffers. Set to 1 due to performance issue. */
103 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
104 
105 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
106 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
107 
108 #define JUMBO_FRAME_MAX_SIZE    0x2600
109 
110 /* State of virtio device. */
111 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_RX			1
113 #define DEVICE_SAFE_REMOVE	2
114 
115 /* Config_core_flag status definitions. */
116 #define REQUEST_DEV_REMOVAL 1
117 #define ACK_DEV_REMOVAL 0
118 
119 /* Configurable number of RX/TX ring descriptors */
120 #define RTE_TEST_RX_DESC_DEFAULT 1024
121 #define RTE_TEST_TX_DESC_DEFAULT 512
122 
123 /*
124  * Need refine these 2 macros for legacy and DPDK based front end:
125  * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
126  * And then adjust power 2.
127  */
128 /*
129  * For legacy front end, 128 descriptors,
130  * half for virtio header, another half for mbuf.
131  */
132 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
133 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
134 
135 /* Get first 4 bytes in mbuf headroom. */
136 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
137 		+ sizeof(struct rte_mbuf)))
138 
139 /* true if x is a power of 2 */
140 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
141 
142 #define INVALID_PORT_ID 0xFF
143 
144 /* Max number of devices. Limited by vmdq. */
145 #define MAX_DEVICES 64
146 
147 /* Size of buffers used for snprintfs. */
148 #define MAX_PRINT_BUFF 6072
149 
150 /* Maximum character device basename size. */
151 #define MAX_BASENAME_SZ 10
152 
153 /* Maximum long option length for option parsing. */
154 #define MAX_LONG_OPT_SZ 64
155 
156 /* Used to compare MAC addresses. */
157 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
158 
159 /* Number of descriptors per cacheline. */
160 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
161 
162 /* mask of enabled ports */
163 static uint32_t enabled_port_mask = 0;
164 
165 /*Number of switching cores enabled*/
166 static uint32_t num_switching_cores = 0;
167 
168 /* number of devices/queues to support*/
169 static uint32_t num_queues = 0;
170 uint32_t num_devices = 0;
171 
172 /*
173  * Enable zero copy, pkts buffer will directly dma to hw descriptor,
174  * disabled on default.
175  */
176 static uint32_t zero_copy;
177 
178 /* number of descriptors to apply*/
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in useconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* This can be set by the user so it is made available here. */
222 extern uint64_t VHOST_FEATURES;
223 
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
226 	.rx_thresh = {
227 		.pthresh = RX_PTHRESH,
228 		.hthresh = RX_HTHRESH,
229 		.wthresh = RX_WTHRESH,
230 	},
231 	.rx_drop_en = 1,
232 };
233 
234 /*
235  * These default values are optimized for use with the Intel(R) 82599 10 GbE
236  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237  * network controllers and/or network drivers.
238  */
239 static struct rte_eth_txconf tx_conf_default = {
240 	.tx_thresh = {
241 		.pthresh = TX_PTHRESH,
242 		.hthresh = TX_HTHRESH,
243 		.wthresh = TX_WTHRESH,
244 	},
245 	.tx_free_thresh = 0, /* Use PMD default values */
246 	.tx_rs_thresh = 0, /* Use PMD default values */
247 };
248 
249 /* empty vmdq configuration structure. Filled in programatically */
250 static struct rte_eth_conf vmdq_conf_default = {
251 	.rxmode = {
252 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
253 		.split_hdr_size = 0,
254 		.header_split   = 0, /**< Header Split disabled */
255 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
256 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
257 		/*
258 		 * It is necessary for 1G NIC such as I350,
259 		 * this fixes bug of ipv4 forwarding in guest can't
260 		 * forward pakets from one virtio dev to another virtio dev.
261 		 */
262 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
263 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
264 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
265 	},
266 
267 	.txmode = {
268 		.mq_mode = ETH_MQ_TX_NONE,
269 	},
270 	.rx_adv_conf = {
271 		/*
272 		 * should be overridden separately in code with
273 		 * appropriate values
274 		 */
275 		.vmdq_rx_conf = {
276 			.nb_queue_pools = ETH_8_POOLS,
277 			.enable_default_pool = 0,
278 			.default_pool = 0,
279 			.nb_pool_maps = 0,
280 			.pool_map = {{0, 0},},
281 		},
282 	},
283 };
284 
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288 
289 static const uint16_t external_pkt_default_vlan_tag = 2000;
290 const uint16_t vlan_tags[] = {
291 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
292 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
293 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
294 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
295 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
296 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
297 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
298 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
299 };
300 
301 /* ethernet addresses of ports */
302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
303 
304 /* heads for the main used and free linked lists for the data path. */
305 static struct virtio_net_data_ll *ll_root_used = NULL;
306 static struct virtio_net_data_ll *ll_root_free = NULL;
307 
308 /* Array of data core structures containing information on individual core linked lists. */
309 static struct lcore_info lcore_info[RTE_MAX_LCORE];
310 
311 /* Used for queueing bursts of TX packets. */
312 struct mbuf_table {
313 	unsigned len;
314 	unsigned txq_id;
315 	struct rte_mbuf *m_table[MAX_PKT_BURST];
316 };
317 
318 /* TX queue for each data core. */
319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
320 
321 /* TX queue fori each virtio device for zero copy. */
322 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
323 
324 /* Vlan header struct used to insert vlan tags on TX. */
325 struct vlan_ethhdr {
326 	unsigned char   h_dest[ETH_ALEN];
327 	unsigned char   h_source[ETH_ALEN];
328 	__be16          h_vlan_proto;
329 	__be16          h_vlan_TCI;
330 	__be16          h_vlan_encapsulated_proto;
331 };
332 
333 /* IPv4 Header */
334 struct ipv4_hdr {
335 	uint8_t  version_ihl;		/**< version and header length */
336 	uint8_t  type_of_service;	/**< type of service */
337 	uint16_t total_length;		/**< length of packet */
338 	uint16_t packet_id;		/**< packet ID */
339 	uint16_t fragment_offset;	/**< fragmentation offset */
340 	uint8_t  time_to_live;		/**< time to live */
341 	uint8_t  next_proto_id;		/**< protocol ID */
342 	uint16_t hdr_checksum;		/**< header checksum */
343 	uint32_t src_addr;		/**< source address */
344 	uint32_t dst_addr;		/**< destination address */
345 } __attribute__((__packed__));
346 
347 /* Header lengths. */
348 #define VLAN_HLEN       4
349 #define VLAN_ETH_HLEN   18
350 
351 /* Per-device statistics struct */
352 struct device_statistics {
353 	uint64_t tx_total;
354 	rte_atomic64_t rx_total_atomic;
355 	uint64_t rx_total;
356 	uint64_t tx;
357 	rte_atomic64_t rx_atomic;
358 	uint64_t rx;
359 } __rte_cache_aligned;
360 struct device_statistics dev_statistics[MAX_DEVICES];
361 
362 /*
363  * Builds up the correct configuration for VMDQ VLAN pool map
364  * according to the pool & queue limits.
365  */
366 static inline int
367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
368 {
369 	struct rte_eth_vmdq_rx_conf conf;
370 	unsigned i;
371 
372 	memset(&conf, 0, sizeof(conf));
373 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
374 	conf.nb_pool_maps = num_devices;
375 	conf.enable_loop_back =
376 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
377 
378 	for (i = 0; i < conf.nb_pool_maps; i++) {
379 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
380 		conf.pool_map[i].pools = (1UL << i);
381 	}
382 
383 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
384 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
385 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
386 	return 0;
387 }
388 
389 /*
390  * Validate the device number according to the max pool number gotten form
391  * dev_info. If the device number is invalid, give the error message and
392  * return -1. Each device must have its own pool.
393  */
394 static inline int
395 validate_num_devices(uint32_t max_nb_devices)
396 {
397 	if (num_devices > max_nb_devices) {
398 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
399 		return -1;
400 	}
401 	return 0;
402 }
403 
404 /*
405  * Initialises a given port using global settings and with the rx buffers
406  * coming from the mbuf_pool passed as parameter
407  */
408 static inline int
409 port_init(uint8_t port)
410 {
411 	struct rte_eth_dev_info dev_info;
412 	struct rte_eth_conf port_conf;
413 	uint16_t rx_rings, tx_rings;
414 	uint16_t rx_ring_size, tx_ring_size;
415 	int retval;
416 	uint16_t q;
417 
418 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
419 	rte_eth_dev_info_get (port, &dev_info);
420 
421 	/*configure the number of supported virtio devices based on VMDQ limits */
422 	num_devices = dev_info.max_vmdq_pools;
423 	num_queues = dev_info.max_rx_queues;
424 
425 	if (zero_copy) {
426 		rx_ring_size = num_rx_descriptor;
427 		tx_ring_size = num_tx_descriptor;
428 		tx_rings = dev_info.max_tx_queues;
429 	} else {
430 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
431 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
432 		tx_rings = (uint16_t)rte_lcore_count();
433 	}
434 
435 	retval = validate_num_devices(MAX_DEVICES);
436 	if (retval < 0)
437 		return retval;
438 
439 	/* Get port configuration. */
440 	retval = get_eth_conf(&port_conf, num_devices);
441 	if (retval < 0)
442 		return retval;
443 
444 	if (port >= rte_eth_dev_count()) return -1;
445 
446 	rx_rings = (uint16_t)num_queues,
447 	/* Configure ethernet device. */
448 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449 	if (retval != 0)
450 		return retval;
451 
452 	/* Setup the queues. */
453 	for (q = 0; q < rx_rings; q ++) {
454 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455 						rte_eth_dev_socket_id(port), &rx_conf_default,
456 						vpool_array[q].pool);
457 		if (retval < 0)
458 			return retval;
459 	}
460 	for (q = 0; q < tx_rings; q ++) {
461 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 						rte_eth_dev_socket_id(port), &tx_conf_default);
463 		if (retval < 0)
464 			return retval;
465 	}
466 
467 	/* Start the device. */
468 	retval  = rte_eth_dev_start(port);
469 	if (retval < 0) {
470 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471 		return retval;
472 	}
473 
474 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
475 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
476 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
477 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
478 			(unsigned)port,
479 			vmdq_ports_eth_addr[port].addr_bytes[0],
480 			vmdq_ports_eth_addr[port].addr_bytes[1],
481 			vmdq_ports_eth_addr[port].addr_bytes[2],
482 			vmdq_ports_eth_addr[port].addr_bytes[3],
483 			vmdq_ports_eth_addr[port].addr_bytes[4],
484 			vmdq_ports_eth_addr[port].addr_bytes[5]);
485 
486 	return 0;
487 }
488 
489 /*
490  * Set character device basename.
491  */
492 static int
493 us_vhost_parse_basename(const char *q_arg)
494 {
495 	/* parse number string */
496 
497 	if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
498 		return -1;
499 	else
500 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
501 
502 	return 0;
503 }
504 
505 /*
506  * Parse the portmask provided at run time.
507  */
508 static int
509 parse_portmask(const char *portmask)
510 {
511 	char *end = NULL;
512 	unsigned long pm;
513 
514 	errno = 0;
515 
516 	/* parse hexadecimal string */
517 	pm = strtoul(portmask, &end, 16);
518 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
519 		return -1;
520 
521 	if (pm == 0)
522 		return -1;
523 
524 	return pm;
525 
526 }
527 
528 /*
529  * Parse num options at run time.
530  */
531 static int
532 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
533 {
534 	char *end = NULL;
535 	unsigned long num;
536 
537 	errno = 0;
538 
539 	/* parse unsigned int string */
540 	num = strtoul(q_arg, &end, 10);
541 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
542 		return -1;
543 
544 	if (num > max_valid_value)
545 		return -1;
546 
547 	return num;
548 
549 }
550 
551 /*
552  * Display usage
553  */
554 static void
555 us_vhost_usage(const char *prgname)
556 {
557 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
558 	"		--vm2vm [0|1|2]\n"
559 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
560 	"		--dev-basename <name>\n"
561 	"		--nb-devices ND\n"
562 	"		-p PORTMASK: Set mask for ports to be used by application\n"
563 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
564 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
565 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
566 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
567 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
568 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
569 	"		--dev-basename: The basename to be used for the character device.\n"
570 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
571 			"zero copy\n"
572 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
573 			"used only when zero copy is enabled.\n"
574 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
575 			"used only when zero copy is enabled.\n",
576 	       prgname);
577 }
578 
579 /*
580  * Parse the arguments given in the command line of the application.
581  */
582 static int
583 us_vhost_parse_args(int argc, char **argv)
584 {
585 	int opt, ret;
586 	int option_index;
587 	unsigned i;
588 	const char *prgname = argv[0];
589 	static struct option long_option[] = {
590 		{"vm2vm", required_argument, NULL, 0},
591 		{"rx-retry", required_argument, NULL, 0},
592 		{"rx-retry-delay", required_argument, NULL, 0},
593 		{"rx-retry-num", required_argument, NULL, 0},
594 		{"mergeable", required_argument, NULL, 0},
595 		{"stats", required_argument, NULL, 0},
596 		{"dev-basename", required_argument, NULL, 0},
597 		{"zero-copy", required_argument, NULL, 0},
598 		{"rx-desc-num", required_argument, NULL, 0},
599 		{"tx-desc-num", required_argument, NULL, 0},
600 		{NULL, 0, 0, 0},
601 	};
602 
603 	/* Parse command line */
604 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
605 		switch (opt) {
606 		/* Portmask */
607 		case 'p':
608 			enabled_port_mask = parse_portmask(optarg);
609 			if (enabled_port_mask == 0) {
610 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611 				us_vhost_usage(prgname);
612 				return -1;
613 			}
614 			break;
615 
616 		case 0:
617 			/* Enable/disable vm2vm comms. */
618 			if (!strncmp(long_option[option_index].name, "vm2vm",
619 				MAX_LONG_OPT_SZ)) {
620 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
621 				if (ret == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 						"Invalid argument for "
624 						"vm2vm [0|1|2]\n");
625 					us_vhost_usage(prgname);
626 					return -1;
627 				} else {
628 					vm2vm_mode = (vm2vm_type)ret;
629 				}
630 			}
631 
632 			/* Enable/disable retries on RX. */
633 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
634 				ret = parse_num_opt(optarg, 1);
635 				if (ret == -1) {
636 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
637 					us_vhost_usage(prgname);
638 					return -1;
639 				} else {
640 					enable_retry = ret;
641 				}
642 			}
643 
644 			/* Specify the retries delay time (in useconds) on RX. */
645 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
646 				ret = parse_num_opt(optarg, INT32_MAX);
647 				if (ret == -1) {
648 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					burst_rx_delay_time = ret;
653 				}
654 			}
655 
656 			/* Specify the retries number on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, INT32_MAX);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					burst_rx_retry_num = ret;
665 				}
666 			}
667 
668 			/* Enable/disable RX mergeable buffers. */
669 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else {
676 					if (ret) {
677 						vmdq_conf_default.rxmode.jumbo_frame = 1;
678 						vmdq_conf_default.rxmode.max_rx_pkt_len
679 							= JUMBO_FRAME_MAX_SIZE;
680 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
681 					}
682 				}
683 			}
684 
685 			/* Enable/disable stats. */
686 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
687 				ret = parse_num_opt(optarg, INT32_MAX);
688 				if (ret == -1) {
689 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
690 					us_vhost_usage(prgname);
691 					return -1;
692 				} else {
693 					enable_stats = ret;
694 				}
695 			}
696 
697 			/* Set character device basename. */
698 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
699 				if (us_vhost_parse_basename(optarg) == -1) {
700 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
701 					us_vhost_usage(prgname);
702 					return -1;
703 				}
704 			}
705 
706 			/* Enable/disable rx/tx zero copy. */
707 			if (!strncmp(long_option[option_index].name,
708 				"zero-copy", MAX_LONG_OPT_SZ)) {
709 				ret = parse_num_opt(optarg, 1);
710 				if (ret == -1) {
711 					RTE_LOG(INFO, VHOST_CONFIG,
712 						"Invalid argument"
713 						" for zero-copy [0|1]\n");
714 					us_vhost_usage(prgname);
715 					return -1;
716 				} else
717 					zero_copy = ret;
718 
719 				if (zero_copy) {
720 #ifdef RTE_MBUF_REFCNT
721 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
722 					"zero copy vhost APP, please "
723 					"disable RTE_MBUF_REFCNT\n"
724 					"in config file and then rebuild DPDK "
725 					"core lib!\n"
726 					"Otherwise please disable zero copy "
727 					"flag in command line!\n");
728 					return -1;
729 #endif
730 				}
731 			}
732 
733 			/* Specify the descriptor number on RX. */
734 			if (!strncmp(long_option[option_index].name,
735 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
736 				ret = parse_num_opt(optarg, MAX_RING_DESC);
737 				if ((ret == -1) || (!POWEROF2(ret))) {
738 					RTE_LOG(INFO, VHOST_CONFIG,
739 					"Invalid argument for rx-desc-num[0-N],"
740 					"power of 2 required.\n");
741 					us_vhost_usage(prgname);
742 					return -1;
743 				} else {
744 					num_rx_descriptor = ret;
745 				}
746 			}
747 
748 			/* Specify the descriptor number on TX. */
749 			if (!strncmp(long_option[option_index].name,
750 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
751 				ret = parse_num_opt(optarg, MAX_RING_DESC);
752 				if ((ret == -1) || (!POWEROF2(ret))) {
753 					RTE_LOG(INFO, VHOST_CONFIG,
754 					"Invalid argument for tx-desc-num [0-N],"
755 					"power of 2 required.\n");
756 					us_vhost_usage(prgname);
757 					return -1;
758 				} else {
759 					num_tx_descriptor = ret;
760 				}
761 			}
762 
763 			break;
764 
765 			/* Invalid option - print options. */
766 		default:
767 			us_vhost_usage(prgname);
768 			return -1;
769 		}
770 	}
771 
772 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
773 		if (enabled_port_mask & (1 << i))
774 			ports[num_ports++] = (uint8_t)i;
775 	}
776 
777 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
778 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
779 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
780 		return -1;
781 	}
782 
783 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
784 		RTE_LOG(INFO, VHOST_PORT,
785 			"Vhost zero copy doesn't support software vm2vm,"
786 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
787 		return -1;
788 	}
789 
790 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
791 		RTE_LOG(INFO, VHOST_PORT,
792 			"Vhost zero copy doesn't support jumbo frame,"
793 			"please specify '--mergeable 0' to disable the "
794 			"mergeable feature.\n");
795 		return -1;
796 	}
797 
798 	return 0;
799 }
800 
801 /*
802  * Update the global var NUM_PORTS and array PORTS according to system ports number
803  * and return valid ports number
804  */
805 static unsigned check_ports_num(unsigned nb_ports)
806 {
807 	unsigned valid_num_ports = num_ports;
808 	unsigned portid;
809 
810 	if (num_ports > nb_ports) {
811 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
812 			num_ports, nb_ports);
813 		num_ports = nb_ports;
814 	}
815 
816 	for (portid = 0; portid < num_ports; portid ++) {
817 		if (ports[portid] >= nb_ports) {
818 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
819 				ports[portid], (nb_ports - 1));
820 			ports[portid] = INVALID_PORT_ID;
821 			valid_num_ports--;
822 		}
823 	}
824 	return valid_num_ports;
825 }
826 
827 /*
828  * Macro to print out packet contents. Wrapped in debug define so that the
829  * data path is not effected when debug is disabled.
830  */
831 #ifdef DEBUG
832 #define PRINT_PACKET(device, addr, size, header) do {																\
833 	char *pkt_addr = (char*)(addr);																					\
834 	unsigned int index;																								\
835 	char packet[MAX_PRINT_BUFF];																					\
836 																													\
837 	if ((header))																									\
838 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
839 	else																											\
840 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
841 	for (index = 0; index < (size); index++) {																		\
842 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
843 			"%02hhx ", pkt_addr[index]);																			\
844 	}																												\
845 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
846 																													\
847 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
848 } while(0)
849 #else
850 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
851 #endif
852 
853 /*
854  * Function to convert guest physical addresses to vhost physical addresses.
855  * This is used to convert virtio buffer addresses.
856  */
857 static inline uint64_t __attribute__((always_inline))
858 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
859 	uint32_t buf_len, hpa_type *addr_type)
860 {
861 	struct virtio_memory_regions_hpa *region;
862 	uint32_t regionidx;
863 	uint64_t vhost_pa = 0;
864 
865 	*addr_type = PHYS_ADDR_INVALID;
866 
867 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
868 		region = &vdev->regions_hpa[regionidx];
869 		if ((guest_pa >= region->guest_phys_address) &&
870 			(guest_pa <= region->guest_phys_address_end)) {
871 			vhost_pa = region->host_phys_addr_offset + guest_pa;
872 			if (likely((guest_pa + buf_len - 1)
873 				<= region->guest_phys_address_end))
874 				*addr_type = PHYS_ADDR_CONTINUOUS;
875 			else
876 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
877 			break;
878 		}
879 	}
880 
881 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
882 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
883 		(void *)(uintptr_t)vhost_pa);
884 
885 	return vhost_pa;
886 }
887 
888 /*
889  * Compares a packet destination MAC address to a device MAC address.
890  */
891 static inline int __attribute__((always_inline))
892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
893 {
894 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
895 }
896 
897 /*
898  * This function learns the MAC address of the device and registers this along with a
899  * vlan tag to a VMDQ.
900  */
901 static int
902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
903 {
904 	struct ether_hdr *pkt_hdr;
905 	struct virtio_net_data_ll *dev_ll;
906 	struct virtio_net *dev = vdev->dev;
907 	int i, ret;
908 
909 	/* Learn MAC address of guest device from packet */
910 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
911 
912 	dev_ll = ll_root_used;
913 
914 	while (dev_ll != NULL) {
915 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
916 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
917 			return -1;
918 		}
919 		dev_ll = dev_ll->next;
920 	}
921 
922 	for (i = 0; i < ETHER_ADDR_LEN; i++)
923 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
924 
925 	/* vlan_tag currently uses the device_id. */
926 	vdev->vlan_tag = vlan_tags[dev->device_fh];
927 
928 	/* Print out VMDQ registration info. */
929 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
930 		dev->device_fh,
931 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
932 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
933 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
934 		vdev->vlan_tag);
935 
936 	/* Register the MAC address. */
937 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
938 	if (ret)
939 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
940 					dev->device_fh);
941 
942 	/* Enable stripping of the vlan tag as we handle routing. */
943 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
944 
945 	/* Set device as ready for RX. */
946 	vdev->ready = DEVICE_RX;
947 
948 	return 0;
949 }
950 
951 /*
952  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
953  * queue before disabling RX on the device.
954  */
955 static inline void
956 unlink_vmdq(struct vhost_dev *vdev)
957 {
958 	unsigned i = 0;
959 	unsigned rx_count;
960 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
961 
962 	if (vdev->ready == DEVICE_RX) {
963 		/*clear MAC and VLAN settings*/
964 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
965 		for (i = 0; i < 6; i++)
966 			vdev->mac_address.addr_bytes[i] = 0;
967 
968 		vdev->vlan_tag = 0;
969 
970 		/*Clear out the receive buffers*/
971 		rx_count = rte_eth_rx_burst(ports[0],
972 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
973 
974 		while (rx_count) {
975 			for (i = 0; i < rx_count; i++)
976 				rte_pktmbuf_free(pkts_burst[i]);
977 
978 			rx_count = rte_eth_rx_burst(ports[0],
979 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980 		}
981 
982 		vdev->ready = DEVICE_MAC_LEARNING;
983 	}
984 }
985 
986 /*
987  * Check if the packet destination MAC address is for a local device. If so then put
988  * the packet on that devices RX queue. If not then return.
989  */
990 static inline unsigned __attribute__((always_inline))
991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
992 {
993 	struct virtio_net_data_ll *dev_ll;
994 	struct ether_hdr *pkt_hdr;
995 	uint64_t ret = 0;
996 	struct virtio_net *dev = vdev->dev;
997 	struct virtio_net *tdev; /* destination virito device */
998 
999 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1000 
1001 	/*get the used devices list*/
1002 	dev_ll = ll_root_used;
1003 
1004 	while (dev_ll != NULL) {
1005 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1006 				          &dev_ll->vdev->mac_address)) {
1007 
1008 			/* Drop the packet if the TX packet is destined for the TX device. */
1009 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1010 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1011 							dev->device_fh);
1012 				return 0;
1013 			}
1014 			tdev = dev_ll->vdev->dev;
1015 
1016 
1017 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1018 
1019 			if (dev_ll->vdev->remove) {
1020 				/*drop the packet if the device is marked for removal*/
1021 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1022 			} else {
1023 				/*send the packet to the local virtio device*/
1024 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1025 				if (enable_stats) {
1026 					rte_atomic64_add(
1027 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1028 					1);
1029 					rte_atomic64_add(
1030 					&dev_statistics[tdev->device_fh].rx_atomic,
1031 					ret);
1032 					dev_statistics[tdev->device_fh].tx_total++;
1033 					dev_statistics[tdev->device_fh].tx += ret;
1034 				}
1035 			}
1036 
1037 			return 0;
1038 		}
1039 		dev_ll = dev_ll->next;
1040 	}
1041 
1042 	return -1;
1043 }
1044 
1045 /*
1046  * This function routes the TX packet to the correct interface. This may be a local device
1047  * or the physical port.
1048  */
1049 static inline void __attribute__((always_inline))
1050 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1051 {
1052 	struct mbuf_table *tx_q;
1053 	struct vlan_ethhdr *vlan_hdr;
1054 	struct rte_mbuf **m_table;
1055 	struct rte_mbuf *mbuf, *prev;
1056 	unsigned len, ret, offset = 0;
1057 	const uint16_t lcore_id = rte_lcore_id();
1058 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1059 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1060 	struct virtio_net *dev = vdev->dev;
1061 
1062 	/*check if destination is local VM*/
1063 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1064 		return;
1065 
1066 	if (vm2vm_mode == VM2VM_HARDWARE) {
1067 		while (dev_ll != NULL) {
1068 			if ((dev_ll->vdev->ready == DEVICE_RX)
1069 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1070 				&dev_ll->vdev->mac_address)) {
1071 				/*
1072 				 * Drop the packet if the TX packet is
1073 				 * destined for the TX device.
1074 				 */
1075 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1076 					LOG_DEBUG(VHOST_DATA,
1077 					"(%"PRIu64") TX: Source and destination"
1078 					" MAC addresses are the same. Dropping "
1079 					"packet.\n",
1080 					dev_ll->vdev->device_fh);
1081 					return;
1082 				}
1083 				offset = 4;
1084 				vlan_tag =
1085 				(uint16_t)
1086 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1087 
1088 				LOG_DEBUG(VHOST_DATA,
1089 				"(%"PRIu64") TX: pkt to local VM device id:"
1090 				"(%"PRIu64") vlan tag: %d.\n",
1091 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1092 				vlan_tag);
1093 
1094 				break;
1095 			}
1096 			dev_ll = dev_ll->next;
1097 		}
1098 	}
1099 
1100 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1101 
1102 	/*Add packet to the port tx queue*/
1103 	tx_q = &lcore_tx_queue[lcore_id];
1104 	len = tx_q->len;
1105 
1106 	/* Allocate an mbuf and populate the structure. */
1107 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1108 	if (unlikely(mbuf == NULL)) {
1109 		RTE_LOG(ERR, VHOST_DATA,
1110 			"Failed to allocate memory for mbuf.\n");
1111 		return;
1112 	}
1113 
1114 	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1115 	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1116 	mbuf->nb_segs = m->nb_segs;
1117 
1118 	/* Copy ethernet header to mbuf. */
1119 	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1120 		rte_pktmbuf_mtod(m, const void *),
1121 		ETH_HLEN);
1122 
1123 
1124 	/* Setup vlan header. Bytes need to be re-ordered for network with htons()*/
1125 	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1126 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1127 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1128 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1129 
1130 	/* Copy the remaining packet contents to the mbuf. */
1131 	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1132 		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1133 		(m->data_len - ETH_HLEN));
1134 
1135 	/* Copy the remaining segments for the whole packet. */
1136 	prev = mbuf;
1137 	while (m->next) {
1138 		/* Allocate an mbuf and populate the structure. */
1139 		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1140 		if (unlikely(next_mbuf == NULL)) {
1141 			rte_pktmbuf_free(mbuf);
1142 			RTE_LOG(ERR, VHOST_DATA,
1143 				"Failed to allocate memory for mbuf.\n");
1144 			return;
1145 		}
1146 
1147 		m = m->next;
1148 		prev->next = next_mbuf;
1149 		prev = next_mbuf;
1150 		next_mbuf->data_len = m->data_len;
1151 
1152 		/* Copy data to next mbuf. */
1153 		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1154 			rte_pktmbuf_mtod(m, const void *), m->data_len);
1155 	}
1156 
1157 	tx_q->m_table[len] = mbuf;
1158 	len++;
1159 	if (enable_stats) {
1160 		dev_statistics[dev->device_fh].tx_total++;
1161 		dev_statistics[dev->device_fh].tx++;
1162 	}
1163 
1164 	if (unlikely(len == MAX_PKT_BURST)) {
1165 		m_table = (struct rte_mbuf **)tx_q->m_table;
1166 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1167 		/* Free any buffers not handled by TX and update the port stats. */
1168 		if (unlikely(ret < len)) {
1169 			do {
1170 				rte_pktmbuf_free(m_table[ret]);
1171 			} while (++ret < len);
1172 		}
1173 
1174 		len = 0;
1175 	}
1176 
1177 	tx_q->len = len;
1178 	return;
1179 }
1180 /*
1181  * This function is called by each data core. It handles all RX/TX registered with the
1182  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1183  * with all devices in the main linked list.
1184  */
1185 static int
1186 switch_worker(__attribute__((unused)) void *arg)
1187 {
1188 	struct rte_mempool *mbuf_pool = arg;
1189 	struct virtio_net *dev = NULL;
1190 	struct vhost_dev *vdev = NULL;
1191 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1192 	struct virtio_net_data_ll *dev_ll;
1193 	struct mbuf_table *tx_q;
1194 	volatile struct lcore_ll_info *lcore_ll;
1195 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1196 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1197 	unsigned ret, i;
1198 	const uint16_t lcore_id = rte_lcore_id();
1199 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1200 	uint16_t rx_count = 0;
1201 	uint16_t tx_count;
1202 	uint32_t retry = 0;
1203 
1204 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1205 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1206 	prev_tsc = 0;
1207 
1208 	tx_q = &lcore_tx_queue[lcore_id];
1209 	for (i = 0; i < num_cores; i ++) {
1210 		if (lcore_ids[i] == lcore_id) {
1211 			tx_q->txq_id = i;
1212 			break;
1213 		}
1214 	}
1215 
1216 	while(1) {
1217 		cur_tsc = rte_rdtsc();
1218 		/*
1219 		 * TX burst queue drain
1220 		 */
1221 		diff_tsc = cur_tsc - prev_tsc;
1222 		if (unlikely(diff_tsc > drain_tsc)) {
1223 
1224 			if (tx_q->len) {
1225 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1226 
1227 				/*Tx any packets in the queue*/
1228 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1229 									   (struct rte_mbuf **)tx_q->m_table,
1230 									   (uint16_t)tx_q->len);
1231 				if (unlikely(ret < tx_q->len)) {
1232 					do {
1233 						rte_pktmbuf_free(tx_q->m_table[ret]);
1234 					} while (++ret < tx_q->len);
1235 				}
1236 
1237 				tx_q->len = 0;
1238 			}
1239 
1240 			prev_tsc = cur_tsc;
1241 
1242 		}
1243 
1244 		rte_prefetch0(lcore_ll->ll_root_used);
1245 		/*
1246 		 * Inform the configuration core that we have exited the linked list and that no devices are
1247 		 * in use if requested.
1248 		 */
1249 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1250 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1251 
1252 		/*
1253 		 * Process devices
1254 		 */
1255 		dev_ll = lcore_ll->ll_root_used;
1256 
1257 		while (dev_ll != NULL) {
1258 			/*get virtio device ID*/
1259 			vdev = dev_ll->vdev;
1260 			dev = vdev->dev;
1261 
1262 			if (vdev->remove) {
1263 				dev_ll = dev_ll->next;
1264 				unlink_vmdq(vdev);
1265 				vdev->ready = DEVICE_SAFE_REMOVE;
1266 				continue;
1267 			}
1268 			if (likely(vdev->ready == DEVICE_RX)) {
1269 				/*Handle guest RX*/
1270 				rx_count = rte_eth_rx_burst(ports[0],
1271 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1272 
1273 				if (rx_count) {
1274 					/*
1275 					* Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1276 					* Here MAX_PKT_BURST must be less than virtio queue size
1277 					*/
1278 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1279 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1280 							rte_delay_us(burst_rx_delay_time);
1281 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1282 								break;
1283 						}
1284 					}
1285 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1286 					if (enable_stats) {
1287 						rte_atomic64_add(
1288 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1289 						rx_count);
1290 						rte_atomic64_add(
1291 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1292 					}
1293 					while (likely(rx_count)) {
1294 						rx_count--;
1295 						rte_pktmbuf_free(pkts_burst[rx_count]);
1296 					}
1297 
1298 				}
1299 			}
1300 
1301 			if (!vdev->remove) {
1302 				/* Handle guest TX*/
1303 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1304 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1305 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1306 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1307 						while (tx_count--)
1308 							rte_pktmbuf_free(pkts_burst[tx_count]);
1309 					}
1310 				}
1311 				while (tx_count)
1312 					virtio_tx_route(vdev, pkts_burst[--tx_count], mbuf_pool, (uint16_t)dev->device_fh);
1313 			}
1314 
1315 			/*move to the next device in the list*/
1316 			dev_ll = dev_ll->next;
1317 		}
1318 	}
1319 
1320 	return 0;
1321 }
1322 
1323 /*
1324  * This function gets available ring number for zero copy rx.
1325  * Only one thread will call this funciton for a paticular virtio device,
1326  * so, it is designed as non-thread-safe function.
1327  */
1328 static inline uint32_t __attribute__((always_inline))
1329 get_available_ring_num_zcp(struct virtio_net *dev)
1330 {
1331 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1332 	uint16_t avail_idx;
1333 
1334 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1335 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1336 }
1337 
1338 /*
1339  * This function gets available ring index for zero copy rx,
1340  * it will retry 'burst_rx_retry_num' times till it get enough ring index.
1341  * Only one thread will call this funciton for a paticular virtio device,
1342  * so, it is designed as non-thread-safe function.
1343  */
1344 static inline uint32_t __attribute__((always_inline))
1345 get_available_ring_index_zcp(struct virtio_net *dev,
1346 	uint16_t *res_base_idx, uint32_t count)
1347 {
1348 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1349 	uint16_t avail_idx;
1350 	uint32_t retry = 0;
1351 	uint16_t free_entries;
1352 
1353 	*res_base_idx = vq->last_used_idx_res;
1354 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1355 	free_entries = (avail_idx - *res_base_idx);
1356 
1357 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1358 			"avail idx: %d, "
1359 			"res base idx:%d, free entries:%d\n",
1360 			dev->device_fh, avail_idx, *res_base_idx,
1361 			free_entries);
1362 
1363 	/*
1364 	 * If retry is enabled and the queue is full then we wait
1365 	 * and retry to avoid packet loss.
1366 	 */
1367 	if (enable_retry && unlikely(count > free_entries)) {
1368 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1369 			rte_delay_us(burst_rx_delay_time);
1370 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1371 			free_entries = (avail_idx - *res_base_idx);
1372 			if (count <= free_entries)
1373 				break;
1374 		}
1375 	}
1376 
1377 	/*check that we have enough buffers*/
1378 	if (unlikely(count > free_entries))
1379 		count = free_entries;
1380 
1381 	if (unlikely(count == 0)) {
1382 		LOG_DEBUG(VHOST_DATA,
1383 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1384 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1385 			dev->device_fh, avail_idx,
1386 			*res_base_idx, free_entries);
1387 		return 0;
1388 	}
1389 
1390 	vq->last_used_idx_res = *res_base_idx + count;
1391 
1392 	return count;
1393 }
1394 
1395 /*
1396  * This function put descriptor back to used list.
1397  */
1398 static inline void __attribute__((always_inline))
1399 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1400 {
1401 	uint16_t res_cur_idx = vq->last_used_idx;
1402 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1403 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1404 	rte_compiler_barrier();
1405 	*(volatile uint16_t *)&vq->used->idx += 1;
1406 	vq->last_used_idx += 1;
1407 
1408 	/* Kick the guest if necessary. */
1409 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1410 		eventfd_write((int)vq->kickfd, 1);
1411 }
1412 
1413 /*
1414  * This function get available descriptor from vitio vring and un-attached mbuf
1415  * from vpool->ring, and then attach them together. It needs adjust the offset
1416  * for buff_addr and phys_addr accroding to PMD implementation, otherwise the
1417  * frame data may be put to wrong location in mbuf.
1418  */
1419 static inline void __attribute__((always_inline))
1420 attach_rxmbuf_zcp(struct virtio_net *dev)
1421 {
1422 	uint16_t res_base_idx, desc_idx;
1423 	uint64_t buff_addr, phys_addr;
1424 	struct vhost_virtqueue *vq;
1425 	struct vring_desc *desc;
1426 	struct rte_mbuf *mbuf = NULL;
1427 	struct vpool *vpool;
1428 	hpa_type addr_type;
1429 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1430 
1431 	vpool = &vpool_array[vdev->vmdq_rx_q];
1432 	vq = dev->virtqueue[VIRTIO_RXQ];
1433 
1434 	do {
1435 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1436 				1) != 1))
1437 			return;
1438 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1439 
1440 		desc = &vq->desc[desc_idx];
1441 		if (desc->flags & VRING_DESC_F_NEXT) {
1442 			desc = &vq->desc[desc->next];
1443 			buff_addr = gpa_to_vva(dev, desc->addr);
1444 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1445 					&addr_type);
1446 		} else {
1447 			buff_addr = gpa_to_vva(dev,
1448 					desc->addr + vq->vhost_hlen);
1449 			phys_addr = gpa_to_hpa(vdev,
1450 					desc->addr + vq->vhost_hlen,
1451 					desc->len, &addr_type);
1452 		}
1453 
1454 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1455 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1456 				" address found when attaching RX frame buffer"
1457 				" address!\n", dev->device_fh);
1458 			put_desc_to_used_list_zcp(vq, desc_idx);
1459 			continue;
1460 		}
1461 
1462 		/*
1463 		 * Check if the frame buffer address from guest crosses
1464 		 * sub-region or not.
1465 		 */
1466 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1467 			RTE_LOG(ERR, VHOST_DATA,
1468 				"(%"PRIu64") Frame buffer address cross "
1469 				"sub-regioin found when attaching RX frame "
1470 				"buffer address!\n",
1471 				dev->device_fh);
1472 			put_desc_to_used_list_zcp(vq, desc_idx);
1473 			continue;
1474 		}
1475 	} while (unlikely(phys_addr == 0));
1476 
1477 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1478 	if (unlikely(mbuf == NULL)) {
1479 		LOG_DEBUG(VHOST_DATA,
1480 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1481 			"ring_sc_dequeue fail.\n",
1482 			dev->device_fh);
1483 		put_desc_to_used_list_zcp(vq, desc_idx);
1484 		return;
1485 	}
1486 
1487 	if (unlikely(vpool->buf_size > desc->len)) {
1488 		LOG_DEBUG(VHOST_DATA,
1489 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1490 			"length(%d) of descriptor idx: %d less than room "
1491 			"size required: %d\n",
1492 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1493 		put_desc_to_used_list_zcp(vq, desc_idx);
1494 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1495 		return;
1496 	}
1497 
1498 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1499 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1500 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1501 	mbuf->data_len = desc->len;
1502 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1503 
1504 	LOG_DEBUG(VHOST_DATA,
1505 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1506 		"descriptor idx:%d\n",
1507 		dev->device_fh, res_base_idx, desc_idx);
1508 
1509 	__rte_mbuf_raw_free(mbuf);
1510 
1511 	return;
1512 }
1513 
1514 /*
1515  * Detach an attched packet mbuf -
1516  *  - restore original mbuf address and length values.
1517  *  - reset pktmbuf data and data_len to their default values.
1518  *  All other fields of the given packet mbuf will be left intact.
1519  *
1520  * @param m
1521  *   The attached packet mbuf.
1522  */
1523 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1524 {
1525 	const struct rte_mempool *mp = m->pool;
1526 	void *buf = RTE_MBUF_TO_BADDR(m);
1527 	uint32_t buf_ofs;
1528 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1529 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1530 
1531 	m->buf_addr = buf;
1532 	m->buf_len = (uint16_t)buf_len;
1533 
1534 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1535 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1536 	m->data_off = buf_ofs;
1537 
1538 	m->data_len = 0;
1539 }
1540 
1541 /*
1542  * This function is called after packets have been transimited. It fetchs mbuf
1543  * from vpool->pool, detached it and put into vpool->ring. It also update the
1544  * used index and kick the guest if necessary.
1545  */
1546 static inline uint32_t __attribute__((always_inline))
1547 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1548 {
1549 	struct rte_mbuf *mbuf;
1550 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1551 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1552 	uint32_t index = 0;
1553 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1554 
1555 	LOG_DEBUG(VHOST_DATA,
1556 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1557 		"clean is: %d\n",
1558 		dev->device_fh, mbuf_count);
1559 	LOG_DEBUG(VHOST_DATA,
1560 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1561 		"clean  is : %d\n",
1562 		dev->device_fh, rte_ring_count(vpool->ring));
1563 
1564 	for (index = 0; index < mbuf_count; index++) {
1565 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1566 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1567 			pktmbuf_detach_zcp(mbuf);
1568 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1569 
1570 		/* Update used index buffer information. */
1571 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1572 		vq->used->ring[used_idx].len = 0;
1573 
1574 		used_idx = (used_idx + 1) & (vq->size - 1);
1575 	}
1576 
1577 	LOG_DEBUG(VHOST_DATA,
1578 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1579 		"clean is: %d\n",
1580 		dev->device_fh, rte_mempool_count(vpool->pool));
1581 	LOG_DEBUG(VHOST_DATA,
1582 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1583 		"clean  is : %d\n",
1584 		dev->device_fh, rte_ring_count(vpool->ring));
1585 	LOG_DEBUG(VHOST_DATA,
1586 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1587 		"vq->last_used_idx:%d\n",
1588 		dev->device_fh, vq->last_used_idx);
1589 
1590 	vq->last_used_idx += mbuf_count;
1591 
1592 	LOG_DEBUG(VHOST_DATA,
1593 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1594 		"vq->last_used_idx:%d\n",
1595 		dev->device_fh, vq->last_used_idx);
1596 
1597 	rte_compiler_barrier();
1598 
1599 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1600 
1601 	/* Kick guest if required. */
1602 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1603 		eventfd_write((int)vq->kickfd, 1);
1604 
1605 	return 0;
1606 }
1607 
1608 /*
1609  * This function is called when a virtio device is destroy.
1610  * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring.
1611  */
1612 static void mbuf_destroy_zcp(struct vpool *vpool)
1613 {
1614 	struct rte_mbuf *mbuf = NULL;
1615 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1616 
1617 	LOG_DEBUG(VHOST_CONFIG,
1618 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1619 		"mbuf_destroy_zcp is: %d\n",
1620 		mbuf_count);
1621 	LOG_DEBUG(VHOST_CONFIG,
1622 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1623 		"mbuf_destroy_zcp  is : %d\n",
1624 		rte_ring_count(vpool->ring));
1625 
1626 	for (index = 0; index < mbuf_count; index++) {
1627 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1628 		if (likely(mbuf != NULL)) {
1629 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1630 				pktmbuf_detach_zcp(mbuf);
1631 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1632 		}
1633 	}
1634 
1635 	LOG_DEBUG(VHOST_CONFIG,
1636 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1637 		"mbuf_destroy_zcp is: %d\n",
1638 		rte_mempool_count(vpool->pool));
1639 	LOG_DEBUG(VHOST_CONFIG,
1640 		"in mbuf_destroy_zcp: mbuf count in ring after "
1641 		"mbuf_destroy_zcp is : %d\n",
1642 		rte_ring_count(vpool->ring));
1643 }
1644 
1645 /*
1646  * This function update the use flag and counter.
1647  */
1648 static inline uint32_t __attribute__((always_inline))
1649 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1650 	uint32_t count)
1651 {
1652 	struct vhost_virtqueue *vq;
1653 	struct vring_desc *desc;
1654 	struct rte_mbuf *buff;
1655 	/* The virtio_hdr is initialised to 0. */
1656 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1657 		= {{0, 0, 0, 0, 0, 0}, 0};
1658 	uint64_t buff_hdr_addr = 0;
1659 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1660 	uint32_t head_idx, packet_success = 0;
1661 	uint16_t res_cur_idx;
1662 
1663 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1664 
1665 	if (count == 0)
1666 		return 0;
1667 
1668 	vq = dev->virtqueue[VIRTIO_RXQ];
1669 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1670 
1671 	res_cur_idx = vq->last_used_idx;
1672 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1673 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1674 
1675 	/* Retrieve all of the head indexes first to avoid caching issues. */
1676 	for (head_idx = 0; head_idx < count; head_idx++)
1677 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1678 
1679 	/*Prefetch descriptor index. */
1680 	rte_prefetch0(&vq->desc[head[packet_success]]);
1681 
1682 	while (packet_success != count) {
1683 		/* Get descriptor from available ring */
1684 		desc = &vq->desc[head[packet_success]];
1685 
1686 		buff = pkts[packet_success];
1687 		LOG_DEBUG(VHOST_DATA,
1688 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1689 			"pkt[%d] descriptor idx: %d\n",
1690 			dev->device_fh, packet_success,
1691 			MBUF_HEADROOM_UINT32(buff));
1692 
1693 		PRINT_PACKET(dev,
1694 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1695 			+ RTE_PKTMBUF_HEADROOM),
1696 			rte_pktmbuf_data_len(buff), 0);
1697 
1698 		/* Buffer address translation for virtio header. */
1699 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1700 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1701 
1702 		/*
1703 		 * If the descriptors are chained the header and data are
1704 		 * placed in separate buffers.
1705 		 */
1706 		if (desc->flags & VRING_DESC_F_NEXT) {
1707 			desc->len = vq->vhost_hlen;
1708 			desc = &vq->desc[desc->next];
1709 			desc->len = rte_pktmbuf_data_len(buff);
1710 		} else {
1711 			desc->len = packet_len;
1712 		}
1713 
1714 		/* Update used ring with desc information */
1715 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1716 			= head[packet_success];
1717 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1718 			= packet_len;
1719 		res_cur_idx++;
1720 		packet_success++;
1721 
1722 		/* A header is required per buffer. */
1723 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1724 			(const void *)&virtio_hdr, vq->vhost_hlen);
1725 
1726 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1727 
1728 		if (likely(packet_success < count)) {
1729 			/* Prefetch descriptor index. */
1730 			rte_prefetch0(&vq->desc[head[packet_success]]);
1731 		}
1732 	}
1733 
1734 	rte_compiler_barrier();
1735 
1736 	LOG_DEBUG(VHOST_DATA,
1737 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1738 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1739 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1740 
1741 	*(volatile uint16_t *)&vq->used->idx += count;
1742 	vq->last_used_idx += count;
1743 
1744 	LOG_DEBUG(VHOST_DATA,
1745 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1746 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1747 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1748 
1749 	/* Kick the guest if necessary. */
1750 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1751 		eventfd_write((int)vq->kickfd, 1);
1752 
1753 	return count;
1754 }
1755 
1756 /*
1757  * This function routes the TX packet to the correct interface.
1758  * This may be a local device or the physical port.
1759  */
1760 static inline void __attribute__((always_inline))
1761 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1762 	uint32_t desc_idx, uint8_t need_copy)
1763 {
1764 	struct mbuf_table *tx_q;
1765 	struct rte_mbuf **m_table;
1766 	struct rte_mbuf *mbuf = NULL;
1767 	unsigned len, ret, offset = 0;
1768 	struct vpool *vpool;
1769 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1770 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1771 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1772 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1773 
1774 	/*Add packet to the port tx queue*/
1775 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1776 	len = tx_q->len;
1777 
1778 	/* Allocate an mbuf and populate the structure. */
1779 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1780 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1781 	if (unlikely(mbuf == NULL)) {
1782 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1783 		RTE_LOG(ERR, VHOST_DATA,
1784 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1785 			dev->device_fh);
1786 		put_desc_to_used_list_zcp(vq, desc_idx);
1787 		return;
1788 	}
1789 
1790 	if (vm2vm_mode == VM2VM_HARDWARE) {
1791 		/* Avoid using a vlan tag from any vm for external pkt, such as
1792 		 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool
1793 		 * selection, MAC address determines it as an external pkt
1794 		 * which should go to network, while vlan tag determine it as
1795 		 * a vm2vm pkt should forward to another vm. Hardware confuse
1796 		 * such a ambiguous situation, so pkt will lost.
1797 		 */
1798 		vlan_tag = external_pkt_default_vlan_tag;
1799 		while (dev_ll != NULL) {
1800 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1801 				ether_addr_cmp(&(pkt_hdr->d_addr),
1802 				&dev_ll->vdev->mac_address)) {
1803 
1804 				/*
1805 				 * Drop the packet if the TX packet is destined
1806 				 * for the TX device.
1807 				 */
1808 				if (unlikely(dev_ll->vdev->dev->device_fh
1809 					== dev->device_fh)) {
1810 					LOG_DEBUG(VHOST_DATA,
1811 					"(%"PRIu64") TX: Source and destination"
1812 					"MAC addresses are the same. Dropping "
1813 					"packet.\n",
1814 					dev_ll->vdev->dev->device_fh);
1815 					MBUF_HEADROOM_UINT32(mbuf)
1816 						= (uint32_t)desc_idx;
1817 					__rte_mbuf_raw_free(mbuf);
1818 					return;
1819 				}
1820 
1821 				/*
1822 				 * Packet length offset 4 bytes for HW vlan
1823 				 * strip when L2 switch back.
1824 				 */
1825 				offset = 4;
1826 				vlan_tag =
1827 				(uint16_t)
1828 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1829 
1830 				LOG_DEBUG(VHOST_DATA,
1831 				"(%"PRIu64") TX: pkt to local VM device id:"
1832 				"(%"PRIu64") vlan tag: %d.\n",
1833 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1834 				vlan_tag);
1835 
1836 				break;
1837 			}
1838 			dev_ll = dev_ll->next;
1839 		}
1840 	}
1841 
1842 	mbuf->nb_segs = m->nb_segs;
1843 	mbuf->next = m->next;
1844 	mbuf->data_len = m->data_len + offset;
1845 	mbuf->pkt_len = mbuf->data_len;
1846 	if (unlikely(need_copy)) {
1847 		/* Copy the packet contents to the mbuf. */
1848 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1849 			rte_pktmbuf_mtod(m, void *),
1850 			m->data_len);
1851 	} else {
1852 		mbuf->data_off = m->data_off;
1853 		mbuf->buf_physaddr = m->buf_physaddr;
1854 		mbuf->buf_addr = m->buf_addr;
1855 	}
1856 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1857 	mbuf->vlan_tci = vlan_tag;
1858 	mbuf->l2_len = sizeof(struct ether_hdr);
1859 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1860 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1861 
1862 	tx_q->m_table[len] = mbuf;
1863 	len++;
1864 
1865 	LOG_DEBUG(VHOST_DATA,
1866 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1867 		dev->device_fh,
1868 		mbuf->nb_segs,
1869 		(mbuf->next == NULL) ? "null" : "non-null");
1870 
1871 	if (enable_stats) {
1872 		dev_statistics[dev->device_fh].tx_total++;
1873 		dev_statistics[dev->device_fh].tx++;
1874 	}
1875 
1876 	if (unlikely(len == MAX_PKT_BURST)) {
1877 		m_table = (struct rte_mbuf **)tx_q->m_table;
1878 		ret = rte_eth_tx_burst(ports[0],
1879 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1880 
1881 		/*
1882 		 * Free any buffers not handled by TX and update
1883 		 * the port stats.
1884 		 */
1885 		if (unlikely(ret < len)) {
1886 			do {
1887 				rte_pktmbuf_free(m_table[ret]);
1888 			} while (++ret < len);
1889 		}
1890 
1891 		len = 0;
1892 		txmbuf_clean_zcp(dev, vpool);
1893 	}
1894 
1895 	tx_q->len = len;
1896 
1897 	return;
1898 }
1899 
1900 /*
1901  * This function TX all available packets in virtio TX queue for one
1902  * virtio-net device. If it is first packet, it learns MAC address and
1903  * setup VMDQ.
1904  */
1905 static inline void __attribute__((always_inline))
1906 virtio_dev_tx_zcp(struct virtio_net *dev)
1907 {
1908 	struct rte_mbuf m;
1909 	struct vhost_virtqueue *vq;
1910 	struct vring_desc *desc;
1911 	uint64_t buff_addr = 0, phys_addr;
1912 	uint32_t head[MAX_PKT_BURST];
1913 	uint32_t i;
1914 	uint16_t free_entries, packet_success = 0;
1915 	uint16_t avail_idx;
1916 	uint8_t need_copy = 0;
1917 	hpa_type addr_type;
1918 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1919 
1920 	vq = dev->virtqueue[VIRTIO_TXQ];
1921 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1922 
1923 	/* If there are no available buffers then return. */
1924 	if (vq->last_used_idx_res == avail_idx)
1925 		return;
1926 
1927 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1928 
1929 	/* Prefetch available ring to retrieve head indexes. */
1930 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1931 
1932 	/* Get the number of free entries in the ring */
1933 	free_entries = (avail_idx - vq->last_used_idx_res);
1934 
1935 	/* Limit to MAX_PKT_BURST. */
1936 	free_entries
1937 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1938 
1939 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1940 		dev->device_fh, free_entries);
1941 
1942 	/* Retrieve all of the head indexes first to avoid caching issues. */
1943 	for (i = 0; i < free_entries; i++)
1944 		head[i]
1945 			= vq->avail->ring[(vq->last_used_idx_res + i)
1946 			& (vq->size - 1)];
1947 
1948 	vq->last_used_idx_res += free_entries;
1949 
1950 	/* Prefetch descriptor index. */
1951 	rte_prefetch0(&vq->desc[head[packet_success]]);
1952 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1953 
1954 	while (packet_success < free_entries) {
1955 		desc = &vq->desc[head[packet_success]];
1956 
1957 		/* Discard first buffer as it is the virtio header */
1958 		desc = &vq->desc[desc->next];
1959 
1960 		/* Buffer address translation. */
1961 		buff_addr = gpa_to_vva(dev, desc->addr);
1962 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1963 
1964 		if (likely(packet_success < (free_entries - 1)))
1965 			/* Prefetch descriptor index. */
1966 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1967 
1968 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1969 			RTE_LOG(ERR, VHOST_DATA,
1970 				"(%"PRIu64") Invalid frame buffer address found"
1971 				"when TX packets!\n",
1972 				dev->device_fh);
1973 			packet_success++;
1974 			continue;
1975 		}
1976 
1977 		/* Prefetch buffer address. */
1978 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1979 
1980 		/*
1981 		 * Setup dummy mbuf. This is copied to a real mbuf if
1982 		 * transmitted out the physical port.
1983 		 */
1984 		m.data_len = desc->len;
1985 		m.nb_segs = 1;
1986 		m.next = NULL;
1987 		m.data_off = 0;
1988 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1989 		m.buf_physaddr = phys_addr;
1990 
1991 		/*
1992 		 * Check if the frame buffer address from guest crosses
1993 		 * sub-region or not.
1994 		 */
1995 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1996 			RTE_LOG(ERR, VHOST_DATA,
1997 				"(%"PRIu64") Frame buffer address cross "
1998 				"sub-regioin found when attaching TX frame "
1999 				"buffer address!\n",
2000 				dev->device_fh);
2001 			need_copy = 1;
2002 		} else
2003 			need_copy = 0;
2004 
2005 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2006 
2007 		/*
2008 		 * If this is the first received packet we need to learn
2009 		 * the MAC and setup VMDQ
2010 		 */
2011 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2012 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2013 				/*
2014 				 * Discard frame if device is scheduled for
2015 				 * removal or a duplicate MAC address is found.
2016 				 */
2017 				packet_success += free_entries;
2018 				vq->last_used_idx += packet_success;
2019 				break;
2020 			}
2021 		}
2022 
2023 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2024 		packet_success++;
2025 	}
2026 }
2027 
2028 /*
2029  * This function is called by each data core. It handles all RX/TX registered
2030  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2031  * addresses are compared with all devices in the main linked list.
2032  */
2033 static int
2034 switch_worker_zcp(__attribute__((unused)) void *arg)
2035 {
2036 	struct virtio_net *dev = NULL;
2037 	struct vhost_dev  *vdev = NULL;
2038 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2039 	struct virtio_net_data_ll *dev_ll;
2040 	struct mbuf_table *tx_q;
2041 	volatile struct lcore_ll_info *lcore_ll;
2042 	const uint64_t drain_tsc
2043 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2044 		* BURST_TX_DRAIN_US;
2045 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2046 	unsigned ret;
2047 	const uint16_t lcore_id = rte_lcore_id();
2048 	uint16_t count_in_ring, rx_count = 0;
2049 
2050 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2051 
2052 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2053 	prev_tsc = 0;
2054 
2055 	while (1) {
2056 		cur_tsc = rte_rdtsc();
2057 
2058 		/* TX burst queue drain */
2059 		diff_tsc = cur_tsc - prev_tsc;
2060 		if (unlikely(diff_tsc > drain_tsc)) {
2061 			/*
2062 			 * Get mbuf from vpool.pool and detach mbuf and
2063 			 * put back into vpool.ring.
2064 			 */
2065 			dev_ll = lcore_ll->ll_root_used;
2066 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2067 				/* Get virtio device ID */
2068 				vdev = dev_ll->vdev;
2069 				dev = vdev->dev;
2070 
2071 				if (likely(!vdev->remove)) {
2072 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2073 					if (tx_q->len) {
2074 						LOG_DEBUG(VHOST_DATA,
2075 						"TX queue drained after timeout"
2076 						" with burst size %u\n",
2077 						tx_q->len);
2078 
2079 						/*
2080 						 * Tx any packets in the queue
2081 						 */
2082 						ret = rte_eth_tx_burst(
2083 							ports[0],
2084 							(uint16_t)tx_q->txq_id,
2085 							(struct rte_mbuf **)
2086 							tx_q->m_table,
2087 							(uint16_t)tx_q->len);
2088 						if (unlikely(ret < tx_q->len)) {
2089 							do {
2090 								rte_pktmbuf_free(
2091 									tx_q->m_table[ret]);
2092 							} while (++ret < tx_q->len);
2093 						}
2094 						tx_q->len = 0;
2095 
2096 						txmbuf_clean_zcp(dev,
2097 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2098 					}
2099 				}
2100 				dev_ll = dev_ll->next;
2101 			}
2102 			prev_tsc = cur_tsc;
2103 		}
2104 
2105 		rte_prefetch0(lcore_ll->ll_root_used);
2106 
2107 		/*
2108 		 * Inform the configuration core that we have exited the linked
2109 		 * list and that no devices are in use if requested.
2110 		 */
2111 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2112 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2113 
2114 		/* Process devices */
2115 		dev_ll = lcore_ll->ll_root_used;
2116 
2117 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2118 			vdev = dev_ll->vdev;
2119 			dev  = vdev->dev;
2120 			if (unlikely(vdev->remove)) {
2121 				dev_ll = dev_ll->next;
2122 				unlink_vmdq(vdev);
2123 				vdev->ready = DEVICE_SAFE_REMOVE;
2124 				continue;
2125 			}
2126 
2127 			if (likely(vdev->ready == DEVICE_RX)) {
2128 				uint32_t index = vdev->vmdq_rx_q;
2129 				uint16_t i;
2130 				count_in_ring
2131 				= rte_ring_count(vpool_array[index].ring);
2132 				uint16_t free_entries
2133 				= (uint16_t)get_available_ring_num_zcp(dev);
2134 
2135 				/*
2136 				 * Attach all mbufs in vpool.ring and put back
2137 				 * into vpool.pool.
2138 				 */
2139 				for (i = 0;
2140 				i < RTE_MIN(free_entries,
2141 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2142 				i++)
2143 					attach_rxmbuf_zcp(dev);
2144 
2145 				/* Handle guest RX */
2146 				rx_count = rte_eth_rx_burst(ports[0],
2147 					vdev->vmdq_rx_q, pkts_burst,
2148 					MAX_PKT_BURST);
2149 
2150 				if (rx_count) {
2151 					ret_count = virtio_dev_rx_zcp(dev,
2152 							pkts_burst, rx_count);
2153 					if (enable_stats) {
2154 						dev_statistics[dev->device_fh].rx_total
2155 							+= rx_count;
2156 						dev_statistics[dev->device_fh].rx
2157 							+= ret_count;
2158 					}
2159 					while (likely(rx_count)) {
2160 						rx_count--;
2161 						pktmbuf_detach_zcp(
2162 							pkts_burst[rx_count]);
2163 						rte_ring_sp_enqueue(
2164 							vpool_array[index].ring,
2165 							(void *)pkts_burst[rx_count]);
2166 					}
2167 				}
2168 			}
2169 
2170 			if (likely(!vdev->remove))
2171 				/* Handle guest TX */
2172 				virtio_dev_tx_zcp(dev);
2173 
2174 			/* Move to the next device in the list */
2175 			dev_ll = dev_ll->next;
2176 		}
2177 	}
2178 
2179 	return 0;
2180 }
2181 
2182 
2183 /*
2184  * Add an entry to a used linked list. A free entry must first be found
2185  * in the free linked list using get_data_ll_free_entry();
2186  */
2187 static void
2188 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2189 	struct virtio_net_data_ll *ll_dev)
2190 {
2191 	struct virtio_net_data_ll *ll = *ll_root_addr;
2192 
2193 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2194 	ll_dev->next = NULL;
2195 	rte_compiler_barrier();
2196 
2197 	/* If ll == NULL then this is the first device. */
2198 	if (ll) {
2199 		/* Increment to the tail of the linked list. */
2200 		while ((ll->next != NULL) )
2201 			ll = ll->next;
2202 
2203 		ll->next = ll_dev;
2204 	} else {
2205 		*ll_root_addr = ll_dev;
2206 	}
2207 }
2208 
2209 /*
2210  * Remove an entry from a used linked list. The entry must then be added to
2211  * the free linked list using put_data_ll_free_entry().
2212  */
2213 static void
2214 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2215 	struct virtio_net_data_ll *ll_dev,
2216 	struct virtio_net_data_ll *ll_dev_last)
2217 {
2218 	struct virtio_net_data_ll *ll = *ll_root_addr;
2219 
2220 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2221 		return;
2222 
2223 	if (ll_dev == ll)
2224 		*ll_root_addr = ll_dev->next;
2225 	else
2226 		if (likely(ll_dev_last != NULL))
2227 			ll_dev_last->next = ll_dev->next;
2228 		else
2229 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2230 }
2231 
2232 /*
2233  * Find and return an entry from the free linked list.
2234  */
2235 static struct virtio_net_data_ll *
2236 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2237 {
2238 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2239 	struct virtio_net_data_ll *ll_dev;
2240 
2241 	if (ll_free == NULL)
2242 		return NULL;
2243 
2244 	ll_dev = ll_free;
2245 	*ll_root_addr = ll_free->next;
2246 
2247 	return ll_dev;
2248 }
2249 
2250 /*
2251  * Place an entry back on to the free linked list.
2252  */
2253 static void
2254 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2255 	struct virtio_net_data_ll *ll_dev)
2256 {
2257 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2258 
2259 	if (ll_dev == NULL)
2260 		return;
2261 
2262 	ll_dev->next = ll_free;
2263 	*ll_root_addr = ll_dev;
2264 }
2265 
2266 /*
2267  * Creates a linked list of a given size.
2268  */
2269 static struct virtio_net_data_ll *
2270 alloc_data_ll(uint32_t size)
2271 {
2272 	struct virtio_net_data_ll *ll_new;
2273 	uint32_t i;
2274 
2275 	/* Malloc and then chain the linked list. */
2276 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2277 	if (ll_new == NULL) {
2278 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2279 		return NULL;
2280 	}
2281 
2282 	for (i = 0; i < size - 1; i++) {
2283 		ll_new[i].vdev = NULL;
2284 		ll_new[i].next = &ll_new[i+1];
2285 	}
2286 	ll_new[i].next = NULL;
2287 
2288 	return (ll_new);
2289 }
2290 
2291 /*
2292  * Create the main linked list along with each individual cores linked list. A used and a free list
2293  * are created to manage entries.
2294  */
2295 static int
2296 init_data_ll (void)
2297 {
2298 	int lcore;
2299 
2300 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2301 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2302 		if (lcore_info[lcore].lcore_ll == NULL) {
2303 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2304 			return -1;
2305 		}
2306 
2307 		lcore_info[lcore].lcore_ll->device_num = 0;
2308 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2309 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2310 		if (num_devices % num_switching_cores)
2311 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2312 		else
2313 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2314 	}
2315 
2316 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2317 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2318 
2319 	return 0;
2320 }
2321 
2322 /*
2323  * Set virtqueue flags so that we do not receive interrupts.
2324  */
2325 static void
2326 set_irq_status (struct virtio_net *dev)
2327 {
2328 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2329 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2330 }
2331 
2332 /*
2333  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2334  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2335  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2336  */
2337 static void
2338 destroy_device (volatile struct virtio_net *dev)
2339 {
2340 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2341 	struct virtio_net_data_ll *ll_main_dev_cur;
2342 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2343 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2344 	struct vhost_dev *vdev;
2345 	int lcore;
2346 
2347 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2348 
2349 	vdev = (struct vhost_dev *)dev->priv;
2350 	/*set the remove flag. */
2351 	vdev->remove = 1;
2352 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2353 		rte_pause();
2354 	}
2355 
2356 	/* Search for entry to be removed from lcore ll */
2357 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2358 	while (ll_lcore_dev_cur != NULL) {
2359 		if (ll_lcore_dev_cur->vdev == vdev) {
2360 			break;
2361 		} else {
2362 			ll_lcore_dev_last = ll_lcore_dev_cur;
2363 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2364 		}
2365 	}
2366 
2367 	if (ll_lcore_dev_cur == NULL) {
2368 		RTE_LOG(ERR, VHOST_CONFIG,
2369 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
2370 			dev->device_fh);
2371 		return;
2372 	}
2373 
2374 	/* Search for entry to be removed from main ll */
2375 	ll_main_dev_cur = ll_root_used;
2376 	ll_main_dev_last = NULL;
2377 	while (ll_main_dev_cur != NULL) {
2378 		if (ll_main_dev_cur->vdev == vdev) {
2379 			break;
2380 		} else {
2381 			ll_main_dev_last = ll_main_dev_cur;
2382 			ll_main_dev_cur = ll_main_dev_cur->next;
2383 		}
2384 	}
2385 
2386 	/* Remove entries from the lcore and main ll. */
2387 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2388 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2389 
2390 	/* Set the dev_removal_flag on each lcore. */
2391 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2392 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2393 	}
2394 
2395 	/*
2396 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2397 	 * they can no longer access the device removed from the linked lists and that the devices
2398 	 * are no longer in use.
2399 	 */
2400 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2401 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2402 			rte_pause();
2403 		}
2404 	}
2405 
2406 	/* Add the entries back to the lcore and main free ll.*/
2407 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2408 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2409 
2410 	/* Decrement number of device on the lcore. */
2411 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2412 
2413 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2414 
2415 	if (zero_copy) {
2416 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2417 
2418 		/* Stop the RX queue. */
2419 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2420 			LOG_DEBUG(VHOST_CONFIG,
2421 				"(%"PRIu64") In destroy_device: Failed to stop "
2422 				"rx queue:%d\n",
2423 				dev->device_fh,
2424 				vdev->vmdq_rx_q);
2425 		}
2426 
2427 		LOG_DEBUG(VHOST_CONFIG,
2428 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2429 			"mempool back to ring for RX queue: %d\n",
2430 			dev->device_fh, vdev->vmdq_rx_q);
2431 
2432 		mbuf_destroy_zcp(vpool);
2433 
2434 		/* Stop the TX queue. */
2435 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2436 			LOG_DEBUG(VHOST_CONFIG,
2437 				"(%"PRIu64") In destroy_device: Failed to "
2438 				"stop tx queue:%d\n",
2439 				dev->device_fh, vdev->vmdq_rx_q);
2440 		}
2441 
2442 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2443 
2444 		LOG_DEBUG(VHOST_CONFIG,
2445 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2446 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2447 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2448 			dev->device_fh);
2449 
2450 		mbuf_destroy_zcp(vpool);
2451 		rte_free(vdev->regions_hpa);
2452 	}
2453 	rte_free(vdev);
2454 
2455 }
2456 
2457 /*
2458  * Calculate the region count of physical continous regions for one particular
2459  * region of whose vhost virtual address is continous. The particular region
2460  * start from vva_start, with size of 'size' in argument.
2461  */
2462 static uint32_t
2463 check_hpa_regions(uint64_t vva_start, uint64_t size)
2464 {
2465 	uint32_t i, nregions = 0, page_size = getpagesize();
2466 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2467 	if (vva_start % page_size) {
2468 		LOG_DEBUG(VHOST_CONFIG,
2469 			"in check_countinous: vva start(%p) mod page_size(%d) "
2470 			"has remainder\n",
2471 			(void *)(uintptr_t)vva_start, page_size);
2472 		return 0;
2473 	}
2474 	if (size % page_size) {
2475 		LOG_DEBUG(VHOST_CONFIG,
2476 			"in check_countinous: "
2477 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2478 			size, page_size);
2479 		return 0;
2480 	}
2481 	for (i = 0; i < size - page_size; i = i + page_size) {
2482 		cur_phys_addr
2483 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2484 		next_phys_addr = rte_mem_virt2phy(
2485 			(void *)(uintptr_t)(vva_start + i + page_size));
2486 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2487 			++nregions;
2488 			LOG_DEBUG(VHOST_CONFIG,
2489 				"in check_continuous: hva addr:(%p) is not "
2490 				"continuous with hva addr:(%p), diff:%d\n",
2491 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2492 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2493 				+ page_size), page_size);
2494 			LOG_DEBUG(VHOST_CONFIG,
2495 				"in check_continuous: hpa addr:(%p) is not "
2496 				"continuous with hpa addr:(%p), "
2497 				"diff:(%"PRIu64")\n",
2498 				(void *)(uintptr_t)cur_phys_addr,
2499 				(void *)(uintptr_t)next_phys_addr,
2500 				(next_phys_addr-cur_phys_addr));
2501 		}
2502 	}
2503 	return nregions;
2504 }
2505 
2506 /*
2507  * Divide each region whose vhost virtual address is continous into a few
2508  * sub-regions, make sure the physical address within each sub-region are
2509  * continous. And fill offset(to GPA) and size etc. information of each
2510  * sub-region into regions_hpa.
2511  */
2512 static uint32_t
2513 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2514 {
2515 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2516 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2517 
2518 	if (mem_region_hpa == NULL)
2519 		return 0;
2520 
2521 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2522 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2523 			virtio_memory->regions[regionidx].address_offset;
2524 		mem_region_hpa[regionidx_hpa].guest_phys_address
2525 			= virtio_memory->regions[regionidx].guest_phys_address;
2526 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2527 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2528 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2529 		LOG_DEBUG(VHOST_CONFIG,
2530 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2531 			regionidx_hpa,
2532 			(void *)(uintptr_t)
2533 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2534 		LOG_DEBUG(VHOST_CONFIG,
2535 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2536 			regionidx_hpa,
2537 			(void *)(uintptr_t)
2538 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2539 		for (i = 0, k = 0;
2540 			i < virtio_memory->regions[regionidx].memory_size -
2541 				page_size;
2542 			i += page_size) {
2543 			cur_phys_addr = rte_mem_virt2phy(
2544 					(void *)(uintptr_t)(vva_start + i));
2545 			next_phys_addr = rte_mem_virt2phy(
2546 					(void *)(uintptr_t)(vva_start +
2547 					i + page_size));
2548 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2549 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2550 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2551 					k + page_size;
2552 				mem_region_hpa[regionidx_hpa].memory_size
2553 					= k + page_size;
2554 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2555 					"phys addr end  [%d]:(%p)\n",
2556 					regionidx_hpa,
2557 					(void *)(uintptr_t)
2558 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2559 				LOG_DEBUG(VHOST_CONFIG,
2560 					"in fill_hpa_regions: guest phys addr "
2561 					"size [%d]:(%p)\n",
2562 					regionidx_hpa,
2563 					(void *)(uintptr_t)
2564 					(mem_region_hpa[regionidx_hpa].memory_size));
2565 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2566 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2567 				++regionidx_hpa;
2568 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2569 					next_phys_addr -
2570 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2571 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2572 					" phys addr start[%d]:(%p)\n",
2573 					regionidx_hpa,
2574 					(void *)(uintptr_t)
2575 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2576 				LOG_DEBUG(VHOST_CONFIG,
2577 					"in fill_hpa_regions: host  phys addr "
2578 					"start[%d]:(%p)\n",
2579 					regionidx_hpa,
2580 					(void *)(uintptr_t)
2581 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2582 				k = 0;
2583 			} else {
2584 				k += page_size;
2585 			}
2586 		}
2587 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2588 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2589 			+ k + page_size;
2590 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2591 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2592 			"[%d]:(%p)\n", regionidx_hpa,
2593 			(void *)(uintptr_t)
2594 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2595 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2596 			"[%d]:(%p)\n", regionidx_hpa,
2597 			(void *)(uintptr_t)
2598 			(mem_region_hpa[regionidx_hpa].memory_size));
2599 		++regionidx_hpa;
2600 	}
2601 	return regionidx_hpa;
2602 }
2603 
2604 /*
2605  * A new device is added to a data core. First the device is added to the main linked list
2606  * and the allocated to a specific data core.
2607  */
2608 static int
2609 new_device (struct virtio_net *dev)
2610 {
2611 	struct virtio_net_data_ll *ll_dev;
2612 	int lcore, core_add = 0;
2613 	uint32_t device_num_min = num_devices;
2614 	struct vhost_dev *vdev;
2615 	uint32_t regionidx;
2616 
2617 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2618 	if (vdev == NULL) {
2619 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2620 			dev->device_fh);
2621 		return -1;
2622 	}
2623 	vdev->dev = dev;
2624 	dev->priv = vdev;
2625 
2626 	if (zero_copy) {
2627 		vdev->nregions_hpa = dev->mem->nregions;
2628 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2629 			vdev->nregions_hpa
2630 				+= check_hpa_regions(
2631 					dev->mem->regions[regionidx].guest_phys_address
2632 					+ dev->mem->regions[regionidx].address_offset,
2633 					dev->mem->regions[regionidx].memory_size);
2634 
2635 		}
2636 
2637 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2638 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2639 			CACHE_LINE_SIZE);
2640 		if (vdev->regions_hpa == NULL) {
2641 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2642 			rte_free(vdev);
2643 			return -1;
2644 		}
2645 
2646 
2647 		if (fill_hpa_memory_regions(
2648 			vdev->regions_hpa, dev->mem
2649 			) != vdev->nregions_hpa) {
2650 
2651 			RTE_LOG(ERR, VHOST_CONFIG,
2652 				"hpa memory regions number mismatch: "
2653 				"[%d]\n", vdev->nregions_hpa);
2654 			rte_free(vdev->regions_hpa);
2655 			rte_free(vdev);
2656 			return -1;
2657 		}
2658 	}
2659 
2660 
2661 	/* Add device to main ll */
2662 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2663 	if (ll_dev == NULL) {
2664 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2665 			"of %d devices per core has been reached\n",
2666 			dev->device_fh, num_devices);
2667 		if (vdev->regions_hpa)
2668 			rte_free(vdev->regions_hpa);
2669 		rte_free(vdev);
2670 		return -1;
2671 	}
2672 	ll_dev->vdev = vdev;
2673 	add_data_ll_entry(&ll_root_used, ll_dev);
2674 	vdev->vmdq_rx_q
2675 		= dev->device_fh * (num_queues / num_devices);
2676 
2677 	if (zero_copy) {
2678 		uint32_t index = vdev->vmdq_rx_q;
2679 		uint32_t count_in_ring, i;
2680 		struct mbuf_table *tx_q;
2681 
2682 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2683 
2684 		LOG_DEBUG(VHOST_CONFIG,
2685 			"(%"PRIu64") in new_device: mbuf count in mempool "
2686 			"before attach is: %d\n",
2687 			dev->device_fh,
2688 			rte_mempool_count(vpool_array[index].pool));
2689 		LOG_DEBUG(VHOST_CONFIG,
2690 			"(%"PRIu64") in new_device: mbuf count in  ring "
2691 			"before attach  is : %d\n",
2692 			dev->device_fh, count_in_ring);
2693 
2694 		/*
2695 		 * Attach all mbufs in vpool.ring and put back intovpool.pool.
2696 		 */
2697 		for (i = 0; i < count_in_ring; i++)
2698 			attach_rxmbuf_zcp(dev);
2699 
2700 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2701 			"mempool after attach is: %d\n",
2702 			dev->device_fh,
2703 			rte_mempool_count(vpool_array[index].pool));
2704 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2705 			"ring after attach  is : %d\n",
2706 			dev->device_fh,
2707 			rte_ring_count(vpool_array[index].ring));
2708 
2709 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2710 		tx_q->txq_id = vdev->vmdq_rx_q;
2711 
2712 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2713 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2714 
2715 			LOG_DEBUG(VHOST_CONFIG,
2716 				"(%"PRIu64") In new_device: Failed to start "
2717 				"tx queue:%d\n",
2718 				dev->device_fh, vdev->vmdq_rx_q);
2719 
2720 			mbuf_destroy_zcp(vpool);
2721 			rte_free(vdev->regions_hpa);
2722 			rte_free(vdev);
2723 			return -1;
2724 		}
2725 
2726 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2727 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2728 
2729 			LOG_DEBUG(VHOST_CONFIG,
2730 				"(%"PRIu64") In new_device: Failed to start "
2731 				"rx queue:%d\n",
2732 				dev->device_fh, vdev->vmdq_rx_q);
2733 
2734 			/* Stop the TX queue. */
2735 			if (rte_eth_dev_tx_queue_stop(ports[0],
2736 				vdev->vmdq_rx_q) != 0) {
2737 				LOG_DEBUG(VHOST_CONFIG,
2738 					"(%"PRIu64") In new_device: Failed to "
2739 					"stop tx queue:%d\n",
2740 					dev->device_fh, vdev->vmdq_rx_q);
2741 			}
2742 
2743 			mbuf_destroy_zcp(vpool);
2744 			rte_free(vdev->regions_hpa);
2745 			rte_free(vdev);
2746 			return -1;
2747 		}
2748 
2749 	}
2750 
2751 	/*reset ready flag*/
2752 	vdev->ready = DEVICE_MAC_LEARNING;
2753 	vdev->remove = 0;
2754 
2755 	/* Find a suitable lcore to add the device. */
2756 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2757 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2758 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2759 			core_add = lcore;
2760 		}
2761 	}
2762 	/* Add device to lcore ll */
2763 	ll_dev->dev->coreid = core_add;
2764 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2765 	if (ll_dev == NULL) {
2766 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2767 		vdev->ready = DEVICE_SAFE_REMOVE;
2768 		destroy_device(dev);
2769 		if (vdev->regions_hpa)
2770 			rte_free(vdev->regions_hpa);
2771 		rte_free(vdev);
2772 		return -1;
2773 	}
2774 	ll_dev->vdev = vdev;
2775 	vdev->coreid = core_add;
2776 
2777 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2778 
2779 	/* Initialize device stats */
2780 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2781 
2782 	/* Disable notifications. */
2783 	set_irq_status(dev);
2784 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2785 	dev->flags |= VIRTIO_DEV_RUNNING;
2786 
2787 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2788 
2789 	return 0;
2790 }
2791 
2792 /*
2793  * These callback allow devices to be added to the data core when configuration
2794  * has been fully complete.
2795  */
2796 static const struct virtio_net_device_ops virtio_net_device_ops =
2797 {
2798 	.new_device =  new_device,
2799 	.destroy_device = destroy_device,
2800 };
2801 
2802 /*
2803  * This is a thread will wake up after a period to print stats if the user has
2804  * enabled them.
2805  */
2806 static void
2807 print_stats(void)
2808 {
2809 	struct virtio_net_data_ll *dev_ll;
2810 	uint64_t tx_dropped, rx_dropped;
2811 	uint64_t tx, tx_total, rx, rx_total;
2812 	uint32_t device_fh;
2813 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2814 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2815 
2816 	while(1) {
2817 		sleep(enable_stats);
2818 
2819 		/* Clear screen and move to top left */
2820 		printf("%s%s", clr, top_left);
2821 
2822 		printf("\nDevice statistics ====================================");
2823 
2824 		dev_ll = ll_root_used;
2825 		while (dev_ll != NULL) {
2826 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2827 			tx_total = dev_statistics[device_fh].tx_total;
2828 			tx = dev_statistics[device_fh].tx;
2829 			tx_dropped = tx_total - tx;
2830 			if (zero_copy == 0) {
2831 				rx_total = rte_atomic64_read(
2832 					&dev_statistics[device_fh].rx_total_atomic);
2833 				rx = rte_atomic64_read(
2834 					&dev_statistics[device_fh].rx_atomic);
2835 			} else {
2836 				rx_total = dev_statistics[device_fh].rx_total;
2837 				rx = dev_statistics[device_fh].rx;
2838 			}
2839 			rx_dropped = rx_total - rx;
2840 
2841 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2842 					"\nTX total: 		%"PRIu64""
2843 					"\nTX dropped: 		%"PRIu64""
2844 					"\nTX successful: 		%"PRIu64""
2845 					"\nRX total: 		%"PRIu64""
2846 					"\nRX dropped: 		%"PRIu64""
2847 					"\nRX successful: 		%"PRIu64"",
2848 					device_fh,
2849 					tx_total,
2850 					tx_dropped,
2851 					tx,
2852 					rx_total,
2853 					rx_dropped,
2854 					rx);
2855 
2856 			dev_ll = dev_ll->next;
2857 		}
2858 		printf("\n======================================================\n");
2859 	}
2860 }
2861 
2862 static void
2863 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2864 	char *ring_name, uint32_t nb_mbuf)
2865 {
2866 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2867 	vpool_array[index].pool
2868 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2869 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2870 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2871 		rte_pktmbuf_init, NULL, socket, 0);
2872 	if (vpool_array[index].pool != NULL) {
2873 		vpool_array[index].ring
2874 			= rte_ring_create(ring_name,
2875 				rte_align32pow2(nb_mbuf + 1),
2876 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2877 		if (likely(vpool_array[index].ring != NULL)) {
2878 			LOG_DEBUG(VHOST_CONFIG,
2879 				"in setup_mempool_tbl: mbuf count in "
2880 				"mempool is: %d\n",
2881 				rte_mempool_count(vpool_array[index].pool));
2882 			LOG_DEBUG(VHOST_CONFIG,
2883 				"in setup_mempool_tbl: mbuf count in "
2884 				"ring   is: %d\n",
2885 				rte_ring_count(vpool_array[index].ring));
2886 		} else {
2887 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2888 				ring_name);
2889 		}
2890 
2891 		/* Need consider head room. */
2892 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2893 	} else {
2894 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2895 	}
2896 }
2897 
2898 
2899 /*
2900  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2901  * device is also registered here to handle the IOCTLs.
2902  */
2903 int
2904 MAIN(int argc, char *argv[])
2905 {
2906 	struct rte_mempool *mbuf_pool = NULL;
2907 	unsigned lcore_id, core_id = 0;
2908 	unsigned nb_ports, valid_num_ports;
2909 	int ret;
2910 	uint8_t portid, queue_id = 0;
2911 	static pthread_t tid;
2912 
2913 	/* init EAL */
2914 	ret = rte_eal_init(argc, argv);
2915 	if (ret < 0)
2916 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2917 	argc -= ret;
2918 	argv += ret;
2919 
2920 	/* parse app arguments */
2921 	ret = us_vhost_parse_args(argc, argv);
2922 	if (ret < 0)
2923 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2924 
2925 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2926 		if (rte_lcore_is_enabled(lcore_id))
2927 			lcore_ids[core_id ++] = lcore_id;
2928 
2929 	if (rte_lcore_count() > RTE_MAX_LCORE)
2930 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2931 
2932 	/*set the number of swithcing cores available*/
2933 	num_switching_cores = rte_lcore_count()-1;
2934 
2935 	/* Get the number of physical ports. */
2936 	nb_ports = rte_eth_dev_count();
2937 	if (nb_ports > RTE_MAX_ETHPORTS)
2938 		nb_ports = RTE_MAX_ETHPORTS;
2939 
2940 	/*
2941 	 * Update the global var NUM_PORTS and global array PORTS
2942 	 * and get value of var VALID_NUM_PORTS according to system ports number
2943 	 */
2944 	valid_num_ports = check_ports_num(nb_ports);
2945 
2946 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2947 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2948 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2949 		return -1;
2950 	}
2951 
2952 	if (zero_copy == 0) {
2953 		/* Create the mbuf pool. */
2954 		mbuf_pool = rte_mempool_create(
2955 				"MBUF_POOL",
2956 				NUM_MBUFS_PER_PORT
2957 				* valid_num_ports,
2958 				MBUF_SIZE, MBUF_CACHE_SIZE,
2959 				sizeof(struct rte_pktmbuf_pool_private),
2960 				rte_pktmbuf_pool_init, NULL,
2961 				rte_pktmbuf_init, NULL,
2962 				rte_socket_id(), 0);
2963 		if (mbuf_pool == NULL)
2964 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2965 
2966 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2967 			vpool_array[queue_id].pool = mbuf_pool;
2968 
2969 		if (vm2vm_mode == VM2VM_HARDWARE) {
2970 			/* Enable VT loop back to let L2 switch to do it. */
2971 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2972 			LOG_DEBUG(VHOST_CONFIG,
2973 				"Enable loop back for L2 switch in vmdq.\n");
2974 		}
2975 	} else {
2976 		uint32_t nb_mbuf;
2977 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2978 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2979 
2980 		/*
2981 		 * Zero copy defers queue RX/TX start to the time when guest
2982 		 * finishes its startup and packet buffers from that guest are
2983 		 * available.
2984 		 */
2985 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2986 		rx_conf_default.rx_drop_en = 0;
2987 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2988 		nb_mbuf = num_rx_descriptor
2989 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2990 			+ num_switching_cores * MAX_PKT_BURST;
2991 
2992 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2993 			snprintf(pool_name, sizeof(pool_name),
2994 				"rxmbuf_pool_%u", queue_id);
2995 			snprintf(ring_name, sizeof(ring_name),
2996 				"rxmbuf_ring_%u", queue_id);
2997 			setup_mempool_tbl(rte_socket_id(), queue_id,
2998 				pool_name, ring_name, nb_mbuf);
2999 		}
3000 
3001 		nb_mbuf = num_tx_descriptor
3002 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3003 				+ num_switching_cores * MAX_PKT_BURST;
3004 
3005 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3006 			snprintf(pool_name, sizeof(pool_name),
3007 				"txmbuf_pool_%u", queue_id);
3008 			snprintf(ring_name, sizeof(ring_name),
3009 				"txmbuf_ring_%u", queue_id);
3010 			setup_mempool_tbl(rte_socket_id(),
3011 				(queue_id + MAX_QUEUES),
3012 				pool_name, ring_name, nb_mbuf);
3013 		}
3014 
3015 		if (vm2vm_mode == VM2VM_HARDWARE) {
3016 			/* Enable VT loop back to let L2 switch to do it. */
3017 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3018 			LOG_DEBUG(VHOST_CONFIG,
3019 				"Enable loop back for L2 switch in vmdq.\n");
3020 		}
3021 	}
3022 	/* Set log level. */
3023 	rte_set_log_level(LOG_LEVEL);
3024 
3025 	/* initialize all ports */
3026 	for (portid = 0; portid < nb_ports; portid++) {
3027 		/* skip ports that are not enabled */
3028 		if ((enabled_port_mask & (1 << portid)) == 0) {
3029 			RTE_LOG(INFO, VHOST_PORT,
3030 				"Skipping disabled port %d\n", portid);
3031 			continue;
3032 		}
3033 		if (port_init(portid) != 0)
3034 			rte_exit(EXIT_FAILURE,
3035 				"Cannot initialize network ports\n");
3036 	}
3037 
3038 	/* Initialise all linked lists. */
3039 	if (init_data_ll() == -1)
3040 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3041 
3042 	/* Initialize device stats */
3043 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3044 
3045 	/* Enable stats if the user option is set. */
3046 	if (enable_stats)
3047 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3048 
3049 	/* Launch all data cores. */
3050 	if (zero_copy == 0) {
3051 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3052 			rte_eal_remote_launch(switch_worker,
3053 				mbuf_pool, lcore_id);
3054 		}
3055 	} else {
3056 		uint32_t count_in_mempool, index, i;
3057 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3058 			/* For all RX and TX queues. */
3059 			count_in_mempool
3060 				= rte_mempool_count(vpool_array[index].pool);
3061 
3062 			/*
3063 			 * Transfer all un-attached mbufs from vpool.pool
3064 			 * to vpoo.ring.
3065 			 */
3066 			for (i = 0; i < count_in_mempool; i++) {
3067 				struct rte_mbuf *mbuf
3068 					= __rte_mbuf_raw_alloc(
3069 						vpool_array[index].pool);
3070 				rte_ring_sp_enqueue(vpool_array[index].ring,
3071 						(void *)mbuf);
3072 			}
3073 
3074 			LOG_DEBUG(VHOST_CONFIG,
3075 				"in MAIN: mbuf count in mempool at initial "
3076 				"is: %d\n", count_in_mempool);
3077 			LOG_DEBUG(VHOST_CONFIG,
3078 				"in MAIN: mbuf count in  ring at initial  is :"
3079 				" %d\n",
3080 				rte_ring_count(vpool_array[index].ring));
3081 		}
3082 
3083 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3084 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3085 				lcore_id);
3086 	}
3087 
3088 	/* Register CUSE device to handle IOCTLs. */
3089 	ret = rte_vhost_driver_register((char *)&dev_basename);
3090 	if (ret != 0)
3091 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3092 
3093 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3094 
3095 	/* Start CUSE session. */
3096 	rte_vhost_driver_session_start();
3097 	return 0;
3098 
3099 }
3100 
3101