xref: /dpdk/examples/vhost/main.c (revision e571e6b472a1e52eb39d4ded937ef426f7cd0be2)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56 
57 #define MAX_QUEUES 128
58 
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61 
62 /*
63  * Calculate the number of buffers needed per port
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
66 							(num_switching_cores*MAX_PKT_BURST) +  			\
67 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68 							(num_switching_cores*MBUF_CACHE_SIZE))
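/*
 * For illustration only: with the defaults defined in this file (1024 RX
 * descriptors, a 32-packet burst, 512 TX descriptors, a 128-mbuf cache) and an
 * assumed num_switching_cores of 8, this evaluates to
 * 128*1024 + 8*32 + 8*512 + 8*128 = 136448 mbufs per port.
 */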
69 
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72 
73 /*
74  * For the zero copy implementation no frame data buffers need to be
75  * allocated by the host; the guest allocates the frame data buffers and
76  * vhost uses them directly.
77  */
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80 	+ RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82 
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92 
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101 
102 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16 	/* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
105 
106 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
108 
109 #define JUMBO_FRAME_MAX_SIZE    0x2600
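/* 0x2600 = 9728 bytes. */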
110 
111 /* State of virtio device. */
112 #define DEVICE_MAC_LEARNING 0
113 #define DEVICE_RX			1
114 #define DEVICE_SAFE_REMOVE	2
115 
116 /* Config_core_flag status definitions. */
117 #define REQUEST_DEV_REMOVAL 1
118 #define ACK_DEV_REMOVAL 0
119 
120 /* Configurable number of RX/TX ring descriptors */
121 #define RTE_TEST_RX_DESC_DEFAULT 1024
122 #define RTE_TEST_TX_DESC_DEFAULT 512
123 
124 /*
125  * These two macros need refining for the legacy and DPDK based front ends:
126  * take the max number of available vring descriptors/entries from the guest
127  * minus MAX_PKT_BURST, and then adjust to a power of 2.
128  */
129 /*
130  * For legacy front end, 128 descriptors,
131  * half for virtio header, another half for mbuf.
132  */
133 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
134 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
135 
136 /* Get first 4 bytes in mbuf headroom. */
137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138 		+ sizeof(struct rte_mbuf)))
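/*
 * In the zero copy path these four bytes are used to stash the virtio
 * descriptor index that the mbuf is attached to; see attach_rxmbuf_zcp(),
 * txmbuf_clean_zcp() and virtio_dev_rx_zcp() below.
 */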
139 
140 /* true if x is a power of 2 */
141 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
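/* For example POWEROF2(64) is true (63 & 64 == 0) while POWEROF2(48) is false
 * (47 & 48 == 32); note that 0 also passes this test. */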
142 
143 #define INVALID_PORT_ID 0xFF
144 
145 /* Max number of devices. Limited by vmdq. */
146 #define MAX_DEVICES 64
147 
148 /* Size of buffers used for snprintfs. */
149 #define MAX_PRINT_BUFF 6072
150 
151 /* Maximum character device basename size. */
152 #define MAX_BASENAME_SZ 10
153 
154 /* Maximum long option length for option parsing. */
155 #define MAX_LONG_OPT_SZ 64
156 
157 /* Used to compare MAC addresses. */
158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
159 
160 /* Number of descriptors per cacheline. */
161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
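/* For example, with a 64-byte cache line and a 16-byte struct vring_desc this is 4. */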
162 
163 /* mask of enabled ports */
164 static uint32_t enabled_port_mask = 0;
165 
166 /*Number of switching cores enabled*/
167 static uint32_t num_switching_cores = 0;
168 
169 /* number of devices/queues to support*/
170 static uint32_t num_queues = 0;
171 uint32_t num_devices = 0;
172 
173 /*
174  * Enable zero copy: packet buffers are DMAed directly to/from the guest
175  * buffers referenced by the HW descriptors. Disabled by default.
176  */
177 static uint32_t zero_copy;
178 
179 /* number of descriptors to apply*/
180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182 
183 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
184 #define MAX_RING_DESC 4096
185 
186 struct vpool {
187 	struct rte_mempool *pool;
188 	struct rte_ring *ring;
189 	uint32_t buf_size;
190 } vpool_array[MAX_QUEUES+MAX_QUEUES];
191 
192 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
193 typedef enum {
194 	VM2VM_DISABLED = 0,
195 	VM2VM_SOFTWARE = 1,
196 	VM2VM_HARDWARE = 2,
197 	VM2VM_LAST
198 } vm2vm_type;
199 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200 
201 /* The type of host physical address translated from guest physical address. */
202 typedef enum {
203 	PHYS_ADDR_CONTINUOUS = 0,
204 	PHYS_ADDR_CROSS_SUBREG = 1,
205 	PHYS_ADDR_INVALID = 2,
206 	PHYS_ADDR_LAST
207 } hpa_type;
208 
209 /* Enable stats. */
210 static uint32_t enable_stats = 0;
211 /* Enable retries on RX. */
212 static uint32_t enable_retry = 1;
213 /* Specify timeout (in useconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217 
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220 
221 /* Character device index. Can be set by user. */
222 static uint32_t dev_index = 0;
223 
224 /* This can be set by the user so it is made available here. */
225 extern uint64_t VHOST_FEATURES;
226 
227 /* Default configuration for rx and tx thresholds etc. */
228 static struct rte_eth_rxconf rx_conf_default = {
229 	.rx_thresh = {
230 		.pthresh = RX_PTHRESH,
231 		.hthresh = RX_HTHRESH,
232 		.wthresh = RX_WTHRESH,
233 	},
234 	.rx_drop_en = 1,
235 };
236 
237 /*
238  * These default values are optimized for use with the Intel(R) 82599 10 GbE
239  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240  * network controllers and/or network drivers.
241  */
242 static struct rte_eth_txconf tx_conf_default = {
243 	.tx_thresh = {
244 		.pthresh = TX_PTHRESH,
245 		.hthresh = TX_HTHRESH,
246 		.wthresh = TX_WTHRESH,
247 	},
248 	.tx_free_thresh = 0, /* Use PMD default values */
249 	.tx_rs_thresh = 0, /* Use PMD default values */
250 };
251 
252 /* Empty vmdq configuration structure. Filled in programmatically. */
253 static struct rte_eth_conf vmdq_conf_default = {
254 	.rxmode = {
255 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
256 		.split_hdr_size = 0,
257 		.header_split   = 0, /**< Header Split disabled */
258 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
259 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
260 		/*
261 		 * This is necessary for 1G NICs such as the I350; it fixes a bug
262 		 * where IPv4 forwarding in the guest could not forward packets
263 		 * from one virtio device to another.
264 		 */
265 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
266 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
267 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
268 	},
269 
270 	.txmode = {
271 		.mq_mode = ETH_MQ_TX_NONE,
272 	},
273 	.rx_adv_conf = {
274 		/*
275 		 * should be overridden separately in code with
276 		 * appropriate values
277 		 */
278 		.vmdq_rx_conf = {
279 			.nb_queue_pools = ETH_8_POOLS,
280 			.enable_default_pool = 0,
281 			.default_pool = 0,
282 			.nb_pool_maps = 0,
283 			.pool_map = {{0, 0},},
284 		},
285 	},
286 };
287 
288 static unsigned lcore_ids[RTE_MAX_LCORE];
289 static uint8_t ports[RTE_MAX_ETHPORTS];
290 static unsigned num_ports = 0; /**< The number of ports specified in command line */
291 
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
293 const uint16_t vlan_tags[] = {
294 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
296 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302 };
303 
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306 
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
310 
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
313 
314 /* Used for queueing bursts of TX packets. */
315 struct mbuf_table {
316 	unsigned len;
317 	unsigned txq_id;
318 	struct rte_mbuf *m_table[MAX_PKT_BURST];
319 };
320 
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323 
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326 
327 /* Vlan header struct used to insert vlan tags on TX. */
328 struct vlan_ethhdr {
329 	unsigned char   h_dest[ETH_ALEN];
330 	unsigned char   h_source[ETH_ALEN];
331 	__be16          h_vlan_proto;
332 	__be16          h_vlan_TCI;
333 	__be16          h_vlan_encapsulated_proto;
334 };
335 
336 /* IPv4 Header */
337 struct ipv4_hdr {
338 	uint8_t  version_ihl;		/**< version and header length */
339 	uint8_t  type_of_service;	/**< type of service */
340 	uint16_t total_length;		/**< length of packet */
341 	uint16_t packet_id;		/**< packet ID */
342 	uint16_t fragment_offset;	/**< fragmentation offset */
343 	uint8_t  time_to_live;		/**< time to live */
344 	uint8_t  next_proto_id;		/**< protocol ID */
345 	uint16_t hdr_checksum;		/**< header checksum */
346 	uint32_t src_addr;		/**< source address */
347 	uint32_t dst_addr;		/**< destination address */
348 } __attribute__((__packed__));
349 
350 /* Header lengths. */
351 #define VLAN_HLEN       4
352 #define VLAN_ETH_HLEN   18
353 
354 /* Per-device statistics struct */
355 struct device_statistics {
356 	uint64_t tx_total;
357 	rte_atomic64_t rx_total_atomic;
358 	uint64_t rx_total;
359 	uint64_t tx;
360 	rte_atomic64_t rx_atomic;
361 	uint64_t rx;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
364 
365 /*
366  * Builds up the correct configuration for VMDQ VLAN pool map
367  * according to the pool & queue limits.
368  */
369 static inline int
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371 {
372 	struct rte_eth_vmdq_rx_conf conf;
373 	unsigned i;
374 
375 	memset(&conf, 0, sizeof(conf));
376 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377 	conf.nb_pool_maps = num_devices;
378 	conf.enable_loop_back =
379 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
380 
381 	for (i = 0; i < conf.nb_pool_maps; i++) {
382 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
383 		conf.pool_map[i].pools = (1UL << i);
384 	}
385 
386 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
389 	return 0;
390 }
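/*
 * For example, with num_devices = 8 the loop above maps pools 0..7 to VLAN
 * tags 1000..1007 (see vlan_tags[]), selecting pool i with the bitmask
 * (1UL << i).
 */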
391 
392 /*
393  * Validate the device number against the max pool number obtained from
394  * dev_info. If the device number is invalid, print an error message and
395  * return -1. Each device must have its own pool.
396  */
397 static inline int
398 validate_num_devices(uint32_t max_nb_devices)
399 {
400 	if (num_devices > max_nb_devices) {
401 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402 		return -1;
403 	}
404 	return 0;
405 }
406 
407 /*
408  * Initialises a given port using global settings and with the rx buffers
409  * coming from the mbuf_pool passed as parameter
410  */
411 static inline int
412 port_init(uint8_t port)
413 {
414 	struct rte_eth_dev_info dev_info;
415 	struct rte_eth_conf port_conf;
416 	uint16_t rx_rings, tx_rings;
417 	uint16_t rx_ring_size, tx_ring_size;
418 	int retval;
419 	uint16_t q;
420 
421 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422 	rte_eth_dev_info_get (port, &dev_info);
423 
424 	/*configure the number of supported virtio devices based on VMDQ limits */
425 	num_devices = dev_info.max_vmdq_pools;
426 	num_queues = dev_info.max_rx_queues;
427 
428 	if (zero_copy) {
429 		rx_ring_size = num_rx_descriptor;
430 		tx_ring_size = num_tx_descriptor;
431 		tx_rings = dev_info.max_tx_queues;
432 	} else {
433 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435 		tx_rings = (uint16_t)rte_lcore_count();
436 	}
437 
438 	retval = validate_num_devices(MAX_DEVICES);
439 	if (retval < 0)
440 		return retval;
441 
442 	/* Get port configuration. */
443 	retval = get_eth_conf(&port_conf, num_devices);
444 	if (retval < 0)
445 		return retval;
446 
447 	if (port >= rte_eth_dev_count()) return -1;
448 
449 	rx_rings = (uint16_t)num_queues;
450 	/* Configure ethernet device. */
451 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 	if (retval != 0)
453 		return retval;
454 
455 	/* Setup the queues. */
456 	for (q = 0; q < rx_rings; q ++) {
457 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 						rte_eth_dev_socket_id(port), &rx_conf_default,
459 						vpool_array[q].pool);
460 		if (retval < 0)
461 			return retval;
462 	}
463 	for (q = 0; q < tx_rings; q ++) {
464 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465 						rte_eth_dev_socket_id(port), &tx_conf_default);
466 		if (retval < 0)
467 			return retval;
468 	}
469 
470 	/* Start the device. */
471 	retval  = rte_eth_dev_start(port);
472 	if (retval < 0) {
473 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474 		return retval;
475 	}
476 
477 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481 			(unsigned)port,
482 			vmdq_ports_eth_addr[port].addr_bytes[0],
483 			vmdq_ports_eth_addr[port].addr_bytes[1],
484 			vmdq_ports_eth_addr[port].addr_bytes[2],
485 			vmdq_ports_eth_addr[port].addr_bytes[3],
486 			vmdq_ports_eth_addr[port].addr_bytes[4],
487 			vmdq_ports_eth_addr[port].addr_bytes[5]);
488 
489 	return 0;
490 }
491 
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498 	/* parse number string */
499 	/* Reject basenames that do not fit (with NUL terminator) in dev_basename. */
500 
501 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
502 	else
503 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504 
505 	return 0;
506 }
507 
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514 	char *end = NULL;
515 	unsigned long pm;
516 
517 	errno = 0;
518 
519 	/* parse hexadecimal string */
520 	pm = strtoul(portmask, &end, 16);
521 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522 		return -1;
523 
524 	if (pm == 0)
525 		return -1;
526 
527 	return pm;
528 
529 }
530 
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537 	char *end = NULL;
538 	unsigned long num;
539 
540 	errno = 0;
541 
542 	/* parse unsigned int string */
543 	num = strtoul(q_arg, &end, 10);
544 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545 		return -1;
546 
547 	if (num > max_valid_value)
548 		return -1;
549 
550 	return num;
551 
552 }
553 
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561 	"		--vm2vm [0|1|2]\n"
562 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 	"		--dev-basename <name> --dev-index [0-N]\n"
564 	"		--nb-devices ND\n"
565 	"		-p PORTMASK: Set mask for ports to be used by application\n"
566 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Retry if the destination queue is full\n"
568 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
569 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
570 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572 	"		--dev-basename: The basename to be used for the character device.\n"
573 	"		--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
575 			"zero copy\n"
576 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
577 			"used only when zero copy is enabled.\n"
578 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
579 			"used only when zero copy is enabled.\n",
580 	       prgname);
581 }
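/*
 * Illustrative invocation only (the binary name and EAL options will vary
 * with the build and platform):
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 * i.e. use the port in mask 0x1, software vm2vm switching, and print stats
 * every 2 seconds.
 */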
582 
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589 	int opt, ret;
590 	int option_index;
591 	unsigned i;
592 	const char *prgname = argv[0];
593 	static struct option long_option[] = {
594 		{"vm2vm", required_argument, NULL, 0},
595 		{"rx-retry", required_argument, NULL, 0},
596 		{"rx-retry-delay", required_argument, NULL, 0},
597 		{"rx-retry-num", required_argument, NULL, 0},
598 		{"mergeable", required_argument, NULL, 0},
599 		{"stats", required_argument, NULL, 0},
600 		{"dev-basename", required_argument, NULL, 0},
601 		{"dev-index", required_argument, NULL, 0},
602 		{"zero-copy", required_argument, NULL, 0},
603 		{"rx-desc-num", required_argument, NULL, 0},
604 		{"tx-desc-num", required_argument, NULL, 0},
605 		{NULL, 0, 0, 0},
606 	};
607 
608 	/* Parse command line */
609 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
610 		switch (opt) {
611 		/* Portmask */
612 		case 'p':
613 			enabled_port_mask = parse_portmask(optarg);
614 			if (enabled_port_mask == 0) {
615 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616 				us_vhost_usage(prgname);
617 				return -1;
618 			}
619 			break;
620 
621 		case 0:
622 			/* Enable/disable vm2vm comms. */
623 			if (!strncmp(long_option[option_index].name, "vm2vm",
624 				MAX_LONG_OPT_SZ)) {
625 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
626 				if (ret == -1) {
627 					RTE_LOG(INFO, VHOST_CONFIG,
628 						"Invalid argument for "
629 						"vm2vm [0|1|2]\n");
630 					us_vhost_usage(prgname);
631 					return -1;
632 				} else {
633 					vm2vm_mode = (vm2vm_type)ret;
634 				}
635 			}
636 
637 			/* Enable/disable retries on RX. */
638 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639 				ret = parse_num_opt(optarg, 1);
640 				if (ret == -1) {
641 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642 					us_vhost_usage(prgname);
643 					return -1;
644 				} else {
645 					enable_retry = ret;
646 				}
647 			}
648 
649 			/* Specify the retries delay time (in useconds) on RX. */
650 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651 				ret = parse_num_opt(optarg, INT32_MAX);
652 				if (ret == -1) {
653 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654 					us_vhost_usage(prgname);
655 					return -1;
656 				} else {
657 					burst_rx_delay_time = ret;
658 				}
659 			}
660 
661 			/* Specify the retries number on RX. */
662 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663 				ret = parse_num_opt(optarg, INT32_MAX);
664 				if (ret == -1) {
665 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666 					us_vhost_usage(prgname);
667 					return -1;
668 				} else {
669 					burst_rx_retry_num = ret;
670 				}
671 			}
672 
673 			/* Enable/disable RX mergeable buffers. */
674 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675 				ret = parse_num_opt(optarg, 1);
676 				if (ret == -1) {
677 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678 					us_vhost_usage(prgname);
679 					return -1;
680 				} else {
681 					if (ret) {
682 						vmdq_conf_default.rxmode.jumbo_frame = 1;
683 						vmdq_conf_default.rxmode.max_rx_pkt_len
684 							= JUMBO_FRAME_MAX_SIZE;
685 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
686 					}
687 				}
688 			}
689 
690 			/* Enable/disable stats. */
691 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, INT32_MAX);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695 					us_vhost_usage(prgname);
696 					return -1;
697 				} else {
698 					enable_stats = ret;
699 				}
700 			}
701 
702 			/* Set character device basename. */
703 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704 				if (us_vhost_parse_basename(optarg) == -1) {
705 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706 					us_vhost_usage(prgname);
707 					return -1;
708 				}
709 			}
710 
711 			/* Set character device index. */
712 			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713 				ret = parse_num_opt(optarg, INT32_MAX);
714 				if (ret == -1) {
715 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716 					us_vhost_usage(prgname);
717 					return -1;
718 				} else
719 					dev_index = ret;
720 			}
721 
722 			/* Enable/disable rx/tx zero copy. */
723 			if (!strncmp(long_option[option_index].name,
724 				"zero-copy", MAX_LONG_OPT_SZ)) {
725 				ret = parse_num_opt(optarg, 1);
726 				if (ret == -1) {
727 					RTE_LOG(INFO, VHOST_CONFIG,
728 						"Invalid argument"
729 						" for zero-copy [0|1]\n");
730 					us_vhost_usage(prgname);
731 					return -1;
732 				} else
733 					zero_copy = ret;
734 
735 				if (zero_copy) {
736 #ifdef RTE_MBUF_REFCNT
737 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738 					"zero copy vhost APP, please "
739 					"disable RTE_MBUF_REFCNT\n"
740 					"in config file and then rebuild DPDK "
741 					"core lib!\n"
742 					"Otherwise please disable zero copy "
743 					"flag in command line!\n");
744 					return -1;
745 #endif
746 				}
747 			}
748 
749 			/* Specify the descriptor number on RX. */
750 			if (!strncmp(long_option[option_index].name,
751 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
752 				ret = parse_num_opt(optarg, MAX_RING_DESC);
753 				if ((ret == -1) || (!POWEROF2(ret))) {
754 					RTE_LOG(INFO, VHOST_CONFIG,
755 					"Invalid argument for rx-desc-num [0-N], "
756 					"power of 2 required.\n");
757 					us_vhost_usage(prgname);
758 					return -1;
759 				} else {
760 					num_rx_descriptor = ret;
761 				}
762 			}
763 
764 			/* Specify the descriptor number on TX. */
765 			if (!strncmp(long_option[option_index].name,
766 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
767 				ret = parse_num_opt(optarg, MAX_RING_DESC);
768 				if ((ret == -1) || (!POWEROF2(ret))) {
769 					RTE_LOG(INFO, VHOST_CONFIG,
770 					"Invalid argument for tx-desc-num [0-N], "
771 					"power of 2 required.\n");
772 					us_vhost_usage(prgname);
773 					return -1;
774 				} else {
775 					num_tx_descriptor = ret;
776 				}
777 			}
778 
779 			break;
780 
781 			/* Invalid option - print options. */
782 		default:
783 			us_vhost_usage(prgname);
784 			return -1;
785 		}
786 	}
787 
788 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789 		if (enabled_port_mask & (1 << i))
790 			ports[num_ports++] = (uint8_t)i;
791 	}
792 
793 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
794 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
795 			"but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
796 		return -1;
797 	}
798 
799 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800 		RTE_LOG(INFO, VHOST_PORT,
801 			"Vhost zero copy doesn't support software vm2vm, "
802 			"please specify '--vm2vm 2' to use hardware vm2vm.\n");
803 		return -1;
804 	}
805 
806 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807 		RTE_LOG(INFO, VHOST_PORT,
808 			"Vhost zero copy doesn't support jumbo frames, "
809 			"please specify '--mergeable 0' to disable the "
810 			"mergeable feature.\n");
811 		return -1;
812 	}
813 
814 	return 0;
815 }
816 
817 /*
818  * Update the global variable num_ports and the ports[] array according to the
819  * number of ports in the system, and return the number of valid ports.
820  */
821 static unsigned check_ports_num(unsigned nb_ports)
822 {
823 	unsigned valid_num_ports = num_ports;
824 	unsigned portid;
825 
826 	if (num_ports > nb_ports) {
827 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828 			num_ports, nb_ports);
829 		num_ports = nb_ports;
830 	}
831 
832 	for (portid = 0; portid < num_ports; portid ++) {
833 		if (ports[portid] >= nb_ports) {
834 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835 				ports[portid], (nb_ports - 1));
836 			ports[portid] = INVALID_PORT_ID;
837 			valid_num_ports--;
838 		}
839 	}
840 	return valid_num_ports;
841 }
842 
843 /*
844  * Macro to print out packet contents. Wrapped in debug define so that the
845  * data path is not affected when debug is disabled.
846  */
847 #ifdef DEBUG
848 #define PRINT_PACKET(device, addr, size, header) do {																\
849 	char *pkt_addr = (char*)(addr);																					\
850 	unsigned int index;																								\
851 	char packet[MAX_PRINT_BUFF];																					\
852 																													\
853 	if ((header))																									\
854 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
855 	else																											\
856 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
857 	for (index = 0; index < (size); index++) {																		\
858 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
859 			"%02hhx ", pkt_addr[index]);																			\
860 	}																												\
861 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
862 																													\
863 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
864 } while(0)
865 #else
866 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
867 #endif
868 
869 /*
870  * Function to convert guest physical addresses to vhost physical addresses.
871  * This is used to convert virtio buffer addresses.
872  */
873 static inline uint64_t __attribute__((always_inline))
874 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
875 	uint32_t buf_len, hpa_type *addr_type)
876 {
877 	struct virtio_memory_regions_hpa *region;
878 	uint32_t regionidx;
879 	uint64_t vhost_pa = 0;
880 
881 	*addr_type = PHYS_ADDR_INVALID;
882 
883 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
884 		region = &vdev->regions_hpa[regionidx];
885 		if ((guest_pa >= region->guest_phys_address) &&
886 			(guest_pa <= region->guest_phys_address_end)) {
887 			vhost_pa = region->host_phys_addr_offset + guest_pa;
888 			if (likely((guest_pa + buf_len - 1)
889 				<= region->guest_phys_address_end))
890 				*addr_type = PHYS_ADDR_CONTINUOUS;
891 			else
892 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
893 			break;
894 		}
895 	}
896 
897 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
898 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
899 		(void *)(uintptr_t)vhost_pa);
900 
901 	return vhost_pa;
902 }
903 
904 /*
905  * Compares a packet destination MAC address to a device MAC address.
906  */
907 static inline int __attribute__((always_inline))
908 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
909 {
910 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
911 }
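/*
 * The comparison above loads 8 bytes from each 6-byte ether_addr and masks the
 * XOR with MAC_ADDR_CMP (0xFFFFFFFFFFFF); on a little-endian host this keeps
 * only the low 48 bits, i.e. the MAC bytes, and ignores the 2 trailing bytes.
 */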
912 
913 /*
914  * This function learns the MAC address of the device and registers it, along
915  * with a VLAN tag, with a VMDQ pool.
916  */
917 static int
918 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
919 {
920 	struct ether_hdr *pkt_hdr;
921 	struct virtio_net_data_ll *dev_ll;
922 	struct virtio_net *dev = vdev->dev;
923 	int i, ret;
924 
925 	/* Learn MAC address of guest device from packet */
926 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
927 
928 	dev_ll = ll_root_used;
929 
930 	while (dev_ll != NULL) {
931 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
932 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
933 			return -1;
934 		}
935 		dev_ll = dev_ll->next;
936 	}
937 
938 	for (i = 0; i < ETHER_ADDR_LEN; i++)
939 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
940 
941 	/* vlan_tag currently uses the device_id. */
942 	vdev->vlan_tag = vlan_tags[dev->device_fh];
943 
944 	/* Print out VMDQ registration info. */
945 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
946 		dev->device_fh,
947 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
948 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
949 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
950 		vdev->vlan_tag);
951 
952 	/* Register the MAC address. */
953 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
954 	if (ret)
955 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
956 					dev->device_fh);
957 
958 	/* Enable stripping of the vlan tag as we handle routing. */
959 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
960 
961 	/* Set device as ready for RX. */
962 	vdev->ready = DEVICE_RX;
963 
964 	return 0;
965 }
966 
967 /*
968  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
969  * queue before disabling RX on the device.
970  */
971 static inline void
972 unlink_vmdq(struct vhost_dev *vdev)
973 {
974 	unsigned i = 0;
975 	unsigned rx_count;
976 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
977 
978 	if (vdev->ready == DEVICE_RX) {
979 		/*clear MAC and VLAN settings*/
980 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
981 		for (i = 0; i < 6; i++)
982 			vdev->mac_address.addr_bytes[i] = 0;
983 
984 		vdev->vlan_tag = 0;
985 
986 		/*Clear out the receive buffers*/
987 		rx_count = rte_eth_rx_burst(ports[0],
988 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
989 
990 		while (rx_count) {
991 			for (i = 0; i < rx_count; i++)
992 				rte_pktmbuf_free(pkts_burst[i]);
993 
994 			rx_count = rte_eth_rx_burst(ports[0],
995 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
996 		}
997 
998 		vdev->ready = DEVICE_MAC_LEARNING;
999 	}
1000 }
1001 
1002 /*
1003  * Check if the packet destination MAC address is for a local device. If so then put
1004  * the packet on that device's RX queue. If not then return.
1005  */
1006 static inline unsigned __attribute__((always_inline))
1007 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1008 {
1009 	struct virtio_net_data_ll *dev_ll;
1010 	struct ether_hdr *pkt_hdr;
1011 	uint64_t ret = 0;
1012 	struct virtio_net *dev = vdev->dev;
1013 	struct virtio_net *tdev; /* destination virtio device */
1014 
1015 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1016 
1017 	/*get the used devices list*/
1018 	dev_ll = ll_root_used;
1019 
1020 	while (dev_ll != NULL) {
1021 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1022 				          &dev_ll->vdev->mac_address)) {
1023 
1024 			/* Drop the packet if the TX packet is destined for the TX device. */
1025 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1026 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1027 							dev->device_fh);
1028 				return 0;
1029 			}
1030 			tdev = dev_ll->vdev->dev;
1031 
1032 
1033 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1034 
1035 			if (dev_ll->vdev->remove) {
1036 				/*drop the packet if the device is marked for removal*/
1037 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1038 			} else {
1039 				uint32_t mergeable =
1040 					tdev->features &
1041 					(1 << VIRTIO_NET_F_MRG_RXBUF);
1042 
1043 				/* Send the packet to the local virtio device. */
1044 				if (likely(mergeable == 0))
1045 					ret = virtio_dev_rx(tdev, &m, 1);
1046 				else
1047 					ret = virtio_dev_merge_rx(tdev,
1048 						&m, 1);
1049 
1050 				if (enable_stats) {
1051 					rte_atomic64_add(
1052 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1053 					1);
1054 					rte_atomic64_add(
1055 					&dev_statistics[tdev->device_fh].rx_atomic,
1056 					ret);
1057 					dev_statistics[tdev->device_fh].tx_total++;
1058 					dev_statistics[tdev->device_fh].tx += ret;
1059 				}
1060 			}
1061 
1062 			return 0;
1063 		}
1064 		dev_ll = dev_ll->next;
1065 	}
1066 
1067 	return -1;
1068 }
1069 
1070 /*
1071  * This function routes the TX packet to the correct interface. This may be a local device
1072  * or the physical port.
1073  */
1074 static inline void __attribute__((always_inline))
1075 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1076 {
1077 	struct mbuf_table *tx_q;
1078 	struct vlan_ethhdr *vlan_hdr;
1079 	struct rte_mbuf **m_table;
1080 	struct rte_mbuf *mbuf, *prev;
1081 	unsigned len, ret, offset = 0;
1082 	const uint16_t lcore_id = rte_lcore_id();
1083 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1084 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1085 	struct virtio_net *dev = vdev->dev;
1086 
1087 	/*check if destination is local VM*/
1088 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1089 		return;
1090 
1091 	if (vm2vm_mode == VM2VM_HARDWARE) {
1092 		while (dev_ll != NULL) {
1093 			if ((dev_ll->vdev->ready == DEVICE_RX)
1094 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1095 				&dev_ll->vdev->mac_address)) {
1096 				/*
1097 				 * Drop the packet if the TX packet is
1098 				 * destined for the TX device.
1099 				 */
1100 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1101 					LOG_DEBUG(VHOST_DATA,
1102 					"(%"PRIu64") TX: Source and destination"
1103 					" MAC addresses are the same. Dropping "
1104 					"packet.\n",
1105 					dev_ll->vdev->dev->device_fh);
1106 					return;
1107 				}
1108 				offset = 4;
1109 				vlan_tag =
1110 				(uint16_t)
1111 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1112 
1113 				LOG_DEBUG(VHOST_DATA,
1114 				"(%"PRIu64") TX: pkt to local VM device id:"
1115 				"(%"PRIu64") vlan tag: %d.\n",
1116 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1117 				vlan_tag);
1118 
1119 				break;
1120 			}
1121 			dev_ll = dev_ll->next;
1122 		}
1123 	}
1124 
1125 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1126 
1127 	/*Add packet to the port tx queue*/
1128 	tx_q = &lcore_tx_queue[lcore_id];
1129 	len = tx_q->len;
1130 
1131 	/* Allocate an mbuf and populate the structure. */
1132 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1133 	if (unlikely(mbuf == NULL)) {
1134 		RTE_LOG(ERR, VHOST_DATA,
1135 			"Failed to allocate memory for mbuf.\n");
1136 		return;
1137 	}
1138 
1139 	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1140 	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1141 	mbuf->nb_segs = m->nb_segs;
1142 
1143 	/* Copy ethernet header to mbuf. */
1144 	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1145 		rte_pktmbuf_mtod(m, const void *),
1146 		ETH_HLEN);
1147 
1148 
1149 	/* Set up the vlan header. Fields are converted to network byte order with htons(). */
1150 	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1151 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1152 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1153 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1154 
1155 	/* Copy the remaining packet contents to the mbuf. */
1156 	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1157 		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1158 		(m->data_len - ETH_HLEN));
1159 
1160 	/* Copy the remaining segments for the whole packet. */
1161 	prev = mbuf;
1162 	while (m->next) {
1163 		/* Allocate an mbuf and populate the structure. */
1164 		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1165 		if (unlikely(next_mbuf == NULL)) {
1166 			rte_pktmbuf_free(mbuf);
1167 			RTE_LOG(ERR, VHOST_DATA,
1168 				"Failed to allocate memory for mbuf.\n");
1169 			return;
1170 		}
1171 
1172 		m = m->next;
1173 		prev->next = next_mbuf;
1174 		prev = next_mbuf;
1175 		next_mbuf->data_len = m->data_len;
1176 
1177 		/* Copy data to next mbuf. */
1178 		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1179 			rte_pktmbuf_mtod(m, const void *), m->data_len);
1180 	}
1181 
1182 	tx_q->m_table[len] = mbuf;
1183 	len++;
1184 	if (enable_stats) {
1185 		dev_statistics[dev->device_fh].tx_total++;
1186 		dev_statistics[dev->device_fh].tx++;
1187 	}
1188 
1189 	if (unlikely(len == MAX_PKT_BURST)) {
1190 		m_table = (struct rte_mbuf **)tx_q->m_table;
1191 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1192 		/* Free any buffers not handled by TX and update the port stats. */
1193 		if (unlikely(ret < len)) {
1194 			do {
1195 				rte_pktmbuf_free(m_table[ret]);
1196 			} while (++ret < len);
1197 		}
1198 
1199 		len = 0;
1200 	}
1201 
1202 	tx_q->len = len;
1203 	return;
1204 }
1205 /*
1206  * This function is called by each data core. It handles all RX/TX registered with the
1207  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1208  * with all devices in the main linked list.
1209  */
1210 static int
1211 switch_worker(void *arg)
1212 {
1213 	struct rte_mempool *mbuf_pool = arg;
1214 	struct virtio_net *dev = NULL;
1215 	struct vhost_dev *vdev = NULL;
1216 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1217 	struct virtio_net_data_ll *dev_ll;
1218 	struct mbuf_table *tx_q;
1219 	volatile struct lcore_ll_info *lcore_ll;
1220 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1221 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1222 	unsigned ret, i;
1223 	const uint16_t lcore_id = rte_lcore_id();
1224 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1225 	uint16_t rx_count = 0;
1226 	uint32_t mergeable = 0;
1227 
1228 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1229 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1230 	prev_tsc = 0;
1231 
1232 	tx_q = &lcore_tx_queue[lcore_id];
1233 	for (i = 0; i < num_cores; i ++) {
1234 		if (lcore_ids[i] == lcore_id) {
1235 			tx_q->txq_id = i;
1236 			break;
1237 		}
1238 	}
1239 
1240 	while(1) {
1241 		cur_tsc = rte_rdtsc();
1242 		/*
1243 		 * TX burst queue drain
1244 		 */
1245 		diff_tsc = cur_tsc - prev_tsc;
1246 		if (unlikely(diff_tsc > drain_tsc)) {
1247 
1248 			if (tx_q->len) {
1249 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1250 
1251 				/*Tx any packets in the queue*/
1252 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1253 									   (struct rte_mbuf **)tx_q->m_table,
1254 									   (uint16_t)tx_q->len);
1255 				if (unlikely(ret < tx_q->len)) {
1256 					do {
1257 						rte_pktmbuf_free(tx_q->m_table[ret]);
1258 					} while (++ret < tx_q->len);
1259 				}
1260 
1261 				tx_q->len = 0;
1262 			}
1263 
1264 			prev_tsc = cur_tsc;
1265 
1266 		}
1267 
1268 		rte_prefetch0(lcore_ll->ll_root_used);
1269 		/*
1270 		 * Inform the configuration core that we have exited the linked list and that no devices are
1271 		 * in use if requested.
1272 		 */
1273 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1274 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1275 
1276 		/*
1277 		 * Process devices
1278 		 */
1279 		dev_ll = lcore_ll->ll_root_used;
1280 
1281 		while (dev_ll != NULL) {
1282 			/*get virtio device ID*/
1283 			vdev = dev_ll->vdev;
1284 			dev = vdev->dev;
1285 			mergeable =
1286 				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
1287 
1288 			if (vdev->remove) {
1289 				dev_ll = dev_ll->next;
1290 				unlink_vmdq(vdev);
1291 				vdev->ready = DEVICE_SAFE_REMOVE;
1292 				continue;
1293 			}
1294 			if (likely(vdev->ready == DEVICE_RX)) {
1295 				/*Handle guest RX*/
1296 				rx_count = rte_eth_rx_burst(ports[0],
1297 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298 
1299 				if (rx_count) {
1300 					if (likely(mergeable == 0))
1301 						ret_count =
1302 							virtio_dev_rx(dev,
1303 							pkts_burst, rx_count);
1304 					else
1305 						ret_count =
1306 							virtio_dev_merge_rx(dev,
1307 							pkts_burst, rx_count);
1308 
1309 					if (enable_stats) {
1310 						rte_atomic64_add(
1311 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1312 						rx_count);
1313 						rte_atomic64_add(
1314 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1315 					}
1316 					while (likely(rx_count)) {
1317 						rx_count--;
1318 						rte_pktmbuf_free(pkts_burst[rx_count]);
1319 					}
1320 
1321 				}
1322 			}
1323 
1324 			if (!vdev->remove) {
1325 				/*Handle guest TX*/
1326 				if (likely(mergeable == 0))
1327 					virtio_dev_tx(dev, mbuf_pool);
1328 				else
1329 					virtio_dev_merge_tx(dev, mbuf_pool);
1330 			}
1331 
1332 			/*move to the next device in the list*/
1333 			dev_ll = dev_ll->next;
1334 		}
1335 	}
1336 
1337 	return 0;
1338 }
1339 
1340 /*
1341  * This function gets the number of available ring entries for zero copy rx.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_num_zcp(struct virtio_net *dev)
1347 {
1348 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1349 	uint16_t avail_idx;
1350 
1351 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1352 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1353 }
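/*
 * Note: avail->idx and last_used_idx_res are free-running uint16_t counters,
 * so the unsigned 16-bit subtraction above stays correct across wrap-around.
 */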
1354 
1355 /*
1356  * This function reserves available ring indices for zero copy rx; it will
1357  * retry up to 'burst_rx_retry_num' times until it gets enough ring entries.
1358  * Only one thread will call this function for a particular virtio device,
1359  * so it is designed as a non-thread-safe function.
1360  */
1361 static inline uint32_t __attribute__((always_inline))
1362 get_available_ring_index_zcp(struct virtio_net *dev,
1363 	uint16_t *res_base_idx, uint32_t count)
1364 {
1365 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1366 	uint16_t avail_idx;
1367 	uint32_t retry = 0;
1368 	uint16_t free_entries;
1369 
1370 	*res_base_idx = vq->last_used_idx_res;
1371 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372 	free_entries = (avail_idx - *res_base_idx);
1373 
1374 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1375 			"avail idx: %d, "
1376 			"res base idx:%d, free entries:%d\n",
1377 			dev->device_fh, avail_idx, *res_base_idx,
1378 			free_entries);
1379 
1380 	/*
1381 	 * If retry is enabled and the queue is full then we wait
1382 	 * and retry to avoid packet loss.
1383 	 */
1384 	if (enable_retry && unlikely(count > free_entries)) {
1385 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1386 			rte_delay_us(burst_rx_delay_time);
1387 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1388 			free_entries = (avail_idx - *res_base_idx);
1389 			if (count <= free_entries)
1390 				break;
1391 		}
1392 	}
1393 
1394 	/*check that we have enough buffers*/
1395 	if (unlikely(count > free_entries))
1396 		count = free_entries;
1397 
1398 	if (unlikely(count == 0)) {
1399 		LOG_DEBUG(VHOST_DATA,
1400 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1401 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1402 			dev->device_fh, avail_idx,
1403 			*res_base_idx, free_entries);
1404 		return 0;
1405 	}
1406 
1407 	vq->last_used_idx_res = *res_base_idx + count;
1408 
1409 	return count;
1410 }
1411 
1412 /*
1413  * This function puts a descriptor back on the used ring.
1414  */
1415 static inline void __attribute__((always_inline))
1416 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1417 {
1418 	uint16_t res_cur_idx = vq->last_used_idx;
1419 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1420 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1421 	rte_compiler_barrier();
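	/*
	 * The barrier above ensures the used ring entry is written before
	 * used->idx is advanced, so the guest never sees a stale entry.
	 */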
1422 	*(volatile uint16_t *)&vq->used->idx += 1;
1423 	vq->last_used_idx += 1;
1424 
1425 	/* Kick the guest if necessary. */
1426 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1427 		eventfd_write((int)vq->kickfd, 1);
1428 }
1429 
1430 /*
1431  * This function gets an available descriptor from the virtio vring and an
1432  * unattached mbuf from vpool->ring, and attaches them together. It must adjust
1433  * the offsets for buff_addr and phys_addr according to the PMD implementation,
1434  * otherwise the frame data may be placed at the wrong location in the mbuf.
1435  */
1436 static inline void __attribute__((always_inline))
1437 attach_rxmbuf_zcp(struct virtio_net *dev)
1438 {
1439 	uint16_t res_base_idx, desc_idx;
1440 	uint64_t buff_addr, phys_addr;
1441 	struct vhost_virtqueue *vq;
1442 	struct vring_desc *desc;
1443 	struct rte_mbuf *mbuf = NULL;
1444 	struct vpool *vpool;
1445 	hpa_type addr_type;
1446 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1447 
1448 	vpool = &vpool_array[vdev->vmdq_rx_q];
1449 	vq = dev->virtqueue[VIRTIO_RXQ];
1450 
1451 	do {
1452 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1453 				1) != 1))
1454 			return;
1455 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1456 
1457 		desc = &vq->desc[desc_idx];
1458 		if (desc->flags & VRING_DESC_F_NEXT) {
1459 			desc = &vq->desc[desc->next];
1460 			buff_addr = gpa_to_vva(dev, desc->addr);
1461 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1462 					&addr_type);
1463 		} else {
1464 			buff_addr = gpa_to_vva(dev,
1465 					desc->addr + vq->vhost_hlen);
1466 			phys_addr = gpa_to_hpa(vdev,
1467 					desc->addr + vq->vhost_hlen,
1468 					desc->len, &addr_type);
1469 		}
1470 
1471 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1472 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1473 				" address found when attaching RX frame buffer"
1474 				" address!\n", dev->device_fh);
1475 			put_desc_to_used_list_zcp(vq, desc_idx);
1476 			continue;
1477 		}
1478 
1479 		/*
1480 		 * Check if the frame buffer address from guest crosses
1481 		 * sub-region or not.
1482 		 */
1483 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1484 			RTE_LOG(ERR, VHOST_DATA,
1485 				"(%"PRIu64") Frame buffer address crossing a "
1486 				"sub-region found when attaching RX frame "
1487 				"buffer address!\n",
1488 				dev->device_fh);
1489 			put_desc_to_used_list_zcp(vq, desc_idx);
1490 			continue;
1491 		}
1492 	} while (unlikely(phys_addr == 0));
1493 
1494 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1495 	if (unlikely(mbuf == NULL)) {
1496 		LOG_DEBUG(VHOST_DATA,
1497 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1498 			"ring_sc_dequeue fail.\n",
1499 			dev->device_fh);
1500 		put_desc_to_used_list_zcp(vq, desc_idx);
1501 		return;
1502 	}
1503 
1504 	if (unlikely(vpool->buf_size > desc->len)) {
1505 		LOG_DEBUG(VHOST_DATA,
1506 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1507 			"length(%d) of descriptor idx: %d less than room "
1508 			"size required: %d\n",
1509 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1510 		put_desc_to_used_list_zcp(vq, desc_idx);
1511 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1512 		return;
1513 	}
1514 
1515 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1516 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1517 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1518 	mbuf->data_len = desc->len;
1519 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1520 
1521 	LOG_DEBUG(VHOST_DATA,
1522 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1523 		"descriptor idx:%d\n",
1524 		dev->device_fh, res_base_idx, desc_idx);
1525 
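	/*
	 * Hand the now guest-backed mbuf to vpool->pool via a raw free so that
	 * the PMD RX queue (set up with this pool) allocates it and DMAs the
	 * incoming frame straight into the guest buffer it points at.
	 */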
1526 	__rte_mbuf_raw_free(mbuf);
1527 
1528 	return;
1529 }
1530 
1531 /*
1532  * Detach an attached packet mbuf -
1533  *  - restore original mbuf address and length values.
1534  *  - reset pktmbuf data and data_len to their default values.
1535  *  All other fields of the given packet mbuf will be left intact.
1536  *
1537  * @param m
1538  *   The attached packet mbuf.
1539  */
1540 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1541 {
1542 	const struct rte_mempool *mp = m->pool;
1543 	void *buf = RTE_MBUF_TO_BADDR(m);
1544 	uint32_t buf_ofs;
1545 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1546 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1547 
1548 	m->buf_addr = buf;
1549 	m->buf_len = (uint16_t)buf_len;
1550 
1551 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1552 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1553 	m->data_off = buf_ofs;
1554 
1555 	m->data_len = 0;
1556 }
1557 
1558 /*
1559  * This function is called after packets have been transmitted. It fetches mbufs
1560  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1561  * the used index and kicks the guest if necessary.
1562  */
1563 static inline uint32_t __attribute__((always_inline))
1564 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1565 {
1566 	struct rte_mbuf *mbuf;
1567 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1568 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1569 	uint32_t index = 0;
1570 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1571 
1572 	LOG_DEBUG(VHOST_DATA,
1573 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1574 		"clean is: %d\n",
1575 		dev->device_fh, mbuf_count);
1576 	LOG_DEBUG(VHOST_DATA,
1577 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1578 		"clean  is : %d\n",
1579 		dev->device_fh, rte_ring_count(vpool->ring));
1580 
1581 	for (index = 0; index < mbuf_count; index++) {
1582 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1584 			pktmbuf_detach_zcp(mbuf);
1585 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1586 
1587 		/* Update used index buffer information. */
1588 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1589 		vq->used->ring[used_idx].len = 0;
1590 
1591 		used_idx = (used_idx + 1) & (vq->size - 1);
1592 	}
1593 
1594 	LOG_DEBUG(VHOST_DATA,
1595 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1596 		"clean is: %d\n",
1597 		dev->device_fh, rte_mempool_count(vpool->pool));
1598 	LOG_DEBUG(VHOST_DATA,
1599 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1600 		"clean  is : %d\n",
1601 		dev->device_fh, rte_ring_count(vpool->ring));
1602 	LOG_DEBUG(VHOST_DATA,
1603 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1604 		"vq->last_used_idx:%d\n",
1605 		dev->device_fh, vq->last_used_idx);
1606 
1607 	vq->last_used_idx += mbuf_count;
1608 
1609 	LOG_DEBUG(VHOST_DATA,
1610 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1611 		"vq->last_used_idx:%d\n",
1612 		dev->device_fh, vq->last_used_idx);
1613 
1614 	rte_compiler_barrier();
1615 
1616 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1617 
1618 	/* Kick guest if required. */
1619 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1620 		eventfd_write((int)vq->kickfd, 1);
1621 
1622 	return 0;
1623 }
1624 
1625 /*
1626  * This function is called when a virtio device is destroyed.
1627  * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1628  */
1629 static void mbuf_destroy_zcp(struct vpool *vpool)
1630 {
1631 	struct rte_mbuf *mbuf = NULL;
1632 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1633 
1634 	LOG_DEBUG(VHOST_CONFIG,
1635 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1636 		"mbuf_destroy_zcp is: %d\n",
1637 		mbuf_count);
1638 	LOG_DEBUG(VHOST_CONFIG,
1639 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1640 		"mbuf_destroy_zcp  is : %d\n",
1641 		rte_ring_count(vpool->ring));
1642 
1643 	for (index = 0; index < mbuf_count; index++) {
1644 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1645 		if (likely(mbuf != NULL)) {
1646 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1647 				pktmbuf_detach_zcp(mbuf);
1648 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1649 		}
1650 	}
1651 
1652 	LOG_DEBUG(VHOST_CONFIG,
1653 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1654 		"mbuf_destroy_zcp is: %d\n",
1655 		rte_mempool_count(vpool->pool));
1656 	LOG_DEBUG(VHOST_CONFIG,
1657 		"in mbuf_destroy_zcp: mbuf count in ring after "
1658 		"mbuf_destroy_zcp is : %d\n",
1659 		rte_ring_count(vpool->ring));
1660 }
1661 
1662 /*
1663  * Zero copy RX: update the used ring and write a virtio header for each received mbuf.
1664  */
1665 static inline uint32_t __attribute__((always_inline))
1666 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1667 	uint32_t count)
1668 {
1669 	struct vhost_virtqueue *vq;
1670 	struct vring_desc *desc;
1671 	struct rte_mbuf *buff;
1672 	/* The virtio_hdr is initialised to 0. */
1673 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1674 		= {{0, 0, 0, 0, 0, 0}, 0};
1675 	uint64_t buff_hdr_addr = 0;
1676 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1677 	uint32_t head_idx, packet_success = 0;
1678 	uint16_t res_cur_idx;
1679 
1680 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1681 
1682 	if (count == 0)
1683 		return 0;
1684 
1685 	vq = dev->virtqueue[VIRTIO_RXQ];
1686 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1687 
1688 	res_cur_idx = vq->last_used_idx;
1689 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1690 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1691 
1692 	/* Retrieve all of the head indexes first to avoid caching issues. */
1693 	for (head_idx = 0; head_idx < count; head_idx++)
1694 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1695 
1696 	/* Prefetch descriptor index. */
1697 	rte_prefetch0(&vq->desc[head[packet_success]]);
1698 
1699 	while (packet_success != count) {
1700 		/* Get descriptor from available ring */
1701 		desc = &vq->desc[head[packet_success]];
1702 
1703 		buff = pkts[packet_success];
1704 		LOG_DEBUG(VHOST_DATA,
1705 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1706 			"pkt[%d] descriptor idx: %d\n",
1707 			dev->device_fh, packet_success,
1708 			MBUF_HEADROOM_UINT32(buff));
1709 
1710 		PRINT_PACKET(dev,
1711 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1712 			+ RTE_PKTMBUF_HEADROOM),
1713 			rte_pktmbuf_data_len(buff), 0);
1714 
1715 		/* Buffer address translation for virtio header. */
1716 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1717 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1718 
1719 		/*
1720 		 * If the descriptors are chained the header and data are
1721 		 * placed in separate buffers.
1722 		 */
1723 		if (desc->flags & VRING_DESC_F_NEXT) {
1724 			desc->len = vq->vhost_hlen;
1725 			desc = &vq->desc[desc->next];
1726 			desc->len = rte_pktmbuf_data_len(buff);
1727 		} else {
1728 			desc->len = packet_len;
1729 		}
1730 
1731 		/* Update used ring with desc information */
1732 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1733 			= head[packet_success];
1734 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1735 			= packet_len;
1736 		res_cur_idx++;
1737 		packet_success++;
1738 
1739 		/* A header is required per buffer. */
1740 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1741 			(const void *)&virtio_hdr, vq->vhost_hlen);
1742 
1743 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1744 
1745 		if (likely(packet_success < count)) {
1746 			/* Prefetch descriptor index. */
1747 			rte_prefetch0(&vq->desc[head[packet_success]]);
1748 		}
1749 	}
1750 
1751 	rte_compiler_barrier();
1752 
1753 	LOG_DEBUG(VHOST_DATA,
1754 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1755 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1756 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1757 
1758 	*(volatile uint16_t *)&vq->used->idx += count;
1759 	vq->last_used_idx += count;
1760 
1761 	LOG_DEBUG(VHOST_DATA,
1762 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1763 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1764 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1765 
1766 	/* Kick the guest if necessary. */
1767 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1768 		eventfd_write((int)vq->kickfd, 1);
1769 
1770 	return count;
1771 }
1772 
1773 /*
1774  * This function routes the TX packet to the correct interface.
1775  * This may be a local device or the physical port.
1776  */
1777 static inline void __attribute__((always_inline))
1778 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1779 	uint32_t desc_idx, uint8_t need_copy)
1780 {
1781 	struct mbuf_table *tx_q;
1782 	struct rte_mbuf **m_table;
1783 	struct rte_mbuf *mbuf = NULL;
1784 	unsigned len, ret, offset = 0;
1785 	struct vpool *vpool;
1786 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1787 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1788 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1789 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1790 
1791 	/* Add packet to the port tx queue. */
1792 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1793 	len = tx_q->len;
1794 
1795 	/* Allocate an mbuf and populate the structure. */
1796 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
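	/* TX vpools are stored after the RX vpools, at offset MAX_QUEUES in vpool_array. */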
1797 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1798 	if (unlikely(mbuf == NULL)) {
1799 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1800 		RTE_LOG(ERR, VHOST_DATA,
1801 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1802 			dev->device_fh);
1803 		put_desc_to_used_list_zcp(vq, desc_idx);
1804 		return;
1805 	}
1806 
1807 	if (vm2vm_mode == VM2VM_HARDWARE) {
1808 		/* Avoid using a VLAN tag from any VM for an external packet,
1809 		 * e.g. vlan_tags[dev->device_fh]; otherwise it conflicts with
1810 		 * pool selection: the MAC address marks the packet as external
1811 		 * (destined for the network) while the VLAN tag marks it as a
1812 		 * VM-to-VM packet to be forwarded to another VM. The hardware
1813 		 * cannot resolve this ambiguity, so the packet is lost.
1814 		 */
1815 		vlan_tag = external_pkt_default_vlan_tag;
1816 		while (dev_ll != NULL) {
1817 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1818 				ether_addr_cmp(&(pkt_hdr->d_addr),
1819 				&dev_ll->vdev->mac_address)) {
1820 
1821 				/*
1822 				 * Drop the packet if the TX packet is destined
1823 				 * for the TX device.
1824 				 */
1825 				if (unlikely(dev_ll->vdev->dev->device_fh
1826 					== dev->device_fh)) {
1827 					LOG_DEBUG(VHOST_DATA,
1828 					"(%"PRIu64") TX: Source and destination "
1829 					"MAC addresses are the same. Dropping "
1830 					"packet.\n",
1831 					dev_ll->vdev->dev->device_fh);
1832 					MBUF_HEADROOM_UINT32(mbuf)
1833 						= (uint32_t)desc_idx;
1834 					__rte_mbuf_raw_free(mbuf);
1835 					return;
1836 				}
1837 
1838 				/*
1839 				 * Packet length offset 4 bytes for HW vlan
1840 				 * strip when L2 switch back.
1841 				 */
1842 				offset = 4;
1843 				vlan_tag =
1844 				(uint16_t)
1845 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1846 
1847 				LOG_DEBUG(VHOST_DATA,
1848 				"(%"PRIu64") TX: pkt to local VM device id:"
1849 				"(%"PRIu64") vlan tag: %d.\n",
1850 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1851 				vlan_tag);
1852 
1853 				break;
1854 			}
1855 			dev_ll = dev_ll->next;
1856 		}
1857 	}
1858 
1859 	mbuf->nb_segs = m->nb_segs;
1860 	mbuf->next = m->next;
1861 	mbuf->data_len = m->data_len + offset;
1862 	mbuf->pkt_len = mbuf->data_len;
1863 	if (unlikely(need_copy)) {
1864 		/* Copy the packet contents to the mbuf. */
1865 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1866 			rte_pktmbuf_mtod(m, void *),
1867 			m->data_len);
1868 	} else {
1869 		mbuf->data_off = m->data_off;
1870 		mbuf->buf_physaddr = m->buf_physaddr;
1871 		mbuf->buf_addr = m->buf_addr;
1872 	}
1873 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1874 	mbuf->vlan_tci = vlan_tag;
1875 	mbuf->l2_len = sizeof(struct ether_hdr);
1876 	mbuf->l3_len = sizeof(struct ipv4_hdr);
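	/*
	 * Stash the guest descriptor index in the mbuf headroom so that
	 * txmbuf_clean_zcp() can return this descriptor to the used ring once
	 * the packet has been transmitted.
	 */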
1877 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1878 
1879 	tx_q->m_table[len] = mbuf;
1880 	len++;
1881 
1882 	LOG_DEBUG(VHOST_DATA,
1883 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1884 		dev->device_fh,
1885 		mbuf->nb_segs,
1886 		(mbuf->next == NULL) ? "null" : "non-null");
1887 
1888 	if (enable_stats) {
1889 		dev_statistics[dev->device_fh].tx_total++;
1890 		dev_statistics[dev->device_fh].tx++;
1891 	}
1892 
1893 	if (unlikely(len == MAX_PKT_BURST)) {
1894 		m_table = (struct rte_mbuf **)tx_q->m_table;
1895 		ret = rte_eth_tx_burst(ports[0],
1896 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1897 
1898 		/*
1899 		 * Free any buffers not handled by TX and update
1900 		 * the port stats.
1901 		 */
1902 		if (unlikely(ret < len)) {
1903 			do {
1904 				rte_pktmbuf_free(m_table[ret]);
1905 			} while (++ret < len);
1906 		}
1907 
1908 		len = 0;
1909 		txmbuf_clean_zcp(dev, vpool);
1910 	}
1911 
1912 	tx_q->len = len;
1913 
1914 	return;
1915 }
1916 
1917 /*
1918  * This function transmits all available packets in the virtio TX queue for
1919  * one virtio-net device. On the first packet it learns the MAC address and
1920  * sets up VMDQ.
1921  */
1922 static inline void __attribute__((always_inline))
1923 virtio_dev_tx_zcp(struct virtio_net *dev)
1924 {
1925 	struct rte_mbuf m;
1926 	struct vhost_virtqueue *vq;
1927 	struct vring_desc *desc;
1928 	uint64_t buff_addr = 0, phys_addr;
1929 	uint32_t head[MAX_PKT_BURST];
1930 	uint32_t i;
1931 	uint16_t free_entries, packet_success = 0;
1932 	uint16_t avail_idx;
1933 	uint8_t need_copy = 0;
1934 	hpa_type addr_type;
1935 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1936 
1937 	vq = dev->virtqueue[VIRTIO_TXQ];
1938 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1939 
1940 	/* If there are no available buffers then return. */
1941 	if (vq->last_used_idx_res == avail_idx)
1942 		return;
1943 
1944 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1945 
1946 	/* Prefetch available ring to retrieve head indexes. */
1947 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1948 
1949 	/* Get the number of free entries in the ring */
1950 	free_entries = (avail_idx - vq->last_used_idx_res);
1951 
1952 	/* Limit to MAX_PKT_BURST. */
1953 	free_entries
1954 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1955 
1956 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1957 		dev->device_fh, free_entries);
1958 
1959 	/* Retrieve all of the head indexes first to avoid caching issues. */
1960 	for (i = 0; i < free_entries; i++)
1961 		head[i]
1962 			= vq->avail->ring[(vq->last_used_idx_res + i)
1963 			& (vq->size - 1)];
1964 
1965 	vq->last_used_idx_res += free_entries;
1966 
1967 	/* Prefetch descriptor index. */
1968 	rte_prefetch0(&vq->desc[head[packet_success]]);
1969 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1970 
1971 	while (packet_success < free_entries) {
1972 		desc = &vq->desc[head[packet_success]];
1973 
1974 		/* Discard first buffer as it is the virtio header */
1975 		desc = &vq->desc[desc->next];
1976 
1977 		/* Buffer address translation. */
1978 		buff_addr = gpa_to_vva(dev, desc->addr);
1979 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1980 
1981 		if (likely(packet_success < (free_entries - 1)))
1982 			/* Prefetch descriptor index. */
1983 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1984 
1985 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1986 			RTE_LOG(ERR, VHOST_DATA,
1987 				"(%"PRIu64") Invalid frame buffer address found "
1988 				"when transmitting packets!\n",
1989 				dev->device_fh);
1990 			packet_success++;
1991 			continue;
1992 		}
1993 
1994 		/* Prefetch buffer address. */
1995 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1996 
1997 		/*
1998 		 * Setup dummy mbuf. This is copied to a real mbuf if
1999 		 * transmitted out the physical port.
2000 		 */
2001 		m.data_len = desc->len;
2002 		m.nb_segs = 1;
2003 		m.next = NULL;
2004 		m.data_off = 0;
2005 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2006 		m.buf_physaddr = phys_addr;
2007 
2008 		/*
2009 		 * Check if the frame buffer address from guest crosses
2010 		 * sub-region or not.
2011 		 */
2012 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2013 			RTE_LOG(ERR, VHOST_DATA,
2014 				"(%"PRIu64") Frame buffer address crosses a "
2015 				"sub-region boundary when attaching TX frame "
2016 				"buffer address!\n",
2017 				dev->device_fh);
2018 			need_copy = 1;
2019 		} else
2020 			need_copy = 0;
2021 
2022 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2023 
2024 		/*
2025 		 * If this is the first received packet we need to learn
2026 		 * the MAC and setup VMDQ
2027 		 */
2028 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2029 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2030 				/*
2031 				 * Discard frame if device is scheduled for
2032 				 * removal or a duplicate MAC address is found.
2033 				 */
2034 				packet_success += free_entries;
2035 				vq->last_used_idx += packet_success;
2036 				break;
2037 			}
2038 		}
2039 
2040 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2041 		packet_success++;
2042 	}
2043 }
2044 
2045 /*
2046  * This function is called by each data core. It handles all RX/TX registered
2047  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2048  * addresses are compared with all devices in the main linked list.
2049  */
2050 static int
2051 switch_worker_zcp(__attribute__((unused)) void *arg)
2052 {
2053 	struct virtio_net *dev = NULL;
2054 	struct vhost_dev  *vdev = NULL;
2055 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2056 	struct virtio_net_data_ll *dev_ll;
2057 	struct mbuf_table *tx_q;
2058 	volatile struct lcore_ll_info *lcore_ll;
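	/* Convert the TX drain period (BURST_TX_DRAIN_US) from microseconds to TSC cycles. */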
2059 	const uint64_t drain_tsc
2060 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2061 		* BURST_TX_DRAIN_US;
2062 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2063 	unsigned ret;
2064 	const uint16_t lcore_id = rte_lcore_id();
2065 	uint16_t count_in_ring, rx_count = 0;
2066 
2067 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2068 
2069 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2070 	prev_tsc = 0;
2071 
2072 	while (1) {
2073 		cur_tsc = rte_rdtsc();
2074 
2075 		/* TX burst queue drain */
2076 		diff_tsc = cur_tsc - prev_tsc;
2077 		if (unlikely(diff_tsc > drain_tsc)) {
2078 			/*
2079 			 * Get mbufs from vpool.pool, detach them and
2080 			 * put them back into vpool.ring.
2081 			 */
2082 			dev_ll = lcore_ll->ll_root_used;
2083 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2084 				/* Get virtio device ID */
2085 				vdev = dev_ll->vdev;
2086 				dev = vdev->dev;
2087 
2088 				if (likely(!vdev->remove)) {
2089 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2090 					if (tx_q->len) {
2091 						LOG_DEBUG(VHOST_DATA,
2092 						"TX queue drained after timeout"
2093 						" with burst size %u\n",
2094 						tx_q->len);
2095 
2096 						/*
2097 						 * Tx any packets in the queue
2098 						 */
2099 						ret = rte_eth_tx_burst(
2100 							ports[0],
2101 							(uint16_t)tx_q->txq_id,
2102 							(struct rte_mbuf **)
2103 							tx_q->m_table,
2104 							(uint16_t)tx_q->len);
2105 						if (unlikely(ret < tx_q->len)) {
2106 							do {
2107 								rte_pktmbuf_free(
2108 									tx_q->m_table[ret]);
2109 							} while (++ret < tx_q->len);
2110 						}
2111 						tx_q->len = 0;
2112 
2113 						txmbuf_clean_zcp(dev,
2114 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2115 					}
2116 				}
2117 				dev_ll = dev_ll->next;
2118 			}
2119 			prev_tsc = cur_tsc;
2120 		}
2121 
2122 		rte_prefetch0(lcore_ll->ll_root_used);
2123 
2124 		/*
2125 		 * Inform the configuration core that we have exited the linked
2126 		 * list and that no devices are in use if requested.
2127 		 */
2128 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2129 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2130 
2131 		/* Process devices */
2132 		dev_ll = lcore_ll->ll_root_used;
2133 
2134 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2135 			vdev = dev_ll->vdev;
2136 			dev  = vdev->dev;
2137 			if (unlikely(vdev->remove)) {
2138 				dev_ll = dev_ll->next;
2139 				unlink_vmdq(vdev);
2140 				vdev->ready = DEVICE_SAFE_REMOVE;
2141 				continue;
2142 			}
2143 
2144 			if (likely(vdev->ready == DEVICE_RX)) {
2145 				uint32_t index = vdev->vmdq_rx_q;
2146 				uint16_t i;
2147 				count_in_ring
2148 				= rte_ring_count(vpool_array[index].ring);
2149 				uint16_t free_entries
2150 				= (uint16_t)get_available_ring_num_zcp(dev);
2151 
2152 				/*
2153 				 * Attach all mbufs in vpool.ring and put back
2154 				 * into vpool.pool.
2155 				 */
2156 				for (i = 0;
2157 				i < RTE_MIN(free_entries,
2158 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2159 				i++)
2160 					attach_rxmbuf_zcp(dev);
2161 
2162 				/* Handle guest RX */
2163 				rx_count = rte_eth_rx_burst(ports[0],
2164 					vdev->vmdq_rx_q, pkts_burst,
2165 					MAX_PKT_BURST);
2166 
2167 				if (rx_count) {
2168 					ret_count = virtio_dev_rx_zcp(dev,
2169 							pkts_burst, rx_count);
2170 					if (enable_stats) {
2171 						dev_statistics[dev->device_fh].rx_total
2172 							+= rx_count;
2173 						dev_statistics[dev->device_fh].rx
2174 							+= ret_count;
2175 					}
2176 					while (likely(rx_count)) {
2177 						rx_count--;
2178 						pktmbuf_detach_zcp(
2179 							pkts_burst[rx_count]);
2180 						rte_ring_sp_enqueue(
2181 							vpool_array[index].ring,
2182 							(void *)pkts_burst[rx_count]);
2183 					}
2184 				}
2185 			}
2186 
2187 			if (likely(!vdev->remove))
2188 				/* Handle guest TX */
2189 				virtio_dev_tx_zcp(dev);
2190 
2191 			/* Move to the next device in the list */
2192 			dev_ll = dev_ll->next;
2193 		}
2194 	}
2195 
2196 	return 0;
2197 }
2198 
2199 
2200 /*
2201  * Add an entry to a used linked list. A free entry must first be found
2202  * in the free linked list using get_data_ll_free_entry().
2203  */
2204 static void
2205 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2206 	struct virtio_net_data_ll *ll_dev)
2207 {
2208 	struct virtio_net_data_ll *ll = *ll_root_addr;
2209 
2210 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2211 	ll_dev->next = NULL;
2212 	rte_compiler_barrier();
2213 
2214 	/* If ll == NULL then this is the first device. */
2215 	if (ll) {
2216 		/* Increment to the tail of the linked list. */
2217 		while ((ll->next != NULL) )
2218 			ll = ll->next;
2219 
2220 		ll->next = ll_dev;
2221 	} else {
2222 		*ll_root_addr = ll_dev;
2223 	}
2224 }
2225 
2226 /*
2227  * Remove an entry from a used linked list. The entry must then be added to
2228  * the free linked list using put_data_ll_free_entry().
2229  */
2230 static void
2231 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2232 	struct virtio_net_data_ll *ll_dev,
2233 	struct virtio_net_data_ll *ll_dev_last)
2234 {
2235 	struct virtio_net_data_ll *ll = *ll_root_addr;
2236 
2237 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2238 		return;
2239 
2240 	if (ll_dev == ll)
2241 		*ll_root_addr = ll_dev->next;
2242 	else
2243 		if (likely(ll_dev_last != NULL))
2244 			ll_dev_last->next = ll_dev->next;
2245 		else
2246 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2247 }
2248 
2249 /*
2250  * Find and return an entry from the free linked list.
2251  */
2252 static struct virtio_net_data_ll *
2253 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2254 {
2255 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2256 	struct virtio_net_data_ll *ll_dev;
2257 
2258 	if (ll_free == NULL)
2259 		return NULL;
2260 
2261 	ll_dev = ll_free;
2262 	*ll_root_addr = ll_free->next;
2263 
2264 	return ll_dev;
2265 }
2266 
2267 /*
2268  * Place an entry back on to the free linked list.
2269  */
2270 static void
2271 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2272 	struct virtio_net_data_ll *ll_dev)
2273 {
2274 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2275 
2276 	if (ll_dev == NULL)
2277 		return;
2278 
2279 	ll_dev->next = ll_free;
2280 	*ll_root_addr = ll_dev;
2281 }
2282 
2283 /*
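/*
 * Illustrative sketch (not part of the datapath): the four helpers above are
 * used in pairs. A hypothetical caller moving a device entry from a free list
 * to a used list would look roughly like this ("my_vdev" is a placeholder):
 *
 *	struct virtio_net_data_ll *entry;
 *
 *	entry = get_data_ll_free_entry(&ll_root_free);
 *	if (entry != NULL) {
 *		entry->vdev = my_vdev;
 *		add_data_ll_entry(&ll_root_used, entry);
 *	}
 *
 * The reverse path is rm_data_ll_entry() on the used list followed by
 * put_data_ll_free_entry() on the free list, as done in destroy_device().
 */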
2284  * Creates a linked list of a given size.
2285  */
2286 static struct virtio_net_data_ll *
2287 alloc_data_ll(uint32_t size)
2288 {
2289 	struct virtio_net_data_ll *ll_new;
2290 	uint32_t i;
2291 
2292 	/* Malloc and then chain the linked list. */
2293 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2294 	if (ll_new == NULL) {
2295 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2296 		return NULL;
2297 	}
2298 
2299 	for (i = 0; i < size - 1; i++) {
2300 		ll_new[i].vdev = NULL;
2301 		ll_new[i].next = &ll_new[i+1];
2302 	}
2303 	ll_new[i].next = NULL;
2304 
2305 	return (ll_new);
2306 }
2307 
2308 /*
2309  * Create the main linked list along with each individual core's linked list. A used and a free list
2310  * are created to manage entries.
2311  */
2312 static int
2313 init_data_ll (void)
2314 {
2315 	int lcore;
2316 
2317 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2318 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2319 		if (lcore_info[lcore].lcore_ll == NULL) {
2320 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2321 			return -1;
2322 		}
2323 
2324 		lcore_info[lcore].lcore_ll->device_num = 0;
2325 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2326 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
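		/*
		 * Size each core's free list so that all devices can be spread
		 * across the switching cores, rounding up when the division is
		 * not exact.
		 */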
2327 		if (num_devices % num_switching_cores)
2328 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2329 		else
2330 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2331 	}
2332 
2333 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2334 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2335 
2336 	return 0;
2337 }
2338 
2339 /*
2340  * Set virtqueue flags so that we do not receive interrupts.
2341  */
2342 static void
2343 set_irq_status (struct virtio_net *dev)
2344 {
2345 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2346 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2347 }
2348 
2349 /*
2350  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2351  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2352  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2353  */
2354 static void
2355 destroy_device (volatile struct virtio_net *dev)
2356 {
2357 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2358 	struct virtio_net_data_ll *ll_main_dev_cur;
2359 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2360 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2361 	struct vhost_dev *vdev;
2362 	int lcore;
2363 
2364 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2365 
2366 	vdev = (struct vhost_dev *)dev->priv;
2367 	/* Set the remove flag. */
2368 	vdev->remove = 1;
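	/* Wait for the data core to see the flag and mark the device safe to remove. */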
2369 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2370 		rte_pause();
2371 	}
2372 
2373 	/* Search for entry to be removed from lcore ll */
2374 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2375 	while (ll_lcore_dev_cur != NULL) {
2376 		if (ll_lcore_dev_cur->vdev == vdev) {
2377 			break;
2378 		} else {
2379 			ll_lcore_dev_last = ll_lcore_dev_cur;
2380 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2381 		}
2382 	}
2383 
2384 	if (ll_lcore_dev_cur == NULL) {
2385 		RTE_LOG(ERR, VHOST_CONFIG,
2386 			"(%"PRIu64") Failed to find the device to be destroyed.\n",
2387 			dev->device_fh);
2388 		return;
2389 	}
2390 
2391 	/* Search for entry to be removed from main ll */
2392 	ll_main_dev_cur = ll_root_used;
2393 	ll_main_dev_last = NULL;
2394 	while (ll_main_dev_cur != NULL) {
2395 		if (ll_main_dev_cur->vdev == vdev) {
2396 			break;
2397 		} else {
2398 			ll_main_dev_last = ll_main_dev_cur;
2399 			ll_main_dev_cur = ll_main_dev_cur->next;
2400 		}
2401 	}
2402 
2403 	/* Remove entries from the lcore and main ll. */
2404 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2405 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2406 
2407 	/* Set the dev_removal_flag on each lcore. */
2408 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2409 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2410 	}
2411 
2412 	/*
2413 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2414 	 * they can no longer access the device removed from the linked lists and that the devices
2415 	 * are no longer in use.
2416 	 */
2417 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2418 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2419 			rte_pause();
2420 		}
2421 	}
2422 
2423 	/* Add the entries back to the lcore and main free ll.*/
2424 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2425 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2426 
2427 	/* Decrement number of device on the lcore. */
2428 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2429 
2430 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2431 
2432 	if (zero_copy) {
2433 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2434 
2435 		/* Stop the RX queue. */
2436 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2437 			LOG_DEBUG(VHOST_CONFIG,
2438 				"(%"PRIu64") In destroy_device: Failed to stop "
2439 				"rx queue:%d\n",
2440 				dev->device_fh,
2441 				vdev->vmdq_rx_q);
2442 		}
2443 
2444 		LOG_DEBUG(VHOST_CONFIG,
2445 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2446 			"mempool back to ring for RX queue: %d\n",
2447 			dev->device_fh, vdev->vmdq_rx_q);
2448 
2449 		mbuf_destroy_zcp(vpool);
2450 
2451 		/* Stop the TX queue. */
2452 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2453 			LOG_DEBUG(VHOST_CONFIG,
2454 				"(%"PRIu64") In destroy_device: Failed to "
2455 				"stop tx queue:%d\n",
2456 				dev->device_fh, vdev->vmdq_rx_q);
2457 		}
2458 
2459 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2460 
2461 		LOG_DEBUG(VHOST_CONFIG,
2462 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2463 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2464 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2465 			dev->device_fh);
2466 
2467 		mbuf_destroy_zcp(vpool);
2468 	}
2469 	rte_free(vdev);
2470 
2471 }
2472 
2473 /*
2474  * A new device is added to a data core. First the device is added to the main linked list
2475  * and then allocated to a specific data core.
2476  */
2477 static int
2478 new_device (struct virtio_net *dev)
2479 {
2480 	struct virtio_net_data_ll *ll_dev;
2481 	int lcore, core_add = 0;
2482 	uint32_t device_num_min = num_devices;
2483 	struct vhost_dev *vdev;
2484 
2485 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2486 	if (vdev == NULL) {
2487 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2488 			dev->device_fh);
2489 		return -1;
2490 	}
2491 	vdev->dev = dev;
2492 	dev->priv = vdev;
2493 
2494 	/* Add device to main ll */
2495 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2496 	if (ll_dev == NULL) {
2497 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2498 			"of %d devices per core has been reached\n",
2499 			dev->device_fh, num_devices);
2500 		rte_free(vdev);
2501 		return -1;
2502 	}
2503 	ll_dev->vdev = vdev;
2504 	add_data_ll_entry(&ll_root_used, ll_dev);
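	/*
	 * Assign this device a dedicated VMDq RX queue, spreading the devices
	 * evenly over the available queues.
	 */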
2505 	vdev->vmdq_rx_q
2506 		= dev->device_fh * (num_queues / num_devices);
2507 
2508 	if (zero_copy) {
2509 		uint32_t index = vdev->vmdq_rx_q;
2510 		uint32_t count_in_ring, i;
2511 		struct mbuf_table *tx_q;
2512 
2513 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2514 
2515 		LOG_DEBUG(VHOST_CONFIG,
2516 			"(%"PRIu64") in new_device: mbuf count in mempool "
2517 			"before attach is: %d\n",
2518 			dev->device_fh,
2519 			rte_mempool_count(vpool_array[index].pool));
2520 		LOG_DEBUG(VHOST_CONFIG,
2521 			"(%"PRIu64") in new_device: mbuf count in ring "
2522 			"before attach is: %d\n",
2523 			dev->device_fh, count_in_ring);
2524 
2525 		/*
2526 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2527 		 */
2528 		for (i = 0; i < count_in_ring; i++)
2529 			attach_rxmbuf_zcp(dev);
2530 
2531 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2532 			"mempool after attach is: %d\n",
2533 			dev->device_fh,
2534 			rte_mempool_count(vpool_array[index].pool));
2535 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2536 			"ring after attach is: %d\n",
2537 			dev->device_fh,
2538 			rte_ring_count(vpool_array[index].ring));
2539 
2540 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2541 		tx_q->txq_id = vdev->vmdq_rx_q;
2542 
2543 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2544 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2545 
2546 			LOG_DEBUG(VHOST_CONFIG,
2547 				"(%"PRIu64") In new_device: Failed to start "
2548 				"tx queue:%d\n",
2549 				dev->device_fh, vdev->vmdq_rx_q);
2550 
2551 			mbuf_destroy_zcp(vpool);
2552 			rte_free(vdev);
2553 			return -1;
2554 		}
2555 
2556 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2557 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2558 
2559 			LOG_DEBUG(VHOST_CONFIG,
2560 				"(%"PRIu64") In new_device: Failed to start "
2561 				"rx queue:%d\n",
2562 				dev->device_fh, vdev->vmdq_rx_q);
2563 
2564 			/* Stop the TX queue. */
2565 			if (rte_eth_dev_tx_queue_stop(ports[0],
2566 				vdev->vmdq_rx_q) != 0) {
2567 				LOG_DEBUG(VHOST_CONFIG,
2568 					"(%"PRIu64") In new_device: Failed to "
2569 					"stop tx queue:%d\n",
2570 					dev->device_fh, vdev->vmdq_rx_q);
2571 			}
2572 
2573 			mbuf_destroy_zcp(vpool);
2574 			rte_free(vdev);
2575 			return -1;
2576 		}
2577 
2578 	}
2579 
2580 	/* Reset the ready flag. */
2581 	vdev->ready = DEVICE_MAC_LEARNING;
2582 	vdev->remove = 0;
2583 
2584 	/* Find a suitable lcore to add the device. */
2585 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2586 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2587 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2588 			core_add = lcore;
2589 		}
2590 	}
2591 	/* Add device to lcore ll */
2592 	ll_dev->dev->coreid = core_add;
2593 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2594 	if (ll_dev == NULL) {
2595 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2596 		vdev->ready = DEVICE_SAFE_REMOVE;
2597 		destroy_device(dev);
2598 		rte_free(vdev);
2599 		return -1;
2600 	}
2601 	ll_dev->vdev = vdev;
2602 	vdev->coreid = core_add;
2603 
2604 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2605 
2606 	/* Initialize device stats */
2607 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2608 
2609 	/* Disable notifications. */
2610 	set_irq_status(dev);
2611 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2612 	dev->flags |= VIRTIO_DEV_RUNNING;
2613 
2614 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2615 
2616 	return 0;
2617 }
2618 
2619 /*
2620  * These callbacks allow devices to be added to the data core when
2621  * configuration is fully complete.
2622  */
2623 static const struct virtio_net_device_ops virtio_net_device_ops =
2624 {
2625 	.new_device =  new_device,
2626 	.destroy_device = destroy_device,
2627 };
2628 
2629 /*
2630  * This thread wakes up after a period to print stats if the user has
2631  * enabled them.
2632  */
2633 static void
2634 print_stats(void)
2635 {
2636 	struct virtio_net_data_ll *dev_ll;
2637 	uint64_t tx_dropped, rx_dropped;
2638 	uint64_t tx, tx_total, rx, rx_total;
2639 	uint32_t device_fh;
2640 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2641 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2642 
2643 	while(1) {
2644 		sleep(enable_stats);
2645 
2646 		/* Clear screen and move to top left */
2647 		printf("%s%s", clr, top_left);
2648 
2649 		printf("\nDevice statistics ====================================");
2650 
2651 		dev_ll = ll_root_used;
2652 		while (dev_ll != NULL) {
2653 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2654 			tx_total = dev_statistics[device_fh].tx_total;
2655 			tx = dev_statistics[device_fh].tx;
2656 			tx_dropped = tx_total - tx;
2657 			if (zero_copy == 0) {
2658 				rx_total = rte_atomic64_read(
2659 					&dev_statistics[device_fh].rx_total_atomic);
2660 				rx = rte_atomic64_read(
2661 					&dev_statistics[device_fh].rx_atomic);
2662 			} else {
2663 				rx_total = dev_statistics[device_fh].rx_total;
2664 				rx = dev_statistics[device_fh].rx;
2665 			}
2666 			rx_dropped = rx_total - rx;
2667 
2668 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2669 					"\nTX total: 		%"PRIu64""
2670 					"\nTX dropped: 		%"PRIu64""
2671 					"\nTX successful: 		%"PRIu64""
2672 					"\nRX total: 		%"PRIu64""
2673 					"\nRX dropped: 		%"PRIu64""
2674 					"\nRX successful: 		%"PRIu64"",
2675 					device_fh,
2676 					tx_total,
2677 					tx_dropped,
2678 					tx,
2679 					rx_total,
2680 					rx_dropped,
2681 					rx);
2682 
2683 			dev_ll = dev_ll->next;
2684 		}
2685 		printf("\n======================================================\n");
2686 	}
2687 }
2688 
2689 static void
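/*
 * Create the mempool and companion ring that make up one zero-copy vpool
 * entry. The ring holds the mbufs that are currently detached from guest
 * buffers; attach_rxmbuf_zcp() moves them back into use from the mempool.
 */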
2690 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2691 	char *ring_name, uint32_t nb_mbuf)
2692 {
2693 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2694 	vpool_array[index].pool
2695 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2696 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2697 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2698 		rte_pktmbuf_init, NULL, socket, 0);
2699 	if (vpool_array[index].pool != NULL) {
2700 		vpool_array[index].ring
2701 			= rte_ring_create(ring_name,
2702 				rte_align32pow2(nb_mbuf + 1),
2703 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2704 		if (likely(vpool_array[index].ring != NULL)) {
2705 			LOG_DEBUG(VHOST_CONFIG,
2706 				"in setup_mempool_tbl: mbuf count in "
2707 				"mempool is: %d\n",
2708 				rte_mempool_count(vpool_array[index].pool));
2709 			LOG_DEBUG(VHOST_CONFIG,
2710 				"in setup_mempool_tbl: mbuf count in "
2711 				"ring is: %d\n",
2712 				rte_ring_count(vpool_array[index].ring));
2713 		} else {
2714 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2715 				ring_name);
2716 		}
2717 
2718 		/* Need to account for headroom. */
2719 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2720 	} else {
2721 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2722 	}
2723 }
2724 
2725 
2726 /*
2727  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2728  * device is also registered here to handle the IOCTLs.
2729  */
2730 int
2731 MAIN(int argc, char *argv[])
2732 {
2733 	struct rte_mempool *mbuf_pool = NULL;
2734 	unsigned lcore_id, core_id = 0;
2735 	unsigned nb_ports, valid_num_ports;
2736 	int ret;
2737 	uint8_t portid, queue_id = 0;
2738 	static pthread_t tid;
2739 
2740 	/* init EAL */
2741 	ret = rte_eal_init(argc, argv);
2742 	if (ret < 0)
2743 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2744 	argc -= ret;
2745 	argv += ret;
2746 
2747 	/* parse app arguments */
2748 	ret = us_vhost_parse_args(argc, argv);
2749 	if (ret < 0)
2750 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2751 
2752 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2753 		if (rte_lcore_is_enabled(lcore_id))
2754 			lcore_ids[core_id ++] = lcore_id;
2755 
2756 	if (rte_lcore_count() > RTE_MAX_LCORE)
2757 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2758 
2759 	/* Set the number of switching cores available. */
2760 	num_switching_cores = rte_lcore_count()-1;
2761 
2762 	/* Get the number of physical ports. */
2763 	nb_ports = rte_eth_dev_count();
2764 	if (nb_ports > RTE_MAX_ETHPORTS)
2765 		nb_ports = RTE_MAX_ETHPORTS;
2766 
2767 	/*
2768 	 * Update the global var NUM_PORTS and global array PORTS
2769 	 * and get value of var VALID_NUM_PORTS according to system ports number
2770 	 */
2771 	valid_num_ports = check_ports_num(nb_ports);
2772 
2773 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2774 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2775 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2776 		return -1;
2777 	}
2778 
2779 	if (zero_copy == 0) {
2780 		/* Create the mbuf pool. */
2781 		mbuf_pool = rte_mempool_create(
2782 				"MBUF_POOL",
2783 				NUM_MBUFS_PER_PORT
2784 				* valid_num_ports,
2785 				MBUF_SIZE, MBUF_CACHE_SIZE,
2786 				sizeof(struct rte_pktmbuf_pool_private),
2787 				rte_pktmbuf_pool_init, NULL,
2788 				rte_pktmbuf_init, NULL,
2789 				rte_socket_id(), 0);
2790 		if (mbuf_pool == NULL)
2791 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2792 
2793 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2794 			vpool_array[queue_id].pool = mbuf_pool;
2795 
2796 		if (vm2vm_mode == VM2VM_HARDWARE) {
2797 			/* Enable VT loopback so the L2 switch can forward VM-to-VM packets. */
2798 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2799 			LOG_DEBUG(VHOST_CONFIG,
2800 				"Enable loop back for L2 switch in vmdq.\n");
2801 		}
2802 	} else {
2803 		uint32_t nb_mbuf;
2804 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2805 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2806 
2807 		/*
2808 		 * Zero copy defers queue RX/TX start to the time when guest
2809 		 * finishes its startup and packet buffers from that guest are
2810 		 * available.
2811 		 */
2812 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2813 		rx_conf_default.rx_drop_en = 0;
2814 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
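		/*
		 * Size each zero-copy pool for the descriptor ring plus
		 * per-core burst and cache headroom.
		 */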
2815 		nb_mbuf = num_rx_descriptor
2816 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2817 			+ num_switching_cores * MAX_PKT_BURST;
2818 
2819 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2820 			snprintf(pool_name, sizeof(pool_name),
2821 				"rxmbuf_pool_%u", queue_id);
2822 			snprintf(ring_name, sizeof(ring_name),
2823 				"rxmbuf_ring_%u", queue_id);
2824 			setup_mempool_tbl(rte_socket_id(), queue_id,
2825 				pool_name, ring_name, nb_mbuf);
2826 		}
2827 
2828 		nb_mbuf = num_tx_descriptor
2829 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2830 				+ num_switching_cores * MAX_PKT_BURST;
2831 
2832 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2833 			snprintf(pool_name, sizeof(pool_name),
2834 				"txmbuf_pool_%u", queue_id);
2835 			snprintf(ring_name, sizeof(ring_name),
2836 				"txmbuf_ring_%u", queue_id);
2837 			setup_mempool_tbl(rte_socket_id(),
2838 				(queue_id + MAX_QUEUES),
2839 				pool_name, ring_name, nb_mbuf);
2840 		}
2841 
2842 		if (vm2vm_mode == VM2VM_HARDWARE) {
2843 			/* Enable VT loopback so the L2 switch can forward VM-to-VM packets. */
2844 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2845 			LOG_DEBUG(VHOST_CONFIG,
2846 				"Enable loop back for L2 switch in vmdq.\n");
2847 		}
2848 	}
2849 	/* Set log level. */
2850 	rte_set_log_level(LOG_LEVEL);
2851 
2852 	/* initialize all ports */
2853 	for (portid = 0; portid < nb_ports; portid++) {
2854 		/* skip ports that are not enabled */
2855 		if ((enabled_port_mask & (1 << portid)) == 0) {
2856 			RTE_LOG(INFO, VHOST_PORT,
2857 				"Skipping disabled port %d\n", portid);
2858 			continue;
2859 		}
2860 		if (port_init(portid) != 0)
2861 			rte_exit(EXIT_FAILURE,
2862 				"Cannot initialize network ports\n");
2863 	}
2864 
2865 	/* Initialise all linked lists. */
2866 	if (init_data_ll() == -1)
2867 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2868 
2869 	/* Initialize device stats */
2870 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2871 
2872 	/* Enable stats if the user option is set. */
2873 	if (enable_stats)
2874 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2875 
2876 	/* Launch all data cores. */
2877 	if (zero_copy == 0) {
2878 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2879 			rte_eal_remote_launch(switch_worker,
2880 				mbuf_pool, lcore_id);
2881 		}
2882 	} else {
2883 		uint32_t count_in_mempool, index, i;
2884 		for (index = 0; index < 2*MAX_QUEUES; index++) {
2885 			/* For all RX and TX queues. */
2886 			count_in_mempool
2887 				= rte_mempool_count(vpool_array[index].pool);
2888 
2889 			/*
2890 			 * Transfer all unattached mbufs from vpool.pool
2891 			 * to vpool.ring.
2892 			 */
2893 			for (i = 0; i < count_in_mempool; i++) {
2894 				struct rte_mbuf *mbuf
2895 					= __rte_mbuf_raw_alloc(
2896 						vpool_array[index].pool);
2897 				rte_ring_sp_enqueue(vpool_array[index].ring,
2898 						(void *)mbuf);
2899 			}
2900 
2901 			LOG_DEBUG(VHOST_CONFIG,
2902 				"in MAIN: mbuf count in mempool at initial "
2903 				"is: %d\n", count_in_mempool);
2904 			LOG_DEBUG(VHOST_CONFIG,
2905 				"in MAIN: mbuf count in ring at initial is:"
2906 				" %d\n",
2907 				rte_ring_count(vpool_array[index].ring));
2908 		}
2909 
2910 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
2911 			rte_eal_remote_launch(switch_worker_zcp, NULL,
2912 				lcore_id);
2913 	}
2914 
2915 	/* Register CUSE device to handle IOCTLs. */
2916 	ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
2917 	if (ret != 0)
2918 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
2919 
2920 	init_virtio_net(&virtio_net_device_ops);
2921 
2922 	/* Start CUSE session. */
2923 	start_cuse_session_loop();
2924 	return 0;
2925 
2926 }
2927 
2928