xref: /dpdk/examples/vhost/main.c (revision a3fb248dce91a3038b14453aa4002cfc45469f7a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
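/*
 * Note: num_switching_cores, MBUF_CACHE_SIZE and RTE_TEST_RX_DESC_DEFAULT are
 * defined below; that is fine because the macro is only expanded at its point
 * of use. The total budgets mbufs for the RX rings, the in-flight bursts, the
 * TX rings and the per-core mempool caches.
 */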
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
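/* 1518 bytes is the maximum standard Ethernet frame: 1500-byte payload plus
 * 14-byte header plus 4-byte FCS. */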
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
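/* 0x2600 = 9728 bytes, enough for a common 9000-byte jumbo payload plus headers. */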
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These two macros need refining for the legacy and DPDK-based front ends:
105  * take the max vring avail descriptors/entries from the guest, subtract
106  * MAX_PKT_BURST, and then round to a power of 2.
107  */
108 /*
109  * For the legacy front end: 128 descriptors,
110  * half for virtio headers and the other half for mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
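/*
 * The zero copy path stashes the guest descriptor index of an attached mbuf
 * here (see attach_rxmbuf_zcp()), so the descriptor can be returned to the
 * used ring once the mbuf is recycled.
 */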
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
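/* For example, POWEROF2(64) is true and POWEROF2(48) is false; note that 0 also passes. */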
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
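/* Typically 64 / 16 = 4, assuming a 64-byte cache line and the standard 16-byte vring_desc. */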
141 
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144 
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147 
148 /* Number of switching cores enabled */
149 static uint32_t num_switching_cores = 0;
150 
151 /* Number of devices/queues to support */
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154 
155 /*
156  * Enable zero copy: guest packet buffers are attached directly to the HW
157  * descriptors (no intermediate copy). Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161 
162 /* Number of descriptors to use (zero copy only). */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165 
166 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
167 #define MAX_RING_DESC 4096
168 
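/*
 * Mempool/ring pair used per queue by the zero copy path: the ring holds spare
 * mbufs whose buffers get re-attached to guest-provided frame buffers.
 */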
169 struct vpool {
170 	struct rte_mempool *pool;
171 	struct rte_ring *ring;
172 	uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174 
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177 	VM2VM_DISABLED = 0,
178 	VM2VM_SOFTWARE = 1,
179 	VM2VM_HARDWARE = 2,
180 	VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
183 
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186 	PHYS_ADDR_CONTINUOUS = 0,
187 	PHYS_ADDR_CROSS_SUBREG = 1,
188 	PHYS_ADDR_INVALID = 2,
189 	PHYS_ADDR_LAST
190 } hpa_type;
191 
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify the timeout (in microseconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200 
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203 
204 /* Empty VMDQ configuration structure. Filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206 	.rxmode = {
207 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208 		.split_hdr_size = 0,
209 		.header_split   = 0, /**< Header Split disabled */
210 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
211 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
212 		/*
213 		 * Required for 1G NICs such as the I350; this fixes a bug
214 		 * where IPv4 forwarding in the guest cannot forward packets
215 		 * from one virtio dev to another virtio dev.
216 		 */
217 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
220 	},
221 
222 	.txmode = {
223 		.mq_mode = ETH_MQ_TX_NONE,
224 	},
225 	.rx_adv_conf = {
226 		/*
227 		 * should be overridden separately in code with
228 		 * appropriate values
229 		 */
230 		.vmdq_rx_conf = {
231 			.nb_queue_pools = ETH_8_POOLS,
232 			.enable_default_pool = 0,
233 			.default_pool = 0,
234 			.nb_pool_maps = 0,
235 			.pool_map = {{0, 0},},
236 		},
237 	},
238 };
239 
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246 
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
251 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
258 
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261 
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265 
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268 
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271 	unsigned len;
272 	unsigned txq_id;
273 	struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275 
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278 
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281 
282 /* VLAN header struct used to insert VLAN tags on TX. */
283 struct vlan_ethhdr {
284 	unsigned char   h_dest[ETH_ALEN];
285 	unsigned char   h_source[ETH_ALEN];
286 	__be16          h_vlan_proto;
287 	__be16          h_vlan_TCI;
288 	__be16          h_vlan_encapsulated_proto;
289 };
290 
291 /* IPv4 Header */
292 struct ipv4_hdr {
293 	uint8_t  version_ihl;		/**< version and header length */
294 	uint8_t  type_of_service;	/**< type of service */
295 	uint16_t total_length;		/**< length of packet */
296 	uint16_t packet_id;		/**< packet ID */
297 	uint16_t fragment_offset;	/**< fragmentation offset */
298 	uint8_t  time_to_live;		/**< time to live */
299 	uint8_t  next_proto_id;		/**< protocol ID */
300 	uint16_t hdr_checksum;		/**< header checksum */
301 	uint32_t src_addr;		/**< source address */
302 	uint32_t dst_addr;		/**< destination address */
303 } __attribute__((__packed__));
304 
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
308 
309 /* Per-device statistics struct */
310 struct device_statistics {
311 	uint64_t tx_total;
312 	rte_atomic64_t rx_total_atomic;
313 	uint64_t rx_total;
314 	uint64_t tx;
315 	rte_atomic64_t rx_atomic;
316 	uint64_t rx;
317 } __rte_cache_aligned;
318 struct device_statistics dev_statistics[MAX_DEVICES];
319 
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327 	struct rte_eth_vmdq_rx_conf conf;
328 	struct rte_eth_vmdq_rx_conf *def_conf =
329 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330 	unsigned i;
331 
332 	memset(&conf, 0, sizeof(conf));
333 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334 	conf.nb_pool_maps = num_devices;
335 	conf.enable_loop_back = def_conf->enable_loop_back;
336 	conf.rx_mode = def_conf->rx_mode;
337 
338 	for (i = 0; i < conf.nb_pool_maps; i++) {
339 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
340 		conf.pool_map[i].pools = (1UL << i);
341 	}
342 
343 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346 	return 0;
347 }
348 
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, log an error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357 	if (num_devices > max_nb_devices) {
358 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359 		return -1;
360 	}
361 	return 0;
362 }
363 
364 /*
365  * Initialises a given port using global settings, with the RX buffers
366  * coming from the per-queue mempools in vpool_array.
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371 	struct rte_eth_dev_info dev_info;
372 	struct rte_eth_conf port_conf;
373 	struct rte_eth_rxconf *rxconf;
374 	struct rte_eth_txconf *txconf;
375 	int16_t rx_rings, tx_rings;
376 	uint16_t rx_ring_size, tx_ring_size;
377 	int retval;
378 	uint16_t q;
379 
380 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381 	rte_eth_dev_info_get (port, &dev_info);
382 
383 	if (dev_info.max_rx_queues > MAX_QUEUES) {
384 		rte_exit(EXIT_FAILURE,
385 			"please define MAX_QUEUES no less than %u in %s\n",
386 			dev_info.max_rx_queues, __FILE__);
387 	}
388 
389 	rxconf = &dev_info.default_rxconf;
390 	txconf = &dev_info.default_txconf;
391 	rxconf->rx_drop_en = 1;
392 
393 	/*
394 	 * Zero copy defers queue RX/TX start to the time when guest
395 	 * finishes its startup and packet buffers from that guest are
396 	 * available.
397 	 */
398 	if (zero_copy) {
399 		rxconf->rx_deferred_start = 1;
400 		rxconf->rx_drop_en = 0;
401 		txconf->tx_deferred_start = 1;
402 	}
403 
404 	/* Configure the number of supported virtio devices based on VMDQ limits. */
405 	num_devices = dev_info.max_vmdq_pools;
406 
407 	if (zero_copy) {
408 		rx_ring_size = num_rx_descriptor;
409 		tx_ring_size = num_tx_descriptor;
410 		tx_rings = dev_info.max_tx_queues;
411 	} else {
412 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
413 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
414 		tx_rings = (uint16_t)rte_lcore_count();
415 	}
416 
417 	retval = validate_num_devices(MAX_DEVICES);
418 	if (retval < 0)
419 		return retval;
420 
421 	/* Get port configuration. */
422 	retval = get_eth_conf(&port_conf, num_devices);
423 	if (retval < 0)
424 		return retval;
425 	/* NIC queues are divided into pf queues and vmdq queues.  */
426 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
427 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
428 	num_vmdq_queues = num_devices * queues_per_pool;
429 	num_queues = num_pf_queues + num_vmdq_queues;
430 	vmdq_queue_base = dev_info.vmdq_queue_base;
431 	vmdq_pool_base  = dev_info.vmdq_pool_base;
432 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
433 		num_pf_queues, num_devices, queues_per_pool);
434 
435 	if (port >= rte_eth_dev_count()) return -1;
436 
437 	rx_rings = (uint16_t)dev_info.max_rx_queues;
438 	/* Configure ethernet device. */
439 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
440 	if (retval != 0)
441 		return retval;
442 
443 	/* Setup the queues. */
444 	for (q = 0; q < rx_rings; q ++) {
445 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
446 						rte_eth_dev_socket_id(port),
447 						rxconf,
448 						vpool_array[q].pool);
449 		if (retval < 0)
450 			return retval;
451 	}
452 	for (q = 0; q < tx_rings; q ++) {
453 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
454 						rte_eth_dev_socket_id(port),
455 						txconf);
456 		if (retval < 0)
457 			return retval;
458 	}
459 
460 	/* Start the device. */
461 	retval  = rte_eth_dev_start(port);
462 	if (retval < 0) {
463 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
464 		return retval;
465 	}
466 
467 	if (promiscuous)
468 		rte_eth_promiscuous_enable(port);
469 
470 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
471 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
472 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
473 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
474 			(unsigned)port,
475 			vmdq_ports_eth_addr[port].addr_bytes[0],
476 			vmdq_ports_eth_addr[port].addr_bytes[1],
477 			vmdq_ports_eth_addr[port].addr_bytes[2],
478 			vmdq_ports_eth_addr[port].addr_bytes[3],
479 			vmdq_ports_eth_addr[port].addr_bytes[4],
480 			vmdq_ports_eth_addr[port].addr_bytes[5]);
481 
482 	return 0;
483 }
484 
485 /*
486  * Set character device basename.
487  */
488 static int
489 us_vhost_parse_basename(const char *q_arg)
490 {
491 	/* Reject basenames that don't fit; otherwise copy into dev_basename. */
492 
493 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
494 		return -1;
495 	else
496 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
497 
498 	return 0;
499 }
500 
501 /*
502  * Parse the portmask provided at run time.
503  */
504 static int
505 parse_portmask(const char *portmask)
506 {
507 	char *end = NULL;
508 	unsigned long pm;
509 
510 	errno = 0;
511 
512 	/* parse hexadecimal string */
513 	pm = strtoul(portmask, &end, 16);
514 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
515 		return -1;
516 
517 	if (pm == 0)
518 		return -1;
519 
520 	return pm;
521 
522 }
523 
524 /*
525  * Parse num options at run time.
526  */
527 static int
528 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
529 {
530 	char *end = NULL;
531 	unsigned long num;
532 
533 	errno = 0;
534 
535 	/* parse unsigned int string */
536 	num = strtoul(q_arg, &end, 10);
537 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
538 		return -1;
539 
540 	if (num > max_valid_value)
541 		return -1;
542 
543 	return num;
544 
545 }
546 
547 /*
548  * Display usage
549  */
550 static void
551 us_vhost_usage(const char *prgname)
552 {
553 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
554 	"		--vm2vm [0|1|2]\n"
555 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
556 	"		--dev-basename <name>\n"
557 	"		--nb-devices ND\n"
558 	"		-p PORTMASK: Set mask for ports to be used by application\n"
559 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
560 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retry if the destination queue is full\n"
561 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
562 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
563 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
564 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
565 	"		--dev-basename: The basename to be used for the character device.\n"
566 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
567 			"zero copy\n"
568 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
569 			"used only when zero copy is enabled.\n"
570 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
571 			"used only when zero copy is enabled.\n",
572 	       prgname);
573 }
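/*
 * Hypothetical example invocation (binary name, EAL core mask and port mask
 * are placeholders, not taken from this file):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 --stats 2
 */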
574 
575 /*
576  * Parse the arguments given in the command line of the application.
577  */
578 static int
579 us_vhost_parse_args(int argc, char **argv)
580 {
581 	int opt, ret;
582 	int option_index;
583 	unsigned i;
584 	const char *prgname = argv[0];
585 	static struct option long_option[] = {
586 		{"vm2vm", required_argument, NULL, 0},
587 		{"rx-retry", required_argument, NULL, 0},
588 		{"rx-retry-delay", required_argument, NULL, 0},
589 		{"rx-retry-num", required_argument, NULL, 0},
590 		{"mergeable", required_argument, NULL, 0},
591 		{"stats", required_argument, NULL, 0},
592 		{"dev-basename", required_argument, NULL, 0},
593 		{"zero-copy", required_argument, NULL, 0},
594 		{"rx-desc-num", required_argument, NULL, 0},
595 		{"tx-desc-num", required_argument, NULL, 0},
596 		{NULL, 0, 0, 0},
597 	};
598 
599 	/* Parse command line */
600 	while ((opt = getopt_long(argc, argv, "p:P",
601 			long_option, &option_index)) != EOF) {
602 		switch (opt) {
603 		/* Portmask */
604 		case 'p':
605 			enabled_port_mask = parse_portmask(optarg);
606 			if (enabled_port_mask == 0) {
607 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 				us_vhost_usage(prgname);
609 				return -1;
610 			}
611 			break;
612 
613 		case 'P':
614 			promiscuous = 1;
615 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
616 				ETH_VMDQ_ACCEPT_BROADCAST |
617 				ETH_VMDQ_ACCEPT_MULTICAST;
618 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
619 
620 			break;
621 
622 		case 0:
623 			/* Enable/disable vm2vm comms. */
624 			if (!strncmp(long_option[option_index].name, "vm2vm",
625 				MAX_LONG_OPT_SZ)) {
626 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
627 				if (ret == -1) {
628 					RTE_LOG(INFO, VHOST_CONFIG,
629 						"Invalid argument for "
630 						"vm2vm [0|1|2]\n");
631 					us_vhost_usage(prgname);
632 					return -1;
633 				} else {
634 					vm2vm_mode = (vm2vm_type)ret;
635 				}
636 			}
637 
638 			/* Enable/disable retries on RX. */
639 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
640 				ret = parse_num_opt(optarg, 1);
641 				if (ret == -1) {
642 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
643 					us_vhost_usage(prgname);
644 					return -1;
645 				} else {
646 					enable_retry = ret;
647 				}
648 			}
649 
650 			/* Specify the retry delay time (in microseconds) on RX. */
651 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
652 				ret = parse_num_opt(optarg, INT32_MAX);
653 				if (ret == -1) {
654 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
655 					us_vhost_usage(prgname);
656 					return -1;
657 				} else {
658 					burst_rx_delay_time = ret;
659 				}
660 			}
661 
662 			/* Specify the number of retries on RX. */
663 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
664 				ret = parse_num_opt(optarg, INT32_MAX);
665 				if (ret == -1) {
666 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
667 					us_vhost_usage(prgname);
668 					return -1;
669 				} else {
670 					burst_rx_retry_num = ret;
671 				}
672 			}
673 
674 			/* Enable/disable RX mergeable buffers. */
675 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
676 				ret = parse_num_opt(optarg, 1);
677 				if (ret == -1) {
678 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
679 					us_vhost_usage(prgname);
680 					return -1;
681 				} else {
682 					mergeable = !!ret;
683 					if (ret) {
684 						vmdq_conf_default.rxmode.jumbo_frame = 1;
685 						vmdq_conf_default.rxmode.max_rx_pkt_len
686 							= JUMBO_FRAME_MAX_SIZE;
687 					}
688 				}
689 			}
690 
691 			/* Enable/disable stats. */
692 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
693 				ret = parse_num_opt(optarg, INT32_MAX);
694 				if (ret == -1) {
695 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
696 					us_vhost_usage(prgname);
697 					return -1;
698 				} else {
699 					enable_stats = ret;
700 				}
701 			}
702 
703 			/* Set character device basename. */
704 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
705 				if (us_vhost_parse_basename(optarg) == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
707 					us_vhost_usage(prgname);
708 					return -1;
709 				}
710 			}
711 
712 			/* Enable/disable rx/tx zero copy. */
713 			if (!strncmp(long_option[option_index].name,
714 				"zero-copy", MAX_LONG_OPT_SZ)) {
715 				ret = parse_num_opt(optarg, 1);
716 				if (ret == -1) {
717 					RTE_LOG(INFO, VHOST_CONFIG,
718 						"Invalid argument"
719 						" for zero-copy [0|1]\n");
720 					us_vhost_usage(prgname);
721 					return -1;
722 				} else
723 					zero_copy = ret;
724 
725 				if (zero_copy) {
726 #ifdef RTE_MBUF_REFCNT
727 					RTE_LOG(ERR, VHOST_CONFIG, "Before running the "
728 					"zero copy vhost app, please "
729 					"disable RTE_MBUF_REFCNT in the "
730 					"config file and then rebuild the DPDK "
731 					"core lib!\n"
732 					"Otherwise please disable the zero copy "
733 					"flag on the command line!\n");
734 					return -1;
735 #endif
736 				}
737 			}
738 
739 			/* Specify the descriptor number on RX. */
740 			if (!strncmp(long_option[option_index].name,
741 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
742 				ret = parse_num_opt(optarg, MAX_RING_DESC);
743 				if ((ret == -1) || (!POWEROF2(ret))) {
744 					RTE_LOG(INFO, VHOST_CONFIG,
745 					"Invalid argument for rx-desc-num [0-N], "
746 					"power of 2 required.\n");
747 					us_vhost_usage(prgname);
748 					return -1;
749 				} else {
750 					num_rx_descriptor = ret;
751 				}
752 			}
753 
754 			/* Specify the descriptor number on TX. */
755 			if (!strncmp(long_option[option_index].name,
756 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
757 				ret = parse_num_opt(optarg, MAX_RING_DESC);
758 				if ((ret == -1) || (!POWEROF2(ret))) {
759 					RTE_LOG(INFO, VHOST_CONFIG,
760 					"Invalid argument for tx-desc-num [0-N], "
761 					"power of 2 required.\n");
762 					us_vhost_usage(prgname);
763 					return -1;
764 				} else {
765 					num_tx_descriptor = ret;
766 				}
767 			}
768 
769 			break;
770 
771 			/* Invalid option - print options. */
772 		default:
773 			us_vhost_usage(prgname);
774 			return -1;
775 		}
776 	}
777 
778 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
779 		if (enabled_port_mask & (1 << i))
780 			ports[num_ports++] = (uint8_t)i;
781 	}
782 
783 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
784 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
785 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
786 		return -1;
787 	}
788 
789 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
790 		RTE_LOG(INFO, VHOST_PORT,
791 			"Vhost zero copy doesn't support software vm2vm, "
792 			"please specify '--vm2vm 2' to use hardware vm2vm.\n");
793 		return -1;
794 	}
795 
796 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
797 		RTE_LOG(INFO, VHOST_PORT,
798 			"Vhost zero copy doesn't support jumbo frames, "
799 			"please specify '--mergeable 0' to disable the "
800 			"mergeable feature.\n");
801 		return -1;
802 	}
803 
804 	return 0;
805 }
806 
807 /*
808  * Update the global variable num_ports and the ports array according to the
809  * number of system ports, and return the number of valid ports.
810  */
811 static unsigned check_ports_num(unsigned nb_ports)
812 {
813 	unsigned valid_num_ports = num_ports;
814 	unsigned portid;
815 
816 	if (num_ports > nb_ports) {
817 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
818 			num_ports, nb_ports);
819 		num_ports = nb_ports;
820 	}
821 
822 	for (portid = 0; portid < num_ports; portid ++) {
823 		if (ports[portid] >= nb_ports) {
824 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
825 				ports[portid], (nb_ports - 1));
826 			ports[portid] = INVALID_PORT_ID;
827 			valid_num_ports--;
828 		}
829 	}
830 	return valid_num_ports;
831 }
832 
833 /*
834  * Macro to print out packet contents. Wrapped in debug define so that the
835  * data path is not affected when debug is disabled.
836  */
837 #ifdef DEBUG
838 #define PRINT_PACKET(device, addr, size, header) do {																\
839 	char *pkt_addr = (char*)(addr);																					\
840 	unsigned int index;																								\
841 	char packet[MAX_PRINT_BUFF];																					\
842 																													\
843 	if ((header))																									\
844 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
845 	else																											\
846 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
847 	for (index = 0; index < (size); index++) {																		\
848 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
849 			"%02hhx ", pkt_addr[index]);																			\
850 	}																												\
851 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
852 																													\
853 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
854 } while(0)
855 #else
856 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
857 #endif
858 
859 /*
860  * Function to convert guest physical addresses to vhost physical addresses.
861  * This is used to convert virtio buffer addresses.
862  */
863 static inline uint64_t __attribute__((always_inline))
864 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
865 	uint32_t buf_len, hpa_type *addr_type)
866 {
867 	struct virtio_memory_regions_hpa *region;
868 	uint32_t regionidx;
869 	uint64_t vhost_pa = 0;
870 
871 	*addr_type = PHYS_ADDR_INVALID;
872 
873 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
874 		region = &vdev->regions_hpa[regionidx];
875 		if ((guest_pa >= region->guest_phys_address) &&
876 			(guest_pa <= region->guest_phys_address_end)) {
877 			vhost_pa = region->host_phys_addr_offset + guest_pa;
878 			if (likely((guest_pa + buf_len - 1)
879 				<= region->guest_phys_address_end))
880 				*addr_type = PHYS_ADDR_CONTINUOUS;
881 			else
882 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
883 			break;
884 		}
885 	}
886 
887 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
888 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
889 		(void *)(uintptr_t)vhost_pa);
890 
891 	return vhost_pa;
892 }
893 
894 /*
895  * Compares a packet destination MAC address to a device MAC address.
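 * A 64-bit load covers the six address bytes plus two trailing bytes, which
 * the MAC_ADDR_CMP mask discards (a little-endian layout is assumed).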
896  */
897 static inline int __attribute__((always_inline))
898 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
899 {
900 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
901 }
902 
903 /*
904  * This function learns the MAC address of the device and registers it, along
905  * with a VLAN tag, with a VMDQ pool.
906  */
907 static int
908 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
909 {
910 	struct ether_hdr *pkt_hdr;
911 	struct virtio_net_data_ll *dev_ll;
912 	struct virtio_net *dev = vdev->dev;
913 	int i, ret;
914 
915 	/* Learn MAC address of guest device from packet */
916 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
917 
918 	dev_ll = ll_root_used;
919 
920 	while (dev_ll != NULL) {
921 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
922 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
923 			return -1;
924 		}
925 		dev_ll = dev_ll->next;
926 	}
927 
928 	for (i = 0; i < ETHER_ADDR_LEN; i++)
929 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
930 
931 	/* vlan_tag is currently indexed by device_fh. */
932 	vdev->vlan_tag = vlan_tags[dev->device_fh];
933 
934 	/* Print out VMDQ registration info. */
935 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
936 		dev->device_fh,
937 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
938 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
939 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
940 		vdev->vlan_tag);
941 
942 	/* Register the MAC address. */
943 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
944 				(uint32_t)dev->device_fh + vmdq_pool_base);
945 	if (ret)
946 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
947 					dev->device_fh);
948 
949 	/* Enable stripping of the vlan tag as we handle routing. */
950 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
951 
952 	/* Set device as ready for RX. */
953 	vdev->ready = DEVICE_RX;
954 
955 	return 0;
956 }
957 
958 /*
959  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
960  * queue before disabling RX on the device.
961  */
962 static inline void
963 unlink_vmdq(struct vhost_dev *vdev)
964 {
965 	unsigned i = 0;
966 	unsigned rx_count;
967 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
968 
969 	if (vdev->ready == DEVICE_RX) {
970 		/*clear MAC and VLAN settings*/
971 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
972 		for (i = 0; i < 6; i++)
973 			vdev->mac_address.addr_bytes[i] = 0;
974 
975 		vdev->vlan_tag = 0;
976 
977 		/*Clear out the receive buffers*/
978 		rx_count = rte_eth_rx_burst(ports[0],
979 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980 
981 		while (rx_count) {
982 			for (i = 0; i < rx_count; i++)
983 				rte_pktmbuf_free(pkts_burst[i]);
984 
985 			rx_count = rte_eth_rx_burst(ports[0],
986 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
987 		}
988 
989 		vdev->ready = DEVICE_MAC_LEARNING;
990 	}
991 }
992 
993 /*
994  * Check if the packet destination MAC address is for a local device. If so then put
995  * the packet on that device's RX queue. If not then return.
996  */
997 static inline int __attribute__((always_inline))
998 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
999 {
1000 	struct virtio_net_data_ll *dev_ll;
1001 	struct ether_hdr *pkt_hdr;
1002 	uint64_t ret = 0;
1003 	struct virtio_net *dev = vdev->dev;
1004 	struct virtio_net *tdev; /* destination virtio device */
1005 
1006 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1007 
1008 	/*get the used devices list*/
1009 	dev_ll = ll_root_used;
1010 
1011 	while (dev_ll != NULL) {
1012 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1013 				          &dev_ll->vdev->mac_address)) {
1014 
1015 			/* Drop the packet if the TX packet is destined for the TX device. */
1016 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1017 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1018 							dev->device_fh);
1019 				return 0;
1020 			}
1021 			tdev = dev_ll->vdev->dev;
1022 
1023 
1024 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1025 
1026 			if (unlikely(dev_ll->vdev->remove)) {
1027 				/*drop the packet if the device is marked for removal*/
1028 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1029 			} else {
1030 				/*send the packet to the local virtio device*/
1031 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1032 				if (enable_stats) {
1033 					rte_atomic64_add(
1034 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1035 					1);
1036 					rte_atomic64_add(
1037 					&dev_statistics[tdev->device_fh].rx_atomic,
1038 					ret);
1039 					dev_statistics[tdev->device_fh].tx_total++;
1040 					dev_statistics[tdev->device_fh].tx += ret;
1041 				}
1042 			}
1043 
1044 			return 0;
1045 		}
1046 		dev_ll = dev_ll->next;
1047 	}
1048 
1049 	return -1;
1050 }
1051 
1052 /*
1053  * Check if the destination MAC of a packet is one local VM,
1054  * and get its vlan tag, and offset if it is.
1055  */
1056 static inline int __attribute__((always_inline))
1057 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1058 	uint32_t *offset, uint16_t *vlan_tag)
1059 {
1060 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1061 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1062 
1063 	while (dev_ll != NULL) {
1064 		if ((dev_ll->vdev->ready == DEVICE_RX)
1065 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1066 		&dev_ll->vdev->mac_address)) {
1067 			/*
1068 			 * Drop the packet if the TX packet is
1069 			 * destined for the TX device.
1070 			 */
1071 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1072 				LOG_DEBUG(VHOST_DATA,
1073 				"(%"PRIu64") TX: Source and destination"
1074 				" MAC addresses are the same. Dropping "
1075 				"packet.\n",
1076 				dev_ll->vdev->dev->device_fh);
1077 				return -1;
1078 			}
1079 
1080 			/*
1081 			 * HW VLAN strip reduces the packet length by the
1082 			 * length of the VLAN tag, so restore the packet
1083 			 * length by adding it back.
1084 			 */
1085 			*offset = VLAN_HLEN;
1086 			*vlan_tag =
1087 			(uint16_t)
1088 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1089 
1090 			LOG_DEBUG(VHOST_DATA,
1091 			"(%"PRIu64") TX: pkt to local VM device id:"
1092 			"(%"PRIu64") vlan tag: %d.\n",
1093 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1094 			*vlan_tag);
1095 
1096 			break;
1097 		}
1098 		dev_ll = dev_ll->next;
1099 	}
1100 	return 0;
1101 }
1102 
1103 /*
1104  * This function routes the TX packet to the correct interface. This may be a local device
1105  * or the physical port.
1106  */
1107 static inline void __attribute__((always_inline))
1108 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1109 {
1110 	struct mbuf_table *tx_q;
1111 	struct rte_mbuf **m_table;
1112 	unsigned len, ret, offset = 0;
1113 	const uint16_t lcore_id = rte_lcore_id();
1114 	struct virtio_net *dev = vdev->dev;
1115 
1116 	/*check if destination is local VM*/
1117 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1118 		rte_pktmbuf_free(m);
1119 		return;
1120 	}
1121 
1122 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1123 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1124 			rte_pktmbuf_free(m);
1125 			return;
1126 		}
1127 	}
1128 
1129 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1130 
1131 	/*Add packet to the port tx queue*/
1132 	tx_q = &lcore_tx_queue[lcore_id];
1133 	len = tx_q->len;
1134 
1135 	m->ol_flags = PKT_TX_VLAN_PKT;
1136 
1137 	/*
1138 	 * Find the right seg to adjust the data len when offset is
1139 	 * bigger than tail room size.
1140 	 */
1141 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1142 		if (likely(offset <= rte_pktmbuf_tailroom(m)))
1143 			m->data_len += offset;
1144 		else {
1145 			struct rte_mbuf *seg = m;
1146 
1147 			while ((seg->next != NULL) &&
1148 				(offset > rte_pktmbuf_tailroom(seg)))
1149 				seg = seg->next;
1150 
1151 			seg->data_len += offset;
1152 		}
1153 		m->pkt_len += offset;
1154 	}
1155 
1156 	m->vlan_tci = vlan_tag;
1157 
1158 	tx_q->m_table[len] = m;
1159 	len++;
1160 	if (enable_stats) {
1161 		dev_statistics[dev->device_fh].tx_total++;
1162 		dev_statistics[dev->device_fh].tx++;
1163 	}
1164 
1165 	if (unlikely(len == MAX_PKT_BURST)) {
1166 		m_table = (struct rte_mbuf **)tx_q->m_table;
1167 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1168 		/* Free any buffers not handled by TX and update the port stats. */
1169 		if (unlikely(ret < len)) {
1170 			do {
1171 				rte_pktmbuf_free(m_table[ret]);
1172 			} while (++ret < len);
1173 		}
1174 
1175 		len = 0;
1176 	}
1177 
1178 	tx_q->len = len;
1179 	return;
1180 }
1181 /*
1182  * This function is called by each data core. It handles all RX/TX registered with the
1183  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1184  * with all devices in the main linked list.
1185  */
1186 static int
1187 switch_worker(void *arg)
1188 {
1189 	struct rte_mempool *mbuf_pool = arg;
1190 	struct virtio_net *dev = NULL;
1191 	struct vhost_dev *vdev = NULL;
1192 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1193 	struct virtio_net_data_ll *dev_ll;
1194 	struct mbuf_table *tx_q;
1195 	volatile struct lcore_ll_info *lcore_ll;
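	/* TSC ticks per BURST_TX_DRAIN_US period: ceil(tsc_hz / US_PER_S) * BURST_TX_DRAIN_US. */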
1196 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1197 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1198 	unsigned ret, i;
1199 	const uint16_t lcore_id = rte_lcore_id();
1200 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1201 	uint16_t rx_count = 0;
1202 	uint16_t tx_count;
1203 	uint32_t retry = 0;
1204 
1205 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1206 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1207 	prev_tsc = 0;
1208 
1209 	tx_q = &lcore_tx_queue[lcore_id];
1210 	for (i = 0; i < num_cores; i ++) {
1211 		if (lcore_ids[i] == lcore_id) {
1212 			tx_q->txq_id = i;
1213 			break;
1214 		}
1215 	}
1216 
1217 	while(1) {
1218 		cur_tsc = rte_rdtsc();
1219 		/*
1220 		 * TX burst queue drain
1221 		 */
1222 		diff_tsc = cur_tsc - prev_tsc;
1223 		if (unlikely(diff_tsc > drain_tsc)) {
1224 
1225 			if (tx_q->len) {
1226 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1227 
1228 				/*Tx any packets in the queue*/
1229 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1230 									   (struct rte_mbuf **)tx_q->m_table,
1231 									   (uint16_t)tx_q->len);
1232 				if (unlikely(ret < tx_q->len)) {
1233 					do {
1234 						rte_pktmbuf_free(tx_q->m_table[ret]);
1235 					} while (++ret < tx_q->len);
1236 				}
1237 
1238 				tx_q->len = 0;
1239 			}
1240 
1241 			prev_tsc = cur_tsc;
1242 
1243 		}
1244 
1245 		rte_prefetch0(lcore_ll->ll_root_used);
1246 		/*
1247 		 * Inform the configuration core that we have exited the linked list and that no devices are
1248 		 * in use if requested.
1249 		 */
1250 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1251 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1252 
1253 		/*
1254 		 * Process devices
1255 		 */
1256 		dev_ll = lcore_ll->ll_root_used;
1257 
1258 		while (dev_ll != NULL) {
1259 			/*get virtio device ID*/
1260 			vdev = dev_ll->vdev;
1261 			dev = vdev->dev;
1262 
1263 			if (unlikely(vdev->remove)) {
1264 				dev_ll = dev_ll->next;
1265 				unlink_vmdq(vdev);
1266 				vdev->ready = DEVICE_SAFE_REMOVE;
1267 				continue;
1268 			}
1269 			if (likely(vdev->ready == DEVICE_RX)) {
1270 				/*Handle guest RX*/
1271 				rx_count = rte_eth_rx_burst(ports[0],
1272 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1273 
1274 				if (rx_count) {
1275 					/*
1276 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1277 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1278 					*/
1279 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1280 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1281 							rte_delay_us(burst_rx_delay_time);
1282 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1283 								break;
1284 						}
1285 					}
1286 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1287 					if (enable_stats) {
1288 						rte_atomic64_add(
1289 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1290 						rx_count);
1291 						rte_atomic64_add(
1292 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1293 					}
1294 					while (likely(rx_count)) {
1295 						rx_count--;
1296 						rte_pktmbuf_free(pkts_burst[rx_count]);
1297 					}
1298 
1299 				}
1300 			}
1301 
1302 			if (likely(!vdev->remove)) {
1303 				/* Handle guest TX*/
1304 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1305 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1306 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1307 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1308 						while (tx_count--)
1309 							rte_pktmbuf_free(pkts_burst[tx_count]);
1310 					}
1311 				}
1312 				while (tx_count)
1313 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1314 			}
1315 
1316 			/*move to the next device in the list*/
1317 			dev_ll = dev_ll->next;
1318 		}
1319 	}
1320 
1321 	return 0;
1322 }
1323 
1324 /*
1325  * This function gets the number of available ring entries for zero copy RX.
1326  * Only one thread will call this function for a particular virtio device,
1327  * so it is designed as a non-thread-safe function.
1328  */
1329 static inline uint32_t __attribute__((always_inline))
1330 get_available_ring_num_zcp(struct virtio_net *dev)
1331 {
1332 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1333 	uint16_t avail_idx;
1334 
1335 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1336 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1337 }
1338 
1339 /*
1340  * This function gets the available ring index for zero copy RX;
1341  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_index_zcp(struct virtio_net *dev,
1347 	uint16_t *res_base_idx, uint32_t count)
1348 {
1349 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1350 	uint16_t avail_idx;
1351 	uint32_t retry = 0;
1352 	uint16_t free_entries;
1353 
1354 	*res_base_idx = vq->last_used_idx_res;
1355 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1356 	free_entries = (avail_idx - *res_base_idx);
1357 
1358 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1359 			"avail idx: %d, "
1360 			"res base idx:%d, free entries:%d\n",
1361 			dev->device_fh, avail_idx, *res_base_idx,
1362 			free_entries);
1363 
1364 	/*
1365 	 * If retry is enabled and the queue is full then we wait
1366 	 * and retry to avoid packet loss.
1367 	 */
1368 	if (enable_retry && unlikely(count > free_entries)) {
1369 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1370 			rte_delay_us(burst_rx_delay_time);
1371 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372 			free_entries = (avail_idx - *res_base_idx);
1373 			if (count <= free_entries)
1374 				break;
1375 		}
1376 	}
1377 
1378 	/*check that we have enough buffers*/
1379 	if (unlikely(count > free_entries))
1380 		count = free_entries;
1381 
1382 	if (unlikely(count == 0)) {
1383 		LOG_DEBUG(VHOST_DATA,
1384 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1385 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1386 			dev->device_fh, avail_idx,
1387 			*res_base_idx, free_entries);
1388 		return 0;
1389 	}
1390 
1391 	vq->last_used_idx_res = *res_base_idx + count;
1392 
1393 	return count;
1394 }
1395 
1396 /*
1397  * This function puts a descriptor back on the used list.
1398  */
1399 static inline void __attribute__((always_inline))
1400 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1401 {
1402 	uint16_t res_cur_idx = vq->last_used_idx;
1403 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1404 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1405 	rte_compiler_barrier();
1406 	*(volatile uint16_t *)&vq->used->idx += 1;
1407 	vq->last_used_idx += 1;
1408 
1409 	/* Kick the guest if necessary. */
1410 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1411 		eventfd_write((int)vq->kickfd, 1);
1412 }
1413 
1414 /*
1415  * This function gets an available descriptor from the virtio vring and an
1416  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1417  * to adjust the offset for buff_addr and phys_addr according to the PMD
1418  * implementation, otherwise the frame data may land in the wrong place in the mbuf.
1419  */
1420 static inline void __attribute__((always_inline))
1421 attach_rxmbuf_zcp(struct virtio_net *dev)
1422 {
1423 	uint16_t res_base_idx, desc_idx;
1424 	uint64_t buff_addr, phys_addr;
1425 	struct vhost_virtqueue *vq;
1426 	struct vring_desc *desc;
1427 	struct rte_mbuf *mbuf = NULL;
1428 	struct vpool *vpool;
1429 	hpa_type addr_type;
1430 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1431 
1432 	vpool = &vpool_array[vdev->vmdq_rx_q];
1433 	vq = dev->virtqueue[VIRTIO_RXQ];
1434 
1435 	do {
1436 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1437 				1) != 1))
1438 			return;
1439 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1440 
1441 		desc = &vq->desc[desc_idx];
1442 		if (desc->flags & VRING_DESC_F_NEXT) {
1443 			desc = &vq->desc[desc->next];
1444 			buff_addr = gpa_to_vva(dev, desc->addr);
1445 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1446 					&addr_type);
1447 		} else {
1448 			buff_addr = gpa_to_vva(dev,
1449 					desc->addr + vq->vhost_hlen);
1450 			phys_addr = gpa_to_hpa(vdev,
1451 					desc->addr + vq->vhost_hlen,
1452 					desc->len, &addr_type);
1453 		}
1454 
1455 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1456 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1457 				" address found when attaching RX frame buffer"
1458 				" address!\n", dev->device_fh);
1459 			put_desc_to_used_list_zcp(vq, desc_idx);
1460 			continue;
1461 		}
1462 
1463 		/*
1464 		 * Check if the frame buffer address from guest crosses
1465 		 * sub-region or not.
1466 		 */
1467 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1468 			RTE_LOG(ERR, VHOST_DATA,
1469 				"(%"PRIu64") Frame buffer address crossing a "
1470 				"sub-region found when attaching RX frame "
1471 				"buffer address!\n",
1472 				dev->device_fh);
1473 			put_desc_to_used_list_zcp(vq, desc_idx);
1474 			continue;
1475 		}
1476 	} while (unlikely(phys_addr == 0));
1477 
1478 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1479 	if (unlikely(mbuf == NULL)) {
1480 		LOG_DEBUG(VHOST_DATA,
1481 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1482 			"ring_sc_dequeue fail.\n",
1483 			dev->device_fh);
1484 		put_desc_to_used_list_zcp(vq, desc_idx);
1485 		return;
1486 	}
1487 
1488 	if (unlikely(vpool->buf_size > desc->len)) {
1489 		LOG_DEBUG(VHOST_DATA,
1490 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1491 			"length(%d) of descriptor idx: %d less than room "
1492 			"size required: %d\n",
1493 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1494 		put_desc_to_used_list_zcp(vq, desc_idx);
1495 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1496 		return;
1497 	}
1498 
1499 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1500 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1501 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1502 	mbuf->data_len = desc->len;
1503 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1504 
1505 	LOG_DEBUG(VHOST_DATA,
1506 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1507 		"descriptor idx:%d\n",
1508 		dev->device_fh, res_base_idx, desc_idx);
1509 
1510 	__rte_mbuf_raw_free(mbuf);
1511 
1512 	return;
1513 }
1514 
1515 /*
1516  * Detach an attached packet mbuf -
1517  *  - restore original mbuf address and length values.
1518  *  - reset pktmbuf data and data_len to their default values.
1519  *  All other fields of the given packet mbuf will be left intact.
1520  *
1521  * @param m
1522  *   The attached packet mbuf.
1523  */
1524 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1525 {
1526 	const struct rte_mempool *mp = m->pool;
1527 	void *buf = RTE_MBUF_TO_BADDR(m);
1528 	uint32_t buf_ofs;
1529 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1530 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1531 
1532 	m->buf_addr = buf;
1533 	m->buf_len = (uint16_t)buf_len;
1534 
1535 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1536 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1537 	m->data_off = buf_ofs;
1538 
1539 	m->data_len = 0;
1540 }
1541 
1542 /*
1543  * This function is called after packets have been transmitted. It fetches mbufs
1544  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1545  * the used index and kicks the guest if necessary.
1546  */
1547 static inline uint32_t __attribute__((always_inline))
1548 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1549 {
1550 	struct rte_mbuf *mbuf;
1551 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1552 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1553 	uint32_t index = 0;
1554 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1555 
1556 	LOG_DEBUG(VHOST_DATA,
1557 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1558 		"clean is: %d\n",
1559 		dev->device_fh, mbuf_count);
1560 	LOG_DEBUG(VHOST_DATA,
1561 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1562 		"clean  is : %d\n",
1563 		dev->device_fh, rte_ring_count(vpool->ring));
1564 
1565 	for (index = 0; index < mbuf_count; index++) {
1566 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1567 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1568 			pktmbuf_detach_zcp(mbuf);
1569 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1570 
1571 		/* Update used index buffer information. */
1572 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1573 		vq->used->ring[used_idx].len = 0;
1574 
1575 		used_idx = (used_idx + 1) & (vq->size - 1);
1576 	}
1577 
1578 	LOG_DEBUG(VHOST_DATA,
1579 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1580 		"clean is: %d\n",
1581 		dev->device_fh, rte_mempool_count(vpool->pool));
1582 	LOG_DEBUG(VHOST_DATA,
1583 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1584 		"clean  is : %d\n",
1585 		dev->device_fh, rte_ring_count(vpool->ring));
1586 	LOG_DEBUG(VHOST_DATA,
1587 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1588 		"vq->last_used_idx:%d\n",
1589 		dev->device_fh, vq->last_used_idx);
1590 
1591 	vq->last_used_idx += mbuf_count;
1592 
1593 	LOG_DEBUG(VHOST_DATA,
1594 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1595 		"vq->last_used_idx:%d\n",
1596 		dev->device_fh, vq->last_used_idx);
1597 
1598 	rte_compiler_barrier();
1599 
1600 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1601 
1602 	/* Kick guest if required. */
1603 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1604 		eventfd_write((int)vq->kickfd, 1);
1605 
1606 	return 0;
1607 }
1608 
1609 /*
1610  * This function is called when a virtio device is destroyed.
1611  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1612  */
1613 static void mbuf_destroy_zcp(struct vpool *vpool)
1614 {
1615 	struct rte_mbuf *mbuf = NULL;
1616 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1617 
1618 	LOG_DEBUG(VHOST_CONFIG,
1619 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1620 		"mbuf_destroy_zcp is: %d\n",
1621 		mbuf_count);
1622 	LOG_DEBUG(VHOST_CONFIG,
1623 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1624 		"mbuf_destroy_zcp  is : %d\n",
1625 		rte_ring_count(vpool->ring));
1626 
1627 	for (index = 0; index < mbuf_count; index++) {
1628 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1629 		if (likely(mbuf != NULL)) {
1630 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1631 				pktmbuf_detach_zcp(mbuf);
1632 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1633 		}
1634 	}
1635 
1636 	LOG_DEBUG(VHOST_CONFIG,
1637 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1638 		"mbuf_destroy_zcp is: %d\n",
1639 		rte_mempool_count(vpool->pool));
1640 	LOG_DEBUG(VHOST_CONFIG,
1641 		"in mbuf_destroy_zcp: mbuf count in ring after "
1642 		"mbuf_destroy_zcp is : %d\n",
1643 		rte_ring_count(vpool->ring));
1644 }
1645 
1646 /*
1647  * This function updates the used ring and the used index for zero copy RX, and kicks the guest if necessary.
1648  */
1649 static inline uint32_t __attribute__((always_inline))
1650 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1651 	uint32_t count)
1652 {
1653 	struct vhost_virtqueue *vq;
1654 	struct vring_desc *desc;
1655 	struct rte_mbuf *buff;
1656 	/* The virtio_hdr is initialised to 0. */
1657 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1658 		= {{0, 0, 0, 0, 0, 0}, 0};
1659 	uint64_t buff_hdr_addr = 0;
1660 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1661 	uint32_t head_idx, packet_success = 0;
1662 	uint16_t res_cur_idx;
1663 
1664 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1665 
1666 	if (count == 0)
1667 		return 0;
1668 
1669 	vq = dev->virtqueue[VIRTIO_RXQ];
1670 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1671 
1672 	res_cur_idx = vq->last_used_idx;
1673 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1674 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1675 
1676 	/* Retrieve all of the head indexes first to avoid caching issues. */
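	/* In zero-copy mode the guest descriptor index was stashed in each mbuf's headroom by attach_rxmbuf_zcp(); it identifies the descriptor the mbuf is attached to. */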
1677 	for (head_idx = 0; head_idx < count; head_idx++)
1678 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1679 
1680 	/* Prefetch descriptor index. */
1681 	rte_prefetch0(&vq->desc[head[packet_success]]);
1682 
1683 	while (packet_success != count) {
1684 		/* Get descriptor from available ring */
1685 		desc = &vq->desc[head[packet_success]];
1686 
1687 		buff = pkts[packet_success];
1688 		LOG_DEBUG(VHOST_DATA,
1689 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1690 			"pkt[%d] descriptor idx: %d\n",
1691 			dev->device_fh, packet_success,
1692 			MBUF_HEADROOM_UINT32(buff));
1693 
1694 		PRINT_PACKET(dev,
1695 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1696 			+ RTE_PKTMBUF_HEADROOM),
1697 			rte_pktmbuf_data_len(buff), 0);
1698 
1699 		/* Buffer address translation for virtio header. */
1700 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1701 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1702 
1703 		/*
1704 		 * If the descriptors are chained the header and data are
1705 		 * placed in separate buffers.
1706 		 */
1707 		if (desc->flags & VRING_DESC_F_NEXT) {
1708 			desc->len = vq->vhost_hlen;
1709 			desc = &vq->desc[desc->next];
1710 			desc->len = rte_pktmbuf_data_len(buff);
1711 		} else {
1712 			desc->len = packet_len;
1713 		}
1714 
1715 		/* Update used ring with desc information */
1716 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1717 			= head[packet_success];
1718 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1719 			= packet_len;
1720 		res_cur_idx++;
1721 		packet_success++;
1722 
1723 		/* A header is required per buffer. */
1724 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1725 			(const void *)&virtio_hdr, vq->vhost_hlen);
1726 
1727 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1728 
1729 		if (likely(packet_success < count)) {
1730 			/* Prefetch descriptor index. */
1731 			rte_prefetch0(&vq->desc[head[packet_success]]);
1732 		}
1733 	}
1734 
1735 	rte_compiler_barrier();
1736 
1737 	LOG_DEBUG(VHOST_DATA,
1738 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1739 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1740 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1741 
1742 	*(volatile uint16_t *)&vq->used->idx += count;
1743 	vq->last_used_idx += count;
1744 
1745 	LOG_DEBUG(VHOST_DATA,
1746 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1747 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1748 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1749 
1750 	/* Kick the guest if necessary. */
1751 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1752 		eventfd_write((int)vq->kickfd, 1);
1753 
1754 	return count;
1755 }
1756 
1757 /*
1758  * This function routes the TX packet to the correct interface.
1759  * This may be a local device or the physical port.
1760  */
1761 static inline void __attribute__((always_inline))
1762 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1763 	uint32_t desc_idx, uint8_t need_copy)
1764 {
1765 	struct mbuf_table *tx_q;
1766 	struct rte_mbuf **m_table;
1767 	struct rte_mbuf *mbuf = NULL;
1768 	unsigned len, ret, offset = 0;
1769 	struct vpool *vpool;
1770 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1771 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1772 
1773 	/* Add packet to the port TX queue. */
1774 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1775 	len = tx_q->len;
1776 
1777 	/* Allocate an mbuf and populate the structure. */
1778 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1779 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1780 	if (unlikely(mbuf == NULL)) {
1781 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1782 		RTE_LOG(ERR, VHOST_DATA,
1783 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1784 			dev->device_fh);
1785 		put_desc_to_used_list_zcp(vq, desc_idx);
1786 		return;
1787 	}
1788 
1789 	if (vm2vm_mode == VM2VM_HARDWARE) {
1790 		/* Avoid using a VLAN tag assigned to any VM (such as
1791 		 * vlan_tags[dev->device_fh]) for an external packet; otherwise it
1792 		 * conflicts with pool selection: the MAC address marks it as an
1793 		 * external packet that should go to the network, while the VLAN tag
1794 		 * marks it as a VM-to-VM packet to be forwarded to another VM. The
1795 		 * hardware cannot resolve this ambiguity, so the packet is lost.
1796 		 */
1797 		vlan_tag = external_pkt_default_vlan_tag;
1798 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1799 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1800 			__rte_mbuf_raw_free(mbuf);
1801 			return;
1802 		}
1803 	}
1804 
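	/* Mirror the dummy mbuf: when need_copy is clear the new mbuf simply aliases the guest buffer (true zero copy); when set, the payload is copied because the guest buffer crosses a host-physical sub-region. */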
1805 	mbuf->nb_segs = m->nb_segs;
1806 	mbuf->next = m->next;
1807 	mbuf->data_len = m->data_len + offset;
1808 	mbuf->pkt_len = mbuf->data_len;
1809 	if (unlikely(need_copy)) {
1810 		/* Copy the packet contents to the mbuf. */
1811 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1812 			rte_pktmbuf_mtod(m, void *),
1813 			m->data_len);
1814 	} else {
1815 		mbuf->data_off = m->data_off;
1816 		mbuf->buf_physaddr = m->buf_physaddr;
1817 		mbuf->buf_addr = m->buf_addr;
1818 	}
1819 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1820 	mbuf->vlan_tci = vlan_tag;
1821 	mbuf->l2_len = sizeof(struct ether_hdr);
1822 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1823 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1824 
1825 	tx_q->m_table[len] = mbuf;
1826 	len++;
1827 
1828 	LOG_DEBUG(VHOST_DATA,
1829 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1830 		dev->device_fh,
1831 		mbuf->nb_segs,
1832 		(mbuf->next == NULL) ? "null" : "non-null");
1833 
1834 	if (enable_stats) {
1835 		dev_statistics[dev->device_fh].tx_total++;
1836 		dev_statistics[dev->device_fh].tx++;
1837 	}
1838 
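	/* Once a full burst is queued, hand it to the NIC; any packets the NIC did not accept are freed below. */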
1839 	if (unlikely(len == MAX_PKT_BURST)) {
1840 		m_table = (struct rte_mbuf **)tx_q->m_table;
1841 		ret = rte_eth_tx_burst(ports[0],
1842 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1843 
1844 		/*
1845 		 * Free any buffers not handled by TX and update
1846 		 * the port stats.
1847 		 */
1848 		if (unlikely(ret < len)) {
1849 			do {
1850 				rte_pktmbuf_free(m_table[ret]);
1851 			} while (++ret < len);
1852 		}
1853 
1854 		len = 0;
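		/* Reclaim completed TX mbufs back into the vpool and return their descriptors to the guest's used ring. */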
1855 		txmbuf_clean_zcp(dev, vpool);
1856 	}
1857 
1858 	tx_q->len = len;
1859 
1860 	return;
1861 }
1862 
1863 /*
1864  * This function transmits all available packets in the virtio TX queue of
1865  * one virtio-net device. On the first packet it learns the MAC address and
1866  * sets up VMDQ.
1867  */
1868 static inline void __attribute__((always_inline))
1869 virtio_dev_tx_zcp(struct virtio_net *dev)
1870 {
1871 	struct rte_mbuf m;
1872 	struct vhost_virtqueue *vq;
1873 	struct vring_desc *desc;
1874 	uint64_t buff_addr = 0, phys_addr;
1875 	uint32_t head[MAX_PKT_BURST];
1876 	uint32_t i;
1877 	uint16_t free_entries, packet_success = 0;
1878 	uint16_t avail_idx;
1879 	uint8_t need_copy = 0;
1880 	hpa_type addr_type;
1881 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1882 
1883 	vq = dev->virtqueue[VIRTIO_TXQ];
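	/* Volatile read of the guest's producer index so the compiler does not cache a stale value of the shared ring field. */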
1884 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1885 
1886 	/* If there are no available buffers then return. */
1887 	if (vq->last_used_idx_res == avail_idx)
1888 		return;
1889 
1890 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1891 
1892 	/* Prefetch available ring to retrieve head indexes. */
1893 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1894 
1895 	/* Get the number of free entries in the ring */
1896 	free_entries = (avail_idx - vq->last_used_idx_res);
1897 
1898 	/* Limit to MAX_PKT_BURST. */
1899 	free_entries
1900 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1901 
1902 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1903 		dev->device_fh, free_entries);
1904 
1905 	/* Retrieve all of the head indexes first to avoid caching issues. */
1906 	for (i = 0; i < free_entries; i++)
1907 		head[i]
1908 			= vq->avail->ring[(vq->last_used_idx_res + i)
1909 			& (vq->size - 1)];
1910 
1911 	vq->last_used_idx_res += free_entries;
1912 
1913 	/* Prefetch descriptor index. */
1914 	rte_prefetch0(&vq->desc[head[packet_success]]);
1915 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1916 
1917 	while (packet_success < free_entries) {
1918 		desc = &vq->desc[head[packet_success]];
1919 
1920 		/* Discard first buffer as it is the virtio header */
1921 		desc = &vq->desc[desc->next];
1922 
1923 		/* Buffer address translation. */
1924 		buff_addr = gpa_to_vva(dev, desc->addr);
1925 		/* Check an extra VLAN_HLEN bytes to leave room for VLAN tag insertion. */
1926 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1927 			&addr_type);
1928 
1929 		if (likely(packet_success < (free_entries - 1)))
1930 			/* Prefetch descriptor index. */
1931 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1932 
1933 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1934 			RTE_LOG(ERR, VHOST_DATA,
1935 				"(%"PRIu64") Invalid frame buffer address found "
1936 				"when transmitting packets!\n",
1937 				dev->device_fh);
1938 			packet_success++;
1939 			continue;
1940 		}
1941 
1942 		/* Prefetch buffer address. */
1943 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1944 
1945 		/*
1946 		 * Set up a dummy mbuf. It is copied to a real mbuf if the packet
1947 		 * is transmitted out of the physical port.
1948 		 */
1949 		m.data_len = desc->len;
1950 		m.nb_segs = 1;
1951 		m.next = NULL;
1952 		m.data_off = 0;
1953 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1954 		m.buf_physaddr = phys_addr;
1955 
1956 		/*
1957 		 * Check if the frame buffer address from guest crosses
1958 		 * sub-region or not.
1959 		 */
1960 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1961 			RTE_LOG(ERR, VHOST_DATA,
1962 				"(%"PRIu64") Frame buffer address crossing "
1963 				"sub-region found when attaching TX frame "
1964 				"buffer address!\n",
1965 				dev->device_fh);
1966 			need_copy = 1;
1967 		} else
1968 			need_copy = 0;
1969 
1970 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1971 
1972 		/*
1973 		 * If this is the first received packet we need to learn
1974 		 * the MAC and setup VMDQ
1975 		 */
1976 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1977 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1978 				/*
1979 				 * Discard frame if device is scheduled for
1980 				 * removal or a duplicate MAC address is found.
1981 				 */
1982 				packet_success += free_entries;
1983 				vq->last_used_idx += packet_success;
1984 				break;
1985 			}
1986 		}
1987 
1988 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1989 		packet_success++;
1990 	}
1991 }
1992 
1993 /*
1994  * This function is called by each data core. It handles all RX/TX registered
1995  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1996  * addresses are compared with all devices in the main linked list.
1997  */
1998 static int
1999 switch_worker_zcp(__attribute__((unused)) void *arg)
2000 {
2001 	struct virtio_net *dev = NULL;
2002 	struct vhost_dev  *vdev = NULL;
2003 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2004 	struct virtio_net_data_ll *dev_ll;
2005 	struct mbuf_table *tx_q;
2006 	volatile struct lcore_ll_info *lcore_ll;
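	/* TX drain period in TSC cycles: ceil(tsc_hz / US_PER_S) cycles per microsecond, multiplied by the drain interval BURST_TX_DRAIN_US. */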
2007 	const uint64_t drain_tsc
2008 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2009 		* BURST_TX_DRAIN_US;
2010 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2011 	unsigned ret;
2012 	const uint16_t lcore_id = rte_lcore_id();
2013 	uint16_t count_in_ring, rx_count = 0;
2014 
2015 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2016 
2017 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2018 	prev_tsc = 0;
2019 
2020 	while (1) {
2021 		cur_tsc = rte_rdtsc();
2022 
2023 		/* TX burst queue drain */
2024 		diff_tsc = cur_tsc - prev_tsc;
2025 		if (unlikely(diff_tsc > drain_tsc)) {
2026 			/*
2027 			 * Get mbufs from vpool.pool, detach them and put
2028 			 * them back into vpool.ring.
2029 			 */
2030 			dev_ll = lcore_ll->ll_root_used;
2031 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2032 				/* Get virtio device ID */
2033 				vdev = dev_ll->vdev;
2034 				dev = vdev->dev;
2035 
2036 				if (likely(!vdev->remove)) {
2037 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2038 					if (tx_q->len) {
2039 						LOG_DEBUG(VHOST_DATA,
2040 						"TX queue drained after timeout"
2041 						" with burst size %u\n",
2042 						tx_q->len);
2043 
2044 						/*
2045 						 * Tx any packets in the queue
2046 						 */
2047 						ret = rte_eth_tx_burst(
2048 							ports[0],
2049 							(uint16_t)tx_q->txq_id,
2050 							(struct rte_mbuf **)
2051 							tx_q->m_table,
2052 							(uint16_t)tx_q->len);
2053 						if (unlikely(ret < tx_q->len)) {
2054 							do {
2055 								rte_pktmbuf_free(
2056 									tx_q->m_table[ret]);
2057 							} while (++ret < tx_q->len);
2058 						}
2059 						tx_q->len = 0;
2060 
2061 						txmbuf_clean_zcp(dev,
2062 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2063 					}
2064 				}
2065 				dev_ll = dev_ll->next;
2066 			}
2067 			prev_tsc = cur_tsc;
2068 		}
2069 
2070 		rte_prefetch0(lcore_ll->ll_root_used);
2071 
2072 		/*
2073 		 * Inform the configuration core that we have exited the linked
2074 		 * list and that no devices are in use if requested.
2075 		 */
2076 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2077 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2078 
2079 		/* Process devices */
2080 		dev_ll = lcore_ll->ll_root_used;
2081 
2082 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2083 			vdev = dev_ll->vdev;
2084 			dev  = vdev->dev;
2085 			if (unlikely(vdev->remove)) {
2086 				dev_ll = dev_ll->next;
2087 				unlink_vmdq(vdev);
2088 				vdev->ready = DEVICE_SAFE_REMOVE;
2089 				continue;
2090 			}
2091 
2092 			if (likely(vdev->ready == DEVICE_RX)) {
2093 				uint32_t index = vdev->vmdq_rx_q;
2094 				uint16_t i;
2095 				count_in_ring
2096 				= rte_ring_count(vpool_array[index].ring);
2097 				uint16_t free_entries
2098 				= (uint16_t)get_available_ring_num_zcp(dev);
2099 
2100 				/*
2101 				 * Attach all mbufs in vpool.ring and put back
2102 				 * into vpool.pool.
2103 				 */
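				/* Bounded by the free guest RX descriptors, the mbufs currently in the ring, and MAX_PKT_BURST. */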
2104 				for (i = 0;
2105 				i < RTE_MIN(free_entries,
2106 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2107 				i++)
2108 					attach_rxmbuf_zcp(dev);
2109 
2110 				/* Handle guest RX */
2111 				rx_count = rte_eth_rx_burst(ports[0],
2112 					vdev->vmdq_rx_q, pkts_burst,
2113 					MAX_PKT_BURST);
2114 
2115 				if (rx_count) {
2116 					ret_count = virtio_dev_rx_zcp(dev,
2117 							pkts_burst, rx_count);
2118 					if (enable_stats) {
2119 						dev_statistics[dev->device_fh].rx_total
2120 							+= rx_count;
2121 						dev_statistics[dev->device_fh].rx
2122 							+= ret_count;
2123 					}
2124 					while (likely(rx_count)) {
2125 						rx_count--;
2126 						pktmbuf_detach_zcp(
2127 							pkts_burst[rx_count]);
2128 						rte_ring_sp_enqueue(
2129 							vpool_array[index].ring,
2130 							(void *)pkts_burst[rx_count]);
2131 					}
2132 				}
2133 			}
2134 
2135 			if (likely(!vdev->remove))
2136 				/* Handle guest TX */
2137 				virtio_dev_tx_zcp(dev);
2138 
2139 			/* Move to the next device in the list */
2140 			dev_ll = dev_ll->next;
2141 		}
2142 	}
2143 
2144 	return 0;
2145 }
2146 
2147 
2148 /*
2149  * Add an entry to a used linked list. A free entry must first be found
2150  * in the free linked list using get_data_ll_free_entry();
2151  */
2152 static void
2153 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2154 	struct virtio_net_data_ll *ll_dev)
2155 {
2156 	struct virtio_net_data_ll *ll = *ll_root_addr;
2157 
2158 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2159 	ll_dev->next = NULL;
2160 	rte_compiler_barrier();
2161 
2162 	/* If ll == NULL then this is the first device. */
2163 	if (ll) {
2164 		/* Increment to the tail of the linked list. */
2165 		while ((ll->next != NULL) )
2166 			ll = ll->next;
2167 
2168 		ll->next = ll_dev;
2169 	} else {
2170 		*ll_root_addr = ll_dev;
2171 	}
2172 }
2173 
2174 /*
2175  * Remove an entry from a used linked list. The entry must then be added to
2176  * the free linked list using put_data_ll_free_entry().
2177  */
2178 static void
2179 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2180 	struct virtio_net_data_ll *ll_dev,
2181 	struct virtio_net_data_ll *ll_dev_last)
2182 {
2183 	struct virtio_net_data_ll *ll = *ll_root_addr;
2184 
2185 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2186 		return;
2187 
2188 	if (ll_dev == ll)
2189 		*ll_root_addr = ll_dev->next;
2190 	else
2191 		if (likely(ll_dev_last != NULL))
2192 			ll_dev_last->next = ll_dev->next;
2193 		else
2194 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2195 }
2196 
2197 /*
2198  * Find and return an entry from the free linked list.
2199  */
2200 static struct virtio_net_data_ll *
2201 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2202 {
2203 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2204 	struct virtio_net_data_ll *ll_dev;
2205 
2206 	if (ll_free == NULL)
2207 		return NULL;
2208 
2209 	ll_dev = ll_free;
2210 	*ll_root_addr = ll_free->next;
2211 
2212 	return ll_dev;
2213 }
2214 
2215 /*
2216  * Place an entry back on to the free linked list.
2217  */
2218 static void
2219 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2220 	struct virtio_net_data_ll *ll_dev)
2221 {
2222 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2223 
2224 	if (ll_dev == NULL)
2225 		return;
2226 
2227 	ll_dev->next = ll_free;
2228 	*ll_root_addr = ll_dev;
2229 }
2230 
2231 /*
2232  * Creates a linked list of a given size.
2233  */
2234 static struct virtio_net_data_ll *
2235 alloc_data_ll(uint32_t size)
2236 {
2237 	struct virtio_net_data_ll *ll_new;
2238 	uint32_t i;
2239 
2240 	/* Malloc and then chain the linked list. */
2241 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2242 	if (ll_new == NULL) {
2243 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2244 		return NULL;
2245 	}
2246 
2247 	for (i = 0; i < size - 1; i++) {
2248 		ll_new[i].vdev = NULL;
2249 		ll_new[i].next = &ll_new[i+1];
2250 	}
2251 	ll_new[i].next = NULL;
2252 
2253 	return (ll_new);
2254 }
2255 
2256 /*
2257  * Create the main linked list along with each individual core's linked list. A used and a free list
2258  * are created to manage entries.
2259  */
2260 static int
2261 init_data_ll (void)
2262 {
2263 	int lcore;
2264 
2265 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2266 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2267 		if (lcore_info[lcore].lcore_ll == NULL) {
2268 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2269 			return -1;
2270 		}
2271 
2272 		lcore_info[lcore].lcore_ll->device_num = 0;
2273 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2274 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2275 		if (num_devices % num_switching_cores)
2276 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2277 		else
2278 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2279 	}
2280 
2281 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2282 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2283 
2284 	return 0;
2285 }
2286 
2287 /*
2288  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2289  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2290  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2291  */
2292 static void
2293 destroy_device (volatile struct virtio_net *dev)
2294 {
2295 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2296 	struct virtio_net_data_ll *ll_main_dev_cur;
2297 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2298 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2299 	struct vhost_dev *vdev;
2300 	int lcore;
2301 
2302 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2303 
2304 	vdev = (struct vhost_dev *)dev->priv;
2305 	/* Set the remove flag. */
2306 	vdev->remove = 1;
2307 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2308 		rte_pause();
2309 	}
2310 
2311 	/* Search for entry to be removed from lcore ll */
2312 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2313 	while (ll_lcore_dev_cur != NULL) {
2314 		if (ll_lcore_dev_cur->vdev == vdev) {
2315 			break;
2316 		} else {
2317 			ll_lcore_dev_last = ll_lcore_dev_cur;
2318 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2319 		}
2320 	}
2321 
2322 	if (ll_lcore_dev_cur == NULL) {
2323 		RTE_LOG(ERR, VHOST_CONFIG,
2324 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2325 			dev->device_fh);
2326 		return;
2327 	}
2328 
2329 	/* Search for entry to be removed from main ll */
2330 	ll_main_dev_cur = ll_root_used;
2331 	ll_main_dev_last = NULL;
2332 	while (ll_main_dev_cur != NULL) {
2333 		if (ll_main_dev_cur->vdev == vdev) {
2334 			break;
2335 		} else {
2336 			ll_main_dev_last = ll_main_dev_cur;
2337 			ll_main_dev_cur = ll_main_dev_cur->next;
2338 		}
2339 	}
2340 
2341 	/* Remove entries from the lcore and main ll. */
2342 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2343 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2344 
2345 	/* Set the dev_removal_flag on each lcore. */
2346 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2347 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2348 	}
2349 
2350 	/*
2351 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2352 	 * they can no longer access the device removed from the linked lists and that the devices
2353 	 * are no longer in use.
2354 	 */
2355 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2356 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2357 			rte_pause();
2358 		}
2359 	}
2360 
2361 	/* Add the entries back to the lcore and main free ll.*/
2362 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2363 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2364 
2365 	/* Decrement number of device on the lcore. */
2366 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2367 
2368 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2369 
2370 	if (zero_copy) {
2371 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2372 
2373 		/* Stop the RX queue. */
2374 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2375 			LOG_DEBUG(VHOST_CONFIG,
2376 				"(%"PRIu64") In destroy_device: Failed to stop "
2377 				"rx queue:%d\n",
2378 				dev->device_fh,
2379 				vdev->vmdq_rx_q);
2380 		}
2381 
2382 		LOG_DEBUG(VHOST_CONFIG,
2383 			"(%"PRIu64") in destroy_device: Start putting mbufs from "
2384 			"the mempool back into the ring for RX queue: %d\n",
2385 			dev->device_fh, vdev->vmdq_rx_q);
2386 
2387 		mbuf_destroy_zcp(vpool);
2388 
2389 		/* Stop the TX queue. */
2390 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2391 			LOG_DEBUG(VHOST_CONFIG,
2392 				"(%"PRIu64") In destroy_device: Failed to "
2393 				"stop tx queue:%d\n",
2394 				dev->device_fh, vdev->vmdq_rx_q);
2395 		}
2396 
2397 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2398 
2399 		LOG_DEBUG(VHOST_CONFIG,
2400 			"(%"PRIu64") destroy_device: Start putting mbufs from the "
2401 			"mempool back into the ring for TX queue: %d, dev:(%"PRIu64")\n",
2402 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2403 			dev->device_fh);
2404 
2405 		mbuf_destroy_zcp(vpool);
2406 		rte_free(vdev->regions_hpa);
2407 	}
2408 	rte_free(vdev);
2409 
2410 }
2411 
2412 /*
2413  * Count the additional physically contiguous sub-regions (breaks in host
2414  * physical contiguity) within one region whose vhost virtual address range
2415  * is contiguous. The region starts at vva_start and is 'size' bytes long.
2416  */
2417 static uint32_t
2418 check_hpa_regions(uint64_t vva_start, uint64_t size)
2419 {
2420 	uint32_t i, nregions = 0, page_size = getpagesize();
2421 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2422 	if (vva_start % page_size) {
2423 		LOG_DEBUG(VHOST_CONFIG,
2424 			"in check_continuous: vva start(%p) mod page_size(%d) "
2425 			"has remainder\n",
2426 			(void *)(uintptr_t)vva_start, page_size);
2427 		return 0;
2428 	}
2429 	if (size % page_size) {
2430 		LOG_DEBUG(VHOST_CONFIG,
2431 			"in check_continuous: "
2432 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2433 			size, page_size);
2434 		return 0;
2435 	}
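	/* Walk the region one page at a time; each break in host-physical contiguity between adjacent pages counts as one extra sub-region. */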
2436 	for (i = 0; i < size - page_size; i = i + page_size) {
2437 		cur_phys_addr
2438 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2439 		next_phys_addr = rte_mem_virt2phy(
2440 			(void *)(uintptr_t)(vva_start + i + page_size));
2441 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2442 			++nregions;
2443 			LOG_DEBUG(VHOST_CONFIG,
2444 				"in check_continuous: hva addr:(%p) is not "
2445 				"continuous with hva addr:(%p), diff:%d\n",
2446 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2447 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2448 				+ page_size), page_size);
2449 			LOG_DEBUG(VHOST_CONFIG,
2450 				"in check_continuous: hpa addr:(%p) is not "
2451 				"continuous with hpa addr:(%p), "
2452 				"diff:(%"PRIu64")\n",
2453 				(void *)(uintptr_t)cur_phys_addr,
2454 				(void *)(uintptr_t)next_phys_addr,
2455 				(next_phys_addr-cur_phys_addr));
2456 		}
2457 	}
2458 	return nregions;
2459 }
2460 
2461 /*
2462  * Divide each region whose vhost virtual address range is contiguous into
2463  * sub-regions in which the host physical addresses are also contiguous, and
2464  * fill the offset (relative to the GPA), size and other information of each
2465  * sub-region into regions_hpa.
2466  */
2467 static uint32_t
2468 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2469 {
2470 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2471 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2472 
2473 	if (mem_region_hpa == NULL)
2474 		return 0;
2475 
2476 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2477 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2478 			virtio_memory->regions[regionidx].address_offset;
2479 		mem_region_hpa[regionidx_hpa].guest_phys_address
2480 			= virtio_memory->regions[regionidx].guest_phys_address;
2481 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2482 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2483 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2484 		LOG_DEBUG(VHOST_CONFIG,
2485 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2486 			regionidx_hpa,
2487 			(void *)(uintptr_t)
2488 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2489 		LOG_DEBUG(VHOST_CONFIG,
2490 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2491 			regionidx_hpa,
2492 			(void *)(uintptr_t)
2493 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
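		/* 'k' accumulates the size of the current physically contiguous sub-region; when contiguity breaks the sub-region is closed and a new one begins at the next page. */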
2494 		for (i = 0, k = 0;
2495 			i < virtio_memory->regions[regionidx].memory_size -
2496 				page_size;
2497 			i += page_size) {
2498 			cur_phys_addr = rte_mem_virt2phy(
2499 					(void *)(uintptr_t)(vva_start + i));
2500 			next_phys_addr = rte_mem_virt2phy(
2501 					(void *)(uintptr_t)(vva_start +
2502 					i + page_size));
2503 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2504 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2505 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2506 					k + page_size;
2507 				mem_region_hpa[regionidx_hpa].memory_size
2508 					= k + page_size;
2509 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2510 					"phys addr end  [%d]:(%p)\n",
2511 					regionidx_hpa,
2512 					(void *)(uintptr_t)
2513 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2514 				LOG_DEBUG(VHOST_CONFIG,
2515 					"in fill_hpa_regions: guest phys addr "
2516 					"size [%d]:(%p)\n",
2517 					regionidx_hpa,
2518 					(void *)(uintptr_t)
2519 					(mem_region_hpa[regionidx_hpa].memory_size));
2520 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2521 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2522 				++regionidx_hpa;
2523 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2524 					next_phys_addr -
2525 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2526 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2527 					" phys addr start[%d]:(%p)\n",
2528 					regionidx_hpa,
2529 					(void *)(uintptr_t)
2530 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2531 				LOG_DEBUG(VHOST_CONFIG,
2532 					"in fill_hpa_regions: host  phys addr "
2533 					"start[%d]:(%p)\n",
2534 					regionidx_hpa,
2535 					(void *)(uintptr_t)
2536 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2537 				k = 0;
2538 			} else {
2539 				k += page_size;
2540 			}
2541 		}
2542 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2543 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2544 			+ k + page_size;
2545 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2546 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2547 			"[%d]:(%p)\n", regionidx_hpa,
2548 			(void *)(uintptr_t)
2549 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2550 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2551 			"[%d]:(%p)\n", regionidx_hpa,
2552 			(void *)(uintptr_t)
2553 			(mem_region_hpa[regionidx_hpa].memory_size));
2554 		++regionidx_hpa;
2555 	}
2556 	return regionidx_hpa;
2557 }
2558 
2559 /*
2560  * A new device is added to a data core. First the device is added to the main linked list
2561  * and then allocated to a specific data core.
2562  */
2563 static int
2564 new_device (struct virtio_net *dev)
2565 {
2566 	struct virtio_net_data_ll *ll_dev;
2567 	int lcore, core_add = 0;
2568 	uint32_t device_num_min = num_devices;
2569 	struct vhost_dev *vdev;
2570 	uint32_t regionidx;
2571 
2572 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2573 	if (vdev == NULL) {
2574 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2575 			dev->device_fh);
2576 		return -1;
2577 	}
2578 	vdev->dev = dev;
2579 	dev->priv = vdev;
2580 
2581 	if (zero_copy) {
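		/* Zero copy needs host-physically contiguous sub-regions: start from the guest region count and add one for every physical discontinuity found. */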
2582 		vdev->nregions_hpa = dev->mem->nregions;
2583 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2584 			vdev->nregions_hpa
2585 				+= check_hpa_regions(
2586 					dev->mem->regions[regionidx].guest_phys_address
2587 					+ dev->mem->regions[regionidx].address_offset,
2588 					dev->mem->regions[regionidx].memory_size);
2589 
2590 		}
2591 
2592 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2593 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2594 			RTE_CACHE_LINE_SIZE);
2595 		if (vdev->regions_hpa == NULL) {
2596 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2597 			rte_free(vdev);
2598 			return -1;
2599 		}
2600 
2601 
2602 		if (fill_hpa_memory_regions(
2603 			vdev->regions_hpa, dev->mem
2604 			) != vdev->nregions_hpa) {
2605 
2606 			RTE_LOG(ERR, VHOST_CONFIG,
2607 				"hpa memory regions number mismatch: "
2608 				"[%d]\n", vdev->nregions_hpa);
2609 			rte_free(vdev->regions_hpa);
2610 			rte_free(vdev);
2611 			return -1;
2612 		}
2613 	}
2614 
2615 
2616 	/* Add device to main ll */
2617 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2618 	if (ll_dev == NULL) {
2619 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2620 			"of %d devices per core has been reached\n",
2621 			dev->device_fh, num_devices);
2622 		if (vdev->regions_hpa)
2623 			rte_free(vdev->regions_hpa);
2624 		rte_free(vdev);
2625 		return -1;
2626 	}
2627 	ll_dev->vdev = vdev;
2628 	add_data_ll_entry(&ll_root_used, ll_dev);
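	/* Give the device its own VMDq RX queue, derived from its device ID. */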
2629 	vdev->vmdq_rx_q
2630 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2631 
2632 	if (zero_copy) {
2633 		uint32_t index = vdev->vmdq_rx_q;
2634 		uint32_t count_in_ring, i;
2635 		struct mbuf_table *tx_q;
2636 
2637 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2638 
2639 		LOG_DEBUG(VHOST_CONFIG,
2640 			"(%"PRIu64") in new_device: mbuf count in mempool "
2641 			"before attach is: %d\n",
2642 			dev->device_fh,
2643 			rte_mempool_count(vpool_array[index].pool));
2644 		LOG_DEBUG(VHOST_CONFIG,
2645 			"(%"PRIu64") in new_device: mbuf count in ring "
2646 			"before attach is: %d\n",
2647 			dev->device_fh, count_in_ring);
2648 
2649 		/*
2650 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2651 		 */
2652 		for (i = 0; i < count_in_ring; i++)
2653 			attach_rxmbuf_zcp(dev);
2654 
2655 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2656 			"mempool after attach is: %d\n",
2657 			dev->device_fh,
2658 			rte_mempool_count(vpool_array[index].pool));
2659 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2660 			"ring after attach is: %d\n",
2661 			dev->device_fh,
2662 			rte_ring_count(vpool_array[index].ring));
2663 
2664 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2665 		tx_q->txq_id = vdev->vmdq_rx_q;
2666 
2667 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2668 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2669 
2670 			LOG_DEBUG(VHOST_CONFIG,
2671 				"(%"PRIu64") In new_device: Failed to start "
2672 				"tx queue:%d\n",
2673 				dev->device_fh, vdev->vmdq_rx_q);
2674 
2675 			mbuf_destroy_zcp(vpool);
2676 			rte_free(vdev->regions_hpa);
2677 			rte_free(vdev);
2678 			return -1;
2679 		}
2680 
2681 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2682 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2683 
2684 			LOG_DEBUG(VHOST_CONFIG,
2685 				"(%"PRIu64") In new_device: Failed to start "
2686 				"rx queue:%d\n",
2687 				dev->device_fh, vdev->vmdq_rx_q);
2688 
2689 			/* Stop the TX queue. */
2690 			if (rte_eth_dev_tx_queue_stop(ports[0],
2691 				vdev->vmdq_rx_q) != 0) {
2692 				LOG_DEBUG(VHOST_CONFIG,
2693 					"(%"PRIu64") In new_device: Failed to "
2694 					"stop tx queue:%d\n",
2695 					dev->device_fh, vdev->vmdq_rx_q);
2696 			}
2697 
2698 			mbuf_destroy_zcp(vpool);
2699 			rte_free(vdev->regions_hpa);
2700 			rte_free(vdev);
2701 			return -1;
2702 		}
2703 
2704 	}
2705 
2706 	/*reset ready flag*/
2707 	/* Reset the ready flag. */
2708 	vdev->remove = 0;
2709 
2710 	/* Find a suitable lcore to add the device. */
2711 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2712 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2713 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2714 			core_add = lcore;
2715 		}
2716 	}
2717 	/* Add device to lcore ll */
2718 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2719 	if (ll_dev == NULL) {
2720 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2721 		vdev->ready = DEVICE_SAFE_REMOVE;
2722 		destroy_device(dev);
2723 		if (vdev->regions_hpa)
2724 			rte_free(vdev->regions_hpa);
2725 		rte_free(vdev);
2726 		return -1;
2727 	}
2728 	ll_dev->vdev = vdev;
2729 	vdev->coreid = core_add;
2730 
2731 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2732 
2733 	/* Initialize device stats */
2734 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2735 
2736 	/* Disable notifications. */
2737 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2738 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2739 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2740 	dev->flags |= VIRTIO_DEV_RUNNING;
2741 
2742 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2743 
2744 	return 0;
2745 }
2746 
2747 /*
2748  * These callbacks allow devices to be added to the data core once
2749  * configuration has fully completed.
2750  */
2751 static const struct virtio_net_device_ops virtio_net_device_ops =
2752 {
2753 	.new_device =  new_device,
2754 	.destroy_device = destroy_device,
2755 };
2756 
2757 /*
2758  * This thread wakes up periodically to print statistics if the user has
2759  * enabled them.
2760  */
2761 static void
2762 print_stats(void)
2763 {
2764 	struct virtio_net_data_ll *dev_ll;
2765 	uint64_t tx_dropped, rx_dropped;
2766 	uint64_t tx, tx_total, rx, rx_total;
2767 	uint32_t device_fh;
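	/* ANSI escape sequences: clear the screen and move the cursor to the top-left corner. */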
2768 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2769 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2770 
2771 	while(1) {
2772 		sleep(enable_stats);
2773 
2774 		/* Clear screen and move to top left */
2775 		printf("%s%s", clr, top_left);
2776 
2777 		printf("\nDevice statistics ====================================");
2778 
2779 		dev_ll = ll_root_used;
2780 		while (dev_ll != NULL) {
2781 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2782 			tx_total = dev_statistics[device_fh].tx_total;
2783 			tx = dev_statistics[device_fh].tx;
2784 			tx_dropped = tx_total - tx;
2785 			if (zero_copy == 0) {
2786 				rx_total = rte_atomic64_read(
2787 					&dev_statistics[device_fh].rx_total_atomic);
2788 				rx = rte_atomic64_read(
2789 					&dev_statistics[device_fh].rx_atomic);
2790 			} else {
2791 				rx_total = dev_statistics[device_fh].rx_total;
2792 				rx = dev_statistics[device_fh].rx;
2793 			}
2794 			rx_dropped = rx_total - rx;
2795 
2796 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2797 					"\nTX total: 		%"PRIu64""
2798 					"\nTX dropped: 		%"PRIu64""
2799 					"\nTX successful: 		%"PRIu64""
2800 					"\nRX total: 		%"PRIu64""
2801 					"\nRX dropped: 		%"PRIu64""
2802 					"\nRX successful: 		%"PRIu64"",
2803 					device_fh,
2804 					tx_total,
2805 					tx_dropped,
2806 					tx,
2807 					rx_total,
2808 					rx_dropped,
2809 					rx);
2810 
2811 			dev_ll = dev_ll->next;
2812 		}
2813 		printf("\n======================================================\n");
2814 	}
2815 }
2816 
2817 static void
2818 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2819 	char *ring_name, uint32_t nb_mbuf)
2820 {
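	/* Data room for one zero-copy frame: descriptor length plus mbuf headroom, handed to rte_pktmbuf_pool_init() through its opaque argument. */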
2821 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2822 	vpool_array[index].pool
2823 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2824 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2825 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2826 		rte_pktmbuf_init, NULL, socket, 0);
2827 	if (vpool_array[index].pool != NULL) {
2828 		vpool_array[index].ring
2829 			= rte_ring_create(ring_name,
2830 				rte_align32pow2(nb_mbuf + 1),
2831 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2832 		if (likely(vpool_array[index].ring != NULL)) {
2833 			LOG_DEBUG(VHOST_CONFIG,
2834 				"in setup_mempool_tbl: mbuf count in "
2835 				"mempool is: %d\n",
2836 				rte_mempool_count(vpool_array[index].pool));
2837 			LOG_DEBUG(VHOST_CONFIG,
2838 				"in setup_mempool_tbl: mbuf count in "
2839 				"ring is: %d\n",
2840 				rte_ring_count(vpool_array[index].ring));
2841 		} else {
2842 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2843 				ring_name);
2844 		}
2845 
2846 		/* The mbuf headroom must be accounted for. */
2847 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2848 	} else {
2849 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2850 	}
2851 }
2852 
2853 
2854 /*
2855  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2856  * device is also registered here to handle the IOCTLs.
2857  */
2858 int
2859 main(int argc, char *argv[])
2860 {
2861 	struct rte_mempool *mbuf_pool = NULL;
2862 	unsigned lcore_id, core_id = 0;
2863 	unsigned nb_ports, valid_num_ports;
2864 	int ret;
2865 	uint8_t portid;
2866 	uint16_t queue_id;
2867 	static pthread_t tid;
2868 
2869 	/* init EAL */
2870 	ret = rte_eal_init(argc, argv);
2871 	if (ret < 0)
2872 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2873 	argc -= ret;
2874 	argv += ret;
2875 
2876 	/* parse app arguments */
2877 	ret = us_vhost_parse_args(argc, argv);
2878 	if (ret < 0)
2879 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2880 
2881 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2882 		if (rte_lcore_is_enabled(lcore_id))
2883 			lcore_ids[core_id ++] = lcore_id;
2884 
2885 	if (rte_lcore_count() > RTE_MAX_LCORE)
2886 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2887 
2888 	/* Set the number of switching cores available. */
2889 	num_switching_cores = rte_lcore_count()-1;
2890 
2891 	/* Get the number of physical ports. */
2892 	nb_ports = rte_eth_dev_count();
2893 	if (nb_ports > RTE_MAX_ETHPORTS)
2894 		nb_ports = RTE_MAX_ETHPORTS;
2895 
2896 	/*
2897 	 * Update the global variable num_ports and the global array ports[],
2898 	 * and derive valid_num_ports from the number of ports in the system.
2899 	 */
2900 	valid_num_ports = check_ports_num(nb_ports);
2901 
2902 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2903 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2904 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2905 		return -1;
2906 	}
2907 
2908 	if (zero_copy == 0) {
2909 		/* Create the mbuf pool. */
2910 		mbuf_pool = rte_mempool_create(
2911 				"MBUF_POOL",
2912 				NUM_MBUFS_PER_PORT
2913 				* valid_num_ports,
2914 				MBUF_SIZE, MBUF_CACHE_SIZE,
2915 				sizeof(struct rte_pktmbuf_pool_private),
2916 				rte_pktmbuf_pool_init, NULL,
2917 				rte_pktmbuf_init, NULL,
2918 				rte_socket_id(), 0);
2919 		if (mbuf_pool == NULL)
2920 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2921 
2922 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2923 			vpool_array[queue_id].pool = mbuf_pool;
2924 
2925 		if (vm2vm_mode == VM2VM_HARDWARE) {
2926 			/* Enable VT loop back to let L2 switch to do it. */
2927 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2928 			LOG_DEBUG(VHOST_CONFIG,
2929 				"Enable loop back for L2 switch in vmdq.\n");
2930 		}
2931 	} else {
2932 		uint32_t nb_mbuf;
2933 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2934 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2935 
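		/* Per-queue mbuf budget: the RX descriptor count plus per-core mempool cache and per-core burst headroom. */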
2936 		nb_mbuf = num_rx_descriptor
2937 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2938 			+ num_switching_cores * MAX_PKT_BURST;
2939 
2940 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2941 			snprintf(pool_name, sizeof(pool_name),
2942 				"rxmbuf_pool_%u", queue_id);
2943 			snprintf(ring_name, sizeof(ring_name),
2944 				"rxmbuf_ring_%u", queue_id);
2945 			setup_mempool_tbl(rte_socket_id(), queue_id,
2946 				pool_name, ring_name, nb_mbuf);
2947 		}
2948 
2949 		nb_mbuf = num_tx_descriptor
2950 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951 				+ num_switching_cores * MAX_PKT_BURST;
2952 
2953 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954 			snprintf(pool_name, sizeof(pool_name),
2955 				"txmbuf_pool_%u", queue_id);
2956 			snprintf(ring_name, sizeof(ring_name),
2957 				"txmbuf_ring_%u", queue_id);
2958 			setup_mempool_tbl(rte_socket_id(),
2959 				(queue_id + MAX_QUEUES),
2960 				pool_name, ring_name, nb_mbuf);
2961 		}
2962 
2963 		if (vm2vm_mode == VM2VM_HARDWARE) {
2964 			/* Enable VT loop back to let L2 switch to do it. */
2965 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2966 			LOG_DEBUG(VHOST_CONFIG,
2967 				"Enable loop back for L2 switch in vmdq.\n");
2968 		}
2969 	}
2970 	/* Set log level. */
2971 	rte_set_log_level(LOG_LEVEL);
2972 
2973 	/* initialize all ports */
2974 	for (portid = 0; portid < nb_ports; portid++) {
2975 		/* skip ports that are not enabled */
2976 		if ((enabled_port_mask & (1 << portid)) == 0) {
2977 			RTE_LOG(INFO, VHOST_PORT,
2978 				"Skipping disabled port %d\n", portid);
2979 			continue;
2980 		}
2981 		if (port_init(portid) != 0)
2982 			rte_exit(EXIT_FAILURE,
2983 				"Cannot initialize network ports\n");
2984 	}
2985 
2986 	/* Initialise all linked lists. */
2987 	if (init_data_ll() == -1)
2988 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2989 
2990 	/* Initialize device stats */
2991 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2992 
2993 	/* Enable stats if the user option is set. */
2994 	if (enable_stats)
2995 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2996 
2997 	/* Launch all data cores. */
2998 	if (zero_copy == 0) {
2999 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3000 			rte_eal_remote_launch(switch_worker,
3001 				mbuf_pool, lcore_id);
3002 		}
3003 	} else {
3004 		uint32_t count_in_mempool, index, i;
3005 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3006 			/* For all RX and TX queues. */
3007 			count_in_mempool
3008 				= rte_mempool_count(vpool_array[index].pool);
3009 
3010 			/*
3011 			 * Transfer all un-attached mbufs from vpool.pool
3012 			 * to vpool.ring.
3013 			 */
3014 			for (i = 0; i < count_in_mempool; i++) {
3015 				struct rte_mbuf *mbuf
3016 					= __rte_mbuf_raw_alloc(
3017 						vpool_array[index].pool);
3018 				rte_ring_sp_enqueue(vpool_array[index].ring,
3019 						(void *)mbuf);
3020 			}
3021 
3022 			LOG_DEBUG(VHOST_CONFIG,
3023 				"in main: mbuf count in mempool at initial "
3024 				"is: %d\n", count_in_mempool);
3025 			LOG_DEBUG(VHOST_CONFIG,
3026 				"in main: mbuf count in ring at initial is:"
3027 				" %d\n",
3028 				rte_ring_count(vpool_array[index].ring));
3029 		}
3030 
3031 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3032 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3033 				lcore_id);
3034 	}
3035 
3036 	if (mergeable == 0)
3037 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3038 
3039 	/* Register CUSE device to handle IOCTLs. */
3040 	ret = rte_vhost_driver_register((char *)&dev_basename);
3041 	if (ret != 0)
3042 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3043 
3044 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3045 
3046 	/* Start CUSE session. */
3047 	rte_vhost_driver_session_start();
3048 	return 0;
3049 
3050 }
3051 
3052