xref: /dpdk/examples/vhost/main.c (revision aae5e11e847ec0b07b00c14564fdef787a0f3595)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
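/*
 * Note: num_switching_cores is a runtime variable, so this macro expands to a
 * runtime expression rather than a compile-time constant.
 */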
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation: the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
103 
104 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
106 
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108 
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX			1
112 #define DEVICE_SAFE_REMOVE	2
113 
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117 
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121 
122 /*
123  * These two macros need refining for the legacy and DPDK-based front ends:
124  * take the max vring avail descriptors/entries from the guest minus
125  * MAX_PKT_BURST, then adjust to a power of 2.
126  */
127 /*
128  * For the legacy front end there are 128 descriptors:
129  * half for virtio headers, the other half for mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133 
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 		+ sizeof(struct rte_mbuf)))
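/*
 * The zero copy path stores the vring descriptor index in the first 4 bytes of
 * the mbuf headroom (see attach_rxmbuf_zcp()/txmbuf_clean_zcp()) so the
 * descriptor can later be returned to the used ring.
 */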
137 
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140 
141 #define INVALID_PORT_ID 0xFF
142 
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145 
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148 
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151 
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154 
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157 
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
160 
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163 
164 /* Promiscuous mode */
165 static uint32_t promiscuous;
166 
167 /* Number of switching cores enabled */
168 static uint32_t num_switching_cores = 0;
169 
170 /* Number of devices/queues to support */
171 static uint32_t num_queues = 0;
172 static uint32_t num_devices;
173 
174 /*
175  * Enable zero copy: packet buffers are DMA'd directly to the HW descriptors.
176  * Disabled by default.
177  */
178 static uint32_t zero_copy;
179 static int mergeable;
180 
181 /* Number of descriptors to use */
182 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
183 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
184 
185 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
186 #define MAX_RING_DESC 4096
187 
188 struct vpool {
189 	struct rte_mempool *pool;
190 	struct rte_ring *ring;
191 	uint32_t buf_size;
192 } vpool_array[MAX_QUEUES+MAX_QUEUES];
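/*
 * The first MAX_QUEUES entries hold the RX mempools/rings (indexed by VMDQ RX
 * queue, also used for the port RX queue setup); the second MAX_QUEUES entries
 * hold the pools used by the zero copy TX path (see virtio_tx_route_zcp()).
 */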
193 
194 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
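/*
 * VM2VM_DISABLED: all guest traffic is sent out of the physical port.
 * VM2VM_SOFTWARE: packets destined for a local VM are switched in software
 *                 (see virtio_tx_local()).
 * VM2VM_HARDWARE: packets for a local VM are tagged with the destination VM's
 *                 VLAN and sent to the NIC, which is expected to switch them
 *                 back (see find_local_dest()).
 */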
195 typedef enum {
196 	VM2VM_DISABLED = 0,
197 	VM2VM_SOFTWARE = 1,
198 	VM2VM_HARDWARE = 2,
199 	VM2VM_LAST
200 } vm2vm_type;
201 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
202 
203 /* The type of host physical address translated from guest physical address. */
204 typedef enum {
205 	PHYS_ADDR_CONTINUOUS = 0,
206 	PHYS_ADDR_CROSS_SUBREG = 1,
207 	PHYS_ADDR_INVALID = 2,
208 	PHYS_ADDR_LAST
209 } hpa_type;
210 
211 /* Enable stats. */
212 static uint32_t enable_stats = 0;
213 /* Enable retries on RX. */
214 static uint32_t enable_retry = 1;
215 /* Specify the timeout (in microseconds) between retries on RX. */
216 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
217 /* Specify the number of retries on RX. */
218 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
219 
220 /* Character device basename. Can be set by user. */
221 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
222 
223 
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
226 	.rx_thresh = {
227 		.pthresh = RX_PTHRESH,
228 		.hthresh = RX_HTHRESH,
229 		.wthresh = RX_WTHRESH,
230 	},
231 	.rx_drop_en = 1,
232 };
233 
234 /*
235  * These default values are optimized for use with the Intel(R) 82599 10 GbE
236  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237  * network controllers and/or network drivers.
238  */
239 static struct rte_eth_txconf tx_conf_default = {
240 	.tx_thresh = {
241 		.pthresh = TX_PTHRESH,
242 		.hthresh = TX_HTHRESH,
243 		.wthresh = TX_WTHRESH,
244 	},
245 	.tx_free_thresh = 0, /* Use PMD default values */
246 	.tx_rs_thresh = 0, /* Use PMD default values */
247 };
248 
249 /* Empty vmdq configuration structure. Filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
251 	.rxmode = {
252 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
253 		.split_hdr_size = 0,
254 		.header_split   = 0, /**< Header Split disabled */
255 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
256 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
257 		/*
258 		 * This is necessary for 1G NICs such as the I350:
259 		 * it fixes a bug where IPv4 forwarding in the guest could not
260 		 * forward packets from one virtio dev to another virtio dev.
261 		 */
262 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
263 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
264 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
265 	},
266 
267 	.txmode = {
268 		.mq_mode = ETH_MQ_TX_NONE,
269 	},
270 	.rx_adv_conf = {
271 		/*
272 		 * should be overridden separately in code with
273 		 * appropriate values
274 		 */
275 		.vmdq_rx_conf = {
276 			.nb_queue_pools = ETH_8_POOLS,
277 			.enable_default_pool = 0,
278 			.default_pool = 0,
279 			.nb_pool_maps = 0,
280 			.pool_map = {{0, 0},},
281 		},
282 	},
283 };
284 
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288 
289 static const uint16_t external_pkt_default_vlan_tag = 2000;
290 const uint16_t vlan_tags[] = {
291 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
292 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
293 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
294 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
295 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
296 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
297 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
298 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
299 };
300 
301 /* ethernet addresses of ports */
302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
303 
304 /* heads for the main used and free linked lists for the data path. */
305 static struct virtio_net_data_ll *ll_root_used = NULL;
306 static struct virtio_net_data_ll *ll_root_free = NULL;
307 
308 /* Array of data core structures containing information on individual core linked lists. */
309 static struct lcore_info lcore_info[RTE_MAX_LCORE];
310 
311 /* Used for queueing bursts of TX packets. */
312 struct mbuf_table {
313 	unsigned len;
314 	unsigned txq_id;
315 	struct rte_mbuf *m_table[MAX_PKT_BURST];
316 };
317 
318 /* TX queue for each data core. */
319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
320 
321 /* TX queue for each virtio device for zero copy. */
322 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
323 
324 /* Vlan header struct used to insert vlan tags on TX. */
325 struct vlan_ethhdr {
326 	unsigned char   h_dest[ETH_ALEN];
327 	unsigned char   h_source[ETH_ALEN];
328 	__be16          h_vlan_proto;
329 	__be16          h_vlan_TCI;
330 	__be16          h_vlan_encapsulated_proto;
331 };
332 
333 /* IPv4 Header */
334 struct ipv4_hdr {
335 	uint8_t  version_ihl;		/**< version and header length */
336 	uint8_t  type_of_service;	/**< type of service */
337 	uint16_t total_length;		/**< length of packet */
338 	uint16_t packet_id;		/**< packet ID */
339 	uint16_t fragment_offset;	/**< fragmentation offset */
340 	uint8_t  time_to_live;		/**< time to live */
341 	uint8_t  next_proto_id;		/**< protocol ID */
342 	uint16_t hdr_checksum;		/**< header checksum */
343 	uint32_t src_addr;		/**< source address */
344 	uint32_t dst_addr;		/**< destination address */
345 } __attribute__((__packed__));
346 
347 /* Header lengths. */
348 #define VLAN_HLEN       4
349 #define VLAN_ETH_HLEN   18
350 
351 /* Per-device statistics struct */
352 struct device_statistics {
353 	uint64_t tx_total;
354 	rte_atomic64_t rx_total_atomic;
355 	uint64_t rx_total;
356 	uint64_t tx;
357 	rte_atomic64_t rx_atomic;
358 	uint64_t rx;
359 } __rte_cache_aligned;
360 struct device_statistics dev_statistics[MAX_DEVICES];
361 
362 /*
363  * Builds up the correct configuration for VMDQ VLAN pool map
364  * according to the pool & queue limits.
365  */
366 static inline int
367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
368 {
369 	struct rte_eth_vmdq_rx_conf conf;
370 	struct rte_eth_vmdq_rx_conf *def_conf =
371 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
372 	unsigned i;
373 
374 	memset(&conf, 0, sizeof(conf));
375 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
376 	conf.nb_pool_maps = num_devices;
377 	conf.enable_loop_back = def_conf->enable_loop_back;
378 	conf.rx_mode = def_conf->rx_mode;
379 
380 	for (i = 0; i < conf.nb_pool_maps; i++) {
381 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
382 		conf.pool_map[i].pools = (1UL << i);
383 	}
384 
385 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
386 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
387 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
388 	return 0;
389 }
390 
391 /*
392  * Validate the device number against the max pool number obtained from
393  * dev_info. If the device number is invalid, print an error message and
394  * return -1. Each device must have its own pool.
395  */
396 static inline int
397 validate_num_devices(uint32_t max_nb_devices)
398 {
399 	if (num_devices > max_nb_devices) {
400 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
401 		return -1;
402 	}
403 	return 0;
404 }
405 
406 /*
407  * Initialises a given port using global settings and with the rx buffers
408  * coming from the mbuf_pool passed as parameter
409  */
410 static inline int
411 port_init(uint8_t port)
412 {
413 	struct rte_eth_dev_info dev_info;
414 	struct rte_eth_conf port_conf;
415 	uint16_t rx_rings, tx_rings;
416 	uint16_t rx_ring_size, tx_ring_size;
417 	int retval;
418 	uint16_t q;
419 
420 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
421 	rte_eth_dev_info_get (port, &dev_info);
422 
423 	/* Configure the number of supported virtio devices based on VMDQ limits. */
424 	num_devices = dev_info.max_vmdq_pools;
425 	num_queues = dev_info.max_rx_queues;
426 
427 	if (zero_copy) {
428 		rx_ring_size = num_rx_descriptor;
429 		tx_ring_size = num_tx_descriptor;
430 		tx_rings = dev_info.max_tx_queues;
431 	} else {
432 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
433 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
434 		tx_rings = (uint16_t)rte_lcore_count();
435 	}
436 
437 	retval = validate_num_devices(MAX_DEVICES);
438 	if (retval < 0)
439 		return retval;
440 
441 	/* Get port configuration. */
442 	retval = get_eth_conf(&port_conf, num_devices);
443 	if (retval < 0)
444 		return retval;
445 
446 	if (port >= rte_eth_dev_count()) return -1;
447 
448 	rx_rings = (uint16_t)num_queues;
449 	/* Configure ethernet device. */
450 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
451 	if (retval != 0)
452 		return retval;
453 
454 	/* Setup the queues. */
455 	for (q = 0; q < rx_rings; q ++) {
456 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
457 						rte_eth_dev_socket_id(port), &rx_conf_default,
458 						vpool_array[q].pool);
459 		if (retval < 0)
460 			return retval;
461 	}
462 	for (q = 0; q < tx_rings; q ++) {
463 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
464 						rte_eth_dev_socket_id(port), &tx_conf_default);
465 		if (retval < 0)
466 			return retval;
467 	}
468 
469 	/* Start the device. */
470 	retval  = rte_eth_dev_start(port);
471 	if (retval < 0) {
472 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
473 		return retval;
474 	}
475 
476 	if (promiscuous)
477 		rte_eth_promiscuous_enable(port);
478 
479 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
480 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
481 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
482 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
483 			(unsigned)port,
484 			vmdq_ports_eth_addr[port].addr_bytes[0],
485 			vmdq_ports_eth_addr[port].addr_bytes[1],
486 			vmdq_ports_eth_addr[port].addr_bytes[2],
487 			vmdq_ports_eth_addr[port].addr_bytes[3],
488 			vmdq_ports_eth_addr[port].addr_bytes[4],
489 			vmdq_ports_eth_addr[port].addr_bytes[5]);
490 
491 	return 0;
492 }
493 
494 /*
495  * Set character device basename.
496  */
497 static int
498 us_vhost_parse_basename(const char *q_arg)
499 {
500 	/* Parse the basename string. */
501 
502 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
503 		return -1;
504 	else
505 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
506 
507 	return 0;
508 }
509 
510 /*
511  * Parse the portmask provided at run time.
512  */
513 static int
514 parse_portmask(const char *portmask)
515 {
516 	char *end = NULL;
517 	unsigned long pm;
518 
519 	errno = 0;
520 
521 	/* parse hexadecimal string */
522 	pm = strtoul(portmask, &end, 16);
523 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
524 		return -1;
525 
526 	if (pm == 0)
527 		return -1;
528 
529 	return pm;
530 
531 }
532 
533 /*
534  * Parse num options at run time.
535  */
536 static int
537 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
538 {
539 	char *end = NULL;
540 	unsigned long num;
541 
542 	errno = 0;
543 
544 	/* parse unsigned int string */
545 	num = strtoul(q_arg, &end, 10);
546 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
547 		return -1;
548 
549 	if (num > max_valid_value)
550 		return -1;
551 
552 	return num;
553 
554 }
555 
556 /*
557  * Display usage
558  */
559 static void
560 us_vhost_usage(const char *prgname)
561 {
562 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
563 	"		--vm2vm [0|1|2]\n"
564 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
565 	"		--dev-basename <name>\n"
566 	"		--nb-devices ND\n"
567 	"		-p PORTMASK: Set mask for ports to be used by application\n"
568 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
569 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
570 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
571 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
572 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
573 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574 	"		--dev-basename: The basename to be used for the character device.\n"
575 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
576 			"zero copy\n"
577 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
578 			"used only when zero copy is enabled.\n"
579 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
580 			"used only when zero copy is enabled.\n",
581 	       prgname);
582 }
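/*
 * Example invocation (the binary name and EAL core/memory-channel options are
 * illustrative only):
 *   ./vhost-switch -c f -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 --stats 2
 */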
583 
584 /*
585  * Parse the arguments given in the command line of the application.
586  */
587 static int
588 us_vhost_parse_args(int argc, char **argv)
589 {
590 	int opt, ret;
591 	int option_index;
592 	unsigned i;
593 	const char *prgname = argv[0];
594 	static struct option long_option[] = {
595 		{"vm2vm", required_argument, NULL, 0},
596 		{"rx-retry", required_argument, NULL, 0},
597 		{"rx-retry-delay", required_argument, NULL, 0},
598 		{"rx-retry-num", required_argument, NULL, 0},
599 		{"mergeable", required_argument, NULL, 0},
600 		{"stats", required_argument, NULL, 0},
601 		{"dev-basename", required_argument, NULL, 0},
602 		{"zero-copy", required_argument, NULL, 0},
603 		{"rx-desc-num", required_argument, NULL, 0},
604 		{"tx-desc-num", required_argument, NULL, 0},
605 		{NULL, 0, 0, 0},
606 	};
607 
608 	/* Parse command line */
609 	while ((opt = getopt_long(argc, argv, "p:P",
610 			long_option, &option_index)) != EOF) {
611 		switch (opt) {
612 		/* Portmask */
613 		case 'p':
614 			enabled_port_mask = parse_portmask(optarg);
615 			if (enabled_port_mask == 0) {
616 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617 				us_vhost_usage(prgname);
618 				return -1;
619 			}
620 			break;
621 
622 		case 'P':
623 			promiscuous = 1;
624 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625 				ETH_VMDQ_ACCEPT_BROADCAST |
626 				ETH_VMDQ_ACCEPT_MULTICAST;
627 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
628 
629 			break;
630 
631 		case 0:
632 			/* Enable/disable vm2vm comms. */
633 			if (!strncmp(long_option[option_index].name, "vm2vm",
634 				MAX_LONG_OPT_SZ)) {
635 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
636 				if (ret == -1) {
637 					RTE_LOG(INFO, VHOST_CONFIG,
638 						"Invalid argument for "
639 						"vm2vm [0|1|2]\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				} else {
643 					vm2vm_mode = (vm2vm_type)ret;
644 				}
645 			}
646 
647 			/* Enable/disable retries on RX. */
648 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649 				ret = parse_num_opt(optarg, 1);
650 				if (ret == -1) {
651 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652 					us_vhost_usage(prgname);
653 					return -1;
654 				} else {
655 					enable_retry = ret;
656 				}
657 			}
658 
659 			/* Specify the retry delay time (in microseconds) on RX. */
660 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661 				ret = parse_num_opt(optarg, INT32_MAX);
662 				if (ret == -1) {
663 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664 					us_vhost_usage(prgname);
665 					return -1;
666 				} else {
667 					burst_rx_delay_time = ret;
668 				}
669 			}
670 
671 			/* Specify the number of retries on RX. */
672 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673 				ret = parse_num_opt(optarg, INT32_MAX);
674 				if (ret == -1) {
675 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676 					us_vhost_usage(prgname);
677 					return -1;
678 				} else {
679 					burst_rx_retry_num = ret;
680 				}
681 			}
682 
683 			/* Enable/disable RX mergeable buffers. */
684 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685 				ret = parse_num_opt(optarg, 1);
686 				if (ret == -1) {
687 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688 					us_vhost_usage(prgname);
689 					return -1;
690 				} else {
691 					mergeable = !!ret;
692 					if (ret) {
693 						vmdq_conf_default.rxmode.jumbo_frame = 1;
694 						vmdq_conf_default.rxmode.max_rx_pkt_len
695 							= JUMBO_FRAME_MAX_SIZE;
696 					}
697 				}
698 			}
699 
700 			/* Enable/disable stats. */
701 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
702 				ret = parse_num_opt(optarg, INT32_MAX);
703 				if (ret == -1) {
704 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
705 					us_vhost_usage(prgname);
706 					return -1;
707 				} else {
708 					enable_stats = ret;
709 				}
710 			}
711 
712 			/* Set character device basename. */
713 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
714 				if (us_vhost_parse_basename(optarg) == -1) {
715 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
716 					us_vhost_usage(prgname);
717 					return -1;
718 				}
719 			}
720 
721 			/* Enable/disable rx/tx zero copy. */
722 			if (!strncmp(long_option[option_index].name,
723 				"zero-copy", MAX_LONG_OPT_SZ)) {
724 				ret = parse_num_opt(optarg, 1);
725 				if (ret == -1) {
726 					RTE_LOG(INFO, VHOST_CONFIG,
727 						"Invalid argument"
728 						" for zero-copy [0|1]\n");
729 					us_vhost_usage(prgname);
730 					return -1;
731 				} else
732 					zero_copy = ret;
733 
734 				if (zero_copy) {
735 #ifdef RTE_MBUF_REFCNT
736 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
737 					"zero copy vhost APP, please "
738 					"disable RTE_MBUF_REFCNT\n"
739 					"in config file and then rebuild DPDK "
740 					"core lib!\n"
741 					"Otherwise please disable zero copy "
742 					"flag in command line!\n");
743 					return -1;
744 #endif
745 				}
746 			}
747 
748 			/* Specify the descriptor number on RX. */
749 			if (!strncmp(long_option[option_index].name,
750 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
751 				ret = parse_num_opt(optarg, MAX_RING_DESC);
752 				if ((ret == -1) || (!POWEROF2(ret))) {
753 					RTE_LOG(INFO, VHOST_CONFIG,
754 					"Invalid argument for rx-desc-num [0-N], "
755 					"power of 2 required.\n");
756 					us_vhost_usage(prgname);
757 					return -1;
758 				} else {
759 					num_rx_descriptor = ret;
760 				}
761 			}
762 
763 			/* Specify the descriptor number on TX. */
764 			if (!strncmp(long_option[option_index].name,
765 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
766 				ret = parse_num_opt(optarg, MAX_RING_DESC);
767 				if ((ret == -1) || (!POWEROF2(ret))) {
768 					RTE_LOG(INFO, VHOST_CONFIG,
769 					"Invalid argument for tx-desc-num [0-N], "
770 					"power of 2 required.\n");
771 					us_vhost_usage(prgname);
772 					return -1;
773 				} else {
774 					num_tx_descriptor = ret;
775 				}
776 			}
777 
778 			break;
779 
780 			/* Invalid option - print options. */
781 		default:
782 			us_vhost_usage(prgname);
783 			return -1;
784 		}
785 	}
786 
787 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
788 		if (enabled_port_mask & (1 << i))
789 			ports[num_ports++] = (uint8_t)i;
790 	}
791 
792 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
793 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
794 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
795 		return -1;
796 	}
797 
798 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
799 		RTE_LOG(INFO, VHOST_PORT,
800 			"Vhost zero copy doesn't support software vm2vm, "
801 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
802 		return -1;
803 	}
804 
805 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
806 		RTE_LOG(INFO, VHOST_PORT,
807 			"Vhost zero copy doesn't support jumbo frame, "
808 			"please specify '--mergeable 0' to disable the "
809 			"mergeable feature.\n");
810 		return -1;
811 	}
812 
813 	return 0;
814 }
815 
816 /*
817  * Update the global variable num_ports and the array ports according to the
818  * number of system ports, and return the number of valid ports.
819  */
820 static unsigned check_ports_num(unsigned nb_ports)
821 {
822 	unsigned valid_num_ports = num_ports;
823 	unsigned portid;
824 
825 	if (num_ports > nb_ports) {
826 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
827 			num_ports, nb_ports);
828 		num_ports = nb_ports;
829 	}
830 
831 	for (portid = 0; portid < num_ports; portid ++) {
832 		if (ports[portid] >= nb_ports) {
833 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
834 				ports[portid], (nb_ports - 1));
835 			ports[portid] = INVALID_PORT_ID;
836 			valid_num_ports--;
837 		}
838 	}
839 	return valid_num_ports;
840 }
841 
842 /*
843  * Macro to print out packet contents. Wrapped in debug define so that the
844  * data path is not affected when debug is disabled.
845  */
846 #ifdef DEBUG
847 #define PRINT_PACKET(device, addr, size, header) do {																\
848 	char *pkt_addr = (char*)(addr);																					\
849 	unsigned int index;																								\
850 	char packet[MAX_PRINT_BUFF];																					\
851 																													\
852 	if ((header))																									\
853 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
854 	else																											\
855 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
856 	for (index = 0; index < (size); index++) {																		\
857 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
858 			"%02hhx ", pkt_addr[index]);																			\
859 	}																												\
860 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
861 																													\
862 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
863 } while(0)
864 #else
865 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
866 #endif
867 
868 /*
869  * Function to convert guest physical addresses to vhost physical addresses.
870  * This is used to convert virtio buffer addresses.
871  */
872 static inline uint64_t __attribute__((always_inline))
873 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
874 	uint32_t buf_len, hpa_type *addr_type)
875 {
876 	struct virtio_memory_regions_hpa *region;
877 	uint32_t regionidx;
878 	uint64_t vhost_pa = 0;
879 
880 	*addr_type = PHYS_ADDR_INVALID;
881 
882 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
883 		region = &vdev->regions_hpa[regionidx];
884 		if ((guest_pa >= region->guest_phys_address) &&
885 			(guest_pa <= region->guest_phys_address_end)) {
886 			vhost_pa = region->host_phys_addr_offset + guest_pa;
887 			if (likely((guest_pa + buf_len - 1)
888 				<= region->guest_phys_address_end))
889 				*addr_type = PHYS_ADDR_CONTINUOUS;
890 			else
891 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
892 			break;
893 		}
894 	}
895 
896 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
897 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
898 		(void *)(uintptr_t)vhost_pa);
899 
900 	return vhost_pa;
901 }
902 
903 /*
904  * Compares a packet destination MAC address to a device MAC address.
905  */
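/*
 * MAC_ADDR_CMP masks the 64-bit load down to the low 48 bits so that only the
 * six MAC address bytes take part in the comparison (this assumes a
 * little-endian byte order).
 */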
906 static inline int __attribute__((always_inline))
907 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
908 {
909 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
910 }
911 
912 /*
913  * This function learns the MAC address of the device and registers this along with a
914  * vlan tag to a VMDQ.
915  */
916 static int
917 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
918 {
919 	struct ether_hdr *pkt_hdr;
920 	struct virtio_net_data_ll *dev_ll;
921 	struct virtio_net *dev = vdev->dev;
922 	int i, ret;
923 
924 	/* Learn MAC address of guest device from packet */
925 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
926 
927 	dev_ll = ll_root_used;
928 
929 	while (dev_ll != NULL) {
930 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
931 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
932 			return -1;
933 		}
934 		dev_ll = dev_ll->next;
935 	}
936 
937 	for (i = 0; i < ETHER_ADDR_LEN; i++)
938 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
939 
940 	/* vlan_tag currently uses the device_id. */
941 	vdev->vlan_tag = vlan_tags[dev->device_fh];
942 
943 	/* Print out VMDQ registration info. */
944 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
945 		dev->device_fh,
946 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
947 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
948 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
949 		vdev->vlan_tag);
950 
951 	/* Register the MAC address. */
952 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
953 	if (ret)
954 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
955 					dev->device_fh);
956 
957 	/* Enable stripping of the vlan tag as we handle routing. */
958 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
959 
960 	/* Set device as ready for RX. */
961 	vdev->ready = DEVICE_RX;
962 
963 	return 0;
964 }
965 
966 /*
967  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
968  * queue before disabling RX on the device.
969  */
970 static inline void
971 unlink_vmdq(struct vhost_dev *vdev)
972 {
973 	unsigned i = 0;
974 	unsigned rx_count;
975 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
976 
977 	if (vdev->ready == DEVICE_RX) {
978 		/*clear MAC and VLAN settings*/
979 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
980 		for (i = 0; i < 6; i++)
981 			vdev->mac_address.addr_bytes[i] = 0;
982 
983 		vdev->vlan_tag = 0;
984 
985 		/*Clear out the receive buffers*/
986 		rx_count = rte_eth_rx_burst(ports[0],
987 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
988 
989 		while (rx_count) {
990 			for (i = 0; i < rx_count; i++)
991 				rte_pktmbuf_free(pkts_burst[i]);
992 
993 			rx_count = rte_eth_rx_burst(ports[0],
994 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
995 		}
996 
997 		vdev->ready = DEVICE_MAC_LEARNING;
998 	}
999 }
1000 
1001 /*
1002  * Check if the packet destination MAC address is for a local device. If so then put
1003  * the packet on that device's RX queue. If not then return.
1004  */
1005 static inline int __attribute__((always_inline))
1006 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1007 {
1008 	struct virtio_net_data_ll *dev_ll;
1009 	struct ether_hdr *pkt_hdr;
1010 	uint64_t ret = 0;
1011 	struct virtio_net *dev = vdev->dev;
1012 	struct virtio_net *tdev; /* destination virtio device */
1013 
1014 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1015 
1016 	/*get the used devices list*/
1017 	dev_ll = ll_root_used;
1018 
1019 	while (dev_ll != NULL) {
1020 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1021 				          &dev_ll->vdev->mac_address)) {
1022 
1023 			/* Drop the packet if the TX packet is destined for the TX device. */
1024 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1025 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1026 							dev->device_fh);
1027 				return 0;
1028 			}
1029 			tdev = dev_ll->vdev->dev;
1030 
1031 
1032 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1033 
1034 			if (unlikely(dev_ll->vdev->remove)) {
1035 				/*drop the packet if the device is marked for removal*/
1036 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1037 			} else {
1038 				/*send the packet to the local virtio device*/
1039 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1040 				if (enable_stats) {
1041 					rte_atomic64_add(
1042 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1043 					1);
1044 					rte_atomic64_add(
1045 					&dev_statistics[tdev->device_fh].rx_atomic,
1046 					ret);
1047 					dev_statistics[tdev->device_fh].tx_total++;
1048 					dev_statistics[tdev->device_fh].tx += ret;
1049 				}
1050 			}
1051 
1052 			return 0;
1053 		}
1054 		dev_ll = dev_ll->next;
1055 	}
1056 
1057 	return -1;
1058 }
1059 
1060 /*
1061  * Check if the destination MAC of a packet belongs to a local VM,
1062  * and if so get its vlan tag and offset.
1063  */
1064 static inline int __attribute__((always_inline))
1065 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1066 	uint32_t *offset, uint16_t *vlan_tag)
1067 {
1068 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1069 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1070 
1071 	while (dev_ll != NULL) {
1072 		if ((dev_ll->vdev->ready == DEVICE_RX)
1073 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1074 		&dev_ll->vdev->mac_address)) {
1075 			/*
1076 			 * Drop the packet if the TX packet is
1077 			 * destined for the TX device.
1078 			 */
1079 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1080 				LOG_DEBUG(VHOST_DATA,
1081 				"(%"PRIu64") TX: Source and destination"
1082 				" MAC addresses are the same. Dropping "
1083 				"packet.\n",
1084 				dev_ll->vdev->dev->device_fh);
1085 				return -1;
1086 			}
1087 
1088 			/*
1089 			 * HW vlan strip reduces the packet length by the
1090 			 * length of the vlan tag, so we need to restore the
1091 			 * packet length by adding it back.
1092 			 */
1093 			*offset = VLAN_HLEN;
1094 			*vlan_tag =
1095 			(uint16_t)
1096 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1097 
1098 			LOG_DEBUG(VHOST_DATA,
1099 			"(%"PRIu64") TX: pkt to local VM device id:"
1100 			"(%"PRIu64") vlan tag: %d.\n",
1101 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1102 			*vlan_tag);
1103 
1104 			break;
1105 		}
1106 		dev_ll = dev_ll->next;
1107 	}
1108 	return 0;
1109 }
1110 
1111 /*
1112  * This function routes the TX packet to the correct interface. This may be a local device
1113  * or the physical port.
1114  */
1115 static inline void __attribute__((always_inline))
1116 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1117 {
1118 	struct mbuf_table *tx_q;
1119 	struct rte_mbuf **m_table;
1120 	unsigned len, ret, offset = 0;
1121 	const uint16_t lcore_id = rte_lcore_id();
1122 	struct virtio_net *dev = vdev->dev;
1123 
1124 	/*check if destination is local VM*/
1125 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1126 		rte_pktmbuf_free(m);
1127 		return;
1128 	}
1129 
1130 	if (vm2vm_mode == VM2VM_HARDWARE) {
1131 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1132 			offset > rte_pktmbuf_tailroom(m)) {
1133 			rte_pktmbuf_free(m);
1134 			return;
1135 		}
1136 	}
1137 
1138 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1139 
1140 	/*Add packet to the port tx queue*/
1141 	tx_q = &lcore_tx_queue[lcore_id];
1142 	len = tx_q->len;
1143 
1144 	m->ol_flags = PKT_TX_VLAN_PKT;
1145 
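	/*
	 * offset is non-zero only in hardware vm2vm mode (VLAN_HLEN), compensating
	 * for the VLAN tag stripped by the hardware on receive.
	 */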
1146 	m->data_len += offset;
1147 	m->pkt_len += offset;
1148 
1149 	m->vlan_tci = vlan_tag;
1150 
1151 	tx_q->m_table[len] = m;
1152 	len++;
1153 	if (enable_stats) {
1154 		dev_statistics[dev->device_fh].tx_total++;
1155 		dev_statistics[dev->device_fh].tx++;
1156 	}
1157 
1158 	if (unlikely(len == MAX_PKT_BURST)) {
1159 		m_table = (struct rte_mbuf **)tx_q->m_table;
1160 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1161 		/* Free any buffers not handled by TX and update the port stats. */
1162 		if (unlikely(ret < len)) {
1163 			do {
1164 				rte_pktmbuf_free(m_table[ret]);
1165 			} while (++ret < len);
1166 		}
1167 
1168 		len = 0;
1169 	}
1170 
1171 	tx_q->len = len;
1172 	return;
1173 }
1174 /*
1175  * This function is called by each data core. It handles all RX/TX registered with the
1176  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1177  * with all devices in the main linked list.
1178  */
1179 static int
1180 switch_worker(void *arg)
1181 {
1182 	struct rte_mempool *mbuf_pool = arg;
1183 	struct virtio_net *dev = NULL;
1184 	struct vhost_dev *vdev = NULL;
1185 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1186 	struct virtio_net_data_ll *dev_ll;
1187 	struct mbuf_table *tx_q;
1188 	volatile struct lcore_ll_info *lcore_ll;
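	/* Convert the TX drain period from microseconds into TSC cycles. */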
1189 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1190 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1191 	unsigned ret, i;
1192 	const uint16_t lcore_id = rte_lcore_id();
1193 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1194 	uint16_t rx_count = 0;
1195 	uint16_t tx_count;
1196 	uint32_t retry = 0;
1197 
1198 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1199 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1200 	prev_tsc = 0;
1201 
1202 	tx_q = &lcore_tx_queue[lcore_id];
1203 	for (i = 0; i < num_cores; i ++) {
1204 		if (lcore_ids[i] == lcore_id) {
1205 			tx_q->txq_id = i;
1206 			break;
1207 		}
1208 	}
1209 
1210 	while(1) {
1211 		cur_tsc = rte_rdtsc();
1212 		/*
1213 		 * TX burst queue drain
1214 		 */
1215 		diff_tsc = cur_tsc - prev_tsc;
1216 		if (unlikely(diff_tsc > drain_tsc)) {
1217 
1218 			if (tx_q->len) {
1219 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1220 
1221 				/*Tx any packets in the queue*/
1222 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1223 									   (struct rte_mbuf **)tx_q->m_table,
1224 									   (uint16_t)tx_q->len);
1225 				if (unlikely(ret < tx_q->len)) {
1226 					do {
1227 						rte_pktmbuf_free(tx_q->m_table[ret]);
1228 					} while (++ret < tx_q->len);
1229 				}
1230 
1231 				tx_q->len = 0;
1232 			}
1233 
1234 			prev_tsc = cur_tsc;
1235 
1236 		}
1237 
1238 		rte_prefetch0(lcore_ll->ll_root_used);
1239 		/*
1240 		 * Inform the configuration core that we have exited the linked list and that no devices are
1241 		 * in use if requested.
1242 		 */
1243 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1244 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1245 
1246 		/*
1247 		 * Process devices
1248 		 */
1249 		dev_ll = lcore_ll->ll_root_used;
1250 
1251 		while (dev_ll != NULL) {
1252 			/*get virtio device ID*/
1253 			vdev = dev_ll->vdev;
1254 			dev = vdev->dev;
1255 
1256 			if (unlikely(vdev->remove)) {
1257 				dev_ll = dev_ll->next;
1258 				unlink_vmdq(vdev);
1259 				vdev->ready = DEVICE_SAFE_REMOVE;
1260 				continue;
1261 			}
1262 			if (likely(vdev->ready == DEVICE_RX)) {
1263 				/*Handle guest RX*/
1264 				rx_count = rte_eth_rx_burst(ports[0],
1265 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1266 
1267 				if (rx_count) {
1268 					/*
1269 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1270 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1271 					*/
1272 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1273 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1274 							rte_delay_us(burst_rx_delay_time);
1275 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1276 								break;
1277 						}
1278 					}
1279 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1280 					if (enable_stats) {
1281 						rte_atomic64_add(
1282 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1283 						rx_count);
1284 						rte_atomic64_add(
1285 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1286 					}
1287 					while (likely(rx_count)) {
1288 						rx_count--;
1289 						rte_pktmbuf_free(pkts_burst[rx_count]);
1290 					}
1291 
1292 				}
1293 			}
1294 
1295 			if (likely(!vdev->remove)) {
1296 				/* Handle guest TX*/
1297 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1298 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1299 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1300 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1301 						while (tx_count--)
1302 							rte_pktmbuf_free(pkts_burst[tx_count]);
1303 					}
1304 				}
1305 				while (tx_count)
1306 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1307 			}
1308 
1309 			/*move to the next device in the list*/
1310 			dev_ll = dev_ll->next;
1311 		}
1312 	}
1313 
1314 	return 0;
1315 }
1316 
1317 /*
1318  * This function gets the number of available ring entries for zero copy rx.
1319  * Only one thread will call this function for a particular virtio device,
1320  * so it is designed as a non-thread-safe function.
1321  */
1322 static inline uint32_t __attribute__((always_inline))
1323 get_available_ring_num_zcp(struct virtio_net *dev)
1324 {
1325 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1326 	uint16_t avail_idx;
1327 
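	/* Volatile read so the guest's most recent avail index is observed. */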
1328 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1329 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1330 }
1331 
1332 /*
1333  * This function gets an available ring index for zero copy rx;
1334  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1335  * Only one thread will call this function for a particular virtio device,
1336  * so it is designed as a non-thread-safe function.
1337  */
1338 static inline uint32_t __attribute__((always_inline))
1339 get_available_ring_index_zcp(struct virtio_net *dev,
1340 	uint16_t *res_base_idx, uint32_t count)
1341 {
1342 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1343 	uint16_t avail_idx;
1344 	uint32_t retry = 0;
1345 	uint16_t free_entries;
1346 
1347 	*res_base_idx = vq->last_used_idx_res;
1348 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1349 	free_entries = (avail_idx - *res_base_idx);
1350 
1351 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1352 			"avail idx: %d, "
1353 			"res base idx:%d, free entries:%d\n",
1354 			dev->device_fh, avail_idx, *res_base_idx,
1355 			free_entries);
1356 
1357 	/*
1358 	 * If retry is enabled and the queue is full then we wait
1359 	 * and retry to avoid packet loss.
1360 	 */
1361 	if (enable_retry && unlikely(count > free_entries)) {
1362 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1363 			rte_delay_us(burst_rx_delay_time);
1364 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1365 			free_entries = (avail_idx - *res_base_idx);
1366 			if (count <= free_entries)
1367 				break;
1368 		}
1369 	}
1370 
1371 	/*check that we have enough buffers*/
1372 	if (unlikely(count > free_entries))
1373 		count = free_entries;
1374 
1375 	if (unlikely(count == 0)) {
1376 		LOG_DEBUG(VHOST_DATA,
1377 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1378 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1379 			dev->device_fh, avail_idx,
1380 			*res_base_idx, free_entries);
1381 		return 0;
1382 	}
1383 
1384 	vq->last_used_idx_res = *res_base_idx + count;
1385 
1386 	return count;
1387 }
1388 
1389 /*
1390  * This function puts a descriptor back onto the used list.
1391  */
1392 static inline void __attribute__((always_inline))
1393 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1394 {
1395 	uint16_t res_cur_idx = vq->last_used_idx;
1396 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1397 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1398 	rte_compiler_barrier();
1399 	*(volatile uint16_t *)&vq->used->idx += 1;
1400 	vq->last_used_idx += 1;
1401 
1402 	/* Kick the guest if necessary. */
1403 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1404 		eventfd_write((int)vq->kickfd, 1);
1405 }
1406 
1407 /*
1408  * This function gets an available descriptor from the virtio vring and an
1409  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1410  * to adjust the offset for buff_addr and phys_addr according to the PMD
1411  * implementation, otherwise the frame data may be put at the wrong location in the mbuf.
1412  */
1413 static inline void __attribute__((always_inline))
1414 attach_rxmbuf_zcp(struct virtio_net *dev)
1415 {
1416 	uint16_t res_base_idx, desc_idx;
1417 	uint64_t buff_addr, phys_addr;
1418 	struct vhost_virtqueue *vq;
1419 	struct vring_desc *desc;
1420 	struct rte_mbuf *mbuf = NULL;
1421 	struct vpool *vpool;
1422 	hpa_type addr_type;
1423 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1424 
1425 	vpool = &vpool_array[vdev->vmdq_rx_q];
1426 	vq = dev->virtqueue[VIRTIO_RXQ];
1427 
1428 	do {
1429 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1430 				1) != 1))
1431 			return;
1432 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1433 
1434 		desc = &vq->desc[desc_idx];
1435 		if (desc->flags & VRING_DESC_F_NEXT) {
1436 			desc = &vq->desc[desc->next];
1437 			buff_addr = gpa_to_vva(dev, desc->addr);
1438 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1439 					&addr_type);
1440 		} else {
1441 			buff_addr = gpa_to_vva(dev,
1442 					desc->addr + vq->vhost_hlen);
1443 			phys_addr = gpa_to_hpa(vdev,
1444 					desc->addr + vq->vhost_hlen,
1445 					desc->len, &addr_type);
1446 		}
1447 
1448 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1449 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1450 				" address found when attaching RX frame buffer"
1451 				" address!\n", dev->device_fh);
1452 			put_desc_to_used_list_zcp(vq, desc_idx);
1453 			continue;
1454 		}
1455 
1456 		/*
1457 		 * Check if the frame buffer address from guest crosses
1458 		 * sub-region or not.
1459 		 */
1460 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1461 			RTE_LOG(ERR, VHOST_DATA,
1462 				"(%"PRIu64") Frame buffer address crossing "
1463 				"sub-region found when attaching RX frame "
1464 				"buffer address!\n",
1465 				dev->device_fh);
1466 			put_desc_to_used_list_zcp(vq, desc_idx);
1467 			continue;
1468 		}
1469 	} while (unlikely(phys_addr == 0));
1470 
1471 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1472 	if (unlikely(mbuf == NULL)) {
1473 		LOG_DEBUG(VHOST_DATA,
1474 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1475 			"ring_sc_dequeue fail.\n",
1476 			dev->device_fh);
1477 		put_desc_to_used_list_zcp(vq, desc_idx);
1478 		return;
1479 	}
1480 
1481 	if (unlikely(vpool->buf_size > desc->len)) {
1482 		LOG_DEBUG(VHOST_DATA,
1483 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1484 			"length(%d) of descriptor idx: %d less than room "
1485 			"size required: %d\n",
1486 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1487 		put_desc_to_used_list_zcp(vq, desc_idx);
1488 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1489 		return;
1490 	}
1491 
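	/*
	 * Re-target the mbuf at the guest supplied buffer: back buf_addr and
	 * buf_physaddr off by the headroom so that data_off lands exactly on the
	 * guest frame data.
	 */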
1492 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1493 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1494 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1495 	mbuf->data_len = desc->len;
1496 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1497 
1498 	LOG_DEBUG(VHOST_DATA,
1499 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1500 		"descriptor idx:%d\n",
1501 		dev->device_fh, res_base_idx, desc_idx);
1502 
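	/*
	 * Return the re-targeted mbuf to its mempool: the port RX queue was set up
	 * with this same pool (see port_init()), so the NIC can receive a frame
	 * directly into the guest buffer the mbuf now points at.
	 */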
1503 	__rte_mbuf_raw_free(mbuf);
1504 
1505 	return;
1506 }
1507 
1508 /*
1509  * Detach an attached packet mbuf -
1510  *  - restore original mbuf address and length values.
1511  *  - reset pktmbuf data and data_len to their default values.
1512  *  All other fields of the given packet mbuf will be left intact.
1513  *
1514  * @param m
1515  *   The attached packet mbuf.
1516  */
1517 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1518 {
1519 	const struct rte_mempool *mp = m->pool;
1520 	void *buf = RTE_MBUF_TO_BADDR(m);
1521 	uint32_t buf_ofs;
1522 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1523 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1524 
1525 	m->buf_addr = buf;
1526 	m->buf_len = (uint16_t)buf_len;
1527 
1528 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1529 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1530 	m->data_off = buf_ofs;
1531 
1532 	m->data_len = 0;
1533 }
1534 
1535 /*
1536  * This function is called after packets have been transmitted. It fetches mbufs
1537  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1538  * the used index and kicks the guest if necessary.
1539  */
1540 static inline uint32_t __attribute__((always_inline))
1541 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1542 {
1543 	struct rte_mbuf *mbuf;
1544 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1545 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1546 	uint32_t index = 0;
1547 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1548 
1549 	LOG_DEBUG(VHOST_DATA,
1550 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1551 		"clean is: %d\n",
1552 		dev->device_fh, mbuf_count);
1553 	LOG_DEBUG(VHOST_DATA,
1554 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1555 		"clean  is : %d\n",
1556 		dev->device_fh, rte_ring_count(vpool->ring));
1557 
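	/*
	 * Each mbuf pulled from the pool was previously attached to a guest TX
	 * buffer; the descriptor index stashed in its headroom is written back to
	 * the used ring so the guest can reclaim that buffer.
	 */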
1558 	for (index = 0; index < mbuf_count; index++) {
1559 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1560 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1561 			pktmbuf_detach_zcp(mbuf);
1562 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1563 
1564 		/* Update used index buffer information. */
1565 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1566 		vq->used->ring[used_idx].len = 0;
1567 
1568 		used_idx = (used_idx + 1) & (vq->size - 1);
1569 	}
1570 
1571 	LOG_DEBUG(VHOST_DATA,
1572 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1573 		"clean is: %d\n",
1574 		dev->device_fh, rte_mempool_count(vpool->pool));
1575 	LOG_DEBUG(VHOST_DATA,
1576 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1577 		"clean  is : %d\n",
1578 		dev->device_fh, rte_ring_count(vpool->ring));
1579 	LOG_DEBUG(VHOST_DATA,
1580 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1581 		"vq->last_used_idx:%d\n",
1582 		dev->device_fh, vq->last_used_idx);
1583 
1584 	vq->last_used_idx += mbuf_count;
1585 
1586 	LOG_DEBUG(VHOST_DATA,
1587 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1588 		"vq->last_used_idx:%d\n",
1589 		dev->device_fh, vq->last_used_idx);
1590 
1591 	rte_compiler_barrier();
1592 
1593 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1594 
1595 	/* Kick guest if required. */
1596 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1597 		eventfd_write((int)vq->kickfd, 1);
1598 
1599 	return 0;
1600 }
1601 
1602 /*
1603  * This function is called when a virtio device is destroyed.
1604  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1605  */
1606 static void mbuf_destroy_zcp(struct vpool *vpool)
1607 {
1608 	struct rte_mbuf *mbuf = NULL;
1609 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1610 
1611 	LOG_DEBUG(VHOST_CONFIG,
1612 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1613 		"mbuf_destroy_zcp is: %d\n",
1614 		mbuf_count);
1615 	LOG_DEBUG(VHOST_CONFIG,
1616 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1617 		"mbuf_destroy_zcp  is : %d\n",
1618 		rte_ring_count(vpool->ring));
1619 
1620 	for (index = 0; index < mbuf_count; index++) {
1621 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1622 		if (likely(mbuf != NULL)) {
1623 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1624 				pktmbuf_detach_zcp(mbuf);
1625 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1626 		}
1627 	}
1628 
1629 	LOG_DEBUG(VHOST_CONFIG,
1630 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1631 		"mbuf_destroy_zcp is: %d\n",
1632 		rte_mempool_count(vpool->pool));
1633 	LOG_DEBUG(VHOST_CONFIG,
1634 		"in mbuf_destroy_zcp: mbuf count in ring after "
1635 		"mbuf_destroy_zcp is : %d\n",
1636 		rte_ring_count(vpool->ring));
1637 }
1638 
1639 /*
1640  * Zero copy RX enqueue: the frame data is already in guest memory, so only the virtio header is written and the used ring/index are updated.
1641  */
1642 static inline uint32_t __attribute__((always_inline))
1643 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1644 	uint32_t count)
1645 {
1646 	struct vhost_virtqueue *vq;
1647 	struct vring_desc *desc;
1648 	struct rte_mbuf *buff;
1649 	/* The virtio_hdr is initialised to 0. */
1650 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1651 		= {{0, 0, 0, 0, 0, 0}, 0};
1652 	uint64_t buff_hdr_addr = 0;
1653 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1654 	uint32_t head_idx, packet_success = 0;
1655 	uint16_t res_cur_idx;
1656 
1657 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1658 
1659 	if (count == 0)
1660 		return 0;
1661 
1662 	vq = dev->virtqueue[VIRTIO_RXQ];
1663 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1664 
1665 	res_cur_idx = vq->last_used_idx;
1666 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1667 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1668 
1669 	/* Retrieve all of the head indexes first to avoid caching issues. */
1670 	for (head_idx = 0; head_idx < count; head_idx++)
1671 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1672 
1673 	/* Prefetch the descriptor index. */
1674 	rte_prefetch0(&vq->desc[head[packet_success]]);
1675 
1676 	while (packet_success != count) {
1677 		/* Get descriptor from available ring */
1678 		desc = &vq->desc[head[packet_success]];
1679 
1680 		buff = pkts[packet_success];
1681 		LOG_DEBUG(VHOST_DATA,
1682 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1683 			"pkt[%d] descriptor idx: %d\n",
1684 			dev->device_fh, packet_success,
1685 			MBUF_HEADROOM_UINT32(buff));
1686 
1687 		PRINT_PACKET(dev,
1688 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1689 			+ RTE_PKTMBUF_HEADROOM),
1690 			rte_pktmbuf_data_len(buff), 0);
1691 
1692 		/* Buffer address translation for virtio header. */
1693 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1694 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1695 
1696 		/*
1697 		 * If the descriptors are chained the header and data are
1698 		 * placed in separate buffers.
1699 		 */
1700 		if (desc->flags & VRING_DESC_F_NEXT) {
1701 			desc->len = vq->vhost_hlen;
1702 			desc = &vq->desc[desc->next];
1703 			desc->len = rte_pktmbuf_data_len(buff);
1704 		} else {
1705 			desc->len = packet_len;
1706 		}
1707 
1708 		/* Update used ring with desc information */
1709 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1710 			= head[packet_success];
1711 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1712 			= packet_len;
1713 		res_cur_idx++;
1714 		packet_success++;
1715 
1716 		/* A header is required per buffer. */
1717 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1718 			(const void *)&virtio_hdr, vq->vhost_hlen);
1719 
1720 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1721 
1722 		if (likely(packet_success < count)) {
1723 			/* Prefetch descriptor index. */
1724 			rte_prefetch0(&vq->desc[head[packet_success]]);
1725 		}
1726 	}
1727 
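	/* Ensure the used ring entries are written before the used index is advanced below. */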
1728 	rte_compiler_barrier();
1729 
1730 	LOG_DEBUG(VHOST_DATA,
1731 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1732 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1733 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1734 
1735 	*(volatile uint16_t *)&vq->used->idx += count;
1736 	vq->last_used_idx += count;
1737 
1738 	LOG_DEBUG(VHOST_DATA,
1739 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1740 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1741 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1742 
1743 	/* Kick the guest if necessary. */
1744 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1745 		eventfd_write((int)vq->kickfd, 1);
1746 
1747 	return count;
1748 }
1749 
1750 /*
1751  * This function routes the TX packet to the correct interface.
1752  * This may be a local device or the physical port.
1753  */
1754 static inline void __attribute__((always_inline))
1755 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1756 	uint32_t desc_idx, uint8_t need_copy)
1757 {
1758 	struct mbuf_table *tx_q;
1759 	struct rte_mbuf **m_table;
1760 	struct rte_mbuf *mbuf = NULL;
1761 	unsigned len, ret, offset = 0;
1762 	struct vpool *vpool;
1763 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1764 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1765 
1766 	/* Add packet to the port TX queue. */
1767 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1768 	len = tx_q->len;
1769 
1770 	/* Allocate an mbuf and populate the structure. */
1771 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1772 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1773 	if (unlikely(mbuf == NULL)) {
1774 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1775 		RTE_LOG(ERR, VHOST_DATA,
1776 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1777 			dev->device_fh);
1778 		put_desc_to_used_list_zcp(vq, desc_idx);
1779 		return;
1780 	}
1781 
1782 	if (vm2vm_mode == VM2VM_HARDWARE) {
1783 		/* Avoid using a vlan tag assigned to a VM (such as
1784 		 * vlan_tags[dev->device_fh]) for an external packet; otherwise it
1785 		 * conflicts with pool selection: the MAC address marks it as an
1786 		 * external packet that should go to the network, while the vlan tag
1787 		 * marks it as a vm2vm packet that should be forwarded to another VM.
1788 		 * The hardware cannot resolve this ambiguity, so the packet is lost.
1789 		 */
1790 		vlan_tag = external_pkt_default_vlan_tag;
1791 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1792 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1793 			__rte_mbuf_raw_free(mbuf);
1794 			return;
1795 		}
1796 	}
1797 
1798 	mbuf->nb_segs = m->nb_segs;
1799 	mbuf->next = m->next;
1800 	mbuf->data_len = m->data_len + offset;
1801 	mbuf->pkt_len = mbuf->data_len;
1802 	if (unlikely(need_copy)) {
1803 		/* Copy the packet contents to the mbuf. */
1804 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1805 			rte_pktmbuf_mtod(m, void *),
1806 			m->data_len);
1807 	} else {
1808 		mbuf->data_off = m->data_off;
1809 		mbuf->buf_physaddr = m->buf_physaddr;
1810 		mbuf->buf_addr = m->buf_addr;
1811 	}
1812 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1813 	mbuf->vlan_tci = vlan_tag;
1814 	mbuf->l2_len = sizeof(struct ether_hdr);
1815 	mbuf->l3_len = sizeof(struct ipv4_hdr);
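	/* Stash the guest descriptor index in the mbuf headroom so txmbuf_clean_zcp() can return it to the used ring. */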
1816 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1817 
1818 	tx_q->m_table[len] = mbuf;
1819 	len++;
1820 
1821 	LOG_DEBUG(VHOST_DATA,
1822 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1823 		dev->device_fh,
1824 		mbuf->nb_segs,
1825 		(mbuf->next == NULL) ? "null" : "non-null");
1826 
1827 	if (enable_stats) {
1828 		dev_statistics[dev->device_fh].tx_total++;
1829 		dev_statistics[dev->device_fh].tx++;
1830 	}
1831 
1832 	if (unlikely(len == MAX_PKT_BURST)) {
1833 		m_table = (struct rte_mbuf **)tx_q->m_table;
1834 		ret = rte_eth_tx_burst(ports[0],
1835 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1836 
1837 		/*
1838 		 * Free any buffers not handled by TX and update
1839 		 * the port stats.
1840 		 */
1841 		if (unlikely(ret < len)) {
1842 			do {
1843 				rte_pktmbuf_free(m_table[ret]);
1844 			} while (++ret < len);
1845 		}
1846 
1847 		len = 0;
1848 		txmbuf_clean_zcp(dev, vpool);
1849 	}
1850 
1851 	tx_q->len = len;
1852 
1853 	return;
1854 }
1855 
1856 /*
1857  * This function transmits all available packets in the virtio TX queue for
1858  * one virtio-net device. If it is the first packet, it learns the MAC
1859  * address and sets up VMDQ.
1860  */
1861 static inline void __attribute__((always_inline))
1862 virtio_dev_tx_zcp(struct virtio_net *dev)
1863 {
1864 	struct rte_mbuf m;
1865 	struct vhost_virtqueue *vq;
1866 	struct vring_desc *desc;
1867 	uint64_t buff_addr = 0, phys_addr;
1868 	uint32_t head[MAX_PKT_BURST];
1869 	uint32_t i;
1870 	uint16_t free_entries, packet_success = 0;
1871 	uint16_t avail_idx;
1872 	uint8_t need_copy = 0;
1873 	hpa_type addr_type;
1874 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1875 
1876 	vq = dev->virtqueue[VIRTIO_TXQ];
1877 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1878 
1879 	/* If there are no available buffers then return. */
1880 	if (vq->last_used_idx_res == avail_idx)
1881 		return;
1882 
1883 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1884 
1885 	/* Prefetch available ring to retrieve head indexes. */
1886 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1887 
1888 	/* Get the number of free entries in the ring */
1889 	free_entries = (avail_idx - vq->last_used_idx_res);
1890 
1891 	/* Limit to MAX_PKT_BURST. */
1892 	free_entries
1893 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1894 
1895 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1896 		dev->device_fh, free_entries);
1897 
1898 	/* Retrieve all of the head indexes first to avoid caching issues. */
1899 	for (i = 0; i < free_entries; i++)
1900 		head[i]
1901 			= vq->avail->ring[(vq->last_used_idx_res + i)
1902 			& (vq->size - 1)];
1903 
1904 	vq->last_used_idx_res += free_entries;
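	/* Mark these available entries as reserved so they are not fetched again on the next call. */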
1905 
1906 	/* Prefetch descriptor index. */
1907 	rte_prefetch0(&vq->desc[head[packet_success]]);
1908 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1909 
1910 	while (packet_success < free_entries) {
1911 		desc = &vq->desc[head[packet_success]];
1912 
1913 		/* Discard first buffer as it is the virtio header */
1914 		desc = &vq->desc[desc->next];
1915 
1916 		/* Buffer address translation. */
1917 		buff_addr = gpa_to_vva(dev, desc->addr);
1918 		/* Check an extra VLAN_HLEN bytes since a VLAN tag may be inserted. */
1919 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1920 			&addr_type);
1921 
1922 		if (likely(packet_success < (free_entries - 1)))
1923 			/* Prefetch descriptor index. */
1924 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1925 
1926 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1927 			RTE_LOG(ERR, VHOST_DATA,
1928 				"(%"PRIu64") Invalid frame buffer address found "
1929 				"when transmitting packets!\n",
1930 				dev->device_fh);
1931 			packet_success++;
1932 			continue;
1933 		}
1934 
1935 		/* Prefetch buffer address. */
1936 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1937 
1938 		/*
1939 		 * Setup dummy mbuf. This is copied to a real mbuf if
1940 		 * transmitted out the physical port.
1941 		 */
1942 		m.data_len = desc->len;
1943 		m.nb_segs = 1;
1944 		m.next = NULL;
1945 		m.data_off = 0;
1946 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1947 		m.buf_physaddr = phys_addr;
1948 
1949 		/*
1950 		 * Check if the frame buffer address from guest crosses
1951 		 * sub-region or not.
1952 		 */
1953 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1954 			RTE_LOG(ERR, VHOST_DATA,
1955 				"(%"PRIu64") Frame buffer address crossing a "
1956 				"sub-region found when attaching TX frame "
1957 				"buffer address!\n",
1958 				dev->device_fh);
1959 			need_copy = 1;
1960 		} else
1961 			need_copy = 0;
1962 
1963 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1964 
1965 		/*
1966 		 * If this is the first received packet we need to learn
1967 		 * the MAC and setup VMDQ
1968 		 */
1969 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1970 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1971 				/*
1972 				 * Discard frame if device is scheduled for
1973 				 * removal or a duplicate MAC address is found.
1974 				 */
1975 				packet_success += free_entries;
1976 				vq->last_used_idx += packet_success;
1977 				break;
1978 			}
1979 		}
1980 
1981 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1982 		packet_success++;
1983 	}
1984 }
1985 
1986 /*
1987  * This function is called by each data core. It handles all RX/TX registered
1988  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1989  * addresses are compared with all devices in the main linked list.
1990  */
1991 static int
1992 switch_worker_zcp(__attribute__((unused)) void *arg)
1993 {
1994 	struct virtio_net *dev = NULL;
1995 	struct vhost_dev  *vdev = NULL;
1996 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1997 	struct virtio_net_data_ll *dev_ll;
1998 	struct mbuf_table *tx_q;
1999 	volatile struct lcore_ll_info *lcore_ll;
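	/* TX drain period (BURST_TX_DRAIN_US microseconds) expressed in TSC cycles. */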
2000 	const uint64_t drain_tsc
2001 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2002 		* BURST_TX_DRAIN_US;
2003 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2004 	unsigned ret;
2005 	const uint16_t lcore_id = rte_lcore_id();
2006 	uint16_t count_in_ring, rx_count = 0;
2007 
2008 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2009 
2010 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2011 	prev_tsc = 0;
2012 
2013 	while (1) {
2014 		cur_tsc = rte_rdtsc();
2015 
2016 		/* TX burst queue drain */
2017 		diff_tsc = cur_tsc - prev_tsc;
2018 		if (unlikely(diff_tsc > drain_tsc)) {
2019 			/*
2020 			 * Drain any queued TX packets, then detach the transmitted
2021 			 * mbufs from vpool.pool and put them back into vpool.ring.
2022 			 */
2023 			dev_ll = lcore_ll->ll_root_used;
2024 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2025 				/* Get virtio device ID */
2026 				vdev = dev_ll->vdev;
2027 				dev = vdev->dev;
2028 
2029 				if (likely(!vdev->remove)) {
2030 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2031 					if (tx_q->len) {
2032 						LOG_DEBUG(VHOST_DATA,
2033 						"TX queue drained after timeout"
2034 						" with burst size %u\n",
2035 						tx_q->len);
2036 
2037 						/*
2038 						 * Tx any packets in the queue
2039 						 */
2040 						ret = rte_eth_tx_burst(
2041 							ports[0],
2042 							(uint16_t)tx_q->txq_id,
2043 							(struct rte_mbuf **)
2044 							tx_q->m_table,
2045 							(uint16_t)tx_q->len);
2046 						if (unlikely(ret < tx_q->len)) {
2047 							do {
2048 								rte_pktmbuf_free(
2049 									tx_q->m_table[ret]);
2050 							} while (++ret < tx_q->len);
2051 						}
2052 						tx_q->len = 0;
2053 
2054 						txmbuf_clean_zcp(dev,
2055 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2056 					}
2057 				}
2058 				dev_ll = dev_ll->next;
2059 			}
2060 			prev_tsc = cur_tsc;
2061 		}
2062 
2063 		rte_prefetch0(lcore_ll->ll_root_used);
2064 
2065 		/*
2066 		 * Inform the configuration core that we have exited the linked
2067 		 * list and that no devices are in use if requested.
2068 		 */
2069 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2070 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2071 
2072 		/* Process devices */
2073 		dev_ll = lcore_ll->ll_root_used;
2074 
2075 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2076 			vdev = dev_ll->vdev;
2077 			dev  = vdev->dev;
2078 			if (unlikely(vdev->remove)) {
2079 				dev_ll = dev_ll->next;
2080 				unlink_vmdq(vdev);
2081 				vdev->ready = DEVICE_SAFE_REMOVE;
2082 				continue;
2083 			}
2084 
2085 			if (likely(vdev->ready == DEVICE_RX)) {
2086 				uint32_t index = vdev->vmdq_rx_q;
2087 				uint16_t i;
2088 				count_in_ring
2089 				= rte_ring_count(vpool_array[index].ring);
2090 				uint16_t free_entries
2091 				= (uint16_t)get_available_ring_num_zcp(dev);
2092 
2093 				/*
2094 				 * Attach all mbufs in vpool.ring and put back
2095 				 * into vpool.pool.
2096 				 */
2097 				for (i = 0;
2098 				i < RTE_MIN(free_entries,
2099 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2100 				i++)
2101 					attach_rxmbuf_zcp(dev);
2102 
2103 				/* Handle guest RX */
2104 				rx_count = rte_eth_rx_burst(ports[0],
2105 					vdev->vmdq_rx_q, pkts_burst,
2106 					MAX_PKT_BURST);
2107 
2108 				if (rx_count) {
2109 					ret_count = virtio_dev_rx_zcp(dev,
2110 							pkts_burst, rx_count);
2111 					if (enable_stats) {
2112 						dev_statistics[dev->device_fh].rx_total
2113 							+= rx_count;
2114 						dev_statistics[dev->device_fh].rx
2115 							+= ret_count;
2116 					}
2117 					while (likely(rx_count)) {
2118 						rx_count--;
2119 						pktmbuf_detach_zcp(
2120 							pkts_burst[rx_count]);
2121 						rte_ring_sp_enqueue(
2122 							vpool_array[index].ring,
2123 							(void *)pkts_burst[rx_count]);
2124 					}
2125 				}
2126 			}
2127 
2128 			if (likely(!vdev->remove))
2129 				/* Handle guest TX */
2130 				virtio_dev_tx_zcp(dev);
2131 
2132 			/* Move to the next device in the list */
2133 			dev_ll = dev_ll->next;
2134 		}
2135 	}
2136 
2137 	return 0;
2138 }
2139 
2140 
2141 /*
2142  * Add an entry to a used linked list. A free entry must first be found
2143  * in the free linked list using get_data_ll_free_entry();
2144  */
2145 static void
2146 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2147 	struct virtio_net_data_ll *ll_dev)
2148 {
2149 	struct virtio_net_data_ll *ll = *ll_root_addr;
2150 
2151 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2152 	ll_dev->next = NULL;
2153 	rte_compiler_barrier();
2154 
2155 	/* If ll == NULL then this is the first device. */
2156 	if (ll) {
2157 		/* Increment to the tail of the linked list. */
2158 		while ((ll->next != NULL) )
2159 			ll = ll->next;
2160 
2161 		ll->next = ll_dev;
2162 	} else {
2163 		*ll_root_addr = ll_dev;
2164 	}
2165 }
2166 
2167 /*
2168  * Remove an entry from a used linked list. The entry must then be added to
2169  * the free linked list using put_data_ll_free_entry().
2170  */
2171 static void
2172 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2173 	struct virtio_net_data_ll *ll_dev,
2174 	struct virtio_net_data_ll *ll_dev_last)
2175 {
2176 	struct virtio_net_data_ll *ll = *ll_root_addr;
2177 
2178 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2179 		return;
2180 
2181 	if (ll_dev == ll)
2182 		*ll_root_addr = ll_dev->next;
2183 	else
2184 		if (likely(ll_dev_last != NULL))
2185 			ll_dev_last->next = ll_dev->next;
2186 		else
2187 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2188 }
2189 
2190 /*
2191  * Find and return an entry from the free linked list.
2192  */
2193 static struct virtio_net_data_ll *
2194 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2195 {
2196 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2197 	struct virtio_net_data_ll *ll_dev;
2198 
2199 	if (ll_free == NULL)
2200 		return NULL;
2201 
2202 	ll_dev = ll_free;
2203 	*ll_root_addr = ll_free->next;
2204 
2205 	return ll_dev;
2206 }
2207 
2208 /*
2209  * Place an entry back on to the free linked list.
2210  */
2211 static void
2212 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2213 	struct virtio_net_data_ll *ll_dev)
2214 {
2215 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2216 
2217 	if (ll_dev == NULL)
2218 		return;
2219 
2220 	ll_dev->next = ll_free;
2221 	*ll_root_addr = ll_dev;
2222 }
2223 
2224 /*
2225  * Creates a linked list of a given size.
2226  */
2227 static struct virtio_net_data_ll *
2228 alloc_data_ll(uint32_t size)
2229 {
2230 	struct virtio_net_data_ll *ll_new;
2231 	uint32_t i;
2232 
2233 	/* Malloc and then chain the linked list. */
2234 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2235 	if (ll_new == NULL) {
2236 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2237 		return NULL;
2238 	}
2239 
2240 	for (i = 0; i < size - 1; i++) {
2241 		ll_new[i].vdev = NULL;
2242 		ll_new[i].next = &ll_new[i+1];
2243 	}
2244 	ll_new[i].next = NULL;
2245 
2246 	return (ll_new);
2247 }
2248 
2249 /*
2250  * Create the main linked list along with each individual core's linked list. A used and a free list
2251  * are created to manage entries.
2252  */
2253 static int
2254 init_data_ll (void)
2255 {
2256 	int lcore;
2257 
2258 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2259 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2260 		if (lcore_info[lcore].lcore_ll == NULL) {
2261 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2262 			return -1;
2263 		}
2264 
2265 		lcore_info[lcore].lcore_ll->device_num = 0;
2266 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2267 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2268 		if (num_devices % num_switching_cores)
2269 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2270 		else
2271 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2272 	}
2273 
2274 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2275 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2276 
2277 	return 0;
2278 }
2279 
2280 /*
2281  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2282  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2283  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2284  */
2285 static void
2286 destroy_device (volatile struct virtio_net *dev)
2287 {
2288 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2289 	struct virtio_net_data_ll *ll_main_dev_cur;
2290 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2291 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2292 	struct vhost_dev *vdev;
2293 	int lcore;
2294 
2295 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2296 
2297 	vdev = (struct vhost_dev *)dev->priv;
2298 	/* Set the remove flag. */
2299 	vdev->remove = 1;
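	/* Wait for the data core to mark the device as safe to remove. */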
2300 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2301 		rte_pause();
2302 	}
2303 
2304 	/* Search for entry to be removed from lcore ll */
2305 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2306 	while (ll_lcore_dev_cur != NULL) {
2307 		if (ll_lcore_dev_cur->vdev == vdev) {
2308 			break;
2309 		} else {
2310 			ll_lcore_dev_last = ll_lcore_dev_cur;
2311 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2312 		}
2313 	}
2314 
2315 	if (ll_lcore_dev_cur == NULL) {
2316 		RTE_LOG(ERR, VHOST_CONFIG,
2317 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2318 			dev->device_fh);
2319 		return;
2320 	}
2321 
2322 	/* Search for entry to be removed from main ll */
2323 	ll_main_dev_cur = ll_root_used;
2324 	ll_main_dev_last = NULL;
2325 	while (ll_main_dev_cur != NULL) {
2326 		if (ll_main_dev_cur->vdev == vdev) {
2327 			break;
2328 		} else {
2329 			ll_main_dev_last = ll_main_dev_cur;
2330 			ll_main_dev_cur = ll_main_dev_cur->next;
2331 		}
2332 	}
2333 
2334 	/* Remove entries from the lcore and main ll. */
2335 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2336 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2337 
2338 	/* Set the dev_removal_flag on each lcore. */
2339 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2340 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2341 	}
2342 
2343 	/*
2344 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2345 	 * they can no longer access the device removed from the linked lists and that the devices
2346 	 * are no longer in use.
2347 	 */
2348 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2349 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2350 			rte_pause();
2351 		}
2352 	}
2353 
2354 	/* Add the entries back to the lcore and main free ll.*/
2355 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2356 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2357 
2358 	/* Decrement number of device on the lcore. */
2359 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2360 
2361 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2362 
2363 	if (zero_copy) {
2364 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2365 
2366 		/* Stop the RX queue. */
2367 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2368 			LOG_DEBUG(VHOST_CONFIG,
2369 				"(%"PRIu64") In destroy_device: Failed to stop "
2370 				"rx queue:%d\n",
2371 				dev->device_fh,
2372 				vdev->vmdq_rx_q);
2373 		}
2374 
2375 		LOG_DEBUG(VHOST_CONFIG,
2376 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2377 			"mempool back to ring for RX queue: %d\n",
2378 			dev->device_fh, vdev->vmdq_rx_q);
2379 
2380 		mbuf_destroy_zcp(vpool);
2381 
2382 		/* Stop the TX queue. */
2383 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2384 			LOG_DEBUG(VHOST_CONFIG,
2385 				"(%"PRIu64") In destroy_device: Failed to "
2386 				"stop tx queue:%d\n",
2387 				dev->device_fh, vdev->vmdq_rx_q);
2388 		}
2389 
2390 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2391 
2392 		LOG_DEBUG(VHOST_CONFIG,
2393 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2394 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2395 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2396 			dev->device_fh);
2397 
2398 		mbuf_destroy_zcp(vpool);
2399 		rte_free(vdev->regions_hpa);
2400 	}
2401 	rte_free(vdev);
2402 
2403 }
2404 
2405 /*
2406  * Calculate the count of physically contiguous regions for one particular
2407  * region whose vhost virtual address range is contiguous. The region
2408  * starts at vva_start, with a size of 'size' bytes.
2409  */
2410 static uint32_t
2411 check_hpa_regions(uint64_t vva_start, uint64_t size)
2412 {
2413 	uint32_t i, nregions = 0, page_size = getpagesize();
2414 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2415 	if (vva_start % page_size) {
2416 		LOG_DEBUG(VHOST_CONFIG,
2417 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2418 			"has remainder\n",
2419 			(void *)(uintptr_t)vva_start, page_size);
2420 		return 0;
2421 	}
2422 	if (size % page_size) {
2423 		LOG_DEBUG(VHOST_CONFIG,
2424 			"in check_hpa_regions: "
2425 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2426 			size, page_size);
2427 		return 0;
2428 	}
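	/* Walk the region page by page and count each point where physical contiguity breaks. */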
2429 	for (i = 0; i < size - page_size; i = i + page_size) {
2430 		cur_phys_addr
2431 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2432 		next_phys_addr = rte_mem_virt2phy(
2433 			(void *)(uintptr_t)(vva_start + i + page_size));
2434 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2435 			++nregions;
2436 			LOG_DEBUG(VHOST_CONFIG,
2437 				"in check_hpa_regions: hva addr:(%p) is not "
2438 				"continuous with hva addr:(%p), diff:%d\n",
2439 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2440 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2441 				+ page_size), page_size);
2442 			LOG_DEBUG(VHOST_CONFIG,
2443 				"in check_hpa_regions: hpa addr:(%p) is not "
2444 				"continuous with hpa addr:(%p), "
2445 				"diff:(%"PRIu64")\n",
2446 				(void *)(uintptr_t)cur_phys_addr,
2447 				(void *)(uintptr_t)next_phys_addr,
2448 				(next_phys_addr-cur_phys_addr));
2449 		}
2450 	}
2451 	return nregions;
2452 }
2453 
2454 /*
2455  * Divide each region whose vhost virtual address range is contiguous into
2456  * sub-regions such that the physical addresses within each sub-region are
2457  * contiguous, and fill the offset (to GPA), size and other information of
2458  * each sub-region into regions_hpa.
2459  */
2460 static uint32_t
2461 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2462 {
2463 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2464 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2465 
2466 	if (mem_region_hpa == NULL)
2467 		return 0;
2468 
2469 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2470 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2471 			virtio_memory->regions[regionidx].address_offset;
2472 		mem_region_hpa[regionidx_hpa].guest_phys_address
2473 			= virtio_memory->regions[regionidx].guest_phys_address;
2474 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2475 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2476 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2477 		LOG_DEBUG(VHOST_CONFIG,
2478 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2479 			regionidx_hpa,
2480 			(void *)(uintptr_t)
2481 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2482 		LOG_DEBUG(VHOST_CONFIG,
2483 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2484 			regionidx_hpa,
2485 			(void *)(uintptr_t)
2486 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
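		/*
		 * Walk the pages of this region; whenever the next page is not
		 * physically contiguous, close the current sub-region and start
		 * a new one.
		 */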
2487 		for (i = 0, k = 0;
2488 			i < virtio_memory->regions[regionidx].memory_size -
2489 				page_size;
2490 			i += page_size) {
2491 			cur_phys_addr = rte_mem_virt2phy(
2492 					(void *)(uintptr_t)(vva_start + i));
2493 			next_phys_addr = rte_mem_virt2phy(
2494 					(void *)(uintptr_t)(vva_start +
2495 					i + page_size));
2496 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2497 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2498 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2499 					k + page_size;
2500 				mem_region_hpa[regionidx_hpa].memory_size
2501 					= k + page_size;
2502 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2503 					"phys addr end  [%d]:(%p)\n",
2504 					regionidx_hpa,
2505 					(void *)(uintptr_t)
2506 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2507 				LOG_DEBUG(VHOST_CONFIG,
2508 					"in fill_hpa_regions: guest phys addr "
2509 					"size [%d]:(%p)\n",
2510 					regionidx_hpa,
2511 					(void *)(uintptr_t)
2512 					(mem_region_hpa[regionidx_hpa].memory_size));
2513 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2514 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2515 				++regionidx_hpa;
2516 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2517 					next_phys_addr -
2518 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2519 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2520 					" phys addr start[%d]:(%p)\n",
2521 					regionidx_hpa,
2522 					(void *)(uintptr_t)
2523 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2524 				LOG_DEBUG(VHOST_CONFIG,
2525 					"in fill_hpa_regions: host  phys addr "
2526 					"start[%d]:(%p)\n",
2527 					regionidx_hpa,
2528 					(void *)(uintptr_t)
2529 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2530 				k = 0;
2531 			} else {
2532 				k += page_size;
2533 			}
2534 		}
2535 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2536 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2537 			+ k + page_size;
2538 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2539 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2540 			"[%d]:(%p)\n", regionidx_hpa,
2541 			(void *)(uintptr_t)
2542 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2543 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2544 			"[%d]:(%p)\n", regionidx_hpa,
2545 			(void *)(uintptr_t)
2546 			(mem_region_hpa[regionidx_hpa].memory_size));
2547 		++regionidx_hpa;
2548 	}
2549 	return regionidx_hpa;
2550 }
2551 
2552 /*
2553  * A new device is added to a data core. First the device is added to the main linked list
2554  * and then allocated to a specific data core.
2555  */
2556 static int
2557 new_device (struct virtio_net *dev)
2558 {
2559 	struct virtio_net_data_ll *ll_dev;
2560 	int lcore, core_add = 0;
2561 	uint32_t device_num_min = num_devices;
2562 	struct vhost_dev *vdev;
2563 	uint32_t regionidx;
2564 
2565 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2566 	if (vdev == NULL) {
2567 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2568 			dev->device_fh);
2569 		return -1;
2570 	}
2571 	vdev->dev = dev;
2572 	dev->priv = vdev;
2573 
2574 	if (zero_copy) {
2575 		vdev->nregions_hpa = dev->mem->nregions;
2576 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2577 			vdev->nregions_hpa
2578 				+= check_hpa_regions(
2579 					dev->mem->regions[regionidx].guest_phys_address
2580 					+ dev->mem->regions[regionidx].address_offset,
2581 					dev->mem->regions[regionidx].memory_size);
2582 
2583 		}
2584 
2585 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2586 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2587 			CACHE_LINE_SIZE);
2588 		if (vdev->regions_hpa == NULL) {
2589 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2590 			rte_free(vdev);
2591 			return -1;
2592 		}
2593 
2594 
2595 		if (fill_hpa_memory_regions(
2596 			vdev->regions_hpa, dev->mem
2597 			) != vdev->nregions_hpa) {
2598 
2599 			RTE_LOG(ERR, VHOST_CONFIG,
2600 				"hpa memory regions number mismatch: "
2601 				"[%d]\n", vdev->nregions_hpa);
2602 			rte_free(vdev->regions_hpa);
2603 			rte_free(vdev);
2604 			return -1;
2605 		}
2606 	}
2607 
2608 
2609 	/* Add device to main ll */
2610 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2611 	if (ll_dev == NULL) {
2612 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2613 			"of %d devices per core has been reached\n",
2614 			dev->device_fh, num_devices);
2615 		if (vdev->regions_hpa)
2616 			rte_free(vdev->regions_hpa);
2617 		rte_free(vdev);
2618 		return -1;
2619 	}
2620 	ll_dev->vdev = vdev;
2621 	add_data_ll_entry(&ll_root_used, ll_dev);
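	/* Each device gets a dedicated VMDQ RX queue derived from its device file handle. */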
2622 	vdev->vmdq_rx_q
2623 		= dev->device_fh * (num_queues / num_devices);
2624 
2625 	if (zero_copy) {
2626 		uint32_t index = vdev->vmdq_rx_q;
2627 		uint32_t count_in_ring, i;
2628 		struct mbuf_table *tx_q;
2629 
2630 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2631 
2632 		LOG_DEBUG(VHOST_CONFIG,
2633 			"(%"PRIu64") in new_device: mbuf count in mempool "
2634 			"before attach is: %d\n",
2635 			dev->device_fh,
2636 			rte_mempool_count(vpool_array[index].pool));
2637 		LOG_DEBUG(VHOST_CONFIG,
2638 			"(%"PRIu64") in new_device: mbuf count in ring "
2639 			"before attach is: %d\n",
2640 			dev->device_fh, count_in_ring);
2641 
2642 		/*
2643 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2644 		 */
2645 		for (i = 0; i < count_in_ring; i++)
2646 			attach_rxmbuf_zcp(dev);
2647 
2648 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2649 			"mempool after attach is: %d\n",
2650 			dev->device_fh,
2651 			rte_mempool_count(vpool_array[index].pool));
2652 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2653 			"ring after attach is: %d\n",
2654 			dev->device_fh,
2655 			rte_ring_count(vpool_array[index].ring));
2656 
2657 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2658 		tx_q->txq_id = vdev->vmdq_rx_q;
2659 
2660 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2661 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2662 
2663 			LOG_DEBUG(VHOST_CONFIG,
2664 				"(%"PRIu64") In new_device: Failed to start "
2665 				"tx queue:%d\n",
2666 				dev->device_fh, vdev->vmdq_rx_q);
2667 
2668 			mbuf_destroy_zcp(vpool);
2669 			rte_free(vdev->regions_hpa);
2670 			rte_free(vdev);
2671 			return -1;
2672 		}
2673 
2674 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2675 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2676 
2677 			LOG_DEBUG(VHOST_CONFIG,
2678 				"(%"PRIu64") In new_device: Failed to start "
2679 				"rx queue:%d\n",
2680 				dev->device_fh, vdev->vmdq_rx_q);
2681 
2682 			/* Stop the TX queue. */
2683 			if (rte_eth_dev_tx_queue_stop(ports[0],
2684 				vdev->vmdq_rx_q) != 0) {
2685 				LOG_DEBUG(VHOST_CONFIG,
2686 					"(%"PRIu64") In new_device: Failed to "
2687 					"stop tx queue:%d\n",
2688 					dev->device_fh, vdev->vmdq_rx_q);
2689 			}
2690 
2691 			mbuf_destroy_zcp(vpool);
2692 			rte_free(vdev->regions_hpa);
2693 			rte_free(vdev);
2694 			return -1;
2695 		}
2696 
2697 	}
2698 
2699 	/* Reset the ready flag. */
2700 	vdev->ready = DEVICE_MAC_LEARNING;
2701 	vdev->remove = 0;
2702 
2703 	/* Find a suitable lcore to add the device. */
2704 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2705 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2706 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2707 			core_add = lcore;
2708 		}
2709 	}
2710 	/* Add device to lcore ll */
2711 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2712 	if (ll_dev == NULL) {
2713 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2714 		vdev->ready = DEVICE_SAFE_REMOVE;
2715 		destroy_device(dev);
2716 		if (vdev->regions_hpa)
2717 			rte_free(vdev->regions_hpa);
2718 		rte_free(vdev);
2719 		return -1;
2720 	}
2721 	ll_dev->vdev = vdev;
2722 	vdev->coreid = core_add;
2723 
2724 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2725 
2726 	/* Initialize device stats */
2727 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2728 
2729 	/* Disable notifications. */
2730 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2731 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2732 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2733 	dev->flags |= VIRTIO_DEV_RUNNING;
2734 
2735 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2736 
2737 	return 0;
2738 }
2739 
2740 /*
2741  * These callbacks allow devices to be added to the data core when configuration
2742  * has fully completed.
2743  */
2744 static const struct virtio_net_device_ops virtio_net_device_ops =
2745 {
2746 	.new_device =  new_device,
2747 	.destroy_device = destroy_device,
2748 };
2749 
2750 /*
2751  * This thread wakes up periodically to print statistics if the user has
2752  * enabled them.
2753  */
2754 static void
2755 print_stats(void)
2756 {
2757 	struct virtio_net_data_ll *dev_ll;
2758 	uint64_t tx_dropped, rx_dropped;
2759 	uint64_t tx, tx_total, rx, rx_total;
2760 	uint32_t device_fh;
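	/* ANSI escape sequences used to clear the screen and move the cursor to the top-left corner. */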
2761 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2762 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2763 
2764 	while(1) {
2765 		sleep(enable_stats);
2766 
2767 		/* Clear screen and move to top left */
2768 		printf("%s%s", clr, top_left);
2769 
2770 		printf("\nDevice statistics ====================================");
2771 
2772 		dev_ll = ll_root_used;
2773 		while (dev_ll != NULL) {
2774 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2775 			tx_total = dev_statistics[device_fh].tx_total;
2776 			tx = dev_statistics[device_fh].tx;
2777 			tx_dropped = tx_total - tx;
2778 			if (zero_copy == 0) {
2779 				rx_total = rte_atomic64_read(
2780 					&dev_statistics[device_fh].rx_total_atomic);
2781 				rx = rte_atomic64_read(
2782 					&dev_statistics[device_fh].rx_atomic);
2783 			} else {
2784 				rx_total = dev_statistics[device_fh].rx_total;
2785 				rx = dev_statistics[device_fh].rx;
2786 			}
2787 			rx_dropped = rx_total - rx;
2788 
2789 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2790 					"\nTX total: 		%"PRIu64""
2791 					"\nTX dropped: 		%"PRIu64""
2792 					"\nTX successful: 		%"PRIu64""
2793 					"\nRX total: 		%"PRIu64""
2794 					"\nRX dropped: 		%"PRIu64""
2795 					"\nRX successful: 		%"PRIu64"",
2796 					device_fh,
2797 					tx_total,
2798 					tx_dropped,
2799 					tx,
2800 					rx_total,
2801 					rx_dropped,
2802 					rx);
2803 
2804 			dev_ll = dev_ll->next;
2805 		}
2806 		printf("\n======================================================\n");
2807 	}
2808 }
2809 
2810 static void
2811 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2812 	char *ring_name, uint32_t nb_mbuf)
2813 {
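	/* Zero-copy pools: the data room holds the descriptor length plus headroom, with no per-lcore cache (MBUF_CACHE_SIZE_ZCP is 0). */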
2814 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2815 	vpool_array[index].pool
2816 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2817 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2818 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2819 		rte_pktmbuf_init, NULL, socket, 0);
2820 	if (vpool_array[index].pool != NULL) {
2821 		vpool_array[index].ring
2822 			= rte_ring_create(ring_name,
2823 				rte_align32pow2(nb_mbuf + 1),
2824 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2825 		if (likely(vpool_array[index].ring != NULL)) {
2826 			LOG_DEBUG(VHOST_CONFIG,
2827 				"in setup_mempool_tbl: mbuf count in "
2828 				"mempool is: %d\n",
2829 				rte_mempool_count(vpool_array[index].pool));
2830 			LOG_DEBUG(VHOST_CONFIG,
2831 				"in setup_mempool_tbl: mbuf count in "
2832 				"ring   is: %d\n",
2833 				rte_ring_count(vpool_array[index].ring));
2834 		} else {
2835 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2836 				ring_name);
2837 		}
2838 
2839 		/* Need consider head room. */
2840 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2841 	} else {
2842 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2843 	}
2844 }
2845 
2846 
2847 /*
2848  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2849  * device is also registered here to handle the IOCTLs.
2850  */
2851 int
2852 MAIN(int argc, char *argv[])
2853 {
2854 	struct rte_mempool *mbuf_pool = NULL;
2855 	unsigned lcore_id, core_id = 0;
2856 	unsigned nb_ports, valid_num_ports;
2857 	int ret;
2858 	uint8_t portid, queue_id = 0;
2859 	static pthread_t tid;
2860 
2861 	/* init EAL */
2862 	ret = rte_eal_init(argc, argv);
2863 	if (ret < 0)
2864 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2865 	argc -= ret;
2866 	argv += ret;
2867 
2868 	/* parse app arguments */
2869 	ret = us_vhost_parse_args(argc, argv);
2870 	if (ret < 0)
2871 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2872 
2873 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2874 		if (rte_lcore_is_enabled(lcore_id))
2875 			lcore_ids[core_id ++] = lcore_id;
2876 
2877 	if (rte_lcore_count() > RTE_MAX_LCORE)
2878 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2879 
2880 	/* Set the number of switching cores available. */
2881 	num_switching_cores = rte_lcore_count()-1;
2882 
2883 	/* Get the number of physical ports. */
2884 	nb_ports = rte_eth_dev_count();
2885 	if (nb_ports > RTE_MAX_ETHPORTS)
2886 		nb_ports = RTE_MAX_ETHPORTS;
2887 
2888 	/*
2889 	 * Update the global var NUM_PORTS and the global array PORTS,
2890 	 * and get the value of VALID_NUM_PORTS according to the number of system ports.
2891 	 */
2892 	valid_num_ports = check_ports_num(nb_ports);
2893 
2894 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2895 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2896 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2897 		return -1;
2898 	}
2899 
2900 	if (zero_copy == 0) {
2901 		/* Create the mbuf pool. */
2902 		mbuf_pool = rte_mempool_create(
2903 				"MBUF_POOL",
2904 				NUM_MBUFS_PER_PORT
2905 				* valid_num_ports,
2906 				MBUF_SIZE, MBUF_CACHE_SIZE,
2907 				sizeof(struct rte_pktmbuf_pool_private),
2908 				rte_pktmbuf_pool_init, NULL,
2909 				rte_pktmbuf_init, NULL,
2910 				rte_socket_id(), 0);
2911 		if (mbuf_pool == NULL)
2912 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2913 
2914 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2915 			vpool_array[queue_id].pool = mbuf_pool;
2916 
2917 		if (vm2vm_mode == VM2VM_HARDWARE) {
2918 			/* Enable VT loop back to let L2 switch to do it. */
2919 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2920 			LOG_DEBUG(VHOST_CONFIG,
2921 				"Enable loop back for L2 switch in vmdq.\n");
2922 		}
2923 	} else {
2924 		uint32_t nb_mbuf;
2925 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2926 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2927 
2928 		/*
2929 		 * Zero copy defers queue RX/TX start to the time when guest
2930 		 * finishes its startup and packet buffers from that guest are
2931 		 * available.
2932 		 */
2933 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2934 		rx_conf_default.rx_drop_en = 0;
2935 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
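		/* Size each per-queue RX pool for the RX descriptors plus per-core burst and cache headroom. */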
2936 		nb_mbuf = num_rx_descriptor
2937 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2938 			+ num_switching_cores * MAX_PKT_BURST;
2939 
2940 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2941 			snprintf(pool_name, sizeof(pool_name),
2942 				"rxmbuf_pool_%u", queue_id);
2943 			snprintf(ring_name, sizeof(ring_name),
2944 				"rxmbuf_ring_%u", queue_id);
2945 			setup_mempool_tbl(rte_socket_id(), queue_id,
2946 				pool_name, ring_name, nb_mbuf);
2947 		}
2948 
2949 		nb_mbuf = num_tx_descriptor
2950 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951 				+ num_switching_cores * MAX_PKT_BURST;
2952 
2953 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954 			snprintf(pool_name, sizeof(pool_name),
2955 				"txmbuf_pool_%u", queue_id);
2956 			snprintf(ring_name, sizeof(ring_name),
2957 				"txmbuf_ring_%u", queue_id);
2958 			setup_mempool_tbl(rte_socket_id(),
2959 				(queue_id + MAX_QUEUES),
2960 				pool_name, ring_name, nb_mbuf);
2961 		}
2962 
2963 		if (vm2vm_mode == VM2VM_HARDWARE) {
2964 			/* Enable VT loop back to let L2 switch to do it. */
2965 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2966 			LOG_DEBUG(VHOST_CONFIG,
2967 				"Enable loop back for L2 switch in vmdq.\n");
2968 		}
2969 	}
2970 	/* Set log level. */
2971 	rte_set_log_level(LOG_LEVEL);
2972 
2973 	/* initialize all ports */
2974 	for (portid = 0; portid < nb_ports; portid++) {
2975 		/* skip ports that are not enabled */
2976 		if ((enabled_port_mask & (1 << portid)) == 0) {
2977 			RTE_LOG(INFO, VHOST_PORT,
2978 				"Skipping disabled port %d\n", portid);
2979 			continue;
2980 		}
2981 		if (port_init(portid) != 0)
2982 			rte_exit(EXIT_FAILURE,
2983 				"Cannot initialize network ports\n");
2984 	}
2985 
2986 	/* Initialise all linked lists. */
2987 	if (init_data_ll() == -1)
2988 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2989 
2990 	/* Initialize device stats */
2991 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2992 
2993 	/* Enable stats if the user option is set. */
2994 	if (enable_stats)
2995 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2996 
2997 	/* Launch all data cores. */
2998 	if (zero_copy == 0) {
2999 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3000 			rte_eal_remote_launch(switch_worker,
3001 				mbuf_pool, lcore_id);
3002 		}
3003 	} else {
3004 		uint32_t count_in_mempool, index, i;
3005 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3006 			/* For all RX and TX queues. */
3007 			count_in_mempool
3008 				= rte_mempool_count(vpool_array[index].pool);
3009 
3010 			/*
3011 			 * Transfer all un-attached mbufs from vpool.pool
3012 			 * to vpool.ring.
3013 			 */
3014 			for (i = 0; i < count_in_mempool; i++) {
3015 				struct rte_mbuf *mbuf
3016 					= __rte_mbuf_raw_alloc(
3017 						vpool_array[index].pool);
3018 				rte_ring_sp_enqueue(vpool_array[index].ring,
3019 						(void *)mbuf);
3020 			}
3021 
3022 			LOG_DEBUG(VHOST_CONFIG,
3023 				"in MAIN: mbuf count in mempool at initial "
3024 				"is: %d\n", count_in_mempool);
3025 			LOG_DEBUG(VHOST_CONFIG,
3026 				"in MAIN: mbuf count in ring at initial is:"
3027 				" %d\n",
3028 				rte_ring_count(vpool_array[index].ring));
3029 		}
3030 
3031 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3032 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3033 				lcore_id);
3034 	}
3035 
3036 	if (mergeable == 0)
3037 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3038 
3039 	/* Register CUSE device to handle IOCTLs. */
3040 	ret = rte_vhost_driver_register((char *)&dev_basename);
3041 	if (ret != 0)
3042 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3043 
3044 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3045 
3046 	/* Start CUSE session. */
3047 	rte_vhost_driver_session_start();
3048 	return 0;
3049 
3050 }
3051 
3052