xref: /dpdk/examples/vhost/main.c (revision 72ec8d77ac68cdc2b74428e25e06765fbb10d0cb)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
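/*
 * For example, with MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024 and,
 * say, num_switching_cores = 2, this works out to
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs for the port.
 */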
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated by the host are required for the zero copy
74  * implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
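/*
 * switch_worker() converts BURST_TX_DRAIN_US into TSC cycles; e.g. with a
 * 2 GHz TSC (just an example clock rate) that is roughly 200,000 cycles
 * between forced drains of a partially filled TX burst.
 */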
103 
104 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
106 
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108 
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX			1
112 #define DEVICE_SAFE_REMOVE	2
113 
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117 
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121 
122 /*
123  * These two macros need refining for the legacy and DPDK-based front ends:
124  * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
125  * then adjust to a power of 2.
126  */
127 /*
128  * For the legacy front end there are 128 descriptors:
129  * half for the virtio headers, the other half for the mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133 
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 		+ sizeof(struct rte_mbuf)))
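/*
 * The zero copy path stashes the vring descriptor index in these 4 bytes of
 * headroom (see attach_rxmbuf_zcp()/virtio_dev_rx_zcp()) so that the
 * descriptor can later be returned to the used ring.
 */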
137 
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140 
141 #define INVALID_PORT_ID 0xFF
142 
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145 
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148 
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151 
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154 
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
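/*
 * ether_addr_cmp() loads each 6-byte address as a 64-bit word and masks it
 * with this value, so only the low 48 bits (the MAC itself) take part in
 * the comparison.
 */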
157 
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
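/*
 * With the usual 64-byte cache line and a 16-byte struct vring_desc this
 * evaluates to 4 descriptors per cache line.
 */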
160 
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163 
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166 
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170 
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to/from the HW descriptors.
173  * Disabled by default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177 
178 /* number of RX/TX descriptors to use */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
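/*
 * Layout note: the first MAX_QUEUES entries hold the RX mempools handed to
 * the NIC queues in port_init(); the second MAX_QUEUES entries hold the
 * pools used on the TX side in virtio_tx_route_zcp()
 * (vpool_array[MAX_QUEUES + vmdq_rx_q]).
 */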
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223 	.rx_thresh = {
224 		.pthresh = RX_PTHRESH,
225 		.hthresh = RX_HTHRESH,
226 		.wthresh = RX_WTHRESH,
227 	},
228 	.rx_drop_en = 1,
229 };
230 
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237 	.tx_thresh = {
238 		.pthresh = TX_PTHRESH,
239 		.hthresh = TX_HTHRESH,
240 		.wthresh = TX_WTHRESH,
241 	},
242 	.tx_free_thresh = 0, /* Use PMD default values */
243 	.tx_rs_thresh = 0, /* Use PMD default values */
244 };
245 
246 /* empty vmdq configuration structure. Filled in programmatically */
247 static struct rte_eth_conf vmdq_conf_default = {
248 	.rxmode = {
249 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250 		.split_hdr_size = 0,
251 		.header_split   = 0, /**< Header Split disabled */
252 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
253 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
254 		/*
255 		 * This is necessary for 1G NICs such as the I350;
256 		 * it fixes a bug where IPv4 forwarding in the guest could not
257 		 * forward packets from one virtio dev to another virtio dev.
258 		 */
259 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
262 	},
263 
264 	.txmode = {
265 		.mq_mode = ETH_MQ_TX_NONE,
266 	},
267 	.rx_adv_conf = {
268 		/*
269 		 * should be overridden separately in code with
270 		 * appropriate values
271 		 */
272 		.vmdq_rx_conf = {
273 			.nb_queue_pools = ETH_8_POOLS,
274 			.enable_default_pool = 0,
275 			.default_pool = 0,
276 			.nb_pool_maps = 0,
277 			.pool_map = {{0, 0},},
278 		},
279 	},
280 };
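/*
 * get_eth_conf() below copies this template and then overwrites
 * rx_adv_conf.vmdq_rx_conf with the real pool count and VLAN-to-pool map
 * for the number of devices discovered at run time.
 */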
281 
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
285 
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
290 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
297 
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300 
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304 
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307 
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310 	unsigned len;
311 	unsigned txq_id;
312 	struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314 
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317 
318 /* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320 
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323 	unsigned char   h_dest[ETH_ALEN];
324 	unsigned char   h_source[ETH_ALEN];
325 	__be16          h_vlan_proto;
326 	__be16          h_vlan_TCI;
327 	__be16          h_vlan_encapsulated_proto;
328 };
329 
330 /* IPv4 Header */
331 struct ipv4_hdr {
332 	uint8_t  version_ihl;		/**< version and header length */
333 	uint8_t  type_of_service;	/**< type of service */
334 	uint16_t total_length;		/**< length of packet */
335 	uint16_t packet_id;		/**< packet ID */
336 	uint16_t fragment_offset;	/**< fragmentation offset */
337 	uint8_t  time_to_live;		/**< time to live */
338 	uint8_t  next_proto_id;		/**< protocol ID */
339 	uint16_t hdr_checksum;		/**< header checksum */
340 	uint32_t src_addr;		/**< source address */
341 	uint32_t dst_addr;		/**< destination address */
342 } __attribute__((__packed__));
343 
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347 
348 /* Per-device statistics struct */
349 struct device_statistics {
350 	uint64_t tx_total;
351 	rte_atomic64_t rx_total_atomic;
352 	uint64_t rx_total;
353 	uint64_t tx;
354 	rte_atomic64_t rx_atomic;
355 	uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358 
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366 	struct rte_eth_vmdq_rx_conf conf;
367 	unsigned i;
368 
369 	memset(&conf, 0, sizeof(conf));
370 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371 	conf.nb_pool_maps = num_devices;
372 	conf.enable_loop_back =
373 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374 
375 	for (i = 0; i < conf.nb_pool_maps; i++) {
376 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
377 		conf.pool_map[i].pools = (1UL << i);
378 	}
379 
380 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383 	return 0;
384 }
385 
386 /*
387  * Validate the device number according to the max pool number obtained from
388  * dev_info. If the device number is invalid, give the error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394 	if (num_devices > max_nb_devices) {
395 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396 		return -1;
397 	}
398 	return 0;
399 }
400 
401 /*
402  * Initialises a given port using global settings and with the rx buffers
403  * coming from the mbuf_pool passed as parameter
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408 	struct rte_eth_dev_info dev_info;
409 	struct rte_eth_conf port_conf;
410 	uint16_t rx_rings, tx_rings;
411 	uint16_t rx_ring_size, tx_ring_size;
412 	int retval;
413 	uint16_t q;
414 
415 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
416 	rte_eth_dev_info_get (port, &dev_info);
417 
418 	/*configure the number of supported virtio devices based on VMDQ limits */
419 	num_devices = dev_info.max_vmdq_pools;
420 	num_queues = dev_info.max_rx_queues;
421 
422 	if (zero_copy) {
423 		rx_ring_size = num_rx_descriptor;
424 		tx_ring_size = num_tx_descriptor;
425 		tx_rings = dev_info.max_tx_queues;
426 	} else {
427 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429 		tx_rings = (uint16_t)rte_lcore_count();
430 	}
431 
432 	retval = validate_num_devices(MAX_DEVICES);
433 	if (retval < 0)
434 		return retval;
435 
436 	/* Get port configuration. */
437 	retval = get_eth_conf(&port_conf, num_devices);
438 	if (retval < 0)
439 		return retval;
440 
441 	if (port >= rte_eth_dev_count()) return -1;
442 
443 	rx_rings = (uint16_t)num_queues;
444 	/* Configure ethernet device. */
445 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446 	if (retval != 0)
447 		return retval;
448 
449 	/* Setup the queues. */
450 	for (q = 0; q < rx_rings; q ++) {
451 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 						rte_eth_dev_socket_id(port), &rx_conf_default,
453 						vpool_array[q].pool);
454 		if (retval < 0)
455 			return retval;
456 	}
457 	for (q = 0; q < tx_rings; q ++) {
458 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459 						rte_eth_dev_socket_id(port), &tx_conf_default);
460 		if (retval < 0)
461 			return retval;
462 	}
463 
464 	/* Start the device. */
465 	retval  = rte_eth_dev_start(port);
466 	if (retval < 0) {
467 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468 		return retval;
469 	}
470 
471 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475 			(unsigned)port,
476 			vmdq_ports_eth_addr[port].addr_bytes[0],
477 			vmdq_ports_eth_addr[port].addr_bytes[1],
478 			vmdq_ports_eth_addr[port].addr_bytes[2],
479 			vmdq_ports_eth_addr[port].addr_bytes[3],
480 			vmdq_ports_eth_addr[port].addr_bytes[4],
481 			vmdq_ports_eth_addr[port].addr_bytes[5]);
482 
483 	return 0;
484 }
485 
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492 	/* Reject basenames that would not fit (strnlen() is capped at MAX_BASENAME_SZ). */
493 
494 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
495 		return -1;
496 	else
497 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498 
499 	return 0;
500 }
501 
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508 	char *end = NULL;
509 	unsigned long pm;
510 
511 	errno = 0;
512 
513 	/* parse hexadecimal string */
514 	pm = strtoul(portmask, &end, 16);
515 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516 		return -1;
517 
518 	if (pm == 0)
519 		return -1;
520 
521 	return pm;
522 
523 }
524 
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531 	char *end = NULL;
532 	unsigned long num;
533 
534 	errno = 0;
535 
536 	/* parse unsigned int string */
537 	num = strtoul(q_arg, &end, 10);
538 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539 		return -1;
540 
541 	if (num > max_valid_value)
542 		return -1;
543 
544 	return num;
545 
546 }
547 
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555 	"		--vm2vm [0|1|2]\n"
556 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557 	"		--dev-basename <name>\n"
558 	"		--nb-devices ND\n"
559 	"		-p PORTMASK: Set mask for ports to be used by application\n"
560 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
562 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
563 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
564 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566 	"		--dev-basename: The basename to be used for the character device.\n"
567 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
568 			"zero copy\n"
569 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
570 			"used only when zero copy is enabled.\n"
571 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
572 			"used only when zero copy is enabled.\n",
573 	       prgname);
574 }
575 
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582 	int opt, ret;
583 	int option_index;
584 	unsigned i;
585 	const char *prgname = argv[0];
586 	static struct option long_option[] = {
587 		{"vm2vm", required_argument, NULL, 0},
588 		{"rx-retry", required_argument, NULL, 0},
589 		{"rx-retry-delay", required_argument, NULL, 0},
590 		{"rx-retry-num", required_argument, NULL, 0},
591 		{"mergeable", required_argument, NULL, 0},
592 		{"stats", required_argument, NULL, 0},
593 		{"dev-basename", required_argument, NULL, 0},
594 		{"zero-copy", required_argument, NULL, 0},
595 		{"rx-desc-num", required_argument, NULL, 0},
596 		{"tx-desc-num", required_argument, NULL, 0},
597 		{NULL, 0, 0, 0},
598 	};
599 
600 	/* Parse command line */
601 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
602 		switch (opt) {
603 		/* Portmask */
604 		case 'p':
605 			enabled_port_mask = parse_portmask(optarg);
606 			if (enabled_port_mask == 0) {
607 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 				us_vhost_usage(prgname);
609 				return -1;
610 			}
611 			break;
612 
613 		case 0:
614 			/* Enable/disable vm2vm comms. */
615 			if (!strncmp(long_option[option_index].name, "vm2vm",
616 				MAX_LONG_OPT_SZ)) {
617 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618 				if (ret == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG,
620 						"Invalid argument for "
621 						"vm2vm [0|1|2]\n");
622 					us_vhost_usage(prgname);
623 					return -1;
624 				} else {
625 					vm2vm_mode = (vm2vm_type)ret;
626 				}
627 			}
628 
629 			/* Enable/disable retries on RX. */
630 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631 				ret = parse_num_opt(optarg, 1);
632 				if (ret == -1) {
633 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					enable_retry = ret;
638 				}
639 			}
640 
641 			/* Specify the retries delay time (in useconds) on RX. */
642 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643 				ret = parse_num_opt(optarg, INT32_MAX);
644 				if (ret == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646 					us_vhost_usage(prgname);
647 					return -1;
648 				} else {
649 					burst_rx_delay_time = ret;
650 				}
651 			}
652 
653 			/* Specify the retries number on RX. */
654 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655 				ret = parse_num_opt(optarg, INT32_MAX);
656 				if (ret == -1) {
657 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658 					us_vhost_usage(prgname);
659 					return -1;
660 				} else {
661 					burst_rx_retry_num = ret;
662 				}
663 			}
664 
665 			/* Enable/disable RX mergeable buffers. */
666 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667 				ret = parse_num_opt(optarg, 1);
668 				if (ret == -1) {
669 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670 					us_vhost_usage(prgname);
671 					return -1;
672 				} else {
673 					mergeable = !!ret;
674 					if (ret) {
675 						vmdq_conf_default.rxmode.jumbo_frame = 1;
676 						vmdq_conf_default.rxmode.max_rx_pkt_len
677 							= JUMBO_FRAME_MAX_SIZE;
678 					}
679 				}
680 			}
681 
682 			/* Enable/disable stats. */
683 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684 				ret = parse_num_opt(optarg, INT32_MAX);
685 				if (ret == -1) {
686 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687 					us_vhost_usage(prgname);
688 					return -1;
689 				} else {
690 					enable_stats = ret;
691 				}
692 			}
693 
694 			/* Set character device basename. */
695 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696 				if (us_vhost_parse_basename(optarg) == -1) {
697 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698 					us_vhost_usage(prgname);
699 					return -1;
700 				}
701 			}
702 
703 			/* Enable/disable rx/tx zero copy. */
704 			if (!strncmp(long_option[option_index].name,
705 				"zero-copy", MAX_LONG_OPT_SZ)) {
706 				ret = parse_num_opt(optarg, 1);
707 				if (ret == -1) {
708 					RTE_LOG(INFO, VHOST_CONFIG,
709 						"Invalid argument"
710 						" for zero-copy [0|1]\n");
711 					us_vhost_usage(prgname);
712 					return -1;
713 				} else
714 					zero_copy = ret;
715 
716 				if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
719 					"zero copy vhost APP, please "
720 					"disable RTE_MBUF_REFCNT\n"
721 					"in config file and then rebuild DPDK "
722 					"core lib!\n"
723 					"Otherwise please disable zero copy "
724 					"flag in command line!\n");
725 					return -1;
726 #endif
727 				}
728 			}
729 
730 			/* Specify the descriptor number on RX. */
731 			if (!strncmp(long_option[option_index].name,
732 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
733 				ret = parse_num_opt(optarg, MAX_RING_DESC);
734 				if ((ret == -1) || (!POWEROF2(ret))) {
735 					RTE_LOG(INFO, VHOST_CONFIG,
736 					"Invalid argument for rx-desc-num[0-N],"
737 					"power of 2 required.\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					num_rx_descriptor = ret;
742 				}
743 			}
744 
745 			/* Specify the descriptor number on TX. */
746 			if (!strncmp(long_option[option_index].name,
747 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
748 				ret = parse_num_opt(optarg, MAX_RING_DESC);
749 				if ((ret == -1) || (!POWEROF2(ret))) {
750 					RTE_LOG(INFO, VHOST_CONFIG,
751 					"Invalid argument for tx-desc-num [0-N],"
752 					"power of 2 required.\n");
753 					us_vhost_usage(prgname);
754 					return -1;
755 				} else {
756 					num_tx_descriptor = ret;
757 				}
758 			}
759 
760 			break;
761 
762 			/* Invalid option - print options. */
763 		default:
764 			us_vhost_usage(prgname);
765 			return -1;
766 		}
767 	}
768 
769 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770 		if (enabled_port_mask & (1 << i))
771 			ports[num_ports++] = (uint8_t)i;
772 	}
773 
774 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
776 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
777 		return -1;
778 	}
779 
780 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781 		RTE_LOG(INFO, VHOST_PORT,
782 			"Vhost zero copy doesn't support software vm2vm,"
783 			" please specify 'vm2vm 2' to use hardware vm2vm.\n");
784 		return -1;
785 	}
786 
787 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788 		RTE_LOG(INFO, VHOST_PORT,
789 			"Vhost zero copy doesn't support jumbo frame,"
790 			" please specify '--mergeable 0' to disable the "
791 			"mergeable feature.\n");
792 		return -1;
793 	}
794 
795 	return 0;
796 }
797 
798 /*
799  * Update the global variable num_ports and the ports[] array according to the number
800  * of system ports and return the number of valid ports.
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804 	unsigned valid_num_ports = num_ports;
805 	unsigned portid;
806 
807 	if (num_ports > nb_ports) {
808 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809 			num_ports, nb_ports);
810 		num_ports = nb_ports;
811 	}
812 
813 	for (portid = 0; portid < num_ports; portid ++) {
814 		if (ports[portid] >= nb_ports) {
815 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816 				ports[portid], (nb_ports - 1));
817 			ports[portid] = INVALID_PORT_ID;
818 			valid_num_ports--;
819 		}
820 	}
821 	return valid_num_ports;
822 }
823 
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not affected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {																\
830 	char *pkt_addr = (char*)(addr);																					\
831 	unsigned int index;																								\
832 	char packet[MAX_PRINT_BUFF];																					\
833 																													\
834 	if ((header))																									\
835 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
836 	else																											\
837 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
838 	for (index = 0; index < (size); index++) {																		\
839 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
840 			"%02hhx ", pkt_addr[index]);																			\
841 	}																												\
842 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
843 																													\
844 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
845 } while(0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
848 #endif
849 
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856 	uint32_t buf_len, hpa_type *addr_type)
857 {
858 	struct virtio_memory_regions_hpa *region;
859 	uint32_t regionidx;
860 	uint64_t vhost_pa = 0;
861 
862 	*addr_type = PHYS_ADDR_INVALID;
863 
864 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865 		region = &vdev->regions_hpa[regionidx];
866 		if ((guest_pa >= region->guest_phys_address) &&
867 			(guest_pa <= region->guest_phys_address_end)) {
868 			vhost_pa = region->host_phys_addr_offset + guest_pa;
869 			if (likely((guest_pa + buf_len - 1)
870 				<= region->guest_phys_address_end))
871 				*addr_type = PHYS_ADDR_CONTINUOUS;
872 			else
873 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
874 			break;
875 		}
876 	}
877 
878 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880 		(void *)(uintptr_t)vhost_pa);
881 
882 	return vhost_pa;
883 }
884 
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
891 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
893 
894 /*
895  * This function learns the MAC address of the device and registers this along with a
896  * vlan tag to a VMDQ.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901 	struct ether_hdr *pkt_hdr;
902 	struct virtio_net_data_ll *dev_ll;
903 	struct virtio_net *dev = vdev->dev;
904 	int i, ret;
905 
906 	/* Learn MAC address of guest device from packet */
907 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908 
909 	dev_ll = ll_root_used;
910 
911 	while (dev_ll != NULL) {
912 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914 			return -1;
915 		}
916 		dev_ll = dev_ll->next;
917 	}
918 
919 	for (i = 0; i < ETHER_ADDR_LEN; i++)
920 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921 
922 	/* vlan_tag currently uses the device_id. */
923 	vdev->vlan_tag = vlan_tags[dev->device_fh];
924 
925 	/* Print out VMDQ registration info. */
926 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927 		dev->device_fh,
928 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931 		vdev->vlan_tag);
932 
933 	/* Register the MAC address. */
934 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935 	if (ret)
936 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937 					dev->device_fh);
938 
939 	/* Enable stripping of the vlan tag as we handle routing. */
940 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941 
942 	/* Set device as ready for RX. */
943 	vdev->ready = DEVICE_RX;
944 
945 	return 0;
946 }
947 
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955 	unsigned i = 0;
956 	unsigned rx_count;
957 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958 
959 	if (vdev->ready == DEVICE_RX) {
960 		/*clear MAC and VLAN settings*/
961 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962 		for (i = 0; i < 6; i++)
963 			vdev->mac_address.addr_bytes[i] = 0;
964 
965 		vdev->vlan_tag = 0;
966 
967 		/*Clear out the receive buffers*/
968 		rx_count = rte_eth_rx_burst(ports[0],
969 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970 
971 		while (rx_count) {
972 			for (i = 0; i < rx_count; i++)
973 				rte_pktmbuf_free(pkts_burst[i]);
974 
975 			rx_count = rte_eth_rx_burst(ports[0],
976 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977 		}
978 
979 		vdev->ready = DEVICE_MAC_LEARNING;
980 	}
981 }
982 
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that device's RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990 	struct virtio_net_data_ll *dev_ll;
991 	struct ether_hdr *pkt_hdr;
992 	uint64_t ret = 0;
993 	struct virtio_net *dev = vdev->dev;
994 	struct virtio_net *tdev; /* destination virtio device */
995 
996 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997 
998 	/*get the used devices list*/
999 	dev_ll = ll_root_used;
1000 
1001 	while (dev_ll != NULL) {
1002 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003 				          &dev_ll->vdev->mac_address)) {
1004 
1005 			/* Drop the packet if the TX packet is destined for the TX device. */
1006 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008 							dev->device_fh);
1009 				return 0;
1010 			}
1011 			tdev = dev_ll->vdev->dev;
1012 
1013 
1014 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015 
1016 			if (unlikely(dev_ll->vdev->remove)) {
1017 				/*drop the packet if the device is marked for removal*/
1018 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019 			} else {
1020 				/*send the packet to the local virtio device*/
1021 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022 				if (enable_stats) {
1023 					rte_atomic64_add(
1024 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1025 					1);
1026 					rte_atomic64_add(
1027 					&dev_statistics[tdev->device_fh].rx_atomic,
1028 					ret);
1029 					dev_statistics[tdev->device_fh].tx_total++;
1030 					dev_statistics[tdev->device_fh].tx += ret;
1031 				}
1032 			}
1033 
1034 			return 0;
1035 		}
1036 		dev_ll = dev_ll->next;
1037 	}
1038 
1039 	return -1;
1040 }
1041 
1042 /*
1043  * Check if the destination MAC of a packet belongs to a local VM,
1044  * and if so get its vlan tag and offset.
1045  */
1046 static inline int __attribute__((always_inline))
1047 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1048 	uint32_t *offset, uint16_t *vlan_tag)
1049 {
1050 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1051 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052 
1053 	while (dev_ll != NULL) {
1054 		if ((dev_ll->vdev->ready == DEVICE_RX)
1055 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1056 		&dev_ll->vdev->mac_address)) {
1057 			/*
1058 			 * Drop the packet if the TX packet is
1059 			 * destined for the TX device.
1060 			 */
1061 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062 				LOG_DEBUG(VHOST_DATA,
1063 				"(%"PRIu64") TX: Source and destination"
1064 				" MAC addresses are the same. Dropping "
1065 				"packet.\n",
1066 				dev_ll->vdev->dev->device_fh);
1067 				return -1;
1068 			}
1069 
1070 			/*
1071 			 * HW vlan strip will reduce the packet length
1072 			 * by the length of the vlan tag, so we need to
1073 			 * restore the packet length by adding it back.
1074 			 */
1075 			*offset = VLAN_HLEN;
1076 			*vlan_tag =
1077 			(uint16_t)
1078 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1079 
1080 			LOG_DEBUG(VHOST_DATA,
1081 			"(%"PRIu64") TX: pkt to local VM device id:"
1082 			"(%"PRIu64") vlan tag: %d.\n",
1083 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1084 			vlan_tag);
1085 
1086 			break;
1087 		}
1088 		dev_ll = dev_ll->next;
1089 	}
1090 	return 0;
1091 }
1092 
1093 /*
1094  * This function routes the TX packet to the correct interface. This may be a local device
1095  * or the physical port.
1096  */
1097 static inline void __attribute__((always_inline))
1098 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1099 {
1100 	struct mbuf_table *tx_q;
1101 	struct rte_mbuf **m_table;
1102 	unsigned len, ret, offset = 0;
1103 	const uint16_t lcore_id = rte_lcore_id();
1104 	struct virtio_net *dev = vdev->dev;
1105 
1106 	/*check if destination is local VM*/
1107 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1108 		rte_pktmbuf_free(m);
1109 		return;
1110 	}
1111 
1112 	if (vm2vm_mode == VM2VM_HARDWARE) {
1113 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1114 			rte_pktmbuf_free(m);
1115 			return;
1116 		}
1117 	}
1118 
1119 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1120 
1121 	/*Add packet to the port tx queue*/
1122 	tx_q = &lcore_tx_queue[lcore_id];
1123 	len = tx_q->len;
1124 
1125 	m->ol_flags = PKT_TX_VLAN_PKT;
1126 
1127 	m->data_len += offset;
1128 	m->pkt_len += offset;
1129 
1130 	m->vlan_tci = vlan_tag;
1131 
1132 	tx_q->m_table[len] = m;
1133 	len++;
1134 	if (enable_stats) {
1135 		dev_statistics[dev->device_fh].tx_total++;
1136 		dev_statistics[dev->device_fh].tx++;
1137 	}
1138 
1139 	if (unlikely(len == MAX_PKT_BURST)) {
1140 		m_table = (struct rte_mbuf **)tx_q->m_table;
1141 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1142 		/* Free any buffers not handled by TX and update the port stats. */
1143 		if (unlikely(ret < len)) {
1144 			do {
1145 				rte_pktmbuf_free(m_table[ret]);
1146 			} while (++ret < len);
1147 		}
1148 
1149 		len = 0;
1150 	}
1151 
1152 	tx_q->len = len;
1153 	return;
1154 }
1155 /*
1156  * This function is called by each data core. It handles all RX/TX registered with the
1157  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1158  * with all devices in the main linked list.
1159  */
1160 static int
1161 switch_worker(void *arg)
1162 {
1163 	struct rte_mempool *mbuf_pool = arg;
1164 	struct virtio_net *dev = NULL;
1165 	struct vhost_dev *vdev = NULL;
1166 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1167 	struct virtio_net_data_ll *dev_ll;
1168 	struct mbuf_table *tx_q;
1169 	volatile struct lcore_ll_info *lcore_ll;
1170 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1171 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1172 	unsigned ret, i;
1173 	const uint16_t lcore_id = rte_lcore_id();
1174 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1175 	uint16_t rx_count = 0;
1176 	uint16_t tx_count;
1177 	uint32_t retry = 0;
1178 
1179 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1180 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1181 	prev_tsc = 0;
1182 
1183 	tx_q = &lcore_tx_queue[lcore_id];
1184 	for (i = 0; i < num_cores; i ++) {
1185 		if (lcore_ids[i] == lcore_id) {
1186 			tx_q->txq_id = i;
1187 			break;
1188 		}
1189 	}
1190 
1191 	while(1) {
1192 		cur_tsc = rte_rdtsc();
1193 		/*
1194 		 * TX burst queue drain
1195 		 */
1196 		diff_tsc = cur_tsc - prev_tsc;
1197 		if (unlikely(diff_tsc > drain_tsc)) {
1198 
1199 			if (tx_q->len) {
1200 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1201 
1202 				/*Tx any packets in the queue*/
1203 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1204 									   (struct rte_mbuf **)tx_q->m_table,
1205 									   (uint16_t)tx_q->len);
1206 				if (unlikely(ret < tx_q->len)) {
1207 					do {
1208 						rte_pktmbuf_free(tx_q->m_table[ret]);
1209 					} while (++ret < tx_q->len);
1210 				}
1211 
1212 				tx_q->len = 0;
1213 			}
1214 
1215 			prev_tsc = cur_tsc;
1216 
1217 		}
1218 
1219 		rte_prefetch0(lcore_ll->ll_root_used);
1220 		/*
1221 		 * Inform the configuration core that we have exited the linked list and that no devices are
1222 		 * in use if requested.
1223 		 */
1224 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1225 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1226 
1227 		/*
1228 		 * Process devices
1229 		 */
1230 		dev_ll = lcore_ll->ll_root_used;
1231 
1232 		while (dev_ll != NULL) {
1233 			/*get virtio device ID*/
1234 			vdev = dev_ll->vdev;
1235 			dev = vdev->dev;
1236 
1237 			if (unlikely(vdev->remove)) {
1238 				dev_ll = dev_ll->next;
1239 				unlink_vmdq(vdev);
1240 				vdev->ready = DEVICE_SAFE_REMOVE;
1241 				continue;
1242 			}
1243 			if (likely(vdev->ready == DEVICE_RX)) {
1244 				/*Handle guest RX*/
1245 				rx_count = rte_eth_rx_burst(ports[0],
1246 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1247 
1248 				if (rx_count) {
1249 					/*
1250 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1251 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1252 					*/
1253 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1254 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1255 							rte_delay_us(burst_rx_delay_time);
1256 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1257 								break;
1258 						}
1259 					}
1260 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1261 					if (enable_stats) {
1262 						rte_atomic64_add(
1263 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1264 						rx_count);
1265 						rte_atomic64_add(
1266 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1267 					}
1268 					while (likely(rx_count)) {
1269 						rx_count--;
1270 						rte_pktmbuf_free(pkts_burst[rx_count]);
1271 					}
1272 
1273 				}
1274 			}
1275 
1276 			if (likely(!vdev->remove)) {
1277 				/* Handle guest TX*/
1278 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1279 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1280 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1281 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1282 						while (tx_count--)
1283 							rte_pktmbuf_free(pkts_burst[tx_count]);
1284 					}
1285 				}
1286 				while (tx_count)
1287 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1288 			}
1289 
1290 			/*move to the next device in the list*/
1291 			dev_ll = dev_ll->next;
1292 		}
1293 	}
1294 
1295 	return 0;
1296 }
1297 
1298 /*
1299  * This function gets available ring number for zero copy rx.
1300  * Only one thread will call this function for a particular virtio device,
1301  * so it is designed as a non-thread-safe function.
1302  */
1303 static inline uint32_t __attribute__((always_inline))
1304 get_available_ring_num_zcp(struct virtio_net *dev)
1305 {
1306 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1307 	uint16_t avail_idx;
1308 
1309 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1310 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1311 }
1312 
1313 /*
1314  * This function gets available ring index for zero copy rx,
1315  * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
1316  * Only one thread will call this function for a particular virtio device,
1317  * so it is designed as a non-thread-safe function.
1318  */
1319 static inline uint32_t __attribute__((always_inline))
1320 get_available_ring_index_zcp(struct virtio_net *dev,
1321 	uint16_t *res_base_idx, uint32_t count)
1322 {
1323 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1324 	uint16_t avail_idx;
1325 	uint32_t retry = 0;
1326 	uint16_t free_entries;
1327 
1328 	*res_base_idx = vq->last_used_idx_res;
1329 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1330 	free_entries = (avail_idx - *res_base_idx);
1331 
1332 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1333 			"avail idx: %d, "
1334 			"res base idx:%d, free entries:%d\n",
1335 			dev->device_fh, avail_idx, *res_base_idx,
1336 			free_entries);
1337 
1338 	/*
1339 	 * If retry is enabled and the queue is full then we wait
1340 	 * and retry to avoid packet loss.
1341 	 */
1342 	if (enable_retry && unlikely(count > free_entries)) {
1343 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1344 			rte_delay_us(burst_rx_delay_time);
1345 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1346 			free_entries = (avail_idx - *res_base_idx);
1347 			if (count <= free_entries)
1348 				break;
1349 		}
1350 	}
1351 
1352 	/*check that we have enough buffers*/
1353 	if (unlikely(count > free_entries))
1354 		count = free_entries;
1355 
1356 	if (unlikely(count == 0)) {
1357 		LOG_DEBUG(VHOST_DATA,
1358 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1359 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1360 			dev->device_fh, avail_idx,
1361 			*res_base_idx, free_entries);
1362 		return 0;
1363 	}
1364 
1365 	vq->last_used_idx_res = *res_base_idx + count;
1366 
1367 	return count;
1368 }
1369 
1370 /*
1371  * This function puts the descriptor back into the used list.
1372  */
1373 static inline void __attribute__((always_inline))
1374 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1375 {
1376 	uint16_t res_cur_idx = vq->last_used_idx;
1377 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1378 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1379 	rte_compiler_barrier();
1380 	*(volatile uint16_t *)&vq->used->idx += 1;
1381 	vq->last_used_idx += 1;
1382 
1383 	/* Kick the guest if necessary. */
1384 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1385 		eventfd_write((int)vq->kickfd, 1);
1386 }
1387 
1388 /*
1389  * This function gets an available descriptor from the virtio vring and an un-attached
1390  * mbuf from vpool->ring, and then attaches them together. It needs to adjust the offset
1391  * for buff_addr and phys_addr according to the PMD implementation, otherwise the
1392  * frame data may be put at the wrong location in the mbuf.
1393  */
1394 static inline void __attribute__((always_inline))
1395 attach_rxmbuf_zcp(struct virtio_net *dev)
1396 {
1397 	uint16_t res_base_idx, desc_idx;
1398 	uint64_t buff_addr, phys_addr;
1399 	struct vhost_virtqueue *vq;
1400 	struct vring_desc *desc;
1401 	struct rte_mbuf *mbuf = NULL;
1402 	struct vpool *vpool;
1403 	hpa_type addr_type;
1404 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1405 
1406 	vpool = &vpool_array[vdev->vmdq_rx_q];
1407 	vq = dev->virtqueue[VIRTIO_RXQ];
1408 
1409 	do {
1410 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1411 				1) != 1))
1412 			return;
1413 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1414 
1415 		desc = &vq->desc[desc_idx];
1416 		if (desc->flags & VRING_DESC_F_NEXT) {
1417 			desc = &vq->desc[desc->next];
1418 			buff_addr = gpa_to_vva(dev, desc->addr);
1419 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1420 					&addr_type);
1421 		} else {
1422 			buff_addr = gpa_to_vva(dev,
1423 					desc->addr + vq->vhost_hlen);
1424 			phys_addr = gpa_to_hpa(vdev,
1425 					desc->addr + vq->vhost_hlen,
1426 					desc->len, &addr_type);
1427 		}
1428 
1429 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1430 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1431 				" address found when attaching RX frame buffer"
1432 				" address!\n", dev->device_fh);
1433 			put_desc_to_used_list_zcp(vq, desc_idx);
1434 			continue;
1435 		}
1436 
1437 		/*
1438 		 * Check if the frame buffer address from guest crosses
1439 		 * sub-region or not.
1440 		 */
1441 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1442 			RTE_LOG(ERR, VHOST_DATA,
1443 				"(%"PRIu64") Frame buffer address cross "
1444 				"sub-region found when attaching RX frame "
1445 				"buffer address!\n",
1446 				dev->device_fh);
1447 			put_desc_to_used_list_zcp(vq, desc_idx);
1448 			continue;
1449 		}
1450 	} while (unlikely(phys_addr == 0));
1451 
1452 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1453 	if (unlikely(mbuf == NULL)) {
1454 		LOG_DEBUG(VHOST_DATA,
1455 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1456 			"ring_sc_dequeue fail.\n",
1457 			dev->device_fh);
1458 		put_desc_to_used_list_zcp(vq, desc_idx);
1459 		return;
1460 	}
1461 
1462 	if (unlikely(vpool->buf_size > desc->len)) {
1463 		LOG_DEBUG(VHOST_DATA,
1464 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1465 			"length(%d) of descriptor idx: %d less than room "
1466 			"size required: %d\n",
1467 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1468 		put_desc_to_used_list_zcp(vq, desc_idx);
1469 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1470 		return;
1471 	}
1472 
1473 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1474 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1475 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1476 	mbuf->data_len = desc->len;
1477 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1478 
1479 	LOG_DEBUG(VHOST_DATA,
1480 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1481 		"descriptor idx:%d\n",
1482 		dev->device_fh, res_base_idx, desc_idx);
1483 
1484 	__rte_mbuf_raw_free(mbuf);
1485 
1486 	return;
1487 }
1488 
1489 /*
1490  * Detach an attached packet mbuf -
1491  *  - restore original mbuf address and length values.
1492  *  - reset pktmbuf data and data_len to their default values.
1493  *  All other fields of the given packet mbuf will be left intact.
1494  *
1495  * @param m
1496  *   The attached packet mbuf.
1497  */
1498 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1499 {
1500 	const struct rte_mempool *mp = m->pool;
1501 	void *buf = RTE_MBUF_TO_BADDR(m);
1502 	uint32_t buf_ofs;
1503 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1504 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1505 
1506 	m->buf_addr = buf;
1507 	m->buf_len = (uint16_t)buf_len;
1508 
1509 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1510 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1511 	m->data_off = buf_ofs;
1512 
1513 	m->data_len = 0;
1514 }
1515 
1516 /*
1517  * This function is called after packets have been transmitted. It fetches mbufs
1518  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates the
1519  * used index and kicks the guest if necessary.
1520  */
1521 static inline uint32_t __attribute__((always_inline))
1522 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1523 {
1524 	struct rte_mbuf *mbuf;
1525 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1526 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1527 	uint32_t index = 0;
1528 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1529 
1530 	LOG_DEBUG(VHOST_DATA,
1531 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1532 		"clean is: %d\n",
1533 		dev->device_fh, mbuf_count);
1534 	LOG_DEBUG(VHOST_DATA,
1535 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1536 		"clean  is : %d\n",
1537 		dev->device_fh, rte_ring_count(vpool->ring));
1538 
1539 	for (index = 0; index < mbuf_count; index++) {
1540 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1541 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1542 			pktmbuf_detach_zcp(mbuf);
1543 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1544 
1545 		/* Update used index buffer information. */
1546 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1547 		vq->used->ring[used_idx].len = 0;
1548 
1549 		used_idx = (used_idx + 1) & (vq->size - 1);
1550 	}
1551 
1552 	LOG_DEBUG(VHOST_DATA,
1553 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1554 		"clean is: %d\n",
1555 		dev->device_fh, rte_mempool_count(vpool->pool));
1556 	LOG_DEBUG(VHOST_DATA,
1557 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1558 		"clean  is : %d\n",
1559 		dev->device_fh, rte_ring_count(vpool->ring));
1560 	LOG_DEBUG(VHOST_DATA,
1561 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1562 		"vq->last_used_idx:%d\n",
1563 		dev->device_fh, vq->last_used_idx);
1564 
1565 	vq->last_used_idx += mbuf_count;
1566 
1567 	LOG_DEBUG(VHOST_DATA,
1568 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1569 		"vq->last_used_idx:%d\n",
1570 		dev->device_fh, vq->last_used_idx);
1571 
1572 	rte_compiler_barrier();
1573 
1574 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1575 
1576 	/* Kick guest if required. */
1577 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1578 		eventfd_write((int)vq->kickfd, 1);
1579 
1580 	return 0;
1581 }
1582 
1583 /*
1584  * This function is called when a virtio device is destroyed.
1585  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1586  */
1587 static void mbuf_destroy_zcp(struct vpool *vpool)
1588 {
1589 	struct rte_mbuf *mbuf = NULL;
1590 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1591 
1592 	LOG_DEBUG(VHOST_CONFIG,
1593 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1594 		"mbuf_destroy_zcp is: %d\n",
1595 		mbuf_count);
1596 	LOG_DEBUG(VHOST_CONFIG,
1597 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1598 		"mbuf_destroy_zcp  is : %d\n",
1599 		rte_ring_count(vpool->ring));
1600 
1601 	for (index = 0; index < mbuf_count; index++) {
1602 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1603 		if (likely(mbuf != NULL)) {
1604 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1605 				pktmbuf_detach_zcp(mbuf);
1606 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1607 		}
1608 	}
1609 
1610 	LOG_DEBUG(VHOST_CONFIG,
1611 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1612 		"mbuf_destroy_zcp is: %d\n",
1613 		rte_mempool_count(vpool->pool));
1614 	LOG_DEBUG(VHOST_CONFIG,
1615 		"in mbuf_destroy_zcp: mbuf count in ring after "
1616 		"mbuf_destroy_zcp is : %d\n",
1617 		rte_ring_count(vpool->ring));
1618 }
1619 
1620 /*
1621  * This function completes zero copy RX: it writes the virtio headers, updates the used ring and kicks the guest if necessary.
1622  */
1623 static inline uint32_t __attribute__((always_inline))
1624 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1625 	uint32_t count)
1626 {
1627 	struct vhost_virtqueue *vq;
1628 	struct vring_desc *desc;
1629 	struct rte_mbuf *buff;
1630 	/* The virtio_hdr is initialised to 0. */
1631 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1632 		= {{0, 0, 0, 0, 0, 0}, 0};
1633 	uint64_t buff_hdr_addr = 0;
1634 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1635 	uint32_t head_idx, packet_success = 0;
1636 	uint16_t res_cur_idx;
1637 
1638 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1639 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1640 	if (count == 0)
1641 		return 0;
1642 
1643 	vq = dev->virtqueue[VIRTIO_RXQ];
1644 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1645 
1646 	res_cur_idx = vq->last_used_idx;
1647 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1648 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1649 
1650 	/* Retrieve all of the head indexes first to avoid caching issues. */
1651 	for (head_idx = 0; head_idx < count; head_idx++)
1652 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1653 
1654 	/*Prefetch descriptor index. */
1655 	rte_prefetch0(&vq->desc[head[packet_success]]);
1656 
1657 	while (packet_success != count) {
1658 		/* Get descriptor from available ring */
1659 		desc = &vq->desc[head[packet_success]];
1660 
1661 		buff = pkts[packet_success];
1662 		LOG_DEBUG(VHOST_DATA,
1663 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1664 			"pkt[%d] descriptor idx: %d\n",
1665 			dev->device_fh, packet_success,
1666 			MBUF_HEADROOM_UINT32(buff));
1667 
1668 		PRINT_PACKET(dev,
1669 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1670 			+ RTE_PKTMBUF_HEADROOM),
1671 			rte_pktmbuf_data_len(buff), 0);
1672 
1673 		/* Buffer address translation for virtio header. */
1674 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1675 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1676 
1677 		/*
1678 		 * If the descriptors are chained the header and data are
1679 		 * placed in separate buffers.
1680 		 */
1681 		if (desc->flags & VRING_DESC_F_NEXT) {
1682 			desc->len = vq->vhost_hlen;
1683 			desc = &vq->desc[desc->next];
1684 			desc->len = rte_pktmbuf_data_len(buff);
1685 		} else {
1686 			desc->len = packet_len;
1687 		}
1688 
1689 		/* Update used ring with desc information */
1690 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1691 			= head[packet_success];
1692 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1693 			= packet_len;
1694 		res_cur_idx++;
1695 		packet_success++;
1696 
1697 		/* A header is required per buffer. */
1698 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1699 			(const void *)&virtio_hdr, vq->vhost_hlen);
1700 
1701 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1702 
1703 		if (likely(packet_success < count)) {
1704 			/* Prefetch descriptor index. */
1705 			rte_prefetch0(&vq->desc[head[packet_success]]);
1706 		}
1707 	}
1708 
1709 	rte_compiler_barrier();
1710 
1711 	LOG_DEBUG(VHOST_DATA,
1712 		"(%"PRIu64") in dev_rx_zcp: before updating used idx: "
1713 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1714 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1715 
1716 	*(volatile uint16_t *)&vq->used->idx += count;
1717 	vq->last_used_idx += count;
1718 
1719 	LOG_DEBUG(VHOST_DATA,
1720 		"(%"PRIu64") in dev_rx_zcp: after updating used idx: "
1721 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1722 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1723 
1724 	/* Kick the guest if necessary. */
1725 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1726 		eventfd_write((int)vq->kickfd, 1);
1727 
1728 	return count;
1729 }
1730 
1731 /*
1732  * This function routes the TX packet to the correct interface.
1733  * This may be a local device or the physical port.
1734  */
1735 static inline void __attribute__((always_inline))
1736 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1737 	uint32_t desc_idx, uint8_t need_copy)
1738 {
1739 	struct mbuf_table *tx_q;
1740 	struct rte_mbuf **m_table;
1741 	struct rte_mbuf *mbuf = NULL;
1742 	unsigned len, ret, offset = 0;
1743 	struct vpool *vpool;
1744 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1745 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1746 
1747 	/* Add packet to the port TX queue. */
1748 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1749 	len = tx_q->len;
1750 
1751 	/* Allocate an mbuf and populate the structure. */
1752 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1753 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1754 	if (unlikely(mbuf == NULL)) {
1755 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1756 		RTE_LOG(ERR, VHOST_DATA,
1757 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1758 			dev->device_fh);
1759 		put_desc_to_used_list_zcp(vq, desc_idx);
1760 		return;
1761 	}
1762 
1763 	if (vm2vm_mode == VM2VM_HARDWARE) {
1764 		/* Avoid using a VLAN tag from any VM (e.g.
1765 		 * vlan_tags[dev->device_fh]) for an external packet; otherwise
1766 		 * pool selection conflicts: the MAC address marks the packet as
1767 		 * external traffic for the network, while the VLAN tag marks it
1768 		 * as a VM-to-VM packet for another VM. The hardware cannot
1769 		 * resolve this ambiguity, so the packet would be lost.
1770 		 */
1771 		vlan_tag = external_pkt_default_vlan_tag;
1772 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1773 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1774 			__rte_mbuf_raw_free(mbuf);
1775 			return;
1776 		}
1777 	}
1778 
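	/*
	 * Zero copy TX: the freshly dequeued mbuf normally just wraps the
	 * guest buffer (its buf_addr/buf_physaddr point at the guest data
	 * below); the payload is only copied when the guest buffer crosses
	 * a physical sub-region. The descriptor index is stashed in the
	 * mbuf headroom so it can be returned to the guest once the NIC has
	 * finished with the buffer.
	 */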
1779 	mbuf->nb_segs = m->nb_segs;
1780 	mbuf->next = m->next;
1781 	mbuf->data_len = m->data_len + offset;
1782 	mbuf->pkt_len = mbuf->data_len;
1783 	if (unlikely(need_copy)) {
1784 		/* Copy the packet contents to the mbuf. */
1785 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1786 			rte_pktmbuf_mtod(m, void *),
1787 			m->data_len);
1788 	} else {
1789 		mbuf->data_off = m->data_off;
1790 		mbuf->buf_physaddr = m->buf_physaddr;
1791 		mbuf->buf_addr = m->buf_addr;
1792 	}
1793 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1794 	mbuf->vlan_tci = vlan_tag;
1795 	mbuf->l2_len = sizeof(struct ether_hdr);
1796 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1797 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1798 
1799 	tx_q->m_table[len] = mbuf;
1800 	len++;
1801 
1802 	LOG_DEBUG(VHOST_DATA,
1803 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1804 		dev->device_fh,
1805 		mbuf->nb_segs,
1806 		(mbuf->next == NULL) ? "null" : "non-null");
1807 
1808 	if (enable_stats) {
1809 		dev_statistics[dev->device_fh].tx_total++;
1810 		dev_statistics[dev->device_fh].tx++;
1811 	}
1812 
1813 	if (unlikely(len == MAX_PKT_BURST)) {
1814 		m_table = (struct rte_mbuf **)tx_q->m_table;
1815 		ret = rte_eth_tx_burst(ports[0],
1816 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1817 
1818 		/*
1819 		 * Free any buffers not handled by TX and update
1820 		 * the port stats.
1821 		 */
1822 		if (unlikely(ret < len)) {
1823 			do {
1824 				rte_pktmbuf_free(m_table[ret]);
1825 			} while (++ret < len);
1826 		}
1827 
1828 		len = 0;
1829 		txmbuf_clean_zcp(dev, vpool);
1830 	}
1831 
1832 	tx_q->len = len;
1833 
1834 	return;
1835 }
1836 
1837 /*
1838  * This function transmits all available packets in the virtio TX queue for
1839  * one virtio-net device. If it is the first packet, it learns the MAC
1840  * address and sets up VMDQ.
1841  */
1842 static inline void __attribute__((always_inline))
1843 virtio_dev_tx_zcp(struct virtio_net *dev)
1844 {
1845 	struct rte_mbuf m;
1846 	struct vhost_virtqueue *vq;
1847 	struct vring_desc *desc;
1848 	uint64_t buff_addr = 0, phys_addr;
1849 	uint32_t head[MAX_PKT_BURST];
1850 	uint32_t i;
1851 	uint16_t free_entries, packet_success = 0;
1852 	uint16_t avail_idx;
1853 	uint8_t need_copy = 0;
1854 	hpa_type addr_type;
1855 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1856 
1857 	vq = dev->virtqueue[VIRTIO_TXQ];
1858 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1859 
1860 	/* If there are no available buffers then return. */
1861 	if (vq->last_used_idx_res == avail_idx)
1862 		return;
1863 
1864 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1865 
1866 	/* Prefetch available ring to retrieve head indexes. */
1867 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1868 
1869 	/* Get the number of free entries in the ring */
1870 	free_entries = (avail_idx - vq->last_used_idx_res);
1871 
1872 	/* Limit to MAX_PKT_BURST. */
1873 	free_entries
1874 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1875 
1876 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1877 		dev->device_fh, free_entries);
1878 
1879 	/* Retrieve all of the head indexes first to avoid caching issues. */
1880 	for (i = 0; i < free_entries; i++)
1881 		head[i]
1882 			= vq->avail->ring[(vq->last_used_idx_res + i)
1883 			& (vq->size - 1)];
1884 
1885 	vq->last_used_idx_res += free_entries;
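	/*
	 * last_used_idx_res marks the avail-ring entries consumed here as
	 * reserved; the used ring itself is updated later (e.g. when the
	 * transmitted buffers are cleaned up in txmbuf_clean_zcp).
	 */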
1886 
1887 	/* Prefetch descriptor index. */
1888 	rte_prefetch0(&vq->desc[head[packet_success]]);
1889 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1890 
1891 	while (packet_success < free_entries) {
1892 		desc = &vq->desc[head[packet_success]];
1893 
1894 		/* Discard first buffer as it is the virtio header */
1895 		desc = &vq->desc[desc->next];
1896 
1897 		/* Buffer address translation. */
1898 		buff_addr = gpa_to_vva(dev, desc->addr);
1899 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1900 
1901 		if (likely(packet_success < (free_entries - 1)))
1902 			/* Prefetch descriptor index. */
1903 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1904 
1905 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1906 			RTE_LOG(ERR, VHOST_DATA,
1907 				"(%"PRIu64") Invalid frame buffer address found "
1908 				"when transmitting packets!\n",
1909 				dev->device_fh);
1910 			packet_success++;
1911 			continue;
1912 		}
1913 
1914 		/* Prefetch buffer address. */
1915 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1916 
1917 		/*
1918 		 * Setup dummy mbuf. This is copied to a real mbuf if
1919 		 * transmitted out the physical port.
1920 		 */
1921 		m.data_len = desc->len;
1922 		m.nb_segs = 1;
1923 		m.next = NULL;
1924 		m.data_off = 0;
1925 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1926 		m.buf_physaddr = phys_addr;
1927 
1928 		/*
1929 		 * Check if the frame buffer address from guest crosses
1930 		 * sub-region or not.
1931 		 */
1932 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1933 			RTE_LOG(ERR, VHOST_DATA,
1934 				"(%"PRIu64") Frame buffer address crossing a "
1935 				"sub-region found when attaching the TX frame "
1936 				"buffer address!\n",
1937 				dev->device_fh);
1938 			need_copy = 1;
1939 		} else
1940 			need_copy = 0;
1941 
1942 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1943 
1944 		/*
1945 		 * If this is the first received packet we need to learn
1946 		 * the MAC and setup VMDQ
1947 		 */
1948 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1949 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1950 				/*
1951 				 * Discard frame if device is scheduled for
1952 				 * removal or a duplicate MAC address is found.
1953 				 */
1954 				packet_success += free_entries;
1955 				vq->last_used_idx += packet_success;
1956 				break;
1957 			}
1958 		}
1959 
1960 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1961 		packet_success++;
1962 	}
1963 }
1964 
1965 /*
1966  * This function is called by each data core. It handles all RX/TX registered
1967  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1968  * addresses are compared with all devices in the main linked list.
1969  */
1970 static int
1971 switch_worker_zcp(__attribute__((unused)) void *arg)
1972 {
1973 	struct virtio_net *dev = NULL;
1974 	struct vhost_dev  *vdev = NULL;
1975 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1976 	struct virtio_net_data_ll *dev_ll;
1977 	struct mbuf_table *tx_q;
1978 	volatile struct lcore_ll_info *lcore_ll;
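	/*
	 * drain_tsc converts BURST_TX_DRAIN_US into TSC cycles: if no full
	 * burst accumulates within that interval, any packets queued for
	 * the physical port are flushed anyway.
	 */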
1979 	const uint64_t drain_tsc
1980 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1981 		* BURST_TX_DRAIN_US;
1982 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1983 	unsigned ret;
1984 	const uint16_t lcore_id = rte_lcore_id();
1985 	uint16_t count_in_ring, rx_count = 0;
1986 
1987 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1988 
1989 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1990 	prev_tsc = 0;
1991 
1992 	while (1) {
1993 		cur_tsc = rte_rdtsc();
1994 
1995 		/* TX burst queue drain */
1996 		diff_tsc = cur_tsc - prev_tsc;
1997 		if (unlikely(diff_tsc > drain_tsc)) {
1998 			/*
1999 			 * Get mbufs from vpool.pool, detach them and put them
2000 			 * back into vpool.ring.
2001 			 */
2002 			dev_ll = lcore_ll->ll_root_used;
2003 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2004 				/* Get virtio device ID */
2005 				vdev = dev_ll->vdev;
2006 				dev = vdev->dev;
2007 
2008 				if (likely(!vdev->remove)) {
2009 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2010 					if (tx_q->len) {
2011 						LOG_DEBUG(VHOST_DATA,
2012 						"TX queue drained after timeout"
2013 						" with burst size %u\n",
2014 						tx_q->len);
2015 
2016 						/*
2017 						 * Tx any packets in the queue
2018 						 */
2019 						ret = rte_eth_tx_burst(
2020 							ports[0],
2021 							(uint16_t)tx_q->txq_id,
2022 							(struct rte_mbuf **)
2023 							tx_q->m_table,
2024 							(uint16_t)tx_q->len);
2025 						if (unlikely(ret < tx_q->len)) {
2026 							do {
2027 								rte_pktmbuf_free(
2028 									tx_q->m_table[ret]);
2029 							} while (++ret < tx_q->len);
2030 						}
2031 						tx_q->len = 0;
2032 
2033 						txmbuf_clean_zcp(dev,
2034 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2035 					}
2036 				}
2037 				dev_ll = dev_ll->next;
2038 			}
2039 			prev_tsc = cur_tsc;
2040 		}
2041 
2042 		rte_prefetch0(lcore_ll->ll_root_used);
2043 
2044 		/*
2045 		 * Inform the configuration core that we have exited the linked
2046 		 * list and that no devices are in use if requested.
2047 		 */
2048 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2049 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2050 
2051 		/* Process devices */
2052 		dev_ll = lcore_ll->ll_root_used;
2053 
2054 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2055 			vdev = dev_ll->vdev;
2056 			dev  = vdev->dev;
2057 			if (unlikely(vdev->remove)) {
2058 				dev_ll = dev_ll->next;
2059 				unlink_vmdq(vdev);
2060 				vdev->ready = DEVICE_SAFE_REMOVE;
2061 				continue;
2062 			}
2063 
2064 			if (likely(vdev->ready == DEVICE_RX)) {
2065 				uint32_t index = vdev->vmdq_rx_q;
2066 				uint16_t i;
2067 				count_in_ring
2068 				= rte_ring_count(vpool_array[index].ring);
2069 				uint16_t free_entries
2070 				= (uint16_t)get_available_ring_num_zcp(dev);
2071 
2072 				/*
2073 				 * Attach all mbufs in vpool.ring and put back
2074 				 * into vpool.pool.
2075 				 */
2076 				for (i = 0;
2077 				i < RTE_MIN(free_entries,
2078 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2079 				i++)
2080 					attach_rxmbuf_zcp(dev);
2081 
2082 				/* Handle guest RX */
2083 				rx_count = rte_eth_rx_burst(ports[0],
2084 					vdev->vmdq_rx_q, pkts_burst,
2085 					MAX_PKT_BURST);
2086 
2087 				if (rx_count) {
2088 					ret_count = virtio_dev_rx_zcp(dev,
2089 							pkts_burst, rx_count);
2090 					if (enable_stats) {
2091 						dev_statistics[dev->device_fh].rx_total
2092 							+= rx_count;
2093 						dev_statistics[dev->device_fh].rx
2094 							+= ret_count;
2095 					}
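					/*
					 * Detach the guest buffers from the
					 * received mbufs and return the empty
					 * mbufs to the ring for reuse.
					 */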
2096 					while (likely(rx_count)) {
2097 						rx_count--;
2098 						pktmbuf_detach_zcp(
2099 							pkts_burst[rx_count]);
2100 						rte_ring_sp_enqueue(
2101 							vpool_array[index].ring,
2102 							(void *)pkts_burst[rx_count]);
2103 					}
2104 				}
2105 			}
2106 
2107 			if (likely(!vdev->remove))
2108 				/* Handle guest TX */
2109 				virtio_dev_tx_zcp(dev);
2110 
2111 			/* Move to the next device in the list */
2112 			dev_ll = dev_ll->next;
2113 		}
2114 	}
2115 
2116 	return 0;
2117 }
2118 
2119 
2120 /*
2121  * Add an entry to a used linked list. A free entry must first be found
2122  * in the free linked list using get_data_ll_free_entry();
2123  */
2124 static void
2125 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2126 	struct virtio_net_data_ll *ll_dev)
2127 {
2128 	struct virtio_net_data_ll *ll = *ll_root_addr;
2129 
2130 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2131 	ll_dev->next = NULL;
2132 	rte_compiler_barrier();
2133 
2134 	/* If ll == NULL then this is the first device. */
2135 	if (ll) {
2136 		/* Increment to the tail of the linked list. */
2137 		while (ll->next != NULL)
2138 			ll = ll->next;
2139 
2140 		ll->next = ll_dev;
2141 	} else {
2142 		*ll_root_addr = ll_dev;
2143 	}
2144 }
2145 
2146 /*
2147  * Remove an entry from a used linked list. The entry must then be added to
2148  * the free linked list using put_data_ll_free_entry().
2149  */
2150 static void
2151 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2152 	struct virtio_net_data_ll *ll_dev,
2153 	struct virtio_net_data_ll *ll_dev_last)
2154 {
2155 	struct virtio_net_data_ll *ll = *ll_root_addr;
2156 
2157 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2158 		return;
2159 
2160 	if (ll_dev == ll)
2161 		*ll_root_addr = ll_dev->next;
2162 	else
2163 		if (likely(ll_dev_last != NULL))
2164 			ll_dev_last->next = ll_dev->next;
2165 		else
2166 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2167 }
2168 
2169 /*
2170  * Find and return an entry from the free linked list.
2171  */
2172 static struct virtio_net_data_ll *
2173 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2174 {
2175 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2176 	struct virtio_net_data_ll *ll_dev;
2177 
2178 	if (ll_free == NULL)
2179 		return NULL;
2180 
2181 	ll_dev = ll_free;
2182 	*ll_root_addr = ll_free->next;
2183 
2184 	return ll_dev;
2185 }
2186 
2187 /*
2188  * Place an entry back on to the free linked list.
2189  */
2190 static void
2191 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2192 	struct virtio_net_data_ll *ll_dev)
2193 {
2194 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2195 
2196 	if (ll_dev == NULL)
2197 		return;
2198 
2199 	ll_dev->next = ll_free;
2200 	*ll_root_addr = ll_dev;
2201 }
2202 
2203 /*
2204  * Creates a linked list of a given size.
2205  */
2206 static struct virtio_net_data_ll *
2207 alloc_data_ll(uint32_t size)
2208 {
2209 	struct virtio_net_data_ll *ll_new;
2210 	uint32_t i;
2211 
2212 	/* Malloc and then chain the linked list. */
2213 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2214 	if (ll_new == NULL) {
2215 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2216 		return NULL;
2217 	}
2218 
2219 	for (i = 0; i < size - 1; i++) {
2220 		ll_new[i].vdev = NULL;
2221 		ll_new[i].next = &ll_new[i+1];
2222 	}
2223 	ll_new[i].next = NULL;
2224 
2225 	return (ll_new);
2226 }
2227 
2228 /*
2229  * Create the main linked list along with each individual core's linked list. A used and a free list
2230  * are created to manage entries.
2231  */
2232 static int
2233 init_data_ll (void)
2234 {
2235 	int lcore;
2236 
2237 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2238 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2239 		if (lcore_info[lcore].lcore_ll == NULL) {
2240 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2241 			return -1;
2242 		}
2243 
2244 		lcore_info[lcore].lcore_ll->device_num = 0;
2245 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2246 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
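		/*
		 * Size each core's free list to hold roughly
		 * num_devices / num_switching_cores entries, rounding up when
		 * the division is not exact.
		 */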
2247 		if (num_devices % num_switching_cores)
2248 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2249 		else
2250 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2251 	}
2252 
2253 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2254 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2255 
2256 	return 0;
2257 }
2258 
2259 /*
2260  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2261  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2262  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2263  */
2264 static void
2265 destroy_device (volatile struct virtio_net *dev)
2266 {
2267 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2268 	struct virtio_net_data_ll *ll_main_dev_cur;
2269 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2270 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2271 	struct vhost_dev *vdev;
2272 	int lcore;
2273 
2274 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2275 
2276 	vdev = (struct vhost_dev *)dev->priv;
2277 	/* Set the remove flag. */
2278 	vdev->remove = 1;
2279 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2280 		rte_pause();
2281 	}
2282 
2283 	/* Search for entry to be removed from lcore ll */
2284 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2285 	while (ll_lcore_dev_cur != NULL) {
2286 		if (ll_lcore_dev_cur->vdev == vdev) {
2287 			break;
2288 		} else {
2289 			ll_lcore_dev_last = ll_lcore_dev_cur;
2290 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2291 		}
2292 	}
2293 
2294 	if (ll_lcore_dev_cur == NULL) {
2295 		RTE_LOG(ERR, VHOST_CONFIG,
2296 			"(%"PRIu64") Failed to find the device to be destroyed.\n",
2297 			dev->device_fh);
2298 		return;
2299 	}
2300 
2301 	/* Search for entry to be removed from main ll */
2302 	ll_main_dev_cur = ll_root_used;
2303 	ll_main_dev_last = NULL;
2304 	while (ll_main_dev_cur != NULL) {
2305 		if (ll_main_dev_cur->vdev == vdev) {
2306 			break;
2307 		} else {
2308 			ll_main_dev_last = ll_main_dev_cur;
2309 			ll_main_dev_cur = ll_main_dev_cur->next;
2310 		}
2311 	}
2312 
2313 	/* Remove entries from the lcore and main ll. */
2314 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2315 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2316 
2317 	/* Set the dev_removal_flag on each lcore. */
2318 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2319 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2320 	}
2321 
2322 	/*
2323 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2324 	 * they can no longer access the device removed from the linked lists and that the devices
2325 	 * are no longer in use.
2326 	 */
2327 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2328 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2329 			rte_pause();
2330 		}
2331 	}
2332 
2333 	/* Add the entries back to the lcore and main free ll.*/
2334 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2335 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2336 
2337 	/* Decrement number of device on the lcore. */
2338 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2339 
2340 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2341 
2342 	if (zero_copy) {
2343 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2344 
2345 		/* Stop the RX queue. */
2346 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2347 			LOG_DEBUG(VHOST_CONFIG,
2348 				"(%"PRIu64") In destroy_device: Failed to stop "
2349 				"rx queue:%d\n",
2350 				dev->device_fh,
2351 				vdev->vmdq_rx_q);
2352 		}
2353 
2354 		LOG_DEBUG(VHOST_CONFIG,
2355 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2356 			"mempool back to ring for RX queue: %d\n",
2357 			dev->device_fh, vdev->vmdq_rx_q);
2358 
2359 		mbuf_destroy_zcp(vpool);
2360 
2361 		/* Stop the TX queue. */
2362 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2363 			LOG_DEBUG(VHOST_CONFIG,
2364 				"(%"PRIu64") In destroy_device: Failed to "
2365 				"stop tx queue:%d\n",
2366 				dev->device_fh, vdev->vmdq_rx_q);
2367 		}
2368 
2369 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2370 
2371 		LOG_DEBUG(VHOST_CONFIG,
2372 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2373 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2374 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2375 			dev->device_fh);
2376 
2377 		mbuf_destroy_zcp(vpool);
2378 		rte_free(vdev->regions_hpa);
2379 	}
2380 	rte_free(vdev);
2381 
2382 }
2383 
2384 /*
2385  * Calculate the count of physically contiguous sub-regions within one
2386  * particular region whose vhost virtual address is contiguous. The region
2387  * starts at vva_start, with a size of 'size' bytes.
2388  */
2389 static uint32_t
2390 check_hpa_regions(uint64_t vva_start, uint64_t size)
2391 {
2392 	uint32_t i, nregions = 0, page_size = getpagesize();
2393 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2394 	if (vva_start % page_size) {
2395 		LOG_DEBUG(VHOST_CONFIG,
2396 			"in check_continuous: vva start(%p) mod page_size(%d) "
2397 			"has remainder\n",
2398 			(void *)(uintptr_t)vva_start, page_size);
2399 		return 0;
2400 	}
2401 	if (size % page_size) {
2402 		LOG_DEBUG(VHOST_CONFIG,
2403 			"in check_continuous: "
2404 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2405 			size, page_size);
2406 		return 0;
2407 	}
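	/*
	 * Walk the region page by page; every place where the next page's
	 * physical address is not contiguous with the current one starts a
	 * new sub-region.
	 */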
2408 	for (i = 0; i < size - page_size; i = i + page_size) {
2409 		cur_phys_addr
2410 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2411 		next_phys_addr = rte_mem_virt2phy(
2412 			(void *)(uintptr_t)(vva_start + i + page_size));
2413 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2414 			++nregions;
2415 			LOG_DEBUG(VHOST_CONFIG,
2416 				"in check_continuous: hva addr:(%p) is not "
2417 				"continuous with hva addr:(%p), diff:%d\n",
2418 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2419 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2420 				+ page_size), page_size);
2421 			LOG_DEBUG(VHOST_CONFIG,
2422 				"in check_continuous: hpa addr:(%p) is not "
2423 				"continuous with hpa addr:(%p), "
2424 				"diff:(%"PRIu64")\n",
2425 				(void *)(uintptr_t)cur_phys_addr,
2426 				(void *)(uintptr_t)next_phys_addr,
2427 				(next_phys_addr-cur_phys_addr));
2428 		}
2429 	}
2430 	return nregions;
2431 }
2432 
2433 /*
2434  * Divide each region whose vhost virtual address is contiguous into a few
2435  * sub-regions, making sure the physical addresses within each sub-region are
2436  * contiguous, and fill the offset (to GPA), size and other information of
2437  * each sub-region into regions_hpa.
2438  */
2439 static uint32_t
2440 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2441 {
2442 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2443 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2444 
2445 	if (mem_region_hpa == NULL)
2446 		return 0;
2447 
2448 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2449 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2450 			virtio_memory->regions[regionidx].address_offset;
2451 		mem_region_hpa[regionidx_hpa].guest_phys_address
2452 			= virtio_memory->regions[regionidx].guest_phys_address;
2453 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2454 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2455 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2456 		LOG_DEBUG(VHOST_CONFIG,
2457 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2458 			regionidx_hpa,
2459 			(void *)(uintptr_t)
2460 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2461 		LOG_DEBUG(VHOST_CONFIG,
2462 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2463 			regionidx_hpa,
2464 			(void *)(uintptr_t)
2465 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
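		/*
		 * Scan this region page by page; whenever physical contiguity
		 * breaks, close the current sub-region at the break point and
		 * open the next one with a new host physical address offset.
		 */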
2466 		for (i = 0, k = 0;
2467 			i < virtio_memory->regions[regionidx].memory_size -
2468 				page_size;
2469 			i += page_size) {
2470 			cur_phys_addr = rte_mem_virt2phy(
2471 					(void *)(uintptr_t)(vva_start + i));
2472 			next_phys_addr = rte_mem_virt2phy(
2473 					(void *)(uintptr_t)(vva_start +
2474 					i + page_size));
2475 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2476 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2477 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2478 					k + page_size;
2479 				mem_region_hpa[regionidx_hpa].memory_size
2480 					= k + page_size;
2481 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2482 					"phys addr end  [%d]:(%p)\n",
2483 					regionidx_hpa,
2484 					(void *)(uintptr_t)
2485 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2486 				LOG_DEBUG(VHOST_CONFIG,
2487 					"in fill_hpa_regions: guest phys addr "
2488 					"size [%d]:(%p)\n",
2489 					regionidx_hpa,
2490 					(void *)(uintptr_t)
2491 					(mem_region_hpa[regionidx_hpa].memory_size));
2492 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2493 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2494 				++regionidx_hpa;
2495 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2496 					next_phys_addr -
2497 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2498 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2499 					" phys addr start[%d]:(%p)\n",
2500 					regionidx_hpa,
2501 					(void *)(uintptr_t)
2502 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2503 				LOG_DEBUG(VHOST_CONFIG,
2504 					"in fill_hpa_regions: host  phys addr "
2505 					"start[%d]:(%p)\n",
2506 					regionidx_hpa,
2507 					(void *)(uintptr_t)
2508 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2509 				k = 0;
2510 			} else {
2511 				k += page_size;
2512 			}
2513 		}
2514 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2515 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2516 			+ k + page_size;
2517 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2518 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2519 			"[%d]:(%p)\n", regionidx_hpa,
2520 			(void *)(uintptr_t)
2521 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2522 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2523 			"[%d]:(%p)\n", regionidx_hpa,
2524 			(void *)(uintptr_t)
2525 			(mem_region_hpa[regionidx_hpa].memory_size));
2526 		++regionidx_hpa;
2527 	}
2528 	return regionidx_hpa;
2529 }
2530 
2531 /*
2532  * A new device is added to a data core. First the device is added to the main linked list
2533  * and then allocated to a specific data core.
2534  */
2535 static int
2536 new_device (struct virtio_net *dev)
2537 {
2538 	struct virtio_net_data_ll *ll_dev;
2539 	int lcore, core_add = 0;
2540 	uint32_t device_num_min = num_devices;
2541 	struct vhost_dev *vdev;
2542 	uint32_t regionidx;
2543 
2544 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2545 	if (vdev == NULL) {
2546 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2547 			dev->device_fh);
2548 		return -1;
2549 	}
2550 	vdev->dev = dev;
2551 	dev->priv = vdev;
2552 
2553 	if (zero_copy) {
2554 		vdev->nregions_hpa = dev->mem->nregions;
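		/*
		 * Start with one HPA sub-region per guest memory region and
		 * add one more for every physical discontinuity reported by
		 * check_hpa_regions().
		 */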
2555 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2556 			vdev->nregions_hpa
2557 				+= check_hpa_regions(
2558 					dev->mem->regions[regionidx].guest_phys_address
2559 					+ dev->mem->regions[regionidx].address_offset,
2560 					dev->mem->regions[regionidx].memory_size);
2561 
2562 		}
2563 
2564 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2565 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2566 			CACHE_LINE_SIZE);
2567 		if (vdev->regions_hpa == NULL) {
2568 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2569 			rte_free(vdev);
2570 			return -1;
2571 		}
2572 
2573 
2574 		if (fill_hpa_memory_regions(
2575 			vdev->regions_hpa, dev->mem
2576 			) != vdev->nregions_hpa) {
2577 
2578 			RTE_LOG(ERR, VHOST_CONFIG,
2579 				"hpa memory regions number mismatch: "
2580 				"[%d]\n", vdev->nregions_hpa);
2581 			rte_free(vdev->regions_hpa);
2582 			rte_free(vdev);
2583 			return -1;
2584 		}
2585 	}
2586 
2587 
2588 	/* Add device to main ll */
2589 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2590 	if (ll_dev == NULL) {
2591 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2592 			"of %d devices per core has been reached\n",
2593 			dev->device_fh, num_devices);
2594 		if (vdev->regions_hpa)
2595 			rte_free(vdev->regions_hpa);
2596 		rte_free(vdev);
2597 		return -1;
2598 	}
2599 	ll_dev->vdev = vdev;
2600 	add_data_ll_entry(&ll_root_used, ll_dev);
2601 	vdev->vmdq_rx_q
2602 		= dev->device_fh * (num_queues / num_devices);
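	/*
	 * Each vhost device is mapped to its own VMDq RX queue, derived
	 * from its device_fh.
	 */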
2603 
2604 	if (zero_copy) {
2605 		uint32_t index = vdev->vmdq_rx_q;
2606 		uint32_t count_in_ring, i;
2607 		struct mbuf_table *tx_q;
2608 
2609 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2610 
2611 		LOG_DEBUG(VHOST_CONFIG,
2612 			"(%"PRIu64") in new_device: mbuf count in mempool "
2613 			"before attach is: %d\n",
2614 			dev->device_fh,
2615 			rte_mempool_count(vpool_array[index].pool));
2616 		LOG_DEBUG(VHOST_CONFIG,
2617 			"(%"PRIu64") in new_device: mbuf count in ring "
2618 			"before attach is: %d\n",
2619 			dev->device_fh, count_in_ring);
2620 
2621 		/*
2622 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2623 		 */
2624 		for (i = 0; i < count_in_ring; i++)
2625 			attach_rxmbuf_zcp(dev);
2626 
2627 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2628 			"mempool after attach is: %d\n",
2629 			dev->device_fh,
2630 			rte_mempool_count(vpool_array[index].pool));
2631 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2632 			"ring after attach is: %d\n",
2633 			dev->device_fh,
2634 			rte_ring_count(vpool_array[index].ring));
2635 
2636 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2637 		tx_q->txq_id = vdev->vmdq_rx_q;
2638 
2639 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2640 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2641 
2642 			LOG_DEBUG(VHOST_CONFIG,
2643 				"(%"PRIu64") In new_device: Failed to start "
2644 				"tx queue:%d\n",
2645 				dev->device_fh, vdev->vmdq_rx_q);
2646 
2647 			mbuf_destroy_zcp(vpool);
2648 			rte_free(vdev->regions_hpa);
2649 			rte_free(vdev);
2650 			return -1;
2651 		}
2652 
2653 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2654 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2655 
2656 			LOG_DEBUG(VHOST_CONFIG,
2657 				"(%"PRIu64") In new_device: Failed to start "
2658 				"rx queue:%d\n",
2659 				dev->device_fh, vdev->vmdq_rx_q);
2660 
2661 			/* Stop the TX queue. */
2662 			if (rte_eth_dev_tx_queue_stop(ports[0],
2663 				vdev->vmdq_rx_q) != 0) {
2664 				LOG_DEBUG(VHOST_CONFIG,
2665 					"(%"PRIu64") In new_device: Failed to "
2666 					"stop tx queue:%d\n",
2667 					dev->device_fh, vdev->vmdq_rx_q);
2668 			}
2669 
2670 			mbuf_destroy_zcp(vpool);
2671 			rte_free(vdev->regions_hpa);
2672 			rte_free(vdev);
2673 			return -1;
2674 		}
2675 
2676 	}
2677 
2678 	/*reset ready flag*/
2679 	vdev->ready = DEVICE_MAC_LEARNING;
2680 	vdev->remove = 0;
2681 
2682 	/* Find a suitable lcore to add the device. */
2683 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2684 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2685 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2686 			core_add = lcore;
2687 		}
2688 	}
2689 	/* Add device to lcore ll */
2690 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2691 	if (ll_dev == NULL) {
2692 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2693 		vdev->ready = DEVICE_SAFE_REMOVE;
2694 		destroy_device(dev);
2695 		if (vdev->regions_hpa)
2696 			rte_free(vdev->regions_hpa);
2697 		rte_free(vdev);
2698 		return -1;
2699 	}
2700 	ll_dev->vdev = vdev;
2701 	vdev->coreid = core_add;
2702 
2703 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2704 
2705 	/* Initialize device stats */
2706 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2707 
2708 	/* Disable notifications. */
2709 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2710 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2711 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2712 	dev->flags |= VIRTIO_DEV_RUNNING;
2713 
2714 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2715 
2716 	return 0;
2717 }
2718 
2719 /*
2720  * These callbacks allow devices to be added to the data core when configuration
2721  * has been fully completed.
2722  */
2723 static const struct virtio_net_device_ops virtio_net_device_ops =
2724 {
2725 	.new_device =  new_device,
2726 	.destroy_device = destroy_device,
2727 };
2728 
2729 /*
2730  * This is a thread that will wake up periodically to print stats if the user
2731  * has enabled them.
2732  */
2733 static void
2734 print_stats(void)
2735 {
2736 	struct virtio_net_data_ll *dev_ll;
2737 	uint64_t tx_dropped, rx_dropped;
2738 	uint64_t tx, tx_total, rx, rx_total;
2739 	uint32_t device_fh;
2740 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2741 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2742 
2743 	while(1) {
2744 		sleep(enable_stats);
2745 
2746 		/* Clear screen and move to top left */
2747 		printf("%s%s", clr, top_left);
2748 
2749 		printf("\nDevice statistics ====================================");
2750 
2751 		dev_ll = ll_root_used;
2752 		while (dev_ll != NULL) {
2753 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2754 			tx_total = dev_statistics[device_fh].tx_total;
2755 			tx = dev_statistics[device_fh].tx;
2756 			tx_dropped = tx_total - tx;
2757 			if (zero_copy == 0) {
2758 				rx_total = rte_atomic64_read(
2759 					&dev_statistics[device_fh].rx_total_atomic);
2760 				rx = rte_atomic64_read(
2761 					&dev_statistics[device_fh].rx_atomic);
2762 			} else {
2763 				rx_total = dev_statistics[device_fh].rx_total;
2764 				rx = dev_statistics[device_fh].rx;
2765 			}
2766 			rx_dropped = rx_total - rx;
2767 
2768 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2769 					"\nTX total: 		%"PRIu64""
2770 					"\nTX dropped: 		%"PRIu64""
2771 					"\nTX successful: 		%"PRIu64""
2772 					"\nRX total: 		%"PRIu64""
2773 					"\nRX dropped: 		%"PRIu64""
2774 					"\nRX successful: 		%"PRIu64"",
2775 					device_fh,
2776 					tx_total,
2777 					tx_dropped,
2778 					tx,
2779 					rx_total,
2780 					rx_dropped,
2781 					rx);
2782 
2783 			dev_ll = dev_ll->next;
2784 		}
2785 		printf("\n======================================================\n");
2786 	}
2787 }
2788 
2789 static void
2790 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2791 	char *ring_name, uint32_t nb_mbuf)
2792 {
2793 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
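	/*
	 * roomsize is the data room each zero copy mbuf provides: the
	 * frame data buffer length expected from the guest plus the
	 * standard mbuf headroom.
	 */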
2794 	vpool_array[index].pool
2795 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2796 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2797 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2798 		rte_pktmbuf_init, NULL, socket, 0);
2799 	if (vpool_array[index].pool != NULL) {
2800 		vpool_array[index].ring
2801 			= rte_ring_create(ring_name,
2802 				rte_align32pow2(nb_mbuf + 1),
2803 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2804 		if (likely(vpool_array[index].ring != NULL)) {
2805 			LOG_DEBUG(VHOST_CONFIG,
2806 				"in setup_mempool_tbl: mbuf count in "
2807 				"mempool is: %d\n",
2808 				rte_mempool_count(vpool_array[index].pool));
2809 			LOG_DEBUG(VHOST_CONFIG,
2810 				"in setup_mempool_tbl: mbuf count in "
2811 				"ring   is: %d\n",
2812 				rte_ring_count(vpool_array[index].ring));
2813 		} else {
2814 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2815 				ring_name);
2816 		}
2817 
2818 		/* The usable buffer size excludes the mbuf headroom. */
2819 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2820 	} else {
2821 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2822 	}
2823 }
2824 
2825 
2826 /*
2827  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2828  * device is also registered here to handle the IOCTLs.
2829  */
2830 int
2831 MAIN(int argc, char *argv[])
2832 {
2833 	struct rte_mempool *mbuf_pool = NULL;
2834 	unsigned lcore_id, core_id = 0;
2835 	unsigned nb_ports, valid_num_ports;
2836 	int ret;
2837 	uint8_t portid, queue_id = 0;
2838 	static pthread_t tid;
2839 
2840 	/* init EAL */
2841 	ret = rte_eal_init(argc, argv);
2842 	if (ret < 0)
2843 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2844 	argc -= ret;
2845 	argv += ret;
2846 
2847 	/* parse app arguments */
2848 	ret = us_vhost_parse_args(argc, argv);
2849 	if (ret < 0)
2850 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2851 
2852 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2853 		if (rte_lcore_is_enabled(lcore_id))
2854 			lcore_ids[core_id ++] = lcore_id;
2855 
2856 	if (rte_lcore_count() > RTE_MAX_LCORE)
2857 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2858 
2859 	/* Set the number of switching cores available. */
2860 	num_switching_cores = rte_lcore_count()-1;
2861 
2862 	/* Get the number of physical ports. */
2863 	nb_ports = rte_eth_dev_count();
2864 	if (nb_ports > RTE_MAX_ETHPORTS)
2865 		nb_ports = RTE_MAX_ETHPORTS;
2866 
2867 	/*
2868 	 * Update the global var NUM_PORTS and global array PORTS
2869 	 * and get value of var VALID_NUM_PORTS according to system ports number
2870 	 */
2871 	valid_num_ports = check_ports_num(nb_ports);
2872 
2873 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2874 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2875 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2876 		return -1;
2877 	}
2878 
2879 	if (zero_copy == 0) {
2880 		/* Create the mbuf pool. */
2881 		mbuf_pool = rte_mempool_create(
2882 				"MBUF_POOL",
2883 				NUM_MBUFS_PER_PORT
2884 				* valid_num_ports,
2885 				MBUF_SIZE, MBUF_CACHE_SIZE,
2886 				sizeof(struct rte_pktmbuf_pool_private),
2887 				rte_pktmbuf_pool_init, NULL,
2888 				rte_pktmbuf_init, NULL,
2889 				rte_socket_id(), 0);
2890 		if (mbuf_pool == NULL)
2891 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2892 
2893 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2894 			vpool_array[queue_id].pool = mbuf_pool;
2895 
2896 		if (vm2vm_mode == VM2VM_HARDWARE) {
2897 			/* Enable VT loop back to let L2 switch to do it. */
2898 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2899 			LOG_DEBUG(VHOST_CONFIG,
2900 				"Enable loop back for L2 switch in vmdq.\n");
2901 		}
2902 	} else {
2903 		uint32_t nb_mbuf;
2904 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2905 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2906 
2907 		/*
2908 		 * Zero copy defers queue RX/TX start to the time when guest
2909 		 * finishes its startup and packet buffers from that guest are
2910 		 * available.
2911 		 */
2912 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2913 		rx_conf_default.rx_drop_en = 0;
2914 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2915 		nb_mbuf = num_rx_descriptor
2916 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2917 			+ num_switching_cores * MAX_PKT_BURST;
2918 
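		/*
		 * One RX mempool/ring pair is created per VMDq queue; nb_mbuf
		 * covers the RX descriptors plus per-core cache and burst
		 * headroom.
		 */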
2919 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2920 			snprintf(pool_name, sizeof(pool_name),
2921 				"rxmbuf_pool_%u", queue_id);
2922 			snprintf(ring_name, sizeof(ring_name),
2923 				"rxmbuf_ring_%u", queue_id);
2924 			setup_mempool_tbl(rte_socket_id(), queue_id,
2925 				pool_name, ring_name, nb_mbuf);
2926 		}
2927 
2928 		nb_mbuf = num_tx_descriptor
2929 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2930 				+ num_switching_cores * MAX_PKT_BURST;
2931 
2932 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2933 			snprintf(pool_name, sizeof(pool_name),
2934 				"txmbuf_pool_%u", queue_id);
2935 			snprintf(ring_name, sizeof(ring_name),
2936 				"txmbuf_ring_%u", queue_id);
2937 			setup_mempool_tbl(rte_socket_id(),
2938 				(queue_id + MAX_QUEUES),
2939 				pool_name, ring_name, nb_mbuf);
2940 		}
2941 
2942 		if (vm2vm_mode == VM2VM_HARDWARE) {
2943 			/* Enable VT loop back to let L2 switch to do it. */
2944 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2945 			LOG_DEBUG(VHOST_CONFIG,
2946 				"Enable loop back for L2 switch in vmdq.\n");
2947 		}
2948 	}
2949 	/* Set log level. */
2950 	rte_set_log_level(LOG_LEVEL);
2951 
2952 	/* initialize all ports */
2953 	for (portid = 0; portid < nb_ports; portid++) {
2954 		/* skip ports that are not enabled */
2955 		if ((enabled_port_mask & (1 << portid)) == 0) {
2956 			RTE_LOG(INFO, VHOST_PORT,
2957 				"Skipping disabled port %d\n", portid);
2958 			continue;
2959 		}
2960 		if (port_init(portid) != 0)
2961 			rte_exit(EXIT_FAILURE,
2962 				"Cannot initialize network ports\n");
2963 	}
2964 
2965 	/* Initialise all linked lists. */
2966 	if (init_data_ll() == -1)
2967 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2968 
2969 	/* Initialize device stats */
2970 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2971 
2972 	/* Enable stats if the user option is set. */
2973 	if (enable_stats)
2974 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2975 
2976 	/* Launch all data cores. */
2977 	if (zero_copy == 0) {
2978 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2979 			rte_eal_remote_launch(switch_worker,
2980 				mbuf_pool, lcore_id);
2981 		}
2982 	} else {
2983 		uint32_t count_in_mempool, index, i;
2984 		for (index = 0; index < 2*MAX_QUEUES; index++) {
2985 			/* For all RX and TX queues. */
2986 			count_in_mempool
2987 				= rte_mempool_count(vpool_array[index].pool);
2988 
2989 			/*
2990 			 * Transfer all un-attached mbufs from vpool.pool
2991 			 * to vpool.ring.
2992 			 */
2993 			for (i = 0; i < count_in_mempool; i++) {
2994 				struct rte_mbuf *mbuf
2995 					= __rte_mbuf_raw_alloc(
2996 						vpool_array[index].pool);
2997 				rte_ring_sp_enqueue(vpool_array[index].ring,
2998 						(void *)mbuf);
2999 			}
3000 
3001 			LOG_DEBUG(VHOST_CONFIG,
3002 				"in MAIN: mbuf count in mempool at initial "
3003 				"is: %d\n", count_in_mempool);
3004 			LOG_DEBUG(VHOST_CONFIG,
3005 				"in MAIN: mbuf count in ring at initial is:"
3006 				" %d\n",
3007 				rte_ring_count(vpool_array[index].ring));
3008 		}
3009 
3010 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3011 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3012 				lcore_id);
3013 	}
3014 
3015 	if (mergeable == 0)
3016 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3017 
3018 	/* Register CUSE device to handle IOCTLs. */
3019 	ret = rte_vhost_driver_register((char *)&dev_basename);
3020 	if (ret != 0)
3021 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3022 
3023 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3024 
3025 	/* Start CUSE session. */
3026 	rte_vhost_driver_session_start();
3027 	return 0;
3028 
3029 }
3030 
3031