xref: /dpdk/examples/vhost/main.c (revision 6630bc42449aebba33b0817ae830cbdf3a83a1b6)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
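/*
 * The formula above budgets one mbuf per RX descriptor for every possible
 * queue, plus per switching core headroom for an in-flight burst, for
 * packets queued on TX descriptors and for mbufs parked in the per-core
 * mempool cache.
 */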
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation: the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
103 
104 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
106 
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108 
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX			1
112 #define DEVICE_SAFE_REMOVE	2
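/*
 * A device starts in DEVICE_MAC_LEARNING, moves to DEVICE_RX once
 * link_vmdq() has registered its MAC/VLAN, and is set to DEVICE_SAFE_REMOVE
 * by the data core after unlink_vmdq() when removal has been requested.
 */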
113 
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117 
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121 
122 /*
123  * These two macros need refining for the legacy and DPDK based front ends:
124  * take the max vring avail descriptors/entries from the guest minus
125  * MAX_PKT_BURST, then round to a power of 2.
126  */
127 /*
128  * For the legacy front end: 128 descriptors,
129  * half for the virtio header, the other half for the mbuf data.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133 
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 		+ sizeof(struct rte_mbuf)))
137 
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
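/* Note: this also evaluates true for x == 0. */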
140 
141 #define INVALID_PORT_ID 0xFF
142 
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145 
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148 
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151 
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154 
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
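/*
 * The mask keeps only the 6 MAC address bytes of a little-endian 64-bit
 * load (see ether_addr_cmp() below).
 */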
157 
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
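/*
 * With a 64-byte cache line and 16-byte vring descriptors this works out
 * to 4 descriptors per cache line.
 */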
160 
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163 
164 /* Number of switching cores enabled. */
165 static uint32_t num_switching_cores = 0;
166 
167 /* Number of devices/queues to support. */
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170 
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to/from the guest
173  * buffers attached to the hardware descriptors. Disabled by default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177 
178 /* Number of RX/TX ring descriptors to use; only used when zero copy is enabled. */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* Maximum number of ring descriptors; ixgbe, i40e and e1000 all support 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
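/*
 * Zero copy buffer pools: the first MAX_QUEUES entries hold the RX pools
 * (indexed by VMDQ RX queue) and the second MAX_QUEUES entries hold the
 * corresponding TX pools (see virtio_tx_route_zcp()).
 */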
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223 	.rx_thresh = {
224 		.pthresh = RX_PTHRESH,
225 		.hthresh = RX_HTHRESH,
226 		.wthresh = RX_WTHRESH,
227 	},
228 	.rx_drop_en = 1,
229 };
230 
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237 	.tx_thresh = {
238 		.pthresh = TX_PTHRESH,
239 		.hthresh = TX_HTHRESH,
240 		.wthresh = TX_WTHRESH,
241 	},
242 	.tx_free_thresh = 0, /* Use PMD default values */
243 	.tx_rs_thresh = 0, /* Use PMD default values */
244 };
245 
246 /* Empty VMDQ configuration structure. Filled in programmatically. */
247 static struct rte_eth_conf vmdq_conf_default = {
248 	.rxmode = {
249 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250 		.split_hdr_size = 0,
251 		.header_split   = 0, /**< Header Split disabled */
252 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
253 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
254 		/*
255 		 * This is necessary for 1G NICs such as the I350; it fixes a bug
256 		 * where IPv4 forwarding in the guest could not forward packets
257 		 * from one virtio dev to another virtio dev.
258 		 */
259 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
262 	},
263 
264 	.txmode = {
265 		.mq_mode = ETH_MQ_TX_NONE,
266 	},
267 	.rx_adv_conf = {
268 		/*
269 		 * should be overridden separately in code with
270 		 * appropriate values
271 		 */
272 		.vmdq_rx_conf = {
273 			.nb_queue_pools = ETH_8_POOLS,
274 			.enable_default_pool = 0,
275 			.default_pool = 0,
276 			.nb_pool_maps = 0,
277 			.pool_map = {{0, 0},},
278 		},
279 	},
280 };
281 
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
285 
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
290 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
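/*
 * One VLAN tag per possible VMDQ pool; indexed by device_fh when a guest
 * device is registered in link_vmdq().
 */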
297 
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300 
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304 
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307 
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310 	unsigned len;
311 	unsigned txq_id;
312 	struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314 
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317 
318 /* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320 
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323 	unsigned char   h_dest[ETH_ALEN];
324 	unsigned char   h_source[ETH_ALEN];
325 	__be16          h_vlan_proto;
326 	__be16          h_vlan_TCI;
327 	__be16          h_vlan_encapsulated_proto;
328 };
329 
330 /* IPv4 Header */
331 struct ipv4_hdr {
332 	uint8_t  version_ihl;		/**< version and header length */
333 	uint8_t  type_of_service;	/**< type of service */
334 	uint16_t total_length;		/**< length of packet */
335 	uint16_t packet_id;		/**< packet ID */
336 	uint16_t fragment_offset;	/**< fragmentation offset */
337 	uint8_t  time_to_live;		/**< time to live */
338 	uint8_t  next_proto_id;		/**< protocol ID */
339 	uint16_t hdr_checksum;		/**< header checksum */
340 	uint32_t src_addr;		/**< source address */
341 	uint32_t dst_addr;		/**< destination address */
342 } __attribute__((__packed__));
343 
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347 
348 /* Per-device statistics struct */
349 struct device_statistics {
350 	uint64_t tx_total;
351 	rte_atomic64_t rx_total_atomic;
352 	uint64_t rx_total;
353 	uint64_t tx;
354 	rte_atomic64_t rx_atomic;
355 	uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358 
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366 	struct rte_eth_vmdq_rx_conf conf;
367 	unsigned i;
368 
369 	memset(&conf, 0, sizeof(conf));
370 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371 	conf.nb_pool_maps = num_devices;
372 	conf.enable_loop_back =
373 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374 
375 	for (i = 0; i < conf.nb_pool_maps; i++) {
376 		conf.pool_map[i].vlan_id = vlan_tags[i];
377 		conf.pool_map[i].pools = (1UL << i);
378 	}
379 
380 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383 	return 0;
384 }
385 
386 /*
387  * Validate the device number against the max pool number obtained from
388  * dev_info. If the device number is invalid, print an error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394 	if (num_devices > max_nb_devices) {
395 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396 		return -1;
397 	}
398 	return 0;
399 }
400 
401 /*
402  * Initialises a given port using global settings, with the RX buffers
403  * coming from the vpool_array mempools.
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408 	struct rte_eth_dev_info dev_info;
409 	struct rte_eth_conf port_conf;
410 	uint16_t rx_rings, tx_rings;
411 	uint16_t rx_ring_size, tx_ring_size;
412 	int retval;
413 	uint16_t q;
414 
415 	/* The max pool number from dev_info will be used to validate the pool number specified on the command line. */
416 	rte_eth_dev_info_get(port, &dev_info);
417 
418 	/* Configure the number of supported virtio devices based on VMDQ limits. */
419 	num_devices = dev_info.max_vmdq_pools;
420 	num_queues = dev_info.max_rx_queues;
421 
422 	if (zero_copy) {
423 		rx_ring_size = num_rx_descriptor;
424 		tx_ring_size = num_tx_descriptor;
425 		tx_rings = dev_info.max_tx_queues;
426 	} else {
427 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429 		tx_rings = (uint16_t)rte_lcore_count();
430 	}
431 
432 	retval = validate_num_devices(MAX_DEVICES);
433 	if (retval < 0)
434 		return retval;
435 
436 	/* Get port configuration. */
437 	retval = get_eth_conf(&port_conf, num_devices);
438 	if (retval < 0)
439 		return retval;
440 
441 	if (port >= rte_eth_dev_count()) return -1;
442 
443 	rx_rings = (uint16_t)num_queues;
444 	/* Configure ethernet device. */
445 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446 	if (retval != 0)
447 		return retval;
448 
449 	/* Setup the queues. */
450 	for (q = 0; q < rx_rings; q ++) {
451 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 						rte_eth_dev_socket_id(port), &rx_conf_default,
453 						vpool_array[q].pool);
454 		if (retval < 0)
455 			return retval;
456 	}
457 	for (q = 0; q < tx_rings; q ++) {
458 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459 						rte_eth_dev_socket_id(port), &tx_conf_default);
460 		if (retval < 0)
461 			return retval;
462 	}
463 
464 	/* Start the device. */
465 	retval  = rte_eth_dev_start(port);
466 	if (retval < 0) {
467 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468 		return retval;
469 	}
470 
471 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475 			(unsigned)port,
476 			vmdq_ports_eth_addr[port].addr_bytes[0],
477 			vmdq_ports_eth_addr[port].addr_bytes[1],
478 			vmdq_ports_eth_addr[port].addr_bytes[2],
479 			vmdq_ports_eth_addr[port].addr_bytes[3],
480 			vmdq_ports_eth_addr[port].addr_bytes[4],
481 			vmdq_ports_eth_addr[port].addr_bytes[5]);
482 
483 	return 0;
484 }
485 
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492 	/* The basename (plus terminating NUL) must fit in the buffer. */
493 
494 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
495 		return -1;
496 	else
497 		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498 
499 	return 0;
500 }
501 
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508 	char *end = NULL;
509 	unsigned long pm;
510 
511 	errno = 0;
512 
513 	/* parse hexadecimal string */
514 	pm = strtoul(portmask, &end, 16);
515 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516 		return -1;
517 
518 	if (pm == 0)
519 		return -1;
520 
521 	return pm;
522 
523 }
524 
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531 	char *end = NULL;
532 	unsigned long num;
533 
534 	errno = 0;
535 
536 	/* parse unsigned int string */
537 	num = strtoul(q_arg, &end, 10);
538 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539 		return -1;
540 
541 	if (num > max_valid_value)
542 		return -1;
543 
544 	return num;
545 
546 }
547 
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555 	"		--vm2vm [0|1|2]\n"
556 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557 	"		--dev-basename <name>\n"
558 	"		--nb-devices ND\n"
559 	"		-p PORTMASK: Set mask for ports to be used by application\n"
560 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if the destination queue is full\n"
562 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
563 	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
564 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566 	"		--dev-basename: The basename to be used for the character device.\n"
567 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
568 			"zero copy\n"
569 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
570 			"used only when zero copy is enabled.\n"
571 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
572 			"used only when zero copy is enabled.\n",
573 	       prgname);
574 }
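/*
 * Example invocation (hypothetical binary name, core mask and options;
 * adjust for the local build and setup):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename usvhost --stats 2
 */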
575 
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582 	int opt, ret;
583 	int option_index;
584 	unsigned i;
585 	const char *prgname = argv[0];
586 	static struct option long_option[] = {
587 		{"vm2vm", required_argument, NULL, 0},
588 		{"rx-retry", required_argument, NULL, 0},
589 		{"rx-retry-delay", required_argument, NULL, 0},
590 		{"rx-retry-num", required_argument, NULL, 0},
591 		{"mergeable", required_argument, NULL, 0},
592 		{"stats", required_argument, NULL, 0},
593 		{"dev-basename", required_argument, NULL, 0},
594 		{"zero-copy", required_argument, NULL, 0},
595 		{"rx-desc-num", required_argument, NULL, 0},
596 		{"tx-desc-num", required_argument, NULL, 0},
597 		{NULL, 0, 0, 0},
598 	};
599 
600 	/* Parse command line */
601 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
602 		switch (opt) {
603 		/* Portmask */
604 		case 'p':
605 			enabled_port_mask = parse_portmask(optarg);
606 			if (enabled_port_mask == 0) {
607 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 				us_vhost_usage(prgname);
609 				return -1;
610 			}
611 			break;
612 
613 		case 0:
614 			/* Enable/disable vm2vm comms. */
615 			if (!strncmp(long_option[option_index].name, "vm2vm",
616 				MAX_LONG_OPT_SZ)) {
617 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618 				if (ret == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG,
620 						"Invalid argument for "
621 						"vm2vm [0|1|2]\n");
622 					us_vhost_usage(prgname);
623 					return -1;
624 				} else {
625 					vm2vm_mode = (vm2vm_type)ret;
626 				}
627 			}
628 
629 			/* Enable/disable retries on RX. */
630 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631 				ret = parse_num_opt(optarg, 1);
632 				if (ret == -1) {
633 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					enable_retry = ret;
638 				}
639 			}
640 
641 			/* Specify the retries delay time (in useconds) on RX. */
642 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643 				ret = parse_num_opt(optarg, INT32_MAX);
644 				if (ret == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646 					us_vhost_usage(prgname);
647 					return -1;
648 				} else {
649 					burst_rx_delay_time = ret;
650 				}
651 			}
652 
653 			/* Specify the retries number on RX. */
654 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655 				ret = parse_num_opt(optarg, INT32_MAX);
656 				if (ret == -1) {
657 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658 					us_vhost_usage(prgname);
659 					return -1;
660 				} else {
661 					burst_rx_retry_num = ret;
662 				}
663 			}
664 
665 			/* Enable/disable RX mergeable buffers. */
666 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667 				ret = parse_num_opt(optarg, 1);
668 				if (ret == -1) {
669 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670 					us_vhost_usage(prgname);
671 					return -1;
672 				} else {
673 					mergeable = !!ret;
674 					if (ret) {
675 						vmdq_conf_default.rxmode.jumbo_frame = 1;
676 						vmdq_conf_default.rxmode.max_rx_pkt_len
677 							= JUMBO_FRAME_MAX_SIZE;
678 					}
679 				}
680 			}
681 
682 			/* Enable/disable stats. */
683 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684 				ret = parse_num_opt(optarg, INT32_MAX);
685 				if (ret == -1) {
686 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687 					us_vhost_usage(prgname);
688 					return -1;
689 				} else {
690 					enable_stats = ret;
691 				}
692 			}
693 
694 			/* Set character device basename. */
695 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696 				if (us_vhost_parse_basename(optarg) == -1) {
697 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698 					us_vhost_usage(prgname);
699 					return -1;
700 				}
701 			}
702 
703 			/* Enable/disable rx/tx zero copy. */
704 			if (!strncmp(long_option[option_index].name,
705 				"zero-copy", MAX_LONG_OPT_SZ)) {
706 				ret = parse_num_opt(optarg, 1);
707 				if (ret == -1) {
708 					RTE_LOG(INFO, VHOST_CONFIG,
709 						"Invalid argument"
710 						" for zero-copy [0|1]\n");
711 					us_vhost_usage(prgname);
712 					return -1;
713 				} else
714 					zero_copy = ret;
715 
716 				if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718 					RTE_LOG(ERR, VHOST_CONFIG, "Before running the "
719 					"zero copy vhost application, please "
720 					"disable RTE_MBUF_REFCNT\n"
721 					"in the config file and then rebuild the DPDK "
722 					"core library!\n"
723 					"Otherwise, please disable the zero copy "
724 					"option on the command line!\n");
725 					return -1;
726 #endif
727 				}
728 			}
729 
730 			/* Specify the descriptor number on RX. */
731 			if (!strncmp(long_option[option_index].name,
732 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
733 				ret = parse_num_opt(optarg, MAX_RING_DESC);
734 				if ((ret == -1) || (!POWEROF2(ret))) {
735 					RTE_LOG(INFO, VHOST_CONFIG,
736 					"Invalid argument for rx-desc-num [0-N], "
737 					"power of 2 required.\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					num_rx_descriptor = ret;
742 				}
743 			}
744 
745 			/* Specify the descriptor number on TX. */
746 			if (!strncmp(long_option[option_index].name,
747 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
748 				ret = parse_num_opt(optarg, MAX_RING_DESC);
749 				if ((ret == -1) || (!POWEROF2(ret))) {
750 					RTE_LOG(INFO, VHOST_CONFIG,
751 					"Invalid argument for tx-desc-num [0-N], "
752 					"power of 2 required.\n");
753 					us_vhost_usage(prgname);
754 					return -1;
755 				} else {
756 					num_tx_descriptor = ret;
757 				}
758 			}
759 
760 			break;
761 
762 			/* Invalid option - print options. */
763 		default:
764 			us_vhost_usage(prgname);
765 			return -1;
766 		}
767 	}
768 
769 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770 		if (enabled_port_mask & (1 << i))
771 			ports[num_ports++] = (uint8_t)i;
772 	}
773 
774 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
776 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
777 		return -1;
778 	}
779 
780 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781 		RTE_LOG(INFO, VHOST_PORT,
782 			"Vhost zero copy doesn't support software vm2vm, "
783 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
784 		return -1;
785 	}
786 
787 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788 		RTE_LOG(INFO, VHOST_PORT,
789 			"Vhost zero copy doesn't support jumbo frame, "
790 			"please specify '--mergeable 0' to disable the "
791 			"mergeable feature.\n");
792 		return -1;
793 	}
794 
795 	return 0;
796 }
797 
798 /*
799  * Update the global variable num_ports and the ports array according to the number of system ports,
800  * and return the number of valid ports.
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804 	unsigned valid_num_ports = num_ports;
805 	unsigned portid;
806 
807 	if (num_ports > nb_ports) {
808 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809 			num_ports, nb_ports);
810 		num_ports = nb_ports;
811 	}
812 
813 	for (portid = 0; portid < num_ports; portid ++) {
814 		if (ports[portid] >= nb_ports) {
815 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816 				ports[portid], (nb_ports - 1));
817 			ports[portid] = INVALID_PORT_ID;
818 			valid_num_ports--;
819 		}
820 	}
821 	return valid_num_ports;
822 }
823 
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not affected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {																\
830 	char *pkt_addr = (char*)(addr);																					\
831 	unsigned int index;																								\
832 	char packet[MAX_PRINT_BUFF];																					\
833 																													\
834 	if ((header))																									\
835 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
836 	else																											\
837 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
838 	for (index = 0; index < (size); index++) {																		\
839 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
840 			"%02hhx ", pkt_addr[index]);																			\
841 	}																												\
842 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
843 																													\
844 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
845 } while(0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
848 #endif
849 
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
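/*
 * The returned addr_type reports whether the buffer is physically contiguous
 * within one sub-region (PHYS_ADDR_CONTINUOUS), crosses a sub-region boundary
 * (PHYS_ADDR_CROSS_SUBREG) or could not be translated (PHYS_ADDR_INVALID).
 */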
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856 	uint32_t buf_len, hpa_type *addr_type)
857 {
858 	struct virtio_memory_regions_hpa *region;
859 	uint32_t regionidx;
860 	uint64_t vhost_pa = 0;
861 
862 	*addr_type = PHYS_ADDR_INVALID;
863 
864 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865 		region = &vdev->regions_hpa[regionidx];
866 		if ((guest_pa >= region->guest_phys_address) &&
867 			(guest_pa <= region->guest_phys_address_end)) {
868 			vhost_pa = region->host_phys_addr_offset + guest_pa;
869 			if (likely((guest_pa + buf_len - 1)
870 				<= region->guest_phys_address_end))
871 				*addr_type = PHYS_ADDR_CONTINUOUS;
872 			else
873 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
874 			break;
875 		}
876 	}
877 
878 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880 		(void *)(uintptr_t)vhost_pa);
881 
882 	return vhost_pa;
883 }
884 
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
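/*
 * Both addresses are loaded as 64-bit words and XORed; MAC_ADDR_CMP masks
 * off the two bytes read beyond the 6-byte MAC so only the address itself
 * is compared.
 */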
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
891 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
893 
894 /*
895  * This function learns the MAC address of the device and registers it, along with a
896  * VLAN tag, with the VMDQ.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901 	struct ether_hdr *pkt_hdr;
902 	struct virtio_net_data_ll *dev_ll;
903 	struct virtio_net *dev = vdev->dev;
904 	int i, ret;
905 
906 	/* Learn MAC address of guest device from packet */
907 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908 
909 	dev_ll = ll_root_used;
910 
911 	while (dev_ll != NULL) {
912 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914 			return -1;
915 		}
916 		dev_ll = dev_ll->next;
917 	}
918 
919 	for (i = 0; i < ETHER_ADDR_LEN; i++)
920 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921 
922 	/* vlan_tag currently uses the device_id. */
923 	vdev->vlan_tag = vlan_tags[dev->device_fh];
924 
925 	/* Print out VMDQ registration info. */
926 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927 		dev->device_fh,
928 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931 		vdev->vlan_tag);
932 
933 	/* Register the MAC address. */
934 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935 	if (ret)
936 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937 					dev->device_fh);
938 
939 	/* Enable stripping of the vlan tag as we handle routing. */
940 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941 
942 	/* Set device as ready for RX. */
943 	vdev->ready = DEVICE_RX;
944 
945 	return 0;
946 }
947 
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955 	unsigned i = 0;
956 	unsigned rx_count;
957 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958 
959 	if (vdev->ready == DEVICE_RX) {
960 		/*clear MAC and VLAN settings*/
961 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962 		for (i = 0; i < 6; i++)
963 			vdev->mac_address.addr_bytes[i] = 0;
964 
965 		vdev->vlan_tag = 0;
966 
967 		/*Clear out the receive buffers*/
968 		rx_count = rte_eth_rx_burst(ports[0],
969 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970 
971 		while (rx_count) {
972 			for (i = 0; i < rx_count; i++)
973 				rte_pktmbuf_free(pkts_burst[i]);
974 
975 			rx_count = rte_eth_rx_burst(ports[0],
976 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977 		}
978 
979 		vdev->ready = DEVICE_MAC_LEARNING;
980 	}
981 }
982 
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that device's RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990 	struct virtio_net_data_ll *dev_ll;
991 	struct ether_hdr *pkt_hdr;
992 	uint64_t ret = 0;
993 	struct virtio_net *dev = vdev->dev;
994 	struct virtio_net *tdev; /* destination virtio device */
995 
996 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997 
998 	/*get the used devices list*/
999 	dev_ll = ll_root_used;
1000 
1001 	while (dev_ll != NULL) {
1002 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003 				          &dev_ll->vdev->mac_address)) {
1004 
1005 			/* Drop the packet if the TX packet is destined for the TX device. */
1006 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008 							dev->device_fh);
1009 				return 0;
1010 			}
1011 			tdev = dev_ll->vdev->dev;
1012 
1013 
1014 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015 
1016 			if (unlikely(dev_ll->vdev->remove)) {
1017 				/*drop the packet if the device is marked for removal*/
1018 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019 			} else {
1020 				/*send the packet to the local virtio device*/
1021 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022 				if (enable_stats) {
1023 					rte_atomic64_add(
1024 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1025 					1);
1026 					rte_atomic64_add(
1027 					&dev_statistics[tdev->device_fh].rx_atomic,
1028 					ret);
1029 					dev_statistics[tdev->device_fh].tx_total++;
1030 					dev_statistics[tdev->device_fh].tx += ret;
1031 				}
1032 			}
1033 
1034 			return 0;
1035 		}
1036 		dev_ll = dev_ll->next;
1037 	}
1038 
1039 	return -1;
1040 }
1041 
1042 /*
1043  * Check if the destination MAC of a packet belongs to a local VM,
1044  * and if so, get its VLAN tag and the length offset.
1045  */
1046 static inline int __attribute__((always_inline))
1047 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1048 	uint32_t *offset, uint16_t *vlan_tag)
1049 {
1050 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1051 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052 
1053 	while (dev_ll != NULL) {
1054 		if ((dev_ll->vdev->ready == DEVICE_RX)
1055 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1056 		&dev_ll->vdev->mac_address)) {
1057 			/*
1058 			 * Drop the packet if the TX packet is
1059 			 * destined for the TX device.
1060 			 */
1061 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062 				LOG_DEBUG(VHOST_DATA,
1063 				"(%"PRIu64") TX: Source and destination"
1064 				" MAC addresses are the same. Dropping "
1065 				"packet.\n",
1066 				dev_ll->vdev->dev->device_fh);
1067 				return -1;
1068 			}
1069 
1070 			/*
1071 			 * HW VLAN stripping reduces the packet length
1072 			 * by the length of the VLAN tag, so the packet
1073 			 * length needs to be restored by adding it back.
1074 			 */
1075 			*offset = VLAN_HLEN;
1076 			*vlan_tag =
1077 			(uint16_t)
1078 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1079 
1080 			LOG_DEBUG(VHOST_DATA,
1081 			"(%"PRIu64") TX: pkt to local VM device id:"
1082 			"(%"PRIu64") vlan tag: %d.\n",
1083 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1084 			*vlan_tag);
1085 
1086 			break;
1087 		}
1088 		dev_ll = dev_ll->next;
1089 	}
1090 	return 0;
1091 }
1092 
1093 /*
1094  * This function routes the TX packet to the correct interface. This may be a local device
1095  * or the physical port.
1096  */
1097 static inline void __attribute__((always_inline))
1098 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1099 {
1100 	struct mbuf_table *tx_q;
1101 	struct rte_mbuf **m_table;
1102 	unsigned len, ret, offset = 0;
1103 	const uint16_t lcore_id = rte_lcore_id();
1104 	struct virtio_net *dev = vdev->dev;
1105 
1106 	/*check if destination is local VM*/
1107 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1108 		rte_pktmbuf_free(m);
1109 		return;
1110 	}
1111 
1112 	if (vm2vm_mode == VM2VM_HARDWARE) {
1113 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1114 			offset > rte_pktmbuf_tailroom(m)) {
1115 			rte_pktmbuf_free(m);
1116 			return;
1117 		}
1118 	}
1119 
1120 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1121 
1122 	/*Add packet to the port tx queue*/
1123 	tx_q = &lcore_tx_queue[lcore_id];
1124 	len = tx_q->len;
1125 
1126 	m->ol_flags = PKT_TX_VLAN_PKT;
1127 
1128 	m->data_len += offset;
1129 	m->pkt_len += offset;
1130 
1131 	m->vlan_tci = vlan_tag;
1132 
1133 	tx_q->m_table[len] = m;
1134 	len++;
1135 	if (enable_stats) {
1136 		dev_statistics[dev->device_fh].tx_total++;
1137 		dev_statistics[dev->device_fh].tx++;
1138 	}
1139 
1140 	if (unlikely(len == MAX_PKT_BURST)) {
1141 		m_table = (struct rte_mbuf **)tx_q->m_table;
1142 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1143 		/* Free any buffers not handled by TX and update the port stats. */
1144 		if (unlikely(ret < len)) {
1145 			do {
1146 				rte_pktmbuf_free(m_table[ret]);
1147 			} while (++ret < len);
1148 		}
1149 
1150 		len = 0;
1151 	}
1152 
1153 	tx_q->len = len;
1154 	return;
1155 }
1156 /*
1157  * This function is called by each data core. It handles all RX/TX registered with the
1158  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1159  * with all devices in the main linked list.
1160  */
1161 static int
1162 switch_worker(__attribute__((unused)) void *arg)
1163 {
1164 	struct rte_mempool *mbuf_pool = arg;
1165 	struct virtio_net *dev = NULL;
1166 	struct vhost_dev *vdev = NULL;
1167 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1168 	struct virtio_net_data_ll *dev_ll;
1169 	struct mbuf_table *tx_q;
1170 	volatile struct lcore_ll_info *lcore_ll;
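	/* TX drain interval converted from microseconds to TSC cycles. */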
1171 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1172 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1173 	unsigned ret, i;
1174 	const uint16_t lcore_id = rte_lcore_id();
1175 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1176 	uint16_t rx_count = 0;
1177 	uint16_t tx_count;
1178 	uint32_t retry = 0;
1179 
1180 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1181 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1182 	prev_tsc = 0;
1183 
1184 	tx_q = &lcore_tx_queue[lcore_id];
1185 	for (i = 0; i < num_cores; i ++) {
1186 		if (lcore_ids[i] == lcore_id) {
1187 			tx_q->txq_id = i;
1188 			break;
1189 		}
1190 	}
1191 
1192 	while(1) {
1193 		cur_tsc = rte_rdtsc();
1194 		/*
1195 		 * TX burst queue drain
1196 		 */
1197 		diff_tsc = cur_tsc - prev_tsc;
1198 		if (unlikely(diff_tsc > drain_tsc)) {
1199 
1200 			if (tx_q->len) {
1201 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1202 
1203 				/*Tx any packets in the queue*/
1204 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1205 									   (struct rte_mbuf **)tx_q->m_table,
1206 									   (uint16_t)tx_q->len);
1207 				if (unlikely(ret < tx_q->len)) {
1208 					do {
1209 						rte_pktmbuf_free(tx_q->m_table[ret]);
1210 					} while (++ret < tx_q->len);
1211 				}
1212 
1213 				tx_q->len = 0;
1214 			}
1215 
1216 			prev_tsc = cur_tsc;
1217 
1218 		}
1219 
1220 		rte_prefetch0(lcore_ll->ll_root_used);
1221 		/*
1222 		 * Inform the configuration core that we have exited the linked list and that no devices are
1223 		 * in use if requested.
1224 		 */
1225 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1226 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1227 
1228 		/*
1229 		 * Process devices
1230 		 */
1231 		dev_ll = lcore_ll->ll_root_used;
1232 
1233 		while (dev_ll != NULL) {
1234 			/*get virtio device ID*/
1235 			vdev = dev_ll->vdev;
1236 			dev = vdev->dev;
1237 
1238 			if (unlikely(vdev->remove)) {
1239 				dev_ll = dev_ll->next;
1240 				unlink_vmdq(vdev);
1241 				vdev->ready = DEVICE_SAFE_REMOVE;
1242 				continue;
1243 			}
1244 			if (likely(vdev->ready == DEVICE_RX)) {
1245 				/*Handle guest RX*/
1246 				rx_count = rte_eth_rx_burst(ports[0],
1247 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1248 
1249 				if (rx_count) {
1250 					/*
1251 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1252 					* Note that MAX_PKT_BURST must be less than the virtio queue size.
1253 					*/
1254 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1255 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1256 							rte_delay_us(burst_rx_delay_time);
1257 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1258 								break;
1259 						}
1260 					}
1261 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1262 					if (enable_stats) {
1263 						rte_atomic64_add(
1264 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1265 						rx_count);
1266 						rte_atomic64_add(
1267 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1268 					}
1269 					while (likely(rx_count)) {
1270 						rx_count--;
1271 						rte_pktmbuf_free(pkts_burst[rx_count]);
1272 					}
1273 
1274 				}
1275 			}
1276 
1277 			if (likely(!vdev->remove)) {
1278 				/* Handle guest TX*/
1279 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1280 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1281 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1282 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1283 						while (tx_count--)
1284 							rte_pktmbuf_free(pkts_burst[tx_count]);
1285 					}
1286 				}
1287 				while (tx_count)
1288 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1289 			}
1290 
1291 			/*move to the next device in the list*/
1292 			dev_ll = dev_ll->next;
1293 		}
1294 	}
1295 
1296 	return 0;
1297 }
1298 
1299 /*
1300  * This function gets the number of available ring entries for zero copy rx.
1301  * Only one thread will call this function for a particular virtio device,
1302  * so it is designed as a non-thread-safe function.
1303  */
1304 static inline uint32_t __attribute__((always_inline))
1305 get_available_ring_num_zcp(struct virtio_net *dev)
1306 {
1307 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1308 	uint16_t avail_idx;
1309 
1310 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1311 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1312 }
1313 
1314 /*
1315  * This function reserves available ring indexes for zero copy rx;
1316  * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
1317  * Only one thread will call this function for a particular virtio device,
1318  * so it is designed as a non-thread-safe function.
1319  */
1320 static inline uint32_t __attribute__((always_inline))
1321 get_available_ring_index_zcp(struct virtio_net *dev,
1322 	uint16_t *res_base_idx, uint32_t count)
1323 {
1324 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1325 	uint16_t avail_idx;
1326 	uint32_t retry = 0;
1327 	uint16_t free_entries;
1328 
1329 	*res_base_idx = vq->last_used_idx_res;
1330 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1331 	free_entries = (avail_idx - *res_base_idx);
1332 
1333 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1334 			"avail idx: %d, "
1335 			"res base idx:%d, free entries:%d\n",
1336 			dev->device_fh, avail_idx, *res_base_idx,
1337 			free_entries);
1338 
1339 	/*
1340 	 * If retry is enabled and the queue is full then we wait
1341 	 * and retry to avoid packet loss.
1342 	 */
1343 	if (enable_retry && unlikely(count > free_entries)) {
1344 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1345 			rte_delay_us(burst_rx_delay_time);
1346 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1347 			free_entries = (avail_idx - *res_base_idx);
1348 			if (count <= free_entries)
1349 				break;
1350 		}
1351 	}
1352 
1353 	/*check that we have enough buffers*/
1354 	if (unlikely(count > free_entries))
1355 		count = free_entries;
1356 
1357 	if (unlikely(count == 0)) {
1358 		LOG_DEBUG(VHOST_DATA,
1359 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1360 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1361 			dev->device_fh, avail_idx,
1362 			*res_base_idx, free_entries);
1363 		return 0;
1364 	}
1365 
1366 	vq->last_used_idx_res = *res_base_idx + count;
1367 
1368 	return count;
1369 }
1370 
1371 /*
1372  * This function puts a descriptor back on the used list.
1373  */
1374 static inline void __attribute__((always_inline))
1375 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1376 {
1377 	uint16_t res_cur_idx = vq->last_used_idx;
1378 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1379 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1380 	rte_compiler_barrier();
1381 	*(volatile uint16_t *)&vq->used->idx += 1;
1382 	vq->last_used_idx += 1;
1383 
1384 	/* Kick the guest if necessary. */
1385 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1386 		eventfd_write((int)vq->kickfd, 1);
1387 }
1388 
1389 /*
1390  * This function gets an available descriptor from the virtio vring and an
1391  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1392  * to adjust the offset for buff_addr and phys_addr according to the PMD
1393  * implementation, otherwise the frame data may end up at the wrong location in the mbuf.
1394  */
1395 static inline void __attribute__((always_inline))
1396 attach_rxmbuf_zcp(struct virtio_net *dev)
1397 {
1398 	uint16_t res_base_idx, desc_idx;
1399 	uint64_t buff_addr, phys_addr;
1400 	struct vhost_virtqueue *vq;
1401 	struct vring_desc *desc;
1402 	struct rte_mbuf *mbuf = NULL;
1403 	struct vpool *vpool;
1404 	hpa_type addr_type;
1405 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1406 
1407 	vpool = &vpool_array[vdev->vmdq_rx_q];
1408 	vq = dev->virtqueue[VIRTIO_RXQ];
1409 
1410 	do {
1411 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1412 				1) != 1))
1413 			return;
1414 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1415 
1416 		desc = &vq->desc[desc_idx];
1417 		if (desc->flags & VRING_DESC_F_NEXT) {
1418 			desc = &vq->desc[desc->next];
1419 			buff_addr = gpa_to_vva(dev, desc->addr);
1420 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1421 					&addr_type);
1422 		} else {
1423 			buff_addr = gpa_to_vva(dev,
1424 					desc->addr + vq->vhost_hlen);
1425 			phys_addr = gpa_to_hpa(vdev,
1426 					desc->addr + vq->vhost_hlen,
1427 					desc->len, &addr_type);
1428 		}
1429 
1430 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1431 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1432 				" address found when attaching RX frame buffer"
1433 				" address!\n", dev->device_fh);
1434 			put_desc_to_used_list_zcp(vq, desc_idx);
1435 			continue;
1436 		}
1437 
1438 		/*
1439 		 * Check if the frame buffer address from guest crosses
1440 		 * sub-region or not.
1441 		 */
1442 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1443 			RTE_LOG(ERR, VHOST_DATA,
1444 				"(%"PRIu64") Frame buffer address cross "
1445 				"sub-regioin found when attaching RX frame "
1446 				"buffer address!\n",
1447 				dev->device_fh);
1448 			put_desc_to_used_list_zcp(vq, desc_idx);
1449 			continue;
1450 		}
1451 	} while (unlikely(phys_addr == 0));
1452 
1453 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1454 	if (unlikely(mbuf == NULL)) {
1455 		LOG_DEBUG(VHOST_DATA,
1456 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1457 			"ring_sc_dequeue fail.\n",
1458 			dev->device_fh);
1459 		put_desc_to_used_list_zcp(vq, desc_idx);
1460 		return;
1461 	}
1462 
1463 	if (unlikely(vpool->buf_size > desc->len)) {
1464 		LOG_DEBUG(VHOST_DATA,
1465 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1466 			"length(%d) of descriptor idx: %d less than room "
1467 			"size required: %d\n",
1468 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1469 		put_desc_to_used_list_zcp(vq, desc_idx);
1470 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1471 		return;
1472 	}
1473 
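	/*
	 * Re-point the mbuf at the guest buffer: rewind buf_addr/buf_physaddr by
	 * RTE_PKTMBUF_HEADROOM so that data_off lands exactly on the descriptor
	 * address, and stash the descriptor index in the first four bytes after
	 * the mbuf structure (the mempool element's original headroom).
	 */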
1474 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1475 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1476 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1477 	mbuf->data_len = desc->len;
1478 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1479 
1480 	LOG_DEBUG(VHOST_DATA,
1481 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1482 		"descriptor idx:%d\n",
1483 		dev->device_fh, res_base_idx, desc_idx);
1484 
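	/*
	 * Presumably the prepared mbuf is returned straight to the mempool here
	 * so that the PMD RX path hands out mbufs that already point at guest
	 * buffers.
	 */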
1485 	__rte_mbuf_raw_free(mbuf);
1486 
1487 	return;
1488 }
1489 
1490 /*
1491  * Detach an attached packet mbuf -
1492  *  - restore original mbuf address and length values.
1493  *  - reset pktmbuf data and data_len to their default values.
1494  *  All other fields of the given packet mbuf will be left intact.
1495  *
1496  * @param m
1497  *   The attached packet mbuf.
1498  */
1499 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1500 {
1501 	const struct rte_mempool *mp = m->pool;
1502 	void *buf = RTE_MBUF_TO_BADDR(m);
1503 	uint32_t buf_ofs;
1504 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1505 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1506 
1507 	m->buf_addr = buf;
1508 	m->buf_len = (uint16_t)buf_len;
1509 
1510 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1511 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1512 	m->data_off = buf_ofs;
1513 
1514 	m->data_len = 0;
1515 }
1516 
1517 /*
1518  * This function is called after packets have been transmitted. It fetches mbufs
1519  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1520  * the used index and kicks the guest if necessary.
1521  */
1522 static inline uint32_t __attribute__((always_inline))
1523 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1524 {
1525 	struct rte_mbuf *mbuf;
1526 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1527 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1528 	uint32_t index = 0;
1529 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1530 
1531 	LOG_DEBUG(VHOST_DATA,
1532 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1533 		"clean is: %d\n",
1534 		dev->device_fh, mbuf_count);
1535 	LOG_DEBUG(VHOST_DATA,
1536 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1537 		"clean  is : %d\n",
1538 		dev->device_fh, rte_ring_count(vpool->ring));
1539 
1540 	for (index = 0; index < mbuf_count; index++) {
1541 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1542 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1543 			pktmbuf_detach_zcp(mbuf);
1544 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1545 
1546 		/* Update used index buffer information. */
1547 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1548 		vq->used->ring[used_idx].len = 0;
1549 
1550 		used_idx = (used_idx + 1) & (vq->size - 1);
1551 	}
1552 
1553 	LOG_DEBUG(VHOST_DATA,
1554 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1555 		"clean is: %d\n",
1556 		dev->device_fh, rte_mempool_count(vpool->pool));
1557 	LOG_DEBUG(VHOST_DATA,
1558 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1559 		"clean  is : %d\n",
1560 		dev->device_fh, rte_ring_count(vpool->ring));
1561 	LOG_DEBUG(VHOST_DATA,
1562 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1563 		"vq->last_used_idx:%d\n",
1564 		dev->device_fh, vq->last_used_idx);
1565 
1566 	vq->last_used_idx += mbuf_count;
1567 
1568 	LOG_DEBUG(VHOST_DATA,
1569 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1570 		"vq->last_used_idx:%d\n",
1571 		dev->device_fh, vq->last_used_idx);
1572 
1573 	rte_compiler_barrier();
1574 
1575 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1576 
1577 	/* Kick guest if required. */
1578 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1579 		eventfd_write((int)vq->kickfd, 1);
1580 
1581 	return 0;
1582 }
1583 
1584 /*
1585  * This function is called when a virtio device is destroyed.
1586  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1587  */
1588 static void mbuf_destroy_zcp(struct vpool *vpool)
1589 {
1590 	struct rte_mbuf *mbuf = NULL;
1591 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1592 
1593 	LOG_DEBUG(VHOST_CONFIG,
1594 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1595 		"mbuf_destroy_zcp is: %d\n",
1596 		mbuf_count);
1597 	LOG_DEBUG(VHOST_CONFIG,
1598 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1599 		"mbuf_destroy_zcp  is : %d\n",
1600 		rte_ring_count(vpool->ring));
1601 
1602 	for (index = 0; index < mbuf_count; index++) {
1603 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1604 		if (likely(mbuf != NULL)) {
1605 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1606 				pktmbuf_detach_zcp(mbuf);
1607 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1608 		}
1609 	}
1610 
1611 	LOG_DEBUG(VHOST_CONFIG,
1612 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1613 		"mbuf_destroy_zcp is: %d\n",
1614 		rte_mempool_count(vpool->pool));
1615 	LOG_DEBUG(VHOST_CONFIG,
1616 		"in mbuf_destroy_zcp: mbuf count in ring after "
1617 		"mbuf_destroy_zcp is : %d\n",
1618 		rte_ring_count(vpool->ring));
1619 }
1620 
1621 /*
1622  * This function hands received zero copy packets to the guest: it writes the virtio headers, updates the used ring and kicks the guest if necessary.
1623  */
1624 static inline uint32_t __attribute__((always_inline))
1625 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1626 	uint32_t count)
1627 {
1628 	struct vhost_virtqueue *vq;
1629 	struct vring_desc *desc;
1630 	struct rte_mbuf *buff;
1631 	/* The virtio_hdr is initialised to 0. */
1632 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1633 		= {{0, 0, 0, 0, 0, 0}, 0};
1634 	uint64_t buff_hdr_addr = 0;
1635 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1636 	uint32_t head_idx, packet_success = 0;
1637 	uint16_t res_cur_idx;
1638 
1639 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1640 
1641 	if (count == 0)
1642 		return 0;
1643 
1644 	vq = dev->virtqueue[VIRTIO_RXQ];
1645 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1646 
1647 	res_cur_idx = vq->last_used_idx;
1648 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1649 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1650 
1651 	/* Retrieve all of the head indexes first to avoid caching issues. */
1652 	for (head_idx = 0; head_idx < count; head_idx++)
1653 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1654 
1655 	/*Prefetch descriptor index. */
1656 	rte_prefetch0(&vq->desc[head[packet_success]]);
1657 
1658 	while (packet_success != count) {
1659 		/* Get descriptor from available ring */
1660 		desc = &vq->desc[head[packet_success]];
1661 
1662 		buff = pkts[packet_success];
1663 		LOG_DEBUG(VHOST_DATA,
1664 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1665 			"pkt[%d] descriptor idx: %d\n",
1666 			dev->device_fh, packet_success,
1667 			MBUF_HEADROOM_UINT32(buff));
1668 
1669 		PRINT_PACKET(dev,
1670 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1671 			+ RTE_PKTMBUF_HEADROOM),
1672 			rte_pktmbuf_data_len(buff), 0);
1673 
1674 		/* Buffer address translation for virtio header. */
1675 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1676 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1677 
1678 		/*
1679 		 * If the descriptors are chained the header and data are
1680 		 * placed in separate buffers.
1681 		 */
1682 		if (desc->flags & VRING_DESC_F_NEXT) {
1683 			desc->len = vq->vhost_hlen;
1684 			desc = &vq->desc[desc->next];
1685 			desc->len = rte_pktmbuf_data_len(buff);
1686 		} else {
1687 			desc->len = packet_len;
1688 		}
1689 
1690 		/* Update used ring with desc information */
1691 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1692 			= head[packet_success];
1693 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1694 			= packet_len;
1695 		res_cur_idx++;
1696 		packet_success++;
1697 
1698 		/* A header is required per buffer. */
1699 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1700 			(const void *)&virtio_hdr, vq->vhost_hlen);
1701 
1702 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1703 
1704 		if (likely(packet_success < count)) {
1705 			/* Prefetch descriptor index. */
1706 			rte_prefetch0(&vq->desc[head[packet_success]]);
1707 		}
1708 	}
1709 
1710 	rte_compiler_barrier();
1711 
1712 	LOG_DEBUG(VHOST_DATA,
1713 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1714 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1715 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1716 
1717 	*(volatile uint16_t *)&vq->used->idx += count;
1718 	vq->last_used_idx += count;
1719 
1720 	LOG_DEBUG(VHOST_DATA,
1721 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1722 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1723 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1724 
1725 	/* Kick the guest if necessary. */
1726 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1727 		eventfd_write((int)vq->kickfd, 1);
1728 
1729 	return count;
1730 }
1731 
1732 /*
1733  * This function routes the TX packet to the correct interface.
1734  * This may be a local device or the physical port.
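 * In the zero-copy path the guest descriptor index is stashed in the mbuf
 * headroom via MBUF_HEADROOM_UINT32() so that the descriptor can later be
 * returned to the guest by txmbuf_clean_zcp() once the NIC has transmitted
 * the frame.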
1735  */
1736 static inline void __attribute__((always_inline))
1737 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1738 	uint32_t desc_idx, uint8_t need_copy)
1739 {
1740 	struct mbuf_table *tx_q;
1741 	struct rte_mbuf **m_table;
1742 	struct rte_mbuf *mbuf = NULL;
1743 	unsigned len, ret, offset = 0;
1744 	struct vpool *vpool;
1745 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1746 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1747 
1748 	/* Add packet to the port TX queue. */
1749 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1750 	len = tx_q->len;
1751 
1752 	/* Allocate an mbuf and populate the structure. */
1753 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1754 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1755 	if (unlikely(mbuf == NULL)) {
1756 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1757 		RTE_LOG(ERR, VHOST_DATA,
1758 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1759 			dev->device_fh);
1760 		put_desc_to_used_list_zcp(vq, desc_idx);
1761 		return;
1762 	}
1763 
1764 	if (vm2vm_mode == VM2VM_HARDWARE) {
1765 		/* Avoid using a VLAN tag from any VM for an external packet, such
1766 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1767 		 * selection: the MAC address identifies it as an external packet
1768 		 * that should go out to the network, while the VLAN tag identifies
1769 		 * it as a vm2vm packet to be forwarded to another VM. The hardware
1770 		 * cannot resolve such an ambiguous situation, so the packet would be lost.
1771 		 */
1772 		vlan_tag = external_pkt_default_vlan_tag;
1773 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1774 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1775 			__rte_mbuf_raw_free(mbuf);
1776 			return;
1777 		}
1778 	}
1779 
1780 	mbuf->nb_segs = m->nb_segs;
1781 	mbuf->next = m->next;
1782 	mbuf->data_len = m->data_len + offset;
1783 	mbuf->pkt_len = mbuf->data_len;
1784 	if (unlikely(need_copy)) {
1785 		/* Copy the packet contents to the mbuf. */
1786 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1787 			rte_pktmbuf_mtod(m, void *),
1788 			m->data_len);
1789 	} else {
1790 		mbuf->data_off = m->data_off;
1791 		mbuf->buf_physaddr = m->buf_physaddr;
1792 		mbuf->buf_addr = m->buf_addr;
1793 	}
1794 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1795 	mbuf->vlan_tci = vlan_tag;
1796 	mbuf->l2_len = sizeof(struct ether_hdr);
1797 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1798 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1799 
1800 	tx_q->m_table[len] = mbuf;
1801 	len++;
1802 
1803 	LOG_DEBUG(VHOST_DATA,
1804 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1805 		dev->device_fh,
1806 		mbuf->nb_segs,
1807 		(mbuf->next == NULL) ? "null" : "non-null");
1808 
1809 	if (enable_stats) {
1810 		dev_statistics[dev->device_fh].tx_total++;
1811 		dev_statistics[dev->device_fh].tx++;
1812 	}
1813 
1814 	if (unlikely(len == MAX_PKT_BURST)) {
1815 		m_table = (struct rte_mbuf **)tx_q->m_table;
1816 		ret = rte_eth_tx_burst(ports[0],
1817 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1818 
1819 		/*
1820 		 * Free any buffers not handled by TX and update
1821 		 * the port stats.
1822 		 */
1823 		if (unlikely(ret < len)) {
1824 			do {
1825 				rte_pktmbuf_free(m_table[ret]);
1826 			} while (++ret < len);
1827 		}
1828 
1829 		len = 0;
1830 		txmbuf_clean_zcp(dev, vpool);
1831 	}
1832 
1833 	tx_q->len = len;
1834 
1835 	return;
1836 }
1837 
1838 /*
1839  * This function transmits all available packets in the virtio TX queue of
1840  * one virtio-net device. For the first packet it learns the MAC address
1841  * and sets up the VMDQ queue.
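 * A dummy mbuf on the stack wraps each guest buffer; its payload is copied
 * into a newly allocated mbuf only when the guest buffer crosses a physical
 * sub-region boundary (the need_copy case), otherwise the new mbuf simply
 * points at the guest buffer.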
1842  */
1843 static inline void __attribute__((always_inline))
1844 virtio_dev_tx_zcp(struct virtio_net *dev)
1845 {
1846 	struct rte_mbuf m;
1847 	struct vhost_virtqueue *vq;
1848 	struct vring_desc *desc;
1849 	uint64_t buff_addr = 0, phys_addr;
1850 	uint32_t head[MAX_PKT_BURST];
1851 	uint32_t i;
1852 	uint16_t free_entries, packet_success = 0;
1853 	uint16_t avail_idx;
1854 	uint8_t need_copy = 0;
1855 	hpa_type addr_type;
1856 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1857 
1858 	vq = dev->virtqueue[VIRTIO_TXQ];
1859 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1860 
1861 	/* If there are no available buffers then return. */
1862 	if (vq->last_used_idx_res == avail_idx)
1863 		return;
1864 
1865 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1866 
1867 	/* Prefetch available ring to retrieve head indexes. */
1868 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1869 
1870 	/* Get the number of free entries in the ring */
1871 	free_entries = (avail_idx - vq->last_used_idx_res);
1872 
1873 	/* Limit to MAX_PKT_BURST. */
1874 	free_entries
1875 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1876 
1877 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1878 		dev->device_fh, free_entries);
1879 
1880 	/* Retrieve all of the head indexes first to avoid caching issues. */
1881 	for (i = 0; i < free_entries; i++)
1882 		head[i]
1883 			= vq->avail->ring[(vq->last_used_idx_res + i)
1884 			& (vq->size - 1)];
1885 
1886 	vq->last_used_idx_res += free_entries;
1887 
1888 	/* Prefetch descriptor index. */
1889 	rte_prefetch0(&vq->desc[head[packet_success]]);
1890 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1891 
1892 	while (packet_success < free_entries) {
1893 		desc = &vq->desc[head[packet_success]];
1894 
1895 		/* Discard first buffer as it is the virtio header */
1896 		desc = &vq->desc[desc->next];
1897 
1898 		/* Buffer address translation. */
1899 		buff_addr = gpa_to_vva(dev, desc->addr);
1900 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1901 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1902 			&addr_type);
1903 
1904 		if (likely(packet_success < (free_entries - 1)))
1905 			/* Prefetch descriptor index. */
1906 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1907 
1908 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1909 			RTE_LOG(ERR, VHOST_DATA,
1910 				"(%"PRIu64") Invalid frame buffer address found "
1911 				"when transmitting packets!\n",
1912 				dev->device_fh);
1913 			packet_success++;
1914 			continue;
1915 		}
1916 
1917 		/* Prefetch buffer address. */
1918 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1919 
1920 		/*
1921 		 * Setup dummy mbuf. This is copied to a real mbuf if
1922 		 * transmitted out the physical port.
1923 		 */
1924 		m.data_len = desc->len;
1925 		m.nb_segs = 1;
1926 		m.next = NULL;
1927 		m.data_off = 0;
1928 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1929 		m.buf_physaddr = phys_addr;
1930 
1931 		/*
1932 		 * Check whether the frame buffer address from the guest crosses
1933 		 * a physical sub-region boundary.
1934 		 */
1935 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1936 			RTE_LOG(ERR, VHOST_DATA,
1937 				"(%"PRIu64") Frame buffer address crossing "
1938 				"sub-region found when attaching TX frame "
1939 				"buffer address!\n",
1940 				dev->device_fh);
1941 			need_copy = 1;
1942 		} else
1943 			need_copy = 0;
1944 
1945 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1946 
1947 		/*
1948 		 * If this is the first received packet we need to learn
1949 		 * the MAC and setup VMDQ
1950 		 */
1951 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1952 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1953 				/*
1954 				 * Discard frame if device is scheduled for
1955 				 * removal or a duplicate MAC address is found.
1956 				 */
1957 				packet_success += free_entries;
1958 				vq->last_used_idx += packet_success;
1959 				break;
1960 			}
1961 		}
1962 
1963 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1964 		packet_success++;
1965 	}
1966 }
1967 
1968 /*
1969  * This function is called by each data core. It handles all RX/TX registered
1970  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1971  * addresses are compared with all devices in the main linked list.
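 * In zero-copy mode each loop iteration first drains any pending TX burst
 * after a timeout, then, for every device, attaches free mbufs to guest RX
 * buffers, receives a burst from the physical port into the guest, and
 * finally processes the guest TX queue.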
1972  */
1973 static int
1974 switch_worker_zcp(__attribute__((unused)) void *arg)
1975 {
1976 	struct virtio_net *dev = NULL;
1977 	struct vhost_dev  *vdev = NULL;
1978 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1979 	struct virtio_net_data_ll *dev_ll;
1980 	struct mbuf_table *tx_q;
1981 	volatile struct lcore_ll_info *lcore_ll;
1982 	const uint64_t drain_tsc
1983 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1984 		* BURST_TX_DRAIN_US;
1985 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1986 	unsigned ret;
1987 	const uint16_t lcore_id = rte_lcore_id();
1988 	uint16_t count_in_ring, rx_count = 0;
1989 
1990 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1991 
1992 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1993 	prev_tsc = 0;
1994 
1995 	while (1) {
1996 		cur_tsc = rte_rdtsc();
1997 
1998 		/* TX burst queue drain */
1999 		diff_tsc = cur_tsc - prev_tsc;
2000 		if (unlikely(diff_tsc > drain_tsc)) {
2001 			/*
2002 			 * Get mbufs from vpool.pool, detach them and
2003 			 * put them back into vpool.ring.
2004 			 */
2005 			dev_ll = lcore_ll->ll_root_used;
2006 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2007 				/* Get virtio device ID */
2008 				vdev = dev_ll->vdev;
2009 				dev = vdev->dev;
2010 
2011 				if (likely(!vdev->remove)) {
2012 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2013 					if (tx_q->len) {
2014 						LOG_DEBUG(VHOST_DATA,
2015 						"TX queue drained after timeout"
2016 						" with burst size %u\n",
2017 						tx_q->len);
2018 
2019 						/*
2020 						 * Tx any packets in the queue
2021 						 */
2022 						ret = rte_eth_tx_burst(
2023 							ports[0],
2024 							(uint16_t)tx_q->txq_id,
2025 							(struct rte_mbuf **)
2026 							tx_q->m_table,
2027 							(uint16_t)tx_q->len);
2028 						if (unlikely(ret < tx_q->len)) {
2029 							do {
2030 								rte_pktmbuf_free(
2031 									tx_q->m_table[ret]);
2032 							} while (++ret < tx_q->len);
2033 						}
2034 						tx_q->len = 0;
2035 
2036 						txmbuf_clean_zcp(dev,
2037 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2038 					}
2039 				}
2040 				dev_ll = dev_ll->next;
2041 			}
2042 			prev_tsc = cur_tsc;
2043 		}
2044 
2045 		rte_prefetch0(lcore_ll->ll_root_used);
2046 
2047 		/*
2048 		 * Inform the configuration core that we have exited the linked
2049 		 * list and that no devices are in use if requested.
2050 		 */
2051 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2052 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2053 
2054 		/* Process devices */
2055 		dev_ll = lcore_ll->ll_root_used;
2056 
2057 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2058 			vdev = dev_ll->vdev;
2059 			dev  = vdev->dev;
2060 			if (unlikely(vdev->remove)) {
2061 				dev_ll = dev_ll->next;
2062 				unlink_vmdq(vdev);
2063 				vdev->ready = DEVICE_SAFE_REMOVE;
2064 				continue;
2065 			}
2066 
2067 			if (likely(vdev->ready == DEVICE_RX)) {
2068 				uint32_t index = vdev->vmdq_rx_q;
2069 				uint16_t i;
2070 				count_in_ring
2071 				= rte_ring_count(vpool_array[index].ring);
2072 				uint16_t free_entries
2073 				= (uint16_t)get_available_ring_num_zcp(dev);
2074 
2075 				/*
2076 				 * Attach all mbufs in vpool.ring and put back
2077 				 * into vpool.pool.
2078 				 */
2079 				for (i = 0;
2080 				i < RTE_MIN(free_entries,
2081 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2082 				i++)
2083 					attach_rxmbuf_zcp(dev);
2084 
2085 				/* Handle guest RX */
2086 				rx_count = rte_eth_rx_burst(ports[0],
2087 					vdev->vmdq_rx_q, pkts_burst,
2088 					MAX_PKT_BURST);
2089 
2090 				if (rx_count) {
2091 					ret_count = virtio_dev_rx_zcp(dev,
2092 							pkts_burst, rx_count);
2093 					if (enable_stats) {
2094 						dev_statistics[dev->device_fh].rx_total
2095 							+= rx_count;
2096 						dev_statistics[dev->device_fh].rx
2097 							+= ret_count;
2098 					}
2099 					while (likely(rx_count)) {
2100 						rx_count--;
2101 						pktmbuf_detach_zcp(
2102 							pkts_burst[rx_count]);
2103 						rte_ring_sp_enqueue(
2104 							vpool_array[index].ring,
2105 							(void *)pkts_burst[rx_count]);
2106 					}
2107 				}
2108 			}
2109 
2110 			if (likely(!vdev->remove))
2111 				/* Handle guest TX */
2112 				virtio_dev_tx_zcp(dev);
2113 
2114 			/* Move to the next device in the list */
2115 			dev_ll = dev_ll->next;
2116 		}
2117 	}
2118 
2119 	return 0;
2120 }
2121 
2122 
2123 /*
2124  * Add an entry to a used linked list. A free entry must first be found
2125  * in the free linked list using get_data_ll_free_entry();
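 *
 * Typical usage, mirroring new_device() (illustrative sketch only):
 *
 *	ll_dev = get_data_ll_free_entry(&ll_root_free);
 *	if (ll_dev != NULL) {
 *		ll_dev->vdev = vdev;
 *		add_data_ll_entry(&ll_root_used, ll_dev);
 *	}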
2126  */
2127 static void
2128 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2129 	struct virtio_net_data_ll *ll_dev)
2130 {
2131 	struct virtio_net_data_ll *ll = *ll_root_addr;
2132 
2133 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2134 	ll_dev->next = NULL;
2135 	rte_compiler_barrier();
2136 
2137 	/* If ll == NULL then this is the first device. */
2138 	if (ll) {
2139 		/* Increment to the tail of the linked list. */
2140 		while ((ll->next != NULL) )
2141 			ll = ll->next;
2142 
2143 		ll->next = ll_dev;
2144 	} else {
2145 		*ll_root_addr = ll_dev;
2146 	}
2147 }
2148 
2149 /*
2150  * Remove an entry from a used linked list. The entry must then be added to
2151  * the free linked list using put_data_ll_free_entry().
2152  */
2153 static void
2154 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2155 	struct virtio_net_data_ll *ll_dev,
2156 	struct virtio_net_data_ll *ll_dev_last)
2157 {
2158 	struct virtio_net_data_ll *ll = *ll_root_addr;
2159 
2160 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2161 		return;
2162 
2163 	if (ll_dev == ll)
2164 		*ll_root_addr = ll_dev->next;
2165 	else
2166 		if (likely(ll_dev_last != NULL))
2167 			ll_dev_last->next = ll_dev->next;
2168 		else
2169 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2170 }
2171 
2172 /*
2173  * Find and return an entry from the free linked list.
2174  */
2175 static struct virtio_net_data_ll *
2176 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2177 {
2178 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2179 	struct virtio_net_data_ll *ll_dev;
2180 
2181 	if (ll_free == NULL)
2182 		return NULL;
2183 
2184 	ll_dev = ll_free;
2185 	*ll_root_addr = ll_free->next;
2186 
2187 	return ll_dev;
2188 }
2189 
2190 /*
2191  * Place an entry back on to the free linked list.
2192  */
2193 static void
2194 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2195 	struct virtio_net_data_ll *ll_dev)
2196 {
2197 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2198 
2199 	if (ll_dev == NULL)
2200 		return;
2201 
2202 	ll_dev->next = ll_free;
2203 	*ll_root_addr = ll_dev;
2204 }
2205 
2206 /*
2207  * Creates a linked list of a given size.
2208  */
2209 static struct virtio_net_data_ll *
2210 alloc_data_ll(uint32_t size)
2211 {
2212 	struct virtio_net_data_ll *ll_new;
2213 	uint32_t i;
2214 
2215 	/* Malloc and then chain the linked list. */
2216 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2217 	if (ll_new == NULL) {
2218 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2219 		return NULL;
2220 	}
2221 
2222 	for (i = 0; i < size - 1; i++) {
2223 		ll_new[i].vdev = NULL;
2224 		ll_new[i].next = &ll_new[i+1];
2225 	}
2226 	ll_new[i].next = NULL;
2227 
2228 	return (ll_new);
2229 }
2230 
2231 /*
2232  * Create the main linked list along with each individual core's linked list. A used and a free list
2233  * are created to manage entries.
2234  */
2235 static int
2236 init_data_ll (void)
2237 {
2238 	int lcore;
2239 
2240 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2241 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2242 		if (lcore_info[lcore].lcore_ll == NULL) {
2243 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2244 			return -1;
2245 		}
2246 
2247 		lcore_info[lcore].lcore_ll->device_num = 0;
2248 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2249 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2250 		if (num_devices % num_switching_cores)
2251 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2252 		else
2253 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2254 	}
2255 
2256 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2257 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2258 
2259 	return 0;
2260 }
2261 
2262 /*
2263  * Remove a device from the specific data core's linked list and from the main linked list. Synchronization
2264  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2265  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
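 * In zero-copy mode the RX/TX queues of the device are also stopped and all
 * mbufs still held in the corresponding mempools are detached and returned
 * to their rings via mbuf_destroy_zcp().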
2266  */
2267 static void
2268 destroy_device (volatile struct virtio_net *dev)
2269 {
2270 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2271 	struct virtio_net_data_ll *ll_main_dev_cur;
2272 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2273 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2274 	struct vhost_dev *vdev;
2275 	int lcore;
2276 
2277 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2278 
2279 	vdev = (struct vhost_dev *)dev->priv;
2280 	/*set the remove flag. */
2281 	/* Set the remove flag. */
2282 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2283 		rte_pause();
2284 	}
2285 
2286 	/* Search for entry to be removed from lcore ll */
2287 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2288 	while (ll_lcore_dev_cur != NULL) {
2289 		if (ll_lcore_dev_cur->vdev == vdev) {
2290 			break;
2291 		} else {
2292 			ll_lcore_dev_last = ll_lcore_dev_cur;
2293 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2294 		}
2295 	}
2296 
2297 	if (ll_lcore_dev_cur == NULL) {
2298 		RTE_LOG(ERR, VHOST_CONFIG,
2299 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2300 			dev->device_fh);
2301 		return;
2302 	}
2303 
2304 	/* Search for entry to be removed from main ll */
2305 	ll_main_dev_cur = ll_root_used;
2306 	ll_main_dev_last = NULL;
2307 	while (ll_main_dev_cur != NULL) {
2308 		if (ll_main_dev_cur->vdev == vdev) {
2309 			break;
2310 		} else {
2311 			ll_main_dev_last = ll_main_dev_cur;
2312 			ll_main_dev_cur = ll_main_dev_cur->next;
2313 		}
2314 	}
2315 
2316 	/* Remove entries from the lcore and main ll. */
2317 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2318 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2319 
2320 	/* Set the dev_removal_flag on each lcore. */
2321 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2322 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2323 	}
2324 
2325 	/*
2326 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2327 	 * they can no longer access the device removed from the linked lists and that the devices
2328 	 * are no longer in use.
2329 	 */
2330 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2331 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2332 			rte_pause();
2333 		}
2334 	}
2335 
2336 	/* Add the entries back to the lcore and main free ll.*/
2337 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2338 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2339 
2340 	/* Decrement number of device on the lcore. */
2341 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2342 
2343 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2344 
2345 	if (zero_copy) {
2346 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2347 
2348 		/* Stop the RX queue. */
2349 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2350 			LOG_DEBUG(VHOST_CONFIG,
2351 				"(%"PRIu64") In destroy_device: Failed to stop "
2352 				"rx queue:%d\n",
2353 				dev->device_fh,
2354 				vdev->vmdq_rx_q);
2355 		}
2356 
2357 		LOG_DEBUG(VHOST_CONFIG,
2358 			"(%"PRIu64") in destroy_device: start putting mbufs in the "
2359 			"mempool back to the ring for RX queue: %d\n",
2360 			dev->device_fh, vdev->vmdq_rx_q);
2361 
2362 		mbuf_destroy_zcp(vpool);
2363 
2364 		/* Stop the TX queue. */
2365 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2366 			LOG_DEBUG(VHOST_CONFIG,
2367 				"(%"PRIu64") In destroy_device: Failed to "
2368 				"stop tx queue:%d\n",
2369 				dev->device_fh, vdev->vmdq_rx_q);
2370 		}
2371 
2372 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2373 
2374 		LOG_DEBUG(VHOST_CONFIG,
2375 			"(%"PRIu64") destroy_device: start putting mbufs in the mempool "
2376 			"back to the ring for TX queue: %d, dev:(%"PRIu64")\n",
2377 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2378 			dev->device_fh);
2379 
2380 		mbuf_destroy_zcp(vpool);
2381 		rte_free(vdev->regions_hpa);
2382 	}
2383 	rte_free(vdev);
2384 
2385 }
2386 
2387 /*
2388  * Calculate the number of physically contiguous sub-regions within one
2389  * particular region whose vhost virtual address range is contiguous. The
2390  * region starts at vva_start and is 'size' bytes long.
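 * The return value is the number of additional splits (page boundaries where
 * physical contiguity breaks); new_device() adds it to the base region count
 * when sizing the regions_hpa array.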
2391  */
2392 static uint32_t
2393 check_hpa_regions(uint64_t vva_start, uint64_t size)
2394 {
2395 	uint32_t i, nregions = 0, page_size = getpagesize();
2396 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2397 	if (vva_start % page_size) {
2398 		LOG_DEBUG(VHOST_CONFIG,
2399 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2400 			"has remainder\n",
2401 			(void *)(uintptr_t)vva_start, page_size);
2402 		return 0;
2403 	}
2404 	if (size % page_size) {
2405 		LOG_DEBUG(VHOST_CONFIG,
2406 			"in check_hpa_regions: "
2407 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2408 			size, page_size);
2409 		return 0;
2410 	}
2411 	for (i = 0; i < size - page_size; i = i + page_size) {
2412 		cur_phys_addr
2413 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2414 		next_phys_addr = rte_mem_virt2phy(
2415 			(void *)(uintptr_t)(vva_start + i + page_size));
2416 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2417 			++nregions;
2418 			LOG_DEBUG(VHOST_CONFIG,
2419 				"in check_hpa_regions: hva addr:(%p) is not "
2420 				"continuous with hva addr:(%p), diff:%d\n",
2421 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2422 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2423 				+ page_size), page_size);
2424 			LOG_DEBUG(VHOST_CONFIG,
2425 				"in check_hpa_regions: hpa addr:(%p) is not "
2426 				"continuous with hpa addr:(%p), "
2427 				"diff:(%"PRIu64")\n",
2428 				(void *)(uintptr_t)cur_phys_addr,
2429 				(void *)(uintptr_t)next_phys_addr,
2430 				(next_phys_addr-cur_phys_addr));
2431 		}
2432 	}
2433 	return nregions;
2434 }
2435 
2436 /*
2437  * Divide each region whose vhost virtual address range is contiguous into
2438  * sub-regions such that the physical addresses within each sub-region are
2439  * contiguous, and fill the offset (to GPA), size and other information of
2440  * each sub-region into regions_hpa.
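 * Returns the number of sub-regions filled in, which new_device() checks
 * against the previously computed vdev->nregions_hpa.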
2441  */
2442 static uint32_t
2443 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2444 {
2445 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2446 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2447 
2448 	if (mem_region_hpa == NULL)
2449 		return 0;
2450 
2451 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2452 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2453 			virtio_memory->regions[regionidx].address_offset;
2454 		mem_region_hpa[regionidx_hpa].guest_phys_address
2455 			= virtio_memory->regions[regionidx].guest_phys_address;
2456 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2457 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2458 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2459 		LOG_DEBUG(VHOST_CONFIG,
2460 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2461 			regionidx_hpa,
2462 			(void *)(uintptr_t)
2463 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2464 		LOG_DEBUG(VHOST_CONFIG,
2465 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2466 			regionidx_hpa,
2467 			(void *)(uintptr_t)
2468 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2469 		for (i = 0, k = 0;
2470 			i < virtio_memory->regions[regionidx].memory_size -
2471 				page_size;
2472 			i += page_size) {
2473 			cur_phys_addr = rte_mem_virt2phy(
2474 					(void *)(uintptr_t)(vva_start + i));
2475 			next_phys_addr = rte_mem_virt2phy(
2476 					(void *)(uintptr_t)(vva_start +
2477 					i + page_size));
2478 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2479 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2480 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2481 					k + page_size;
2482 				mem_region_hpa[regionidx_hpa].memory_size
2483 					= k + page_size;
2484 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2485 					"phys addr end  [%d]:(%p)\n",
2486 					regionidx_hpa,
2487 					(void *)(uintptr_t)
2488 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2489 				LOG_DEBUG(VHOST_CONFIG,
2490 					"in fill_hpa_regions: guest phys addr "
2491 					"size [%d]:(%p)\n",
2492 					regionidx_hpa,
2493 					(void *)(uintptr_t)
2494 					(mem_region_hpa[regionidx_hpa].memory_size));
2495 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2496 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2497 				++regionidx_hpa;
2498 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2499 					next_phys_addr -
2500 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2501 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2502 					" phys addr start[%d]:(%p)\n",
2503 					regionidx_hpa,
2504 					(void *)(uintptr_t)
2505 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2506 				LOG_DEBUG(VHOST_CONFIG,
2507 					"in fill_hpa_regions: host  phys addr "
2508 					"start[%d]:(%p)\n",
2509 					regionidx_hpa,
2510 					(void *)(uintptr_t)
2511 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2512 				k = 0;
2513 			} else {
2514 				k += page_size;
2515 			}
2516 		}
2517 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2518 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2519 			+ k + page_size;
2520 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2521 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2522 			"[%d]:(%p)\n", regionidx_hpa,
2523 			(void *)(uintptr_t)
2524 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2525 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2526 			"[%d]:(%p)\n", regionidx_hpa,
2527 			(void *)(uintptr_t)
2528 			(mem_region_hpa[regionidx_hpa].memory_size));
2529 		++regionidx_hpa;
2530 	}
2531 	return regionidx_hpa;
2532 }
2533 
2534 /*
2535  * A new device is added to a data core. First the device is added to the main linked list
2536  * and then allocated to a specific data core.
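 * In zero-copy mode the guest memory map is additionally split into
 * physically contiguous sub-regions (regions_hpa), the pre-allocated mbufs
 * are attached to guest buffers and the deferred RX/TX queues are started.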
2537  */
2538 static int
2539 new_device (struct virtio_net *dev)
2540 {
2541 	struct virtio_net_data_ll *ll_dev;
2542 	int lcore, core_add = 0;
2543 	uint32_t device_num_min = num_devices;
2544 	struct vhost_dev *vdev;
2545 	uint32_t regionidx;
2546 
2547 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2548 	if (vdev == NULL) {
2549 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2550 			dev->device_fh);
2551 		return -1;
2552 	}
2553 	vdev->dev = dev;
2554 	dev->priv = vdev;
2555 
2556 	if (zero_copy) {
2557 		vdev->nregions_hpa = dev->mem->nregions;
2558 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2559 			vdev->nregions_hpa
2560 				+= check_hpa_regions(
2561 					dev->mem->regions[regionidx].guest_phys_address
2562 					+ dev->mem->regions[regionidx].address_offset,
2563 					dev->mem->regions[regionidx].memory_size);
2564 
2565 		}
2566 
2567 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2568 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2569 			CACHE_LINE_SIZE);
2570 		if (vdev->regions_hpa == NULL) {
2571 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2572 			rte_free(vdev);
2573 			return -1;
2574 		}
2575 
2576 
2577 		if (fill_hpa_memory_regions(
2578 			vdev->regions_hpa, dev->mem
2579 			) != vdev->nregions_hpa) {
2580 
2581 			RTE_LOG(ERR, VHOST_CONFIG,
2582 				"hpa memory regions number mismatch: "
2583 				"[%d]\n", vdev->nregions_hpa);
2584 			rte_free(vdev->regions_hpa);
2585 			rte_free(vdev);
2586 			return -1;
2587 		}
2588 	}
2589 
2590 
2591 	/* Add device to main ll */
2592 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2593 	if (ll_dev == NULL) {
2594 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2595 			"of %d devices per core has been reached\n",
2596 			dev->device_fh, num_devices);
2597 		if (vdev->regions_hpa)
2598 			rte_free(vdev->regions_hpa);
2599 		rte_free(vdev);
2600 		return -1;
2601 	}
2602 	ll_dev->vdev = vdev;
2603 	add_data_ll_entry(&ll_root_used, ll_dev);
2604 	vdev->vmdq_rx_q
2605 		= dev->device_fh * (num_queues / num_devices);
2606 
2607 	if (zero_copy) {
2608 		uint32_t index = vdev->vmdq_rx_q;
2609 		uint32_t count_in_ring, i;
2610 		struct mbuf_table *tx_q;
2611 
2612 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2613 
2614 		LOG_DEBUG(VHOST_CONFIG,
2615 			"(%"PRIu64") in new_device: mbuf count in mempool "
2616 			"before attach is: %d\n",
2617 			dev->device_fh,
2618 			rte_mempool_count(vpool_array[index].pool));
2619 		LOG_DEBUG(VHOST_CONFIG,
2620 			"(%"PRIu64") in new_device: mbuf count in  ring "
2621 			"before attach  is : %d\n",
2622 			dev->device_fh, count_in_ring);
2623 
2624 		/*
2625 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2626 		 */
2627 		for (i = 0; i < count_in_ring; i++)
2628 			attach_rxmbuf_zcp(dev);
2629 
2630 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2631 			"mempool after attach is: %d\n",
2632 			dev->device_fh,
2633 			rte_mempool_count(vpool_array[index].pool));
2634 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2635 			"ring after attach  is : %d\n",
2636 			dev->device_fh,
2637 			rte_ring_count(vpool_array[index].ring));
2638 
2639 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2640 		tx_q->txq_id = vdev->vmdq_rx_q;
2641 
2642 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2643 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2644 
2645 			LOG_DEBUG(VHOST_CONFIG,
2646 				"(%"PRIu64") In new_device: Failed to start "
2647 				"tx queue:%d\n",
2648 				dev->device_fh, vdev->vmdq_rx_q);
2649 
2650 			mbuf_destroy_zcp(vpool);
2651 			rte_free(vdev->regions_hpa);
2652 			rte_free(vdev);
2653 			return -1;
2654 		}
2655 
2656 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2657 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2658 
2659 			LOG_DEBUG(VHOST_CONFIG,
2660 				"(%"PRIu64") In new_device: Failed to start "
2661 				"rx queue:%d\n",
2662 				dev->device_fh, vdev->vmdq_rx_q);
2663 
2664 			/* Stop the TX queue. */
2665 			if (rte_eth_dev_tx_queue_stop(ports[0],
2666 				vdev->vmdq_rx_q) != 0) {
2667 				LOG_DEBUG(VHOST_CONFIG,
2668 					"(%"PRIu64") In new_device: Failed to "
2669 					"stop tx queue:%d\n",
2670 					dev->device_fh, vdev->vmdq_rx_q);
2671 			}
2672 
2673 			mbuf_destroy_zcp(vpool);
2674 			rte_free(vdev->regions_hpa);
2675 			rte_free(vdev);
2676 			return -1;
2677 		}
2678 
2679 	}
2680 
2681 	/* Reset the ready flag. */
2682 	vdev->ready = DEVICE_MAC_LEARNING;
2683 	vdev->remove = 0;
2684 
2685 	/* Find a suitable lcore to add the device. */
2686 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2687 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2688 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2689 			core_add = lcore;
2690 		}
2691 	}
2692 	/* Add device to lcore ll */
2693 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2694 	if (ll_dev == NULL) {
2695 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2696 		vdev->ready = DEVICE_SAFE_REMOVE;
2697 		destroy_device(dev);
2698 		if (vdev->regions_hpa)
2699 			rte_free(vdev->regions_hpa);
2700 		rte_free(vdev);
2701 		return -1;
2702 	}
2703 	ll_dev->vdev = vdev;
2704 	vdev->coreid = core_add;
2705 
2706 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2707 
2708 	/* Initialize device stats */
2709 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2710 
2711 	/* Disable notifications. */
2712 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2713 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2714 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2715 	dev->flags |= VIRTIO_DEV_RUNNING;
2716 
2717 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2718 
2719 	return 0;
2720 }
2721 
2722 /*
2723  * These callbacks allow devices to be added to a data core when configuration
2724  * has fully completed.
2725  */
2726 static const struct virtio_net_device_ops virtio_net_device_ops =
2727 {
2728 	.new_device =  new_device,
2729 	.destroy_device = destroy_device,
2730 };
2731 
2732 /*
2733  * This thread wakes up periodically to print statistics if the user has
2734  * enabled them.
2735  */
2736 static void
2737 print_stats(void)
2738 {
2739 	struct virtio_net_data_ll *dev_ll;
2740 	uint64_t tx_dropped, rx_dropped;
2741 	uint64_t tx, tx_total, rx, rx_total;
2742 	uint32_t device_fh;
2743 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2744 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2745 
2746 	while(1) {
2747 		sleep(enable_stats);
2748 
2749 		/* Clear screen and move to top left */
2750 		printf("%s%s", clr, top_left);
2751 
2752 		printf("\nDevice statistics ====================================");
2753 
2754 		dev_ll = ll_root_used;
2755 		while (dev_ll != NULL) {
2756 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2757 			tx_total = dev_statistics[device_fh].tx_total;
2758 			tx = dev_statistics[device_fh].tx;
2759 			tx_dropped = tx_total - tx;
2760 			if (zero_copy == 0) {
2761 				rx_total = rte_atomic64_read(
2762 					&dev_statistics[device_fh].rx_total_atomic);
2763 				rx = rte_atomic64_read(
2764 					&dev_statistics[device_fh].rx_atomic);
2765 			} else {
2766 				rx_total = dev_statistics[device_fh].rx_total;
2767 				rx = dev_statistics[device_fh].rx;
2768 			}
2769 			rx_dropped = rx_total - rx;
2770 
2771 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2772 					"\nTX total: 		%"PRIu64""
2773 					"\nTX dropped: 		%"PRIu64""
2774 					"\nTX successful: 		%"PRIu64""
2775 					"\nRX total: 		%"PRIu64""
2776 					"\nRX dropped: 		%"PRIu64""
2777 					"\nRX successful: 		%"PRIu64"",
2778 					device_fh,
2779 					tx_total,
2780 					tx_dropped,
2781 					tx,
2782 					rx_total,
2783 					rx_dropped,
2784 					rx);
2785 
2786 			dev_ll = dev_ll->next;
2787 		}
2788 		printf("\n======================================================\n");
2789 	}
2790 }
2791 
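/*
 * Create the mempool and companion ring used by one zero-copy queue. The
 * ring is used to park mbufs that currently have no guest buffer attached,
 * and buf_size records the usable room (the zero-copy descriptor length,
 * i.e. the room size minus the mbuf headroom).
 */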
2792 static void
2793 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2794 	char *ring_name, uint32_t nb_mbuf)
2795 {
2796 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2797 	vpool_array[index].pool
2798 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2799 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2800 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2801 		rte_pktmbuf_init, NULL, socket, 0);
2802 	if (vpool_array[index].pool != NULL) {
2803 		vpool_array[index].ring
2804 			= rte_ring_create(ring_name,
2805 				rte_align32pow2(nb_mbuf + 1),
2806 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2807 		if (likely(vpool_array[index].ring != NULL)) {
2808 			LOG_DEBUG(VHOST_CONFIG,
2809 				"in setup_mempool_tbl: mbuf count in "
2810 				"mempool is: %d\n",
2811 				rte_mempool_count(vpool_array[index].pool));
2812 			LOG_DEBUG(VHOST_CONFIG,
2813 				"in setup_mempool_tbl: mbuf count in "
2814 				"ring   is: %d\n",
2815 				rte_ring_count(vpool_array[index].ring));
2816 		} else {
2817 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2818 				ring_name);
2819 		}
2820 
2821 		/* Need to take the headroom into account. */
2822 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2823 	} else {
2824 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2825 	}
2826 }
2827 
2828 
2829 /*
2830  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2831  * device is also registered here to handle the IOCTLs.
2832  */
2833 int
2834 MAIN(int argc, char *argv[])
2835 {
2836 	struct rte_mempool *mbuf_pool = NULL;
2837 	unsigned lcore_id, core_id = 0;
2838 	unsigned nb_ports, valid_num_ports;
2839 	int ret;
2840 	uint8_t portid, queue_id = 0;
2841 	static pthread_t tid;
2842 
2843 	/* init EAL */
2844 	ret = rte_eal_init(argc, argv);
2845 	if (ret < 0)
2846 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2847 	argc -= ret;
2848 	argv += ret;
2849 
2850 	/* parse app arguments */
2851 	ret = us_vhost_parse_args(argc, argv);
2852 	if (ret < 0)
2853 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2854 
2855 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2856 		if (rte_lcore_is_enabled(lcore_id))
2857 			lcore_ids[core_id ++] = lcore_id;
2858 
2859 	if (rte_lcore_count() > RTE_MAX_LCORE)
2860 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2861 
2862 	/* Set the number of switching cores available. */
2863 	num_switching_cores = rte_lcore_count()-1;
2864 
2865 	/* Get the number of physical ports. */
2866 	nb_ports = rte_eth_dev_count();
2867 	if (nb_ports > RTE_MAX_ETHPORTS)
2868 		nb_ports = RTE_MAX_ETHPORTS;
2869 
2870 	/*
2871 	 * Update the global variable num_ports and the global array ports,
2872 	 * and get the value of valid_num_ports according to the number of ports in the system
2873 	 */
2874 	valid_num_ports = check_ports_num(nb_ports);
2875 
2876 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2877 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2878 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2879 		return -1;
2880 	}
2881 
2882 	if (zero_copy == 0) {
2883 		/* Create the mbuf pool. */
2884 		mbuf_pool = rte_mempool_create(
2885 				"MBUF_POOL",
2886 				NUM_MBUFS_PER_PORT
2887 				* valid_num_ports,
2888 				MBUF_SIZE, MBUF_CACHE_SIZE,
2889 				sizeof(struct rte_pktmbuf_pool_private),
2890 				rte_pktmbuf_pool_init, NULL,
2891 				rte_pktmbuf_init, NULL,
2892 				rte_socket_id(), 0);
2893 		if (mbuf_pool == NULL)
2894 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2895 
2896 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2897 			vpool_array[queue_id].pool = mbuf_pool;
2898 
2899 		if (vm2vm_mode == VM2VM_HARDWARE) {
2900 			/* Enable VT loop back to let L2 switch to do it. */
2901 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2902 			LOG_DEBUG(VHOST_CONFIG,
2903 				"Enable loop back for L2 switch in vmdq.\n");
2904 		}
2905 	} else {
2906 		uint32_t nb_mbuf;
2907 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2908 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2909 
2910 		/*
2911 		 * Zero copy defers queue RX/TX start to the time when guest
2912 		 * Zero copy defers queue RX/TX start to the time when the guest
2913 		 * available.
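		 *
		 * A separate mempool/ring pair is created per queue: indices
		 * 0..MAX_QUEUES-1 serve the RX queues and indices
		 * MAX_QUEUES..2*MAX_QUEUES-1 serve the TX queues.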
2914 		 */
2915 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2916 		rx_conf_default.rx_drop_en = 0;
2917 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2918 		nb_mbuf = num_rx_descriptor
2919 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2920 			+ num_switching_cores * MAX_PKT_BURST;
2921 
2922 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2923 			snprintf(pool_name, sizeof(pool_name),
2924 				"rxmbuf_pool_%u", queue_id);
2925 			snprintf(ring_name, sizeof(ring_name),
2926 				"rxmbuf_ring_%u", queue_id);
2927 			setup_mempool_tbl(rte_socket_id(), queue_id,
2928 				pool_name, ring_name, nb_mbuf);
2929 		}
2930 
2931 		nb_mbuf = num_tx_descriptor
2932 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2933 				+ num_switching_cores * MAX_PKT_BURST;
2934 
2935 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2936 			snprintf(pool_name, sizeof(pool_name),
2937 				"txmbuf_pool_%u", queue_id);
2938 			snprintf(ring_name, sizeof(ring_name),
2939 				"txmbuf_ring_%u", queue_id);
2940 			setup_mempool_tbl(rte_socket_id(),
2941 				(queue_id + MAX_QUEUES),
2942 				pool_name, ring_name, nb_mbuf);
2943 		}
2944 
2945 		if (vm2vm_mode == VM2VM_HARDWARE) {
2946 			/* Enable VT loop back to let L2 switch to do it. */
2947 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2948 			LOG_DEBUG(VHOST_CONFIG,
2949 				"Enable loop back for L2 switch in vmdq.\n");
2950 		}
2951 	}
2952 	/* Set log level. */
2953 	rte_set_log_level(LOG_LEVEL);
2954 
2955 	/* initialize all ports */
2956 	for (portid = 0; portid < nb_ports; portid++) {
2957 		/* skip ports that are not enabled */
2958 		if ((enabled_port_mask & (1 << portid)) == 0) {
2959 			RTE_LOG(INFO, VHOST_PORT,
2960 				"Skipping disabled port %d\n", portid);
2961 			continue;
2962 		}
2963 		if (port_init(portid) != 0)
2964 			rte_exit(EXIT_FAILURE,
2965 				"Cannot initialize network ports\n");
2966 	}
2967 
2968 	/* Initialise all linked lists. */
2969 	if (init_data_ll() == -1)
2970 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2971 
2972 	/* Initialize device stats */
2973 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2974 
2975 	/* Enable stats if the user option is set. */
2976 	if (enable_stats)
2977 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2978 
2979 	/* Launch all data cores. */
2980 	if (zero_copy == 0) {
2981 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2982 			rte_eal_remote_launch(switch_worker,
2983 				mbuf_pool, lcore_id);
2984 		}
2985 	} else {
2986 		uint32_t count_in_mempool, index, i;
2987 		for (index = 0; index < 2*MAX_QUEUES; index++) {
2988 			/* For all RX and TX queues. */
2989 			count_in_mempool
2990 				= rte_mempool_count(vpool_array[index].pool);
2991 
2992 			/*
2993 			 * Transfer all un-attached mbufs from vpool.pool
2994 			 * to vpool.ring.
2995 			 */
2996 			for (i = 0; i < count_in_mempool; i++) {
2997 				struct rte_mbuf *mbuf
2998 					= __rte_mbuf_raw_alloc(
2999 						vpool_array[index].pool);
3000 				rte_ring_sp_enqueue(vpool_array[index].ring,
3001 						(void *)mbuf);
3002 			}
3003 
3004 			LOG_DEBUG(VHOST_CONFIG,
3005 				"in MAIN: mbuf count in mempool at initial "
3006 				"is: %d\n", count_in_mempool);
3007 			LOG_DEBUG(VHOST_CONFIG,
3008 				"in MAIN: mbuf count in  ring at initial  is :"
3009 				" %d\n",
3010 				rte_ring_count(vpool_array[index].ring));
3011 		}
3012 
3013 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3014 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3015 				lcore_id);
3016 	}
3017 
3018 	if (mergeable == 0)
3019 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3020 
3021 	/* Register CUSE device to handle IOCTLs. */
3022 	ret = rte_vhost_driver_register((char *)&dev_basename);
3023 	if (ret != 0)
3024 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3025 
3026 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3027 
3028 	/* Start CUSE session. */
3029 	rte_vhost_driver_session_start();
3030 	return 0;
3031 
3032 }
3033 
3034