xref: /dpdk/examples/vhost/main.c (revision a981294b29a39c2781c2fad9356323fa4234a6ad)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
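/*
 * Worked example (editorial note, not part of the original source): with the
 * defaults below -- MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024,
 * MAX_PKT_BURST = 32, RTE_TEST_TX_DESC_DEFAULT = 512, MBUF_CACHE_SIZE = 128 --
 * and an assumed num_switching_cores of 2, the macro evaluates to
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs per port.
 */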
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
103 
104 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
106 
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108 
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX			1
112 #define DEVICE_SAFE_REMOVE	2
113 
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117 
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121 
122 /*
123  * These two macros need refining for the legacy and DPDK-based front ends:
124  * take the max vring avail descriptors/entries from the guest minus
125  * MAX_PKT_BURST, then round to a power of 2.
126  */
127 /*
128  * For legacy front end, 128 descriptors,
129  * half for virtio header, another half for mbuf.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133 
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 		+ sizeof(struct rte_mbuf)))
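/*
 * Usage sketch (editorial note): later in this file the zero copy path stashes
 * the vring descriptor index in this scratch word when an mbuf is attached to
 * a guest buffer, and reads it back when completing the descriptor, e.g.
 *
 *     MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;    // attach_rxmbuf_zcp()
 *     used->ring[idx].id = MBUF_HEADROOM_UINT32(mbuf);    // txmbuf_clean_zcp()
 */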
137 
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140 
141 #define INVALID_PORT_ID 0xFF
142 
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145 
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148 
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151 
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154 
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157 
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
160 
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163 
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166 
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170 
171 /*
172  * Enable zero copy: guest packet buffers are used directly for DMA via the
173  * HW descriptors; disabled by default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177 
178 /* number of descriptors to apply*/
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify the timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223 	.rx_thresh = {
224 		.pthresh = RX_PTHRESH,
225 		.hthresh = RX_HTHRESH,
226 		.wthresh = RX_WTHRESH,
227 	},
228 	.rx_drop_en = 1,
229 };
230 
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237 	.tx_thresh = {
238 		.pthresh = TX_PTHRESH,
239 		.hthresh = TX_HTHRESH,
240 		.wthresh = TX_WTHRESH,
241 	},
242 	.tx_free_thresh = 0, /* Use PMD default values */
243 	.tx_rs_thresh = 0, /* Use PMD default values */
244 };
245 
246 /* Empty VMDQ configuration structure. Filled in programmatically. */
247 static struct rte_eth_conf vmdq_conf_default = {
248 	.rxmode = {
249 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250 		.split_hdr_size = 0,
251 		.header_split   = 0, /**< Header Split disabled */
252 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
253 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
254 		/*
255 		 * VLAN strip is necessary for 1G NICs such as the I350;
256 		 * it fixes a bug where IPv4 forwarding in the guest cannot
257 		 * forward packets from one virtio dev to another virtio dev.
258 		 */
259 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
262 	},
263 
264 	.txmode = {
265 		.mq_mode = ETH_MQ_TX_NONE,
266 	},
267 	.rx_adv_conf = {
268 		/*
269 		 * should be overridden separately in code with
270 		 * appropriate values
271 		 */
272 		.vmdq_rx_conf = {
273 			.nb_queue_pools = ETH_8_POOLS,
274 			.enable_default_pool = 0,
275 			.default_pool = 0,
276 			.nb_pool_maps = 0,
277 			.pool_map = {{0, 0},},
278 		},
279 	},
280 };
281 
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
285 
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
290 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
297 
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300 
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304 
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307 
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310 	unsigned len;
311 	unsigned txq_id;
312 	struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314 
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317 
318 /* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320 
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323 	unsigned char   h_dest[ETH_ALEN];
324 	unsigned char   h_source[ETH_ALEN];
325 	__be16          h_vlan_proto;
326 	__be16          h_vlan_TCI;
327 	__be16          h_vlan_encapsulated_proto;
328 };
329 
330 /* IPv4 Header */
331 struct ipv4_hdr {
332 	uint8_t  version_ihl;		/**< version and header length */
333 	uint8_t  type_of_service;	/**< type of service */
334 	uint16_t total_length;		/**< length of packet */
335 	uint16_t packet_id;		/**< packet ID */
336 	uint16_t fragment_offset;	/**< fragmentation offset */
337 	uint8_t  time_to_live;		/**< time to live */
338 	uint8_t  next_proto_id;		/**< protocol ID */
339 	uint16_t hdr_checksum;		/**< header checksum */
340 	uint32_t src_addr;		/**< source address */
341 	uint32_t dst_addr;		/**< destination address */
342 } __attribute__((__packed__));
343 
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347 
348 /* Per-device statistics struct */
349 struct device_statistics {
350 	uint64_t tx_total;
351 	rte_atomic64_t rx_total_atomic;
352 	uint64_t rx_total;
353 	uint64_t tx;
354 	rte_atomic64_t rx_atomic;
355 	uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358 
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366 	struct rte_eth_vmdq_rx_conf conf;
367 	unsigned i;
368 
369 	memset(&conf, 0, sizeof(conf));
370 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371 	conf.nb_pool_maps = num_devices;
372 	conf.enable_loop_back =
373 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374 
375 	for (i = 0; i < conf.nb_pool_maps; i++) {
376 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
377 		conf.pool_map[i].pools = (1UL << i);
378 	}
379 
380 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383 	return 0;
384 }
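/*
 * Illustrative result (editorial note): for num_devices == 8 the loop above
 * produces pool_map[i] = { vlan_tags[i], 1UL << i }, i.e. VLAN 1000 -> pool 0,
 * VLAN 1001 -> pool 1, ... VLAN 1007 -> pool 7, so incoming traffic tagged
 * with a given VLAN ID is steered to the corresponding VMDQ pool.
 */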
385 
386 /*
387  * Validate the device number against the max pool number obtained from
388  * dev_info. If the device number is invalid, print an error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394 	if (num_devices > max_nb_devices) {
395 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396 		return -1;
397 	}
398 	return 0;
399 }
400 
401 /*
402  * Initialises a given port using global settings and with the rx buffers
403  * coming from the mbuf_pool passed as a parameter.
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408 	struct rte_eth_dev_info dev_info;
409 	struct rte_eth_conf port_conf;
410 	uint16_t rx_rings, tx_rings;
411 	uint16_t rx_ring_size, tx_ring_size;
412 	int retval;
413 	uint16_t q;
414 
415 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
416 	rte_eth_dev_info_get (port, &dev_info);
417 
418 	/*configure the number of supported virtio devices based on VMDQ limits */
419 	num_devices = dev_info.max_vmdq_pools;
420 	num_queues = dev_info.max_rx_queues;
421 
422 	if (zero_copy) {
423 		rx_ring_size = num_rx_descriptor;
424 		tx_ring_size = num_tx_descriptor;
425 		tx_rings = dev_info.max_tx_queues;
426 	} else {
427 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429 		tx_rings = (uint16_t)rte_lcore_count();
430 	}
431 
432 	retval = validate_num_devices(MAX_DEVICES);
433 	if (retval < 0)
434 		return retval;
435 
436 	/* Get port configuration. */
437 	retval = get_eth_conf(&port_conf, num_devices);
438 	if (retval < 0)
439 		return retval;
440 
441 	if (port >= rte_eth_dev_count()) return -1;
442 
443 	rx_rings = (uint16_t)num_queues;
444 	/* Configure ethernet device. */
445 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446 	if (retval != 0)
447 		return retval;
448 
449 	/* Setup the queues. */
450 	for (q = 0; q < rx_rings; q ++) {
451 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 						rte_eth_dev_socket_id(port), &rx_conf_default,
453 						vpool_array[q].pool);
454 		if (retval < 0)
455 			return retval;
456 	}
457 	for (q = 0; q < tx_rings; q ++) {
458 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459 						rte_eth_dev_socket_id(port), &tx_conf_default);
460 		if (retval < 0)
461 			return retval;
462 	}
463 
464 	/* Start the device. */
465 	retval  = rte_eth_dev_start(port);
466 	if (retval < 0) {
467 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468 		return retval;
469 	}
470 
471 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475 			(unsigned)port,
476 			vmdq_ports_eth_addr[port].addr_bytes[0],
477 			vmdq_ports_eth_addr[port].addr_bytes[1],
478 			vmdq_ports_eth_addr[port].addr_bytes[2],
479 			vmdq_ports_eth_addr[port].addr_bytes[3],
480 			vmdq_ports_eth_addr[port].addr_bytes[4],
481 			vmdq_ports_eth_addr[port].addr_bytes[5]);
482 
483 	return 0;
484 }
485 
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492 	/* parse the basename string */
493 
494 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
495 		return -1;
496 	else
497 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498 
499 	return 0;
500 }
501 
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508 	char *end = NULL;
509 	unsigned long pm;
510 
511 	errno = 0;
512 
513 	/* parse hexadecimal string */
514 	pm = strtoul(portmask, &end, 16);
515 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516 		return -1;
517 
518 	if (pm == 0)
519 		return -1;
520 
521 	return pm;
522 
523 }
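/*
 * Usage sketch (editorial note): the portmask is a hex bitmap of port IDs, so
 * "-p 0x1" selects port 0 and "-p 0x3" would select ports 0 and 1. Since
 * MAX_SUP_PORTS is 1, only a single bit may be set for this application.
 */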
524 
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531 	char *end = NULL;
532 	unsigned long num;
533 
534 	errno = 0;
535 
536 	/* parse unsigned int string */
537 	num = strtoul(q_arg, &end, 10);
538 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539 		return -1;
540 
541 	if (num > max_valid_value)
542 		return -1;
543 
544 	return num;
545 
546 }
547 
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555 	"		--vm2vm [0|1|2]\n"
556 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557 	"		--dev-basename <name>\n"
558 	"		--nb-devices ND\n"
559 	"		-p PORTMASK: Set mask for ports to be used by application\n"
560 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
562 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
563 	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
564 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566 	"		--dev-basename: The basename to be used for the character device.\n"
567 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
568 			"zero copy\n"
569 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
570 			"used only when zero copy is enabled.\n"
571 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
572 			"used only when zero copy is enabled.\n",
573 	       prgname);
574 }
575 
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582 	int opt, ret;
583 	int option_index;
584 	unsigned i;
585 	const char *prgname = argv[0];
586 	static struct option long_option[] = {
587 		{"vm2vm", required_argument, NULL, 0},
588 		{"rx-retry", required_argument, NULL, 0},
589 		{"rx-retry-delay", required_argument, NULL, 0},
590 		{"rx-retry-num", required_argument, NULL, 0},
591 		{"mergeable", required_argument, NULL, 0},
592 		{"stats", required_argument, NULL, 0},
593 		{"dev-basename", required_argument, NULL, 0},
594 		{"zero-copy", required_argument, NULL, 0},
595 		{"rx-desc-num", required_argument, NULL, 0},
596 		{"tx-desc-num", required_argument, NULL, 0},
597 		{NULL, 0, 0, 0},
598 	};
599 
600 	/* Parse command line */
601 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
602 		switch (opt) {
603 		/* Portmask */
604 		case 'p':
605 			enabled_port_mask = parse_portmask(optarg);
606 			if (enabled_port_mask == 0) {
607 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 				us_vhost_usage(prgname);
609 				return -1;
610 			}
611 			break;
612 
613 		case 0:
614 			/* Enable/disable vm2vm comms. */
615 			if (!strncmp(long_option[option_index].name, "vm2vm",
616 				MAX_LONG_OPT_SZ)) {
617 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618 				if (ret == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG,
620 						"Invalid argument for "
621 						"vm2vm [0|1|2]\n");
622 					us_vhost_usage(prgname);
623 					return -1;
624 				} else {
625 					vm2vm_mode = (vm2vm_type)ret;
626 				}
627 			}
628 
629 			/* Enable/disable retries on RX. */
630 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631 				ret = parse_num_opt(optarg, 1);
632 				if (ret == -1) {
633 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					enable_retry = ret;
638 				}
639 			}
640 
641 			/* Specify the retry delay time (in microseconds) on RX. */
642 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643 				ret = parse_num_opt(optarg, INT32_MAX);
644 				if (ret == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646 					us_vhost_usage(prgname);
647 					return -1;
648 				} else {
649 					burst_rx_delay_time = ret;
650 				}
651 			}
652 
653 			/* Specify the number of retries on RX. */
654 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655 				ret = parse_num_opt(optarg, INT32_MAX);
656 				if (ret == -1) {
657 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658 					us_vhost_usage(prgname);
659 					return -1;
660 				} else {
661 					burst_rx_retry_num = ret;
662 				}
663 			}
664 
665 			/* Enable/disable RX mergeable buffers. */
666 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667 				ret = parse_num_opt(optarg, 1);
668 				if (ret == -1) {
669 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670 					us_vhost_usage(prgname);
671 					return -1;
672 				} else {
673 					mergeable = !!ret;
674 					if (ret) {
675 						vmdq_conf_default.rxmode.jumbo_frame = 1;
676 						vmdq_conf_default.rxmode.max_rx_pkt_len
677 							= JUMBO_FRAME_MAX_SIZE;
678 					}
679 				}
680 			}
681 
682 			/* Enable/disable stats. */
683 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684 				ret = parse_num_opt(optarg, INT32_MAX);
685 				if (ret == -1) {
686 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687 					us_vhost_usage(prgname);
688 					return -1;
689 				} else {
690 					enable_stats = ret;
691 				}
692 			}
693 
694 			/* Set character device basename. */
695 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696 				if (us_vhost_parse_basename(optarg) == -1) {
697 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698 					us_vhost_usage(prgname);
699 					return -1;
700 				}
701 			}
702 
703 			/* Enable/disable rx/tx zero copy. */
704 			if (!strncmp(long_option[option_index].name,
705 				"zero-copy", MAX_LONG_OPT_SZ)) {
706 				ret = parse_num_opt(optarg, 1);
707 				if (ret == -1) {
708 					RTE_LOG(INFO, VHOST_CONFIG,
709 						"Invalid argument"
710 						" for zero-copy [0|1]\n");
711 					us_vhost_usage(prgname);
712 					return -1;
713 				} else
714 					zero_copy = ret;
715 
716 				if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718 					RTE_LOG(ERR, VHOST_CONFIG, "Before running the "
719 					"zero copy vhost application, please "
720 					"disable RTE_MBUF_REFCNT\n"
721 					"in the config file and then rebuild the "
722 					"DPDK core lib!\n"
723 					"Otherwise, please disable the zero copy "
724 					"flag on the command line!\n");
725 					return -1;
726 #endif
727 				}
728 			}
729 
730 			/* Specify the descriptor number on RX. */
731 			if (!strncmp(long_option[option_index].name,
732 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
733 				ret = parse_num_opt(optarg, MAX_RING_DESC);
734 				if ((ret == -1) || (!POWEROF2(ret))) {
735 					RTE_LOG(INFO, VHOST_CONFIG,
736 					"Invalid argument for rx-desc-num[0-N],"
737 					"power of 2 required.\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					num_rx_descriptor = ret;
742 				}
743 			}
744 
745 			/* Specify the descriptor number on TX. */
746 			if (!strncmp(long_option[option_index].name,
747 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
748 				ret = parse_num_opt(optarg, MAX_RING_DESC);
749 				if ((ret == -1) || (!POWEROF2(ret))) {
750 					RTE_LOG(INFO, VHOST_CONFIG,
751 					"Invalid argument for tx-desc-num [0-N],"
752 					"power of 2 required.\n");
753 					us_vhost_usage(prgname);
754 					return -1;
755 				} else {
756 					num_tx_descriptor = ret;
757 				}
758 			}
759 
760 			break;
761 
762 			/* Invalid option - print options. */
763 		default:
764 			us_vhost_usage(prgname);
765 			return -1;
766 		}
767 	}
768 
769 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770 		if (enabled_port_mask & (1 << i))
771 			ports[num_ports++] = (uint8_t)i;
772 	}
773 
774 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
776 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
777 		return -1;
778 	}
779 
780 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781 		RTE_LOG(INFO, VHOST_PORT,
782 			"Vhost zero copy doesn't support software vm2vm, "
783 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
784 		return -1;
785 	}
786 
787 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788 		RTE_LOG(INFO, VHOST_PORT,
789 			"Vhost zero copy doesn't support jumbo frame, "
790 			"please specify '--mergeable 0' to disable the "
791 			"mergeable feature.\n");
792 		return -1;
793 	}
794 
795 	return 0;
796 }
797 
798 /*
799  * Update the global variable num_ports and the ports array according to the
800  * number of system ports, and return the number of valid ports.
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804 	unsigned valid_num_ports = num_ports;
805 	unsigned portid;
806 
807 	if (num_ports > nb_ports) {
808 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809 			num_ports, nb_ports);
810 		num_ports = nb_ports;
811 	}
812 
813 	for (portid = 0; portid < num_ports; portid ++) {
814 		if (ports[portid] >= nb_ports) {
815 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816 				ports[portid], (nb_ports - 1));
817 			ports[portid] = INVALID_PORT_ID;
818 			valid_num_ports--;
819 		}
820 	}
821 	return valid_num_ports;
822 }
823 
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not affected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {																\
830 	char *pkt_addr = (char*)(addr);																					\
831 	unsigned int index;																								\
832 	char packet[MAX_PRINT_BUFF];																					\
833 																													\
834 	if ((header))																									\
835 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
836 	else																											\
837 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
838 	for (index = 0; index < (size); index++) {																		\
839 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
840 			"%02hhx ", pkt_addr[index]);																			\
841 	}																												\
842 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
843 																													\
844 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
845 } while(0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
848 #endif
849 
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856 	uint32_t buf_len, hpa_type *addr_type)
857 {
858 	struct virtio_memory_regions_hpa *region;
859 	uint32_t regionidx;
860 	uint64_t vhost_pa = 0;
861 
862 	*addr_type = PHYS_ADDR_INVALID;
863 
864 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865 		region = &vdev->regions_hpa[regionidx];
866 		if ((guest_pa >= region->guest_phys_address) &&
867 			(guest_pa <= region->guest_phys_address_end)) {
868 			vhost_pa = region->host_phys_addr_offset + guest_pa;
869 			if (likely((guest_pa + buf_len - 1)
870 				<= region->guest_phys_address_end))
871 				*addr_type = PHYS_ADDR_CONTINUOUS;
872 			else
873 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
874 			break;
875 		}
876 	}
877 
878 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880 		(void *)(uintptr_t)vhost_pa);
881 
882 	return vhost_pa;
883 }
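/*
 * Worked example (editorial note, hypothetical numbers): if a region maps
 * guest physical 0x40000000..0x7fffffff with host_phys_addr_offset
 * 0x100000000, then guest_pa 0x40001000 translates to vhost_pa 0x140001000
 * and, provided guest_pa + buf_len - 1 stays within the region, addr_type is
 * set to PHYS_ADDR_CONTINUOUS; otherwise PHYS_ADDR_CROSS_SUBREG is reported.
 */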
884 
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
891 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
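/*
 * Editorial note: on a little-endian host the XOR-and-mask above compares only
 * the low 48 bits, i.e. the six MAC address bytes, in a single operation. It
 * reads 8 bytes per address, which relies on both ether_addr arguments being
 * embedded in larger structures (as is the case for the callers here).
 */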
893 
894 /*
895  * This function learns the MAC address of the device and registers it, along
896  * with a VLAN tag, with a VMDQ pool.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901 	struct ether_hdr *pkt_hdr;
902 	struct virtio_net_data_ll *dev_ll;
903 	struct virtio_net *dev = vdev->dev;
904 	int i, ret;
905 
906 	/* Learn MAC address of guest device from packet */
907 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908 
909 	dev_ll = ll_root_used;
910 
911 	while (dev_ll != NULL) {
912 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914 			return -1;
915 		}
916 		dev_ll = dev_ll->next;
917 	}
918 
919 	for (i = 0; i < ETHER_ADDR_LEN; i++)
920 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921 
922 	/* vlan_tag currently uses the device_id. */
923 	vdev->vlan_tag = vlan_tags[dev->device_fh];
924 
925 	/* Print out VMDQ registration info. */
926 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927 		dev->device_fh,
928 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931 		vdev->vlan_tag);
932 
933 	/* Register the MAC address. */
934 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935 	if (ret)
936 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937 					dev->device_fh);
938 
939 	/* Enable stripping of the vlan tag as we handle routing. */
940 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941 
942 	/* Set device as ready for RX. */
943 	vdev->ready = DEVICE_RX;
944 
945 	return 0;
946 }
947 
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955 	unsigned i = 0;
956 	unsigned rx_count;
957 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958 
959 	if (vdev->ready == DEVICE_RX) {
960 		/*clear MAC and VLAN settings*/
961 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962 		for (i = 0; i < 6; i++)
963 			vdev->mac_address.addr_bytes[i] = 0;
964 
965 		vdev->vlan_tag = 0;
966 
967 		/*Clear out the receive buffers*/
968 		rx_count = rte_eth_rx_burst(ports[0],
969 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970 
971 		while (rx_count) {
972 			for (i = 0; i < rx_count; i++)
973 				rte_pktmbuf_free(pkts_burst[i]);
974 
975 			rx_count = rte_eth_rx_burst(ports[0],
976 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977 		}
978 
979 		vdev->ready = DEVICE_MAC_LEARNING;
980 	}
981 }
982 
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that device's RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990 	struct virtio_net_data_ll *dev_ll;
991 	struct ether_hdr *pkt_hdr;
992 	uint64_t ret = 0;
993 	struct virtio_net *dev = vdev->dev;
994 	struct virtio_net *tdev; /* destination virtio device */
995 
996 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997 
998 	/*get the used devices list*/
999 	dev_ll = ll_root_used;
1000 
1001 	while (dev_ll != NULL) {
1002 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003 				          &dev_ll->vdev->mac_address)) {
1004 
1005 			/* Drop the packet if the TX packet is destined for the TX device. */
1006 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008 							dev->device_fh);
1009 				return 0;
1010 			}
1011 			tdev = dev_ll->vdev->dev;
1012 
1013 
1014 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015 
1016 			if (unlikely(dev_ll->vdev->remove)) {
1017 				/*drop the packet if the device is marked for removal*/
1018 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019 			} else {
1020 				/*send the packet to the local virtio device*/
1021 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022 				if (enable_stats) {
1023 					rte_atomic64_add(
1024 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1025 					1);
1026 					rte_atomic64_add(
1027 					&dev_statistics[tdev->device_fh].rx_atomic,
1028 					ret);
1029 					dev_statistics[tdev->device_fh].tx_total++;
1030 					dev_statistics[tdev->device_fh].tx += ret;
1031 				}
1032 			}
1033 
1034 			return 0;
1035 		}
1036 		dev_ll = dev_ll->next;
1037 	}
1038 
1039 	return -1;
1040 }
1041 
1042 /*
1043  * This function routes the TX packet to the correct interface. This may be a local device
1044  * or the physical port.
1045  */
1046 static inline void __attribute__((always_inline))
1047 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1048 {
1049 	struct mbuf_table *tx_q;
1050 	struct rte_mbuf **m_table;
1051 	unsigned len, ret, offset = 0;
1052 	const uint16_t lcore_id = rte_lcore_id();
1053 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1054 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1055 	struct virtio_net *dev = vdev->dev;
1056 
1057 	/*check if destination is local VM*/
1058 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1059 		rte_pktmbuf_free(m);
1060 		return;
1061 	}
1062 
1063 	if (vm2vm_mode == VM2VM_HARDWARE) {
1064 		while (dev_ll != NULL) {
1065 			if ((dev_ll->vdev->ready == DEVICE_RX)
1066 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1067 				&dev_ll->vdev->mac_address)) {
1068 				/*
1069 				 * Drop the packet if the TX packet is
1070 				 * destined for the TX device.
1071 				 */
1072 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1073 					LOG_DEBUG(VHOST_DATA,
1074 					"(%"PRIu64") TX: Source and destination"
1075 					" MAC addresses are the same. Dropping "
1076 					"packet.\n",
1077 					dev_ll->vdev->dev->device_fh);
1078 					rte_pktmbuf_free(m);
1079 					return;
1080 				}
1081 				offset = 4;
1082 				vlan_tag =
1083 				(uint16_t)
1084 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1085 
1086 				LOG_DEBUG(VHOST_DATA,
1087 				"(%"PRIu64") TX: pkt to local VM device id:"
1088 				"(%"PRIu64") vlan tag: %d.\n",
1089 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1090 				vlan_tag);
1091 
1092 				break;
1093 			}
1094 			dev_ll = dev_ll->next;
1095 		}
1096 	}
1097 
1098 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1099 
1100 	/*Add packet to the port tx queue*/
1101 	tx_q = &lcore_tx_queue[lcore_id];
1102 	len = tx_q->len;
1103 
1104 	m->ol_flags = PKT_TX_VLAN_PKT;
1105 	/*FIXME: offset*/
1106 	m->data_len += offset;
1107 	m->vlan_tci = vlan_tag;
1108 
1109 	tx_q->m_table[len] = m;
1110 	len++;
1111 	if (enable_stats) {
1112 		dev_statistics[dev->device_fh].tx_total++;
1113 		dev_statistics[dev->device_fh].tx++;
1114 	}
1115 
1116 	if (unlikely(len == MAX_PKT_BURST)) {
1117 		m_table = (struct rte_mbuf **)tx_q->m_table;
1118 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1119 		/* Free any buffers not handled by TX and update the port stats. */
1120 		if (unlikely(ret < len)) {
1121 			do {
1122 				rte_pktmbuf_free(m_table[ret]);
1123 			} while (++ret < len);
1124 		}
1125 
1126 		len = 0;
1127 	}
1128 
1129 	tx_q->len = len;
1130 	return;
1131 }
1132 /*
1133  * This function is called by each data core. It handles all RX/TX registered with the
1134  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1135  * with all devices in the main linked list.
1136  */
1137 static int
1138 switch_worker(void *arg)
1139 {
1140 	struct rte_mempool *mbuf_pool = arg;
1141 	struct virtio_net *dev = NULL;
1142 	struct vhost_dev *vdev = NULL;
1143 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1144 	struct virtio_net_data_ll *dev_ll;
1145 	struct mbuf_table *tx_q;
1146 	volatile struct lcore_ll_info *lcore_ll;
1147 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1148 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1149 	unsigned ret, i;
1150 	const uint16_t lcore_id = rte_lcore_id();
1151 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1152 	uint16_t rx_count = 0;
1153 	uint16_t tx_count;
1154 	uint32_t retry = 0;
1155 
1156 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1157 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1158 	prev_tsc = 0;
1159 
1160 	tx_q = &lcore_tx_queue[lcore_id];
1161 	for (i = 0; i < num_cores; i ++) {
1162 		if (lcore_ids[i] == lcore_id) {
1163 			tx_q->txq_id = i;
1164 			break;
1165 		}
1166 	}
1167 
1168 	while(1) {
1169 		cur_tsc = rte_rdtsc();
1170 		/*
1171 		 * TX burst queue drain
1172 		 */
1173 		diff_tsc = cur_tsc - prev_tsc;
1174 		if (unlikely(diff_tsc > drain_tsc)) {
1175 
1176 			if (tx_q->len) {
1177 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1178 
1179 				/*Tx any packets in the queue*/
1180 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1181 									   (struct rte_mbuf **)tx_q->m_table,
1182 									   (uint16_t)tx_q->len);
1183 				if (unlikely(ret < tx_q->len)) {
1184 					do {
1185 						rte_pktmbuf_free(tx_q->m_table[ret]);
1186 					} while (++ret < tx_q->len);
1187 				}
1188 
1189 				tx_q->len = 0;
1190 			}
1191 
1192 			prev_tsc = cur_tsc;
1193 
1194 		}
1195 
1196 		rte_prefetch0(lcore_ll->ll_root_used);
1197 		/*
1198 		 * Inform the configuration core that we have exited the linked list and that no devices are
1199 		 * in use if requested.
1200 		 */
1201 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1202 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1203 
1204 		/*
1205 		 * Process devices
1206 		 */
1207 		dev_ll = lcore_ll->ll_root_used;
1208 
1209 		while (dev_ll != NULL) {
1210 			/* get the virtio device for this list entry */
1211 			vdev = dev_ll->vdev;
1212 			dev = vdev->dev;
1213 
1214 			if (unlikely(vdev->remove)) {
1215 				dev_ll = dev_ll->next;
1216 				unlink_vmdq(vdev);
1217 				vdev->ready = DEVICE_SAFE_REMOVE;
1218 				continue;
1219 			}
1220 			if (likely(vdev->ready == DEVICE_RX)) {
1221 				/*Handle guest RX*/
1222 				rx_count = rte_eth_rx_burst(ports[0],
1223 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1224 
1225 				if (rx_count) {
1226 					/*
1227 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1228 					* Note that MAX_PKT_BURST must be less than the virtio queue size.
1229 					*/
1230 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1231 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1232 							rte_delay_us(burst_rx_delay_time);
1233 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1234 								break;
1235 						}
1236 					}
1237 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1238 					if (enable_stats) {
1239 						rte_atomic64_add(
1240 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1241 						rx_count);
1242 						rte_atomic64_add(
1243 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1244 					}
1245 					while (likely(rx_count)) {
1246 						rx_count--;
1247 						rte_pktmbuf_free(pkts_burst[rx_count]);
1248 					}
1249 
1250 				}
1251 			}
1252 
1253 			if (likely(!vdev->remove)) {
1254 				/* Handle guest TX*/
1255 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1256 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1257 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1258 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1259 						while (tx_count--)
1260 							rte_pktmbuf_free(pkts_burst[tx_count]);
1261 					}
1262 				}
1263 				while (tx_count)
1264 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1265 			}
1266 
1267 			/*move to the next device in the list*/
1268 			dev_ll = dev_ll->next;
1269 		}
1270 	}
1271 
1272 	return 0;
1273 }
1274 
1275 /*
1276  * This function gets the number of available ring entries for zero copy RX.
1277  * Only one thread will call this function for a particular virtio device,
1278  * so it is designed as a non-thread-safe function.
1279  */
1280 static inline uint32_t __attribute__((always_inline))
1281 get_available_ring_num_zcp(struct virtio_net *dev)
1282 {
1283 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1284 	uint16_t avail_idx;
1285 
1286 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1287 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1288 }
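/*
 * Editorial note: avail->idx is the free-running index the guest advances as
 * it posts buffers, and last_used_idx_res is the index up to which vhost has
 * already reserved entries, so the difference is the number of descriptors
 * currently available for zero copy RX on this queue.
 */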
1289 
1290 /*
1291  * This function gets the available ring index for zero copy RX; it retries
1292  * up to 'burst_rx_retry_num' times until it gets enough ring entries.
1293  * Only one thread will call this function for a particular virtio device,
1294  * so it is designed as a non-thread-safe function.
1295  */
1296 static inline uint32_t __attribute__((always_inline))
1297 get_available_ring_index_zcp(struct virtio_net *dev,
1298 	uint16_t *res_base_idx, uint32_t count)
1299 {
1300 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1301 	uint16_t avail_idx;
1302 	uint32_t retry = 0;
1303 	uint16_t free_entries;
1304 
1305 	*res_base_idx = vq->last_used_idx_res;
1306 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1307 	free_entries = (avail_idx - *res_base_idx);
1308 
1309 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1310 			"avail idx: %d, "
1311 			"res base idx:%d, free entries:%d\n",
1312 			dev->device_fh, avail_idx, *res_base_idx,
1313 			free_entries);
1314 
1315 	/*
1316 	 * If retry is enabled and the queue is full then we wait
1317 	 * and retry to avoid packet loss.
1318 	 */
1319 	if (enable_retry && unlikely(count > free_entries)) {
1320 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1321 			rte_delay_us(burst_rx_delay_time);
1322 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1323 			free_entries = (avail_idx - *res_base_idx);
1324 			if (count <= free_entries)
1325 				break;
1326 		}
1327 	}
1328 
1329 	/*check that we have enough buffers*/
1330 	if (unlikely(count > free_entries))
1331 		count = free_entries;
1332 
1333 	if (unlikely(count == 0)) {
1334 		LOG_DEBUG(VHOST_DATA,
1335 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1336 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1337 			dev->device_fh, avail_idx,
1338 			*res_base_idx, free_entries);
1339 		return 0;
1340 	}
1341 
1342 	vq->last_used_idx_res = *res_base_idx + count;
1343 
1344 	return count;
1345 }
1346 
1347 /*
1348  * This function puts a descriptor back on the used list.
1349  */
1350 static inline void __attribute__((always_inline))
1351 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1352 {
1353 	uint16_t res_cur_idx = vq->last_used_idx;
1354 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1355 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1356 	rte_compiler_barrier();
1357 	*(volatile uint16_t *)&vq->used->idx += 1;
1358 	vq->last_used_idx += 1;
1359 
1360 	/* Kick the guest if necessary. */
1361 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1362 		eventfd_write((int)vq->kickfd, 1);
1363 }
1364 
1365 /*
1366  * This function gets an available descriptor from the virtio vring and an
1367  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1368  * to adjust the offset for buff_addr and phys_addr according to the PMD
1369  * implementation, otherwise the frame data may be placed at the wrong location in the mbuf.
1370  */
1371 static inline void __attribute__((always_inline))
1372 attach_rxmbuf_zcp(struct virtio_net *dev)
1373 {
1374 	uint16_t res_base_idx, desc_idx;
1375 	uint64_t buff_addr, phys_addr;
1376 	struct vhost_virtqueue *vq;
1377 	struct vring_desc *desc;
1378 	struct rte_mbuf *mbuf = NULL;
1379 	struct vpool *vpool;
1380 	hpa_type addr_type;
1381 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1382 
1383 	vpool = &vpool_array[vdev->vmdq_rx_q];
1384 	vq = dev->virtqueue[VIRTIO_RXQ];
1385 
1386 	do {
1387 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1388 				1) != 1))
1389 			return;
1390 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1391 
1392 		desc = &vq->desc[desc_idx];
1393 		if (desc->flags & VRING_DESC_F_NEXT) {
1394 			desc = &vq->desc[desc->next];
1395 			buff_addr = gpa_to_vva(dev, desc->addr);
1396 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1397 					&addr_type);
1398 		} else {
1399 			buff_addr = gpa_to_vva(dev,
1400 					desc->addr + vq->vhost_hlen);
1401 			phys_addr = gpa_to_hpa(vdev,
1402 					desc->addr + vq->vhost_hlen,
1403 					desc->len, &addr_type);
1404 		}
1405 
1406 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1407 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1408 				" address found when attaching RX frame buffer"
1409 				" address!\n", dev->device_fh);
1410 			put_desc_to_used_list_zcp(vq, desc_idx);
1411 			continue;
1412 		}
1413 
1414 		/*
1415 		 * Check if the frame buffer address from guest crosses
1416 		 * a sub-region boundary or not.
1417 		 */
1418 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1419 			RTE_LOG(ERR, VHOST_DATA,
1420 				"(%"PRIu64") Frame buffer address crossing a "
1421 				"sub-region found when attaching RX frame "
1422 				"buffer address!\n",
1423 				dev->device_fh);
1424 			put_desc_to_used_list_zcp(vq, desc_idx);
1425 			continue;
1426 		}
1427 	} while (unlikely(phys_addr == 0));
1428 
1429 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1430 	if (unlikely(mbuf == NULL)) {
1431 		LOG_DEBUG(VHOST_DATA,
1432 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1433 			"ring_sc_dequeue fail.\n",
1434 			dev->device_fh);
1435 		put_desc_to_used_list_zcp(vq, desc_idx);
1436 		return;
1437 	}
1438 
1439 	if (unlikely(vpool->buf_size > desc->len)) {
1440 		LOG_DEBUG(VHOST_DATA,
1441 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1442 			"length(%d) of descriptor idx: %d less than room "
1443 			"size required: %d\n",
1444 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1445 		put_desc_to_used_list_zcp(vq, desc_idx);
1446 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1447 		return;
1448 	}
1449 
1450 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1451 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1452 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1453 	mbuf->data_len = desc->len;
1454 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1455 
1456 	LOG_DEBUG(VHOST_DATA,
1457 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1458 		"descriptor idx:%d\n",
1459 		dev->device_fh, res_base_idx, desc_idx);
1460 
1461 	__rte_mbuf_raw_free(mbuf);
1462 
1463 	return;
1464 }
1465 
1466 /*
1467  * Detach an attached packet mbuf -
1468  *  - restore original mbuf address and length values.
1469  *  - reset pktmbuf data and data_len to their default values.
1470  *  All other fields of the given packet mbuf will be left intact.
1471  *
1472  * @param m
1473  *   The attached packet mbuf.
1474  */
1475 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1476 {
1477 	const struct rte_mempool *mp = m->pool;
1478 	void *buf = RTE_MBUF_TO_BADDR(m);
1479 	uint32_t buf_ofs;
1480 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1481 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1482 
1483 	m->buf_addr = buf;
1484 	m->buf_len = (uint16_t)buf_len;
1485 
1486 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1487 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1488 	m->data_off = buf_ofs;
1489 
1490 	m->data_len = 0;
1491 }
1492 
1493 /*
1494  * This function is called after packets have been transmitted. It fetches mbufs
1495  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1496  * the used index and kicks the guest if necessary.
1497  */
1498 static inline uint32_t __attribute__((always_inline))
1499 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1500 {
1501 	struct rte_mbuf *mbuf;
1502 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1503 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1504 	uint32_t index = 0;
1505 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1506 
1507 	LOG_DEBUG(VHOST_DATA,
1508 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1509 		"clean is: %d\n",
1510 		dev->device_fh, mbuf_count);
1511 	LOG_DEBUG(VHOST_DATA,
1512 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1513 		"clean  is : %d\n",
1514 		dev->device_fh, rte_ring_count(vpool->ring));
1515 
1516 	for (index = 0; index < mbuf_count; index++) {
1517 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1518 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1519 			pktmbuf_detach_zcp(mbuf);
1520 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1521 
1522 		/* Update used index buffer information. */
1523 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1524 		vq->used->ring[used_idx].len = 0;
1525 
1526 		used_idx = (used_idx + 1) & (vq->size - 1);
1527 	}
1528 
1529 	LOG_DEBUG(VHOST_DATA,
1530 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1531 		"clean is: %d\n",
1532 		dev->device_fh, rte_mempool_count(vpool->pool));
1533 	LOG_DEBUG(VHOST_DATA,
1534 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1535 		"clean  is : %d\n",
1536 		dev->device_fh, rte_ring_count(vpool->ring));
1537 	LOG_DEBUG(VHOST_DATA,
1538 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1539 		"vq->last_used_idx:%d\n",
1540 		dev->device_fh, vq->last_used_idx);
1541 
1542 	vq->last_used_idx += mbuf_count;
1543 
1544 	LOG_DEBUG(VHOST_DATA,
1545 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1546 		"vq->last_used_idx:%d\n",
1547 		dev->device_fh, vq->last_used_idx);
1548 
1549 	rte_compiler_barrier();
1550 
1551 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1552 
1553 	/* Kick guest if required. */
1554 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1555 		eventfd_write((int)vq->kickfd, 1);
1556 
1557 	return 0;
1558 }
1559 
1560 /*
1561  * This function is called when a virtio device is destroyed.
1562  * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1563  */
1564 static void mbuf_destroy_zcp(struct vpool *vpool)
1565 {
1566 	struct rte_mbuf *mbuf = NULL;
1567 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1568 
1569 	LOG_DEBUG(VHOST_CONFIG,
1570 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1571 		"mbuf_destroy_zcp is: %d\n",
1572 		mbuf_count);
1573 	LOG_DEBUG(VHOST_CONFIG,
1574 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1575 		"mbuf_destroy_zcp  is : %d\n",
1576 		rte_ring_count(vpool->ring));
1577 
1578 	for (index = 0; index < mbuf_count; index++) {
1579 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1580 		if (likely(mbuf != NULL)) {
1581 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1582 				pktmbuf_detach_zcp(mbuf);
1583 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1584 		}
1585 	}
1586 
1587 	LOG_DEBUG(VHOST_CONFIG,
1588 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1589 		"mbuf_destroy_zcp is: %d\n",
1590 		rte_mempool_count(vpool->pool));
1591 	LOG_DEBUG(VHOST_CONFIG,
1592 		"in mbuf_destroy_zcp: mbuf count in ring after "
1593 		"mbuf_destroy_zcp is : %d\n",
1594 		rte_ring_count(vpool->ring));
1595 }
1596 
1597 /*
1598  * This function updates the used ring with descriptor information for zero copy RX.
1599  */
1600 static inline uint32_t __attribute__((always_inline))
1601 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1602 	uint32_t count)
1603 {
1604 	struct vhost_virtqueue *vq;
1605 	struct vring_desc *desc;
1606 	struct rte_mbuf *buff;
1607 	/* The virtio_hdr is initialised to 0. */
1608 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1609 		= {{0, 0, 0, 0, 0, 0}, 0};
1610 	uint64_t buff_hdr_addr = 0;
1611 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1612 	uint32_t head_idx, packet_success = 0;
1613 	uint16_t res_cur_idx;
1614 
1615 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1616 
1617 	if (count == 0)
1618 		return 0;
1619 
1620 	vq = dev->virtqueue[VIRTIO_RXQ];
1621 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1622 
1623 	res_cur_idx = vq->last_used_idx;
1624 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1625 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1626 
1627 	/* Retrieve all of the head indexes first to avoid caching issues. */
1628 	for (head_idx = 0; head_idx < count; head_idx++)
1629 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1630 
1631 	/*Prefetch descriptor index. */
1632 	rte_prefetch0(&vq->desc[head[packet_success]]);
1633 
1634 	while (packet_success != count) {
1635 		/* Get descriptor from available ring */
1636 		desc = &vq->desc[head[packet_success]];
1637 
1638 		buff = pkts[packet_success];
1639 		LOG_DEBUG(VHOST_DATA,
1640 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1641 			"pkt[%d] descriptor idx: %d\n",
1642 			dev->device_fh, packet_success,
1643 			MBUF_HEADROOM_UINT32(buff));
1644 
1645 		PRINT_PACKET(dev,
1646 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1647 			+ RTE_PKTMBUF_HEADROOM),
1648 			rte_pktmbuf_data_len(buff), 0);
1649 
1650 		/* Buffer address translation for virtio header. */
1651 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1652 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1653 
1654 		/*
1655 		 * If the descriptors are chained the header and data are
1656 		 * placed in separate buffers.
1657 		 */
1658 		if (desc->flags & VRING_DESC_F_NEXT) {
1659 			desc->len = vq->vhost_hlen;
1660 			desc = &vq->desc[desc->next];
1661 			desc->len = rte_pktmbuf_data_len(buff);
1662 		} else {
1663 			desc->len = packet_len;
1664 		}
1665 
1666 		/* Update used ring with desc information */
1667 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1668 			= head[packet_success];
1669 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1670 			= packet_len;
1671 		res_cur_idx++;
1672 		packet_success++;
1673 
1674 		/* A header is required per buffer. */
1675 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1676 			(const void *)&virtio_hdr, vq->vhost_hlen);
1677 
1678 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1679 
1680 		if (likely(packet_success < count)) {
1681 			/* Prefetch descriptor index. */
1682 			rte_prefetch0(&vq->desc[head[packet_success]]);
1683 		}
1684 	}
1685 
1686 	rte_compiler_barrier();
1687 
1688 	LOG_DEBUG(VHOST_DATA,
1689 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1690 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1691 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1692 
1693 	*(volatile uint16_t *)&vq->used->idx += count;
1694 	vq->last_used_idx += count;
1695 
1696 	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
1698 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1699 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1700 
1701 	/* Kick the guest if necessary. */
1702 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1703 		eventfd_write((int)vq->kickfd, 1);
1704 
1705 	return count;
1706 }
1707 
1708 /*
1709  * This function routes the TX packet to the correct interface.
1710  * This may be a local device or the physical port.
1711  */
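/*
 * Routing sketch for the zero copy TX path below: a replacement mbuf is
 * dequeued from the TX vpool ring and made to point at (or, if the guest
 * buffer crosses a host-physical sub-region, copy) the guest frame.  In
 * VM2VM_HARDWARE mode the destination MAC is looked up in the device list;
 * a hit selects that VM's VLAN tag (plus a 4 byte length adjustment for the
 * hardware VLAN strip on the way back in), otherwise the frame keeps the
 * default external VLAN tag and is treated as outbound traffic.
 */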
1712 static inline void __attribute__((always_inline))
1713 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1714 	uint32_t desc_idx, uint8_t need_copy)
1715 {
1716 	struct mbuf_table *tx_q;
1717 	struct rte_mbuf **m_table;
1718 	struct rte_mbuf *mbuf = NULL;
1719 	unsigned len, ret, offset = 0;
1720 	struct vpool *vpool;
1721 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1722 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1723 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1724 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1725 
	/* Add packet to the port tx queue. */
1727 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1728 	len = tx_q->len;
1729 
1730 	/* Allocate an mbuf and populate the structure. */
1731 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1732 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1733 	if (unlikely(mbuf == NULL)) {
1734 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1735 		RTE_LOG(ERR, VHOST_DATA,
1736 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1737 			dev->device_fh);
1738 		put_desc_to_used_list_zcp(vq, desc_idx);
1739 		return;
1740 	}
1741 
1742 	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Avoid using a VLAN tag from any VM for an external packet,
		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
		 * during pool selection: the MAC address marks the packet as
		 * external traffic that should go out to the network, while
		 * the VLAN tag marks it as VM2VM traffic that should be
		 * forwarded to another VM. The hardware cannot resolve this
		 * ambiguity, so the packet would be lost.
		 */
1750 		vlan_tag = external_pkt_default_vlan_tag;
1751 		while (dev_ll != NULL) {
1752 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1753 				ether_addr_cmp(&(pkt_hdr->d_addr),
1754 				&dev_ll->vdev->mac_address)) {
1755 
1756 				/*
1757 				 * Drop the packet if the TX packet is destined
1758 				 * for the TX device.
1759 				 */
1760 				if (unlikely(dev_ll->vdev->dev->device_fh
1761 					== dev->device_fh)) {
1762 					LOG_DEBUG(VHOST_DATA,
1763 					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
1765 					"packet.\n",
1766 					dev_ll->vdev->dev->device_fh);
1767 					MBUF_HEADROOM_UINT32(mbuf)
1768 						= (uint32_t)desc_idx;
1769 					__rte_mbuf_raw_free(mbuf);
1770 					return;
1771 				}
1772 
1773 				/*
1774 				 * Packet length offset 4 bytes for HW vlan
1775 				 * strip when L2 switch back.
1776 				 */
1777 				offset = 4;
1778 				vlan_tag =
1779 				(uint16_t)
1780 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1781 
1782 				LOG_DEBUG(VHOST_DATA,
1783 				"(%"PRIu64") TX: pkt to local VM device id:"
1784 				"(%"PRIu64") vlan tag: %d.\n",
1785 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1786 				vlan_tag);
1787 
1788 				break;
1789 			}
1790 			dev_ll = dev_ll->next;
1791 		}
1792 	}
1793 
1794 	mbuf->nb_segs = m->nb_segs;
1795 	mbuf->next = m->next;
1796 	mbuf->data_len = m->data_len + offset;
1797 	mbuf->pkt_len = mbuf->data_len;
1798 	if (unlikely(need_copy)) {
1799 		/* Copy the packet contents to the mbuf. */
1800 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1801 			rte_pktmbuf_mtod(m, void *),
1802 			m->data_len);
1803 	} else {
1804 		mbuf->data_off = m->data_off;
1805 		mbuf->buf_physaddr = m->buf_physaddr;
1806 		mbuf->buf_addr = m->buf_addr;
1807 	}
1808 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1809 	mbuf->vlan_tci = vlan_tag;
1810 	mbuf->l2_len = sizeof(struct ether_hdr);
1811 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1812 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1813 
1814 	tx_q->m_table[len] = mbuf;
1815 	len++;
1816 
1817 	LOG_DEBUG(VHOST_DATA,
1818 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1819 		dev->device_fh,
1820 		mbuf->nb_segs,
1821 		(mbuf->next == NULL) ? "null" : "non-null");
1822 
1823 	if (enable_stats) {
1824 		dev_statistics[dev->device_fh].tx_total++;
1825 		dev_statistics[dev->device_fh].tx++;
1826 	}
1827 
1828 	if (unlikely(len == MAX_PKT_BURST)) {
1829 		m_table = (struct rte_mbuf **)tx_q->m_table;
1830 		ret = rte_eth_tx_burst(ports[0],
1831 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1832 
1833 		/*
1834 		 * Free any buffers not handled by TX and update
1835 		 * the port stats.
1836 		 */
1837 		if (unlikely(ret < len)) {
1838 			do {
1839 				rte_pktmbuf_free(m_table[ret]);
1840 			} while (++ret < len);
1841 		}
1842 
1843 		len = 0;
1844 		txmbuf_clean_zcp(dev, vpool);
1845 	}
1846 
1847 	tx_q->len = len;
1848 
1849 	return;
1850 }
1851 
/*
 * This function transmits all available packets in the virtio TX queue of
 * one virtio-net device. If it is the first packet, it learns the MAC
 * address and sets up VMDQ.
 */
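/*
 * Rough flow of the zero copy TX dequeue below: the first descriptor of each
 * chain carries the virtio header and is skipped; the next one points at the
 * frame data, whose guest physical address is translated both to a host
 * virtual address (for inspection) and to a host physical address (for the
 * NIC).  A dummy mbuf on the stack is wrapped around the guest buffer and
 * handed to virtio_tx_route_zcp(); only buffers that cross a host-physical
 * sub-region are flagged for copying.
 */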
1857 static inline void __attribute__((always_inline))
1858 virtio_dev_tx_zcp(struct virtio_net *dev)
1859 {
1860 	struct rte_mbuf m;
1861 	struct vhost_virtqueue *vq;
1862 	struct vring_desc *desc;
1863 	uint64_t buff_addr = 0, phys_addr;
1864 	uint32_t head[MAX_PKT_BURST];
1865 	uint32_t i;
1866 	uint16_t free_entries, packet_success = 0;
1867 	uint16_t avail_idx;
1868 	uint8_t need_copy = 0;
1869 	hpa_type addr_type;
1870 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1871 
1872 	vq = dev->virtqueue[VIRTIO_TXQ];
1873 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1874 
1875 	/* If there are no available buffers then return. */
1876 	if (vq->last_used_idx_res == avail_idx)
1877 		return;
1878 
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n",
		dev->device_fh);
1880 
1881 	/* Prefetch available ring to retrieve head indexes. */
1882 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1883 
1884 	/* Get the number of free entries in the ring */
1885 	free_entries = (avail_idx - vq->last_used_idx_res);
1886 
1887 	/* Limit to MAX_PKT_BURST. */
1888 	free_entries
1889 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1890 
1891 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1892 		dev->device_fh, free_entries);
1893 
1894 	/* Retrieve all of the head indexes first to avoid caching issues. */
1895 	for (i = 0; i < free_entries; i++)
1896 		head[i]
1897 			= vq->avail->ring[(vq->last_used_idx_res + i)
1898 			& (vq->size - 1)];
1899 
1900 	vq->last_used_idx_res += free_entries;
1901 
1902 	/* Prefetch descriptor index. */
1903 	rte_prefetch0(&vq->desc[head[packet_success]]);
1904 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1905 
1906 	while (packet_success < free_entries) {
1907 		desc = &vq->desc[head[packet_success]];
1908 
1909 		/* Discard first buffer as it is the virtio header */
1910 		desc = &vq->desc[desc->next];
1911 
1912 		/* Buffer address translation. */
1913 		buff_addr = gpa_to_vva(dev, desc->addr);
1914 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1915 
1916 		if (likely(packet_success < (free_entries - 1)))
1917 			/* Prefetch descriptor index. */
1918 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1919 
1920 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Invalid frame buffer address "
				"found when transmitting packets!\n",
				dev->device_fh);
1925 			packet_success++;
1926 			continue;
1927 		}
1928 
1929 		/* Prefetch buffer address. */
1930 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1931 
1932 		/*
1933 		 * Setup dummy mbuf. This is copied to a real mbuf if
1934 		 * transmitted out the physical port.
1935 		 */
1936 		m.data_len = desc->len;
1937 		m.nb_segs = 1;
1938 		m.next = NULL;
1939 		m.data_off = 0;
1940 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1941 		m.buf_physaddr = phys_addr;
1942 
1943 		/*
1944 		 * Check if the frame buffer address from guest crosses
1945 		 * sub-region or not.
1946 		 */
1947 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address crossing a "
				"sub-region found when attaching TX frame "
				"buffer address!\n",
				dev->device_fh);
1953 			need_copy = 1;
1954 		} else
1955 			need_copy = 0;
1956 
1957 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1958 
1959 		/*
1960 		 * If this is the first received packet we need to learn
1961 		 * the MAC and setup VMDQ
1962 		 */
1963 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1964 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1965 				/*
1966 				 * Discard frame if device is scheduled for
1967 				 * removal or a duplicate MAC address is found.
1968 				 */
1969 				packet_success += free_entries;
1970 				vq->last_used_idx += packet_success;
1971 				break;
1972 			}
1973 		}
1974 
1975 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1976 		packet_success++;
1977 	}
1978 }
1979 
1980 /*
1981  * This function is called by each data core. It handles all RX/TX registered
1982  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1983  * addresses are compared with all devices in the main linked list.
1984  */
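/*
 * The polling loop below alternates between two duties: (1) a periodic TX
 * drain, triggered when the TSC delta exceeds drain_tsc (roughly
 * BURST_TX_DRAIN_US microseconds; for example, with a 2 GHz TSC and a 100 us
 * drain interval that is about 200,000 cycles), and (2) per-device work,
 * i.e. attaching guest RX buffers, polling the VMDQ RX queue into the guest
 * and draining the guest TX queue.
 */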
1985 static int
1986 switch_worker_zcp(__attribute__((unused)) void *arg)
1987 {
1988 	struct virtio_net *dev = NULL;
1989 	struct vhost_dev  *vdev = NULL;
1990 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1991 	struct virtio_net_data_ll *dev_ll;
1992 	struct mbuf_table *tx_q;
1993 	volatile struct lcore_ll_info *lcore_ll;
1994 	const uint64_t drain_tsc
1995 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1996 		* BURST_TX_DRAIN_US;
1997 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1998 	unsigned ret;
1999 	const uint16_t lcore_id = rte_lcore_id();
2000 	uint16_t count_in_ring, rx_count = 0;
2001 
	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2003 
2004 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2005 	prev_tsc = 0;
2006 
2007 	while (1) {
2008 		cur_tsc = rte_rdtsc();
2009 
2010 		/* TX burst queue drain */
2011 		diff_tsc = cur_tsc - prev_tsc;
2012 		if (unlikely(diff_tsc > drain_tsc)) {
			/*
			 * Drain the TX queue of each device, then get the
			 * mbufs back from vpool.pool, detach them and put
			 * them back into vpool.ring.
			 */
2017 			dev_ll = lcore_ll->ll_root_used;
2018 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2019 				/* Get virtio device ID */
2020 				vdev = dev_ll->vdev;
2021 				dev = vdev->dev;
2022 
2023 				if (likely(!vdev->remove)) {
2024 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2025 					if (tx_q->len) {
2026 						LOG_DEBUG(VHOST_DATA,
2027 						"TX queue drained after timeout"
2028 						" with burst size %u\n",
2029 						tx_q->len);
2030 
2031 						/*
2032 						 * Tx any packets in the queue
2033 						 */
2034 						ret = rte_eth_tx_burst(
2035 							ports[0],
2036 							(uint16_t)tx_q->txq_id,
2037 							(struct rte_mbuf **)
2038 							tx_q->m_table,
2039 							(uint16_t)tx_q->len);
2040 						if (unlikely(ret < tx_q->len)) {
2041 							do {
2042 								rte_pktmbuf_free(
2043 									tx_q->m_table[ret]);
2044 							} while (++ret < tx_q->len);
2045 						}
2046 						tx_q->len = 0;
2047 
2048 						txmbuf_clean_zcp(dev,
2049 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2050 					}
2051 				}
2052 				dev_ll = dev_ll->next;
2053 			}
2054 			prev_tsc = cur_tsc;
2055 		}
2056 
2057 		rte_prefetch0(lcore_ll->ll_root_used);
2058 
2059 		/*
2060 		 * Inform the configuration core that we have exited the linked
2061 		 * list and that no devices are in use if requested.
2062 		 */
2063 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2064 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2065 
2066 		/* Process devices */
2067 		dev_ll = lcore_ll->ll_root_used;
2068 
2069 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2070 			vdev = dev_ll->vdev;
2071 			dev  = vdev->dev;
2072 			if (unlikely(vdev->remove)) {
2073 				dev_ll = dev_ll->next;
2074 				unlink_vmdq(vdev);
2075 				vdev->ready = DEVICE_SAFE_REMOVE;
2076 				continue;
2077 			}
2078 
2079 			if (likely(vdev->ready == DEVICE_RX)) {
2080 				uint32_t index = vdev->vmdq_rx_q;
2081 				uint16_t i;
2082 				count_in_ring
2083 				= rte_ring_count(vpool_array[index].ring);
2084 				uint16_t free_entries
2085 				= (uint16_t)get_available_ring_num_zcp(dev);
2086 
2087 				/*
2088 				 * Attach all mbufs in vpool.ring and put back
2089 				 * into vpool.pool.
2090 				 */
2091 				for (i = 0;
2092 				i < RTE_MIN(free_entries,
2093 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2094 				i++)
2095 					attach_rxmbuf_zcp(dev);
2096 
2097 				/* Handle guest RX */
2098 				rx_count = rte_eth_rx_burst(ports[0],
2099 					vdev->vmdq_rx_q, pkts_burst,
2100 					MAX_PKT_BURST);
2101 
2102 				if (rx_count) {
2103 					ret_count = virtio_dev_rx_zcp(dev,
2104 							pkts_burst, rx_count);
2105 					if (enable_stats) {
2106 						dev_statistics[dev->device_fh].rx_total
2107 							+= rx_count;
2108 						dev_statistics[dev->device_fh].rx
2109 							+= ret_count;
2110 					}
2111 					while (likely(rx_count)) {
2112 						rx_count--;
2113 						pktmbuf_detach_zcp(
2114 							pkts_burst[rx_count]);
2115 						rte_ring_sp_enqueue(
2116 							vpool_array[index].ring,
2117 							(void *)pkts_burst[rx_count]);
2118 					}
2119 				}
2120 			}
2121 
2122 			if (likely(!vdev->remove))
2123 				/* Handle guest TX */
2124 				virtio_dev_tx_zcp(dev);
2125 
2126 			/* Move to the next device in the list */
2127 			dev_ll = dev_ll->next;
2128 		}
2129 	}
2130 
2131 	return 0;
2132 }
2133 
2134 
2135 /*
2136  * Add an entry to a used linked list. A free entry must first be found
2137  * in the free linked list using get_data_ll_free_entry();
2138  */
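/*
 * Typical usage, as in new_device() further below:
 *
 *     ll_dev = get_data_ll_free_entry(&ll_root_free);
 *     if (ll_dev == NULL)
 *             return -1;
 *     ll_dev->vdev = vdev;
 *     add_data_ll_entry(&ll_root_used, ll_dev);
 */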
2139 static void
2140 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2141 	struct virtio_net_data_ll *ll_dev)
2142 {
2143 	struct virtio_net_data_ll *ll = *ll_root_addr;
2144 
2145 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2146 	ll_dev->next = NULL;
2147 	rte_compiler_barrier();
2148 
2149 	/* If ll == NULL then this is the first device. */
2150 	if (ll) {
2151 		/* Increment to the tail of the linked list. */
		while (ll->next != NULL)
2153 			ll = ll->next;
2154 
2155 		ll->next = ll_dev;
2156 	} else {
2157 		*ll_root_addr = ll_dev;
2158 	}
2159 }
2160 
2161 /*
2162  * Remove an entry from a used linked list. The entry must then be added to
2163  * the free linked list using put_data_ll_free_entry().
2164  */
2165 static void
2166 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2167 	struct virtio_net_data_ll *ll_dev,
2168 	struct virtio_net_data_ll *ll_dev_last)
2169 {
2170 	struct virtio_net_data_ll *ll = *ll_root_addr;
2171 
2172 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2173 		return;
2174 
2175 	if (ll_dev == ll)
2176 		*ll_root_addr = ll_dev->next;
2177 	else
2178 		if (likely(ll_dev_last != NULL))
2179 			ll_dev_last->next = ll_dev->next;
2180 		else
			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2182 }
2183 
2184 /*
2185  * Find and return an entry from the free linked list.
2186  */
2187 static struct virtio_net_data_ll *
2188 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2189 {
2190 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2191 	struct virtio_net_data_ll *ll_dev;
2192 
2193 	if (ll_free == NULL)
2194 		return NULL;
2195 
2196 	ll_dev = ll_free;
2197 	*ll_root_addr = ll_free->next;
2198 
2199 	return ll_dev;
2200 }
2201 
2202 /*
2203  * Place an entry back on to the free linked list.
2204  */
2205 static void
2206 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2207 	struct virtio_net_data_ll *ll_dev)
2208 {
2209 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2210 
2211 	if (ll_dev == NULL)
2212 		return;
2213 
2214 	ll_dev->next = ll_free;
2215 	*ll_root_addr = ll_dev;
2216 }
2217 
2218 /*
2219  * Creates a linked list of a given size.
2220  */
2221 static struct virtio_net_data_ll *
2222 alloc_data_ll(uint32_t size)
2223 {
2224 	struct virtio_net_data_ll *ll_new;
2225 	uint32_t i;
2226 
2227 	/* Malloc and then chain the linked list. */
2228 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2229 	if (ll_new == NULL) {
2230 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2231 		return NULL;
2232 	}
2233 
2234 	for (i = 0; i < size - 1; i++) {
2235 		ll_new[i].vdev = NULL;
2236 		ll_new[i].next = &ll_new[i+1];
2237 	}
2238 	ll_new[i].next = NULL;
2239 
2240 	return (ll_new);
2241 }
2242 
/*
 * Create the main linked list along with each individual core's linked list.
 * A used and a free list are created to manage entries.
 */
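/*
 * Sizing example for the per-lcore free lists created below: with, say,
 * num_devices == 8 and num_switching_cores == 3, 8 % 3 != 0, so each lcore
 * free list is allocated 8 / 3 + 1 == 3 entries; the main free list is
 * capped at MIN(num_devices, MAX_DEVICES) entries.
 */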
2247 static int
2248 init_data_ll (void)
2249 {
2250 	int lcore;
2251 
2252 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2253 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2254 		if (lcore_info[lcore].lcore_ll == NULL) {
2255 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2256 			return -1;
2257 		}
2258 
2259 		lcore_info[lcore].lcore_ll->device_num = 0;
2260 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2261 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2262 		if (num_devices % num_switching_cores)
2263 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2264 		else
2265 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2266 	}
2267 
2268 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2269 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2270 
2271 	return 0;
2272 }
2273 
/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. Synchronization occurs through the use of the lcore
 * dev_removal_flag. The device is made volatile here to avoid re-ordering of
 * dev->remove = 1, which can cause an infinite loop in the rte_pause loop.
 */
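/*
 * Removal handshake used below: the config core sets vdev->remove and spins
 * until the data core marks the device DEVICE_SAFE_REMOVE; it then unlinks
 * the entries, raises REQUEST_DEV_REMOVAL on every lcore and waits for each
 * one to answer ACK_DEV_REMOVAL before the entries and the vhost_dev are
 * freed (plus, for zero copy, the RX/TX queues are stopped and their mbufs
 * recycled).
 */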
2279 static void
2280 destroy_device (volatile struct virtio_net *dev)
2281 {
2282 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2283 	struct virtio_net_data_ll *ll_main_dev_cur;
2284 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2285 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2286 	struct vhost_dev *vdev;
2287 	int lcore;
2288 
2289 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2290 
2291 	vdev = (struct vhost_dev *)dev->priv;
	/* Set the remove flag. */
2293 	vdev->remove = 1;
2294 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2295 		rte_pause();
2296 	}
2297 
2298 	/* Search for entry to be removed from lcore ll */
2299 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2300 	while (ll_lcore_dev_cur != NULL) {
2301 		if (ll_lcore_dev_cur->vdev == vdev) {
2302 			break;
2303 		} else {
2304 			ll_lcore_dev_last = ll_lcore_dev_cur;
2305 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2306 		}
2307 	}
2308 
2309 	if (ll_lcore_dev_cur == NULL) {
2310 		RTE_LOG(ERR, VHOST_CONFIG,
			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2312 			dev->device_fh);
2313 		return;
2314 	}
2315 
2316 	/* Search for entry to be removed from main ll */
2317 	ll_main_dev_cur = ll_root_used;
2318 	ll_main_dev_last = NULL;
2319 	while (ll_main_dev_cur != NULL) {
2320 		if (ll_main_dev_cur->vdev == vdev) {
2321 			break;
2322 		} else {
2323 			ll_main_dev_last = ll_main_dev_cur;
2324 			ll_main_dev_cur = ll_main_dev_cur->next;
2325 		}
2326 	}
2327 
2328 	/* Remove entries from the lcore and main ll. */
2329 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2330 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2331 
2332 	/* Set the dev_removal_flag on each lcore. */
2333 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2334 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2335 	}
2336 
2337 	/*
2338 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2339 	 * they can no longer access the device removed from the linked lists and that the devices
2340 	 * are no longer in use.
2341 	 */
2342 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2343 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2344 			rte_pause();
2345 		}
2346 	}
2347 
2348 	/* Add the entries back to the lcore and main free ll.*/
2349 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2350 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2351 
2352 	/* Decrement number of device on the lcore. */
2353 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2354 
2355 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2356 
2357 	if (zero_copy) {
2358 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2359 
2360 		/* Stop the RX queue. */
2361 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2362 			LOG_DEBUG(VHOST_CONFIG,
2363 				"(%"PRIu64") In destroy_device: Failed to stop "
2364 				"rx queue:%d\n",
2365 				dev->device_fh,
2366 				vdev->vmdq_rx_q);
2367 		}
2368 
2369 		LOG_DEBUG(VHOST_CONFIG,
2370 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2371 			"mempool back to ring for RX queue: %d\n",
2372 			dev->device_fh, vdev->vmdq_rx_q);
2373 
2374 		mbuf_destroy_zcp(vpool);
2375 
2376 		/* Stop the TX queue. */
2377 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2378 			LOG_DEBUG(VHOST_CONFIG,
2379 				"(%"PRIu64") In destroy_device: Failed to "
2380 				"stop tx queue:%d\n",
2381 				dev->device_fh, vdev->vmdq_rx_q);
2382 		}
2383 
2384 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2385 
2386 		LOG_DEBUG(VHOST_CONFIG,
2387 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2388 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2389 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2390 			dev->device_fh);
2391 
2392 		mbuf_destroy_zcp(vpool);
2393 		rte_free(vdev->regions_hpa);
2394 	}
2395 	rte_free(vdev);
2396 
2397 }
2398 
/*
 * Calculate how many additional physically contiguous sub-regions are needed
 * for one particular region whose vhost virtual address range is contiguous,
 * i.e. the number of physical-address discontinuities found within it. The
 * region starts at vva_start and is 'size' bytes long.
 */
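/*
 * Worked example with hypothetical addresses and a 4 KiB page size: if three
 * consecutive guest pages map to host physical 0x10000, 0x11000 and 0x30000,
 * there is one discontinuity (0x11000 + 0x1000 != 0x30000), so the function
 * returns 1 and the caller reserves one extra sub-region.
 */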
2404 static uint32_t
2405 check_hpa_regions(uint64_t vva_start, uint64_t size)
2406 {
2407 	uint32_t i, nregions = 0, page_size = getpagesize();
2408 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
	if (vva_start % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_continuous: vva start(%p) mod page_size(%d) "
			"has remainder\n",
			(void *)(uintptr_t)vva_start, page_size);
		return 0;
	}
	if (size % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_continuous: "
			"size(%"PRIu64") mod page_size(%d) has remainder\n",
			size, page_size);
		return 0;
	}
2423 	for (i = 0; i < size - page_size; i = i + page_size) {
2424 		cur_phys_addr
2425 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2426 		next_phys_addr = rte_mem_virt2phy(
2427 			(void *)(uintptr_t)(vva_start + i + page_size));
2428 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2429 			++nregions;
2430 			LOG_DEBUG(VHOST_CONFIG,
2431 				"in check_continuous: hva addr:(%p) is not "
2432 				"continuous with hva addr:(%p), diff:%d\n",
2433 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2434 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2435 				+ page_size), page_size);
2436 			LOG_DEBUG(VHOST_CONFIG,
2437 				"in check_continuous: hpa addr:(%p) is not "
2438 				"continuous with hpa addr:(%p), "
2439 				"diff:(%"PRIu64")\n",
2440 				(void *)(uintptr_t)cur_phys_addr,
2441 				(void *)(uintptr_t)next_phys_addr,
2442 				(next_phys_addr-cur_phys_addr));
2443 		}
2444 	}
2445 	return nregions;
2446 }
2447 
/*
 * Divide each region whose vhost virtual address range is contiguous into
 * sub-regions within which the physical addresses are also contiguous, and
 * fill the offset (to GPA), size and other information of each sub-region
 * into regions_hpa.
 */
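/*
 * Continuing the example above: when a discontinuity is found, the current
 * sub-region is closed (its guest_phys_address_end and memory_size are
 * recorded) and a new one is opened whose host_phys_addr_offset is
 * next_phys_addr minus its guest physical start, so GPA-to-HPA translation
 * within any sub-region is a single addition.
 */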
2454 static uint32_t
2455 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2456 {
2457 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2458 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2459 
2460 	if (mem_region_hpa == NULL)
2461 		return 0;
2462 
2463 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2464 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2465 			virtio_memory->regions[regionidx].address_offset;
2466 		mem_region_hpa[regionidx_hpa].guest_phys_address
2467 			= virtio_memory->regions[regionidx].guest_phys_address;
2468 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2469 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2470 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2471 		LOG_DEBUG(VHOST_CONFIG,
2472 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2473 			regionidx_hpa,
2474 			(void *)(uintptr_t)
2475 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2476 		LOG_DEBUG(VHOST_CONFIG,
2477 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2478 			regionidx_hpa,
2479 			(void *)(uintptr_t)
2480 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2481 		for (i = 0, k = 0;
2482 			i < virtio_memory->regions[regionidx].memory_size -
2483 				page_size;
2484 			i += page_size) {
2485 			cur_phys_addr = rte_mem_virt2phy(
2486 					(void *)(uintptr_t)(vva_start + i));
2487 			next_phys_addr = rte_mem_virt2phy(
2488 					(void *)(uintptr_t)(vva_start +
2489 					i + page_size));
2490 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2491 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2492 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2493 					k + page_size;
2494 				mem_region_hpa[regionidx_hpa].memory_size
2495 					= k + page_size;
2496 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2497 					"phys addr end  [%d]:(%p)\n",
2498 					regionidx_hpa,
2499 					(void *)(uintptr_t)
2500 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2501 				LOG_DEBUG(VHOST_CONFIG,
2502 					"in fill_hpa_regions: guest phys addr "
2503 					"size [%d]:(%p)\n",
2504 					regionidx_hpa,
2505 					(void *)(uintptr_t)
2506 					(mem_region_hpa[regionidx_hpa].memory_size));
2507 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2508 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2509 				++regionidx_hpa;
2510 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2511 					next_phys_addr -
2512 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2513 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2514 					" phys addr start[%d]:(%p)\n",
2515 					regionidx_hpa,
2516 					(void *)(uintptr_t)
2517 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2518 				LOG_DEBUG(VHOST_CONFIG,
2519 					"in fill_hpa_regions: host  phys addr "
2520 					"start[%d]:(%p)\n",
2521 					regionidx_hpa,
2522 					(void *)(uintptr_t)
2523 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2524 				k = 0;
2525 			} else {
2526 				k += page_size;
2527 			}
2528 		}
2529 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2530 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2531 			+ k + page_size;
2532 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2533 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2534 			"[%d]:(%p)\n", regionidx_hpa,
2535 			(void *)(uintptr_t)
2536 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2537 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2538 			"[%d]:(%p)\n", regionidx_hpa,
2539 			(void *)(uintptr_t)
2540 			(mem_region_hpa[regionidx_hpa].memory_size));
2541 		++regionidx_hpa;
2542 	}
2543 	return regionidx_hpa;
2544 }
2545 
/*
 * A new device is added to a data core. First the device is added to the main
 * linked list and then allocated to a specific data core.
 */
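/*
 * The VMDQ RX queue for the new device is derived from its device_fh below
 * as device_fh * (num_queues / num_devices); for example, with num_queues of
 * 8 and num_devices of 4, devices 0..3 land on VMDQ RX queues 0, 2, 4 and 6.
 */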
2550 static int
2551 new_device (struct virtio_net *dev)
2552 {
2553 	struct virtio_net_data_ll *ll_dev;
2554 	int lcore, core_add = 0;
2555 	uint32_t device_num_min = num_devices;
2556 	struct vhost_dev *vdev;
2557 	uint32_t regionidx;
2558 
2559 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2560 	if (vdev == NULL) {
2561 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2562 			dev->device_fh);
2563 		return -1;
2564 	}
2565 	vdev->dev = dev;
2566 	dev->priv = vdev;
2567 
2568 	if (zero_copy) {
2569 		vdev->nregions_hpa = dev->mem->nregions;
2570 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2571 			vdev->nregions_hpa
2572 				+= check_hpa_regions(
2573 					dev->mem->regions[regionidx].guest_phys_address
2574 					+ dev->mem->regions[regionidx].address_offset,
2575 					dev->mem->regions[regionidx].memory_size);
2576 
2577 		}
2578 
2579 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2580 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2581 			CACHE_LINE_SIZE);
2582 		if (vdev->regions_hpa == NULL) {
2583 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2584 			rte_free(vdev);
2585 			return -1;
2586 		}
2587 
2588 
2589 		if (fill_hpa_memory_regions(
2590 			vdev->regions_hpa, dev->mem
2591 			) != vdev->nregions_hpa) {
2592 
2593 			RTE_LOG(ERR, VHOST_CONFIG,
2594 				"hpa memory regions number mismatch: "
2595 				"[%d]\n", vdev->nregions_hpa);
2596 			rte_free(vdev->regions_hpa);
2597 			rte_free(vdev);
2598 			return -1;
2599 		}
2600 	}
2601 
2602 
2603 	/* Add device to main ll */
2604 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2605 	if (ll_dev == NULL) {
2606 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2607 			"of %d devices per core has been reached\n",
2608 			dev->device_fh, num_devices);
2609 		if (vdev->regions_hpa)
2610 			rte_free(vdev->regions_hpa);
2611 		rte_free(vdev);
2612 		return -1;
2613 	}
2614 	ll_dev->vdev = vdev;
2615 	add_data_ll_entry(&ll_root_used, ll_dev);
2616 	vdev->vmdq_rx_q
2617 		= dev->device_fh * (num_queues / num_devices);
2618 
2619 	if (zero_copy) {
2620 		uint32_t index = vdev->vmdq_rx_q;
2621 		uint32_t count_in_ring, i;
2622 		struct mbuf_table *tx_q;
2623 
2624 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2625 
2626 		LOG_DEBUG(VHOST_CONFIG,
2627 			"(%"PRIu64") in new_device: mbuf count in mempool "
2628 			"before attach is: %d\n",
2629 			dev->device_fh,
2630 			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in new_device: mbuf count in ring "
			"before attach is: %d\n",
			dev->device_fh, count_in_ring);
2635 
		/*
		 * Attach all mbufs in vpool.ring and put them back into
		 * vpool.pool.
		 */
2639 		for (i = 0; i < count_in_ring; i++)
2640 			attach_rxmbuf_zcp(dev);
2641 
2642 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2643 			"mempool after attach is: %d\n",
2644 			dev->device_fh,
2645 			rte_mempool_count(vpool_array[index].pool));
2646 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"ring after attach is: %d\n",
2648 			dev->device_fh,
2649 			rte_ring_count(vpool_array[index].ring));
2650 
2651 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2652 		tx_q->txq_id = vdev->vmdq_rx_q;
2653 
2654 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2655 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2656 
2657 			LOG_DEBUG(VHOST_CONFIG,
2658 				"(%"PRIu64") In new_device: Failed to start "
2659 				"tx queue:%d\n",
2660 				dev->device_fh, vdev->vmdq_rx_q);
2661 
2662 			mbuf_destroy_zcp(vpool);
2663 			rte_free(vdev->regions_hpa);
2664 			rte_free(vdev);
2665 			return -1;
2666 		}
2667 
2668 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2669 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2670 
2671 			LOG_DEBUG(VHOST_CONFIG,
2672 				"(%"PRIu64") In new_device: Failed to start "
2673 				"rx queue:%d\n",
2674 				dev->device_fh, vdev->vmdq_rx_q);
2675 
2676 			/* Stop the TX queue. */
2677 			if (rte_eth_dev_tx_queue_stop(ports[0],
2678 				vdev->vmdq_rx_q) != 0) {
2679 				LOG_DEBUG(VHOST_CONFIG,
2680 					"(%"PRIu64") In new_device: Failed to "
2681 					"stop tx queue:%d\n",
2682 					dev->device_fh, vdev->vmdq_rx_q);
2683 			}
2684 
2685 			mbuf_destroy_zcp(vpool);
2686 			rte_free(vdev->regions_hpa);
2687 			rte_free(vdev);
2688 			return -1;
2689 		}
2690 
2691 	}
2692 
2693 	/*reset ready flag*/
2694 	vdev->ready = DEVICE_MAC_LEARNING;
2695 	vdev->remove = 0;
2696 
2697 	/* Find a suitable lcore to add the device. */
2698 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2699 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2700 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2701 			core_add = lcore;
2702 		}
2703 	}
2704 	/* Add device to lcore ll */
2705 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2706 	if (ll_dev == NULL) {
2707 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2708 		vdev->ready = DEVICE_SAFE_REMOVE;
2709 		destroy_device(dev);
2710 		if (vdev->regions_hpa)
2711 			rte_free(vdev->regions_hpa);
2712 		rte_free(vdev);
2713 		return -1;
2714 	}
2715 	ll_dev->vdev = vdev;
2716 	vdev->coreid = core_add;
2717 
2718 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2719 
2720 	/* Initialize device stats */
2721 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2722 
2723 	/* Disable notifications. */
2724 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2725 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2726 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2727 	dev->flags |= VIRTIO_DEV_RUNNING;
2728 
2729 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2730 
2731 	return 0;
2732 }
2733 
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
2738 static const struct virtio_net_device_ops virtio_net_device_ops =
2739 {
2740 	.new_device =  new_device,
2741 	.destroy_device = destroy_device,
2742 };
2743 
/*
 * This is a thread that wakes up periodically to print statistics if the
 * user has enabled them.
 */
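/*
 * The clr and top_left byte arrays below are the ANSI escape sequences
 * ESC[2J (clear screen) and ESC[1;1H (move cursor to the top-left corner),
 * so each refresh redraws the statistics in place.
 */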
2748 static void
2749 print_stats(void)
2750 {
2751 	struct virtio_net_data_ll *dev_ll;
2752 	uint64_t tx_dropped, rx_dropped;
2753 	uint64_t tx, tx_total, rx, rx_total;
2754 	uint32_t device_fh;
2755 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2756 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2757 
2758 	while(1) {
2759 		sleep(enable_stats);
2760 
2761 		/* Clear screen and move to top left */
2762 		printf("%s%s", clr, top_left);
2763 
2764 		printf("\nDevice statistics ====================================");
2765 
2766 		dev_ll = ll_root_used;
2767 		while (dev_ll != NULL) {
2768 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2769 			tx_total = dev_statistics[device_fh].tx_total;
2770 			tx = dev_statistics[device_fh].tx;
2771 			tx_dropped = tx_total - tx;
2772 			if (zero_copy == 0) {
2773 				rx_total = rte_atomic64_read(
2774 					&dev_statistics[device_fh].rx_total_atomic);
2775 				rx = rte_atomic64_read(
2776 					&dev_statistics[device_fh].rx_atomic);
2777 			} else {
2778 				rx_total = dev_statistics[device_fh].rx_total;
2779 				rx = dev_statistics[device_fh].rx;
2780 			}
2781 			rx_dropped = rx_total - rx;
2782 
2783 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2784 					"\nTX total: 		%"PRIu64""
2785 					"\nTX dropped: 		%"PRIu64""
2786 					"\nTX successful: 		%"PRIu64""
2787 					"\nRX total: 		%"PRIu64""
2788 					"\nRX dropped: 		%"PRIu64""
2789 					"\nRX successful: 		%"PRIu64"",
2790 					device_fh,
2791 					tx_total,
2792 					tx_dropped,
2793 					tx,
2794 					rx_total,
2795 					rx_dropped,
2796 					rx);
2797 
2798 			dev_ll = dev_ll->next;
2799 		}
2800 		printf("\n======================================================\n");
2801 	}
2802 }
2803 
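/*
 * Create the mempool and the companion ring for one zero copy queue. The
 * mbuf data room is sized for a single 1518 byte frame plus headroom, and
 * the single-producer/single-consumer ring holds the mbufs that are
 * currently detached and waiting to be attached to guest buffers.
 */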
2804 static void
2805 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2806 	char *ring_name, uint32_t nb_mbuf)
2807 {
2808 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2809 	vpool_array[index].pool
2810 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2811 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2812 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2813 		rte_pktmbuf_init, NULL, socket, 0);
2814 	if (vpool_array[index].pool != NULL) {
2815 		vpool_array[index].ring
2816 			= rte_ring_create(ring_name,
2817 				rte_align32pow2(nb_mbuf + 1),
2818 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2819 		if (likely(vpool_array[index].ring != NULL)) {
2820 			LOG_DEBUG(VHOST_CONFIG,
2821 				"in setup_mempool_tbl: mbuf count in "
2822 				"mempool is: %d\n",
2823 				rte_mempool_count(vpool_array[index].pool));
2824 			LOG_DEBUG(VHOST_CONFIG,
2825 				"in setup_mempool_tbl: mbuf count in "
				"ring is: %d\n",
2827 				rte_ring_count(vpool_array[index].ring));
2828 		} else {
2829 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2830 				ring_name);
2831 		}
2832 
		/* Need to take the headroom into account. */
2834 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2835 	} else {
2836 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2837 	}
2838 }
2839 
2840 
2841 /*
2842  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2843  * device is also registered here to handle the IOCTLs.
2844  */
2845 int
2846 MAIN(int argc, char *argv[])
2847 {
2848 	struct rte_mempool *mbuf_pool = NULL;
2849 	unsigned lcore_id, core_id = 0;
2850 	unsigned nb_ports, valid_num_ports;
2851 	int ret;
2852 	uint8_t portid, queue_id = 0;
2853 	static pthread_t tid;
2854 
2855 	/* init EAL */
2856 	ret = rte_eal_init(argc, argv);
2857 	if (ret < 0)
2858 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2859 	argc -= ret;
2860 	argv += ret;
2861 
2862 	/* parse app arguments */
2863 	ret = us_vhost_parse_args(argc, argv);
2864 	if (ret < 0)
2865 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2866 #ifdef RTE_IXGBE_INC_VECTOR
2867 	if (mergeable == 1) {
2868 		rte_exit(EXIT_FAILURE,
2869 			"sorry, mergeable feature doesn't work with vec sg recv, " \
2870 			"please disable it in cfg as a workaround\n");
2871 	}
2872 #endif
2873 
2874 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2875 		if (rte_lcore_is_enabled(lcore_id))
2876 			lcore_ids[core_id ++] = lcore_id;
2877 
2878 	if (rte_lcore_count() > RTE_MAX_LCORE)
2879 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2880 
	/* Set the number of switching cores available. */
2882 	num_switching_cores = rte_lcore_count()-1;
2883 
2884 	/* Get the number of physical ports. */
2885 	nb_ports = rte_eth_dev_count();
2886 	if (nb_ports > RTE_MAX_ETHPORTS)
2887 		nb_ports = RTE_MAX_ETHPORTS;
2888 
	/*
	 * Update the global variable num_ports and the global array ports,
	 * and get the value of valid_num_ports according to the number of
	 * ports in the system.
	 */
2893 	valid_num_ports = check_ports_num(nb_ports);
2894 
	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2898 		return -1;
2899 	}
2900 
2901 	if (zero_copy == 0) {
2902 		/* Create the mbuf pool. */
2903 		mbuf_pool = rte_mempool_create(
2904 				"MBUF_POOL",
2905 				NUM_MBUFS_PER_PORT
2906 				* valid_num_ports,
2907 				MBUF_SIZE, MBUF_CACHE_SIZE,
2908 				sizeof(struct rte_pktmbuf_pool_private),
2909 				rte_pktmbuf_pool_init, NULL,
2910 				rte_pktmbuf_init, NULL,
2911 				rte_socket_id(), 0);
2912 		if (mbuf_pool == NULL)
2913 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2914 
2915 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2916 			vpool_array[queue_id].pool = mbuf_pool;
2917 
2918 		if (vm2vm_mode == VM2VM_HARDWARE) {
2919 			/* Enable VT loop back to let L2 switch to do it. */
2920 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2921 			LOG_DEBUG(VHOST_CONFIG,
2922 				"Enable loop back for L2 switch in vmdq.\n");
2923 		}
2924 	} else {
2925 		uint32_t nb_mbuf;
2926 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2927 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2928 
2929 		/*
2930 		 * Zero copy defers queue RX/TX start to the time when guest
2931 		 * finishes its startup and packet buffers from that guest are
2932 		 * available.
2933 		 */
2934 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2935 		rx_conf_default.rx_drop_en = 0;
2936 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
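		/*
		 * Per-queue mbuf budget: descriptors plus per-core cache and
		 * burst slack.  For example, assuming 128 RX descriptors,
		 * 3 switching cores, a zero ZCP cache size and a burst of 32,
		 * each RX vpool gets 128 + 0 + 96 == 224 mbufs.
		 */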
2937 		nb_mbuf = num_rx_descriptor
2938 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2939 			+ num_switching_cores * MAX_PKT_BURST;
2940 
2941 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2942 			snprintf(pool_name, sizeof(pool_name),
2943 				"rxmbuf_pool_%u", queue_id);
2944 			snprintf(ring_name, sizeof(ring_name),
2945 				"rxmbuf_ring_%u", queue_id);
2946 			setup_mempool_tbl(rte_socket_id(), queue_id,
2947 				pool_name, ring_name, nb_mbuf);
2948 		}
2949 
2950 		nb_mbuf = num_tx_descriptor
2951 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2952 				+ num_switching_cores * MAX_PKT_BURST;
2953 
2954 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2955 			snprintf(pool_name, sizeof(pool_name),
2956 				"txmbuf_pool_%u", queue_id);
2957 			snprintf(ring_name, sizeof(ring_name),
2958 				"txmbuf_ring_%u", queue_id);
2959 			setup_mempool_tbl(rte_socket_id(),
2960 				(queue_id + MAX_QUEUES),
2961 				pool_name, ring_name, nb_mbuf);
2962 		}
2963 
2964 		if (vm2vm_mode == VM2VM_HARDWARE) {
2965 			/* Enable VT loop back to let L2 switch to do it. */
2966 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2967 			LOG_DEBUG(VHOST_CONFIG,
2968 				"Enable loop back for L2 switch in vmdq.\n");
2969 		}
2970 	}
2971 	/* Set log level. */
2972 	rte_set_log_level(LOG_LEVEL);
2973 
2974 	/* initialize all ports */
2975 	for (portid = 0; portid < nb_ports; portid++) {
2976 		/* skip ports that are not enabled */
2977 		if ((enabled_port_mask & (1 << portid)) == 0) {
2978 			RTE_LOG(INFO, VHOST_PORT,
2979 				"Skipping disabled port %d\n", portid);
2980 			continue;
2981 		}
2982 		if (port_init(portid) != 0)
2983 			rte_exit(EXIT_FAILURE,
2984 				"Cannot initialize network ports\n");
2985 	}
2986 
2987 	/* Initialise all linked lists. */
2988 	if (init_data_ll() == -1)
2989 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2990 
2991 	/* Initialize device stats */
2992 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2993 
2994 	/* Enable stats if the user option is set. */
2995 	if (enable_stats)
		pthread_create(&tid, NULL, (void *)print_stats, NULL);
2997 
2998 	/* Launch all data cores. */
2999 	if (zero_copy == 0) {
3000 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3001 			rte_eal_remote_launch(switch_worker,
3002 				mbuf_pool, lcore_id);
3003 		}
3004 	} else {
3005 		uint32_t count_in_mempool, index, i;
3006 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3007 			/* For all RX and TX queues. */
3008 			count_in_mempool
3009 				= rte_mempool_count(vpool_array[index].pool);
3010 
			/*
			 * Transfer all un-attached mbufs from vpool.pool
			 * to vpool.ring.
			 */
3015 			for (i = 0; i < count_in_mempool; i++) {
3016 				struct rte_mbuf *mbuf
3017 					= __rte_mbuf_raw_alloc(
3018 						vpool_array[index].pool);
3019 				rte_ring_sp_enqueue(vpool_array[index].ring,
3020 						(void *)mbuf);
3021 			}
3022 
3023 			LOG_DEBUG(VHOST_CONFIG,
3024 				"in MAIN: mbuf count in mempool at initial "
3025 				"is: %d\n", count_in_mempool);
3026 			LOG_DEBUG(VHOST_CONFIG,
				"in MAIN: mbuf count in ring at initial is:"
				" %d\n",
3029 				rte_ring_count(vpool_array[index].ring));
3030 		}
3031 
3032 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3033 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3034 				lcore_id);
3035 	}
3036 
3037 	if (mergeable == 0)
3038 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3039 
3040 	/* Register CUSE device to handle IOCTLs. */
3041 	ret = rte_vhost_driver_register((char *)&dev_basename);
3042 	if (ret != 0)
3043 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3044 
3045 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3046 
3047 	/* Start CUSE session. */
3048 	rte_vhost_driver_session_start();
3049 	return 0;
3050 
3051 }
3052 
3053