xref: /dpdk/examples/vhost/main.c (revision 4d50b6acbd95f9ceacdcdc5dded512cacf06d8b1)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
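/*
 * Note: this budgets one mbuf per RX descriptor on every queue, plus one
 * in-flight burst, one TX ring's worth and one mempool cache's worth of
 * mbufs per switching core, which should keep the pool from being exhausted
 * by the data path alone.
 */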
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated by the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
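/*
 * Note: MBUF_CACHE_SIZE_ZCP is 0 because the zero copy path tracks its mbufs
 * with rte_mempool_count() and a per-queue vpool ring; a non-zero per-lcore
 * mempool cache would hide mbufs from that accounting.
 */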
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define MAX_MRG_PKT_BURST 16 	/* Max burst size for mergeable buffers. */
103 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
104 
105 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
106 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
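/*
 * With the defaults above, a full RX burst waits at most
 * BURST_RX_RETRIES * BURST_RX_WAIT_US = 4 * 15 = 60us for the guest to free
 * ring entries before the remaining packets are dropped.
 */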
107 
108 #define JUMBO_FRAME_MAX_SIZE    0x2600
109 
110 /* State of virtio device. */
111 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_RX			1
113 #define DEVICE_SAFE_REMOVE	2
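/*
 * A device starts in DEVICE_MAC_LEARNING, moves to DEVICE_RX once its MAC
 * address has been learned and registered with the VMDQ pool (link_vmdq),
 * and is set to DEVICE_SAFE_REMOVE by the data core once it has been
 * unlinked and can be torn down.
 */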
114 
115 /* Config_core_flag status definitions. */
116 #define REQUEST_DEV_REMOVAL 1
117 #define ACK_DEV_REMOVAL 0
118 
119 /* Configurable number of RX/TX ring descriptors */
120 #define RTE_TEST_RX_DESC_DEFAULT 1024
121 #define RTE_TEST_TX_DESC_DEFAULT 512
122 
123 /*
124  * These two macros need refining for the legacy and DPDK-based front ends:
125  * take the max vring avail descriptors/entries from the guest, subtract
126  * MAX_PKT_BURST, and then round to a power of 2.
127  */
128 /*
129  * For legacy front end, 128 descriptors,
130  * half for virtio header, another half for mbuf.
131  */
132 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
133 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
134 
135 /* Get first 4 bytes in mbuf headroom. */
136 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
137 		+ sizeof(struct rte_mbuf)))
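/*
 * The zero copy path stashes the guest descriptor index in the first four
 * bytes of the mbuf headroom (which directly follows struct rte_mbuf in this
 * mbuf layout), so it can be recovered when the buffer is returned to the
 * used ring.
 */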
138 
139 /* true if x is a power of 2 */
140 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
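/*
 * x & (x - 1) clears the lowest set bit, so the result is zero only when x
 * has at most one bit set, e.g. POWEROF2(64) is true while POWEROF2(48) is
 * not. (Zero also passes this test.)
 */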
141 
142 #define INVALID_PORT_ID 0xFF
143 
144 /* Max number of devices. Limited by vmdq. */
145 #define MAX_DEVICES 64
146 
147 /* Size of buffers used for snprintfs. */
148 #define MAX_PRINT_BUFF 6072
149 
150 /* Maximum character device basename size. */
151 #define MAX_BASENAME_SZ 10
152 
153 /* Maximum long option length for option parsing. */
154 #define MAX_LONG_OPT_SZ 64
155 
156 /* Used to compare MAC addresses. */
157 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
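/*
 * 0xFFFFFFFFFFFF masks the low 48 bits of a 64-bit load so that two 6-byte
 * MAC addresses can be compared with a single XOR in ether_addr_cmp();
 * this assumes a little-endian host where the two bytes following the
 * address are the ones masked off.
 */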
158 
159 /* Number of descriptors per cacheline. */
160 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
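/*
 * struct vring_desc is 16 bytes, so with the typical 64-byte cache line this
 * evaluates to 4 descriptors per cache line.
 */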
161 
162 /* mask of enabled ports */
163 static uint32_t enabled_port_mask = 0;
164 
165 /* Number of switching cores enabled. */
166 static uint32_t num_switching_cores = 0;
167 
168 /* Number of devices/queues to support. */
169 static uint32_t num_queues = 0;
170 uint32_t num_devices = 0;
171 
172 /*
173  * Enable zero copy: packet data is DMA'd directly to/from guest-supplied
174  * buffers via the HW descriptors. Disabled by default.
175  */
176 static uint32_t zero_copy;
177 
178 /* Number of descriptors to use. */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in useconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* This can be set by the user so it is made available here. */
222 extern uint64_t VHOST_FEATURES;
223 
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
226 	.rx_thresh = {
227 		.pthresh = RX_PTHRESH,
228 		.hthresh = RX_HTHRESH,
229 		.wthresh = RX_WTHRESH,
230 	},
231 	.rx_drop_en = 1,
232 };
233 
234 /*
235  * These default values are optimized for use with the Intel(R) 82599 10 GbE
236  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237  * network controllers and/or network drivers.
238  */
239 static struct rte_eth_txconf tx_conf_default = {
240 	.tx_thresh = {
241 		.pthresh = TX_PTHRESH,
242 		.hthresh = TX_HTHRESH,
243 		.wthresh = TX_WTHRESH,
244 	},
245 	.tx_free_thresh = 0, /* Use PMD default values */
246 	.tx_rs_thresh = 0, /* Use PMD default values */
247 };
248 
249 /* Empty VMDQ configuration structure, filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
251 	.rxmode = {
252 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
253 		.split_hdr_size = 0,
254 		.header_split   = 0, /**< Header Split disabled */
255 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
256 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
257 		/*
258 		 * This is necessary for 1G NICs such as the I350;
259 		 * it fixes a bug where IPv4 forwarding in the guest could not
260 		 * forward packets from one virtio device to another.
261 		 */
262 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
263 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
264 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
265 	},
266 
267 	.txmode = {
268 		.mq_mode = ETH_MQ_TX_NONE,
269 	},
270 	.rx_adv_conf = {
271 		/*
272 		 * should be overridden separately in code with
273 		 * appropriate values
274 		 */
275 		.vmdq_rx_conf = {
276 			.nb_queue_pools = ETH_8_POOLS,
277 			.enable_default_pool = 0,
278 			.default_pool = 0,
279 			.nb_pool_maps = 0,
280 			.pool_map = {{0, 0},},
281 		},
282 	},
283 };
284 
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288 
289 static const uint16_t external_pkt_default_vlan_tag = 2000;
290 const uint16_t vlan_tags[] = {
291 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
292 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
293 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
294 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
295 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
296 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
297 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
298 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
299 };
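/*
 * One VLAN tag per possible VMDQ pool/virtio device (MAX_DEVICES entries):
 * device N is registered with vlan_tags[N] in get_eth_conf()/link_vmdq().
 */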
300 
301 /* ethernet addresses of ports */
302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
303 
304 /* heads for the main used and free linked lists for the data path. */
305 static struct virtio_net_data_ll *ll_root_used = NULL;
306 static struct virtio_net_data_ll *ll_root_free = NULL;
307 
308 /* Array of data core structures containing information on individual core linked lists. */
309 static struct lcore_info lcore_info[RTE_MAX_LCORE];
310 
311 /* Used for queueing bursts of TX packets. */
312 struct mbuf_table {
313 	unsigned len;
314 	unsigned txq_id;
315 	struct rte_mbuf *m_table[MAX_PKT_BURST];
316 };
317 
318 /* TX queue for each data core. */
319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
320 
321 /* TX queue for each virtio device for zero copy. */
322 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
323 
324 /* Vlan header struct used to insert vlan tags on TX. */
325 struct vlan_ethhdr {
326 	unsigned char   h_dest[ETH_ALEN];
327 	unsigned char   h_source[ETH_ALEN];
328 	__be16          h_vlan_proto;
329 	__be16          h_vlan_TCI;
330 	__be16          h_vlan_encapsulated_proto;
331 };
332 
333 /* IPv4 Header */
334 struct ipv4_hdr {
335 	uint8_t  version_ihl;		/**< version and header length */
336 	uint8_t  type_of_service;	/**< type of service */
337 	uint16_t total_length;		/**< length of packet */
338 	uint16_t packet_id;		/**< packet ID */
339 	uint16_t fragment_offset;	/**< fragmentation offset */
340 	uint8_t  time_to_live;		/**< time to live */
341 	uint8_t  next_proto_id;		/**< protocol ID */
342 	uint16_t hdr_checksum;		/**< header checksum */
343 	uint32_t src_addr;		/**< source address */
344 	uint32_t dst_addr;		/**< destination address */
345 } __attribute__((__packed__));
346 
347 /* Header lengths. */
348 #define VLAN_HLEN       4
349 #define VLAN_ETH_HLEN   18
350 
351 /* Per-device statistics struct */
352 struct device_statistics {
353 	uint64_t tx_total;
354 	rte_atomic64_t rx_total_atomic;
355 	uint64_t rx_total;
356 	uint64_t tx;
357 	rte_atomic64_t rx_atomic;
358 	uint64_t rx;
359 } __rte_cache_aligned;
360 struct device_statistics dev_statistics[MAX_DEVICES];
361 
362 /*
363  * Builds up the correct configuration for VMDQ VLAN pool map
364  * according to the pool & queue limits.
365  */
366 static inline int
367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
368 {
369 	struct rte_eth_vmdq_rx_conf conf;
370 	unsigned i;
371 
372 	memset(&conf, 0, sizeof(conf));
373 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
374 	conf.nb_pool_maps = num_devices;
375 	conf.enable_loop_back =
376 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
377 
378 	for (i = 0; i < conf.nb_pool_maps; i++) {
379 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
380 		conf.pool_map[i].pools = (1UL << i);
381 	}
382 
383 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
384 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
385 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
386 	return 0;
387 }
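/*
 * Example: with num_devices == 8 the loop above produces a one-to-one
 * mapping where pool i receives traffic tagged with vlan_tags[i]
 * (1000 + i), i.e. pool_map[i] = { .vlan_id = 1000 + i, .pools = 1UL << i }.
 */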
388 
389 /*
390  * Validate the number of devices against the max pool number obtained from
391  * dev_info. If the number of devices is invalid, print an error message and
392  * return -1. Each device must have its own pool.
393  */
394 static inline int
395 validate_num_devices(uint32_t max_nb_devices)
396 {
397 	if (num_devices > max_nb_devices) {
398 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
399 		return -1;
400 	}
401 	return 0;
402 }
403 
404 /*
405  * Initialises a given port using the global settings, with the RX buffers
406  * coming from the mbuf_pool passed as a parameter.
407  */
408 static inline int
409 port_init(uint8_t port)
410 {
411 	struct rte_eth_dev_info dev_info;
412 	struct rte_eth_conf port_conf;
413 	uint16_t rx_rings, tx_rings;
414 	uint16_t rx_ring_size, tx_ring_size;
415 	int retval;
416 	uint16_t q;
417 
418 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
419 	rte_eth_dev_info_get (port, &dev_info);
420 
421 	/* Configure the number of supported virtio devices based on VMDQ limits. */
422 	num_devices = dev_info.max_vmdq_pools;
423 	num_queues = dev_info.max_rx_queues;
424 
425 	if (zero_copy) {
426 		rx_ring_size = num_rx_descriptor;
427 		tx_ring_size = num_tx_descriptor;
428 		tx_rings = dev_info.max_tx_queues;
429 	} else {
430 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
431 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
432 		tx_rings = (uint16_t)rte_lcore_count();
433 	}
434 
435 	retval = validate_num_devices(MAX_DEVICES);
436 	if (retval < 0)
437 		return retval;
438 
439 	/* Get port configuration. */
440 	retval = get_eth_conf(&port_conf, num_devices);
441 	if (retval < 0)
442 		return retval;
443 
444 	if (port >= rte_eth_dev_count()) return -1;
445 
446 	rx_rings = (uint16_t)num_queues;
447 	/* Configure ethernet device. */
448 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449 	if (retval != 0)
450 		return retval;
451 
452 	/* Setup the queues. */
453 	for (q = 0; q < rx_rings; q ++) {
454 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455 						rte_eth_dev_socket_id(port), &rx_conf_default,
456 						vpool_array[q].pool);
457 		if (retval < 0)
458 			return retval;
459 	}
460 	for (q = 0; q < tx_rings; q ++) {
461 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 						rte_eth_dev_socket_id(port), &tx_conf_default);
463 		if (retval < 0)
464 			return retval;
465 	}
466 
467 	/* Start the device. */
468 	retval  = rte_eth_dev_start(port);
469 	if (retval < 0) {
470 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471 		return retval;
472 	}
473 
474 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
475 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
476 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
477 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
478 			(unsigned)port,
479 			vmdq_ports_eth_addr[port].addr_bytes[0],
480 			vmdq_ports_eth_addr[port].addr_bytes[1],
481 			vmdq_ports_eth_addr[port].addr_bytes[2],
482 			vmdq_ports_eth_addr[port].addr_bytes[3],
483 			vmdq_ports_eth_addr[port].addr_bytes[4],
484 			vmdq_ports_eth_addr[port].addr_bytes[5]);
485 
486 	return 0;
487 }
488 
489 /*
490  * Set character device basename.
491  */
492 static int
493 us_vhost_parse_basename(const char *q_arg)
494 {
495 	/* Reject basenames that do not fit (including the NUL) in the buffer. */
496 
497 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
498 		return -1;
499 	else
500 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
501 
502 	return 0;
503 }
504 
505 /*
506  * Parse the portmask provided at run time.
507  */
508 static int
509 parse_portmask(const char *portmask)
510 {
511 	char *end = NULL;
512 	unsigned long pm;
513 
514 	errno = 0;
515 
516 	/* parse hexadecimal string */
517 	pm = strtoul(portmask, &end, 16);
518 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
519 		return -1;
520 
521 	if (pm == 0)
522 		return -1;
523 
524 	return pm;
525 
526 }
527 
528 /*
529  * Parse num options at run time.
530  */
531 static int
532 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
533 {
534 	char *end = NULL;
535 	unsigned long num;
536 
537 	errno = 0;
538 
539 	/* parse unsigned int string */
540 	num = strtoul(q_arg, &end, 10);
541 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
542 		return -1;
543 
544 	if (num > max_valid_value)
545 		return -1;
546 
547 	return num;
548 
549 }
550 
551 /*
552  * Display usage
553  */
554 static void
555 us_vhost_usage(const char *prgname)
556 {
557 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
558 	"		--vm2vm [0|1|2]\n"
559 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
560 	"		--dev-basename <name>\n"
561 	"		--nb-devices ND\n"
562 	"		-p PORTMASK: Set mask for ports to be used by application\n"
563 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
564 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retry if the destination queue is full\n"
565 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
566 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
567 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
568 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
569 	"		--dev-basename: The basename to be used for the character device.\n"
570 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
571 			"zero copy\n"
572 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
573 			"used only when zero copy is enabled.\n"
574 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
575 			"used only when zero copy is enabled.\n",
576 	       prgname);
577 }
578 
579 /*
580  * Parse the arguments given in the command line of the application.
581  */
582 static int
583 us_vhost_parse_args(int argc, char **argv)
584 {
585 	int opt, ret;
586 	int option_index;
587 	unsigned i;
588 	const char *prgname = argv[0];
589 	static struct option long_option[] = {
590 		{"vm2vm", required_argument, NULL, 0},
591 		{"rx-retry", required_argument, NULL, 0},
592 		{"rx-retry-delay", required_argument, NULL, 0},
593 		{"rx-retry-num", required_argument, NULL, 0},
594 		{"mergeable", required_argument, NULL, 0},
595 		{"stats", required_argument, NULL, 0},
596 		{"dev-basename", required_argument, NULL, 0},
597 		{"zero-copy", required_argument, NULL, 0},
598 		{"rx-desc-num", required_argument, NULL, 0},
599 		{"tx-desc-num", required_argument, NULL, 0},
600 		{NULL, 0, 0, 0},
601 	};
602 
603 	/* Parse command line */
604 	while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
605 		switch (opt) {
606 		/* Portmask */
607 		case 'p':
608 			enabled_port_mask = parse_portmask(optarg);
609 			if (enabled_port_mask == 0) {
610 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611 				us_vhost_usage(prgname);
612 				return -1;
613 			}
614 			break;
615 
616 		case 0:
617 			/* Enable/disable vm2vm comms. */
618 			if (!strncmp(long_option[option_index].name, "vm2vm",
619 				MAX_LONG_OPT_SZ)) {
620 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
621 				if (ret == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 						"Invalid argument for "
624 						"vm2vm [0|1|2]\n");
625 					us_vhost_usage(prgname);
626 					return -1;
627 				} else {
628 					vm2vm_mode = (vm2vm_type)ret;
629 				}
630 			}
631 
632 			/* Enable/disable retries on RX. */
633 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
634 				ret = parse_num_opt(optarg, 1);
635 				if (ret == -1) {
636 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
637 					us_vhost_usage(prgname);
638 					return -1;
639 				} else {
640 					enable_retry = ret;
641 				}
642 			}
643 
644 			/* Specify the retries delay time (in useconds) on RX. */
645 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
646 				ret = parse_num_opt(optarg, INT32_MAX);
647 				if (ret == -1) {
648 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					burst_rx_delay_time = ret;
653 				}
654 			}
655 
656 			/* Specify the retries number on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, INT32_MAX);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					burst_rx_retry_num = ret;
665 				}
666 			}
667 
668 			/* Enable/disable RX mergeable buffers. */
669 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else {
676 					if (ret) {
677 						vmdq_conf_default.rxmode.jumbo_frame = 1;
678 						vmdq_conf_default.rxmode.max_rx_pkt_len
679 							= JUMBO_FRAME_MAX_SIZE;
680 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
681 					}
682 				}
683 			}
684 
685 			/* Enable/disable stats. */
686 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
687 				ret = parse_num_opt(optarg, INT32_MAX);
688 				if (ret == -1) {
689 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
690 					us_vhost_usage(prgname);
691 					return -1;
692 				} else {
693 					enable_stats = ret;
694 				}
695 			}
696 
697 			/* Set character device basename. */
698 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
699 				if (us_vhost_parse_basename(optarg) == -1) {
700 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
701 					us_vhost_usage(prgname);
702 					return -1;
703 				}
704 			}
705 
706 			/* Enable/disable rx/tx zero copy. */
707 			if (!strncmp(long_option[option_index].name,
708 				"zero-copy", MAX_LONG_OPT_SZ)) {
709 				ret = parse_num_opt(optarg, 1);
710 				if (ret == -1) {
711 					RTE_LOG(INFO, VHOST_CONFIG,
712 						"Invalid argument"
713 						" for zero-copy [0|1]\n");
714 					us_vhost_usage(prgname);
715 					return -1;
716 				} else
717 					zero_copy = ret;
718 
719 				if (zero_copy) {
720 #ifdef RTE_MBUF_REFCNT
721 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
722 					"zero copy vhost APP, please "
723 					"disable RTE_MBUF_REFCNT\n"
724 					"in config file and then rebuild DPDK "
725 					"core lib!\n"
726 					"Otherwise please disable zero copy "
727 					"flag in command line!\n");
728 					return -1;
729 #endif
730 				}
731 			}
732 
733 			/* Specify the descriptor number on RX. */
734 			if (!strncmp(long_option[option_index].name,
735 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
736 				ret = parse_num_opt(optarg, MAX_RING_DESC);
737 				if ((ret == -1) || (!POWEROF2(ret))) {
738 					RTE_LOG(INFO, VHOST_CONFIG,
739 					"Invalid argument for rx-desc-num[0-N],"
740 					"power of 2 required.\n");
741 					us_vhost_usage(prgname);
742 					return -1;
743 				} else {
744 					num_rx_descriptor = ret;
745 				}
746 			}
747 
748 			/* Specify the descriptor number on TX. */
749 			if (!strncmp(long_option[option_index].name,
750 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
751 				ret = parse_num_opt(optarg, MAX_RING_DESC);
752 				if ((ret == -1) || (!POWEROF2(ret))) {
753 					RTE_LOG(INFO, VHOST_CONFIG,
754 					"Invalid argument for tx-desc-num [0-N],"
755 					"power of 2 required.\n");
756 					us_vhost_usage(prgname);
757 					return -1;
758 				} else {
759 					num_tx_descriptor = ret;
760 				}
761 			}
762 
763 			break;
764 
765 			/* Invalid option - print options. */
766 		default:
767 			us_vhost_usage(prgname);
768 			return -1;
769 		}
770 	}
771 
772 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
773 		if (enabled_port_mask & (1 << i))
774 			ports[num_ports++] = (uint8_t)i;
775 	}
776 
777 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
778 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
779 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
780 		return -1;
781 	}
782 
783 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
784 		RTE_LOG(INFO, VHOST_PORT,
785 			"Vhost zero copy doesn't support software vm2vm, "
786 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
787 		return -1;
788 	}
789 
790 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
791 		RTE_LOG(INFO, VHOST_PORT,
792 			"Vhost zero copy doesn't support jumbo frames, "
793 			"please specify '--mergeable 0' to disable the "
794 			"mergeable feature.\n");
795 		return -1;
796 	}
797 
798 	return 0;
799 }
800 
801 /*
802  * Update the global variable num_ports and the array ports according to the
803  * number of ports in the system, and return the number of valid ports.
804  */
805 static unsigned check_ports_num(unsigned nb_ports)
806 {
807 	unsigned valid_num_ports = num_ports;
808 	unsigned portid;
809 
810 	if (num_ports > nb_ports) {
811 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
812 			num_ports, nb_ports);
813 		num_ports = nb_ports;
814 	}
815 
816 	for (portid = 0; portid < num_ports; portid ++) {
817 		if (ports[portid] >= nb_ports) {
818 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
819 				ports[portid], (nb_ports - 1));
820 			ports[portid] = INVALID_PORT_ID;
821 			valid_num_ports--;
822 		}
823 	}
824 	return valid_num_ports;
825 }
826 
827 /*
828  * Macro to print out packet contents. Wrapped in debug define so that the
829  * data path is not affected when debug is disabled.
830  */
831 #ifdef DEBUG
832 #define PRINT_PACKET(device, addr, size, header) do {																\
833 	char *pkt_addr = (char*)(addr);																					\
834 	unsigned int index;																								\
835 	char packet[MAX_PRINT_BUFF];																					\
836 																													\
837 	if ((header))																									\
838 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
839 	else																											\
840 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
841 	for (index = 0; index < (size); index++) {																		\
842 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
843 			"%02hhx ", pkt_addr[index]);																			\
844 	}																												\
845 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
846 																													\
847 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
848 } while(0)
849 #else
850 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
851 #endif
852 
853 /*
854  * Function to convert guest physical addresses to vhost physical addresses.
855  * This is used to convert virtio buffer addresses.
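 * On return, *addr_type reports whether the translated buffer is fully
 * contained in one host region (PHYS_ADDR_CONTINUOUS), crosses the end of
 * the matching region (PHYS_ADDR_CROSS_SUBREG), or could not be translated
 * at all (PHYS_ADDR_INVALID, in which case 0 is returned).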
856  */
857 static inline uint64_t __attribute__((always_inline))
858 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
859 	uint32_t buf_len, hpa_type *addr_type)
860 {
861 	struct virtio_memory_regions_hpa *region;
862 	uint32_t regionidx;
863 	uint64_t vhost_pa = 0;
864 
865 	*addr_type = PHYS_ADDR_INVALID;
866 
867 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
868 		region = &vdev->regions_hpa[regionidx];
869 		if ((guest_pa >= region->guest_phys_address) &&
870 			(guest_pa <= region->guest_phys_address_end)) {
871 			vhost_pa = region->host_phys_addr_offset + guest_pa;
872 			if (likely((guest_pa + buf_len - 1)
873 				<= region->guest_phys_address_end))
874 				*addr_type = PHYS_ADDR_CONTINUOUS;
875 			else
876 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
877 			break;
878 		}
879 	}
880 
881 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
882 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
883 		(void *)(uintptr_t)vhost_pa);
884 
885 	return vhost_pa;
886 }
887 
888 /*
889  * Compares a packet destination MAC address to a device MAC address.
890  */
891 static inline int __attribute__((always_inline))
892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
893 {
894 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
895 }
896 
897 /*
898  * This function learns the MAC address of the device and registers it, along
899  * with a VLAN tag, with the VMDQ pool for that device.
900  */
901 static int
902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
903 {
904 	struct ether_hdr *pkt_hdr;
905 	struct virtio_net_data_ll *dev_ll;
906 	struct virtio_net *dev = vdev->dev;
907 	int i, ret;
908 
909 	/* Learn MAC address of guest device from packet */
910 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
911 
912 	dev_ll = ll_root_used;
913 
914 	while (dev_ll != NULL) {
915 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
916 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
917 			return -1;
918 		}
919 		dev_ll = dev_ll->next;
920 	}
921 
922 	for (i = 0; i < ETHER_ADDR_LEN; i++)
923 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
924 
925 	/* vlan_tag currently uses the device_id. */
926 	vdev->vlan_tag = vlan_tags[dev->device_fh];
927 
928 	/* Print out VMDQ registration info. */
929 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
930 		dev->device_fh,
931 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
932 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
933 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
934 		vdev->vlan_tag);
935 
936 	/* Register the MAC address. */
937 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
938 	if (ret)
939 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
940 					dev->device_fh);
941 
942 	/* Enable stripping of the vlan tag as we handle routing. */
943 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
944 
945 	/* Set device as ready for RX. */
946 	vdev->ready = DEVICE_RX;
947 
948 	return 0;
949 }
950 
951 /*
952  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
953  * queue before disabling RX on the device.
954  */
955 static inline void
956 unlink_vmdq(struct vhost_dev *vdev)
957 {
958 	unsigned i = 0;
959 	unsigned rx_count;
960 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
961 
962 	if (vdev->ready == DEVICE_RX) {
963 		/*clear MAC and VLAN settings*/
964 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
965 		for (i = 0; i < 6; i++)
966 			vdev->mac_address.addr_bytes[i] = 0;
967 
968 		vdev->vlan_tag = 0;
969 
970 		/*Clear out the receive buffers*/
971 		rx_count = rte_eth_rx_burst(ports[0],
972 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
973 
974 		while (rx_count) {
975 			for (i = 0; i < rx_count; i++)
976 				rte_pktmbuf_free(pkts_burst[i]);
977 
978 			rx_count = rte_eth_rx_burst(ports[0],
979 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980 		}
981 
982 		vdev->ready = DEVICE_MAC_LEARNING;
983 	}
984 }
985 
986 /*
987  * Check if the packet destination MAC address is for a local device. If so,
988  * put the packet on that device's RX queue. If not, return.
989  */
990 static inline unsigned __attribute__((always_inline))
991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
992 {
993 	struct virtio_net_data_ll *dev_ll;
994 	struct ether_hdr *pkt_hdr;
995 	uint64_t ret = 0;
996 	struct virtio_net *dev = vdev->dev;
997 	struct virtio_net *tdev; /* destination virtio device */
998 
999 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1000 
1001 	/*get the used devices list*/
1002 	dev_ll = ll_root_used;
1003 
1004 	while (dev_ll != NULL) {
1005 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1006 				          &dev_ll->vdev->mac_address)) {
1007 
1008 			/* Drop the packet if the TX packet is destined for the TX device. */
1009 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1010 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1011 							dev->device_fh);
1012 				return 0;
1013 			}
1014 			tdev = dev_ll->vdev->dev;
1015 
1016 
1017 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1018 
1019 			if (dev_ll->vdev->remove) {
1020 				/*drop the packet if the device is marked for removal*/
1021 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1022 			} else {
1023 				/*send the packet to the local virtio device*/
1024 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1025 				if (enable_stats) {
1026 					rte_atomic64_add(
1027 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1028 					1);
1029 					rte_atomic64_add(
1030 					&dev_statistics[tdev->device_fh].rx_atomic,
1031 					ret);
1032 					dev_statistics[tdev->device_fh].tx_total++;
1033 					dev_statistics[tdev->device_fh].tx += ret;
1034 				}
1035 			}
1036 
1037 			return 0;
1038 		}
1039 		dev_ll = dev_ll->next;
1040 	}
1041 
1042 	return -1;
1043 }
1044 
1045 /*
1046  * This function routes the TX packet to the correct interface. This may be a local device
1047  * or the physical port.
1048  */
1049 static inline void __attribute__((always_inline))
1050 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1051 {
1052 	struct mbuf_table *tx_q;
1053 	struct rte_mbuf **m_table;
1054 	unsigned len, ret, offset = 0;
1055 	const uint16_t lcore_id = rte_lcore_id();
1056 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1057 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1058 	struct virtio_net *dev = vdev->dev;
1059 
1060 	/*check if destination is local VM*/
1061 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1062 		rte_pktmbuf_free(m);
1063 		return;
1064 	}
1065 
1066 	if (vm2vm_mode == VM2VM_HARDWARE) {
1067 		while (dev_ll != NULL) {
1068 			if ((dev_ll->vdev->ready == DEVICE_RX)
1069 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1070 				&dev_ll->vdev->mac_address)) {
1071 				/*
1072 				 * Drop the packet if the TX packet is
1073 				 * destined for the TX device.
1074 				 */
1075 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1076 					LOG_DEBUG(VHOST_DATA,
1077 					"(%"PRIu64") TX: Source and destination"
1078 					" MAC addresses are the same. Dropping "
1079 					"packet.\n",
1080 					dev_ll->vdev->dev->device_fh);
1081 					rte_pktmbuf_free(m);
1082 					return;
1083 				}
1084 				offset = 4;
1085 				vlan_tag =
1086 				(uint16_t)
1087 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1088 
1089 				LOG_DEBUG(VHOST_DATA,
1090 				"(%"PRIu64") TX: pkt to local VM device id:"
1091 				"(%"PRIu64") vlan tag: %d.\n",
1092 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1093 				vlan_tag);
1094 
1095 				break;
1096 			}
1097 			dev_ll = dev_ll->next;
1098 		}
1099 	}
1100 
1101 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1102 
1103 	/*Add packet to the port tx queue*/
1104 	tx_q = &lcore_tx_queue[lcore_id];
1105 	len = tx_q->len;
1106 
1107 	m->ol_flags = PKT_TX_VLAN_PKT;
1108 	/*FIXME: offset*/
1109 	m->data_len += offset;
1110 	m->vlan_tci = vlan_tag;
1111 
1112 	tx_q->m_table[len] = m;
1113 	len++;
1114 	if (enable_stats) {
1115 		dev_statistics[dev->device_fh].tx_total++;
1116 		dev_statistics[dev->device_fh].tx++;
1117 	}
1118 
1119 	if (unlikely(len == MAX_PKT_BURST)) {
1120 		m_table = (struct rte_mbuf **)tx_q->m_table;
1121 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1122 		/* Free any buffers not handled by TX and update the port stats. */
1123 		if (unlikely(ret < len)) {
1124 			do {
1125 				rte_pktmbuf_free(m_table[ret]);
1126 			} while (++ret < len);
1127 		}
1128 
1129 		len = 0;
1130 	}
1131 
1132 	tx_q->len = len;
1133 	return;
1134 }
1135 /*
1136  * This function is called by each data core. It handles all RX/TX registered with the
1137  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1138  * with all devices in the main linked list.
1139  */
1140 static int
1141 switch_worker(void *arg)
1142 {
1143 	struct rte_mempool *mbuf_pool = arg;
1144 	struct virtio_net *dev = NULL;
1145 	struct vhost_dev *vdev = NULL;
1146 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1147 	struct virtio_net_data_ll *dev_ll;
1148 	struct mbuf_table *tx_q;
1149 	volatile struct lcore_ll_info *lcore_ll;
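	/*
	 * Convert the TX drain period from microseconds to TSC cycles:
	 * cycles per microsecond (rounded up) times BURST_TX_DRAIN_US.
	 */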
1150 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1151 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1152 	unsigned ret, i;
1153 	const uint16_t lcore_id = rte_lcore_id();
1154 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1155 	uint16_t rx_count = 0;
1156 	uint16_t tx_count;
1157 	uint32_t retry = 0;
1158 
1159 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1160 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1161 	prev_tsc = 0;
1162 
1163 	tx_q = &lcore_tx_queue[lcore_id];
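	/*
	 * Give this core its own TX queue index: its position in lcore_ids.
	 * This is expected to line up with the tx_rings == rte_lcore_count()
	 * queues configured in port_init() for the non zero copy case.
	 */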
1164 	for (i = 0; i < num_cores; i ++) {
1165 		if (lcore_ids[i] == lcore_id) {
1166 			tx_q->txq_id = i;
1167 			break;
1168 		}
1169 	}
1170 
1171 	while(1) {
1172 		cur_tsc = rte_rdtsc();
1173 		/*
1174 		 * TX burst queue drain
1175 		 */
1176 		diff_tsc = cur_tsc - prev_tsc;
1177 		if (unlikely(diff_tsc > drain_tsc)) {
1178 
1179 			if (tx_q->len) {
1180 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1181 
1182 				/*Tx any packets in the queue*/
1183 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1184 									   (struct rte_mbuf **)tx_q->m_table,
1185 									   (uint16_t)tx_q->len);
1186 				if (unlikely(ret < tx_q->len)) {
1187 					do {
1188 						rte_pktmbuf_free(tx_q->m_table[ret]);
1189 					} while (++ret < tx_q->len);
1190 				}
1191 
1192 				tx_q->len = 0;
1193 			}
1194 
1195 			prev_tsc = cur_tsc;
1196 
1197 		}
1198 
1199 		rte_prefetch0(lcore_ll->ll_root_used);
1200 		/*
1201 		 * Inform the configuration core that we have exited the linked list and that no devices are
1202 		 * in use if requested.
1203 		 */
1204 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1205 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1206 
1207 		/*
1208 		 * Process devices
1209 		 */
1210 		dev_ll = lcore_ll->ll_root_used;
1211 
1212 		while (dev_ll != NULL) {
1213 			/*get virtio device ID*/
1214 			vdev = dev_ll->vdev;
1215 			dev = vdev->dev;
1216 
1217 			if (vdev->remove) {
1218 				dev_ll = dev_ll->next;
1219 				unlink_vmdq(vdev);
1220 				vdev->ready = DEVICE_SAFE_REMOVE;
1221 				continue;
1222 			}
1223 			if (likely(vdev->ready == DEVICE_RX)) {
1224 				/*Handle guest RX*/
1225 				rx_count = rte_eth_rx_burst(ports[0],
1226 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1227 
1228 				if (rx_count) {
1229 					/*
1230 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1231 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1232 					*/
1233 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1234 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1235 							rte_delay_us(burst_rx_delay_time);
1236 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1237 								break;
1238 						}
1239 					}
1240 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1241 					if (enable_stats) {
1242 						rte_atomic64_add(
1243 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1244 						rx_count);
1245 						rte_atomic64_add(
1246 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1247 					}
1248 					while (likely(rx_count)) {
1249 						rx_count--;
1250 						rte_pktmbuf_free(pkts_burst[rx_count]);
1251 					}
1252 
1253 				}
1254 			}
1255 
1256 			if (!vdev->remove) {
1257 				/* Handle guest TX*/
1258 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1259 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1260 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1261 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1262 						while (tx_count--)
1263 							rte_pktmbuf_free(pkts_burst[tx_count]);
1264 					}
1265 				}
1266 				while (tx_count)
1267 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1268 			}
1269 
1270 			/*move to the next device in the list*/
1271 			dev_ll = dev_ll->next;
1272 		}
1273 	}
1274 
1275 	return 0;
1276 }
1277 
1278 /*
1279  * This function gets the number of available ring entries for zero copy RX.
1280  * Only one thread will call this function for a particular virtio device,
1281  * so it is designed as a non-thread-safe function.
1282  */
1283 static inline uint32_t __attribute__((always_inline))
1284 get_available_ring_num_zcp(struct virtio_net *dev)
1285 {
1286 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1287 	uint16_t avail_idx;
1288 
1289 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1290 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1291 }
1292 
1293 /*
1294  * This function gets available ring indexes for zero copy RX;
1295  * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
1296  * Only one thread will call this function for a particular virtio device,
1297  * so it is designed as a non-thread-safe function.
1298  */
1299 static inline uint32_t __attribute__((always_inline))
1300 get_available_ring_index_zcp(struct virtio_net *dev,
1301 	uint16_t *res_base_idx, uint32_t count)
1302 {
1303 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1304 	uint16_t avail_idx;
1305 	uint32_t retry = 0;
1306 	uint16_t free_entries;
1307 
1308 	*res_base_idx = vq->last_used_idx_res;
1309 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1310 	free_entries = (avail_idx - *res_base_idx);
1311 
1312 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1313 			"avail idx: %d, "
1314 			"res base idx:%d, free entries:%d\n",
1315 			dev->device_fh, avail_idx, *res_base_idx,
1316 			free_entries);
1317 
1318 	/*
1319 	 * If retry is enabled and the queue is full then we wait
1320 	 * and retry to avoid packet loss.
1321 	 */
1322 	if (enable_retry && unlikely(count > free_entries)) {
1323 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1324 			rte_delay_us(burst_rx_delay_time);
1325 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1326 			free_entries = (avail_idx - *res_base_idx);
1327 			if (count <= free_entries)
1328 				break;
1329 		}
1330 	}
1331 
1332 	/*check that we have enough buffers*/
1333 	if (unlikely(count > free_entries))
1334 		count = free_entries;
1335 
1336 	if (unlikely(count == 0)) {
1337 		LOG_DEBUG(VHOST_DATA,
1338 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1339 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1340 			dev->device_fh, avail_idx,
1341 			*res_base_idx, free_entries);
1342 		return 0;
1343 	}
1344 
1345 	vq->last_used_idx_res = *res_base_idx + count;
1346 
1347 	return count;
1348 }
1349 
1350 /*
1351  * This function puts a descriptor back on the used list.
1352  */
1353 static inline void __attribute__((always_inline))
1354 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1355 {
1356 	uint16_t res_cur_idx = vq->last_used_idx;
1357 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1358 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1359 	rte_compiler_barrier();
1360 	*(volatile uint16_t *)&vq->used->idx += 1;
1361 	vq->last_used_idx += 1;
1362 
1363 	/* Kick the guest if necessary. */
1364 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1365 		eventfd_write((int)vq->kickfd, 1);
1366 }
1367 
1368 /*
1369  * This function gets an available descriptor from the virtio vring and an
1370  * unattached mbuf from vpool->ring, and then attaches them together. The offsets
1371  * for buff_addr and phys_addr need to be adjusted according to the PMD
1372  * implementation, otherwise the frame data may end up at the wrong location in the mbuf.
1373  */
1374 static inline void __attribute__((always_inline))
1375 attach_rxmbuf_zcp(struct virtio_net *dev)
1376 {
1377 	uint16_t res_base_idx, desc_idx;
1378 	uint64_t buff_addr, phys_addr;
1379 	struct vhost_virtqueue *vq;
1380 	struct vring_desc *desc;
1381 	struct rte_mbuf *mbuf = NULL;
1382 	struct vpool *vpool;
1383 	hpa_type addr_type;
1384 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1385 
1386 	vpool = &vpool_array[vdev->vmdq_rx_q];
1387 	vq = dev->virtqueue[VIRTIO_RXQ];
1388 
1389 	do {
1390 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1391 				1) != 1))
1392 			return;
1393 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1394 
1395 		desc = &vq->desc[desc_idx];
1396 		if (desc->flags & VRING_DESC_F_NEXT) {
1397 			desc = &vq->desc[desc->next];
1398 			buff_addr = gpa_to_vva(dev, desc->addr);
1399 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1400 					&addr_type);
1401 		} else {
1402 			buff_addr = gpa_to_vva(dev,
1403 					desc->addr + vq->vhost_hlen);
1404 			phys_addr = gpa_to_hpa(vdev,
1405 					desc->addr + vq->vhost_hlen,
1406 					desc->len, &addr_type);
1407 		}
1408 
1409 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1410 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1411 				" address found when attaching RX frame buffer"
1412 				" address!\n", dev->device_fh);
1413 			put_desc_to_used_list_zcp(vq, desc_idx);
1414 			continue;
1415 		}
1416 
1417 		/*
1418 		 * Check if the frame buffer address from guest crosses
1419 		 * sub-region or not.
1420 		 */
1421 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1422 			RTE_LOG(ERR, VHOST_DATA,
1423 				"(%"PRIu64") Frame buffer address crosses a "
1424 				"sub-region boundary when attaching RX frame "
1425 				"buffer address!\n",
1426 				dev->device_fh);
1427 			put_desc_to_used_list_zcp(vq, desc_idx);
1428 			continue;
1429 		}
1430 	} while (unlikely(phys_addr == 0));
1431 
1432 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1433 	if (unlikely(mbuf == NULL)) {
1434 		LOG_DEBUG(VHOST_DATA,
1435 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1436 			"ring_sc_dequeue fail.\n",
1437 			dev->device_fh);
1438 		put_desc_to_used_list_zcp(vq, desc_idx);
1439 		return;
1440 	}
1441 
1442 	if (unlikely(vpool->buf_size > desc->len)) {
1443 		LOG_DEBUG(VHOST_DATA,
1444 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1445 			"length(%d) of descriptor idx: %d less than room "
1446 			"size required: %d\n",
1447 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1448 		put_desc_to_used_list_zcp(vq, desc_idx);
1449 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1450 		return;
1451 	}
1452 
1453 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1454 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1455 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1456 	mbuf->data_len = desc->len;
1457 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1458 
1459 	LOG_DEBUG(VHOST_DATA,
1460 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1461 		"descriptor idx:%d\n",
1462 		dev->device_fh, res_base_idx, desc_idx);
1463 
1464 	__rte_mbuf_raw_free(mbuf);
1465 
1466 	return;
1467 }
1468 
1469 /*
1470  * Detach an attached packet mbuf -
1471  *  - restore original mbuf address and length values.
1472  *  - reset pktmbuf data and data_len to their default values.
1473  *  All other fields of the given packet mbuf will be left intact.
1474  *
1475  * @param m
1476  *   The attached packet mbuf.
1477  */
1478 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1479 {
1480 	const struct rte_mempool *mp = m->pool;
1481 	void *buf = RTE_MBUF_TO_BADDR(m);
1482 	uint32_t buf_ofs;
1483 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1484 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1485 
1486 	m->buf_addr = buf;
1487 	m->buf_len = (uint16_t)buf_len;
1488 
1489 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1490 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1491 	m->data_off = buf_ofs;
1492 
1493 	m->data_len = 0;
1494 }
1495 
1496 /*
1497  * This function is called after packets have been transmitted. It fetches mbufs
1498  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1499  * the used index and kicks the guest if necessary.
1500  */
1501 static inline uint32_t __attribute__((always_inline))
1502 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1503 {
1504 	struct rte_mbuf *mbuf;
1505 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1506 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1507 	uint32_t index = 0;
1508 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1509 
1510 	LOG_DEBUG(VHOST_DATA,
1511 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1512 		"clean is: %d\n",
1513 		dev->device_fh, mbuf_count);
1514 	LOG_DEBUG(VHOST_DATA,
1515 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1516 		"clean  is : %d\n",
1517 		dev->device_fh, rte_ring_count(vpool->ring));
1518 
1519 	for (index = 0; index < mbuf_count; index++) {
1520 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1521 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1522 			pktmbuf_detach_zcp(mbuf);
1523 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1524 
1525 		/* Update used index buffer information. */
1526 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1527 		vq->used->ring[used_idx].len = 0;
1528 
1529 		used_idx = (used_idx + 1) & (vq->size - 1);
1530 	}
1531 
1532 	LOG_DEBUG(VHOST_DATA,
1533 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1534 		"clean is: %d\n",
1535 		dev->device_fh, rte_mempool_count(vpool->pool));
1536 	LOG_DEBUG(VHOST_DATA,
1537 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1538 		"clean  is : %d\n",
1539 		dev->device_fh, rte_ring_count(vpool->ring));
1540 	LOG_DEBUG(VHOST_DATA,
1541 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1542 		"vq->last_used_idx:%d\n",
1543 		dev->device_fh, vq->last_used_idx);
1544 
1545 	vq->last_used_idx += mbuf_count;
1546 
1547 	LOG_DEBUG(VHOST_DATA,
1548 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1549 		"vq->last_used_idx:%d\n",
1550 		dev->device_fh, vq->last_used_idx);
1551 
1552 	rte_compiler_barrier();
1553 
1554 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1555 
1556 	/* Kick guest if required. */
1557 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1558 		eventfd_write((int)vq->kickfd, 1);
1559 
1560 	return 0;
1561 }
1562 
1563 /*
1564  * This function is called when a virtio device is destroyed.
1565  * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1566  */
1567 static void mbuf_destroy_zcp(struct vpool *vpool)
1568 {
1569 	struct rte_mbuf *mbuf = NULL;
1570 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1571 
1572 	LOG_DEBUG(VHOST_CONFIG,
1573 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1574 		"mbuf_destroy_zcp is: %d\n",
1575 		mbuf_count);
1576 	LOG_DEBUG(VHOST_CONFIG,
1577 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1578 		"mbuf_destroy_zcp  is : %d\n",
1579 		rte_ring_count(vpool->ring));
1580 
1581 	for (index = 0; index < mbuf_count; index++) {
1582 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583 		if (likely(mbuf != NULL)) {
1584 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1585 				pktmbuf_detach_zcp(mbuf);
1586 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1587 		}
1588 	}
1589 
1590 	LOG_DEBUG(VHOST_CONFIG,
1591 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1592 		"mbuf_destroy_zcp is: %d\n",
1593 		rte_mempool_count(vpool->pool));
1594 	LOG_DEBUG(VHOST_CONFIG,
1595 		"in mbuf_destroy_zcp: mbuf count in ring after "
1596 		"mbuf_destroy_zcp is : %d\n",
1597 		rte_ring_count(vpool->ring));
1598 }
1599 
1600 /*
1601  * This function copies the virtio headers and updates the used ring for zero copy RX.
1602  */
1603 static inline uint32_t __attribute__((always_inline))
1604 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1605 	uint32_t count)
1606 {
1607 	struct vhost_virtqueue *vq;
1608 	struct vring_desc *desc;
1609 	struct rte_mbuf *buff;
1610 	/* The virtio_hdr is initialised to 0. */
1611 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1612 		= {{0, 0, 0, 0, 0, 0}, 0};
1613 	uint64_t buff_hdr_addr = 0;
1614 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1615 	uint32_t head_idx, packet_success = 0;
1616 	uint16_t res_cur_idx;
1617 
1618 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1619 
1620 	if (count == 0)
1621 		return 0;
1622 
1623 	vq = dev->virtqueue[VIRTIO_RXQ];
1624 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1625 
1626 	res_cur_idx = vq->last_used_idx;
1627 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1628 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1629 
1630 	/* Retrieve all of the head indexes first to avoid caching issues. */
1631 	for (head_idx = 0; head_idx < count; head_idx++)
1632 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1633 
1634 	/*Prefetch descriptor index. */
1635 	rte_prefetch0(&vq->desc[head[packet_success]]);
1636 
1637 	while (packet_success != count) {
1638 		/* Get descriptor from available ring */
1639 		desc = &vq->desc[head[packet_success]];
1640 
1641 		buff = pkts[packet_success];
1642 		LOG_DEBUG(VHOST_DATA,
1643 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1644 			"pkt[%d] descriptor idx: %d\n",
1645 			dev->device_fh, packet_success,
1646 			MBUF_HEADROOM_UINT32(buff));
1647 
1648 		PRINT_PACKET(dev,
1649 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1650 			+ RTE_PKTMBUF_HEADROOM),
1651 			rte_pktmbuf_data_len(buff), 0);
1652 
1653 		/* Buffer address translation for virtio header. */
1654 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1655 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1656 
1657 		/*
1658 		 * If the descriptors are chained the header and data are
1659 		 * placed in separate buffers.
1660 		 */
1661 		if (desc->flags & VRING_DESC_F_NEXT) {
1662 			desc->len = vq->vhost_hlen;
1663 			desc = &vq->desc[desc->next];
1664 			desc->len = rte_pktmbuf_data_len(buff);
1665 		} else {
1666 			desc->len = packet_len;
1667 		}
1668 
1669 		/* Update used ring with desc information */
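		/* vq->size is a power of two, so masking with (vq->size - 1) wraps
		 * the used ring index without a modulo operation. */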
1670 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1671 			= head[packet_success];
1672 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1673 			= packet_len;
1674 		res_cur_idx++;
1675 		packet_success++;
1676 
1677 		/* A header is required per buffer. */
1678 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1679 			(const void *)&virtio_hdr, vq->vhost_hlen);
1680 
1681 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1682 
1683 		if (likely(packet_success < count)) {
1684 			/* Prefetch descriptor index. */
1685 			rte_prefetch0(&vq->desc[head[packet_success]]);
1686 		}
1687 	}
1688 
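	/* Make sure the used ring writes above are ordered before the used index
	 * update below; the guest must not see the new index before the entries. */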
1689 	rte_compiler_barrier();
1690 
1691 	LOG_DEBUG(VHOST_DATA,
1692 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1693 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1694 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1695 
1696 	*(volatile uint16_t *)&vq->used->idx += count;
1697 	vq->last_used_idx += count;
1698 
1699 	LOG_DEBUG(VHOST_DATA,
1700 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1701 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1702 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1703 
1704 	/* Kick the guest if necessary. */
1705 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1706 		eventfd_write((int)vq->kickfd, 1);
1707 
1708 	return count;
1709 }
1710 
1711 /*
1712  * This function routes the TX packet to the correct interface.
1713  * This may be a local device or the physical port.
1714  */
1715 static inline void __attribute__((always_inline))
1716 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1717 	uint32_t desc_idx, uint8_t need_copy)
1718 {
1719 	struct mbuf_table *tx_q;
1720 	struct rte_mbuf **m_table;
1721 	struct rte_mbuf *mbuf = NULL;
1722 	unsigned len, ret, offset = 0;
1723 	struct vpool *vpool;
1724 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1725 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1726 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1727 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1728 
1729 	/* Add packet to the port tx queue. */
1730 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1731 	len = tx_q->len;
1732 
1733 	/* Allocate an mbuf and populate the structure. */
1734 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1735 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1736 	if (unlikely(mbuf == NULL)) {
1737 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1738 		RTE_LOG(ERR, VHOST_DATA,
1739 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1740 			dev->device_fh);
1741 		put_desc_to_used_list_zcp(vq, desc_idx);
1742 		return;
1743 	}
1744 
1745 	if (vm2vm_mode == VM2VM_HARDWARE) {
1746 		/* Avoid using a VLAN tag that belongs to any VM for an external
1747 		 * packet, e.g. vlan_tags[dev->device_fh]; otherwise pool selection
1748 		 * becomes ambiguous: the MAC address marks it as an external packet
1749 		 * that should go out to the network, while the VLAN tag marks it as
1750 		 * a VM-to-VM packet that should be forwarded to another VM. The
1751 		 * hardware cannot resolve this conflict and the packet is lost.
1752 		 */
1753 		vlan_tag = external_pkt_default_vlan_tag;
1754 		while (dev_ll != NULL) {
1755 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1756 				ether_addr_cmp(&(pkt_hdr->d_addr),
1757 				&dev_ll->vdev->mac_address)) {
1758 
1759 				/*
1760 				 * Drop the packet if the TX packet is destined
1761 				 * for the TX device.
1762 				 */
1763 				if (unlikely(dev_ll->vdev->dev->device_fh
1764 					== dev->device_fh)) {
1765 					LOG_DEBUG(VHOST_DATA,
1766 					"(%"PRIu64") TX: Source and destination"
1767 					"MAC addresses are the same. Dropping "
1768 					"packet.\n",
1769 					dev_ll->vdev->dev->device_fh);
1770 					MBUF_HEADROOM_UINT32(mbuf)
1771 						= (uint32_t)desc_idx;
1772 					__rte_mbuf_raw_free(mbuf);
1773 					return;
1774 				}
1775 
1776 				/*
1777 				 * Packet length offset 4 bytes for HW vlan
1778 				 * strip when L2 switch back.
1779 				 */
1780 				offset = 4;
1781 				vlan_tag =
1782 				(uint16_t)
1783 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1784 
1785 				LOG_DEBUG(VHOST_DATA,
1786 				"(%"PRIu64") TX: pkt to local VM device id:"
1787 				"(%"PRIu64") vlan tag: %d.\n",
1788 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1789 				vlan_tag);
1790 
1791 				break;
1792 			}
1793 			dev_ll = dev_ll->next;
1794 		}
1795 	}
1796 
1797 	mbuf->nb_segs = m->nb_segs;
1798 	mbuf->next = m->next;
1799 	mbuf->data_len = m->data_len + offset;
1800 	mbuf->pkt_len = mbuf->data_len;
1801 	if (unlikely(need_copy)) {
1802 		/* Copy the packet contents to the mbuf. */
1803 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1804 			rte_pktmbuf_mtod(m, void *),
1805 			m->data_len);
1806 	} else {
1807 		mbuf->data_off = m->data_off;
1808 		mbuf->buf_physaddr = m->buf_physaddr;
1809 		mbuf->buf_addr = m->buf_addr;
1810 	}
1811 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1812 	mbuf->vlan_tci = vlan_tag;
1813 	mbuf->l2_len = sizeof(struct ether_hdr);
1814 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1815 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1816 
1817 	tx_q->m_table[len] = mbuf;
1818 	len++;
1819 
1820 	LOG_DEBUG(VHOST_DATA,
1821 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1822 		dev->device_fh,
1823 		mbuf->nb_segs,
1824 		(mbuf->next == NULL) ? "null" : "non-null");
1825 
1826 	if (enable_stats) {
1827 		dev_statistics[dev->device_fh].tx_total++;
1828 		dev_statistics[dev->device_fh].tx++;
1829 	}
1830 
1831 	if (unlikely(len == MAX_PKT_BURST)) {
1832 		m_table = (struct rte_mbuf **)tx_q->m_table;
1833 		ret = rte_eth_tx_burst(ports[0],
1834 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1835 
1836 		/*
1837 		 * Free any buffers not handled by TX and update
1838 		 * the port stats.
1839 		 */
1840 		if (unlikely(ret < len)) {
1841 			do {
1842 				rte_pktmbuf_free(m_table[ret]);
1843 			} while (++ret < len);
1844 		}
1845 
1846 		len = 0;
1847 		txmbuf_clean_zcp(dev, vpool);
1848 	}
1849 
1850 	tx_q->len = len;
1851 
1852 	return;
1853 }
1854 
1855 /*
1856  * This function transmits all available packets in the virtio TX queue of
1857  * one virtio-net device. On the first packet it also learns the MAC address
1858  * and sets up VMDQ.
1859  */
1860 static inline void __attribute__((always_inline))
1861 virtio_dev_tx_zcp(struct virtio_net *dev)
1862 {
1863 	struct rte_mbuf m;
1864 	struct vhost_virtqueue *vq;
1865 	struct vring_desc *desc;
1866 	uint64_t buff_addr = 0, phys_addr;
1867 	uint32_t head[MAX_PKT_BURST];
1868 	uint32_t i;
1869 	uint16_t free_entries, packet_success = 0;
1870 	uint16_t avail_idx;
1871 	uint8_t need_copy = 0;
1872 	hpa_type addr_type;
1873 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1874 
1875 	vq = dev->virtqueue[VIRTIO_TXQ];
1876 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1877 
1878 	/* If there are no available buffers then return. */
1879 	if (vq->last_used_idx_res == avail_idx)
1880 		return;
1881 
1882 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1883 
1884 	/* Prefetch available ring to retrieve head indexes. */
1885 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1886 
1887 	/* Get the number of free entries in the ring */
1888 	free_entries = (avail_idx - vq->last_used_idx_res);
1889 
1890 	/* Limit to MAX_PKT_BURST. */
1891 	free_entries
1892 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1893 
1894 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1895 		dev->device_fh, free_entries);
1896 
1897 	/* Retrieve all of the head indexes first to avoid caching issues. */
1898 	for (i = 0; i < free_entries; i++)
1899 		head[i]
1900 			= vq->avail->ring[(vq->last_used_idx_res + i)
1901 			& (vq->size - 1)];
1902 
1903 	vq->last_used_idx_res += free_entries;
1904 
1905 	/* Prefetch descriptor index. */
1906 	rte_prefetch0(&vq->desc[head[packet_success]]);
1907 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1908 
1909 	while (packet_success < free_entries) {
1910 		desc = &vq->desc[head[packet_success]];
1911 
1912 		/* Discard first buffer as it is the virtio header */
1913 		desc = &vq->desc[desc->next];
1914 
1915 		/* Buffer address translation. */
1916 		buff_addr = gpa_to_vva(dev, desc->addr);
1917 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1918 
1919 		if (likely(packet_success < (free_entries - 1)))
1920 			/* Prefetch descriptor index. */
1921 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1922 
1923 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1924 			RTE_LOG(ERR, VHOST_DATA,
1925 				"(%"PRIu64") Invalid frame buffer address found"
1926 				"when TX packets!\n",
1927 				dev->device_fh);
1928 			packet_success++;
1929 			continue;
1930 		}
1931 
1932 		/* Prefetch buffer address. */
1933 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1934 
1935 		/*
1936 		 * Setup dummy mbuf. This is copied to a real mbuf if
1937 		 * transmitted out the physical port.
1938 		 */
1939 		m.data_len = desc->len;
1940 		m.nb_segs = 1;
1941 		m.next = NULL;
1942 		m.data_off = 0;
1943 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1944 		m.buf_physaddr = phys_addr;
1945 
1946 		/*
1947 		 * Check if the frame buffer address from guest crosses
1948 		 * sub-region or not.
1949 		 */
1950 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1951 			RTE_LOG(ERR, VHOST_DATA,
1952 				"(%"PRIu64") Frame buffer address crossing a "
1953 				"sub-region found when attaching TX frame "
1954 				"buffer address!\n",
1955 				dev->device_fh);
1956 			need_copy = 1;
1957 		} else
1958 			need_copy = 0;
1959 
1960 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1961 
1962 		/*
1963 		 * If this is the first received packet we need to learn
1964 		 * the MAC and setup VMDQ
1965 		 */
1966 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1967 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1968 				/*
1969 				 * Discard frame if device is scheduled for
1970 				 * removal or a duplicate MAC address is found.
1971 				 */
1972 				packet_success += free_entries;
1973 				vq->last_used_idx += packet_success;
1974 				break;
1975 			}
1976 		}
1977 
1978 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1979 		packet_success++;
1980 	}
1981 }
1982 
1983 /*
1984  * This function is called by each data core. It handles all RX/TX registered
1985  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1986  * addresses are compared with all devices in the main linked list.
1987  */
1988 static int
1989 switch_worker_zcp(__attribute__((unused)) void *arg)
1990 {
1991 	struct virtio_net *dev = NULL;
1992 	struct vhost_dev  *vdev = NULL;
1993 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1994 	struct virtio_net_data_ll *dev_ll;
1995 	struct mbuf_table *tx_q;
1996 	volatile struct lcore_ll_info *lcore_ll;
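	/* TX drain interval in TSC ticks: ceil(tsc_hz / US_PER_S) ticks per
	 * microsecond, multiplied by BURST_TX_DRAIN_US. */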
1997 	const uint64_t drain_tsc
1998 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1999 		* BURST_TX_DRAIN_US;
2000 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2001 	unsigned ret;
2002 	const uint16_t lcore_id = rte_lcore_id();
2003 	uint16_t count_in_ring, rx_count = 0;
2004 
2005 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2006 
2007 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2008 	prev_tsc = 0;
2009 
2010 	while (1) {
2011 		cur_tsc = rte_rdtsc();
2012 
2013 		/* TX burst queue drain */
2014 		diff_tsc = cur_tsc - prev_tsc;
2015 		if (unlikely(diff_tsc > drain_tsc)) {
2016 			/*
2017 			 * Get mbuf from vpool.pool and detach mbuf and
2018 			 * put back into vpool.ring.
2019 			 */
2020 			dev_ll = lcore_ll->ll_root_used;
2021 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2022 				/* Get virtio device ID */
2023 				vdev = dev_ll->vdev;
2024 				dev = vdev->dev;
2025 
2026 				if (likely(!vdev->remove)) {
2027 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2028 					if (tx_q->len) {
2029 						LOG_DEBUG(VHOST_DATA,
2030 						"TX queue drained after timeout"
2031 						" with burst size %u\n",
2032 						tx_q->len);
2033 
2034 						/*
2035 						 * Tx any packets in the queue
2036 						 */
2037 						ret = rte_eth_tx_burst(
2038 							ports[0],
2039 							(uint16_t)tx_q->txq_id,
2040 							(struct rte_mbuf **)
2041 							tx_q->m_table,
2042 							(uint16_t)tx_q->len);
2043 						if (unlikely(ret < tx_q->len)) {
2044 							do {
2045 								rte_pktmbuf_free(
2046 									tx_q->m_table[ret]);
2047 							} while (++ret < tx_q->len);
2048 						}
2049 						tx_q->len = 0;
2050 
2051 						txmbuf_clean_zcp(dev,
2052 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2053 					}
2054 				}
2055 				dev_ll = dev_ll->next;
2056 			}
2057 			prev_tsc = cur_tsc;
2058 		}
2059 
2060 		rte_prefetch0(lcore_ll->ll_root_used);
2061 
2062 		/*
2063 		 * Inform the configuration core that we have exited the linked
2064 		 * list and that no devices are in use if requested.
2065 		 */
2066 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2067 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2068 
2069 		/* Process devices */
2070 		dev_ll = lcore_ll->ll_root_used;
2071 
2072 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2073 			vdev = dev_ll->vdev;
2074 			dev  = vdev->dev;
2075 			if (unlikely(vdev->remove)) {
2076 				dev_ll = dev_ll->next;
2077 				unlink_vmdq(vdev);
2078 				vdev->ready = DEVICE_SAFE_REMOVE;
2079 				continue;
2080 			}
2081 
2082 			if (likely(vdev->ready == DEVICE_RX)) {
2083 				uint32_t index = vdev->vmdq_rx_q;
2084 				uint16_t i;
2085 				count_in_ring
2086 				= rte_ring_count(vpool_array[index].ring);
2087 				uint16_t free_entries
2088 				= (uint16_t)get_available_ring_num_zcp(dev);
2089 
2090 				/*
2091 				 * Attach all mbufs in vpool.ring and put back
2092 				 * into vpool.pool.
2093 				 */
2094 				for (i = 0;
2095 				i < RTE_MIN(free_entries,
2096 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2097 				i++)
2098 					attach_rxmbuf_zcp(dev);
2099 
2100 				/* Handle guest RX */
2101 				rx_count = rte_eth_rx_burst(ports[0],
2102 					vdev->vmdq_rx_q, pkts_burst,
2103 					MAX_PKT_BURST);
2104 
2105 				if (rx_count) {
2106 					ret_count = virtio_dev_rx_zcp(dev,
2107 							pkts_burst, rx_count);
2108 					if (enable_stats) {
2109 						dev_statistics[dev->device_fh].rx_total
2110 							+= rx_count;
2111 						dev_statistics[dev->device_fh].rx
2112 							+= ret_count;
2113 					}
2114 					while (likely(rx_count)) {
2115 						rx_count--;
2116 						pktmbuf_detach_zcp(
2117 							pkts_burst[rx_count]);
2118 						rte_ring_sp_enqueue(
2119 							vpool_array[index].ring,
2120 							(void *)pkts_burst[rx_count]);
2121 					}
2122 				}
2123 			}
2124 
2125 			if (likely(!vdev->remove))
2126 				/* Handle guest TX */
2127 				virtio_dev_tx_zcp(dev);
2128 
2129 			/* Move to the next device in the list */
2130 			dev_ll = dev_ll->next;
2131 		}
2132 	}
2133 
2134 	return 0;
2135 }
2136 
2137 
2138 /*
2139  * Add an entry to a used linked list. A free entry must first be found
2140  * in the free linked list using get_data_ll_free_entry();
2141  */
2142 static void
2143 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2144 	struct virtio_net_data_ll *ll_dev)
2145 {
2146 	struct virtio_net_data_ll *ll = *ll_root_addr;
2147 
2148 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2149 	ll_dev->next = NULL;
2150 	rte_compiler_barrier();
2151 
2152 	/* If ll == NULL then this is the first device. */
2153 	if (ll) {
2154 		/* Increment to the tail of the linked list. */
2155 		while (ll->next != NULL)
2156 			ll = ll->next;
2157 
2158 		ll->next = ll_dev;
2159 	} else {
2160 		*ll_root_addr = ll_dev;
2161 	}
2162 }
2163 
2164 /*
2165  * Remove an entry from a used linked list. The entry must then be added to
2166  * the free linked list using put_data_ll_free_entry().
2167  */
2168 static void
2169 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2170 	struct virtio_net_data_ll *ll_dev,
2171 	struct virtio_net_data_ll *ll_dev_last)
2172 {
2173 	struct virtio_net_data_ll *ll = *ll_root_addr;
2174 
2175 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2176 		return;
2177 
2178 	if (ll_dev == ll)
2179 		*ll_root_addr = ll_dev->next;
2180 	else
2181 		if (likely(ll_dev_last != NULL))
2182 			ll_dev_last->next = ll_dev->next;
2183 		else
2184 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2185 }
2186 
2187 /*
2188  * Find and return an entry from the free linked list.
2189  */
2190 static struct virtio_net_data_ll *
2191 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2192 {
2193 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2194 	struct virtio_net_data_ll *ll_dev;
2195 
2196 	if (ll_free == NULL)
2197 		return NULL;
2198 
2199 	ll_dev = ll_free;
2200 	*ll_root_addr = ll_free->next;
2201 
2202 	return ll_dev;
2203 }
2204 
2205 /*
2206  * Place an entry back on to the free linked list.
2207  */
2208 static void
2209 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2210 	struct virtio_net_data_ll *ll_dev)
2211 {
2212 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2213 
2214 	if (ll_dev == NULL)
2215 		return;
2216 
2217 	ll_dev->next = ll_free;
2218 	*ll_root_addr = ll_dev;
2219 }
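
/*
 * Illustrative sketch (not compiled): the intended lifecycle of a linked list
 * entry is to take it from a free list with get_data_ll_free_entry(), fill it
 * in, publish it with add_data_ll_entry(), and later remove it with
 * rm_data_ll_entry() before returning it via put_data_ll_free_entry(). The
 * helper below is hypothetical and only mirrors what new_device() and
 * destroy_device() do further down in this file.
 */
#if 0
static void
example_ll_entry_lifecycle(struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *entry;

	/* Take a free entry; NULL means no free entries are left. */
	entry = get_data_ll_free_entry(&ll_root_free);
	if (entry == NULL)
		return;

	/* Populate the entry and publish it on the used list. */
	entry->vdev = vdev;
	add_data_ll_entry(&ll_root_used, entry);

	/*
	 * Later, on removal: the third argument is the previous element in the
	 * used list; NULL is only valid when the entry sits at the head.
	 */
	rm_data_ll_entry(&ll_root_used, entry, NULL);
	put_data_ll_free_entry(&ll_root_free, entry);
}
#endif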
2220 
2221 /*
2222  * Creates a linked list of a given size.
2223  */
2224 static struct virtio_net_data_ll *
2225 alloc_data_ll(uint32_t size)
2226 {
2227 	struct virtio_net_data_ll *ll_new;
2228 	uint32_t i;
2229 
2230 	/* Malloc and then chain the linked list. */
2231 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2232 	if (ll_new == NULL) {
2233 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2234 		return NULL;
2235 	}
2236 
2237 	for (i = 0; i < size - 1; i++) {
2238 		ll_new[i].vdev = NULL;
2239 		ll_new[i].next = &ll_new[i+1];
2240 	}
2241 	ll_new[i].next = NULL;
2242 
2243 	return (ll_new);
2244 }
2245 
2246 /*
2247  * Create the main linked list along with each individual core's linked list. A used and a free list
2248  * are created to manage entries.
2249  */
2250 static int
2251 init_data_ll (void)
2252 {
2253 	int lcore;
2254 
2255 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2256 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2257 		if (lcore_info[lcore].lcore_ll == NULL) {
2258 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2259 			return -1;
2260 		}
2261 
2262 		lcore_info[lcore].lcore_ll->device_num = 0;
2263 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2264 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2265 		if (num_devices % num_switching_cores)
2266 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2267 		else
2268 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2269 	}
2270 
2271 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2272 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2273 
2274 	return 0;
2275 }
2276 
2277 /*
2278  * Set virtqueue flags so that we do not receive interrupts.
2279  */
2280 static void
2281 set_irq_status (struct virtio_net *dev)
2282 {
2283 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2284 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2285 }
2286 
2287 /*
2288  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2289  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2290  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2291  */
2292 static void
2293 destroy_device (volatile struct virtio_net *dev)
2294 {
2295 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2296 	struct virtio_net_data_ll *ll_main_dev_cur;
2297 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2298 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2299 	struct vhost_dev *vdev;
2300 	int lcore;
2301 
2302 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2303 
2304 	vdev = (struct vhost_dev *)dev->priv;
2305 	/* Set the remove flag. */
2306 	vdev->remove = 1;
2307 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2308 		rte_pause();
2309 	}
2310 
2311 	/* Search for entry to be removed from lcore ll */
2312 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2313 	while (ll_lcore_dev_cur != NULL) {
2314 		if (ll_lcore_dev_cur->vdev == vdev) {
2315 			break;
2316 		} else {
2317 			ll_lcore_dev_last = ll_lcore_dev_cur;
2318 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2319 		}
2320 	}
2321 
2322 	if (ll_lcore_dev_cur == NULL) {
2323 		RTE_LOG(ERR, VHOST_CONFIG,
2324 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2325 			dev->device_fh);
2326 		return;
2327 	}
2328 
2329 	/* Search for entry to be removed from main ll */
2330 	ll_main_dev_cur = ll_root_used;
2331 	ll_main_dev_last = NULL;
2332 	while (ll_main_dev_cur != NULL) {
2333 		if (ll_main_dev_cur->vdev == vdev) {
2334 			break;
2335 		} else {
2336 			ll_main_dev_last = ll_main_dev_cur;
2337 			ll_main_dev_cur = ll_main_dev_cur->next;
2338 		}
2339 	}
2340 
2341 	/* Remove entries from the lcore and main ll. */
2342 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2343 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2344 
2345 	/* Set the dev_removal_flag on each lcore. */
2346 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2347 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2348 	}
2349 
2350 	/*
2351 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2352 	 * they can no longer access the device removed from the linked lists and that the devices
2353 	 * are no longer in use.
2354 	 */
2355 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2356 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2357 			rte_pause();
2358 		}
2359 	}
2360 
2361 	/* Add the entries back to the lcore and main free ll.*/
2362 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2363 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2364 
2365 	/* Decrement number of device on the lcore. */
2366 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2367 
2368 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2369 
2370 	if (zero_copy) {
2371 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2372 
2373 		/* Stop the RX queue. */
2374 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2375 			LOG_DEBUG(VHOST_CONFIG,
2376 				"(%"PRIu64") In destroy_device: Failed to stop "
2377 				"rx queue:%d\n",
2378 				dev->device_fh,
2379 				vdev->vmdq_rx_q);
2380 		}
2381 
2382 		LOG_DEBUG(VHOST_CONFIG,
2383 			"(%"PRIu64") in destroy_device: Start putting mbufs from "
2384 			"the mempool back into the ring for RX queue: %d\n",
2385 			dev->device_fh, vdev->vmdq_rx_q);
2386 
2387 		mbuf_destroy_zcp(vpool);
2388 
2389 		/* Stop the TX queue. */
2390 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2391 			LOG_DEBUG(VHOST_CONFIG,
2392 				"(%"PRIu64") In destroy_device: Failed to "
2393 				"stop tx queue:%d\n",
2394 				dev->device_fh, vdev->vmdq_rx_q);
2395 		}
2396 
2397 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2398 
2399 		LOG_DEBUG(VHOST_CONFIG,
2400 			"(%"PRIu64") destroy_device: Start putting mbufs from the "
2401 			"mempool back into the ring for TX queue: %d, dev:(%"PRIu64")\n",
2402 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2403 			dev->device_fh);
2404 
2405 		mbuf_destroy_zcp(vpool);
2406 		rte_free(vdev->regions_hpa);
2407 	}
2408 	rte_free(vdev);
2409 
2410 }
2411 
2412 /*
2413  * Count the physical-address discontinuities (i.e. the number of additional
2414  * physically contiguous sub-regions) within one particular region whose
2415  * vhost virtual address range is contiguous, starting at vva_start with size 'size'.
2416  */
2417 static uint32_t
2418 check_hpa_regions(uint64_t vva_start, uint64_t size)
2419 {
2420 	uint32_t i, nregions = 0, page_size = getpagesize();
2421 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2422 	if (vva_start % page_size) {
2423 		LOG_DEBUG(VHOST_CONFIG,
2424 			"in check_continuous: vva start(%p) mod page_size(%d) "
2425 			"has remainder\n",
2426 			(void *)(uintptr_t)vva_start, page_size);
2427 		return 0;
2428 	}
2429 	if (size % page_size) {
2430 		LOG_DEBUG(VHOST_CONFIG,
2431 			"in check_continuous: "
2432 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2433 			size, page_size);
2434 		return 0;
2435 	}
2436 	for (i = 0; i < size - page_size; i = i + page_size) {
2437 		cur_phys_addr
2438 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2439 		next_phys_addr = rte_mem_virt2phy(
2440 			(void *)(uintptr_t)(vva_start + i + page_size));
2441 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2442 			++nregions;
2443 			LOG_DEBUG(VHOST_CONFIG,
2444 				"in check_continuous: hva addr:(%p) is not "
2445 				"continuous with hva addr:(%p), diff:%d\n",
2446 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2447 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2448 				+ page_size), page_size);
2449 			LOG_DEBUG(VHOST_CONFIG,
2450 				"in check_continuous: hpa addr:(%p) is not "
2451 				"continuous with hpa addr:(%p), "
2452 				"diff:(%"PRIu64")\n",
2453 				(void *)(uintptr_t)cur_phys_addr,
2454 				(void *)(uintptr_t)next_phys_addr,
2455 				(next_phys_addr-cur_phys_addr));
2456 		}
2457 	}
2458 	return nregions;
2459 }
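
/*
 * Worked example (assuming a 4 KiB page size): a four-page region whose pages
 * map to host physical addresses 0x1000, 0x2000, 0x9000 and 0xa000 has one
 * discontinuity (0x2000 -> 0x9000), so check_hpa_regions() returns 1 and the
 * region is later split into two physically contiguous sub-regions.
 */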
2460 
2461 /*
2462  * Divide each region whose vhost virtual address range is contiguous into
2463  * sub-regions such that the physical addresses within each sub-region are
2464  * contiguous, and fill the offset (relative to the GPA), size and other
2465  * information of each sub-region into regions_hpa.
2466  */
2467 static uint32_t
2468 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2469 {
2470 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2471 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2472 
2473 	if (mem_region_hpa == NULL)
2474 		return 0;
2475 
2476 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2477 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2478 			virtio_memory->regions[regionidx].address_offset;
2479 		mem_region_hpa[regionidx_hpa].guest_phys_address
2480 			= virtio_memory->regions[regionidx].guest_phys_address;
2481 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2482 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2483 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2484 		LOG_DEBUG(VHOST_CONFIG,
2485 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2486 			regionidx_hpa,
2487 			(void *)(uintptr_t)
2488 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2489 		LOG_DEBUG(VHOST_CONFIG,
2490 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2491 			regionidx_hpa,
2492 			(void *)(uintptr_t)
2493 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2494 		for (i = 0, k = 0;
2495 			i < virtio_memory->regions[regionidx].memory_size -
2496 				page_size;
2497 			i += page_size) {
2498 			cur_phys_addr = rte_mem_virt2phy(
2499 					(void *)(uintptr_t)(vva_start + i));
2500 			next_phys_addr = rte_mem_virt2phy(
2501 					(void *)(uintptr_t)(vva_start +
2502 					i + page_size));
2503 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2504 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2505 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2506 					k + page_size;
2507 				mem_region_hpa[regionidx_hpa].memory_size
2508 					= k + page_size;
2509 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2510 					"phys addr end  [%d]:(%p)\n",
2511 					regionidx_hpa,
2512 					(void *)(uintptr_t)
2513 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2514 				LOG_DEBUG(VHOST_CONFIG,
2515 					"in fill_hpa_regions: guest phys addr "
2516 					"size [%d]:(%p)\n",
2517 					regionidx_hpa,
2518 					(void *)(uintptr_t)
2519 					(mem_region_hpa[regionidx_hpa].memory_size));
2520 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2521 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2522 				++regionidx_hpa;
2523 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2524 					next_phys_addr -
2525 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2526 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2527 					" phys addr start[%d]:(%p)\n",
2528 					regionidx_hpa,
2529 					(void *)(uintptr_t)
2530 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2531 				LOG_DEBUG(VHOST_CONFIG,
2532 					"in fill_hpa_regions: host  phys addr "
2533 					"start[%d]:(%p)\n",
2534 					regionidx_hpa,
2535 					(void *)(uintptr_t)
2536 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2537 				k = 0;
2538 			} else {
2539 				k += page_size;
2540 			}
2541 		}
2542 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2543 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2544 			+ k + page_size;
2545 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2546 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2547 			"[%d]:(%p)\n", regionidx_hpa,
2548 			(void *)(uintptr_t)
2549 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2550 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2551 			"[%d]:(%p)\n", regionidx_hpa,
2552 			(void *)(uintptr_t)
2553 			(mem_region_hpa[regionidx_hpa].memory_size));
2554 		++regionidx_hpa;
2555 	}
2556 	return regionidx_hpa;
2557 }
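
/*
 * Illustrative sketch (not compiled): once regions_hpa has been filled in, a
 * guest physical address that falls inside a sub-region translates to a host
 * physical address by adding that sub-region's host_phys_addr_offset. The
 * hypothetical helper below only shows the lookup idea; the real gpa_to_hpa()
 * used by the zero copy data path additionally reports whether the buffer
 * crosses a sub-region boundary.
 */
#if 0
static uint64_t
example_gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa)
{
	uint32_t i;
	struct virtio_memory_regions_hpa *region;

	for (i = 0; i < vdev->nregions_hpa; i++) {
		region = &vdev->regions_hpa[i];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa < region->guest_phys_address_end))
			return guest_pa + region->host_phys_addr_offset;
	}

	/* Not covered by any sub-region. */
	return 0;
}
#endif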
2558 
2559 /*
2560  * A new device is added to a data core. First the device is added to the main linked list
2561  * and then allocated to a specific data core.
2562  */
2563 static int
2564 new_device (struct virtio_net *dev)
2565 {
2566 	struct virtio_net_data_ll *ll_dev;
2567 	int lcore, core_add = 0;
2568 	uint32_t device_num_min = num_devices;
2569 	struct vhost_dev *vdev;
2570 	uint32_t regionidx;
2571 
2572 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2573 	if (vdev == NULL) {
2574 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2575 			dev->device_fh);
2576 		return -1;
2577 	}
2578 	vdev->dev = dev;
2579 	dev->priv = vdev;
2580 
2581 	if (zero_copy) {
2582 		vdev->nregions_hpa = dev->mem->nregions;
2583 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2584 			vdev->nregions_hpa
2585 				+= check_hpa_regions(
2586 					dev->mem->regions[regionidx].guest_phys_address
2587 					+ dev->mem->regions[regionidx].address_offset,
2588 					dev->mem->regions[regionidx].memory_size);
2589 
2590 		}
2591 
2592 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2593 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2594 			CACHE_LINE_SIZE);
2595 		if (vdev->regions_hpa == NULL) {
2596 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2597 			rte_free(vdev);
2598 			return -1;
2599 		}
2600 
2601 
2602 		if (fill_hpa_memory_regions(
2603 			vdev->regions_hpa, dev->mem
2604 			) != vdev->nregions_hpa) {
2605 
2606 			RTE_LOG(ERR, VHOST_CONFIG,
2607 				"hpa memory regions number mismatch: "
2608 				"[%d]\n", vdev->nregions_hpa);
2609 			rte_free(vdev->regions_hpa);
2610 			rte_free(vdev);
2611 			return -1;
2612 		}
2613 	}
2614 
2615 
2616 	/* Add device to main ll */
2617 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2618 	if (ll_dev == NULL) {
2619 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2620 			"of %d devices per core has been reached\n",
2621 			dev->device_fh, num_devices);
2622 		if (vdev->regions_hpa)
2623 			rte_free(vdev->regions_hpa);
2624 		rte_free(vdev);
2625 		return -1;
2626 	}
2627 	ll_dev->vdev = vdev;
2628 	add_data_ll_entry(&ll_root_used, ll_dev);
2629 	vdev->vmdq_rx_q
2630 		= dev->device_fh * (num_queues / num_devices);
2631 
2632 	if (zero_copy) {
2633 		uint32_t index = vdev->vmdq_rx_q;
2634 		uint32_t count_in_ring, i;
2635 		struct mbuf_table *tx_q;
2636 
2637 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2638 
2639 		LOG_DEBUG(VHOST_CONFIG,
2640 			"(%"PRIu64") in new_device: mbuf count in mempool "
2641 			"before attach is: %d\n",
2642 			dev->device_fh,
2643 			rte_mempool_count(vpool_array[index].pool));
2644 		LOG_DEBUG(VHOST_CONFIG,
2645 			"(%"PRIu64") in new_device: mbuf count in  ring "
2646 			"before attach  is : %d\n",
2647 			dev->device_fh, count_in_ring);
2648 
2649 		/*
2650 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2651 		 */
2652 		for (i = 0; i < count_in_ring; i++)
2653 			attach_rxmbuf_zcp(dev);
2654 
2655 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2656 			"mempool after attach is: %d\n",
2657 			dev->device_fh,
2658 			rte_mempool_count(vpool_array[index].pool));
2659 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2660 			"ring after attach  is : %d\n",
2661 			dev->device_fh,
2662 			rte_ring_count(vpool_array[index].ring));
2663 
2664 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2665 		tx_q->txq_id = vdev->vmdq_rx_q;
2666 
2667 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2668 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2669 
2670 			LOG_DEBUG(VHOST_CONFIG,
2671 				"(%"PRIu64") In new_device: Failed to start "
2672 				"tx queue:%d\n",
2673 				dev->device_fh, vdev->vmdq_rx_q);
2674 
2675 			mbuf_destroy_zcp(vpool);
2676 			rte_free(vdev->regions_hpa);
2677 			rte_free(vdev);
2678 			return -1;
2679 		}
2680 
2681 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2682 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2683 
2684 			LOG_DEBUG(VHOST_CONFIG,
2685 				"(%"PRIu64") In new_device: Failed to start "
2686 				"rx queue:%d\n",
2687 				dev->device_fh, vdev->vmdq_rx_q);
2688 
2689 			/* Stop the TX queue. */
2690 			if (rte_eth_dev_tx_queue_stop(ports[0],
2691 				vdev->vmdq_rx_q) != 0) {
2692 				LOG_DEBUG(VHOST_CONFIG,
2693 					"(%"PRIu64") In new_device: Failed to "
2694 					"stop tx queue:%d\n",
2695 					dev->device_fh, vdev->vmdq_rx_q);
2696 			}
2697 
2698 			mbuf_destroy_zcp(vpool);
2699 			rte_free(vdev->regions_hpa);
2700 			rte_free(vdev);
2701 			return -1;
2702 		}
2703 
2704 	}
2705 
2706 	/* Reset the ready flag. */
2707 	vdev->ready = DEVICE_MAC_LEARNING;
2708 	vdev->remove = 0;
2709 
2710 	/* Find a suitable lcore to add the device. */
2711 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2712 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2713 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2714 			core_add = lcore;
2715 		}
2716 	}
2717 	/* Add device to lcore ll */
2718 	ll_dev->dev->coreid = core_add;
2719 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2720 	if (ll_dev == NULL) {
2721 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2722 		vdev->ready = DEVICE_SAFE_REMOVE;
2723 		destroy_device(dev);
2724 		if (vdev->regions_hpa)
2725 			rte_free(vdev->regions_hpa);
2726 		rte_free(vdev);
2727 		return -1;
2728 	}
2729 	ll_dev->vdev = vdev;
2730 	vdev->coreid = core_add;
2731 
2732 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2733 
2734 	/* Initialize device stats */
2735 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2736 
2737 	/* Disable notifications. */
2738 	set_irq_status(dev);
2739 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2740 	dev->flags |= VIRTIO_DEV_RUNNING;
2741 
2742 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2743 
2744 	return 0;
2745 }
2746 
2747 /*
2748  * These callbacks allow devices to be added to the data core when configuration
2749  * has fully completed.
2750  */
2751 static const struct virtio_net_device_ops virtio_net_device_ops =
2752 {
2753 	.new_device =  new_device,
2754 	.destroy_device = destroy_device,
2755 };
2756 
2757 /*
2758  * This thread wakes up periodically to print stats if the user has
2759  * enabled them.
2760  */
2761 static void
2762 print_stats(void)
2763 {
2764 	struct virtio_net_data_ll *dev_ll;
2765 	uint64_t tx_dropped, rx_dropped;
2766 	uint64_t tx, tx_total, rx, rx_total;
2767 	uint32_t device_fh;
2768 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2769 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2770 
2771 	while(1) {
2772 		sleep(enable_stats);
2773 
2774 		/* Clear screen and move to top left */
2775 		printf("%s%s", clr, top_left);
2776 
2777 		printf("\nDevice statistics ====================================");
2778 
2779 		dev_ll = ll_root_used;
2780 		while (dev_ll != NULL) {
2781 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2782 			tx_total = dev_statistics[device_fh].tx_total;
2783 			tx = dev_statistics[device_fh].tx;
2784 			tx_dropped = tx_total - tx;
2785 			if (zero_copy == 0) {
2786 				rx_total = rte_atomic64_read(
2787 					&dev_statistics[device_fh].rx_total_atomic);
2788 				rx = rte_atomic64_read(
2789 					&dev_statistics[device_fh].rx_atomic);
2790 			} else {
2791 				rx_total = dev_statistics[device_fh].rx_total;
2792 				rx = dev_statistics[device_fh].rx;
2793 			}
2794 			rx_dropped = rx_total - rx;
2795 
2796 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2797 					"\nTX total: 		%"PRIu64""
2798 					"\nTX dropped: 		%"PRIu64""
2799 					"\nTX successful: 		%"PRIu64""
2800 					"\nRX total: 		%"PRIu64""
2801 					"\nRX dropped: 		%"PRIu64""
2802 					"\nRX successful: 		%"PRIu64"",
2803 					device_fh,
2804 					tx_total,
2805 					tx_dropped,
2806 					tx,
2807 					rx_total,
2808 					rx_dropped,
2809 					rx);
2810 
2811 			dev_ll = dev_ll->next;
2812 		}
2813 		printf("\n======================================================\n");
2814 	}
2815 }
2816 
2817 static void
2818 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2819 	char *ring_name, uint32_t nb_mbuf)
2820 {
2821 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2822 	vpool_array[index].pool
2823 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2824 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2825 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2826 		rte_pktmbuf_init, NULL, socket, 0);
2827 	if (vpool_array[index].pool != NULL) {
2828 		vpool_array[index].ring
2829 			= rte_ring_create(ring_name,
2830 				rte_align32pow2(nb_mbuf + 1),
2831 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2832 		if (likely(vpool_array[index].ring != NULL)) {
2833 			LOG_DEBUG(VHOST_CONFIG,
2834 				"in setup_mempool_tbl: mbuf count in "
2835 				"mempool is: %d\n",
2836 				rte_mempool_count(vpool_array[index].pool));
2837 			LOG_DEBUG(VHOST_CONFIG,
2838 				"in setup_mempool_tbl: mbuf count in "
2839 				"ring   is: %d\n",
2840 				rte_ring_count(vpool_array[index].ring));
2841 		} else {
2842 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2843 				ring_name);
2844 		}
2845 
2846 		/* Need to consider the headroom. */
2847 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2848 	} else {
2849 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2850 	}
2851 }
2852 
2853 
2854 /*
2855  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2856  * device is also registered here to handle the IOCTLs.
2857  */
2858 int
2859 MAIN(int argc, char *argv[])
2860 {
2861 	struct rte_mempool *mbuf_pool = NULL;
2862 	unsigned lcore_id, core_id = 0;
2863 	unsigned nb_ports, valid_num_ports;
2864 	int ret;
2865 	uint8_t portid, queue_id = 0;
2866 	static pthread_t tid;
2867 
2868 	/* init EAL */
2869 	ret = rte_eal_init(argc, argv);
2870 	if (ret < 0)
2871 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2872 	argc -= ret;
2873 	argv += ret;
2874 
2875 	/* parse app arguments */
2876 	ret = us_vhost_parse_args(argc, argv);
2877 	if (ret < 0)
2878 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2879 
2880 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2881 		if (rte_lcore_is_enabled(lcore_id))
2882 			lcore_ids[core_id ++] = lcore_id;
2883 
2884 	if (rte_lcore_count() > RTE_MAX_LCORE)
2885 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2886 
2887 	/* Set the number of switching cores available. */
2888 	num_switching_cores = rte_lcore_count()-1;
2889 
2890 	/* Get the number of physical ports. */
2891 	nb_ports = rte_eth_dev_count();
2892 	if (nb_ports > RTE_MAX_ETHPORTS)
2893 		nb_ports = RTE_MAX_ETHPORTS;
2894 
2895 	/*
2896 	 * Update the global variable num_ports and the global array ports, and
2897 	 * get the value of valid_num_ports according to the number of system ports.
2898 	 */
2899 	valid_num_ports = check_ports_num(nb_ports);
2900 
2901 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2902 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2903 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2904 		return -1;
2905 	}
2906 
2907 	if (zero_copy == 0) {
2908 		/* Create the mbuf pool. */
2909 		mbuf_pool = rte_mempool_create(
2910 				"MBUF_POOL",
2911 				NUM_MBUFS_PER_PORT
2912 				* valid_num_ports,
2913 				MBUF_SIZE, MBUF_CACHE_SIZE,
2914 				sizeof(struct rte_pktmbuf_pool_private),
2915 				rte_pktmbuf_pool_init, NULL,
2916 				rte_pktmbuf_init, NULL,
2917 				rte_socket_id(), 0);
2918 		if (mbuf_pool == NULL)
2919 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2920 
2921 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2922 			vpool_array[queue_id].pool = mbuf_pool;
2923 
2924 		if (vm2vm_mode == VM2VM_HARDWARE) {
2925 			/* Enable VT loop back to let L2 switch to do it. */
2926 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2927 			LOG_DEBUG(VHOST_CONFIG,
2928 				"Enable loop back for L2 switch in vmdq.\n");
2929 		}
2930 	} else {
2931 		uint32_t nb_mbuf;
2932 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2933 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2934 
2935 		/*
2936 		 * Zero copy defers queue RX/TX start to the time when guest
2937 		 * finishes its startup and packet buffers from that guest are
2938 		 * available.
2939 		 */
2940 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2941 		rx_conf_default.rx_drop_en = 0;
2942 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
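
		/*
		 * Per RX queue mempool size: one mbuf per RX descriptor plus
		 * per-core mempool cache and burst headroom.
		 */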
2943 		nb_mbuf = num_rx_descriptor
2944 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2945 			+ num_switching_cores * MAX_PKT_BURST;
2946 
2947 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2948 			snprintf(pool_name, sizeof(pool_name),
2949 				"rxmbuf_pool_%u", queue_id);
2950 			snprintf(ring_name, sizeof(ring_name),
2951 				"rxmbuf_ring_%u", queue_id);
2952 			setup_mempool_tbl(rte_socket_id(), queue_id,
2953 				pool_name, ring_name, nb_mbuf);
2954 		}
2955 
2956 		nb_mbuf = num_tx_descriptor
2957 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2958 				+ num_switching_cores * MAX_PKT_BURST;
2959 
2960 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2961 			snprintf(pool_name, sizeof(pool_name),
2962 				"txmbuf_pool_%u", queue_id);
2963 			snprintf(ring_name, sizeof(ring_name),
2964 				"txmbuf_ring_%u", queue_id);
2965 			setup_mempool_tbl(rte_socket_id(),
2966 				(queue_id + MAX_QUEUES),
2967 				pool_name, ring_name, nb_mbuf);
2968 		}
2969 
2970 		if (vm2vm_mode == VM2VM_HARDWARE) {
2971 			/* Enable VT loop back to let L2 switch to do it. */
2972 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2973 			LOG_DEBUG(VHOST_CONFIG,
2974 				"Enable loop back for L2 switch in vmdq.\n");
2975 		}
2976 	}
2977 	/* Set log level. */
2978 	rte_set_log_level(LOG_LEVEL);
2979 
2980 	/* initialize all ports */
2981 	for (portid = 0; portid < nb_ports; portid++) {
2982 		/* skip ports that are not enabled */
2983 		if ((enabled_port_mask & (1 << portid)) == 0) {
2984 			RTE_LOG(INFO, VHOST_PORT,
2985 				"Skipping disabled port %d\n", portid);
2986 			continue;
2987 		}
2988 		if (port_init(portid) != 0)
2989 			rte_exit(EXIT_FAILURE,
2990 				"Cannot initialize network ports\n");
2991 	}
2992 
2993 	/* Initialise all linked lists. */
2994 	if (init_data_ll() == -1)
2995 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2996 
2997 	/* Initialize device stats */
2998 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2999 
3000 	/* Enable stats if the user option is set. */
3001 	if (enable_stats)
3002 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3003 
3004 	/* Launch all data cores. */
3005 	if (zero_copy == 0) {
3006 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3007 			rte_eal_remote_launch(switch_worker,
3008 				mbuf_pool, lcore_id);
3009 		}
3010 	} else {
3011 		uint32_t count_in_mempool, index, i;
3012 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3013 			/* For all RX and TX queues. */
3014 			count_in_mempool
3015 				= rte_mempool_count(vpool_array[index].pool);
3016 
3017 			/*
3018 			 * Transfer all un-attached mbufs from vpool.pool
3019 			 * to vpool.ring.
3020 			 */
3021 			for (i = 0; i < count_in_mempool; i++) {
3022 				struct rte_mbuf *mbuf
3023 					= __rte_mbuf_raw_alloc(
3024 						vpool_array[index].pool);
3025 				rte_ring_sp_enqueue(vpool_array[index].ring,
3026 						(void *)mbuf);
3027 			}
3028 
3029 			LOG_DEBUG(VHOST_CONFIG,
3030 				"in MAIN: mbuf count in mempool at initial "
3031 				"is: %d\n", count_in_mempool);
3032 			LOG_DEBUG(VHOST_CONFIG,
3033 				"in MAIN: mbuf count in  ring at initial  is :"
3034 				" %d\n",
3035 				rte_ring_count(vpool_array[index].ring));
3036 		}
3037 
3038 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3039 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3040 				lcore_id);
3041 	}
3042 
3043 	/* Register CUSE device to handle IOCTLs. */
3044 	ret = rte_vhost_driver_register((char *)&dev_basename);
3045 	if (ret != 0)
3046 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3047 
3048 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3049 
3050 	/* Start CUSE session. */
3051 	rte_vhost_driver_session_start();
3052 	return 0;
3053 
3054 }
3055 
3056