xref: /dpdk/examples/vhost/main.c (revision 8b9bb988f78cfd930b653e7626591544fd846ec8)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
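/*
 * Worked example (hypothetical values, assuming num_switching_cores == 2):
 *   (512 * 1024) + (2 * 32) + (2 * 512) + (2 * 128)
 *   = 524288 + 64 + 1024 + 256 = 525632 mbufs per port.
 */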
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated by the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These two macros need refining for the legacy and DPDK based front ends:
105  * take the max vring avail descriptors/entries from the guest minus
106  * MAX_PKT_BURST, then round to a power of 2.
107  */
108 /*
109  * For the legacy front end, 128 descriptors:
110  * half for the virtio headers, the other half for the mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
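/*
 * The zero copy path uses this to stash the guest descriptor index in the
 * space just after the struct rte_mbuf, e.g. in attach_rxmbuf_zcp():
 *   MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
 * It is read back in virtio_dev_rx_zcp() and txmbuf_clean_zcp().
 */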
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
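/*
 * Examples: POWEROF2(64) is true, POWEROF2(48) is false; note that
 * POWEROF2(0) also evaluates true with this test.
 */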
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
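/*
 * On a little-endian host the low 48 bits of a 64-bit load cover the six
 * bytes of an Ethernet address; see ether_addr_cmp() below.
 */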
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
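/*
 * With a typical 64-byte cache line and the 16-byte struct vring_desc this
 * evaluates to 4 descriptors per cache line.
 */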
141 
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144 
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147 
148 /*Number of switching cores enabled*/
149 static uint32_t num_switching_cores = 0;
150 
151 /* number of devices/queues to support*/
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154 
155 /*
156  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
157  * descriptors. Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161 
162 /* Number of RX/TX descriptors to use (only meaningful with zero copy). */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165 
166 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
167 #define MAX_RING_DESC 4096
168 
169 struct vpool {
170 	struct rte_mempool *pool;
171 	struct rte_ring *ring;
172 	uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174 
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177 	VM2VM_DISABLED = 0,
178 	VM2VM_SOFTWARE = 1,
179 	VM2VM_HARDWARE = 2,
180 	VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
183 
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186 	PHYS_ADDR_CONTINUOUS = 0,
187 	PHYS_ADDR_CROSS_SUBREG = 1,
188 	PHYS_ADDR_INVALID = 2,
189 	PHYS_ADDR_LAST
190 } hpa_type;
191 
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify timeout (in microseconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200 
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203 
204 /* Empty VMDQ configuration structure. Filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206 	.rxmode = {
207 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208 		.split_hdr_size = 0,
209 		.header_split   = 0, /**< Header Split disabled */
210 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
211 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
212 		/*
213 		 * VLAN strip is necessary for 1G NICs such as the I350;
214 		 * it fixes a bug where IPv4 forwarding in the guest could not
215 		 * forward packets from one virtio dev to another virtio dev.
216 		 */
217 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
220 	},
221 
222 	.txmode = {
223 		.mq_mode = ETH_MQ_TX_NONE,
224 	},
225 	.rx_adv_conf = {
226 		/*
227 		 * should be overridden separately in code with
228 		 * appropriate values
229 		 */
230 		.vmdq_rx_conf = {
231 			.nb_queue_pools = ETH_8_POOLS,
232 			.enable_default_pool = 0,
233 			.default_pool = 0,
234 			.nb_pool_maps = 0,
235 			.pool_map = {{0, 0},},
236 		},
237 	},
238 };
239 
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246 
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
251 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
258 
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261 
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265 
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268 
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271 	unsigned len;
272 	unsigned txq_id;
273 	struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275 
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278 
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281 
282 /* Vlan header struct used to insert vlan tags on TX. */
283 struct vlan_ethhdr {
284 	unsigned char   h_dest[ETH_ALEN];
285 	unsigned char   h_source[ETH_ALEN];
286 	__be16          h_vlan_proto;
287 	__be16          h_vlan_TCI;
288 	__be16          h_vlan_encapsulated_proto;
289 };
290 
291 /* IPv4 Header */
292 struct ipv4_hdr {
293 	uint8_t  version_ihl;		/**< version and header length */
294 	uint8_t  type_of_service;	/**< type of service */
295 	uint16_t total_length;		/**< length of packet */
296 	uint16_t packet_id;		/**< packet ID */
297 	uint16_t fragment_offset;	/**< fragmentation offset */
298 	uint8_t  time_to_live;		/**< time to live */
299 	uint8_t  next_proto_id;		/**< protocol ID */
300 	uint16_t hdr_checksum;		/**< header checksum */
301 	uint32_t src_addr;		/**< source address */
302 	uint32_t dst_addr;		/**< destination address */
303 } __attribute__((__packed__));
304 
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
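/* VLAN_ETH_HLEN: 14-byte Ethernet header plus the 4-byte 802.1Q tag. */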
308 
309 /* Per-device statistics struct */
310 struct device_statistics {
311 	uint64_t tx_total;
312 	rte_atomic64_t rx_total_atomic;
313 	uint64_t rx_total;
314 	uint64_t tx;
315 	rte_atomic64_t rx_atomic;
316 	uint64_t rx;
317 } __rte_cache_aligned;
318 struct device_statistics dev_statistics[MAX_DEVICES];
319 
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327 	struct rte_eth_vmdq_rx_conf conf;
328 	struct rte_eth_vmdq_rx_conf *def_conf =
329 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330 	unsigned i;
331 
332 	memset(&conf, 0, sizeof(conf));
333 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334 	conf.nb_pool_maps = num_devices;
335 	conf.enable_loop_back = def_conf->enable_loop_back;
336 	conf.rx_mode = def_conf->rx_mode;
337 
338 	for (i = 0; i < conf.nb_pool_maps; i++) {
339 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
340 		conf.pool_map[i].pools = (1UL << i);
341 	}
342 
343 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346 	return 0;
347 }
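/*
 * Illustration (hypothetical case of num_devices == 8): the loop above yields
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *   pool_map[1] = { .vlan_id = 1001, .pools = 0x02 }
 *   ...
 *   pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }
 * i.e. one VLAN tag and one dedicated pool bit per virtio device.
 */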
348 
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, give the error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357 	if (num_devices > max_nb_devices) {
358 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359 		return -1;
360 	}
361 	return 0;
362 }
363 
364 /*
365  * Initialises a given port using global settings and with the rx buffers
366  * coming from the mbuf_pool passed as parameter
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371 	struct rte_eth_dev_info dev_info;
372 	struct rte_eth_conf port_conf;
373 	struct rte_eth_rxconf *rxconf;
374 	struct rte_eth_txconf *txconf;
375 	int16_t rx_rings, tx_rings;
376 	uint16_t rx_ring_size, tx_ring_size;
377 	int retval;
378 	uint16_t q;
379 
380 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381 	rte_eth_dev_info_get (port, &dev_info);
382 
383 	if (dev_info.max_rx_queues > MAX_QUEUES) {
384 		rte_exit(EXIT_FAILURE,
385 			"please define MAX_QUEUES no less than %u in %s\n",
386 			dev_info.max_rx_queues, __FILE__);
387 	}
388 
389 	rxconf = &dev_info.default_rxconf;
390 	txconf = &dev_info.default_txconf;
391 	rxconf->rx_drop_en = 1;
392 
393 	/* Enable vlan offload */
394 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
395 
396 	/*
397 	 * Zero copy defers queue RX/TX start to the time when guest
398 	 * finishes its startup and packet buffers from that guest are
399 	 * available.
400 	 */
401 	if (zero_copy) {
402 		rxconf->rx_deferred_start = 1;
403 		rxconf->rx_drop_en = 0;
404 		txconf->tx_deferred_start = 1;
405 	}
406 
407 	/*configure the number of supported virtio devices based on VMDQ limits */
408 	num_devices = dev_info.max_vmdq_pools;
409 
410 	if (zero_copy) {
411 		rx_ring_size = num_rx_descriptor;
412 		tx_ring_size = num_tx_descriptor;
413 		tx_rings = dev_info.max_tx_queues;
414 	} else {
415 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
416 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
417 		tx_rings = (uint16_t)rte_lcore_count();
418 	}
419 
420 	retval = validate_num_devices(MAX_DEVICES);
421 	if (retval < 0)
422 		return retval;
423 
424 	/* Get port configuration. */
425 	retval = get_eth_conf(&port_conf, num_devices);
426 	if (retval < 0)
427 		return retval;
428 	/* NIC queues are divided into pf queues and vmdq queues.  */
429 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
430 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
431 	num_vmdq_queues = num_devices * queues_per_pool;
432 	num_queues = num_pf_queues + num_vmdq_queues;
433 	vmdq_queue_base = dev_info.vmdq_queue_base;
434 	vmdq_pool_base  = dev_info.vmdq_pool_base;
435 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
436 		num_pf_queues, num_devices, queues_per_pool);
437 
438 	if (port >= rte_eth_dev_count())
		return -1;
439 
440 	rx_rings = (uint16_t)dev_info.max_rx_queues;
441 	/* Configure ethernet device. */
442 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
443 	if (retval != 0)
444 		return retval;
445 
446 	/* Setup the queues. */
447 	for (q = 0; q < rx_rings; q ++) {
448 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
449 						rte_eth_dev_socket_id(port),
450 						rxconf,
451 						vpool_array[q].pool);
452 		if (retval < 0)
453 			return retval;
454 	}
455 	for (q = 0; q < tx_rings; q ++) {
456 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
457 						rte_eth_dev_socket_id(port),
458 						txconf);
459 		if (retval < 0)
460 			return retval;
461 	}
462 
463 	/* Start the device. */
464 	retval  = rte_eth_dev_start(port);
465 	if (retval < 0) {
466 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
467 		return retval;
468 	}
469 
470 	if (promiscuous)
471 		rte_eth_promiscuous_enable(port);
472 
473 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
474 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
475 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
476 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
477 			(unsigned)port,
478 			vmdq_ports_eth_addr[port].addr_bytes[0],
479 			vmdq_ports_eth_addr[port].addr_bytes[1],
480 			vmdq_ports_eth_addr[port].addr_bytes[2],
481 			vmdq_ports_eth_addr[port].addr_bytes[3],
482 			vmdq_ports_eth_addr[port].addr_bytes[4],
483 			vmdq_ports_eth_addr[port].addr_bytes[5]);
484 
485 	return 0;
486 }
487 
488 /*
489  * Set character device basename.
490  */
491 static int
492 us_vhost_parse_basename(const char *q_arg)
493 {
494 	/* Copy the basename string, rejecting names that would be truncated. */
495 
496 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
497 		return -1;
498 	else
499 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
500 
501 	return 0;
502 }
503 
504 /*
505  * Parse the portmask provided at run time.
506  */
507 static int
508 parse_portmask(const char *portmask)
509 {
510 	char *end = NULL;
511 	unsigned long pm;
512 
513 	errno = 0;
514 
515 	/* parse hexadecimal string */
516 	pm = strtoul(portmask, &end, 16);
517 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
518 		return -1;
519 
520 	if (pm == 0)
521 		return -1;
522 
523 	return pm;
524 
525 }
526 
527 /*
528  * Parse num options at run time.
529  */
530 static int
531 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
532 {
533 	char *end = NULL;
534 	unsigned long num;
535 
536 	errno = 0;
537 
538 	/* parse unsigned int string */
539 	num = strtoul(q_arg, &end, 10);
540 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
541 		return -1;
542 
543 	if (num > max_valid_value)
544 		return -1;
545 
546 	return num;
547 
548 }
549 
550 /*
551  * Display usage
552  */
553 static void
554 us_vhost_usage(const char *prgname)
555 {
556 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
557 	"		--vm2vm [0|1|2]\n"
558 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
559 	"		--dev-basename <name>\n"
560 	"		--nb-devices ND\n"
561 	"		-p PORTMASK: Set mask for ports to be used by application\n"
562 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
563 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
564 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
565 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
566 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
567 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
568 	"		--dev-basename: The basename to be used for the character device.\n"
569 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
570 			"zero copy\n"
571 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
572 			"used only when zero copy is enabled.\n"
573 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
574 			"used only when zero copy is enabled.\n",
575 	       prgname);
576 }
577 
578 /*
579  * Parse the arguments given in the command line of the application.
580  */
581 static int
582 us_vhost_parse_args(int argc, char **argv)
583 {
584 	int opt, ret;
585 	int option_index;
586 	unsigned i;
587 	const char *prgname = argv[0];
588 	static struct option long_option[] = {
589 		{"vm2vm", required_argument, NULL, 0},
590 		{"rx-retry", required_argument, NULL, 0},
591 		{"rx-retry-delay", required_argument, NULL, 0},
592 		{"rx-retry-num", required_argument, NULL, 0},
593 		{"mergeable", required_argument, NULL, 0},
594 		{"stats", required_argument, NULL, 0},
595 		{"dev-basename", required_argument, NULL, 0},
596 		{"zero-copy", required_argument, NULL, 0},
597 		{"rx-desc-num", required_argument, NULL, 0},
598 		{"tx-desc-num", required_argument, NULL, 0},
599 		{NULL, 0, 0, 0},
600 	};
601 
602 	/* Parse command line */
603 	while ((opt = getopt_long(argc, argv, "p:P",
604 			long_option, &option_index)) != EOF) {
605 		switch (opt) {
606 		/* Portmask */
607 		case 'p':
608 			enabled_port_mask = parse_portmask(optarg);
609 			if (enabled_port_mask == 0) {
610 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611 				us_vhost_usage(prgname);
612 				return -1;
613 			}
614 			break;
615 
616 		case 'P':
617 			promiscuous = 1;
618 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
619 				ETH_VMDQ_ACCEPT_BROADCAST |
620 				ETH_VMDQ_ACCEPT_MULTICAST;
621 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
622 
623 			break;
624 
625 		case 0:
626 			/* Enable/disable vm2vm comms. */
627 			if (!strncmp(long_option[option_index].name, "vm2vm",
628 				MAX_LONG_OPT_SZ)) {
629 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
630 				if (ret == -1) {
631 					RTE_LOG(INFO, VHOST_CONFIG,
632 						"Invalid argument for "
633 						"vm2vm [0|1|2]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					vm2vm_mode = (vm2vm_type)ret;
638 				}
639 			}
640 
641 			/* Enable/disable retries on RX. */
642 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
643 				ret = parse_num_opt(optarg, 1);
644 				if (ret == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
646 					us_vhost_usage(prgname);
647 					return -1;
648 				} else {
649 					enable_retry = ret;
650 				}
651 			}
652 
653 			/* Specify the retries delay time (in useconds) on RX. */
654 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
655 				ret = parse_num_opt(optarg, INT32_MAX);
656 				if (ret == -1) {
657 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
658 					us_vhost_usage(prgname);
659 					return -1;
660 				} else {
661 					burst_rx_delay_time = ret;
662 				}
663 			}
664 
665 			/* Specify the retries number on RX. */
666 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
667 				ret = parse_num_opt(optarg, INT32_MAX);
668 				if (ret == -1) {
669 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
670 					us_vhost_usage(prgname);
671 					return -1;
672 				} else {
673 					burst_rx_retry_num = ret;
674 				}
675 			}
676 
677 			/* Enable/disable RX mergeable buffers. */
678 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
679 				ret = parse_num_opt(optarg, 1);
680 				if (ret == -1) {
681 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
682 					us_vhost_usage(prgname);
683 					return -1;
684 				} else {
685 					mergeable = !!ret;
686 					if (ret) {
687 						vmdq_conf_default.rxmode.jumbo_frame = 1;
688 						vmdq_conf_default.rxmode.max_rx_pkt_len
689 							= JUMBO_FRAME_MAX_SIZE;
690 					}
691 				}
692 			}
693 
694 			/* Enable/disable stats. */
695 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
696 				ret = parse_num_opt(optarg, INT32_MAX);
697 				if (ret == -1) {
698 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
699 					us_vhost_usage(prgname);
700 					return -1;
701 				} else {
702 					enable_stats = ret;
703 				}
704 			}
705 
706 			/* Set character device basename. */
707 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
708 				if (us_vhost_parse_basename(optarg) == -1) {
709 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
710 					us_vhost_usage(prgname);
711 					return -1;
712 				}
713 			}
714 
715 			/* Enable/disable rx/tx zero copy. */
716 			if (!strncmp(long_option[option_index].name,
717 				"zero-copy", MAX_LONG_OPT_SZ)) {
718 				ret = parse_num_opt(optarg, 1);
719 				if (ret == -1) {
720 					RTE_LOG(INFO, VHOST_CONFIG,
721 						"Invalid argument"
722 						" for zero-copy [0|1]\n");
723 					us_vhost_usage(prgname);
724 					return -1;
725 				} else
726 					zero_copy = ret;
727 
728 				if (zero_copy) {
729 #ifdef RTE_MBUF_REFCNT
730 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
731 					"zero copy vhost APP, please "
732 					"disable RTE_MBUF_REFCNT\n"
733 					"in config file and then rebuild DPDK "
734 					"core lib!\n"
735 					"Otherwise please disable zero copy "
736 					"flag in command line!\n");
737 					return -1;
738 #endif
739 				}
740 			}
741 
742 			/* Specify the descriptor number on RX. */
743 			if (!strncmp(long_option[option_index].name,
744 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
745 				ret = parse_num_opt(optarg, MAX_RING_DESC);
746 				if ((ret == -1) || (!POWEROF2(ret))) {
747 					RTE_LOG(INFO, VHOST_CONFIG,
748 					"Invalid argument for rx-desc-num [0-N], "
749 					"power of 2 required.\n");
750 					us_vhost_usage(prgname);
751 					return -1;
752 				} else {
753 					num_rx_descriptor = ret;
754 				}
755 			}
756 
757 			/* Specify the descriptor number on TX. */
758 			if (!strncmp(long_option[option_index].name,
759 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
760 				ret = parse_num_opt(optarg, MAX_RING_DESC);
761 				if ((ret == -1) || (!POWEROF2(ret))) {
762 					RTE_LOG(INFO, VHOST_CONFIG,
763 					"Invalid argument for tx-desc-num [0-N], "
764 					"power of 2 required.\n");
765 					us_vhost_usage(prgname);
766 					return -1;
767 				} else {
768 					num_tx_descriptor = ret;
769 				}
770 			}
771 
772 			break;
773 
774 			/* Invalid option - print options. */
775 		default:
776 			us_vhost_usage(prgname);
777 			return -1;
778 		}
779 	}
780 
781 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
782 		if (enabled_port_mask & (1 << i))
783 			ports[num_ports++] = (uint8_t)i;
784 	}
785 
786 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
787 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
788 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
789 		return -1;
790 	}
791 
792 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
793 		RTE_LOG(INFO, VHOST_PORT,
794 			"Vhost zero copy doesn't support software vm2vm,"
795 			" please specify '--vm2vm 2' to use hardware vm2vm.\n");
796 		return -1;
797 	}
798 
799 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
800 		RTE_LOG(INFO, VHOST_PORT,
801 			"Vhost zero copy doesn't support jumbo frame,"
802 			" please specify '--mergeable 0' to disable the "
803 			"mergeable feature.\n");
804 		return -1;
805 	}
806 
807 	return 0;
808 }
809 
810 /*
811  * Update the global var num_ports and array ports according to the number of
812  * ports in the system and return the number of valid ports.
813  */
814 static unsigned check_ports_num(unsigned nb_ports)
815 {
816 	unsigned valid_num_ports = num_ports;
817 	unsigned portid;
818 
819 	if (num_ports > nb_ports) {
820 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
821 			num_ports, nb_ports);
822 		num_ports = nb_ports;
823 	}
824 
825 	for (portid = 0; portid < num_ports; portid ++) {
826 		if (ports[portid] >= nb_ports) {
827 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
828 				ports[portid], (nb_ports - 1));
829 			ports[portid] = INVALID_PORT_ID;
830 			valid_num_ports--;
831 		}
832 	}
833 	return valid_num_ports;
834 }
835 
836 /*
837  * Macro to print out packet contents. Wrapped in debug define so that the
838  * data path is not affected when debug is disabled.
839  */
840 #ifdef DEBUG
841 #define PRINT_PACKET(device, addr, size, header) do {																\
842 	char *pkt_addr = (char*)(addr);																					\
843 	unsigned int index;																								\
844 	char packet[MAX_PRINT_BUFF];																					\
845 																													\
846 	if ((header))																									\
847 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
848 	else																											\
849 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
850 	for (index = 0; index < (size); index++) {																		\
851 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
852 			"%02hhx ", pkt_addr[index]);																			\
853 	}																												\
854 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
855 																													\
856 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
857 } while(0)
858 #else
859 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
860 #endif
861 
862 /*
863  * Function to convert guest physical addresses to vhost physical addresses.
864  * This is used to convert virtio buffer addresses.
865  */
866 static inline uint64_t __attribute__((always_inline))
867 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
868 	uint32_t buf_len, hpa_type *addr_type)
869 {
870 	struct virtio_memory_regions_hpa *region;
871 	uint32_t regionidx;
872 	uint64_t vhost_pa = 0;
873 
874 	*addr_type = PHYS_ADDR_INVALID;
875 
876 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
877 		region = &vdev->regions_hpa[regionidx];
878 		if ((guest_pa >= region->guest_phys_address) &&
879 			(guest_pa <= region->guest_phys_address_end)) {
880 			vhost_pa = region->host_phys_addr_offset + guest_pa;
881 			if (likely((guest_pa + buf_len - 1)
882 				<= region->guest_phys_address_end))
883 				*addr_type = PHYS_ADDR_CONTINUOUS;
884 			else
885 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
886 			break;
887 		}
888 	}
889 
890 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
891 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
892 		(void *)(uintptr_t)vhost_pa);
893 
894 	return vhost_pa;
895 }
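/*
 * Callers inspect the returned addr_type: for example, attach_rxmbuf_zcp()
 * below puts the descriptor straight back on the used list when the buffer
 * is reported as PHYS_ADDR_INVALID or PHYS_ADDR_CROSS_SUBREG.
 */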
896 
897 /*
898  * Compares a packet destination MAC address to a device MAC address.
899  */
900 static inline int __attribute__((always_inline))
901 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
902 {
903 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
904 }
905 
906 /*
907  * This function learns the MAC address of the device and registers this along with a
908  * vlan tag to a VMDQ.
909  */
910 static int
911 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
912 {
913 	struct ether_hdr *pkt_hdr;
914 	struct virtio_net_data_ll *dev_ll;
915 	struct virtio_net *dev = vdev->dev;
916 	int i, ret;
917 
918 	/* Learn MAC address of guest device from packet */
919 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
920 
921 	dev_ll = ll_root_used;
922 
923 	while (dev_ll != NULL) {
924 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
925 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
926 			return -1;
927 		}
928 		dev_ll = dev_ll->next;
929 	}
930 
931 	for (i = 0; i < ETHER_ADDR_LEN; i++)
932 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
933 
934 	/* vlan_tag currently uses the device_id. */
935 	vdev->vlan_tag = vlan_tags[dev->device_fh];
936 
937 	/* Print out VMDQ registration info. */
938 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
939 		dev->device_fh,
940 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
941 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
942 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
943 		vdev->vlan_tag);
944 
945 	/* Register the MAC address. */
946 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
947 				(uint32_t)dev->device_fh + vmdq_pool_base);
948 	if (ret)
949 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
950 					dev->device_fh);
951 
952 	/* Enable stripping of the vlan tag as we handle routing. */
953 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
954 
955 	/* Set device as ready for RX. */
956 	vdev->ready = DEVICE_RX;
957 
958 	return 0;
959 }
960 
961 /*
962  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
963  * queue before disabling RX on the device.
964  */
965 static inline void
966 unlink_vmdq(struct vhost_dev *vdev)
967 {
968 	unsigned i = 0;
969 	unsigned rx_count;
970 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
971 
972 	if (vdev->ready == DEVICE_RX) {
973 		/*clear MAC and VLAN settings*/
974 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
975 		for (i = 0; i < 6; i++)
976 			vdev->mac_address.addr_bytes[i] = 0;
977 
978 		vdev->vlan_tag = 0;
979 
980 		/*Clear out the receive buffers*/
981 		rx_count = rte_eth_rx_burst(ports[0],
982 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
983 
984 		while (rx_count) {
985 			for (i = 0; i < rx_count; i++)
986 				rte_pktmbuf_free(pkts_burst[i]);
987 
988 			rx_count = rte_eth_rx_burst(ports[0],
989 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
990 		}
991 
992 		vdev->ready = DEVICE_MAC_LEARNING;
993 	}
994 }
995 
996 /*
997  * Check if the packet destination MAC address is for a local device. If so then put
998  * the packet on that device's RX queue. If not then return.
999  */
1000 static inline int __attribute__((always_inline))
1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1002 {
1003 	struct virtio_net_data_ll *dev_ll;
1004 	struct ether_hdr *pkt_hdr;
1005 	uint64_t ret = 0;
1006 	struct virtio_net *dev = vdev->dev;
1007 	struct virtio_net *tdev; /* destination virtio device */
1008 
1009 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1010 
1011 	/*get the used devices list*/
1012 	dev_ll = ll_root_used;
1013 
1014 	while (dev_ll != NULL) {
1015 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1016 				          &dev_ll->vdev->mac_address)) {
1017 
1018 			/* Drop the packet if the TX packet is destined for the TX device. */
1019 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1020 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1021 							dev->device_fh);
1022 				return 0;
1023 			}
1024 			tdev = dev_ll->vdev->dev;
1025 
1026 
1027 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1028 
1029 			if (unlikely(dev_ll->vdev->remove)) {
1030 				/*drop the packet if the device is marked for removal*/
1031 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1032 			} else {
1033 				/*send the packet to the local virtio device*/
1034 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1035 				if (enable_stats) {
1036 					rte_atomic64_add(
1037 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1038 					1);
1039 					rte_atomic64_add(
1040 					&dev_statistics[tdev->device_fh].rx_atomic,
1041 					ret);
1042 					dev_statistics[tdev->device_fh].tx_total++;
1043 					dev_statistics[tdev->device_fh].tx += ret;
1044 				}
1045 			}
1046 
1047 			return 0;
1048 		}
1049 		dev_ll = dev_ll->next;
1050 	}
1051 
1052 	return -1;
1053 }
1054 
1055 /*
1056  * Check if the destination MAC of a packet belongs to a local VM, and if so
1057  * get its VLAN tag and the length offset to apply.
1058  */
1059 static inline int __attribute__((always_inline))
1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1061 	uint32_t *offset, uint16_t *vlan_tag)
1062 {
1063 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1064 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1065 
1066 	while (dev_ll != NULL) {
1067 		if ((dev_ll->vdev->ready == DEVICE_RX)
1068 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1069 		&dev_ll->vdev->mac_address)) {
1070 			/*
1071 			 * Drop the packet if the TX packet is
1072 			 * destined for the TX device.
1073 			 */
1074 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1075 				LOG_DEBUG(VHOST_DATA,
1076 				"(%"PRIu64") TX: Source and destination"
1077 				" MAC addresses are the same. Dropping "
1078 				"packet.\n",
1079 				dev_ll->vdev->dev->device_fh);
1080 				return -1;
1081 			}
1082 
1083 			/*
1084 			 * HW VLAN strip will reduce the packet length by the
1085 			 * length of the VLAN tag, so the packet length needs
1086 			 * to be restored by adding it back.
1087 			 */
1088 			*offset = VLAN_HLEN;
1089 			*vlan_tag =
1090 			(uint16_t)
1091 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1092 
1093 			LOG_DEBUG(VHOST_DATA,
1094 			"(%"PRIu64") TX: pkt to local VM device id:"
1095 			"(%"PRIu64") vlan tag: %d.\n",
1096 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1097 			*vlan_tag);
1098 
1099 			break;
1100 		}
1101 		dev_ll = dev_ll->next;
1102 	}
1103 	return 0;
1104 }
1105 
1106 /*
1107  * This function routes the TX packet to the correct interface. This may be a local device
1108  * or the physical port.
1109  */
1110 static inline void __attribute__((always_inline))
1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1112 {
1113 	struct mbuf_table *tx_q;
1114 	struct rte_mbuf **m_table;
1115 	unsigned len, ret, offset = 0;
1116 	const uint16_t lcore_id = rte_lcore_id();
1117 	struct virtio_net *dev = vdev->dev;
1118 	struct ether_hdr *nh;
1119 
1120 	/*check if destination is local VM*/
1121 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1122 		rte_pktmbuf_free(m);
1123 		return;
1124 	}
1125 
1126 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1127 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1128 			rte_pktmbuf_free(m);
1129 			return;
1130 		}
1131 	}
1132 
1133 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1134 
1135 	/*Add packet to the port tx queue*/
1136 	tx_q = &lcore_tx_queue[lcore_id];
1137 	len = tx_q->len;
1138 
1139 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1140 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1141 		/* Guest has inserted the vlan tag. */
1142 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1143 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1144 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1145 			(vh->vlan_tci != vlan_tag_be))
1146 			vh->vlan_tci = vlan_tag_be;
1147 	} else {
1148 		m->ol_flags = PKT_TX_VLAN_PKT;
1149 
1150 		/*
1151 		 * Find the right seg to adjust the data len when offset is
1152 		 * bigger than tail room size.
1153 		 */
1154 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1155 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1156 				m->data_len += offset;
1157 			else {
1158 				struct rte_mbuf *seg = m;
1159 
1160 				while ((seg->next != NULL) &&
1161 					(offset > rte_pktmbuf_tailroom(seg)))
1162 					seg = seg->next;
1163 
1164 				seg->data_len += offset;
1165 			}
1166 			m->pkt_len += offset;
1167 		}
1168 
1169 		m->vlan_tci = vlan_tag;
1170 	}
1171 
1172 	tx_q->m_table[len] = m;
1173 	len++;
1174 	if (enable_stats) {
1175 		dev_statistics[dev->device_fh].tx_total++;
1176 		dev_statistics[dev->device_fh].tx++;
1177 	}
1178 
1179 	if (unlikely(len == MAX_PKT_BURST)) {
1180 		m_table = (struct rte_mbuf **)tx_q->m_table;
1181 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1182 		/* Free any buffers not handled by TX and update the port stats. */
1183 		if (unlikely(ret < len)) {
1184 			do {
1185 				rte_pktmbuf_free(m_table[ret]);
1186 			} while (++ret < len);
1187 		}
1188 
1189 		len = 0;
1190 	}
1191 
1192 	tx_q->len = len;
1193 	return;
1194 }
1195 /*
1196  * This function is called by each data core. It handles all RX/TX registered with the
1197  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1198  * with all devices in the main linked list.
1199  */
1200 static int
1201 switch_worker(void *arg)
1202 {
1203 	struct rte_mempool *mbuf_pool = arg;
1204 	struct virtio_net *dev = NULL;
1205 	struct vhost_dev *vdev = NULL;
1206 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1207 	struct virtio_net_data_ll *dev_ll;
1208 	struct mbuf_table *tx_q;
1209 	volatile struct lcore_ll_info *lcore_ll;
1210 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
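	/*
	 * Example (hypothetical 2 GHz TSC): ceil(2e9 / 1e6) * 100
	 * = 200000 cycles, i.e. roughly 100us between TX drains.
	 */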
1211 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1212 	unsigned ret, i;
1213 	const uint16_t lcore_id = rte_lcore_id();
1214 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1215 	uint16_t rx_count = 0;
1216 	uint16_t tx_count;
1217 	uint32_t retry = 0;
1218 
1219 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1220 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1221 	prev_tsc = 0;
1222 
1223 	tx_q = &lcore_tx_queue[lcore_id];
1224 	for (i = 0; i < num_cores; i ++) {
1225 		if (lcore_ids[i] == lcore_id) {
1226 			tx_q->txq_id = i;
1227 			break;
1228 		}
1229 	}
1230 
1231 	while(1) {
1232 		cur_tsc = rte_rdtsc();
1233 		/*
1234 		 * TX burst queue drain
1235 		 */
1236 		diff_tsc = cur_tsc - prev_tsc;
1237 		if (unlikely(diff_tsc > drain_tsc)) {
1238 
1239 			if (tx_q->len) {
1240 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1241 
1242 				/*Tx any packets in the queue*/
1243 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1244 									   (struct rte_mbuf **)tx_q->m_table,
1245 									   (uint16_t)tx_q->len);
1246 				if (unlikely(ret < tx_q->len)) {
1247 					do {
1248 						rte_pktmbuf_free(tx_q->m_table[ret]);
1249 					} while (++ret < tx_q->len);
1250 				}
1251 
1252 				tx_q->len = 0;
1253 			}
1254 
1255 			prev_tsc = cur_tsc;
1256 
1257 		}
1258 
1259 		rte_prefetch0(lcore_ll->ll_root_used);
1260 		/*
1261 		 * Inform the configuration core that we have exited the linked list and that no devices are
1262 		 * in use if requested.
1263 		 */
1264 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1265 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1266 
1267 		/*
1268 		 * Process devices
1269 		 */
1270 		dev_ll = lcore_ll->ll_root_used;
1271 
1272 		while (dev_ll != NULL) {
1273 			/*get virtio device ID*/
1274 			vdev = dev_ll->vdev;
1275 			dev = vdev->dev;
1276 
1277 			if (unlikely(vdev->remove)) {
1278 				dev_ll = dev_ll->next;
1279 				unlink_vmdq(vdev);
1280 				vdev->ready = DEVICE_SAFE_REMOVE;
1281 				continue;
1282 			}
1283 			if (likely(vdev->ready == DEVICE_RX)) {
1284 				/*Handle guest RX*/
1285 				rx_count = rte_eth_rx_burst(ports[0],
1286 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1287 
1288 				if (rx_count) {
1289 					/*
1290 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1291 					* Note that MAX_PKT_BURST must be less than the virtio queue size.
1292 					*/
1293 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1294 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1295 							rte_delay_us(burst_rx_delay_time);
1296 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1297 								break;
1298 						}
1299 					}
1300 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1301 					if (enable_stats) {
1302 						rte_atomic64_add(
1303 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1304 						rx_count);
1305 						rte_atomic64_add(
1306 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1307 					}
1308 					while (likely(rx_count)) {
1309 						rx_count--;
1310 						rte_pktmbuf_free(pkts_burst[rx_count]);
1311 					}
1312 
1313 				}
1314 			}
1315 
1316 			if (likely(!vdev->remove)) {
1317 				/* Handle guest TX*/
1318 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1319 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1320 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1321 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1322 						while (tx_count)
1323 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1324 					}
1325 				}
1326 				while (tx_count)
1327 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1328 			}
1329 
1330 			/*move to the next device in the list*/
1331 			dev_ll = dev_ll->next;
1332 		}
1333 	}
1334 
1335 	return 0;
1336 }
1337 
1338 /*
1339  * This function gets available ring number for zero copy rx.
1340  * Only one thread will call this function for a particular virtio device,
1341  * so it is designed as a non-thread-safe function.
1342  */
1343 static inline uint32_t __attribute__((always_inline))
1344 get_available_ring_num_zcp(struct virtio_net *dev)
1345 {
1346 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1347 	uint16_t avail_idx;
1348 
1349 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1350 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1351 }
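/*
 * Note: the uint16_t subtraction above relies on modulo-65536 arithmetic,
 * so the result stays correct when avail->idx wraps around.
 */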
1352 
1353 /*
1354  * This function gets available ring index for zero copy rx,
1355  * retrying 'burst_rx_retry_num' times until it gets enough ring entries.
1356  * Only one thread will call this function for a particular virtio device,
1357  * so it is designed as a non-thread-safe function.
1358  */
1359 static inline uint32_t __attribute__((always_inline))
1360 get_available_ring_index_zcp(struct virtio_net *dev,
1361 	uint16_t *res_base_idx, uint32_t count)
1362 {
1363 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1364 	uint16_t avail_idx;
1365 	uint32_t retry = 0;
1366 	uint16_t free_entries;
1367 
1368 	*res_base_idx = vq->last_used_idx_res;
1369 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1370 	free_entries = (avail_idx - *res_base_idx);
1371 
1372 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1373 			"avail idx: %d, "
1374 			"res base idx:%d, free entries:%d\n",
1375 			dev->device_fh, avail_idx, *res_base_idx,
1376 			free_entries);
1377 
1378 	/*
1379 	 * If retry is enabled and the queue is full then we wait
1380 	 * and retry to avoid packet loss.
1381 	 */
1382 	if (enable_retry && unlikely(count > free_entries)) {
1383 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1384 			rte_delay_us(burst_rx_delay_time);
1385 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1386 			free_entries = (avail_idx - *res_base_idx);
1387 			if (count <= free_entries)
1388 				break;
1389 		}
1390 	}
1391 
1392 	/*check that we have enough buffers*/
1393 	if (unlikely(count > free_entries))
1394 		count = free_entries;
1395 
1396 	if (unlikely(count == 0)) {
1397 		LOG_DEBUG(VHOST_DATA,
1398 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1399 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1400 			dev->device_fh, avail_idx,
1401 			*res_base_idx, free_entries);
1402 		return 0;
1403 	}
1404 
1405 	vq->last_used_idx_res = *res_base_idx + count;
1406 
1407 	return count;
1408 }
1409 
1410 /*
1411  * This function puts a descriptor back onto the used list.
1412  */
1413 static inline void __attribute__((always_inline))
1414 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1415 {
1416 	uint16_t res_cur_idx = vq->last_used_idx;
1417 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1418 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1419 	rte_compiler_barrier();
1420 	*(volatile uint16_t *)&vq->used->idx += 1;
1421 	vq->last_used_idx += 1;
1422 
1423 	/* Kick the guest if necessary. */
1424 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1425 		eventfd_write((int)vq->kickfd, 1);
1426 }
1427 
1428 /*
1429  * This function gets an available descriptor from the virtio vring and an
1430  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1431  * to adjust the offset for buff_addr and phys_addr according to the PMD
1432  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1433  */
1434 static inline void __attribute__((always_inline))
1435 attach_rxmbuf_zcp(struct virtio_net *dev)
1436 {
1437 	uint16_t res_base_idx, desc_idx;
1438 	uint64_t buff_addr, phys_addr;
1439 	struct vhost_virtqueue *vq;
1440 	struct vring_desc *desc;
1441 	struct rte_mbuf *mbuf = NULL;
1442 	struct vpool *vpool;
1443 	hpa_type addr_type;
1444 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1445 
1446 	vpool = &vpool_array[vdev->vmdq_rx_q];
1447 	vq = dev->virtqueue[VIRTIO_RXQ];
1448 
1449 	do {
1450 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1451 				1) != 1))
1452 			return;
1453 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1454 
1455 		desc = &vq->desc[desc_idx];
1456 		if (desc->flags & VRING_DESC_F_NEXT) {
1457 			desc = &vq->desc[desc->next];
1458 			buff_addr = gpa_to_vva(dev, desc->addr);
1459 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1460 					&addr_type);
1461 		} else {
1462 			buff_addr = gpa_to_vva(dev,
1463 					desc->addr + vq->vhost_hlen);
1464 			phys_addr = gpa_to_hpa(vdev,
1465 					desc->addr + vq->vhost_hlen,
1466 					desc->len, &addr_type);
1467 		}
1468 
1469 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1470 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1471 				" address found when attaching RX frame buffer"
1472 				" address!\n", dev->device_fh);
1473 			put_desc_to_used_list_zcp(vq, desc_idx);
1474 			continue;
1475 		}
1476 
1477 		/*
1478 		 * Check if the frame buffer address from guest crosses
1479 		 * sub-region or not.
1480 		 */
1481 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1482 			RTE_LOG(ERR, VHOST_DATA,
1483 				"(%"PRIu64") Frame buffer address crossing a "
1484 				"sub-region found when attaching RX frame "
1485 				"buffer address!\n",
1486 				dev->device_fh);
1487 			put_desc_to_used_list_zcp(vq, desc_idx);
1488 			continue;
1489 		}
1490 	} while (unlikely(phys_addr == 0));
1491 
1492 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1493 	if (unlikely(mbuf == NULL)) {
1494 		LOG_DEBUG(VHOST_DATA,
1495 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1496 			"ring_sc_dequeue fail.\n",
1497 			dev->device_fh);
1498 		put_desc_to_used_list_zcp(vq, desc_idx);
1499 		return;
1500 	}
1501 
1502 	if (unlikely(vpool->buf_size > desc->len)) {
1503 		LOG_DEBUG(VHOST_DATA,
1504 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1505 			"length(%d) of descriptor idx: %d less than room "
1506 			"size required: %d\n",
1507 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1508 		put_desc_to_used_list_zcp(vq, desc_idx);
1509 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1510 		return;
1511 	}
1512 
1513 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1514 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1515 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1516 	mbuf->data_len = desc->len;
1517 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1518 
1519 	LOG_DEBUG(VHOST_DATA,
1520 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1521 		"descriptor idx:%d\n",
1522 		dev->device_fh, res_base_idx, desc_idx);
1523 
1524 	__rte_mbuf_raw_free(mbuf);
1525 
1526 	return;
1527 }
1528 
1529 /*
1530  * Detach an attached packet mbuf -
1531  *  - restore original mbuf address and length values.
1532  *  - reset pktmbuf data and data_len to their default values.
1533  *  All other fields of the given packet mbuf will be left intact.
1534  *
1535  * @param m
1536  *   The attached packet mbuf.
1537  */
1538 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1539 {
1540 	const struct rte_mempool *mp = m->pool;
1541 	void *buf = RTE_MBUF_TO_BADDR(m);
1542 	uint32_t buf_ofs;
1543 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1544 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1545 
1546 	m->buf_addr = buf;
1547 	m->buf_len = (uint16_t)buf_len;
1548 
1549 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1550 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1551 	m->data_off = buf_ofs;
1552 
1553 	m->data_len = 0;
1554 }
1555 
1556 /*
1557  * This function is called after packets have been transmitted. It fetches
1558  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It also
1559  * updates the used index and kicks the guest if necessary.
1560  */
1561 static inline uint32_t __attribute__((always_inline))
1562 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1563 {
1564 	struct rte_mbuf *mbuf;
1565 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1566 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1567 	uint32_t index = 0;
1568 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1569 
1570 	LOG_DEBUG(VHOST_DATA,
1571 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1572 		"clean is: %d\n",
1573 		dev->device_fh, mbuf_count);
1574 	LOG_DEBUG(VHOST_DATA,
1575 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1576 		"clean  is : %d\n",
1577 		dev->device_fh, rte_ring_count(vpool->ring));
1578 
1579 	for (index = 0; index < mbuf_count; index++) {
1580 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1581 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1582 			pktmbuf_detach_zcp(mbuf);
1583 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1584 
1585 		/* Update used index buffer information. */
1586 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1587 		vq->used->ring[used_idx].len = 0;
1588 
1589 		used_idx = (used_idx + 1) & (vq->size - 1);
1590 	}
1591 
1592 	LOG_DEBUG(VHOST_DATA,
1593 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1594 		"clean is: %d\n",
1595 		dev->device_fh, rte_mempool_count(vpool->pool));
1596 	LOG_DEBUG(VHOST_DATA,
1597 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1598 		"clean  is : %d\n",
1599 		dev->device_fh, rte_ring_count(vpool->ring));
1600 	LOG_DEBUG(VHOST_DATA,
1601 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1602 		"vq->last_used_idx:%d\n",
1603 		dev->device_fh, vq->last_used_idx);
1604 
1605 	vq->last_used_idx += mbuf_count;
1606 
1607 	LOG_DEBUG(VHOST_DATA,
1608 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1609 		"vq->last_used_idx:%d\n",
1610 		dev->device_fh, vq->last_used_idx);
1611 
1612 	rte_compiler_barrier();
1613 
1614 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1615 
1616 	/* Kick guest if required. */
1617 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1618 		eventfd_write((int)vq->kickfd, 1);
1619 
1620 	return 0;
1621 }
1622 
1623 /*
1624  * This function is called when a virtio device is destroyed.
1625  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1626  */
1627 static void mbuf_destroy_zcp(struct vpool *vpool)
1628 {
1629 	struct rte_mbuf *mbuf = NULL;
1630 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1631 
1632 	LOG_DEBUG(VHOST_CONFIG,
1633 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1634 		"mbuf_destroy_zcp is: %d\n",
1635 		mbuf_count);
1636 	LOG_DEBUG(VHOST_CONFIG,
1637 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1638 		"mbuf_destroy_zcp  is : %d\n",
1639 		rte_ring_count(vpool->ring));
1640 
1641 	for (index = 0; index < mbuf_count; index++) {
1642 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1643 		if (likely(mbuf != NULL)) {
1644 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1645 				pktmbuf_detach_zcp(mbuf);
1646 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1647 		}
1648 	}
1649 
1650 	LOG_DEBUG(VHOST_CONFIG,
1651 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1652 		"mbuf_destroy_zcp is: %d\n",
1653 		rte_mempool_count(vpool->pool));
1654 	LOG_DEBUG(VHOST_CONFIG,
1655 		"in mbuf_destroy_zcp: mbuf count in ring after "
1656 		"mbuf_destroy_zcp is : %d\n",
1657 		rte_ring_count(vpool->ring));
1658 }
1659 
1660 /*
1661  * This function completes zero copy RX to the guest: it writes the virtio headers, updates the used ring and counter, and kicks the guest if needed.
1662  */
1663 static inline uint32_t __attribute__((always_inline))
1664 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1665 	uint32_t count)
1666 {
1667 	struct vhost_virtqueue *vq;
1668 	struct vring_desc *desc;
1669 	struct rte_mbuf *buff;
1670 	/* The virtio_hdr is initialised to 0. */
1671 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1672 		= {{0, 0, 0, 0, 0, 0}, 0};
1673 	uint64_t buff_hdr_addr = 0;
1674 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1675 	uint32_t head_idx, packet_success = 0;
1676 	uint16_t res_cur_idx;
1677 
1678 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1679 
1680 	if (count == 0)
1681 		return 0;
1682 
1683 	vq = dev->virtqueue[VIRTIO_RXQ];
1684 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1685 
1686 	res_cur_idx = vq->last_used_idx;
1687 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1688 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1689 
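	/*
	 * In zero-copy RX each mbuf's headroom already carries the index of
	 * the guest descriptor backing its buffer, so head[] below refers
	 * directly to entries of vq->desc rather than to avail-ring slots.
	 */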
1690 	/* Retrieve all of the head indexes first to avoid caching issues. */
1691 	for (head_idx = 0; head_idx < count; head_idx++)
1692 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1693 
1694 	/*Prefetch descriptor index. */
1695 	rte_prefetch0(&vq->desc[head[packet_success]]);
1696 
1697 	while (packet_success != count) {
1698 		/* Get descriptor from available ring */
1699 		desc = &vq->desc[head[packet_success]];
1700 
1701 		buff = pkts[packet_success];
1702 		LOG_DEBUG(VHOST_DATA,
1703 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1704 			"pkt[%d] descriptor idx: %d\n",
1705 			dev->device_fh, packet_success,
1706 			MBUF_HEADROOM_UINT32(buff));
1707 
1708 		PRINT_PACKET(dev,
1709 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1710 			+ RTE_PKTMBUF_HEADROOM),
1711 			rte_pktmbuf_data_len(buff), 0);
1712 
1713 		/* Buffer address translation for virtio header. */
1714 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1715 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1716 
1717 		/*
1718 		 * If the descriptors are chained the header and data are
1719 		 * placed in separate buffers.
1720 		 */
1721 		if (desc->flags & VRING_DESC_F_NEXT) {
1722 			desc->len = vq->vhost_hlen;
1723 			desc = &vq->desc[desc->next];
1724 			desc->len = rte_pktmbuf_data_len(buff);
1725 		} else {
1726 			desc->len = packet_len;
1727 		}
1728 
1729 		/* Update used ring with desc information */
1730 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1731 			= head[packet_success];
1732 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1733 			= packet_len;
1734 		res_cur_idx++;
1735 		packet_success++;
1736 
1737 		/* A header is required per buffer. */
1738 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1739 			(const void *)&virtio_hdr, vq->vhost_hlen);
1740 
1741 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1742 
1743 		if (likely(packet_success < count)) {
1744 			/* Prefetch descriptor index. */
1745 			rte_prefetch0(&vq->desc[head[packet_success]]);
1746 		}
1747 	}
1748 
1749 	rte_compiler_barrier();
1750 
1751 	LOG_DEBUG(VHOST_DATA,
1752 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1753 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1754 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1755 
1756 	*(volatile uint16_t *)&vq->used->idx += count;
1757 	vq->last_used_idx += count;
1758 
1759 	LOG_DEBUG(VHOST_DATA,
1760 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1761 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1762 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1763 
1764 	/* Kick the guest if necessary. */
1765 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1766 		eventfd_write((int)vq->kickfd, 1);
1767 
1768 	return count;
1769 }
1770 
1771 /*
1772  * This function routes the TX packet to the correct interface.
1773  * This may be a local device or the physical port.
1774  */
1775 static inline void __attribute__((always_inline))
1776 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1777 	uint32_t desc_idx, uint8_t need_copy)
1778 {
1779 	struct mbuf_table *tx_q;
1780 	struct rte_mbuf **m_table;
1781 	struct rte_mbuf *mbuf = NULL;
1782 	unsigned len, ret, offset = 0;
1783 	struct vpool *vpool;
1784 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1785 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1786 
1787 	/*Add packet to the port tx queue*/
1788 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1789 	len = tx_q->len;
1790 
1791 	/* Allocate an mbuf and populate the structure. */
1792 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1793 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1794 	if (unlikely(mbuf == NULL)) {
1795 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1796 		RTE_LOG(ERR, VHOST_DATA,
1797 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1798 			dev->device_fh);
1799 		put_desc_to_used_list_zcp(vq, desc_idx);
1800 		return;
1801 	}
1802 
1803 	if (vm2vm_mode == VM2VM_HARDWARE) {
1804 		/* Avoid using a VLAN tag from any VM for an external packet, such
1805 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1806 		 * selection: the MAC address identifies it as an external packet
1807 		 * that should go to the network, while the VLAN tag marks it as a
1808 		 * VM-to-VM packet to be forwarded to another VM. The hardware
1809 		 * cannot resolve this ambiguity, so the packet would be lost.
1810 		 */
1811 		vlan_tag = external_pkt_default_vlan_tag;
1812 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1813 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1814 			__rte_mbuf_raw_free(mbuf);
1815 			return;
1816 		}
1817 	}
1818 
1819 	mbuf->nb_segs = m->nb_segs;
1820 	mbuf->next = m->next;
1821 	mbuf->data_len = m->data_len + offset;
1822 	mbuf->pkt_len = mbuf->data_len;
1823 	if (unlikely(need_copy)) {
1824 		/* Copy the packet contents to the mbuf. */
1825 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1826 			rte_pktmbuf_mtod(m, void *),
1827 			m->data_len);
1828 	} else {
1829 		mbuf->data_off = m->data_off;
1830 		mbuf->buf_physaddr = m->buf_physaddr;
1831 		mbuf->buf_addr = m->buf_addr;
1832 	}
1833 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1834 	mbuf->vlan_tci = vlan_tag;
1835 	mbuf->l2_len = sizeof(struct ether_hdr);
1836 	mbuf->l3_len = sizeof(struct ipv4_hdr);
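	/*
	 * Stash the descriptor index in the mbuf headroom so that
	 * txmbuf_clean_zcp() can return this descriptor to the used ring once
	 * the packet has been transmitted.
	 */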
1837 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1838 
1839 	tx_q->m_table[len] = mbuf;
1840 	len++;
1841 
1842 	LOG_DEBUG(VHOST_DATA,
1843 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1844 		dev->device_fh,
1845 		mbuf->nb_segs,
1846 		(mbuf->next == NULL) ? "null" : "non-null");
1847 
1848 	if (enable_stats) {
1849 		dev_statistics[dev->device_fh].tx_total++;
1850 		dev_statistics[dev->device_fh].tx++;
1851 	}
1852 
1853 	if (unlikely(len == MAX_PKT_BURST)) {
1854 		m_table = (struct rte_mbuf **)tx_q->m_table;
1855 		ret = rte_eth_tx_burst(ports[0],
1856 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1857 
1858 		/*
1859 		 * Free any buffers not handled by TX and update
1860 		 * the port stats.
1861 		 */
1862 		if (unlikely(ret < len)) {
1863 			do {
1864 				rte_pktmbuf_free(m_table[ret]);
1865 			} while (++ret < len);
1866 		}
1867 
1868 		len = 0;
1869 		txmbuf_clean_zcp(dev, vpool);
1870 	}
1871 
1872 	tx_q->len = len;
1873 
1874 	return;
1875 }
1876 
1877 /*
1878  * This function transmits all available packets in the virtio TX queue for
1879  * one virtio-net device. On the first packet it learns the MAC address and
1880  * sets up VMDQ.
1881  */
1882 static inline void __attribute__((always_inline))
1883 virtio_dev_tx_zcp(struct virtio_net *dev)
1884 {
1885 	struct rte_mbuf m;
1886 	struct vhost_virtqueue *vq;
1887 	struct vring_desc *desc;
1888 	uint64_t buff_addr = 0, phys_addr;
1889 	uint32_t head[MAX_PKT_BURST];
1890 	uint32_t i;
1891 	uint16_t free_entries, packet_success = 0;
1892 	uint16_t avail_idx;
1893 	uint8_t need_copy = 0;
1894 	hpa_type addr_type;
1895 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1896 
1897 	vq = dev->virtqueue[VIRTIO_TXQ];
1898 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1899 
1900 	/* If there are no available buffers then return. */
1901 	if (vq->last_used_idx_res == avail_idx)
1902 		return;
1903 
1904 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1905 
1906 	/* Prefetch available ring to retrieve head indexes. */
1907 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1908 
1909 	/* Get the number of free entries in the ring */
1910 	free_entries = (avail_idx - vq->last_used_idx_res);
1911 
1912 	/* Limit to MAX_PKT_BURST. */
1913 	free_entries
1914 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1915 
1916 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1917 		dev->device_fh, free_entries);
1918 
1919 	/* Retrieve all of the head indexes first to avoid caching issues. */
1920 	for (i = 0; i < free_entries; i++)
1921 		head[i]
1922 			= vq->avail->ring[(vq->last_used_idx_res + i)
1923 			& (vq->size - 1)];
1924 
1925 	vq->last_used_idx_res += free_entries;
1926 
1927 	/* Prefetch descriptor index. */
1928 	rte_prefetch0(&vq->desc[head[packet_success]]);
1929 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1930 
1931 	while (packet_success < free_entries) {
1932 		desc = &vq->desc[head[packet_success]];
1933 
1934 		/* Discard first buffer as it is the virtio header */
1935 		desc = &vq->desc[desc->next];
1936 
1937 		/* Buffer address translation. */
1938 		buff_addr = gpa_to_vva(dev, desc->addr);
1939 		/* Check an extra VLAN_HLEN bytes to allow for VLAN tag insertion */
1940 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1941 			&addr_type);
1942 
1943 		if (likely(packet_success < (free_entries - 1)))
1944 			/* Prefetch descriptor index. */
1945 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1946 
1947 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1948 			RTE_LOG(ERR, VHOST_DATA,
1949 				"(%"PRIu64") Invalid frame buffer address found "
1950 				"when transmitting packets!\n",
1951 				dev->device_fh);
1952 			packet_success++;
1953 			continue;
1954 		}
1955 
1956 		/* Prefetch buffer address. */
1957 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1958 
1959 		/*
1960 		 * Setup dummy mbuf. This is copied to a real mbuf if
1961 		 * transmitted out the physical port.
1962 		 */
1963 		m.data_len = desc->len;
1964 		m.nb_segs = 1;
1965 		m.next = NULL;
1966 		m.data_off = 0;
1967 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1968 		m.buf_physaddr = phys_addr;
1969 
1970 		/*
1971 		 * Check whether the frame buffer address from the guest crosses
1972 		 * a sub-region boundary.
1973 		 */
1974 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1975 			RTE_LOG(ERR, VHOST_DATA,
1976 				"(%"PRIu64") Frame buffer address crossing "
1977 				"sub-region found when attaching TX frame "
1978 				"buffer address!\n",
1979 				dev->device_fh);
1980 			need_copy = 1;
1981 		} else
1982 			need_copy = 0;
1983 
1984 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1985 
1986 		/*
1987 		 * If this is the first received packet we need to learn
1988 		 * the MAC and setup VMDQ
1989 		 */
1990 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1991 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1992 				/*
1993 				 * Discard frame if device is scheduled for
1994 				 * removal or a duplicate MAC address is found.
1995 				 */
1996 				packet_success += free_entries;
1997 				vq->last_used_idx += packet_success;
1998 				break;
1999 			}
2000 		}
2001 
2002 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2003 		packet_success++;
2004 	}
2005 }
2006 
2007 /*
2008  * This function is called by each data core. It handles all RX/TX registered
2009  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2010  * addresses are compared with all devices in the main linked list.
2011  */
2012 static int
2013 switch_worker_zcp(__attribute__((unused)) void *arg)
2014 {
2015 	struct virtio_net *dev = NULL;
2016 	struct vhost_dev  *vdev = NULL;
2017 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2018 	struct virtio_net_data_ll *dev_ll;
2019 	struct mbuf_table *tx_q;
2020 	volatile struct lcore_ll_info *lcore_ll;
2021 	const uint64_t drain_tsc
2022 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2023 		* BURST_TX_DRAIN_US;
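	/*
	 * drain_tsc converts BURST_TX_DRAIN_US into TSC cycles, rounding the
	 * cycles-per-microsecond figure up. Illustration only, assuming a
	 * 2.4 GHz TSC: ((2400000000 + 1000000 - 1) / 1000000) * 100
	 * = 240000 cycles, i.e. roughly 100us between forced TX drains.
	 */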
2024 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2025 	unsigned ret;
2026 	const uint16_t lcore_id = rte_lcore_id();
2027 	uint16_t count_in_ring, rx_count = 0;
2028 
2029 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2030 
2031 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2032 	prev_tsc = 0;
2033 
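	/*
	 * Main polling loop: once drain_tsc cycles have elapsed, flush any
	 * partially filled TX burst queues; then walk this lcore's device
	 * list, re-attach RX mbufs, pass received packets to the guest
	 * (virtio_dev_rx_zcp) and transmit packets from the guest
	 * (virtio_dev_tx_zcp).
	 */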
2034 	while (1) {
2035 		cur_tsc = rte_rdtsc();
2036 
2037 		/* TX burst queue drain */
2038 		diff_tsc = cur_tsc - prev_tsc;
2039 		if (unlikely(diff_tsc > drain_tsc)) {
2040 			/*
2041 			 * Get mbuf from vpool.pool and detach mbuf and
2042 			 * put back into vpool.ring.
2043 			 */
2044 			dev_ll = lcore_ll->ll_root_used;
2045 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2046 				/* Get virtio device ID */
2047 				vdev = dev_ll->vdev;
2048 				dev = vdev->dev;
2049 
2050 				if (likely(!vdev->remove)) {
2051 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2052 					if (tx_q->len) {
2053 						LOG_DEBUG(VHOST_DATA,
2054 						"TX queue drained after timeout"
2055 						" with burst size %u\n",
2056 						tx_q->len);
2057 
2058 						/*
2059 						 * Tx any packets in the queue
2060 						 */
2061 						ret = rte_eth_tx_burst(
2062 							ports[0],
2063 							(uint16_t)tx_q->txq_id,
2064 							(struct rte_mbuf **)
2065 							tx_q->m_table,
2066 							(uint16_t)tx_q->len);
2067 						if (unlikely(ret < tx_q->len)) {
2068 							do {
2069 								rte_pktmbuf_free(
2070 									tx_q->m_table[ret]);
2071 							} while (++ret < tx_q->len);
2072 						}
2073 						tx_q->len = 0;
2074 
2075 						txmbuf_clean_zcp(dev,
2076 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2077 					}
2078 				}
2079 				dev_ll = dev_ll->next;
2080 			}
2081 			prev_tsc = cur_tsc;
2082 		}
2083 
2084 		rte_prefetch0(lcore_ll->ll_root_used);
2085 
2086 		/*
2087 		 * Inform the configuration core that we have exited the linked
2088 		 * list and that no devices are in use if requested.
2089 		 */
2090 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2091 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2092 
2093 		/* Process devices */
2094 		dev_ll = lcore_ll->ll_root_used;
2095 
2096 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2097 			vdev = dev_ll->vdev;
2098 			dev  = vdev->dev;
2099 			if (unlikely(vdev->remove)) {
2100 				dev_ll = dev_ll->next;
2101 				unlink_vmdq(vdev);
2102 				vdev->ready = DEVICE_SAFE_REMOVE;
2103 				continue;
2104 			}
2105 
2106 			if (likely(vdev->ready == DEVICE_RX)) {
2107 				uint32_t index = vdev->vmdq_rx_q;
2108 				uint16_t i;
2109 				count_in_ring
2110 				= rte_ring_count(vpool_array[index].ring);
2111 				uint16_t free_entries
2112 				= (uint16_t)get_available_ring_num_zcp(dev);
2113 
2114 				/*
2115 				 * Attach all mbufs in vpool.ring and put back
2116 				 * into vpool.pool.
2117 				 */
2118 				for (i = 0;
2119 				i < RTE_MIN(free_entries,
2120 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2121 				i++)
2122 					attach_rxmbuf_zcp(dev);
2123 
2124 				/* Handle guest RX */
2125 				rx_count = rte_eth_rx_burst(ports[0],
2126 					vdev->vmdq_rx_q, pkts_burst,
2127 					MAX_PKT_BURST);
2128 
2129 				if (rx_count) {
2130 					ret_count = virtio_dev_rx_zcp(dev,
2131 							pkts_burst, rx_count);
2132 					if (enable_stats) {
2133 						dev_statistics[dev->device_fh].rx_total
2134 							+= rx_count;
2135 						dev_statistics[dev->device_fh].rx
2136 							+= ret_count;
2137 					}
2138 					while (likely(rx_count)) {
2139 						rx_count--;
2140 						pktmbuf_detach_zcp(
2141 							pkts_burst[rx_count]);
2142 						rte_ring_sp_enqueue(
2143 							vpool_array[index].ring,
2144 							(void *)pkts_burst[rx_count]);
2145 					}
2146 				}
2147 			}
2148 
2149 			if (likely(!vdev->remove))
2150 				/* Handle guest TX */
2151 				virtio_dev_tx_zcp(dev);
2152 
2153 			/* Move to the next device in the list */
2154 			dev_ll = dev_ll->next;
2155 		}
2156 	}
2157 
2158 	return 0;
2159 }
2160 
2161 
2162 /*
2163  * Add an entry to a used linked list. A free entry must first be found
2164  * in the free linked list using get_data_ll_free_entry();
2165  */
2166 static void
2167 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2168 	struct virtio_net_data_ll *ll_dev)
2169 {
2170 	struct virtio_net_data_ll *ll = *ll_root_addr;
2171 
2172 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2173 	ll_dev->next = NULL;
2174 	rte_compiler_barrier();
2175 
2176 	/* If ll == NULL then this is the first device. */
2177 	if (ll) {
2178 		/* Increment to the tail of the linked list. */
2179 		while (ll->next != NULL)
2180 			ll = ll->next;
2181 
2182 		ll->next = ll_dev;
2183 	} else {
2184 		*ll_root_addr = ll_dev;
2185 	}
2186 }
2187 
2188 /*
2189  * Remove an entry from a used linked list. The entry must then be added to
2190  * the free linked list using put_data_ll_free_entry().
2191  */
2192 static void
2193 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2194 	struct virtio_net_data_ll *ll_dev,
2195 	struct virtio_net_data_ll *ll_dev_last)
2196 {
2197 	struct virtio_net_data_ll *ll = *ll_root_addr;
2198 
2199 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2200 		return;
2201 
2202 	if (ll_dev == ll)
2203 		*ll_root_addr = ll_dev->next;
2204 	else
2205 		if (likely(ll_dev_last != NULL))
2206 			ll_dev_last->next = ll_dev->next;
2207 		else
2208 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2209 }
2210 
2211 /*
2212  * Find and return an entry from the free linked list.
2213  */
2214 static struct virtio_net_data_ll *
2215 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2216 {
2217 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2218 	struct virtio_net_data_ll *ll_dev;
2219 
2220 	if (ll_free == NULL)
2221 		return NULL;
2222 
2223 	ll_dev = ll_free;
2224 	*ll_root_addr = ll_free->next;
2225 
2226 	return ll_dev;
2227 }
2228 
2229 /*
2230  * Place an entry back on to the free linked list.
2231  */
2232 static void
2233 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2234 	struct virtio_net_data_ll *ll_dev)
2235 {
2236 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2237 
2238 	if (ll_dev == NULL)
2239 		return;
2240 
2241 	ll_dev->next = ll_free;
2242 	*ll_root_addr = ll_dev;
2243 }
2244 
2245 /*
2246  * Creates a linked list of a given size.
2247  */
2248 static struct virtio_net_data_ll *
2249 alloc_data_ll(uint32_t size)
2250 {
2251 	struct virtio_net_data_ll *ll_new;
2252 	uint32_t i;
2253 
2254 	/* Malloc and then chain the linked list. */
2255 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2256 	if (ll_new == NULL) {
2257 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2258 		return NULL;
2259 	}
2260 
2261 	for (i = 0; i < size - 1; i++) {
2262 		ll_new[i].vdev = NULL;
2263 		ll_new[i].next = &ll_new[i+1];
2264 	}
2265 	ll_new[i].next = NULL;
2266 
2267 	return (ll_new);
2268 }
2269 
2270 /*
2271  * Create the main linked list along with each individual core's linked list. A used and a free list
2272  * are created to manage entries.
2273  */
2274 static int
2275 init_data_ll (void)
2276 {
2277 	int lcore;
2278 
2279 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2280 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2281 		if (lcore_info[lcore].lcore_ll == NULL) {
2282 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2283 			return -1;
2284 		}
2285 
2286 		lcore_info[lcore].lcore_ll->device_num = 0;
2287 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2288 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2289 		if (num_devices % num_switching_cores)
2290 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2291 		else
2292 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2293 	}
2294 
2295 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2296 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2297 
2298 	return 0;
2299 }
2300 
2301 /*
2302  * Remove a device from its data core's linked list and from the main linked list. Synchronization
2303  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2304  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2305  */
2306 static void
2307 destroy_device (volatile struct virtio_net *dev)
2308 {
2309 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2310 	struct virtio_net_data_ll *ll_main_dev_cur;
2311 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2312 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2313 	struct vhost_dev *vdev;
2314 	int lcore;
2315 
2316 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2317 
2318 	vdev = (struct vhost_dev *)dev->priv;
2319 	/*set the remove flag. */
2320 	vdev->remove = 1;
2321 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2322 		rte_pause();
2323 	}
2324 
2325 	/* Search for entry to be removed from lcore ll */
2326 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2327 	while (ll_lcore_dev_cur != NULL) {
2328 		if (ll_lcore_dev_cur->vdev == vdev) {
2329 			break;
2330 		} else {
2331 			ll_lcore_dev_last = ll_lcore_dev_cur;
2332 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2333 		}
2334 	}
2335 
2336 	if (ll_lcore_dev_cur == NULL) {
2337 		RTE_LOG(ERR, VHOST_CONFIG,
2338 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2339 			dev->device_fh);
2340 		return;
2341 	}
2342 
2343 	/* Search for entry to be removed from main ll */
2344 	ll_main_dev_cur = ll_root_used;
2345 	ll_main_dev_last = NULL;
2346 	while (ll_main_dev_cur != NULL) {
2347 		if (ll_main_dev_cur->vdev == vdev) {
2348 			break;
2349 		} else {
2350 			ll_main_dev_last = ll_main_dev_cur;
2351 			ll_main_dev_cur = ll_main_dev_cur->next;
2352 		}
2353 	}
2354 
2355 	/* Remove entries from the lcore and main ll. */
2356 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2357 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2358 
2359 	/* Set the dev_removal_flag on each lcore. */
2360 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2361 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2362 	}
2363 
2364 	/*
2365 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2366 	 * they can no longer access the device removed from the linked lists and that the devices
2367 	 * are no longer in use.
2368 	 */
2369 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2370 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2371 			rte_pause();
2372 		}
2373 	}
2374 
2375 	/* Add the entries back to the lcore and main free ll.*/
2376 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2377 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2378 
2379 	/* Decrement number of device on the lcore. */
2380 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2381 
2382 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2383 
2384 	if (zero_copy) {
2385 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2386 
2387 		/* Stop the RX queue. */
2388 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2389 			LOG_DEBUG(VHOST_CONFIG,
2390 				"(%"PRIu64") In destroy_device: Failed to stop "
2391 				"rx queue:%d\n",
2392 				dev->device_fh,
2393 				vdev->vmdq_rx_q);
2394 		}
2395 
2396 		LOG_DEBUG(VHOST_CONFIG,
2397 			"(%"PRIu64") in destroy_device: Start putting mbufs from "
2398 			"mempool back into ring for RX queue: %d\n",
2399 			dev->device_fh, vdev->vmdq_rx_q);
2400 
2401 		mbuf_destroy_zcp(vpool);
2402 
2403 		/* Stop the TX queue. */
2404 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2405 			LOG_DEBUG(VHOST_CONFIG,
2406 				"(%"PRIu64") In destroy_device: Failed to "
2407 				"stop tx queue:%d\n",
2408 				dev->device_fh, vdev->vmdq_rx_q);
2409 		}
2410 
2411 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2412 
2413 		LOG_DEBUG(VHOST_CONFIG,
2414 			"(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2415 			"back into ring for TX queue: %d, dev:(%"PRIu64")\n",
2416 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2417 			dev->device_fh);
2418 
2419 		mbuf_destroy_zcp(vpool);
2420 		rte_free(vdev->regions_hpa);
2421 	}
2422 	rte_free(vdev);
2423 
2424 }
2425 
2426 /*
2427  * Count the additional physically contiguous sub-regions needed for one
2428  * particular region whose vhost virtual address range is contiguous. The
2429  * region starts at vva_start and is 'size' bytes long.
2430  */
2431 static uint32_t
2432 check_hpa_regions(uint64_t vva_start, uint64_t size)
2433 {
2434 	uint32_t i, nregions = 0, page_size = getpagesize();
2435 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2436 	if (vva_start % page_size) {
2437 		LOG_DEBUG(VHOST_CONFIG,
2438 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2439 			"has remainder\n",
2440 			(void *)(uintptr_t)vva_start, page_size);
2441 		return 0;
2442 	}
2443 	if (size % page_size) {
2444 		LOG_DEBUG(VHOST_CONFIG,
2445 			"in check_hpa_regions: "
2446 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2447 			size, page_size);
2448 		return 0;
2449 	}
2450 	for (i = 0; i < size - page_size; i = i + page_size) {
2451 		cur_phys_addr
2452 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2453 		next_phys_addr = rte_mem_virt2phy(
2454 			(void *)(uintptr_t)(vva_start + i + page_size));
2455 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2456 			++nregions;
2457 			LOG_DEBUG(VHOST_CONFIG,
2458 				"in check_hpa_regions: hva addr:(%p) is not "
2459 				"continuous with hva addr:(%p), diff:%d\n",
2460 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2461 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2462 				+ page_size), page_size);
2463 			LOG_DEBUG(VHOST_CONFIG,
2464 				"in check_hpa_regions: hpa addr:(%p) is not "
2465 				"continuous with hpa addr:(%p), "
2466 				"diff:(%"PRIu64")\n",
2467 				(void *)(uintptr_t)cur_phys_addr,
2468 				(void *)(uintptr_t)next_phys_addr,
2469 				(next_phys_addr-cur_phys_addr));
2470 		}
2471 	}
2472 	return nregions;
2473 }
2474 
2475 /*
2476  * Divide each region whose vhost virtual address is continous into a few
2477  * Divide each region whose vhost virtual address range is contiguous into
2478  * sub-regions such that the physical addresses within each sub-region are
2479  * contiguous, and fill the offset (to GPA), size and other information of
2480  * each sub-region into regions_hpa.
2481 static uint32_t
2482 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2483 {
2484 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2485 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2486 
2487 	if (mem_region_hpa == NULL)
2488 		return 0;
2489 
2490 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2491 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2492 			virtio_memory->regions[regionidx].address_offset;
2493 		mem_region_hpa[regionidx_hpa].guest_phys_address
2494 			= virtio_memory->regions[regionidx].guest_phys_address;
2495 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2496 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2497 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2498 		LOG_DEBUG(VHOST_CONFIG,
2499 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2500 			regionidx_hpa,
2501 			(void *)(uintptr_t)
2502 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2503 		LOG_DEBUG(VHOST_CONFIG,
2504 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2505 			regionidx_hpa,
2506 			(void *)(uintptr_t)
2507 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2508 		for (i = 0, k = 0;
2509 			i < virtio_memory->regions[regionidx].memory_size -
2510 				page_size;
2511 			i += page_size) {
2512 			cur_phys_addr = rte_mem_virt2phy(
2513 					(void *)(uintptr_t)(vva_start + i));
2514 			next_phys_addr = rte_mem_virt2phy(
2515 					(void *)(uintptr_t)(vva_start +
2516 					i + page_size));
2517 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2518 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2519 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2520 					k + page_size;
2521 				mem_region_hpa[regionidx_hpa].memory_size
2522 					= k + page_size;
2523 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2524 					"phys addr end  [%d]:(%p)\n",
2525 					regionidx_hpa,
2526 					(void *)(uintptr_t)
2527 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2528 				LOG_DEBUG(VHOST_CONFIG,
2529 					"in fill_hpa_regions: guest phys addr "
2530 					"size [%d]:(%p)\n",
2531 					regionidx_hpa,
2532 					(void *)(uintptr_t)
2533 					(mem_region_hpa[regionidx_hpa].memory_size));
2534 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2535 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2536 				++regionidx_hpa;
2537 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2538 					next_phys_addr -
2539 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2540 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2541 					" phys addr start[%d]:(%p)\n",
2542 					regionidx_hpa,
2543 					(void *)(uintptr_t)
2544 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2545 				LOG_DEBUG(VHOST_CONFIG,
2546 					"in fill_hpa_regions: host  phys addr "
2547 					"start[%d]:(%p)\n",
2548 					regionidx_hpa,
2549 					(void *)(uintptr_t)
2550 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2551 				k = 0;
2552 			} else {
2553 				k += page_size;
2554 			}
2555 		}
2556 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2557 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2558 			+ k + page_size;
2559 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2560 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2561 			"[%d]:(%p)\n", regionidx_hpa,
2562 			(void *)(uintptr_t)
2563 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2564 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2565 			"[%d]:(%p)\n", regionidx_hpa,
2566 			(void *)(uintptr_t)
2567 			(mem_region_hpa[regionidx_hpa].memory_size));
2568 		++regionidx_hpa;
2569 	}
2570 	return regionidx_hpa;
2571 }
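/*
 * Each sub-region filled in above is physically contiguous, so any guest
 * physical address that falls inside it can be translated to a host
 * physical address by adding that sub-region's host_phys_addr_offset,
 * which is what the zero-copy TX path relies on via gpa_to_hpa().
 */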
2572 
2573 /*
2574  * A new device is added to a data core. First the device is added to the main linked list
2575  * and then allocated to a specific data core.
2576  */
2577 static int
2578 new_device (struct virtio_net *dev)
2579 {
2580 	struct virtio_net_data_ll *ll_dev;
2581 	int lcore, core_add = 0;
2582 	uint32_t device_num_min = num_devices;
2583 	struct vhost_dev *vdev;
2584 	uint32_t regionidx;
2585 
2586 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2587 	if (vdev == NULL) {
2588 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2589 			dev->device_fh);
2590 		return -1;
2591 	}
2592 	vdev->dev = dev;
2593 	dev->priv = vdev;
2594 
2595 	if (zero_copy) {
2596 		vdev->nregions_hpa = dev->mem->nregions;
2597 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2598 			vdev->nregions_hpa
2599 				+= check_hpa_regions(
2600 					dev->mem->regions[regionidx].guest_phys_address
2601 					+ dev->mem->regions[regionidx].address_offset,
2602 					dev->mem->regions[regionidx].memory_size);
2603 
2604 		}
2605 
2606 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2607 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2608 			RTE_CACHE_LINE_SIZE);
2609 		if (vdev->regions_hpa == NULL) {
2610 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2611 			rte_free(vdev);
2612 			return -1;
2613 		}
2614 
2615 
2616 		if (fill_hpa_memory_regions(
2617 			vdev->regions_hpa, dev->mem
2618 			) != vdev->nregions_hpa) {
2619 
2620 			RTE_LOG(ERR, VHOST_CONFIG,
2621 				"hpa memory regions number mismatch: "
2622 				"[%d]\n", vdev->nregions_hpa);
2623 			rte_free(vdev->regions_hpa);
2624 			rte_free(vdev);
2625 			return -1;
2626 		}
2627 	}
2628 
2629 
2630 	/* Add device to main ll */
2631 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2632 	if (ll_dev == NULL) {
2633 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2634 			"of %d devices per core has been reached\n",
2635 			dev->device_fh, num_devices);
2636 		if (vdev->regions_hpa)
2637 			rte_free(vdev->regions_hpa);
2638 		rte_free(vdev);
2639 		return -1;
2640 	}
2641 	ll_dev->vdev = vdev;
2642 	add_data_ll_entry(&ll_root_used, ll_dev);
2643 	vdev->vmdq_rx_q
2644 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2645 
2646 	if (zero_copy) {
2647 		uint32_t index = vdev->vmdq_rx_q;
2648 		uint32_t count_in_ring, i;
2649 		struct mbuf_table *tx_q;
2650 
2651 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2652 
2653 		LOG_DEBUG(VHOST_CONFIG,
2654 			"(%"PRIu64") in new_device: mbuf count in mempool "
2655 			"before attach is: %d\n",
2656 			dev->device_fh,
2657 			rte_mempool_count(vpool_array[index].pool));
2658 		LOG_DEBUG(VHOST_CONFIG,
2659 			"(%"PRIu64") in new_device: mbuf count in  ring "
2660 			"before attach  is : %d\n",
2661 			dev->device_fh, count_in_ring);
2662 
2663 		/*
2664 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2665 		 */
2666 		for (i = 0; i < count_in_ring; i++)
2667 			attach_rxmbuf_zcp(dev);
2668 
2669 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2670 			"mempool after attach is: %d\n",
2671 			dev->device_fh,
2672 			rte_mempool_count(vpool_array[index].pool));
2673 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2674 			"ring after attach  is : %d\n",
2675 			dev->device_fh,
2676 			rte_ring_count(vpool_array[index].ring));
2677 
2678 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2679 		tx_q->txq_id = vdev->vmdq_rx_q;
2680 
2681 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2682 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2683 
2684 			LOG_DEBUG(VHOST_CONFIG,
2685 				"(%"PRIu64") In new_device: Failed to start "
2686 				"tx queue:%d\n",
2687 				dev->device_fh, vdev->vmdq_rx_q);
2688 
2689 			mbuf_destroy_zcp(vpool);
2690 			rte_free(vdev->regions_hpa);
2691 			rte_free(vdev);
2692 			return -1;
2693 		}
2694 
2695 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2696 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2697 
2698 			LOG_DEBUG(VHOST_CONFIG,
2699 				"(%"PRIu64") In new_device: Failed to start "
2700 				"rx queue:%d\n",
2701 				dev->device_fh, vdev->vmdq_rx_q);
2702 
2703 			/* Stop the TX queue. */
2704 			if (rte_eth_dev_tx_queue_stop(ports[0],
2705 				vdev->vmdq_rx_q) != 0) {
2706 				LOG_DEBUG(VHOST_CONFIG,
2707 					"(%"PRIu64") In new_device: Failed to "
2708 					"stop tx queue:%d\n",
2709 					dev->device_fh, vdev->vmdq_rx_q);
2710 			}
2711 
2712 			mbuf_destroy_zcp(vpool);
2713 			rte_free(vdev->regions_hpa);
2714 			rte_free(vdev);
2715 			return -1;
2716 		}
2717 
2718 	}
2719 
2720 	/*reset ready flag*/
2721 	vdev->ready = DEVICE_MAC_LEARNING;
2722 	vdev->remove = 0;
2723 
2724 	/* Find a suitable lcore to add the device. */
2725 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2726 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2727 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2728 			core_add = lcore;
2729 		}
2730 	}
2731 	/* Add device to lcore ll */
2732 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2733 	if (ll_dev == NULL) {
2734 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2735 		vdev->ready = DEVICE_SAFE_REMOVE;
2736 		destroy_device(dev);
2737 		if (vdev->regions_hpa)
2738 			rte_free(vdev->regions_hpa);
2739 		rte_free(vdev);
2740 		return -1;
2741 	}
2742 	ll_dev->vdev = vdev;
2743 	vdev->coreid = core_add;
2744 
2745 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2746 
2747 	/* Initialize device stats */
2748 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2749 
2750 	/* Disable notifications. */
2751 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2752 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2753 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2754 	dev->flags |= VIRTIO_DEV_RUNNING;
2755 
2756 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2757 
2758 	return 0;
2759 }
2760 
2761 /*
2762  * These callbacks allow devices to be added to a data core when their
2763  * configuration is fully complete.
2764  */
2765 static const struct virtio_net_device_ops virtio_net_device_ops =
2766 {
2767 	.new_device =  new_device,
2768 	.destroy_device = destroy_device,
2769 };
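/*
 * These ops are passed to the vhost library through
 * rte_vhost_driver_callback_register() in main(); the library invokes
 * new_device()/destroy_device() as guests attach to or detach from the
 * registered CUSE device.
 */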
2770 
2771 /*
2772  * This thread wakes up periodically to print stats if the user has
2773  * enabled them.
2774  */
2775 static void
2776 print_stats(void)
2777 {
2778 	struct virtio_net_data_ll *dev_ll;
2779 	uint64_t tx_dropped, rx_dropped;
2780 	uint64_t tx, tx_total, rx, rx_total;
2781 	uint32_t device_fh;
2782 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2783 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2784 
2785 	while(1) {
2786 		sleep(enable_stats);
2787 
2788 		/* Clear screen and move to top left */
2789 		printf("%s%s", clr, top_left);
2790 
2791 		printf("\nDevice statistics ====================================");
2792 
2793 		dev_ll = ll_root_used;
2794 		while (dev_ll != NULL) {
2795 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2796 			tx_total = dev_statistics[device_fh].tx_total;
2797 			tx = dev_statistics[device_fh].tx;
2798 			tx_dropped = tx_total - tx;
2799 			if (zero_copy == 0) {
2800 				rx_total = rte_atomic64_read(
2801 					&dev_statistics[device_fh].rx_total_atomic);
2802 				rx = rte_atomic64_read(
2803 					&dev_statistics[device_fh].rx_atomic);
2804 			} else {
2805 				rx_total = dev_statistics[device_fh].rx_total;
2806 				rx = dev_statistics[device_fh].rx;
2807 			}
2808 			rx_dropped = rx_total - rx;
2809 
2810 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2811 					"\nTX total: 		%"PRIu64""
2812 					"\nTX dropped: 		%"PRIu64""
2813 					"\nTX successful: 		%"PRIu64""
2814 					"\nRX total: 		%"PRIu64""
2815 					"\nRX dropped: 		%"PRIu64""
2816 					"\nRX successful: 		%"PRIu64"",
2817 					device_fh,
2818 					tx_total,
2819 					tx_dropped,
2820 					tx,
2821 					rx_total,
2822 					rx_dropped,
2823 					rx);
2824 
2825 			dev_ll = dev_ll->next;
2826 		}
2827 		printf("\n======================================================\n");
2828 	}
2829 }
2830 
2831 static void
2832 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2833 	char *ring_name, uint32_t nb_mbuf)
2834 {
2835 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2836 	vpool_array[index].pool
2837 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2838 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2839 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2840 		rte_pktmbuf_init, NULL, socket, 0);
2841 	if (vpool_array[index].pool != NULL) {
2842 		vpool_array[index].ring
2843 			= rte_ring_create(ring_name,
2844 				rte_align32pow2(nb_mbuf + 1),
2845 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2846 		if (likely(vpool_array[index].ring != NULL)) {
2847 			LOG_DEBUG(VHOST_CONFIG,
2848 				"in setup_mempool_tbl: mbuf count in "
2849 				"mempool is: %d\n",
2850 				rte_mempool_count(vpool_array[index].pool));
2851 			LOG_DEBUG(VHOST_CONFIG,
2852 				"in setup_mempool_tbl: mbuf count in "
2853 				"ring   is: %d\n",
2854 				rte_ring_count(vpool_array[index].ring));
2855 		} else {
2856 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2857 				ring_name);
2858 		}
2859 
2860 		/* Need to consider the headroom. */
2861 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2862 	} else {
2863 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2864 	}
2865 }
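/*
 * In zero-copy mode main() calls this once per queue: vpool_array indices
 * [0, MAX_QUEUES) hold the RX mempool/ring pairs and indices
 * [MAX_QUEUES, 2*MAX_QUEUES) the TX ones, matching the
 * vpool_array[MAX_QUEUES + vmdq_rx_q] lookups on the TX path above.
 */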
2866 
2867 
2868 /*
2869  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2870  * device is also registered here to handle the IOCTLs.
2871  */
2872 int
2873 main(int argc, char *argv[])
2874 {
2875 	struct rte_mempool *mbuf_pool = NULL;
2876 	unsigned lcore_id, core_id = 0;
2877 	unsigned nb_ports, valid_num_ports;
2878 	int ret;
2879 	uint8_t portid;
2880 	uint16_t queue_id;
2881 	static pthread_t tid;
2882 
2883 	/* init EAL */
2884 	ret = rte_eal_init(argc, argv);
2885 	if (ret < 0)
2886 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2887 	argc -= ret;
2888 	argv += ret;
2889 
2890 	/* parse app arguments */
2891 	ret = us_vhost_parse_args(argc, argv);
2892 	if (ret < 0)
2893 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2894 
2895 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2896 		if (rte_lcore_is_enabled(lcore_id))
2897 			lcore_ids[core_id ++] = lcore_id;
2898 
2899 	if (rte_lcore_count() > RTE_MAX_LCORE)
2900 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2901 
2902 	/* set the number of switching cores available */
2903 	num_switching_cores = rte_lcore_count()-1;
2904 
2905 	/* Get the number of physical ports. */
2906 	nb_ports = rte_eth_dev_count();
2907 	if (nb_ports > RTE_MAX_ETHPORTS)
2908 		nb_ports = RTE_MAX_ETHPORTS;
2909 
2910 	/*
2911 	 * Update the global variable num_ports and the global array ports,
2912 	 * and compute valid_num_ports from the number of system ports.
2913 	 */
2914 	valid_num_ports = check_ports_num(nb_ports);
2915 
2916 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2917 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2918 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2919 		return -1;
2920 	}
2921 
2922 	if (zero_copy == 0) {
2923 		/* Create the mbuf pool. */
2924 		mbuf_pool = rte_mempool_create(
2925 				"MBUF_POOL",
2926 				NUM_MBUFS_PER_PORT
2927 				* valid_num_ports,
2928 				MBUF_SIZE, MBUF_CACHE_SIZE,
2929 				sizeof(struct rte_pktmbuf_pool_private),
2930 				rte_pktmbuf_pool_init, NULL,
2931 				rte_pktmbuf_init, NULL,
2932 				rte_socket_id(), 0);
2933 		if (mbuf_pool == NULL)
2934 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2935 
2936 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2937 			vpool_array[queue_id].pool = mbuf_pool;
2938 
2939 		if (vm2vm_mode == VM2VM_HARDWARE) {
2940 			/* Enable VT loop back to let L2 switch to do it. */
2941 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2942 			LOG_DEBUG(VHOST_CONFIG,
2943 				"Enable loop back for L2 switch in vmdq.\n");
2944 		}
2945 	} else {
2946 		uint32_t nb_mbuf;
2947 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2948 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2949 
2950 		nb_mbuf = num_rx_descriptor
2951 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2952 			+ num_switching_cores * MAX_PKT_BURST;
2953 
2954 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2955 			snprintf(pool_name, sizeof(pool_name),
2956 				"rxmbuf_pool_%u", queue_id);
2957 			snprintf(ring_name, sizeof(ring_name),
2958 				"rxmbuf_ring_%u", queue_id);
2959 			setup_mempool_tbl(rte_socket_id(), queue_id,
2960 				pool_name, ring_name, nb_mbuf);
2961 		}
2962 
2963 		nb_mbuf = num_tx_descriptor
2964 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2965 				+ num_switching_cores * MAX_PKT_BURST;
2966 
2967 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2968 			snprintf(pool_name, sizeof(pool_name),
2969 				"txmbuf_pool_%u", queue_id);
2970 			snprintf(ring_name, sizeof(ring_name),
2971 				"txmbuf_ring_%u", queue_id);
2972 			setup_mempool_tbl(rte_socket_id(),
2973 				(queue_id + MAX_QUEUES),
2974 				pool_name, ring_name, nb_mbuf);
2975 		}
2976 
2977 		if (vm2vm_mode == VM2VM_HARDWARE) {
2978 			/* Enable VT loop back to let L2 switch to do it. */
2979 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2980 			LOG_DEBUG(VHOST_CONFIG,
2981 				"Enable loop back for L2 switch in vmdq.\n");
2982 		}
2983 	}
2984 	/* Set log level. */
2985 	rte_set_log_level(LOG_LEVEL);
2986 
2987 	/* initialize all ports */
2988 	for (portid = 0; portid < nb_ports; portid++) {
2989 		/* skip ports that are not enabled */
2990 		if ((enabled_port_mask & (1 << portid)) == 0) {
2991 			RTE_LOG(INFO, VHOST_PORT,
2992 				"Skipping disabled port %d\n", portid);
2993 			continue;
2994 		}
2995 		if (port_init(portid) != 0)
2996 			rte_exit(EXIT_FAILURE,
2997 				"Cannot initialize network ports\n");
2998 	}
2999 
3000 	/* Initialise all linked lists. */
3001 	if (init_data_ll() == -1)
3002 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3003 
3004 	/* Initialize device stats */
3005 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3006 
3007 	/* Enable stats if the user option is set. */
3008 	if (enable_stats)
3009 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3010 
3011 	/* Launch all data cores. */
3012 	if (zero_copy == 0) {
3013 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3014 			rte_eal_remote_launch(switch_worker,
3015 				mbuf_pool, lcore_id);
3016 		}
3017 	} else {
3018 		uint32_t count_in_mempool, index, i;
3019 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3020 			/* For all RX and TX queues. */
3021 			count_in_mempool
3022 				= rte_mempool_count(vpool_array[index].pool);
3023 
3024 			/*
3025 			 * Transfer all unattached mbufs from vpool.pool
3026 			 * to vpool.ring.
3027 			 */
3028 			for (i = 0; i < count_in_mempool; i++) {
3029 				struct rte_mbuf *mbuf
3030 					= __rte_mbuf_raw_alloc(
3031 						vpool_array[index].pool);
3032 				rte_ring_sp_enqueue(vpool_array[index].ring,
3033 						(void *)mbuf);
3034 			}
3035 
3036 			LOG_DEBUG(VHOST_CONFIG,
3037 				"in main: mbuf count in mempool at initial "
3038 				"is: %d\n", count_in_mempool);
3039 			LOG_DEBUG(VHOST_CONFIG,
3040 				"in main: mbuf count in  ring at initial  is :"
3041 				" %d\n",
3042 				rte_ring_count(vpool_array[index].ring));
3043 		}
3044 
3045 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3046 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3047 				lcore_id);
3048 	}
3049 
3050 	if (mergeable == 0)
3051 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3052 
3053 	/* Register CUSE device to handle IOCTLs. */
3054 	ret = rte_vhost_driver_register((char *)&dev_basename);
3055 	if (ret != 0)
3056 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3057 
3058 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3059 
3060 	/* Start CUSE session. */
3061 	rte_vhost_driver_session_start();
3062 	return 0;
3063 
3064 }
3065 
3066