xref: /dpdk/examples/vhost/main.c (revision 691693c6e6653dc887bd7eb5cc868d41082ca17f)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 
55 #include "main.h"
56 
57 #ifndef MAX_QUEUES
58 #define MAX_QUEUES 128
59 #endif
60 
61 /* the maximum number of external ports supported */
62 #define MAX_SUP_PORTS 1
63 
64 /*
65  * Calculate the number of buffers needed per port
66  */
67 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
68 							(num_switching_cores*MAX_PKT_BURST) +  			\
69 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
70 							(num_switching_cores*MBUF_CACHE_SIZE))
71 
72 #define MBUF_CACHE_SIZE	128
73 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
74 
75 /*
76  * No frame data buffer allocated from host are required for zero copy
77  * implementation, guest will allocate the frame data buffer, and vhost
78  * directly use it.
79  */
80 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
81 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
82 #define MBUF_CACHE_SIZE_ZCP 0
83 
84 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
85 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
86 
87 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
88 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
89 
90 #define JUMBO_FRAME_MAX_SIZE    0x2600
91 
92 /* State of virtio device. */
93 #define DEVICE_MAC_LEARNING 0
94 #define DEVICE_RX			1
95 #define DEVICE_SAFE_REMOVE	2
96 
97 /* Config_core_flag status definitions. */
98 #define REQUEST_DEV_REMOVAL 1
99 #define ACK_DEV_REMOVAL 0
100 
101 /* Configurable number of RX/TX ring descriptors */
102 #define RTE_TEST_RX_DESC_DEFAULT 1024
103 #define RTE_TEST_TX_DESC_DEFAULT 512
104 
105 /*
106  * Need refine these 2 macros for legacy and DPDK based front end:
107  * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
108  * And then adjust power 2.
109  */
110 /*
111  * For legacy front end, 128 descriptors,
112  * half for virtio header, another half for mbuf.
113  */
114 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
115 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
116 
117 /* Get first 4 bytes in mbuf headroom. */
118 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
119 		+ sizeof(struct rte_mbuf)))
120 
121 /* true if x is a power of 2 */
122 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
123 
124 #define INVALID_PORT_ID 0xFF
125 
126 /* Max number of devices. Limited by vmdq. */
127 #define MAX_DEVICES 64
128 
129 /* Size of buffers used for snprintfs. */
130 #define MAX_PRINT_BUFF 6072
131 
132 /* Maximum character device basename size. */
133 #define MAX_BASENAME_SZ 10
134 
135 /* Maximum long option length for option parsing. */
136 #define MAX_LONG_OPT_SZ 64
137 
138 /* Used to compare MAC addresses. */
139 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
140 
141 /* Number of descriptors per cacheline. */
142 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
143 
144 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
145 
146 /* mask of enabled ports */
147 static uint32_t enabled_port_mask = 0;
148 
149 /* Promiscuous mode */
150 static uint32_t promiscuous;
151 
152 /*Number of switching cores enabled*/
153 static uint32_t num_switching_cores = 0;
154 
155 /* number of devices/queues to support*/
156 static uint32_t num_queues = 0;
157 static uint32_t num_devices;
158 
159 /*
160  * Enable zero copy, pkts buffer will directly dma to hw descriptor,
161  * disabled on default.
162  */
163 static uint32_t zero_copy;
164 static int mergeable;
165 
166 /* Do vlan strip on host, enabled on default */
167 static uint32_t vlan_strip = 1;
168 
169 /* number of descriptors to apply*/
170 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
171 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
172 
173 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
174 #define MAX_RING_DESC 4096
175 
176 struct vpool {
177 	struct rte_mempool *pool;
178 	struct rte_ring *ring;
179 	uint32_t buf_size;
180 } vpool_array[MAX_QUEUES+MAX_QUEUES];
181 
182 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
183 typedef enum {
184 	VM2VM_DISABLED = 0,
185 	VM2VM_SOFTWARE = 1,
186 	VM2VM_HARDWARE = 2,
187 	VM2VM_LAST
188 } vm2vm_type;
189 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
190 
191 /* The type of host physical address translated from guest physical address. */
192 typedef enum {
193 	PHYS_ADDR_CONTINUOUS = 0,
194 	PHYS_ADDR_CROSS_SUBREG = 1,
195 	PHYS_ADDR_INVALID = 2,
196 	PHYS_ADDR_LAST
197 } hpa_type;
198 
199 /* Enable stats. */
200 static uint32_t enable_stats = 0;
201 /* Enable retries on RX. */
202 static uint32_t enable_retry = 1;
203 /* Specify timeout (in useconds) between retries on RX. */
204 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
205 /* Specify the number of retries on RX. */
206 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
207 
208 /* Character device basename. Can be set by user. */
209 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
210 
211 /* empty vmdq configuration structure. Filled in programatically */
212 static struct rte_eth_conf vmdq_conf_default = {
213 	.rxmode = {
214 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
215 		.split_hdr_size = 0,
216 		.header_split   = 0, /**< Header Split disabled */
217 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
218 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
219 		/*
220 		 * It is necessary for 1G NIC such as I350,
221 		 * this fixes bug of ipv4 forwarding in guest can't
222 		 * forward pakets from one virtio dev to another virtio dev.
223 		 */
224 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
225 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
226 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
227 	},
228 
229 	.txmode = {
230 		.mq_mode = ETH_MQ_TX_NONE,
231 	},
232 	.rx_adv_conf = {
233 		/*
234 		 * should be overridden separately in code with
235 		 * appropriate values
236 		 */
237 		.vmdq_rx_conf = {
238 			.nb_queue_pools = ETH_8_POOLS,
239 			.enable_default_pool = 0,
240 			.default_pool = 0,
241 			.nb_pool_maps = 0,
242 			.pool_map = {{0, 0},},
243 		},
244 	},
245 };
246 
247 static unsigned lcore_ids[RTE_MAX_LCORE];
248 static uint8_t ports[RTE_MAX_ETHPORTS];
249 static unsigned num_ports = 0; /**< The number of ports specified in command line */
250 static uint16_t num_pf_queues, num_vmdq_queues;
251 static uint16_t vmdq_pool_base, vmdq_queue_base;
252 static uint16_t queues_per_pool;
253 
254 static const uint16_t external_pkt_default_vlan_tag = 2000;
255 const uint16_t vlan_tags[] = {
256 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
257 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
258 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
259 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
260 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
261 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
262 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
263 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
264 };
265 
266 /* ethernet addresses of ports */
267 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
268 
269 /* heads for the main used and free linked lists for the data path. */
270 static struct virtio_net_data_ll *ll_root_used = NULL;
271 static struct virtio_net_data_ll *ll_root_free = NULL;
272 
273 /* Array of data core structures containing information on individual core linked lists. */
274 static struct lcore_info lcore_info[RTE_MAX_LCORE];
275 
276 /* Used for queueing bursts of TX packets. */
277 struct mbuf_table {
278 	unsigned len;
279 	unsigned txq_id;
280 	struct rte_mbuf *m_table[MAX_PKT_BURST];
281 };
282 
283 /* TX queue for each data core. */
284 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
285 
286 /* TX queue fori each virtio device for zero copy. */
287 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
288 
289 /* Vlan header struct used to insert vlan tags on TX. */
290 struct vlan_ethhdr {
291 	unsigned char   h_dest[ETH_ALEN];
292 	unsigned char   h_source[ETH_ALEN];
293 	__be16          h_vlan_proto;
294 	__be16          h_vlan_TCI;
295 	__be16          h_vlan_encapsulated_proto;
296 };
297 
298 /* Header lengths. */
299 #define VLAN_HLEN       4
300 #define VLAN_ETH_HLEN   18
301 
302 /* Per-device statistics struct */
303 struct device_statistics {
304 	uint64_t tx_total;
305 	rte_atomic64_t rx_total_atomic;
306 	uint64_t rx_total;
307 	uint64_t tx;
308 	rte_atomic64_t rx_atomic;
309 	uint64_t rx;
310 } __rte_cache_aligned;
311 struct device_statistics dev_statistics[MAX_DEVICES];
312 
313 /*
314  * Builds up the correct configuration for VMDQ VLAN pool map
315  * according to the pool & queue limits.
316  */
317 static inline int
318 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
319 {
320 	struct rte_eth_vmdq_rx_conf conf;
321 	struct rte_eth_vmdq_rx_conf *def_conf =
322 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
323 	unsigned i;
324 
325 	memset(&conf, 0, sizeof(conf));
326 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
327 	conf.nb_pool_maps = num_devices;
328 	conf.enable_loop_back = def_conf->enable_loop_back;
329 	conf.rx_mode = def_conf->rx_mode;
330 
331 	for (i = 0; i < conf.nb_pool_maps; i++) {
332 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
333 		conf.pool_map[i].pools = (1UL << i);
334 	}
335 
336 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
337 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
338 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
339 	return 0;
340 }
341 
342 /*
343  * Validate the device number according to the max pool number gotten form
344  * dev_info. If the device number is invalid, give the error message and
345  * return -1. Each device must have its own pool.
346  */
347 static inline int
348 validate_num_devices(uint32_t max_nb_devices)
349 {
350 	if (num_devices > max_nb_devices) {
351 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
352 		return -1;
353 	}
354 	return 0;
355 }
356 
357 /*
358  * Initialises a given port using global settings and with the rx buffers
359  * coming from the mbuf_pool passed as parameter
360  */
361 static inline int
362 port_init(uint8_t port)
363 {
364 	struct rte_eth_dev_info dev_info;
365 	struct rte_eth_conf port_conf;
366 	struct rte_eth_rxconf *rxconf;
367 	struct rte_eth_txconf *txconf;
368 	int16_t rx_rings, tx_rings;
369 	uint16_t rx_ring_size, tx_ring_size;
370 	int retval;
371 	uint16_t q;
372 
373 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
374 	rte_eth_dev_info_get (port, &dev_info);
375 
376 	if (dev_info.max_rx_queues > MAX_QUEUES) {
377 		rte_exit(EXIT_FAILURE,
378 			"please define MAX_QUEUES no less than %u in %s\n",
379 			dev_info.max_rx_queues, __FILE__);
380 	}
381 
382 	rxconf = &dev_info.default_rxconf;
383 	txconf = &dev_info.default_txconf;
384 	rxconf->rx_drop_en = 1;
385 
386 	/* Enable vlan offload */
387 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
388 
389 	/*
390 	 * Zero copy defers queue RX/TX start to the time when guest
391 	 * finishes its startup and packet buffers from that guest are
392 	 * available.
393 	 */
394 	if (zero_copy) {
395 		rxconf->rx_deferred_start = 1;
396 		rxconf->rx_drop_en = 0;
397 		txconf->tx_deferred_start = 1;
398 	}
399 
400 	/*configure the number of supported virtio devices based on VMDQ limits */
401 	num_devices = dev_info.max_vmdq_pools;
402 
403 	if (zero_copy) {
404 		rx_ring_size = num_rx_descriptor;
405 		tx_ring_size = num_tx_descriptor;
406 		tx_rings = dev_info.max_tx_queues;
407 	} else {
408 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
409 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
410 		tx_rings = (uint16_t)rte_lcore_count();
411 	}
412 
413 	retval = validate_num_devices(MAX_DEVICES);
414 	if (retval < 0)
415 		return retval;
416 
417 	/* Get port configuration. */
418 	retval = get_eth_conf(&port_conf, num_devices);
419 	if (retval < 0)
420 		return retval;
421 	/* NIC queues are divided into pf queues and vmdq queues.  */
422 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
423 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
424 	num_vmdq_queues = num_devices * queues_per_pool;
425 	num_queues = num_pf_queues + num_vmdq_queues;
426 	vmdq_queue_base = dev_info.vmdq_queue_base;
427 	vmdq_pool_base  = dev_info.vmdq_pool_base;
428 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
429 		num_pf_queues, num_devices, queues_per_pool);
430 
431 	if (port >= rte_eth_dev_count()) return -1;
432 
433 	rx_rings = (uint16_t)dev_info.max_rx_queues;
434 	/* Configure ethernet device. */
435 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
436 	if (retval != 0)
437 		return retval;
438 
439 	/* Setup the queues. */
440 	for (q = 0; q < rx_rings; q ++) {
441 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
442 						rte_eth_dev_socket_id(port),
443 						rxconf,
444 						vpool_array[q].pool);
445 		if (retval < 0)
446 			return retval;
447 	}
448 	for (q = 0; q < tx_rings; q ++) {
449 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
450 						rte_eth_dev_socket_id(port),
451 						txconf);
452 		if (retval < 0)
453 			return retval;
454 	}
455 
456 	/* Start the device. */
457 	retval  = rte_eth_dev_start(port);
458 	if (retval < 0) {
459 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
460 		return retval;
461 	}
462 
463 	if (promiscuous)
464 		rte_eth_promiscuous_enable(port);
465 
466 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
467 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
468 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
469 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
470 			(unsigned)port,
471 			vmdq_ports_eth_addr[port].addr_bytes[0],
472 			vmdq_ports_eth_addr[port].addr_bytes[1],
473 			vmdq_ports_eth_addr[port].addr_bytes[2],
474 			vmdq_ports_eth_addr[port].addr_bytes[3],
475 			vmdq_ports_eth_addr[port].addr_bytes[4],
476 			vmdq_ports_eth_addr[port].addr_bytes[5]);
477 
478 	return 0;
479 }
480 
481 /*
482  * Set character device basename.
483  */
484 static int
485 us_vhost_parse_basename(const char *q_arg)
486 {
487 	/* parse number string */
488 
489 	if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
490 		return -1;
491 	else
492 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
493 
494 	return 0;
495 }
496 
497 /*
498  * Parse the portmask provided at run time.
499  */
500 static int
501 parse_portmask(const char *portmask)
502 {
503 	char *end = NULL;
504 	unsigned long pm;
505 
506 	errno = 0;
507 
508 	/* parse hexadecimal string */
509 	pm = strtoul(portmask, &end, 16);
510 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
511 		return -1;
512 
513 	if (pm == 0)
514 		return -1;
515 
516 	return pm;
517 
518 }
519 
520 /*
521  * Parse num options at run time.
522  */
523 static int
524 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
525 {
526 	char *end = NULL;
527 	unsigned long num;
528 
529 	errno = 0;
530 
531 	/* parse unsigned int string */
532 	num = strtoul(q_arg, &end, 10);
533 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
534 		return -1;
535 
536 	if (num > max_valid_value)
537 		return -1;
538 
539 	return num;
540 
541 }
542 
543 /*
544  * Display usage
545  */
546 static void
547 us_vhost_usage(const char *prgname)
548 {
549 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
550 	"		--vm2vm [0|1|2]\n"
551 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
552 	"		--dev-basename <name>\n"
553 	"		--nb-devices ND\n"
554 	"		-p PORTMASK: Set mask for ports to be used by application\n"
555 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
556 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
557 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
558 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
559 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
560 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
561 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
562 	"		--dev-basename: The basename to be used for the character device.\n"
563 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
564 			"zero copy\n"
565 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
566 			"used only when zero copy is enabled.\n"
567 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
568 			"used only when zero copy is enabled.\n",
569 	       prgname);
570 }
571 
572 /*
573  * Parse the arguments given in the command line of the application.
574  */
575 static int
576 us_vhost_parse_args(int argc, char **argv)
577 {
578 	int opt, ret;
579 	int option_index;
580 	unsigned i;
581 	const char *prgname = argv[0];
582 	static struct option long_option[] = {
583 		{"vm2vm", required_argument, NULL, 0},
584 		{"rx-retry", required_argument, NULL, 0},
585 		{"rx-retry-delay", required_argument, NULL, 0},
586 		{"rx-retry-num", required_argument, NULL, 0},
587 		{"mergeable", required_argument, NULL, 0},
588 		{"vlan-strip", required_argument, NULL, 0},
589 		{"stats", required_argument, NULL, 0},
590 		{"dev-basename", required_argument, NULL, 0},
591 		{"zero-copy", required_argument, NULL, 0},
592 		{"rx-desc-num", required_argument, NULL, 0},
593 		{"tx-desc-num", required_argument, NULL, 0},
594 		{NULL, 0, 0, 0},
595 	};
596 
597 	/* Parse command line */
598 	while ((opt = getopt_long(argc, argv, "p:P",
599 			long_option, &option_index)) != EOF) {
600 		switch (opt) {
601 		/* Portmask */
602 		case 'p':
603 			enabled_port_mask = parse_portmask(optarg);
604 			if (enabled_port_mask == 0) {
605 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
606 				us_vhost_usage(prgname);
607 				return -1;
608 			}
609 			break;
610 
611 		case 'P':
612 			promiscuous = 1;
613 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
614 				ETH_VMDQ_ACCEPT_BROADCAST |
615 				ETH_VMDQ_ACCEPT_MULTICAST;
616 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
617 
618 			break;
619 
620 		case 0:
621 			/* Enable/disable vm2vm comms. */
622 			if (!strncmp(long_option[option_index].name, "vm2vm",
623 				MAX_LONG_OPT_SZ)) {
624 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
625 				if (ret == -1) {
626 					RTE_LOG(INFO, VHOST_CONFIG,
627 						"Invalid argument for "
628 						"vm2vm [0|1|2]\n");
629 					us_vhost_usage(prgname);
630 					return -1;
631 				} else {
632 					vm2vm_mode = (vm2vm_type)ret;
633 				}
634 			}
635 
636 			/* Enable/disable retries on RX. */
637 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
638 				ret = parse_num_opt(optarg, 1);
639 				if (ret == -1) {
640 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
641 					us_vhost_usage(prgname);
642 					return -1;
643 				} else {
644 					enable_retry = ret;
645 				}
646 			}
647 
648 			/* Specify the retries delay time (in useconds) on RX. */
649 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
650 				ret = parse_num_opt(optarg, INT32_MAX);
651 				if (ret == -1) {
652 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
653 					us_vhost_usage(prgname);
654 					return -1;
655 				} else {
656 					burst_rx_delay_time = ret;
657 				}
658 			}
659 
660 			/* Specify the retries number on RX. */
661 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
662 				ret = parse_num_opt(optarg, INT32_MAX);
663 				if (ret == -1) {
664 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
665 					us_vhost_usage(prgname);
666 					return -1;
667 				} else {
668 					burst_rx_retry_num = ret;
669 				}
670 			}
671 
672 			/* Enable/disable RX mergeable buffers. */
673 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
674 				ret = parse_num_opt(optarg, 1);
675 				if (ret == -1) {
676 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
677 					us_vhost_usage(prgname);
678 					return -1;
679 				} else {
680 					mergeable = !!ret;
681 					if (ret) {
682 						vmdq_conf_default.rxmode.jumbo_frame = 1;
683 						vmdq_conf_default.rxmode.max_rx_pkt_len
684 							= JUMBO_FRAME_MAX_SIZE;
685 					}
686 				}
687 			}
688 
689 			/* Enable/disable RX VLAN strip on host. */
690 			if (!strncmp(long_option[option_index].name,
691 				"vlan-strip", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, 1);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG,
695 						"Invalid argument for VLAN strip [0|1]\n");
696 					us_vhost_usage(prgname);
697 					return -1;
698 				} else {
699 					vlan_strip = !!ret;
700 					vmdq_conf_default.rxmode.hw_vlan_strip =
701 						vlan_strip;
702 				}
703 			}
704 
705 			/* Enable/disable stats. */
706 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
707 				ret = parse_num_opt(optarg, INT32_MAX);
708 				if (ret == -1) {
709 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
710 					us_vhost_usage(prgname);
711 					return -1;
712 				} else {
713 					enable_stats = ret;
714 				}
715 			}
716 
717 			/* Set character device basename. */
718 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
719 				if (us_vhost_parse_basename(optarg) == -1) {
720 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
721 					us_vhost_usage(prgname);
722 					return -1;
723 				}
724 			}
725 
726 			/* Enable/disable rx/tx zero copy. */
727 			if (!strncmp(long_option[option_index].name,
728 				"zero-copy", MAX_LONG_OPT_SZ)) {
729 				ret = parse_num_opt(optarg, 1);
730 				if (ret == -1) {
731 					RTE_LOG(INFO, VHOST_CONFIG,
732 						"Invalid argument"
733 						" for zero-copy [0|1]\n");
734 					us_vhost_usage(prgname);
735 					return -1;
736 				} else
737 					zero_copy = ret;
738 			}
739 
740 			/* Specify the descriptor number on RX. */
741 			if (!strncmp(long_option[option_index].name,
742 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
743 				ret = parse_num_opt(optarg, MAX_RING_DESC);
744 				if ((ret == -1) || (!POWEROF2(ret))) {
745 					RTE_LOG(INFO, VHOST_CONFIG,
746 					"Invalid argument for rx-desc-num[0-N],"
747 					"power of 2 required.\n");
748 					us_vhost_usage(prgname);
749 					return -1;
750 				} else {
751 					num_rx_descriptor = ret;
752 				}
753 			}
754 
755 			/* Specify the descriptor number on TX. */
756 			if (!strncmp(long_option[option_index].name,
757 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
758 				ret = parse_num_opt(optarg, MAX_RING_DESC);
759 				if ((ret == -1) || (!POWEROF2(ret))) {
760 					RTE_LOG(INFO, VHOST_CONFIG,
761 					"Invalid argument for tx-desc-num [0-N],"
762 					"power of 2 required.\n");
763 					us_vhost_usage(prgname);
764 					return -1;
765 				} else {
766 					num_tx_descriptor = ret;
767 				}
768 			}
769 
770 			break;
771 
772 			/* Invalid option - print options. */
773 		default:
774 			us_vhost_usage(prgname);
775 			return -1;
776 		}
777 	}
778 
779 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
780 		if (enabled_port_mask & (1 << i))
781 			ports[num_ports++] = (uint8_t)i;
782 	}
783 
784 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
785 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
786 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
787 		return -1;
788 	}
789 
790 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
791 		RTE_LOG(INFO, VHOST_PORT,
792 			"Vhost zero copy doesn't support software vm2vm,"
793 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
794 		return -1;
795 	}
796 
797 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
798 		RTE_LOG(INFO, VHOST_PORT,
799 			"Vhost zero copy doesn't support jumbo frame,"
800 			"please specify '--mergeable 0' to disable the "
801 			"mergeable feature.\n");
802 		return -1;
803 	}
804 
805 	return 0;
806 }
807 
808 /*
809  * Update the global var NUM_PORTS and array PORTS according to system ports number
810  * and return valid ports number
811  */
812 static unsigned check_ports_num(unsigned nb_ports)
813 {
814 	unsigned valid_num_ports = num_ports;
815 	unsigned portid;
816 
817 	if (num_ports > nb_ports) {
818 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
819 			num_ports, nb_ports);
820 		num_ports = nb_ports;
821 	}
822 
823 	for (portid = 0; portid < num_ports; portid ++) {
824 		if (ports[portid] >= nb_ports) {
825 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
826 				ports[portid], (nb_ports - 1));
827 			ports[portid] = INVALID_PORT_ID;
828 			valid_num_ports--;
829 		}
830 	}
831 	return valid_num_ports;
832 }
833 
834 /*
835  * Macro to print out packet contents. Wrapped in debug define so that the
836  * data path is not effected when debug is disabled.
837  */
838 #ifdef DEBUG
839 #define PRINT_PACKET(device, addr, size, header) do {																\
840 	char *pkt_addr = (char*)(addr);																					\
841 	unsigned int index;																								\
842 	char packet[MAX_PRINT_BUFF];																					\
843 																													\
844 	if ((header))																									\
845 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
846 	else																											\
847 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
848 	for (index = 0; index < (size); index++) {																		\
849 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
850 			"%02hhx ", pkt_addr[index]);																			\
851 	}																												\
852 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
853 																													\
854 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
855 } while(0)
856 #else
857 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
858 #endif
859 
860 /*
861  * Function to convert guest physical addresses to vhost physical addresses.
862  * This is used to convert virtio buffer addresses.
863  */
864 static inline uint64_t __attribute__((always_inline))
865 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
866 	uint32_t buf_len, hpa_type *addr_type)
867 {
868 	struct virtio_memory_regions_hpa *region;
869 	uint32_t regionidx;
870 	uint64_t vhost_pa = 0;
871 
872 	*addr_type = PHYS_ADDR_INVALID;
873 
874 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
875 		region = &vdev->regions_hpa[regionidx];
876 		if ((guest_pa >= region->guest_phys_address) &&
877 			(guest_pa <= region->guest_phys_address_end)) {
878 			vhost_pa = region->host_phys_addr_offset + guest_pa;
879 			if (likely((guest_pa + buf_len - 1)
880 				<= region->guest_phys_address_end))
881 				*addr_type = PHYS_ADDR_CONTINUOUS;
882 			else
883 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
884 			break;
885 		}
886 	}
887 
888 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
889 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
890 		(void *)(uintptr_t)vhost_pa);
891 
892 	return vhost_pa;
893 }
894 
895 /*
896  * Compares a packet destination MAC address to a device MAC address.
897  */
898 static inline int __attribute__((always_inline))
899 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
900 {
901 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
902 }
903 
904 /*
905  * This function learns the MAC address of the device and registers this along with a
906  * vlan tag to a VMDQ.
907  */
908 static int
909 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
910 {
911 	struct ether_hdr *pkt_hdr;
912 	struct virtio_net_data_ll *dev_ll;
913 	struct virtio_net *dev = vdev->dev;
914 	int i, ret;
915 
916 	/* Learn MAC address of guest device from packet */
917 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
918 
919 	dev_ll = ll_root_used;
920 
921 	while (dev_ll != NULL) {
922 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
923 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
924 			return -1;
925 		}
926 		dev_ll = dev_ll->next;
927 	}
928 
929 	for (i = 0; i < ETHER_ADDR_LEN; i++)
930 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
931 
932 	/* vlan_tag currently uses the device_id. */
933 	vdev->vlan_tag = vlan_tags[dev->device_fh];
934 
935 	/* Print out VMDQ registration info. */
936 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
937 		dev->device_fh,
938 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
939 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
940 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
941 		vdev->vlan_tag);
942 
943 	/* Register the MAC address. */
944 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
945 				(uint32_t)dev->device_fh + vmdq_pool_base);
946 	if (ret)
947 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
948 					dev->device_fh);
949 
950 	/* Enable stripping of the vlan tag as we handle routing. */
951 	if (vlan_strip)
952 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
953 			(uint16_t)vdev->vmdq_rx_q, 1);
954 
955 	/* Set device as ready for RX. */
956 	vdev->ready = DEVICE_RX;
957 
958 	return 0;
959 }
960 
961 /*
962  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
963  * queue before disabling RX on the device.
964  */
965 static inline void
966 unlink_vmdq(struct vhost_dev *vdev)
967 {
968 	unsigned i = 0;
969 	unsigned rx_count;
970 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
971 
972 	if (vdev->ready == DEVICE_RX) {
973 		/*clear MAC and VLAN settings*/
974 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
975 		for (i = 0; i < 6; i++)
976 			vdev->mac_address.addr_bytes[i] = 0;
977 
978 		vdev->vlan_tag = 0;
979 
980 		/*Clear out the receive buffers*/
981 		rx_count = rte_eth_rx_burst(ports[0],
982 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
983 
984 		while (rx_count) {
985 			for (i = 0; i < rx_count; i++)
986 				rte_pktmbuf_free(pkts_burst[i]);
987 
988 			rx_count = rte_eth_rx_burst(ports[0],
989 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
990 		}
991 
992 		vdev->ready = DEVICE_MAC_LEARNING;
993 	}
994 }
995 
996 /*
997  * Check if the packet destination MAC address is for a local device. If so then put
998  * the packet on that devices RX queue. If not then return.
999  */
1000 static inline int __attribute__((always_inline))
1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1002 {
1003 	struct virtio_net_data_ll *dev_ll;
1004 	struct ether_hdr *pkt_hdr;
1005 	uint64_t ret = 0;
1006 	struct virtio_net *dev = vdev->dev;
1007 	struct virtio_net *tdev; /* destination virito device */
1008 
1009 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1010 
1011 	/*get the used devices list*/
1012 	dev_ll = ll_root_used;
1013 
1014 	while (dev_ll != NULL) {
1015 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1016 				          &dev_ll->vdev->mac_address)) {
1017 
1018 			/* Drop the packet if the TX packet is destined for the TX device. */
1019 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1020 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1021 							dev->device_fh);
1022 				return 0;
1023 			}
1024 			tdev = dev_ll->vdev->dev;
1025 
1026 
1027 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1028 
1029 			if (unlikely(dev_ll->vdev->remove)) {
1030 				/*drop the packet if the device is marked for removal*/
1031 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1032 			} else {
1033 				/*send the packet to the local virtio device*/
1034 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1035 				if (enable_stats) {
1036 					rte_atomic64_add(
1037 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1038 					1);
1039 					rte_atomic64_add(
1040 					&dev_statistics[tdev->device_fh].rx_atomic,
1041 					ret);
1042 					dev_statistics[dev->device_fh].tx_total++;
1043 					dev_statistics[dev->device_fh].tx += ret;
1044 				}
1045 			}
1046 
1047 			return 0;
1048 		}
1049 		dev_ll = dev_ll->next;
1050 	}
1051 
1052 	return -1;
1053 }
1054 
1055 /*
1056  * Check if the destination MAC of a packet is one local VM,
1057  * and get its vlan tag, and offset if it is.
1058  */
1059 static inline int __attribute__((always_inline))
1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1061 	uint32_t *offset, uint16_t *vlan_tag)
1062 {
1063 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1064 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1065 
1066 	while (dev_ll != NULL) {
1067 		if ((dev_ll->vdev->ready == DEVICE_RX)
1068 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1069 		&dev_ll->vdev->mac_address)) {
1070 			/*
1071 			 * Drop the packet if the TX packet is
1072 			 * destined for the TX device.
1073 			 */
1074 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1075 				LOG_DEBUG(VHOST_DATA,
1076 				"(%"PRIu64") TX: Source and destination"
1077 				" MAC addresses are the same. Dropping "
1078 				"packet.\n",
1079 				dev_ll->vdev->dev->device_fh);
1080 				return -1;
1081 			}
1082 
1083 			/*
1084 			 * HW vlan strip will reduce the packet length
1085 			 * by minus length of vlan tag, so need restore
1086 			 * the packet length by plus it.
1087 			 */
1088 			*offset = VLAN_HLEN;
1089 			*vlan_tag =
1090 			(uint16_t)
1091 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1092 
1093 			LOG_DEBUG(VHOST_DATA,
1094 			"(%"PRIu64") TX: pkt to local VM device id:"
1095 			"(%"PRIu64") vlan tag: %d.\n",
1096 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1097 			(int)*vlan_tag);
1098 
1099 			break;
1100 		}
1101 		dev_ll = dev_ll->next;
1102 	}
1103 	return 0;
1104 }
1105 
1106 /*
1107  * This function routes the TX packet to the correct interface. This may be a local device
1108  * or the physical port.
1109  */
1110 static inline void __attribute__((always_inline))
1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1112 {
1113 	struct mbuf_table *tx_q;
1114 	struct rte_mbuf **m_table;
1115 	unsigned len, ret, offset = 0;
1116 	const uint16_t lcore_id = rte_lcore_id();
1117 	struct virtio_net *dev = vdev->dev;
1118 	struct ether_hdr *nh;
1119 
1120 	/*check if destination is local VM*/
1121 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1122 		rte_pktmbuf_free(m);
1123 		return;
1124 	}
1125 
1126 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1127 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1128 			rte_pktmbuf_free(m);
1129 			return;
1130 		}
1131 	}
1132 
1133 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1134 
1135 	/*Add packet to the port tx queue*/
1136 	tx_q = &lcore_tx_queue[lcore_id];
1137 	len = tx_q->len;
1138 
1139 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1140 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1141 		/* Guest has inserted the vlan tag. */
1142 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1143 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1144 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1145 			(vh->vlan_tci != vlan_tag_be))
1146 			vh->vlan_tci = vlan_tag_be;
1147 	} else {
1148 		m->ol_flags = PKT_TX_VLAN_PKT;
1149 
1150 		/*
1151 		 * Find the right seg to adjust the data len when offset is
1152 		 * bigger than tail room size.
1153 		 */
1154 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1155 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1156 				m->data_len += offset;
1157 			else {
1158 				struct rte_mbuf *seg = m;
1159 
1160 				while ((seg->next != NULL) &&
1161 					(offset > rte_pktmbuf_tailroom(seg)))
1162 					seg = seg->next;
1163 
1164 				seg->data_len += offset;
1165 			}
1166 			m->pkt_len += offset;
1167 		}
1168 
1169 		m->vlan_tci = vlan_tag;
1170 	}
1171 
1172 	tx_q->m_table[len] = m;
1173 	len++;
1174 	if (enable_stats) {
1175 		dev_statistics[dev->device_fh].tx_total++;
1176 		dev_statistics[dev->device_fh].tx++;
1177 	}
1178 
1179 	if (unlikely(len == MAX_PKT_BURST)) {
1180 		m_table = (struct rte_mbuf **)tx_q->m_table;
1181 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1182 		/* Free any buffers not handled by TX and update the port stats. */
1183 		if (unlikely(ret < len)) {
1184 			do {
1185 				rte_pktmbuf_free(m_table[ret]);
1186 			} while (++ret < len);
1187 		}
1188 
1189 		len = 0;
1190 	}
1191 
1192 	tx_q->len = len;
1193 	return;
1194 }
1195 /*
1196  * This function is called by each data core. It handles all RX/TX registered with the
1197  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1198  * with all devices in the main linked list.
1199  */
1200 static int
1201 switch_worker(__attribute__((unused)) void *arg)
1202 {
1203 	struct rte_mempool *mbuf_pool = arg;
1204 	struct virtio_net *dev = NULL;
1205 	struct vhost_dev *vdev = NULL;
1206 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1207 	struct virtio_net_data_ll *dev_ll;
1208 	struct mbuf_table *tx_q;
1209 	volatile struct lcore_ll_info *lcore_ll;
1210 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1211 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1212 	unsigned ret, i;
1213 	const uint16_t lcore_id = rte_lcore_id();
1214 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1215 	uint16_t rx_count = 0;
1216 	uint16_t tx_count;
1217 	uint32_t retry = 0;
1218 
1219 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1220 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1221 	prev_tsc = 0;
1222 
1223 	tx_q = &lcore_tx_queue[lcore_id];
1224 	for (i = 0; i < num_cores; i ++) {
1225 		if (lcore_ids[i] == lcore_id) {
1226 			tx_q->txq_id = i;
1227 			break;
1228 		}
1229 	}
1230 
1231 	while(1) {
1232 		cur_tsc = rte_rdtsc();
1233 		/*
1234 		 * TX burst queue drain
1235 		 */
1236 		diff_tsc = cur_tsc - prev_tsc;
1237 		if (unlikely(diff_tsc > drain_tsc)) {
1238 
1239 			if (tx_q->len) {
1240 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1241 
1242 				/*Tx any packets in the queue*/
1243 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1244 									   (struct rte_mbuf **)tx_q->m_table,
1245 									   (uint16_t)tx_q->len);
1246 				if (unlikely(ret < tx_q->len)) {
1247 					do {
1248 						rte_pktmbuf_free(tx_q->m_table[ret]);
1249 					} while (++ret < tx_q->len);
1250 				}
1251 
1252 				tx_q->len = 0;
1253 			}
1254 
1255 			prev_tsc = cur_tsc;
1256 
1257 		}
1258 
1259 		rte_prefetch0(lcore_ll->ll_root_used);
1260 		/*
1261 		 * Inform the configuration core that we have exited the linked list and that no devices are
1262 		 * in use if requested.
1263 		 */
1264 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1265 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1266 
1267 		/*
1268 		 * Process devices
1269 		 */
1270 		dev_ll = lcore_ll->ll_root_used;
1271 
1272 		while (dev_ll != NULL) {
1273 			/*get virtio device ID*/
1274 			vdev = dev_ll->vdev;
1275 			dev = vdev->dev;
1276 
1277 			if (unlikely(vdev->remove)) {
1278 				dev_ll = dev_ll->next;
1279 				unlink_vmdq(vdev);
1280 				vdev->ready = DEVICE_SAFE_REMOVE;
1281 				continue;
1282 			}
1283 			if (likely(vdev->ready == DEVICE_RX)) {
1284 				/*Handle guest RX*/
1285 				rx_count = rte_eth_rx_burst(ports[0],
1286 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1287 
1288 				if (rx_count) {
1289 					/*
1290 					* Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1291 					* Here MAX_PKT_BURST must be less than virtio queue size
1292 					*/
1293 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1294 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1295 							rte_delay_us(burst_rx_delay_time);
1296 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1297 								break;
1298 						}
1299 					}
1300 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1301 					if (enable_stats) {
1302 						rte_atomic64_add(
1303 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1304 						rx_count);
1305 						rte_atomic64_add(
1306 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1307 					}
1308 					while (likely(rx_count)) {
1309 						rx_count--;
1310 						rte_pktmbuf_free(pkts_burst[rx_count]);
1311 					}
1312 
1313 				}
1314 			}
1315 
1316 			if (likely(!vdev->remove)) {
1317 				/* Handle guest TX*/
1318 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1319 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1320 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1321 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1322 						while (tx_count)
1323 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1324 					}
1325 				}
1326 				while (tx_count)
1327 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1328 			}
1329 
1330 			/*move to the next device in the list*/
1331 			dev_ll = dev_ll->next;
1332 		}
1333 	}
1334 
1335 	return 0;
1336 }
1337 
1338 /*
1339  * This function gets available ring number for zero copy rx.
1340  * Only one thread will call this funciton for a paticular virtio device,
1341  * so, it is designed as non-thread-safe function.
1342  */
1343 static inline uint32_t __attribute__((always_inline))
1344 get_available_ring_num_zcp(struct virtio_net *dev)
1345 {
1346 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1347 	uint16_t avail_idx;
1348 
1349 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1350 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1351 }
1352 
1353 /*
1354  * This function gets available ring index for zero copy rx,
1355  * it will retry 'burst_rx_retry_num' times till it get enough ring index.
1356  * Only one thread will call this funciton for a paticular virtio device,
1357  * so, it is designed as non-thread-safe function.
1358  */
1359 static inline uint32_t __attribute__((always_inline))
1360 get_available_ring_index_zcp(struct virtio_net *dev,
1361 	uint16_t *res_base_idx, uint32_t count)
1362 {
1363 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1364 	uint16_t avail_idx;
1365 	uint32_t retry = 0;
1366 	uint16_t free_entries;
1367 
1368 	*res_base_idx = vq->last_used_idx_res;
1369 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1370 	free_entries = (avail_idx - *res_base_idx);
1371 
1372 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1373 			"avail idx: %d, "
1374 			"res base idx:%d, free entries:%d\n",
1375 			dev->device_fh, avail_idx, *res_base_idx,
1376 			free_entries);
1377 
1378 	/*
1379 	 * If retry is enabled and the queue is full then we wait
1380 	 * and retry to avoid packet loss.
1381 	 */
1382 	if (enable_retry && unlikely(count > free_entries)) {
1383 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1384 			rte_delay_us(burst_rx_delay_time);
1385 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1386 			free_entries = (avail_idx - *res_base_idx);
1387 			if (count <= free_entries)
1388 				break;
1389 		}
1390 	}
1391 
1392 	/*check that we have enough buffers*/
1393 	if (unlikely(count > free_entries))
1394 		count = free_entries;
1395 
1396 	if (unlikely(count == 0)) {
1397 		LOG_DEBUG(VHOST_DATA,
1398 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1399 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1400 			dev->device_fh, avail_idx,
1401 			*res_base_idx, free_entries);
1402 		return 0;
1403 	}
1404 
1405 	vq->last_used_idx_res = *res_base_idx + count;
1406 
1407 	return count;
1408 }
1409 
1410 /*
1411  * This function put descriptor back to used list.
1412  */
1413 static inline void __attribute__((always_inline))
1414 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1415 {
1416 	uint16_t res_cur_idx = vq->last_used_idx;
1417 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1418 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1419 	rte_compiler_barrier();
1420 	*(volatile uint16_t *)&vq->used->idx += 1;
1421 	vq->last_used_idx += 1;
1422 
1423 	/* Kick the guest if necessary. */
1424 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1425 		eventfd_write(vq->callfd, (eventfd_t)1);
1426 }
1427 
1428 /*
1429  * This function get available descriptor from vitio vring and un-attached mbuf
1430  * from vpool->ring, and then attach them together. It needs adjust the offset
1431  * for buff_addr and phys_addr accroding to PMD implementation, otherwise the
1432  * frame data may be put to wrong location in mbuf.
1433  */
1434 static inline void __attribute__((always_inline))
1435 attach_rxmbuf_zcp(struct virtio_net *dev)
1436 {
1437 	uint16_t res_base_idx, desc_idx;
1438 	uint64_t buff_addr, phys_addr;
1439 	struct vhost_virtqueue *vq;
1440 	struct vring_desc *desc;
1441 	void *obj = NULL;
1442 	struct rte_mbuf *mbuf;
1443 	struct vpool *vpool;
1444 	hpa_type addr_type;
1445 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1446 
1447 	vpool = &vpool_array[vdev->vmdq_rx_q];
1448 	vq = dev->virtqueue[VIRTIO_RXQ];
1449 
1450 	do {
1451 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1452 				1) != 1))
1453 			return;
1454 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1455 
1456 		desc = &vq->desc[desc_idx];
1457 		if (desc->flags & VRING_DESC_F_NEXT) {
1458 			desc = &vq->desc[desc->next];
1459 			buff_addr = gpa_to_vva(dev, desc->addr);
1460 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1461 					&addr_type);
1462 		} else {
1463 			buff_addr = gpa_to_vva(dev,
1464 					desc->addr + vq->vhost_hlen);
1465 			phys_addr = gpa_to_hpa(vdev,
1466 					desc->addr + vq->vhost_hlen,
1467 					desc->len, &addr_type);
1468 		}
1469 
1470 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1471 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1472 				" address found when attaching RX frame buffer"
1473 				" address!\n", dev->device_fh);
1474 			put_desc_to_used_list_zcp(vq, desc_idx);
1475 			continue;
1476 		}
1477 
1478 		/*
1479 		 * Check if the frame buffer address from guest crosses
1480 		 * sub-region or not.
1481 		 */
1482 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1483 			RTE_LOG(ERR, VHOST_DATA,
1484 				"(%"PRIu64") Frame buffer address cross "
1485 				"sub-regioin found when attaching RX frame "
1486 				"buffer address!\n",
1487 				dev->device_fh);
1488 			put_desc_to_used_list_zcp(vq, desc_idx);
1489 			continue;
1490 		}
1491 	} while (unlikely(phys_addr == 0));
1492 
1493 	rte_ring_sc_dequeue(vpool->ring, &obj);
1494 	mbuf = obj;
1495 	if (unlikely(mbuf == NULL)) {
1496 		LOG_DEBUG(VHOST_DATA,
1497 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1498 			"ring_sc_dequeue fail.\n",
1499 			dev->device_fh);
1500 		put_desc_to_used_list_zcp(vq, desc_idx);
1501 		return;
1502 	}
1503 
1504 	if (unlikely(vpool->buf_size > desc->len)) {
1505 		LOG_DEBUG(VHOST_DATA,
1506 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1507 			"length(%d) of descriptor idx: %d less than room "
1508 			"size required: %d\n",
1509 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1510 		put_desc_to_used_list_zcp(vq, desc_idx);
1511 		rte_ring_sp_enqueue(vpool->ring, obj);
1512 		return;
1513 	}
1514 
1515 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1516 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1517 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1518 	mbuf->data_len = desc->len;
1519 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1520 
1521 	LOG_DEBUG(VHOST_DATA,
1522 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1523 		"descriptor idx:%d\n",
1524 		dev->device_fh, res_base_idx, desc_idx);
1525 
1526 	__rte_mbuf_raw_free(mbuf);
1527 
1528 	return;
1529 }
1530 
1531 /*
1532  * Detach an attched packet mbuf -
1533  *  - restore original mbuf address and length values.
1534  *  - reset pktmbuf data and data_len to their default values.
1535  *  All other fields of the given packet mbuf will be left intact.
1536  *
1537  * @param m
1538  *   The attached packet mbuf.
1539  */
1540 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1541 {
1542 	const struct rte_mempool *mp = m->pool;
1543 	void *buf = rte_mbuf_to_baddr(m);
1544 	uint32_t buf_ofs;
1545 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1546 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1547 
1548 	m->buf_addr = buf;
1549 	m->buf_len = (uint16_t)buf_len;
1550 
1551 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1552 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1553 	m->data_off = buf_ofs;
1554 
1555 	m->data_len = 0;
1556 }
1557 
1558 /*
1559  * This function is called after packets have been transimited. It fetchs mbuf
1560  * from vpool->pool, detached it and put into vpool->ring. It also update the
1561  * used index and kick the guest if necessary.
1562  */
1563 static inline uint32_t __attribute__((always_inline))
1564 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1565 {
1566 	struct rte_mbuf *mbuf;
1567 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1568 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1569 	uint32_t index = 0;
1570 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1571 
1572 	LOG_DEBUG(VHOST_DATA,
1573 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1574 		"clean is: %d\n",
1575 		dev->device_fh, mbuf_count);
1576 	LOG_DEBUG(VHOST_DATA,
1577 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1578 		"clean  is : %d\n",
1579 		dev->device_fh, rte_ring_count(vpool->ring));
1580 
1581 	for (index = 0; index < mbuf_count; index++) {
1582 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583 		if (likely(MBUF_EXT_MEM(mbuf)))
1584 			pktmbuf_detach_zcp(mbuf);
1585 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1586 
1587 		/* Update used index buffer information. */
1588 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1589 		vq->used->ring[used_idx].len = 0;
1590 
1591 		used_idx = (used_idx + 1) & (vq->size - 1);
1592 	}
1593 
1594 	LOG_DEBUG(VHOST_DATA,
1595 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1596 		"clean is: %d\n",
1597 		dev->device_fh, rte_mempool_count(vpool->pool));
1598 	LOG_DEBUG(VHOST_DATA,
1599 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1600 		"clean  is : %d\n",
1601 		dev->device_fh, rte_ring_count(vpool->ring));
1602 	LOG_DEBUG(VHOST_DATA,
1603 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1604 		"vq->last_used_idx:%d\n",
1605 		dev->device_fh, vq->last_used_idx);
1606 
1607 	vq->last_used_idx += mbuf_count;
1608 
1609 	LOG_DEBUG(VHOST_DATA,
1610 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1611 		"vq->last_used_idx:%d\n",
1612 		dev->device_fh, vq->last_used_idx);
1613 
1614 	rte_compiler_barrier();
1615 
1616 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1617 
1618 	/* Kick guest if required. */
1619 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1620 		eventfd_write(vq->callfd, (eventfd_t)1);
1621 
1622 	return 0;
1623 }
1624 
1625 /*
1626  * This function is called when a virtio device is destroy.
1627  * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring.
1628  */
1629 static void mbuf_destroy_zcp(struct vpool *vpool)
1630 {
1631 	struct rte_mbuf *mbuf = NULL;
1632 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1633 
1634 	LOG_DEBUG(VHOST_CONFIG,
1635 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1636 		"mbuf_destroy_zcp is: %d\n",
1637 		mbuf_count);
1638 	LOG_DEBUG(VHOST_CONFIG,
1639 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1640 		"mbuf_destroy_zcp  is : %d\n",
1641 		rte_ring_count(vpool->ring));
1642 
1643 	for (index = 0; index < mbuf_count; index++) {
1644 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1645 		if (likely(mbuf != NULL)) {
1646 			if (likely(MBUF_EXT_MEM(mbuf)))
1647 				pktmbuf_detach_zcp(mbuf);
1648 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1649 		}
1650 	}
1651 
1652 	LOG_DEBUG(VHOST_CONFIG,
1653 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1654 		"mbuf_destroy_zcp is: %d\n",
1655 		rte_mempool_count(vpool->pool));
1656 	LOG_DEBUG(VHOST_CONFIG,
1657 		"in mbuf_destroy_zcp: mbuf count in ring after "
1658 		"mbuf_destroy_zcp is : %d\n",
1659 		rte_ring_count(vpool->ring));
1660 }
1661 
1662 /*
1663  * This function update the use flag and counter.
1664  */
1665 static inline uint32_t __attribute__((always_inline))
1666 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1667 	uint32_t count)
1668 {
1669 	struct vhost_virtqueue *vq;
1670 	struct vring_desc *desc;
1671 	struct rte_mbuf *buff;
1672 	/* The virtio_hdr is initialised to 0. */
1673 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1674 		= {{0, 0, 0, 0, 0, 0}, 0};
1675 	uint64_t buff_hdr_addr = 0;
1676 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1677 	uint32_t head_idx, packet_success = 0;
1678 	uint16_t res_cur_idx;
1679 
1680 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1681 
1682 	if (count == 0)
1683 		return 0;
1684 
1685 	vq = dev->virtqueue[VIRTIO_RXQ];
1686 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1687 
1688 	res_cur_idx = vq->last_used_idx;
1689 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1690 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1691 
1692 	/* Retrieve all of the head indexes first to avoid caching issues. */
1693 	for (head_idx = 0; head_idx < count; head_idx++)
1694 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1695 
1696 	/*Prefetch descriptor index. */
1697 	rte_prefetch0(&vq->desc[head[packet_success]]);
1698 
1699 	while (packet_success != count) {
1700 		/* Get descriptor from available ring */
1701 		desc = &vq->desc[head[packet_success]];
1702 
1703 		buff = pkts[packet_success];
1704 		LOG_DEBUG(VHOST_DATA,
1705 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1706 			"pkt[%d] descriptor idx: %d\n",
1707 			dev->device_fh, packet_success,
1708 			MBUF_HEADROOM_UINT32(buff));
1709 
1710 		PRINT_PACKET(dev,
1711 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1712 			+ RTE_PKTMBUF_HEADROOM),
1713 			rte_pktmbuf_data_len(buff), 0);
1714 
1715 		/* Buffer address translation for virtio header. */
1716 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1717 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1718 
1719 		/*
1720 		 * If the descriptors are chained the header and data are
1721 		 * placed in separate buffers.
1722 		 */
1723 		if (desc->flags & VRING_DESC_F_NEXT) {
1724 			desc->len = vq->vhost_hlen;
1725 			desc = &vq->desc[desc->next];
1726 			desc->len = rte_pktmbuf_data_len(buff);
1727 		} else {
1728 			desc->len = packet_len;
1729 		}
1730 
1731 		/* Update used ring with desc information */
1732 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1733 			= head[packet_success];
1734 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1735 			= packet_len;
1736 		res_cur_idx++;
1737 		packet_success++;
1738 
1739 		/* A header is required per buffer. */
1740 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1741 			(const void *)&virtio_hdr, vq->vhost_hlen);
1742 
1743 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1744 
1745 		if (likely(packet_success < count)) {
1746 			/* Prefetch descriptor index. */
1747 			rte_prefetch0(&vq->desc[head[packet_success]]);
1748 		}
1749 	}
1750 
1751 	rte_compiler_barrier();
1752 
1753 	LOG_DEBUG(VHOST_DATA,
1754 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1755 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1756 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1757 
1758 	*(volatile uint16_t *)&vq->used->idx += count;
1759 	vq->last_used_idx += count;
1760 
1761 	LOG_DEBUG(VHOST_DATA,
1762 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1763 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1764 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1765 
1766 	/* Kick the guest if necessary. */
1767 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1768 		eventfd_write(vq->callfd, (eventfd_t)1);
1769 
1770 	return count;
1771 }
1772 
1773 /*
1774  * This function routes the TX packet to the correct interface.
1775  * This may be a local device or the physical port.
1776  */
1777 static inline void __attribute__((always_inline))
1778 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1779 	uint32_t desc_idx, uint8_t need_copy)
1780 {
1781 	struct mbuf_table *tx_q;
1782 	struct rte_mbuf **m_table;
1783 	void *obj = NULL;
1784 	struct rte_mbuf *mbuf;
1785 	unsigned len, ret, offset = 0;
1786 	struct vpool *vpool;
1787 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1788 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1789 
1790 	/*Add packet to the port tx queue*/
1791 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1792 	len = tx_q->len;
1793 
1794 	/* Allocate an mbuf and populate the structure. */
1795 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1796 	rte_ring_sc_dequeue(vpool->ring, &obj);
1797 	mbuf = obj;
1798 	if (unlikely(mbuf == NULL)) {
1799 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1800 		RTE_LOG(ERR, VHOST_DATA,
1801 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1802 			dev->device_fh);
1803 		put_desc_to_used_list_zcp(vq, desc_idx);
1804 		return;
1805 	}
1806 
1807 	if (vm2vm_mode == VM2VM_HARDWARE) {
1808 		/* Avoid using a vlan tag from any vm for external pkt, such as
1809 		 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool
1810 		 * selection, MAC address determines it as an external pkt
1811 		 * which should go to network, while vlan tag determine it as
1812 		 * a vm2vm pkt should forward to another vm. Hardware confuse
1813 		 * such a ambiguous situation, so pkt will lost.
1814 		 */
1815 		vlan_tag = external_pkt_default_vlan_tag;
1816 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1817 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1818 			__rte_mbuf_raw_free(mbuf);
1819 			return;
1820 		}
1821 	}
1822 
1823 	mbuf->nb_segs = m->nb_segs;
1824 	mbuf->next = m->next;
1825 	mbuf->data_len = m->data_len + offset;
1826 	mbuf->pkt_len = mbuf->data_len;
1827 	if (unlikely(need_copy)) {
1828 		/* Copy the packet contents to the mbuf. */
1829 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1830 			rte_pktmbuf_mtod(m, void *),
1831 			m->data_len);
1832 	} else {
1833 		mbuf->data_off = m->data_off;
1834 		mbuf->buf_physaddr = m->buf_physaddr;
1835 		mbuf->buf_addr = m->buf_addr;
1836 	}
1837 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1838 	mbuf->vlan_tci = vlan_tag;
1839 	mbuf->l2_len = sizeof(struct ether_hdr);
1840 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1841 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1842 
1843 	tx_q->m_table[len] = mbuf;
1844 	len++;
1845 
1846 	LOG_DEBUG(VHOST_DATA,
1847 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1848 		dev->device_fh,
1849 		mbuf->nb_segs,
1850 		(mbuf->next == NULL) ? "null" : "non-null");
1851 
1852 	if (enable_stats) {
1853 		dev_statistics[dev->device_fh].tx_total++;
1854 		dev_statistics[dev->device_fh].tx++;
1855 	}
1856 
1857 	if (unlikely(len == MAX_PKT_BURST)) {
1858 		m_table = (struct rte_mbuf **)tx_q->m_table;
1859 		ret = rte_eth_tx_burst(ports[0],
1860 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1861 
1862 		/*
1863 		 * Free any buffers not handled by TX and update
1864 		 * the port stats.
1865 		 */
1866 		if (unlikely(ret < len)) {
1867 			do {
1868 				rte_pktmbuf_free(m_table[ret]);
1869 			} while (++ret < len);
1870 		}
1871 
1872 		len = 0;
1873 		txmbuf_clean_zcp(dev, vpool);
1874 	}
1875 
1876 	tx_q->len = len;
1877 
1878 	return;
1879 }
1880 
1881 /*
1882  * This function TX all available packets in virtio TX queue for one
1883  * virtio-net device. If it is first packet, it learns MAC address and
1884  * setup VMDQ.
1885  */
1886 static inline void __attribute__((always_inline))
1887 virtio_dev_tx_zcp(struct virtio_net *dev)
1888 {
1889 	struct rte_mbuf m;
1890 	struct vhost_virtqueue *vq;
1891 	struct vring_desc *desc;
1892 	uint64_t buff_addr = 0, phys_addr;
1893 	uint32_t head[MAX_PKT_BURST];
1894 	uint32_t i;
1895 	uint16_t free_entries, packet_success = 0;
1896 	uint16_t avail_idx;
1897 	uint8_t need_copy = 0;
1898 	hpa_type addr_type;
1899 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1900 
1901 	vq = dev->virtqueue[VIRTIO_TXQ];
1902 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1903 
1904 	/* If there are no available buffers then return. */
1905 	if (vq->last_used_idx_res == avail_idx)
1906 		return;
1907 
1908 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1909 
1910 	/* Prefetch available ring to retrieve head indexes. */
1911 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1912 
1913 	/* Get the number of free entries in the ring */
1914 	free_entries = (avail_idx - vq->last_used_idx_res);
1915 
1916 	/* Limit to MAX_PKT_BURST. */
1917 	free_entries
1918 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1919 
1920 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1921 		dev->device_fh, free_entries);
1922 
1923 	/* Retrieve all of the head indexes first to avoid caching issues. */
1924 	for (i = 0; i < free_entries; i++)
1925 		head[i]
1926 			= vq->avail->ring[(vq->last_used_idx_res + i)
1927 			& (vq->size - 1)];
1928 
1929 	vq->last_used_idx_res += free_entries;
1930 
1931 	/* Prefetch descriptor index. */
1932 	rte_prefetch0(&vq->desc[head[packet_success]]);
1933 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1934 
1935 	while (packet_success < free_entries) {
1936 		desc = &vq->desc[head[packet_success]];
1937 
1938 		/* Discard first buffer as it is the virtio header */
1939 		desc = &vq->desc[desc->next];
1940 
1941 		/* Buffer address translation. */
1942 		buff_addr = gpa_to_vva(dev, desc->addr);
1943 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1944 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1945 			&addr_type);
1946 
1947 		if (likely(packet_success < (free_entries - 1)))
1948 			/* Prefetch descriptor index. */
1949 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1950 
1951 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1952 			RTE_LOG(ERR, VHOST_DATA,
1953 				"(%"PRIu64") Invalid frame buffer address found"
1954 				"when TX packets!\n",
1955 				dev->device_fh);
1956 			packet_success++;
1957 			continue;
1958 		}
1959 
1960 		/* Prefetch buffer address. */
1961 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1962 
1963 		/*
1964 		 * Setup dummy mbuf. This is copied to a real mbuf if
1965 		 * transmitted out the physical port.
1966 		 */
1967 		m.data_len = desc->len;
1968 		m.nb_segs = 1;
1969 		m.next = NULL;
1970 		m.data_off = 0;
1971 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1972 		m.buf_physaddr = phys_addr;
1973 
1974 		/*
1975 		 * Check if the frame buffer address from guest crosses
1976 		 * sub-region or not.
1977 		 */
1978 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1979 			RTE_LOG(ERR, VHOST_DATA,
1980 				"(%"PRIu64") Frame buffer address cross "
1981 				"sub-regioin found when attaching TX frame "
1982 				"buffer address!\n",
1983 				dev->device_fh);
1984 			need_copy = 1;
1985 		} else
1986 			need_copy = 0;
1987 
1988 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1989 
1990 		/*
1991 		 * If this is the first received packet we need to learn
1992 		 * the MAC and setup VMDQ
1993 		 */
1994 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1995 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1996 				/*
1997 				 * Discard frame if device is scheduled for
1998 				 * removal or a duplicate MAC address is found.
1999 				 */
2000 				packet_success += free_entries;
2001 				vq->last_used_idx += packet_success;
2002 				break;
2003 			}
2004 		}
2005 
2006 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2007 		packet_success++;
2008 	}
2009 }
2010 
2011 /*
2012  * This function is called by each data core. It handles all RX/TX registered
2013  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2014  * addresses are compared with all devices in the main linked list.
2015  */
2016 static int
2017 switch_worker_zcp(__attribute__((unused)) void *arg)
2018 {
2019 	struct virtio_net *dev = NULL;
2020 	struct vhost_dev  *vdev = NULL;
2021 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2022 	struct virtio_net_data_ll *dev_ll;
2023 	struct mbuf_table *tx_q;
2024 	volatile struct lcore_ll_info *lcore_ll;
2025 	const uint64_t drain_tsc
2026 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2027 		* BURST_TX_DRAIN_US;
2028 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2029 	unsigned ret;
2030 	const uint16_t lcore_id = rte_lcore_id();
2031 	uint16_t count_in_ring, rx_count = 0;
2032 
2033 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2034 
2035 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2036 	prev_tsc = 0;
2037 
2038 	while (1) {
2039 		cur_tsc = rte_rdtsc();
2040 
2041 		/* TX burst queue drain */
2042 		diff_tsc = cur_tsc - prev_tsc;
2043 		if (unlikely(diff_tsc > drain_tsc)) {
2044 			/*
2045 			 * Get mbuf from vpool.pool and detach mbuf and
2046 			 * put back into vpool.ring.
2047 			 */
2048 			dev_ll = lcore_ll->ll_root_used;
2049 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2050 				/* Get virtio device ID */
2051 				vdev = dev_ll->vdev;
2052 				dev = vdev->dev;
2053 
2054 				if (likely(!vdev->remove)) {
2055 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2056 					if (tx_q->len) {
2057 						LOG_DEBUG(VHOST_DATA,
2058 						"TX queue drained after timeout"
2059 						" with burst size %u\n",
2060 						tx_q->len);
2061 
2062 						/*
2063 						 * Tx any packets in the queue
2064 						 */
2065 						ret = rte_eth_tx_burst(
2066 							ports[0],
2067 							(uint16_t)tx_q->txq_id,
2068 							(struct rte_mbuf **)
2069 							tx_q->m_table,
2070 							(uint16_t)tx_q->len);
2071 						if (unlikely(ret < tx_q->len)) {
2072 							do {
2073 								rte_pktmbuf_free(
2074 									tx_q->m_table[ret]);
2075 							} while (++ret < tx_q->len);
2076 						}
2077 						tx_q->len = 0;
2078 
2079 						txmbuf_clean_zcp(dev,
2080 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2081 					}
2082 				}
2083 				dev_ll = dev_ll->next;
2084 			}
2085 			prev_tsc = cur_tsc;
2086 		}
2087 
2088 		rte_prefetch0(lcore_ll->ll_root_used);
2089 
2090 		/*
2091 		 * Inform the configuration core that we have exited the linked
2092 		 * list and that no devices are in use if requested.
2093 		 */
2094 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2095 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2096 
2097 		/* Process devices */
2098 		dev_ll = lcore_ll->ll_root_used;
2099 
2100 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2101 			vdev = dev_ll->vdev;
2102 			dev  = vdev->dev;
2103 			if (unlikely(vdev->remove)) {
2104 				dev_ll = dev_ll->next;
2105 				unlink_vmdq(vdev);
2106 				vdev->ready = DEVICE_SAFE_REMOVE;
2107 				continue;
2108 			}
2109 
2110 			if (likely(vdev->ready == DEVICE_RX)) {
2111 				uint32_t index = vdev->vmdq_rx_q;
2112 				uint16_t i;
2113 				count_in_ring
2114 				= rte_ring_count(vpool_array[index].ring);
2115 				uint16_t free_entries
2116 				= (uint16_t)get_available_ring_num_zcp(dev);
2117 
2118 				/*
2119 				 * Attach all mbufs in vpool.ring and put back
2120 				 * into vpool.pool.
2121 				 */
2122 				for (i = 0;
2123 				i < RTE_MIN(free_entries,
2124 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2125 				i++)
2126 					attach_rxmbuf_zcp(dev);
2127 
2128 				/* Handle guest RX */
2129 				rx_count = rte_eth_rx_burst(ports[0],
2130 					vdev->vmdq_rx_q, pkts_burst,
2131 					MAX_PKT_BURST);
2132 
2133 				if (rx_count) {
2134 					ret_count = virtio_dev_rx_zcp(dev,
2135 							pkts_burst, rx_count);
2136 					if (enable_stats) {
2137 						dev_statistics[dev->device_fh].rx_total
2138 							+= rx_count;
2139 						dev_statistics[dev->device_fh].rx
2140 							+= ret_count;
2141 					}
2142 					while (likely(rx_count)) {
2143 						rx_count--;
2144 						pktmbuf_detach_zcp(
2145 							pkts_burst[rx_count]);
2146 						rte_ring_sp_enqueue(
2147 							vpool_array[index].ring,
2148 							(void *)pkts_burst[rx_count]);
2149 					}
2150 				}
2151 			}
2152 
2153 			if (likely(!vdev->remove))
2154 				/* Handle guest TX */
2155 				virtio_dev_tx_zcp(dev);
2156 
2157 			/* Move to the next device in the list */
2158 			dev_ll = dev_ll->next;
2159 		}
2160 	}
2161 
2162 	return 0;
2163 }
2164 
2165 
2166 /*
2167  * Add an entry to a used linked list. A free entry must first be found
2168  * in the free linked list using get_data_ll_free_entry();
2169  */
2170 static void
2171 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2172 	struct virtio_net_data_ll *ll_dev)
2173 {
2174 	struct virtio_net_data_ll *ll = *ll_root_addr;
2175 
2176 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2177 	ll_dev->next = NULL;
2178 	rte_compiler_barrier();
2179 
2180 	/* If ll == NULL then this is the first device. */
2181 	if (ll) {
2182 		/* Increment to the tail of the linked list. */
2183 		while ((ll->next != NULL) )
2184 			ll = ll->next;
2185 
2186 		ll->next = ll_dev;
2187 	} else {
2188 		*ll_root_addr = ll_dev;
2189 	}
2190 }
2191 
2192 /*
2193  * Remove an entry from a used linked list. The entry must then be added to
2194  * the free linked list using put_data_ll_free_entry().
2195  */
2196 static void
2197 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2198 	struct virtio_net_data_ll *ll_dev,
2199 	struct virtio_net_data_ll *ll_dev_last)
2200 {
2201 	struct virtio_net_data_ll *ll = *ll_root_addr;
2202 
2203 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2204 		return;
2205 
2206 	if (ll_dev == ll)
2207 		*ll_root_addr = ll_dev->next;
2208 	else
2209 		if (likely(ll_dev_last != NULL))
2210 			ll_dev_last->next = ll_dev->next;
2211 		else
2212 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2213 }
2214 
2215 /*
2216  * Find and return an entry from the free linked list.
2217  */
2218 static struct virtio_net_data_ll *
2219 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2220 {
2221 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2222 	struct virtio_net_data_ll *ll_dev;
2223 
2224 	if (ll_free == NULL)
2225 		return NULL;
2226 
2227 	ll_dev = ll_free;
2228 	*ll_root_addr = ll_free->next;
2229 
2230 	return ll_dev;
2231 }
2232 
2233 /*
2234  * Place an entry back on to the free linked list.
2235  */
2236 static void
2237 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2238 	struct virtio_net_data_ll *ll_dev)
2239 {
2240 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2241 
2242 	if (ll_dev == NULL)
2243 		return;
2244 
2245 	ll_dev->next = ll_free;
2246 	*ll_root_addr = ll_dev;
2247 }
2248 
2249 /*
2250  * Creates a linked list of a given size.
2251  */
2252 static struct virtio_net_data_ll *
2253 alloc_data_ll(uint32_t size)
2254 {
2255 	struct virtio_net_data_ll *ll_new;
2256 	uint32_t i;
2257 
2258 	/* Malloc and then chain the linked list. */
2259 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2260 	if (ll_new == NULL) {
2261 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2262 		return NULL;
2263 	}
2264 
2265 	for (i = 0; i < size - 1; i++) {
2266 		ll_new[i].vdev = NULL;
2267 		ll_new[i].next = &ll_new[i+1];
2268 	}
2269 	ll_new[i].next = NULL;
2270 
2271 	return ll_new;
2272 }
2273 
2274 /*
2275  * Create the main linked list along with each individual cores linked list. A used and a free list
2276  * are created to manage entries.
2277  */
2278 static int
2279 init_data_ll (void)
2280 {
2281 	int lcore;
2282 
2283 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2284 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2285 		if (lcore_info[lcore].lcore_ll == NULL) {
2286 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2287 			return -1;
2288 		}
2289 
2290 		lcore_info[lcore].lcore_ll->device_num = 0;
2291 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2292 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2293 		if (num_devices % num_switching_cores)
2294 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2295 		else
2296 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2297 	}
2298 
2299 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2300 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2301 
2302 	return 0;
2303 }
2304 
2305 /*
2306  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2307  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2308  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2309  */
2310 static void
2311 destroy_device (volatile struct virtio_net *dev)
2312 {
2313 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2314 	struct virtio_net_data_ll *ll_main_dev_cur;
2315 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2316 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2317 	struct vhost_dev *vdev;
2318 	int lcore;
2319 
2320 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2321 
2322 	vdev = (struct vhost_dev *)dev->priv;
2323 	/*set the remove flag. */
2324 	vdev->remove = 1;
2325 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2326 		rte_pause();
2327 	}
2328 
2329 	/* Search for entry to be removed from lcore ll */
2330 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2331 	while (ll_lcore_dev_cur != NULL) {
2332 		if (ll_lcore_dev_cur->vdev == vdev) {
2333 			break;
2334 		} else {
2335 			ll_lcore_dev_last = ll_lcore_dev_cur;
2336 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2337 		}
2338 	}
2339 
2340 	if (ll_lcore_dev_cur == NULL) {
2341 		RTE_LOG(ERR, VHOST_CONFIG,
2342 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
2343 			dev->device_fh);
2344 		return;
2345 	}
2346 
2347 	/* Search for entry to be removed from main ll */
2348 	ll_main_dev_cur = ll_root_used;
2349 	ll_main_dev_last = NULL;
2350 	while (ll_main_dev_cur != NULL) {
2351 		if (ll_main_dev_cur->vdev == vdev) {
2352 			break;
2353 		} else {
2354 			ll_main_dev_last = ll_main_dev_cur;
2355 			ll_main_dev_cur = ll_main_dev_cur->next;
2356 		}
2357 	}
2358 
2359 	/* Remove entries from the lcore and main ll. */
2360 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2361 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2362 
2363 	/* Set the dev_removal_flag on each lcore. */
2364 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2365 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2366 	}
2367 
2368 	/*
2369 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2370 	 * they can no longer access the device removed from the linked lists and that the devices
2371 	 * are no longer in use.
2372 	 */
2373 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2374 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2375 			rte_pause();
2376 		}
2377 	}
2378 
2379 	/* Add the entries back to the lcore and main free ll.*/
2380 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2381 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2382 
2383 	/* Decrement number of device on the lcore. */
2384 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2385 
2386 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2387 
2388 	if (zero_copy) {
2389 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2390 
2391 		/* Stop the RX queue. */
2392 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2393 			LOG_DEBUG(VHOST_CONFIG,
2394 				"(%"PRIu64") In destroy_device: Failed to stop "
2395 				"rx queue:%d\n",
2396 				dev->device_fh,
2397 				vdev->vmdq_rx_q);
2398 		}
2399 
2400 		LOG_DEBUG(VHOST_CONFIG,
2401 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2402 			"mempool back to ring for RX queue: %d\n",
2403 			dev->device_fh, vdev->vmdq_rx_q);
2404 
2405 		mbuf_destroy_zcp(vpool);
2406 
2407 		/* Stop the TX queue. */
2408 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2409 			LOG_DEBUG(VHOST_CONFIG,
2410 				"(%"PRIu64") In destroy_device: Failed to "
2411 				"stop tx queue:%d\n",
2412 				dev->device_fh, vdev->vmdq_rx_q);
2413 		}
2414 
2415 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2416 
2417 		LOG_DEBUG(VHOST_CONFIG,
2418 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2419 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2420 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2421 			dev->device_fh);
2422 
2423 		mbuf_destroy_zcp(vpool);
2424 		rte_free(vdev->regions_hpa);
2425 	}
2426 	rte_free(vdev);
2427 
2428 }
2429 
2430 /*
2431  * Calculate the region count of physical continous regions for one particular
2432  * region of whose vhost virtual address is continous. The particular region
2433  * start from vva_start, with size of 'size' in argument.
2434  */
2435 static uint32_t
2436 check_hpa_regions(uint64_t vva_start, uint64_t size)
2437 {
2438 	uint32_t i, nregions = 0, page_size = getpagesize();
2439 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2440 	if (vva_start % page_size) {
2441 		LOG_DEBUG(VHOST_CONFIG,
2442 			"in check_countinous: vva start(%p) mod page_size(%d) "
2443 			"has remainder\n",
2444 			(void *)(uintptr_t)vva_start, page_size);
2445 		return 0;
2446 	}
2447 	if (size % page_size) {
2448 		LOG_DEBUG(VHOST_CONFIG,
2449 			"in check_countinous: "
2450 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2451 			size, page_size);
2452 		return 0;
2453 	}
2454 	for (i = 0; i < size - page_size; i = i + page_size) {
2455 		cur_phys_addr
2456 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2457 		next_phys_addr = rte_mem_virt2phy(
2458 			(void *)(uintptr_t)(vva_start + i + page_size));
2459 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2460 			++nregions;
2461 			LOG_DEBUG(VHOST_CONFIG,
2462 				"in check_continuous: hva addr:(%p) is not "
2463 				"continuous with hva addr:(%p), diff:%d\n",
2464 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2465 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2466 				+ page_size), page_size);
2467 			LOG_DEBUG(VHOST_CONFIG,
2468 				"in check_continuous: hpa addr:(%p) is not "
2469 				"continuous with hpa addr:(%p), "
2470 				"diff:(%"PRIu64")\n",
2471 				(void *)(uintptr_t)cur_phys_addr,
2472 				(void *)(uintptr_t)next_phys_addr,
2473 				(next_phys_addr-cur_phys_addr));
2474 		}
2475 	}
2476 	return nregions;
2477 }
2478 
2479 /*
2480  * Divide each region whose vhost virtual address is continous into a few
2481  * sub-regions, make sure the physical address within each sub-region are
2482  * continous. And fill offset(to GPA) and size etc. information of each
2483  * sub-region into regions_hpa.
2484  */
2485 static uint32_t
2486 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2487 {
2488 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2489 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2490 
2491 	if (mem_region_hpa == NULL)
2492 		return 0;
2493 
2494 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2495 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2496 			virtio_memory->regions[regionidx].address_offset;
2497 		mem_region_hpa[regionidx_hpa].guest_phys_address
2498 			= virtio_memory->regions[regionidx].guest_phys_address;
2499 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2500 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2501 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2502 		LOG_DEBUG(VHOST_CONFIG,
2503 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2504 			regionidx_hpa,
2505 			(void *)(uintptr_t)
2506 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2507 		LOG_DEBUG(VHOST_CONFIG,
2508 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2509 			regionidx_hpa,
2510 			(void *)(uintptr_t)
2511 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2512 		for (i = 0, k = 0;
2513 			i < virtio_memory->regions[regionidx].memory_size -
2514 				page_size;
2515 			i += page_size) {
2516 			cur_phys_addr = rte_mem_virt2phy(
2517 					(void *)(uintptr_t)(vva_start + i));
2518 			next_phys_addr = rte_mem_virt2phy(
2519 					(void *)(uintptr_t)(vva_start +
2520 					i + page_size));
2521 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2522 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2523 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2524 					k + page_size;
2525 				mem_region_hpa[regionidx_hpa].memory_size
2526 					= k + page_size;
2527 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2528 					"phys addr end  [%d]:(%p)\n",
2529 					regionidx_hpa,
2530 					(void *)(uintptr_t)
2531 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2532 				LOG_DEBUG(VHOST_CONFIG,
2533 					"in fill_hpa_regions: guest phys addr "
2534 					"size [%d]:(%p)\n",
2535 					regionidx_hpa,
2536 					(void *)(uintptr_t)
2537 					(mem_region_hpa[regionidx_hpa].memory_size));
2538 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2539 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2540 				++regionidx_hpa;
2541 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2542 					next_phys_addr -
2543 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2544 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2545 					" phys addr start[%d]:(%p)\n",
2546 					regionidx_hpa,
2547 					(void *)(uintptr_t)
2548 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2549 				LOG_DEBUG(VHOST_CONFIG,
2550 					"in fill_hpa_regions: host  phys addr "
2551 					"start[%d]:(%p)\n",
2552 					regionidx_hpa,
2553 					(void *)(uintptr_t)
2554 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2555 				k = 0;
2556 			} else {
2557 				k += page_size;
2558 			}
2559 		}
2560 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2561 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2562 			+ k + page_size;
2563 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2564 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2565 			"[%d]:(%p)\n", regionidx_hpa,
2566 			(void *)(uintptr_t)
2567 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2568 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2569 			"[%d]:(%p)\n", regionidx_hpa,
2570 			(void *)(uintptr_t)
2571 			(mem_region_hpa[regionidx_hpa].memory_size));
2572 		++regionidx_hpa;
2573 	}
2574 	return regionidx_hpa;
2575 }
2576 
2577 /*
2578  * A new device is added to a data core. First the device is added to the main linked list
2579  * and the allocated to a specific data core.
2580  */
2581 static int
2582 new_device (struct virtio_net *dev)
2583 {
2584 	struct virtio_net_data_ll *ll_dev;
2585 	int lcore, core_add = 0;
2586 	uint32_t device_num_min = num_devices;
2587 	struct vhost_dev *vdev;
2588 	uint32_t regionidx;
2589 
2590 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2591 	if (vdev == NULL) {
2592 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2593 			dev->device_fh);
2594 		return -1;
2595 	}
2596 	vdev->dev = dev;
2597 	dev->priv = vdev;
2598 
2599 	if (zero_copy) {
2600 		vdev->nregions_hpa = dev->mem->nregions;
2601 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2602 			vdev->nregions_hpa
2603 				+= check_hpa_regions(
2604 					dev->mem->regions[regionidx].guest_phys_address
2605 					+ dev->mem->regions[regionidx].address_offset,
2606 					dev->mem->regions[regionidx].memory_size);
2607 
2608 		}
2609 
2610 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2611 					       vdev->nregions_hpa,
2612 					       sizeof(struct virtio_memory_regions_hpa),
2613 					       RTE_CACHE_LINE_SIZE);
2614 		if (vdev->regions_hpa == NULL) {
2615 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2616 			rte_free(vdev);
2617 			return -1;
2618 		}
2619 
2620 
2621 		if (fill_hpa_memory_regions(
2622 			vdev->regions_hpa, dev->mem
2623 			) != vdev->nregions_hpa) {
2624 
2625 			RTE_LOG(ERR, VHOST_CONFIG,
2626 				"hpa memory regions number mismatch: "
2627 				"[%d]\n", vdev->nregions_hpa);
2628 			rte_free(vdev->regions_hpa);
2629 			rte_free(vdev);
2630 			return -1;
2631 		}
2632 	}
2633 
2634 
2635 	/* Add device to main ll */
2636 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2637 	if (ll_dev == NULL) {
2638 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2639 			"of %d devices per core has been reached\n",
2640 			dev->device_fh, num_devices);
2641 		if (vdev->regions_hpa)
2642 			rte_free(vdev->regions_hpa);
2643 		rte_free(vdev);
2644 		return -1;
2645 	}
2646 	ll_dev->vdev = vdev;
2647 	add_data_ll_entry(&ll_root_used, ll_dev);
2648 	vdev->vmdq_rx_q
2649 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2650 
2651 	if (zero_copy) {
2652 		uint32_t index = vdev->vmdq_rx_q;
2653 		uint32_t count_in_ring, i;
2654 		struct mbuf_table *tx_q;
2655 
2656 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2657 
2658 		LOG_DEBUG(VHOST_CONFIG,
2659 			"(%"PRIu64") in new_device: mbuf count in mempool "
2660 			"before attach is: %d\n",
2661 			dev->device_fh,
2662 			rte_mempool_count(vpool_array[index].pool));
2663 		LOG_DEBUG(VHOST_CONFIG,
2664 			"(%"PRIu64") in new_device: mbuf count in  ring "
2665 			"before attach  is : %d\n",
2666 			dev->device_fh, count_in_ring);
2667 
2668 		/*
2669 		 * Attach all mbufs in vpool.ring and put back intovpool.pool.
2670 		 */
2671 		for (i = 0; i < count_in_ring; i++)
2672 			attach_rxmbuf_zcp(dev);
2673 
2674 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2675 			"mempool after attach is: %d\n",
2676 			dev->device_fh,
2677 			rte_mempool_count(vpool_array[index].pool));
2678 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2679 			"ring after attach  is : %d\n",
2680 			dev->device_fh,
2681 			rte_ring_count(vpool_array[index].ring));
2682 
2683 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2684 		tx_q->txq_id = vdev->vmdq_rx_q;
2685 
2686 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2687 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2688 
2689 			LOG_DEBUG(VHOST_CONFIG,
2690 				"(%"PRIu64") In new_device: Failed to start "
2691 				"tx queue:%d\n",
2692 				dev->device_fh, vdev->vmdq_rx_q);
2693 
2694 			mbuf_destroy_zcp(vpool);
2695 			rte_free(vdev->regions_hpa);
2696 			rte_free(vdev);
2697 			return -1;
2698 		}
2699 
2700 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2701 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2702 
2703 			LOG_DEBUG(VHOST_CONFIG,
2704 				"(%"PRIu64") In new_device: Failed to start "
2705 				"rx queue:%d\n",
2706 				dev->device_fh, vdev->vmdq_rx_q);
2707 
2708 			/* Stop the TX queue. */
2709 			if (rte_eth_dev_tx_queue_stop(ports[0],
2710 				vdev->vmdq_rx_q) != 0) {
2711 				LOG_DEBUG(VHOST_CONFIG,
2712 					"(%"PRIu64") In new_device: Failed to "
2713 					"stop tx queue:%d\n",
2714 					dev->device_fh, vdev->vmdq_rx_q);
2715 			}
2716 
2717 			mbuf_destroy_zcp(vpool);
2718 			rte_free(vdev->regions_hpa);
2719 			rte_free(vdev);
2720 			return -1;
2721 		}
2722 
2723 	}
2724 
2725 	/*reset ready flag*/
2726 	vdev->ready = DEVICE_MAC_LEARNING;
2727 	vdev->remove = 0;
2728 
2729 	/* Find a suitable lcore to add the device. */
2730 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2731 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2732 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2733 			core_add = lcore;
2734 		}
2735 	}
2736 	/* Add device to lcore ll */
2737 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2738 	if (ll_dev == NULL) {
2739 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2740 		vdev->ready = DEVICE_SAFE_REMOVE;
2741 		destroy_device(dev);
2742 		rte_free(vdev->regions_hpa);
2743 		rte_free(vdev);
2744 		return -1;
2745 	}
2746 	ll_dev->vdev = vdev;
2747 	vdev->coreid = core_add;
2748 
2749 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2750 
2751 	/* Initialize device stats */
2752 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2753 
2754 	/* Disable notifications. */
2755 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2756 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2757 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2758 	dev->flags |= VIRTIO_DEV_RUNNING;
2759 
2760 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2761 
2762 	return 0;
2763 }
2764 
2765 /*
2766  * These callback allow devices to be added to the data core when configuration
2767  * has been fully complete.
2768  */
2769 static const struct virtio_net_device_ops virtio_net_device_ops =
2770 {
2771 	.new_device =  new_device,
2772 	.destroy_device = destroy_device,
2773 };
2774 
2775 /*
2776  * This is a thread will wake up after a period to print stats if the user has
2777  * enabled them.
2778  */
2779 static void
2780 print_stats(void)
2781 {
2782 	struct virtio_net_data_ll *dev_ll;
2783 	uint64_t tx_dropped, rx_dropped;
2784 	uint64_t tx, tx_total, rx, rx_total;
2785 	uint32_t device_fh;
2786 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2787 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2788 
2789 	while(1) {
2790 		sleep(enable_stats);
2791 
2792 		/* Clear screen and move to top left */
2793 		printf("%s%s", clr, top_left);
2794 
2795 		printf("\nDevice statistics ====================================");
2796 
2797 		dev_ll = ll_root_used;
2798 		while (dev_ll != NULL) {
2799 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2800 			tx_total = dev_statistics[device_fh].tx_total;
2801 			tx = dev_statistics[device_fh].tx;
2802 			tx_dropped = tx_total - tx;
2803 			if (zero_copy == 0) {
2804 				rx_total = rte_atomic64_read(
2805 					&dev_statistics[device_fh].rx_total_atomic);
2806 				rx = rte_atomic64_read(
2807 					&dev_statistics[device_fh].rx_atomic);
2808 			} else {
2809 				rx_total = dev_statistics[device_fh].rx_total;
2810 				rx = dev_statistics[device_fh].rx;
2811 			}
2812 			rx_dropped = rx_total - rx;
2813 
2814 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2815 					"\nTX total: 		%"PRIu64""
2816 					"\nTX dropped: 		%"PRIu64""
2817 					"\nTX successful: 		%"PRIu64""
2818 					"\nRX total: 		%"PRIu64""
2819 					"\nRX dropped: 		%"PRIu64""
2820 					"\nRX successful: 		%"PRIu64"",
2821 					device_fh,
2822 					tx_total,
2823 					tx_dropped,
2824 					tx,
2825 					rx_total,
2826 					rx_dropped,
2827 					rx);
2828 
2829 			dev_ll = dev_ll->next;
2830 		}
2831 		printf("\n======================================================\n");
2832 	}
2833 }
2834 
2835 static void
2836 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2837 	char *ring_name, uint32_t nb_mbuf)
2838 {
2839 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2840 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2841 	if (vpool_array[index].pool != NULL) {
2842 		vpool_array[index].ring
2843 			= rte_ring_create(ring_name,
2844 				rte_align32pow2(nb_mbuf + 1),
2845 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2846 		if (likely(vpool_array[index].ring != NULL)) {
2847 			LOG_DEBUG(VHOST_CONFIG,
2848 				"in setup_mempool_tbl: mbuf count in "
2849 				"mempool is: %d\n",
2850 				rte_mempool_count(vpool_array[index].pool));
2851 			LOG_DEBUG(VHOST_CONFIG,
2852 				"in setup_mempool_tbl: mbuf count in "
2853 				"ring   is: %d\n",
2854 				rte_ring_count(vpool_array[index].ring));
2855 		} else {
2856 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2857 				ring_name);
2858 		}
2859 
2860 		/* Need consider head room. */
2861 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2862 	} else {
2863 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2864 	}
2865 }
2866 
2867 /* When we receive a INT signal, unregister vhost driver */
2868 static void
2869 sigint_handler(__rte_unused int signum)
2870 {
2871 	/* Unregister vhost driver. */
2872 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2873 	if (ret != 0)
2874 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2875 	exit(0);
2876 }
2877 
2878 /*
2879  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2880  * device is also registered here to handle the IOCTLs.
2881  */
2882 int
2883 main(int argc, char *argv[])
2884 {
2885 	struct rte_mempool *mbuf_pool = NULL;
2886 	unsigned lcore_id, core_id = 0;
2887 	unsigned nb_ports, valid_num_ports;
2888 	int ret;
2889 	uint8_t portid;
2890 	uint16_t queue_id;
2891 	static pthread_t tid;
2892 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2893 
2894 	signal(SIGINT, sigint_handler);
2895 
2896 	/* init EAL */
2897 	ret = rte_eal_init(argc, argv);
2898 	if (ret < 0)
2899 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2900 	argc -= ret;
2901 	argv += ret;
2902 
2903 	/* parse app arguments */
2904 	ret = us_vhost_parse_args(argc, argv);
2905 	if (ret < 0)
2906 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2907 
2908 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2909 		if (rte_lcore_is_enabled(lcore_id))
2910 			lcore_ids[core_id ++] = lcore_id;
2911 
2912 	if (rte_lcore_count() > RTE_MAX_LCORE)
2913 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2914 
2915 	/*set the number of swithcing cores available*/
2916 	num_switching_cores = rte_lcore_count()-1;
2917 
2918 	/* Get the number of physical ports. */
2919 	nb_ports = rte_eth_dev_count();
2920 	if (nb_ports > RTE_MAX_ETHPORTS)
2921 		nb_ports = RTE_MAX_ETHPORTS;
2922 
2923 	/*
2924 	 * Update the global var NUM_PORTS and global array PORTS
2925 	 * and get value of var VALID_NUM_PORTS according to system ports number
2926 	 */
2927 	valid_num_ports = check_ports_num(nb_ports);
2928 
2929 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2930 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2931 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2932 		return -1;
2933 	}
2934 
2935 	if (zero_copy == 0) {
2936 		/* Create the mbuf pool. */
2937 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2938 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2939 			0, MBUF_DATA_SIZE, rte_socket_id());
2940 		if (mbuf_pool == NULL)
2941 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2942 
2943 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2944 			vpool_array[queue_id].pool = mbuf_pool;
2945 
2946 		if (vm2vm_mode == VM2VM_HARDWARE) {
2947 			/* Enable VT loop back to let L2 switch to do it. */
2948 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2949 			LOG_DEBUG(VHOST_CONFIG,
2950 				"Enable loop back for L2 switch in vmdq.\n");
2951 		}
2952 	} else {
2953 		uint32_t nb_mbuf;
2954 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2955 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2956 
2957 		nb_mbuf = num_rx_descriptor
2958 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2959 			+ num_switching_cores * MAX_PKT_BURST;
2960 
2961 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2962 			snprintf(pool_name, sizeof(pool_name),
2963 				"rxmbuf_pool_%u", queue_id);
2964 			snprintf(ring_name, sizeof(ring_name),
2965 				"rxmbuf_ring_%u", queue_id);
2966 			setup_mempool_tbl(rte_socket_id(), queue_id,
2967 				pool_name, ring_name, nb_mbuf);
2968 		}
2969 
2970 		nb_mbuf = num_tx_descriptor
2971 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2972 				+ num_switching_cores * MAX_PKT_BURST;
2973 
2974 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2975 			snprintf(pool_name, sizeof(pool_name),
2976 				"txmbuf_pool_%u", queue_id);
2977 			snprintf(ring_name, sizeof(ring_name),
2978 				"txmbuf_ring_%u", queue_id);
2979 			setup_mempool_tbl(rte_socket_id(),
2980 				(queue_id + MAX_QUEUES),
2981 				pool_name, ring_name, nb_mbuf);
2982 		}
2983 
2984 		if (vm2vm_mode == VM2VM_HARDWARE) {
2985 			/* Enable VT loop back to let L2 switch to do it. */
2986 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2987 			LOG_DEBUG(VHOST_CONFIG,
2988 				"Enable loop back for L2 switch in vmdq.\n");
2989 		}
2990 	}
2991 	/* Set log level. */
2992 	rte_set_log_level(LOG_LEVEL);
2993 
2994 	/* initialize all ports */
2995 	for (portid = 0; portid < nb_ports; portid++) {
2996 		/* skip ports that are not enabled */
2997 		if ((enabled_port_mask & (1 << portid)) == 0) {
2998 			RTE_LOG(INFO, VHOST_PORT,
2999 				"Skipping disabled port %d\n", portid);
3000 			continue;
3001 		}
3002 		if (port_init(portid) != 0)
3003 			rte_exit(EXIT_FAILURE,
3004 				"Cannot initialize network ports\n");
3005 	}
3006 
3007 	/* Initialise all linked lists. */
3008 	if (init_data_ll() == -1)
3009 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3010 
3011 	/* Initialize device stats */
3012 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3013 
3014 	/* Enable stats if the user option is set. */
3015 	if (enable_stats) {
3016 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3017 		if (ret != 0)
3018 			rte_exit(EXIT_FAILURE,
3019 				"Cannot create print-stats thread\n");
3020 
3021 		/* Set thread_name for aid in debugging.  */
3022 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3023 		ret = rte_thread_setname(tid, thread_name);
3024 		if (ret != 0)
3025 			RTE_LOG(ERR, VHOST_CONFIG,
3026 				"Cannot set print-stats name\n");
3027 	}
3028 
3029 	/* Launch all data cores. */
3030 	if (zero_copy == 0) {
3031 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3032 			rte_eal_remote_launch(switch_worker,
3033 				mbuf_pool, lcore_id);
3034 		}
3035 	} else {
3036 		uint32_t count_in_mempool, index, i;
3037 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3038 			/* For all RX and TX queues. */
3039 			count_in_mempool
3040 				= rte_mempool_count(vpool_array[index].pool);
3041 
3042 			/*
3043 			 * Transfer all un-attached mbufs from vpool.pool
3044 			 * to vpoo.ring.
3045 			 */
3046 			for (i = 0; i < count_in_mempool; i++) {
3047 				struct rte_mbuf *mbuf
3048 					= __rte_mbuf_raw_alloc(
3049 						vpool_array[index].pool);
3050 				rte_ring_sp_enqueue(vpool_array[index].ring,
3051 						(void *)mbuf);
3052 			}
3053 
3054 			LOG_DEBUG(VHOST_CONFIG,
3055 				"in main: mbuf count in mempool at initial "
3056 				"is: %d\n", count_in_mempool);
3057 			LOG_DEBUG(VHOST_CONFIG,
3058 				"in main: mbuf count in  ring at initial  is :"
3059 				" %d\n",
3060 				rte_ring_count(vpool_array[index].ring));
3061 		}
3062 
3063 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3064 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3065 				lcore_id);
3066 	}
3067 
3068 	if (mergeable == 0)
3069 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3070 
3071 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3072 	ret = rte_vhost_driver_register((char *)&dev_basename);
3073 	if (ret != 0)
3074 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3075 
3076 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3077 
3078 	/* Start CUSE session. */
3079 	rte_vhost_driver_session_start();
3080 	return 0;
3081 
3082 }
3083