xref: /dpdk/examples/vhost/main.c (revision 1befe9ca2484970a4bec6079763a524a23cd2306)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
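/*
 * Worked example (hypothetical value, for illustration only): with
 * num_switching_cores = 2 this evaluates to
 * 512*1024 + 2*32 + 2*512 + 2*128 = 525,632 mbufs per port.
 */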
68 
69 #define MBUF_CACHE_SIZE	128
70 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
78 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
79 #define MBUF_CACHE_SIZE_ZCP 0
80 
81 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
82 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
83 
84 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
85 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
86 
87 #define JUMBO_FRAME_MAX_SIZE    0x2600
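/* For reference: 0x2600 = 9728 bytes. */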
88 
89 /* State of virtio device. */
90 #define DEVICE_MAC_LEARNING 0
91 #define DEVICE_RX			1
92 #define DEVICE_SAFE_REMOVE	2
93 
94 /* Config_core_flag status definitions. */
95 #define REQUEST_DEV_REMOVAL 1
96 #define ACK_DEV_REMOVAL 0
97 
98 /* Configurable number of RX/TX ring descriptors */
99 #define RTE_TEST_RX_DESC_DEFAULT 1024
100 #define RTE_TEST_TX_DESC_DEFAULT 512
101 
102 /*
103  * These two macros need refining for the legacy and DPDK-based front ends:
104  * take the max vring avail descriptors/entries from the guest, subtract
105  * MAX_PKT_BURST, and then adjust to a power of 2.
106  */
107 /*
108  * For the legacy front end: 128 descriptors,
109  * half for the virtio headers and the other half for the mbufs.
110  */
111 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
112 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
113 
114 /* Get first 4 bytes in mbuf headroom. */
115 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
116 		+ sizeof(struct rte_mbuf)))
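/*
 * The zero-copy path stashes the guest descriptor index in these 4 bytes
 * (the start of the headroom) of an attached mbuf; see attach_rxmbuf_zcp()
 * and txmbuf_clean_zcp() below.
 */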
117 
118 /* true if x is a power of 2 */
119 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
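/* e.g. POWEROF2(64): 63 & 64 == 0 -> true; POWEROF2(48): 47 & 48 == 32 -> false. */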
120 
121 #define INVALID_PORT_ID 0xFF
122 
123 /* Max number of devices. Limited by vmdq. */
124 #define MAX_DEVICES 64
125 
126 /* Size of buffers used for snprintfs. */
127 #define MAX_PRINT_BUFF 6072
128 
129 /* Maximum character device basename size. */
130 #define MAX_BASENAME_SZ 10
131 
132 /* Maximum long option length for option parsing. */
133 #define MAX_LONG_OPT_SZ 64
134 
135 /* Used to compare MAC addresses. */
136 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
137 
138 /* Number of descriptors per cacheline. */
139 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
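/* e.g. with a 64-byte cache line and the 16-byte struct vring_desc this is 4. */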
140 
141 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
142 
143 /* mask of enabled ports */
144 static uint32_t enabled_port_mask = 0;
145 
146 /* Promiscuous mode */
147 static uint32_t promiscuous;
148 
149 /* Number of switching cores enabled */
150 static uint32_t num_switching_cores = 0;
151 
152 /* number of devices/queues to support*/
153 static uint32_t num_queues = 0;
154 static uint32_t num_devices;
155 
156 /*
157  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
158  * descriptors; disabled by default.
159  */
160 static uint32_t zero_copy;
161 static int mergeable;
162 
163 /* Do VLAN strip on the host, enabled by default */
164 static uint32_t vlan_strip = 1;
165 
166 /* number of descriptors to apply*/
167 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
168 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
169 
170 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
171 #define MAX_RING_DESC 4096
172 
173 struct vpool {
174 	struct rte_mempool *pool;
175 	struct rte_ring *ring;
176 	uint32_t buf_size;
177 } vpool_array[MAX_QUEUES+MAX_QUEUES];
178 
179 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
180 typedef enum {
181 	VM2VM_DISABLED = 0,
182 	VM2VM_SOFTWARE = 1,
183 	VM2VM_HARDWARE = 2,
184 	VM2VM_LAST
185 } vm2vm_type;
186 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
187 
188 /* The type of host physical address translated from guest physical address. */
189 typedef enum {
190 	PHYS_ADDR_CONTINUOUS = 0,
191 	PHYS_ADDR_CROSS_SUBREG = 1,
192 	PHYS_ADDR_INVALID = 2,
193 	PHYS_ADDR_LAST
194 } hpa_type;
195 
196 /* Enable stats. */
197 static uint32_t enable_stats = 0;
198 /* Enable retries on RX. */
199 static uint32_t enable_retry = 1;
200 /* Specify timeout (in useconds) between retries on RX. */
201 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
202 /* Specify the number of retries on RX. */
203 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
204 
205 /* Character device basename. Can be set by user. */
206 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
207 
208 /* Empty VMDQ configuration structure. Filled in programmatically. */
209 static struct rte_eth_conf vmdq_conf_default = {
210 	.rxmode = {
211 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
212 		.split_hdr_size = 0,
213 		.header_split   = 0, /**< Header Split disabled */
214 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
215 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
216 		/*
217 		 * This is necessary for 1G NICs such as the I350;
218 		 * it fixes a bug where IPv4 forwarding in the guest can't
219 		 * forward packets from one virtio dev to another virtio dev.
220 		 */
221 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
222 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
223 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
224 	},
225 
226 	.txmode = {
227 		.mq_mode = ETH_MQ_TX_NONE,
228 	},
229 	.rx_adv_conf = {
230 		/*
231 		 * should be overridden separately in code with
232 		 * appropriate values
233 		 */
234 		.vmdq_rx_conf = {
235 			.nb_queue_pools = ETH_8_POOLS,
236 			.enable_default_pool = 0,
237 			.default_pool = 0,
238 			.nb_pool_maps = 0,
239 			.pool_map = {{0, 0},},
240 		},
241 	},
242 };
243 
244 static unsigned lcore_ids[RTE_MAX_LCORE];
245 static uint8_t ports[RTE_MAX_ETHPORTS];
246 static unsigned num_ports = 0; /**< The number of ports specified in command line */
247 static uint16_t num_pf_queues, num_vmdq_queues;
248 static uint16_t vmdq_pool_base, vmdq_queue_base;
249 static uint16_t queues_per_pool;
250 
251 static const uint16_t external_pkt_default_vlan_tag = 2000;
252 const uint16_t vlan_tags[] = {
253 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
254 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
255 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
256 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
257 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
258 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
259 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
260 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
261 };
262 
263 /* ethernet addresses of ports */
264 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
265 
266 /* heads for the main used and free linked lists for the data path. */
267 static struct virtio_net_data_ll *ll_root_used = NULL;
268 static struct virtio_net_data_ll *ll_root_free = NULL;
269 
270 /* Array of data core structures containing information on individual core linked lists. */
271 static struct lcore_info lcore_info[RTE_MAX_LCORE];
272 
273 /* Used for queueing bursts of TX packets. */
274 struct mbuf_table {
275 	unsigned len;
276 	unsigned txq_id;
277 	struct rte_mbuf *m_table[MAX_PKT_BURST];
278 };
279 
280 /* TX queue for each data core. */
281 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
282 
283 /* TX queue for each virtio device for zero copy. */
284 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
285 
286 /* Vlan header struct used to insert vlan tags on TX. */
287 struct vlan_ethhdr {
288 	unsigned char   h_dest[ETH_ALEN];
289 	unsigned char   h_source[ETH_ALEN];
290 	__be16          h_vlan_proto;
291 	__be16          h_vlan_TCI;
292 	__be16          h_vlan_encapsulated_proto;
293 };
294 
295 /* IPv4 Header */
296 struct ipv4_hdr {
297 	uint8_t  version_ihl;		/**< version and header length */
298 	uint8_t  type_of_service;	/**< type of service */
299 	uint16_t total_length;		/**< length of packet */
300 	uint16_t packet_id;		/**< packet ID */
301 	uint16_t fragment_offset;	/**< fragmentation offset */
302 	uint8_t  time_to_live;		/**< time to live */
303 	uint8_t  next_proto_id;		/**< protocol ID */
304 	uint16_t hdr_checksum;		/**< header checksum */
305 	uint32_t src_addr;		/**< source address */
306 	uint32_t dst_addr;		/**< destination address */
307 } __attribute__((__packed__));
308 
309 /* Header lengths. */
310 #define VLAN_HLEN       4
311 #define VLAN_ETH_HLEN   18
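/* VLAN_ETH_HLEN: 14-byte Ethernet header plus 4-byte 802.1Q tag, i.e. sizeof(struct vlan_ethhdr) above. */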
312 
313 /* Per-device statistics struct */
314 struct device_statistics {
315 	uint64_t tx_total;
316 	rte_atomic64_t rx_total_atomic;
317 	uint64_t rx_total;
318 	uint64_t tx;
319 	rte_atomic64_t rx_atomic;
320 	uint64_t rx;
321 } __rte_cache_aligned;
322 struct device_statistics dev_statistics[MAX_DEVICES];
323 
324 /*
325  * Builds up the correct configuration for VMDQ VLAN pool map
326  * according to the pool & queue limits.
327  */
328 static inline int
329 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
330 {
331 	struct rte_eth_vmdq_rx_conf conf;
332 	struct rte_eth_vmdq_rx_conf *def_conf =
333 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
334 	unsigned i;
335 
336 	memset(&conf, 0, sizeof(conf));
337 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
338 	conf.nb_pool_maps = num_devices;
339 	conf.enable_loop_back = def_conf->enable_loop_back;
340 	conf.rx_mode = def_conf->rx_mode;
341 
342 	for (i = 0; i < conf.nb_pool_maps; i++) {
343 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
344 		conf.pool_map[i].pools = (1UL << i);
345 	}
346 
347 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
348 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
349 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
350 	return 0;
351 }
352 
353 /*
354  * Validate the device number against the max pool number obtained from
355  * dev_info. If the device number is invalid, print an error message and
356  * return -1. Each device must have its own pool.
357  */
358 static inline int
359 validate_num_devices(uint32_t max_nb_devices)
360 {
361 	if (num_devices > max_nb_devices) {
362 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
363 		return -1;
364 	}
365 	return 0;
366 }
367 
368 /*
369  * Initialises a given port using global settings and with the rx buffers
370  * coming from the mbuf_pool passed as parameter
371  */
372 static inline int
373 port_init(uint8_t port)
374 {
375 	struct rte_eth_dev_info dev_info;
376 	struct rte_eth_conf port_conf;
377 	struct rte_eth_rxconf *rxconf;
378 	struct rte_eth_txconf *txconf;
379 	int16_t rx_rings, tx_rings;
380 	uint16_t rx_ring_size, tx_ring_size;
381 	int retval;
382 	uint16_t q;
383 
384 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
385 	rte_eth_dev_info_get (port, &dev_info);
386 
387 	if (dev_info.max_rx_queues > MAX_QUEUES) {
388 		rte_exit(EXIT_FAILURE,
389 			"please define MAX_QUEUES no less than %u in %s\n",
390 			dev_info.max_rx_queues, __FILE__);
391 	}
392 
393 	rxconf = &dev_info.default_rxconf;
394 	txconf = &dev_info.default_txconf;
395 	rxconf->rx_drop_en = 1;
396 
397 	/* Enable vlan offload */
398 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
399 
400 	/*
401 	 * Zero copy defers queue RX/TX start to the time when guest
402 	 * finishes its startup and packet buffers from that guest are
403 	 * available.
404 	 */
405 	if (zero_copy) {
406 		rxconf->rx_deferred_start = 1;
407 		rxconf->rx_drop_en = 0;
408 		txconf->tx_deferred_start = 1;
409 	}
410 
411 	/*configure the number of supported virtio devices based on VMDQ limits */
412 	num_devices = dev_info.max_vmdq_pools;
413 
414 	if (zero_copy) {
415 		rx_ring_size = num_rx_descriptor;
416 		tx_ring_size = num_tx_descriptor;
417 		tx_rings = dev_info.max_tx_queues;
418 	} else {
419 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
420 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
421 		tx_rings = (uint16_t)rte_lcore_count();
422 	}
423 
424 	retval = validate_num_devices(MAX_DEVICES);
425 	if (retval < 0)
426 		return retval;
427 
428 	/* Get port configuration. */
429 	retval = get_eth_conf(&port_conf, num_devices);
430 	if (retval < 0)
431 		return retval;
432 	/* NIC queues are divided into pf queues and vmdq queues.  */
433 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
434 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
435 	num_vmdq_queues = num_devices * queues_per_pool;
436 	num_queues = num_pf_queues + num_vmdq_queues;
437 	vmdq_queue_base = dev_info.vmdq_queue_base;
438 	vmdq_pool_base  = dev_info.vmdq_pool_base;
439 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
440 		num_pf_queues, num_devices, queues_per_pool);
441 
442 	if (port >= rte_eth_dev_count()) return -1;
443 
444 	rx_rings = (uint16_t)dev_info.max_rx_queues;
445 	/* Configure ethernet device. */
446 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
447 	if (retval != 0)
448 		return retval;
449 
450 	/* Setup the queues. */
451 	for (q = 0; q < rx_rings; q ++) {
452 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
453 						rte_eth_dev_socket_id(port),
454 						rxconf,
455 						vpool_array[q].pool);
456 		if (retval < 0)
457 			return retval;
458 	}
459 	for (q = 0; q < tx_rings; q ++) {
460 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
461 						rte_eth_dev_socket_id(port),
462 						txconf);
463 		if (retval < 0)
464 			return retval;
465 	}
466 
467 	/* Start the device. */
468 	retval  = rte_eth_dev_start(port);
469 	if (retval < 0) {
470 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471 		return retval;
472 	}
473 
474 	if (promiscuous)
475 		rte_eth_promiscuous_enable(port);
476 
477 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481 			(unsigned)port,
482 			vmdq_ports_eth_addr[port].addr_bytes[0],
483 			vmdq_ports_eth_addr[port].addr_bytes[1],
484 			vmdq_ports_eth_addr[port].addr_bytes[2],
485 			vmdq_ports_eth_addr[port].addr_bytes[3],
486 			vmdq_ports_eth_addr[port].addr_bytes[4],
487 			vmdq_ports_eth_addr[port].addr_bytes[5]);
488 
489 	return 0;
490 }
491 
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498 	/* Reject basenames that do not fit in the buffer. */
499 
500 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501 		return -1;
502 	else
503 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504 
505 	return 0;
506 }
507 
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514 	char *end = NULL;
515 	unsigned long pm;
516 
517 	errno = 0;
518 
519 	/* parse hexadecimal string */
520 	pm = strtoul(portmask, &end, 16);
521 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522 		return -1;
523 
524 	if (pm == 0)
525 		return -1;
526 
527 	return pm;
528 
529 }
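/*
 * Usage sketch: "-p 0x1" yields mask 0x1 and enables port 0 only. Note that
 * MAX_SUP_PORTS limits this application to a single enabled port.
 */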
530 
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537 	char *end = NULL;
538 	unsigned long num;
539 
540 	errno = 0;
541 
542 	/* parse unsigned int string */
543 	num = strtoul(q_arg, &end, 10);
544 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545 		return -1;
546 
547 	if (num > max_valid_value)
548 		return -1;
549 
550 	return num;
551 
552 }
553 
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561 	"		--vm2vm [0|1|2]\n"
562 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 	"		--dev-basename <name>\n"
564 	"		--nb-devices ND\n"
565 	"		-p PORTMASK: Set mask for ports to be used by application\n"
566 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
568 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on rx are enabled\n"
569 	"		--rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
570 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
572 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
573 	"		--dev-basename: The basename to be used for the character device.\n"
574 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
575 			"zero copy\n"
576 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
577 			"used only when zero copy is enabled.\n"
578 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
579 			"used only when zero copy is enabled.\n",
580 	       prgname);
581 }
582 
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589 	int opt, ret;
590 	int option_index;
591 	unsigned i;
592 	const char *prgname = argv[0];
593 	static struct option long_option[] = {
594 		{"vm2vm", required_argument, NULL, 0},
595 		{"rx-retry", required_argument, NULL, 0},
596 		{"rx-retry-delay", required_argument, NULL, 0},
597 		{"rx-retry-num", required_argument, NULL, 0},
598 		{"mergeable", required_argument, NULL, 0},
599 		{"vlan-strip", required_argument, NULL, 0},
600 		{"stats", required_argument, NULL, 0},
601 		{"dev-basename", required_argument, NULL, 0},
602 		{"zero-copy", required_argument, NULL, 0},
603 		{"rx-desc-num", required_argument, NULL, 0},
604 		{"tx-desc-num", required_argument, NULL, 0},
605 		{NULL, 0, 0, 0},
606 	};
607 
608 	/* Parse command line */
609 	while ((opt = getopt_long(argc, argv, "p:P",
610 			long_option, &option_index)) != EOF) {
611 		switch (opt) {
612 		/* Portmask */
613 		case 'p':
614 			enabled_port_mask = parse_portmask(optarg);
615 			if (enabled_port_mask == 0) {
616 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617 				us_vhost_usage(prgname);
618 				return -1;
619 			}
620 			break;
621 
622 		case 'P':
623 			promiscuous = 1;
624 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625 				ETH_VMDQ_ACCEPT_BROADCAST |
626 				ETH_VMDQ_ACCEPT_MULTICAST;
627 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
628 
629 			break;
630 
631 		case 0:
632 			/* Enable/disable vm2vm comms. */
633 			if (!strncmp(long_option[option_index].name, "vm2vm",
634 				MAX_LONG_OPT_SZ)) {
635 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
636 				if (ret == -1) {
637 					RTE_LOG(INFO, VHOST_CONFIG,
638 						"Invalid argument for "
639 						"vm2vm [0|1|2]\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				} else {
643 					vm2vm_mode = (vm2vm_type)ret;
644 				}
645 			}
646 
647 			/* Enable/disable retries on RX. */
648 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649 				ret = parse_num_opt(optarg, 1);
650 				if (ret == -1) {
651 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652 					us_vhost_usage(prgname);
653 					return -1;
654 				} else {
655 					enable_retry = ret;
656 				}
657 			}
658 
659 			/* Specify the retries delay time (in useconds) on RX. */
660 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661 				ret = parse_num_opt(optarg, INT32_MAX);
662 				if (ret == -1) {
663 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664 					us_vhost_usage(prgname);
665 					return -1;
666 				} else {
667 					burst_rx_delay_time = ret;
668 				}
669 			}
670 
671 			/* Specify the retries number on RX. */
672 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673 				ret = parse_num_opt(optarg, INT32_MAX);
674 				if (ret == -1) {
675 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676 					us_vhost_usage(prgname);
677 					return -1;
678 				} else {
679 					burst_rx_retry_num = ret;
680 				}
681 			}
682 
683 			/* Enable/disable RX mergeable buffers. */
684 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685 				ret = parse_num_opt(optarg, 1);
686 				if (ret == -1) {
687 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688 					us_vhost_usage(prgname);
689 					return -1;
690 				} else {
691 					mergeable = !!ret;
692 					if (ret) {
693 						vmdq_conf_default.rxmode.jumbo_frame = 1;
694 						vmdq_conf_default.rxmode.max_rx_pkt_len
695 							= JUMBO_FRAME_MAX_SIZE;
696 					}
697 				}
698 			}
699 
700 			/* Enable/disable RX VLAN strip on host. */
701 			if (!strncmp(long_option[option_index].name,
702 				"vlan-strip", MAX_LONG_OPT_SZ)) {
703 				ret = parse_num_opt(optarg, 1);
704 				if (ret == -1) {
705 					RTE_LOG(INFO, VHOST_CONFIG,
706 						"Invalid argument for VLAN strip [0|1]\n");
707 					us_vhost_usage(prgname);
708 					return -1;
709 				} else {
710 					vlan_strip = !!ret;
711 					vmdq_conf_default.rxmode.hw_vlan_strip =
712 						vlan_strip;
713 				}
714 			}
715 
716 			/* Enable/disable stats. */
717 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
718 				ret = parse_num_opt(optarg, INT32_MAX);
719 				if (ret == -1) {
720 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
721 					us_vhost_usage(prgname);
722 					return -1;
723 				} else {
724 					enable_stats = ret;
725 				}
726 			}
727 
728 			/* Set character device basename. */
729 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
730 				if (us_vhost_parse_basename(optarg) == -1) {
731 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
732 					us_vhost_usage(prgname);
733 					return -1;
734 				}
735 			}
736 
737 			/* Enable/disable rx/tx zero copy. */
738 			if (!strncmp(long_option[option_index].name,
739 				"zero-copy", MAX_LONG_OPT_SZ)) {
740 				ret = parse_num_opt(optarg, 1);
741 				if (ret == -1) {
742 					RTE_LOG(INFO, VHOST_CONFIG,
743 						"Invalid argument"
744 						" for zero-copy [0|1]\n");
745 					us_vhost_usage(prgname);
746 					return -1;
747 				} else
748 					zero_copy = ret;
749 			}
750 
751 			/* Specify the descriptor number on RX. */
752 			if (!strncmp(long_option[option_index].name,
753 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
754 				ret = parse_num_opt(optarg, MAX_RING_DESC);
755 				if ((ret == -1) || (!POWEROF2(ret))) {
756 					RTE_LOG(INFO, VHOST_CONFIG,
757 					"Invalid argument for rx-desc-num[0-N],"
758 					"power of 2 required.\n");
759 					us_vhost_usage(prgname);
760 					return -1;
761 				} else {
762 					num_rx_descriptor = ret;
763 				}
764 			}
765 
766 			/* Specify the descriptor number on TX. */
767 			if (!strncmp(long_option[option_index].name,
768 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
769 				ret = parse_num_opt(optarg, MAX_RING_DESC);
770 				if ((ret == -1) || (!POWEROF2(ret))) {
771 					RTE_LOG(INFO, VHOST_CONFIG,
772 					"Invalid argument for tx-desc-num [0-N],"
773 					"power of 2 required.\n");
774 					us_vhost_usage(prgname);
775 					return -1;
776 				} else {
777 					num_tx_descriptor = ret;
778 				}
779 			}
780 
781 			break;
782 
783 			/* Invalid option - print options. */
784 		default:
785 			us_vhost_usage(prgname);
786 			return -1;
787 		}
788 	}
789 
790 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
791 		if (enabled_port_mask & (1 << i))
792 			ports[num_ports++] = (uint8_t)i;
793 	}
794 
795 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
796 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
797 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
798 		return -1;
799 	}
800 
801 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
802 		RTE_LOG(INFO, VHOST_PORT,
803 			"Vhost zero copy doesn't support software vm2vm, "
804 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
805 		return -1;
806 	}
807 
808 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
809 		RTE_LOG(INFO, VHOST_PORT,
810 			"Vhost zero copy doesn't support jumbo frames, "
811 			"please specify '--mergeable 0' to disable the "
812 			"mergeable feature.\n");
813 		return -1;
814 	}
815 
816 	return 0;
817 }
818 
819 /*
820  * Update the global variable num_ports and the ports array according to the
821  * number of ports in the system, and return the number of valid ports.
822  */
823 static unsigned check_ports_num(unsigned nb_ports)
824 {
825 	unsigned valid_num_ports = num_ports;
826 	unsigned portid;
827 
828 	if (num_ports > nb_ports) {
829 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
830 			num_ports, nb_ports);
831 		num_ports = nb_ports;
832 	}
833 
834 	for (portid = 0; portid < num_ports; portid ++) {
835 		if (ports[portid] >= nb_ports) {
836 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
837 				ports[portid], (nb_ports - 1));
838 			ports[portid] = INVALID_PORT_ID;
839 			valid_num_ports--;
840 		}
841 	}
842 	return valid_num_ports;
843 }
844 
845 /*
846  * Macro to print out packet contents. Wrapped in debug define so that the
847  * data path is not affected when debug is disabled.
848  */
849 #ifdef DEBUG
850 #define PRINT_PACKET(device, addr, size, header) do {																\
851 	char *pkt_addr = (char*)(addr);																					\
852 	unsigned int index;																								\
853 	char packet[MAX_PRINT_BUFF];																					\
854 																													\
855 	if ((header))																									\
856 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
857 	else																											\
858 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
859 	for (index = 0; index < (size); index++) {																		\
860 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
861 			"%02hhx ", pkt_addr[index]);																			\
862 	}																												\
863 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
864 																													\
865 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
866 } while(0)
867 #else
868 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
869 #endif
870 
871 /*
872  * Function to convert guest physical addresses to vhost physical addresses.
873  * This is used to convert virtio buffer addresses.
874  */
875 static inline uint64_t __attribute__((always_inline))
876 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
877 	uint32_t buf_len, hpa_type *addr_type)
878 {
879 	struct virtio_memory_regions_hpa *region;
880 	uint32_t regionidx;
881 	uint64_t vhost_pa = 0;
882 
883 	*addr_type = PHYS_ADDR_INVALID;
884 
885 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
886 		region = &vdev->regions_hpa[regionidx];
887 		if ((guest_pa >= region->guest_phys_address) &&
888 			(guest_pa <= region->guest_phys_address_end)) {
889 			vhost_pa = region->host_phys_addr_offset + guest_pa;
890 			if (likely((guest_pa + buf_len - 1)
891 				<= region->guest_phys_address_end))
892 				*addr_type = PHYS_ADDR_CONTINUOUS;
893 			else
894 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
895 			break;
896 		}
897 	}
898 
899 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
900 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
901 		(void *)(uintptr_t)vhost_pa);
902 
903 	return vhost_pa;
904 }
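/*
 * Illustration with hypothetical region bounds: if a region covers guest
 * physical addresses 0x0..0x3fffffff, a 2048-byte buffer starting at GPA
 * 0x3ffffc00 ends at 0x400003ff, past the region end, so *addr_type is set
 * to PHYS_ADDR_CROSS_SUBREG instead of PHYS_ADDR_CONTINUOUS.
 */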
905 
906 /*
907  * Compares a packet destination MAC address to a device MAC address.
908  */
909 static inline int __attribute__((always_inline))
910 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
911 {
912 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
913 }
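/*
 * Note: the XOR above loads 8 bytes from each struct ether_addr; MAC_ADDR_CMP
 * masks the result down to the low 48 bits so only the 6 address bytes are
 * compared (assuming a little-endian layout).
 */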
914 
915 /*
916  * This function learns the MAC address of the device and registers it, along
917  * with a VLAN tag, in the VMDQ.
918  */
919 static int
920 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
921 {
922 	struct ether_hdr *pkt_hdr;
923 	struct virtio_net_data_ll *dev_ll;
924 	struct virtio_net *dev = vdev->dev;
925 	int i, ret;
926 
927 	/* Learn MAC address of guest device from packet */
928 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
929 
930 	dev_ll = ll_root_used;
931 
932 	while (dev_ll != NULL) {
933 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
934 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
935 			return -1;
936 		}
937 		dev_ll = dev_ll->next;
938 	}
939 
940 	for (i = 0; i < ETHER_ADDR_LEN; i++)
941 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
942 
943 	/* vlan_tag currently uses the device_id. */
944 	vdev->vlan_tag = vlan_tags[dev->device_fh];
945 
946 	/* Print out VMDQ registration info. */
947 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
948 		dev->device_fh,
949 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
950 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
951 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
952 		vdev->vlan_tag);
953 
954 	/* Register the MAC address. */
955 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
956 				(uint32_t)dev->device_fh + vmdq_pool_base);
957 	if (ret)
958 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
959 					dev->device_fh);
960 
961 	/* Enable stripping of the vlan tag as we handle routing. */
962 	if (vlan_strip)
963 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
964 			(uint16_t)vdev->vmdq_rx_q, 1);
965 
966 	/* Set device as ready for RX. */
967 	vdev->ready = DEVICE_RX;
968 
969 	return 0;
970 }
971 
972 /*
973  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
974  * queue before disabling RX on the device.
975  */
976 static inline void
977 unlink_vmdq(struct vhost_dev *vdev)
978 {
979 	unsigned i = 0;
980 	unsigned rx_count;
981 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
982 
983 	if (vdev->ready == DEVICE_RX) {
984 		/*clear MAC and VLAN settings*/
985 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
986 		for (i = 0; i < 6; i++)
987 			vdev->mac_address.addr_bytes[i] = 0;
988 
989 		vdev->vlan_tag = 0;
990 
991 		/*Clear out the receive buffers*/
992 		rx_count = rte_eth_rx_burst(ports[0],
993 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
994 
995 		while (rx_count) {
996 			for (i = 0; i < rx_count; i++)
997 				rte_pktmbuf_free(pkts_burst[i]);
998 
999 			rx_count = rte_eth_rx_burst(ports[0],
1000 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1001 		}
1002 
1003 		vdev->ready = DEVICE_MAC_LEARNING;
1004 	}
1005 }
1006 
1007 /*
1008  * Check if the packet destination MAC address is for a local device. If so, put
1009  * the packet on that device's RX queue. If not, return.
1010  */
1011 static inline int __attribute__((always_inline))
1012 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1013 {
1014 	struct virtio_net_data_ll *dev_ll;
1015 	struct ether_hdr *pkt_hdr;
1016 	uint64_t ret = 0;
1017 	struct virtio_net *dev = vdev->dev;
1018 	struct virtio_net *tdev; /* destination virtio device */
1019 
1020 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1021 
1022 	/*get the used devices list*/
1023 	dev_ll = ll_root_used;
1024 
1025 	while (dev_ll != NULL) {
1026 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1027 				          &dev_ll->vdev->mac_address)) {
1028 
1029 			/* Drop the packet if the TX packet is destined for the TX device. */
1030 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1031 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1032 							dev->device_fh);
1033 				return 0;
1034 			}
1035 			tdev = dev_ll->vdev->dev;
1036 
1037 
1038 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1039 
1040 			if (unlikely(dev_ll->vdev->remove)) {
1041 				/*drop the packet if the device is marked for removal*/
1042 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1043 			} else {
1044 				/*send the packet to the local virtio device*/
1045 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1046 				if (enable_stats) {
1047 					rte_atomic64_add(
1048 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1049 					1);
1050 					rte_atomic64_add(
1051 					&dev_statistics[tdev->device_fh].rx_atomic,
1052 					ret);
1053 					dev_statistics[dev->device_fh].tx_total++;
1054 					dev_statistics[dev->device_fh].tx += ret;
1055 				}
1056 			}
1057 
1058 			return 0;
1059 		}
1060 		dev_ll = dev_ll->next;
1061 	}
1062 
1063 	return -1;
1064 }
1065 
1066 /*
1067  * Check if the destination MAC of a packet belongs to a local VM,
1068  * and if so get its VLAN tag and the length offset to apply.
1069  */
1070 static inline int __attribute__((always_inline))
1071 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1072 	uint32_t *offset, uint16_t *vlan_tag)
1073 {
1074 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1075 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1076 
1077 	while (dev_ll != NULL) {
1078 		if ((dev_ll->vdev->ready == DEVICE_RX)
1079 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1080 		&dev_ll->vdev->mac_address)) {
1081 			/*
1082 			 * Drop the packet if the TX packet is
1083 			 * destined for the TX device.
1084 			 */
1085 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1086 				LOG_DEBUG(VHOST_DATA,
1087 				"(%"PRIu64") TX: Source and destination"
1088 				" MAC addresses are the same. Dropping "
1089 				"packet.\n",
1090 				dev_ll->vdev->dev->device_fh);
1091 				return -1;
1092 			}
1093 
1094 			/*
1095 			 * HW VLAN strip will reduce the packet length by the
1096 			 * length of the VLAN tag, so we need to restore the
1097 			 * packet length by adding it back.
1098 			 */
1099 			*offset = VLAN_HLEN;
1100 			*vlan_tag =
1101 			(uint16_t)
1102 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1103 
1104 			LOG_DEBUG(VHOST_DATA,
1105 			"(%"PRIu64") TX: pkt to local VM device id:"
1106 			"(%"PRIu64") vlan tag: %d.\n",
1107 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1108 			(int)*vlan_tag);
1109 
1110 			break;
1111 		}
1112 		dev_ll = dev_ll->next;
1113 	}
1114 	return 0;
1115 }
1116 
1117 /*
1118  * This function routes the TX packet to the correct interface. This may be a local device
1119  * or the physical port.
1120  */
1121 static inline void __attribute__((always_inline))
1122 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1123 {
1124 	struct mbuf_table *tx_q;
1125 	struct rte_mbuf **m_table;
1126 	unsigned len, ret, offset = 0;
1127 	const uint16_t lcore_id = rte_lcore_id();
1128 	struct virtio_net *dev = vdev->dev;
1129 	struct ether_hdr *nh;
1130 
1131 	/*check if destination is local VM*/
1132 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1133 		rte_pktmbuf_free(m);
1134 		return;
1135 	}
1136 
1137 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1138 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1139 			rte_pktmbuf_free(m);
1140 			return;
1141 		}
1142 	}
1143 
1144 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1145 
1146 	/*Add packet to the port tx queue*/
1147 	tx_q = &lcore_tx_queue[lcore_id];
1148 	len = tx_q->len;
1149 
1150 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1151 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1152 		/* Guest has inserted the vlan tag. */
1153 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1154 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1155 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1156 			(vh->vlan_tci != vlan_tag_be))
1157 			vh->vlan_tci = vlan_tag_be;
1158 	} else {
1159 		m->ol_flags = PKT_TX_VLAN_PKT;
1160 
1161 		/*
1162 		 * Find the right seg to adjust the data len when offset is
1163 		 * bigger than tail room size.
1164 		 */
1165 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1166 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1167 				m->data_len += offset;
1168 			else {
1169 				struct rte_mbuf *seg = m;
1170 
1171 				while ((seg->next != NULL) &&
1172 					(offset > rte_pktmbuf_tailroom(seg)))
1173 					seg = seg->next;
1174 
1175 				seg->data_len += offset;
1176 			}
1177 			m->pkt_len += offset;
1178 		}
1179 
1180 		m->vlan_tci = vlan_tag;
1181 	}
1182 
1183 	tx_q->m_table[len] = m;
1184 	len++;
1185 	if (enable_stats) {
1186 		dev_statistics[dev->device_fh].tx_total++;
1187 		dev_statistics[dev->device_fh].tx++;
1188 	}
1189 
1190 	if (unlikely(len == MAX_PKT_BURST)) {
1191 		m_table = (struct rte_mbuf **)tx_q->m_table;
1192 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1193 		/* Free any buffers not handled by TX and update the port stats. */
1194 		if (unlikely(ret < len)) {
1195 			do {
1196 				rte_pktmbuf_free(m_table[ret]);
1197 			} while (++ret < len);
1198 		}
1199 
1200 		len = 0;
1201 	}
1202 
1203 	tx_q->len = len;
1204 	return;
1205 }
1206 /*
1207  * This function is called by each data core. It handles all RX/TX registered with the
1208  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1209  * with all devices in the main linked list.
1210  */
1211 static int
1212 switch_worker(__attribute__((unused)) void *arg)
1213 {
1214 	struct rte_mempool *mbuf_pool = arg;
1215 	struct virtio_net *dev = NULL;
1216 	struct vhost_dev *vdev = NULL;
1217 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1218 	struct virtio_net_data_ll *dev_ll;
1219 	struct mbuf_table *tx_q;
1220 	volatile struct lcore_ll_info *lcore_ll;
1221 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
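	/* e.g. with a hypothetical 2 GHz TSC: 2e9 / 1e6 * 100 = 200,000 cycles, i.e. ~100 us. */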
1222 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1223 	unsigned ret, i;
1224 	const uint16_t lcore_id = rte_lcore_id();
1225 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1226 	uint16_t rx_count = 0;
1227 	uint16_t tx_count;
1228 	uint32_t retry = 0;
1229 
1230 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1231 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1232 	prev_tsc = 0;
1233 
1234 	tx_q = &lcore_tx_queue[lcore_id];
1235 	for (i = 0; i < num_cores; i ++) {
1236 		if (lcore_ids[i] == lcore_id) {
1237 			tx_q->txq_id = i;
1238 			break;
1239 		}
1240 	}
1241 
1242 	while(1) {
1243 		cur_tsc = rte_rdtsc();
1244 		/*
1245 		 * TX burst queue drain
1246 		 */
1247 		diff_tsc = cur_tsc - prev_tsc;
1248 		if (unlikely(diff_tsc > drain_tsc)) {
1249 
1250 			if (tx_q->len) {
1251 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1252 
1253 				/*Tx any packets in the queue*/
1254 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1255 									   (struct rte_mbuf **)tx_q->m_table,
1256 									   (uint16_t)tx_q->len);
1257 				if (unlikely(ret < tx_q->len)) {
1258 					do {
1259 						rte_pktmbuf_free(tx_q->m_table[ret]);
1260 					} while (++ret < tx_q->len);
1261 				}
1262 
1263 				tx_q->len = 0;
1264 			}
1265 
1266 			prev_tsc = cur_tsc;
1267 
1268 		}
1269 
1270 		rte_prefetch0(lcore_ll->ll_root_used);
1271 		/*
1272 		 * If requested, inform the configuration core that we have exited
1273 		 * the linked list and that no devices are in use.
1274 		 */
1275 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1276 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1277 
1278 		/*
1279 		 * Process devices
1280 		 */
1281 		dev_ll = lcore_ll->ll_root_used;
1282 
1283 		while (dev_ll != NULL) {
1284 			/*get virtio device ID*/
1285 			vdev = dev_ll->vdev;
1286 			dev = vdev->dev;
1287 
1288 			if (unlikely(vdev->remove)) {
1289 				dev_ll = dev_ll->next;
1290 				unlink_vmdq(vdev);
1291 				vdev->ready = DEVICE_SAFE_REMOVE;
1292 				continue;
1293 			}
1294 			if (likely(vdev->ready == DEVICE_RX)) {
1295 				/*Handle guest RX*/
1296 				rx_count = rte_eth_rx_burst(ports[0],
1297 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298 
1299 				if (rx_count) {
1300 					/*
1301 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1302 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1303 					*/
1304 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1305 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1306 							rte_delay_us(burst_rx_delay_time);
1307 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1308 								break;
1309 						}
1310 					}
1311 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1312 					if (enable_stats) {
1313 						rte_atomic64_add(
1314 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1315 						rx_count);
1316 						rte_atomic64_add(
1317 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1318 					}
1319 					while (likely(rx_count)) {
1320 						rx_count--;
1321 						rte_pktmbuf_free(pkts_burst[rx_count]);
1322 					}
1323 
1324 				}
1325 			}
1326 
1327 			if (likely(!vdev->remove)) {
1328 				/* Handle guest TX*/
1329 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1330 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1331 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1332 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1333 						while (tx_count)
1334 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1335 					}
1336 				}
1337 				while (tx_count)
1338 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1339 			}
1340 
1341 			/*move to the next device in the list*/
1342 			dev_ll = dev_ll->next;
1343 		}
1344 	}
1345 
1346 	return 0;
1347 }
1348 
1349 /*
1350  * This function gets the number of available ring entries for zero copy RX.
1351  * Only one thread will call this function for a particular virtio device,
1352  * so it is designed as a non-thread-safe function.
1353  */
1354 static inline uint32_t __attribute__((always_inline))
1355 get_available_ring_num_zcp(struct virtio_net *dev)
1356 {
1357 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1358 	uint16_t avail_idx;
1359 
1360 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1362 }
1363 
1364 /*
1365  * This function gets the available ring index for zero copy RX;
1366  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1367  * Only one thread will call this function for a particular virtio device,
1368  * so it is designed as a non-thread-safe function.
1369  */
1370 static inline uint32_t __attribute__((always_inline))
1371 get_available_ring_index_zcp(struct virtio_net *dev,
1372 	uint16_t *res_base_idx, uint32_t count)
1373 {
1374 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1375 	uint16_t avail_idx;
1376 	uint32_t retry = 0;
1377 	uint16_t free_entries;
1378 
1379 	*res_base_idx = vq->last_used_idx_res;
1380 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1381 	free_entries = (avail_idx - *res_base_idx);
1382 
1383 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1384 			"avail idx: %d, "
1385 			"res base idx:%d, free entries:%d\n",
1386 			dev->device_fh, avail_idx, *res_base_idx,
1387 			free_entries);
1388 
1389 	/*
1390 	 * If retry is enabled and the queue is full then we wait
1391 	 * and retry to avoid packet loss.
1392 	 */
1393 	if (enable_retry && unlikely(count > free_entries)) {
1394 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1395 			rte_delay_us(burst_rx_delay_time);
1396 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1397 			free_entries = (avail_idx - *res_base_idx);
1398 			if (count <= free_entries)
1399 				break;
1400 		}
1401 	}
1402 
1403 	/*check that we have enough buffers*/
1404 	if (unlikely(count > free_entries))
1405 		count = free_entries;
1406 
1407 	if (unlikely(count == 0)) {
1408 		LOG_DEBUG(VHOST_DATA,
1409 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1410 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1411 			dev->device_fh, avail_idx,
1412 			*res_base_idx, free_entries);
1413 		return 0;
1414 	}
1415 
1416 	vq->last_used_idx_res = *res_base_idx + count;
1417 
1418 	return count;
1419 }
1420 
1421 /*
1422  * This function puts a descriptor back on the used list.
1423  */
1424 static inline void __attribute__((always_inline))
1425 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1426 {
1427 	uint16_t res_cur_idx = vq->last_used_idx;
1428 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1429 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1430 	rte_compiler_barrier();
1431 	*(volatile uint16_t *)&vq->used->idx += 1;
1432 	vq->last_used_idx += 1;
1433 
1434 	/* Kick the guest if necessary. */
1435 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1436 		eventfd_write(vq->callfd, (eventfd_t)1);
1437 }
1438 
1439 /*
1440  * This function gets an available descriptor from the virtio vring and an
1441  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1442  * to adjust the offset for buff_addr and phys_addr according to the PMD
1443  * implementation, otherwise the frame data may be put at the wrong location in the mbuf.
1444  */
1445 static inline void __attribute__((always_inline))
1446 attach_rxmbuf_zcp(struct virtio_net *dev)
1447 {
1448 	uint16_t res_base_idx, desc_idx;
1449 	uint64_t buff_addr, phys_addr;
1450 	struct vhost_virtqueue *vq;
1451 	struct vring_desc *desc;
1452 	void *obj = NULL;
1453 	struct rte_mbuf *mbuf;
1454 	struct vpool *vpool;
1455 	hpa_type addr_type;
1456 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1457 
1458 	vpool = &vpool_array[vdev->vmdq_rx_q];
1459 	vq = dev->virtqueue[VIRTIO_RXQ];
1460 
1461 	do {
1462 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1463 				1) != 1))
1464 			return;
1465 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1466 
1467 		desc = &vq->desc[desc_idx];
1468 		if (desc->flags & VRING_DESC_F_NEXT) {
1469 			desc = &vq->desc[desc->next];
1470 			buff_addr = gpa_to_vva(dev, desc->addr);
1471 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1472 					&addr_type);
1473 		} else {
1474 			buff_addr = gpa_to_vva(dev,
1475 					desc->addr + vq->vhost_hlen);
1476 			phys_addr = gpa_to_hpa(vdev,
1477 					desc->addr + vq->vhost_hlen,
1478 					desc->len, &addr_type);
1479 		}
1480 
1481 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1482 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1483 				" address found when attaching RX frame buffer"
1484 				" address!\n", dev->device_fh);
1485 			put_desc_to_used_list_zcp(vq, desc_idx);
1486 			continue;
1487 		}
1488 
1489 		/*
1490 		 * Check if the frame buffer address from guest crosses
1491 		 * sub-region or not.
1492 		 */
1493 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1494 			RTE_LOG(ERR, VHOST_DATA,
1495 				"(%"PRIu64") Frame buffer address crosses a "
1496 				"sub-region boundary when attaching RX frame "
1497 				"buffer address!\n",
1498 				dev->device_fh);
1499 			put_desc_to_used_list_zcp(vq, desc_idx);
1500 			continue;
1501 		}
1502 	} while (unlikely(phys_addr == 0));
1503 
1504 	rte_ring_sc_dequeue(vpool->ring, &obj);
1505 	mbuf = obj;
1506 	if (unlikely(mbuf == NULL)) {
1507 		LOG_DEBUG(VHOST_DATA,
1508 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1509 			"ring_sc_dequeue fail.\n",
1510 			dev->device_fh);
1511 		put_desc_to_used_list_zcp(vq, desc_idx);
1512 		return;
1513 	}
1514 
1515 	if (unlikely(vpool->buf_size > desc->len)) {
1516 		LOG_DEBUG(VHOST_DATA,
1517 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1518 			"length(%d) of descriptor idx: %d less than room "
1519 			"size required: %d\n",
1520 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1521 		put_desc_to_used_list_zcp(vq, desc_idx);
1522 		rte_ring_sp_enqueue(vpool->ring, obj);
1523 		return;
1524 	}
1525 
1526 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1527 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1528 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1529 	mbuf->data_len = desc->len;
1530 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1531 
1532 	LOG_DEBUG(VHOST_DATA,
1533 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1534 		"descriptor idx:%d\n",
1535 		dev->device_fh, res_base_idx, desc_idx);
1536 
1537 	__rte_mbuf_raw_free(mbuf);
1538 
1539 	return;
1540 }
1541 
1542 /*
1543  * Detach an attached packet mbuf -
1544  *  - restore original mbuf address and length values.
1545  *  - reset pktmbuf data and data_len to their default values.
1546  *  All other fields of the given packet mbuf will be left intact.
1547  *
1548  * @param m
1549  *   The attached packet mbuf.
1550  */
1551 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1552 {
1553 	const struct rte_mempool *mp = m->pool;
1554 	void *buf = rte_mbuf_to_baddr(m);
1555 	uint32_t buf_ofs;
1556 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1557 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1558 
1559 	m->buf_addr = buf;
1560 	m->buf_len = (uint16_t)buf_len;
1561 
1562 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1563 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1564 	m->data_off = buf_ofs;
1565 
1566 	m->data_len = 0;
1567 }
1568 
1569 /*
1570  * This function is called after packets have been transmitted. It fetches mbufs
1571  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1572  * the used index and kicks the guest if necessary.
1573  */
1574 static inline uint32_t __attribute__((always_inline))
1575 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1576 {
1577 	struct rte_mbuf *mbuf;
1578 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1579 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1580 	uint32_t index = 0;
1581 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1582 
1583 	LOG_DEBUG(VHOST_DATA,
1584 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1585 		"clean is: %d\n",
1586 		dev->device_fh, mbuf_count);
1587 	LOG_DEBUG(VHOST_DATA,
1588 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1589 		"clean  is : %d\n",
1590 		dev->device_fh, rte_ring_count(vpool->ring));
1591 
1592 	for (index = 0; index < mbuf_count; index++) {
1593 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1594 		if (likely(MBUF_EXT_MEM(mbuf)))
1595 			pktmbuf_detach_zcp(mbuf);
1596 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1597 
1598 		/* Update used index buffer information. */
1599 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1600 		vq->used->ring[used_idx].len = 0;
1601 
1602 		used_idx = (used_idx + 1) & (vq->size - 1);
1603 	}
1604 
1605 	LOG_DEBUG(VHOST_DATA,
1606 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1607 		"clean is: %d\n",
1608 		dev->device_fh, rte_mempool_count(vpool->pool));
1609 	LOG_DEBUG(VHOST_DATA,
1610 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1611 		"clean  is : %d\n",
1612 		dev->device_fh, rte_ring_count(vpool->ring));
1613 	LOG_DEBUG(VHOST_DATA,
1614 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1615 		"vq->last_used_idx:%d\n",
1616 		dev->device_fh, vq->last_used_idx);
1617 
1618 	vq->last_used_idx += mbuf_count;
1619 
1620 	LOG_DEBUG(VHOST_DATA,
1621 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1622 		"vq->last_used_idx:%d\n",
1623 		dev->device_fh, vq->last_used_idx);
1624 
1625 	rte_compiler_barrier();
1626 
1627 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1628 
1629 	/* Kick guest if required. */
1630 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1631 		eventfd_write(vq->callfd, (eventfd_t)1);
1632 
1633 	return 0;
1634 }
1635 
1636 /*
1637  * This function is called when a virtio device is destroyed.
1638  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1639  */
1640 static void mbuf_destroy_zcp(struct vpool *vpool)
1641 {
1642 	struct rte_mbuf *mbuf = NULL;
1643 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1644 
1645 	LOG_DEBUG(VHOST_CONFIG,
1646 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1647 		"mbuf_destroy_zcp is: %d\n",
1648 		mbuf_count);
1649 	LOG_DEBUG(VHOST_CONFIG,
1650 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1651 		"mbuf_destroy_zcp  is : %d\n",
1652 		rte_ring_count(vpool->ring));
1653 
1654 	for (index = 0; index < mbuf_count; index++) {
1655 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1656 		if (likely(mbuf != NULL)) {
1657 			if (likely(MBUF_EXT_MEM(mbuf)))
1658 				pktmbuf_detach_zcp(mbuf);
1659 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1660 		}
1661 	}
1662 
1663 	LOG_DEBUG(VHOST_CONFIG,
1664 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1665 		"mbuf_destroy_zcp is: %d\n",
1666 		rte_mempool_count(vpool->pool));
1667 	LOG_DEBUG(VHOST_CONFIG,
1668 		"in mbuf_destroy_zcp: mbuf count in ring after "
1669 		"mbuf_destroy_zcp is : %d\n",
1670 		rte_ring_count(vpool->ring));
1671 }
1672 
1673 /*
1674  * This function updates the used ring entries and the used index counter.
1675  */
1676 static inline uint32_t __attribute__((always_inline))
1677 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1678 	uint32_t count)
1679 {
1680 	struct vhost_virtqueue *vq;
1681 	struct vring_desc *desc;
1682 	struct rte_mbuf *buff;
1683 	/* The virtio_hdr is initialised to 0. */
1684 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1685 		= {{0, 0, 0, 0, 0, 0}, 0};
1686 	uint64_t buff_hdr_addr = 0;
1687 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1688 	uint32_t head_idx, packet_success = 0;
1689 	uint16_t res_cur_idx;
1690 
1691 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1692 
1693 	if (count == 0)
1694 		return 0;
1695 
1696 	vq = dev->virtqueue[VIRTIO_RXQ];
1697 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1698 
1699 	res_cur_idx = vq->last_used_idx;
1700 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1701 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1702 
1703 	/* Retrieve all of the head indexes first to avoid caching issues. */
1704 	for (head_idx = 0; head_idx < count; head_idx++)
1705 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1706 
1707 	/* Prefetch descriptor index. */
1708 	rte_prefetch0(&vq->desc[head[packet_success]]);
1709 
1710 	while (packet_success != count) {
1711 		/* Get descriptor from available ring */
1712 		desc = &vq->desc[head[packet_success]];
1713 
1714 		buff = pkts[packet_success];
1715 		LOG_DEBUG(VHOST_DATA,
1716 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1717 			"pkt[%d] descriptor idx: %d\n",
1718 			dev->device_fh, packet_success,
1719 			MBUF_HEADROOM_UINT32(buff));
1720 
1721 		PRINT_PACKET(dev,
1722 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1723 			+ RTE_PKTMBUF_HEADROOM),
1724 			rte_pktmbuf_data_len(buff), 0);
1725 
1726 		/* Buffer address translation for virtio header. */
1727 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1728 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1729 
1730 		/*
1731 		 * If the descriptors are chained the header and data are
1732 		 * placed in separate buffers.
1733 		 */
1734 		if (desc->flags & VRING_DESC_F_NEXT) {
1735 			desc->len = vq->vhost_hlen;
1736 			desc = &vq->desc[desc->next];
1737 			desc->len = rte_pktmbuf_data_len(buff);
1738 		} else {
1739 			desc->len = packet_len;
1740 		}
1741 
1742 		/* Update used ring with desc information */
1743 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1744 			= head[packet_success];
1745 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1746 			= packet_len;
1747 		res_cur_idx++;
1748 		packet_success++;
1749 
1750 		/* A header is required per buffer. */
1751 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1752 			(const void *)&virtio_hdr, vq->vhost_hlen);
1753 
1754 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1755 
1756 		if (likely(packet_success < count)) {
1757 			/* Prefetch descriptor index. */
1758 			rte_prefetch0(&vq->desc[head[packet_success]]);
1759 		}
1760 	}
1761 
1762 	rte_compiler_barrier();
1763 
1764 	LOG_DEBUG(VHOST_DATA,
1765 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1766 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1767 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1768 
1769 	*(volatile uint16_t *)&vq->used->idx += count;
1770 	vq->last_used_idx += count;
1771 
1772 	LOG_DEBUG(VHOST_DATA,
1773 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1774 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1775 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1776 
1777 	/* Kick the guest if necessary. */
1778 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1779 		eventfd_write(vq->callfd, (eventfd_t)1);
1780 
1781 	return count;
1782 }
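
/*
 * Illustrative sketch (not part of the original code): the header/data length
 * split that virtio_dev_rx_zcp() applies to a descriptor chain, isolated as a
 * standalone helper. 'hdr_desc' and 'data_desc' are hypothetical parameters;
 * data_desc is only touched when the chain really has a second descriptor.
 */
static inline void
split_hdr_and_data_len_sketch(struct vring_desc *hdr_desc,
	struct vring_desc *data_desc, uint32_t data_len, uint32_t vhost_hlen)
{
	if (hdr_desc->flags & VRING_DESC_F_NEXT) {
		/* Header and packet data live in separate buffers. */
		hdr_desc->len = vhost_hlen;
		data_desc->len = data_len;
	} else {
		/* A single buffer carries both the header and the data. */
		hdr_desc->len = data_len + vhost_hlen;
	}
}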
1783 
1784 /*
1785  * This function routes the TX packet to the correct interface.
1786  * This may be a local device or the physical port.
1787  */
1788 static inline void __attribute__((always_inline))
1789 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1790 	uint32_t desc_idx, uint8_t need_copy)
1791 {
1792 	struct mbuf_table *tx_q;
1793 	struct rte_mbuf **m_table;
1794 	void *obj = NULL;
1795 	struct rte_mbuf *mbuf;
1796 	unsigned len, ret, offset = 0;
1797 	struct vpool *vpool;
1798 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1799 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1800 
1801 	/* Add packet to the port tx queue. */
1802 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1803 	len = tx_q->len;
1804 
1805 	/* Allocate an mbuf and populate the structure. */
1806 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1807 	rte_ring_sc_dequeue(vpool->ring, &obj);
1808 	mbuf = obj;
1809 	if (unlikely(mbuf == NULL)) {
1810 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1811 		RTE_LOG(ERR, VHOST_DATA,
1812 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1813 			dev->device_fh);
1814 		put_desc_to_used_list_zcp(vq, desc_idx);
1815 		return;
1816 	}
1817 
1818 	if (vm2vm_mode == VM2VM_HARDWARE) {
1819 		/* Avoid using a VLAN tag from any VM for an external packet, such
1820 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1821 		 * selection: the MAC address identifies the packet as an external
1822 		 * one that should go to the network, while the VLAN tag identifies
1823 		 * it as a VM2VM packet that should be forwarded to another VM. The
1824 		 * hardware cannot resolve such an ambiguity, so the packet is lost.
1825 		 */
1826 		vlan_tag = external_pkt_default_vlan_tag;
1827 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1828 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1829 			__rte_mbuf_raw_free(mbuf);
1830 			return;
1831 		}
1832 	}
1833 
1834 	mbuf->nb_segs = m->nb_segs;
1835 	mbuf->next = m->next;
1836 	mbuf->data_len = m->data_len + offset;
1837 	mbuf->pkt_len = mbuf->data_len;
1838 	if (unlikely(need_copy)) {
1839 		/* Copy the packet contents to the mbuf. */
1840 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1841 			rte_pktmbuf_mtod(m, void *),
1842 			m->data_len);
1843 	} else {
1844 		mbuf->data_off = m->data_off;
1845 		mbuf->buf_physaddr = m->buf_physaddr;
1846 		mbuf->buf_addr = m->buf_addr;
1847 	}
1848 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1849 	mbuf->vlan_tci = vlan_tag;
1850 	mbuf->l2_len = sizeof(struct ether_hdr);
1851 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1852 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1853 
1854 	tx_q->m_table[len] = mbuf;
1855 	len++;
1856 
1857 	LOG_DEBUG(VHOST_DATA,
1858 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1859 		dev->device_fh,
1860 		mbuf->nb_segs,
1861 		(mbuf->next == NULL) ? "null" : "non-null");
1862 
1863 	if (enable_stats) {
1864 		dev_statistics[dev->device_fh].tx_total++;
1865 		dev_statistics[dev->device_fh].tx++;
1866 	}
1867 
1868 	if (unlikely(len == MAX_PKT_BURST)) {
1869 		m_table = (struct rte_mbuf **)tx_q->m_table;
1870 		ret = rte_eth_tx_burst(ports[0],
1871 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1872 
1873 		/*
1874 		 * Free any buffers not handled by TX and update
1875 		 * the port stats.
1876 		 */
1877 		if (unlikely(ret < len)) {
1878 			do {
1879 				rte_pktmbuf_free(m_table[ret]);
1880 			} while (++ret < len);
1881 		}
1882 
1883 		len = 0;
1884 		txmbuf_clean_zcp(dev, vpool);
1885 	}
1886 
1887 	tx_q->len = len;
1888 
1889 	return;
1890 }
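
/*
 * Illustrative sketch (not part of the original code): the burst-TX idiom
 * used by virtio_tx_route_zcp() — send a table of mbufs and free whatever the
 * PMD did not accept — shown in isolation. Port and queue ids are placeholders.
 */
static inline void
tx_burst_and_free_sketch(uint8_t port_id, uint16_t queue_id,
	struct rte_mbuf **table, uint16_t n)
{
	uint16_t sent = rte_eth_tx_burst(port_id, queue_id, table, n);

	/* Free any mbufs the driver did not consume. */
	while (sent < n)
		rte_pktmbuf_free(table[sent++]);
}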
1891 
1892 /*
1893  * This function transmits all available packets in the virtio TX queue for
1894  * one virtio-net device. If it is the first packet, it learns the MAC
1895  * address and sets up VMDQ.
1896  */
1897 static inline void __attribute__((always_inline))
1898 virtio_dev_tx_zcp(struct virtio_net *dev)
1899 {
1900 	struct rte_mbuf m;
1901 	struct vhost_virtqueue *vq;
1902 	struct vring_desc *desc;
1903 	uint64_t buff_addr = 0, phys_addr;
1904 	uint32_t head[MAX_PKT_BURST];
1905 	uint32_t i;
1906 	uint16_t free_entries, packet_success = 0;
1907 	uint16_t avail_idx;
1908 	uint8_t need_copy = 0;
1909 	hpa_type addr_type;
1910 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1911 
1912 	vq = dev->virtqueue[VIRTIO_TXQ];
1913 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1914 
1915 	/* If there are no available buffers then return. */
1916 	if (vq->last_used_idx_res == avail_idx)
1917 		return;
1918 
1919 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1920 
1921 	/* Prefetch available ring to retrieve head indexes. */
1922 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1923 
1924 	/* Get the number of free entries in the ring */
1925 	free_entries = (avail_idx - vq->last_used_idx_res);
1926 
1927 	/* Limit to MAX_PKT_BURST. */
1928 	free_entries
1929 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1930 
1931 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1932 		dev->device_fh, free_entries);
1933 
1934 	/* Retrieve all of the head indexes first to avoid caching issues. */
1935 	for (i = 0; i < free_entries; i++)
1936 		head[i]
1937 			= vq->avail->ring[(vq->last_used_idx_res + i)
1938 			& (vq->size - 1)];
1939 
1940 	vq->last_used_idx_res += free_entries;
1941 
1942 	/* Prefetch descriptor index. */
1943 	rte_prefetch0(&vq->desc[head[packet_success]]);
1944 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1945 
1946 	while (packet_success < free_entries) {
1947 		desc = &vq->desc[head[packet_success]];
1948 
1949 		/* Discard first buffer as it is the virtio header */
1950 		desc = &vq->desc[desc->next];
1951 
1952 		/* Buffer address translation. */
1953 		buff_addr = gpa_to_vva(dev, desc->addr);
1954 		/* Need to check extra VLAN_HLEN size for inserting the VLAN tag. */
1955 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1956 			&addr_type);
1957 
1958 		if (likely(packet_success < (free_entries - 1)))
1959 			/* Prefetch descriptor index. */
1960 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1961 
1962 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1963 			RTE_LOG(ERR, VHOST_DATA,
1964 				"(%"PRIu64") Invalid frame buffer address found"
1965 				"when TX packets!\n",
1966 				dev->device_fh);
1967 			packet_success++;
1968 			continue;
1969 		}
1970 
1971 		/* Prefetch buffer address. */
1972 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1973 
1974 		/*
1975 		 * Setup dummy mbuf. This is copied to a real mbuf if
1976 		 * transmitted out the physical port.
1977 		 */
1978 		m.data_len = desc->len;
1979 		m.nb_segs = 1;
1980 		m.next = NULL;
1981 		m.data_off = 0;
1982 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1983 		m.buf_physaddr = phys_addr;
1984 
1985 		/*
1986 		 * Check if the frame buffer address from guest crosses
1987 		 * sub-region or not.
1988 		 */
1989 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1990 			RTE_LOG(ERR, VHOST_DATA,
1991 				"(%"PRIu64") Frame buffer address cross "
1992 				"sub-regioin found when attaching TX frame "
1993 				"buffer address!\n",
1994 				dev->device_fh);
1995 			need_copy = 1;
1996 		} else
1997 			need_copy = 0;
1998 
1999 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2000 
2001 		/*
2002 		 * If this is the first received packet we need to learn
2003 		 * the MAC address and set up VMDQ.
2004 		 */
2005 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2006 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2007 				/*
2008 				 * Discard frame if device is scheduled for
2009 				 * removal or a duplicate MAC address is found.
2010 				 */
2011 				packet_success += free_entries;
2012 				vq->last_used_idx += packet_success;
2013 				break;
2014 			}
2015 		}
2016 
2017 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2018 		packet_success++;
2019 	}
2020 }
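
/*
 * Illustrative sketch (not part of the original code): virtio_dev_tx_zcp()
 * computes the number of new entries with 16-bit arithmetic, so the result
 * stays correct even after avail->idx wraps around 65535.
 */
static inline uint16_t
pending_avail_entries_sketch(uint16_t avail_idx, uint16_t last_used_idx_res)
{
	return (uint16_t)(avail_idx - last_used_idx_res);
}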
2021 
2022 /*
2023  * This function is called by each data core. It handles all RX/TX registered
2024  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2025  * addresses are compared with all devices in the main linked list.
2026  */
2027 static int
2028 switch_worker_zcp(__attribute__((unused)) void *arg)
2029 {
2030 	struct virtio_net *dev = NULL;
2031 	struct vhost_dev  *vdev = NULL;
2032 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2033 	struct virtio_net_data_ll *dev_ll;
2034 	struct mbuf_table *tx_q;
2035 	volatile struct lcore_ll_info *lcore_ll;
2036 	const uint64_t drain_tsc
2037 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2038 		* BURST_TX_DRAIN_US;
2039 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2040 	unsigned ret;
2041 	const uint16_t lcore_id = rte_lcore_id();
2042 	uint16_t count_in_ring, rx_count = 0;
2043 
2044 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2045 
2046 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2047 	prev_tsc = 0;
2048 
2049 	while (1) {
2050 		cur_tsc = rte_rdtsc();
2051 
2052 		/* TX burst queue drain */
2053 		diff_tsc = cur_tsc - prev_tsc;
2054 		if (unlikely(diff_tsc > drain_tsc)) {
2055 			/*
2056 			 * Get mbuf from vpool.pool and detach mbuf and
2057 			 * put back into vpool.ring.
2058 			 */
2059 			dev_ll = lcore_ll->ll_root_used;
2060 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2061 				/* Get virtio device ID */
2062 				vdev = dev_ll->vdev;
2063 				dev = vdev->dev;
2064 
2065 				if (likely(!vdev->remove)) {
2066 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2067 					if (tx_q->len) {
2068 						LOG_DEBUG(VHOST_DATA,
2069 						"TX queue drained after timeout"
2070 						" with burst size %u\n",
2071 						tx_q->len);
2072 
2073 						/*
2074 						 * Tx any packets in the queue
2075 						 */
2076 						ret = rte_eth_tx_burst(
2077 							ports[0],
2078 							(uint16_t)tx_q->txq_id,
2079 							(struct rte_mbuf **)
2080 							tx_q->m_table,
2081 							(uint16_t)tx_q->len);
2082 						if (unlikely(ret < tx_q->len)) {
2083 							do {
2084 								rte_pktmbuf_free(
2085 									tx_q->m_table[ret]);
2086 							} while (++ret < tx_q->len);
2087 						}
2088 						tx_q->len = 0;
2089 
2090 						txmbuf_clean_zcp(dev,
2091 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2092 					}
2093 				}
2094 				dev_ll = dev_ll->next;
2095 			}
2096 			prev_tsc = cur_tsc;
2097 		}
2098 
2099 		rte_prefetch0(lcore_ll->ll_root_used);
2100 
2101 		/*
2102 		 * Inform the configuration core that we have exited the linked
2103 		 * list and that no devices are in use if requested.
2104 		 */
2105 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2106 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2107 
2108 		/* Process devices */
2109 		dev_ll = lcore_ll->ll_root_used;
2110 
2111 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2112 			vdev = dev_ll->vdev;
2113 			dev  = vdev->dev;
2114 			if (unlikely(vdev->remove)) {
2115 				dev_ll = dev_ll->next;
2116 				unlink_vmdq(vdev);
2117 				vdev->ready = DEVICE_SAFE_REMOVE;
2118 				continue;
2119 			}
2120 
2121 			if (likely(vdev->ready == DEVICE_RX)) {
2122 				uint32_t index = vdev->vmdq_rx_q;
2123 				uint16_t i;
2124 				count_in_ring
2125 				= rte_ring_count(vpool_array[index].ring);
2126 				uint16_t free_entries
2127 				= (uint16_t)get_available_ring_num_zcp(dev);
2128 
2129 				/*
2130 				 * Attach all mbufs in vpool.ring and put back
2131 				 * into vpool.pool.
2132 				 */
2133 				for (i = 0;
2134 				i < RTE_MIN(free_entries,
2135 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2136 				i++)
2137 					attach_rxmbuf_zcp(dev);
2138 
2139 				/* Handle guest RX */
2140 				rx_count = rte_eth_rx_burst(ports[0],
2141 					vdev->vmdq_rx_q, pkts_burst,
2142 					MAX_PKT_BURST);
2143 
2144 				if (rx_count) {
2145 					ret_count = virtio_dev_rx_zcp(dev,
2146 							pkts_burst, rx_count);
2147 					if (enable_stats) {
2148 						dev_statistics[dev->device_fh].rx_total
2149 							+= rx_count;
2150 						dev_statistics[dev->device_fh].rx
2151 							+= ret_count;
2152 					}
2153 					while (likely(rx_count)) {
2154 						rx_count--;
2155 						pktmbuf_detach_zcp(
2156 							pkts_burst[rx_count]);
2157 						rte_ring_sp_enqueue(
2158 							vpool_array[index].ring,
2159 							(void *)pkts_burst[rx_count]);
2160 					}
2161 				}
2162 			}
2163 
2164 			if (likely(!vdev->remove))
2165 				/* Handle guest TX */
2166 				virtio_dev_tx_zcp(dev);
2167 
2168 			/* Move to the next device in the list */
2169 			dev_ll = dev_ll->next;
2170 		}
2171 	}
2172 
2173 	return 0;
2174 }
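
/*
 * Illustrative sketch (not part of the original code): the TSC-based drain
 * timer driving the periodic TX flush in switch_worker_zcp(), reduced to a
 * single hypothetical helper. 'drain_tsc' is the same ~100us budget derived
 * from BURST_TX_DRAIN_US above.
 */
static inline int
drain_timer_expired_sketch(uint64_t *prev_tsc, uint64_t drain_tsc)
{
	uint64_t cur_tsc = rte_rdtsc();

	if (cur_tsc - *prev_tsc > drain_tsc) {
		*prev_tsc = cur_tsc;
		return 1;
	}
	return 0;
}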
2175 
2176 
2177 /*
2178  * Add an entry to a used linked list. A free entry must first be found
2179  * in the free linked list using get_data_ll_free_entry();
2180  */
2181 static void
2182 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2183 	struct virtio_net_data_ll *ll_dev)
2184 {
2185 	struct virtio_net_data_ll *ll = *ll_root_addr;
2186 
2187 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2188 	ll_dev->next = NULL;
2189 	rte_compiler_barrier();
2190 
2191 	/* If ll == NULL then this is the first device. */
2192 	if (ll) {
2193 		/* Increment to the tail of the linked list. */
2194 		while (ll->next != NULL)
2195 			ll = ll->next;
2196 
2197 		ll->next = ll_dev;
2198 	} else {
2199 		*ll_root_addr = ll_dev;
2200 	}
2201 }
2202 
2203 /*
2204  * Remove an entry from a used linked list. The entry must then be added to
2205  * the free linked list using put_data_ll_free_entry().
2206  */
2207 static void
2208 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2209 	struct virtio_net_data_ll *ll_dev,
2210 	struct virtio_net_data_ll *ll_dev_last)
2211 {
2212 	struct virtio_net_data_ll *ll = *ll_root_addr;
2213 
2214 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2215 		return;
2216 
2217 	if (ll_dev == ll)
2218 		*ll_root_addr = ll_dev->next;
2219 	else
2220 		if (likely(ll_dev_last != NULL))
2221 			ll_dev_last->next = ll_dev->next;
2222 		else
2223 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2224 }
2225 
2226 /*
2227  * Find and return an entry from the free linked list.
2228  */
2229 static struct virtio_net_data_ll *
2230 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2231 {
2232 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2233 	struct virtio_net_data_ll *ll_dev;
2234 
2235 	if (ll_free == NULL)
2236 		return NULL;
2237 
2238 	ll_dev = ll_free;
2239 	*ll_root_addr = ll_free->next;
2240 
2241 	return ll_dev;
2242 }
2243 
2244 /*
2245  * Place an entry back on to the free linked list.
2246  */
2247 static void
2248 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2249 	struct virtio_net_data_ll *ll_dev)
2250 {
2251 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2252 
2253 	if (ll_dev == NULL)
2254 		return;
2255 
2256 	ll_dev->next = ll_free;
2257 	*ll_root_addr = ll_dev;
2258 }
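
/*
 * Illustrative usage sketch (not part of the original code): moving an entry
 * from a free list to a used list with the helpers above. The list roots and
 * the vhost_dev pointer are hypothetical parameters.
 */
static inline int
ll_move_free_to_used_sketch(struct virtio_net_data_ll **free_root,
	struct virtio_net_data_ll **used_root, struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *entry = get_data_ll_free_entry(free_root);

	if (entry == NULL)
		return -1;	/* No free entries left. */

	entry->vdev = vdev;
	add_data_ll_entry(used_root, entry);
	return 0;
}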
2259 
2260 /*
2261  * Creates a linked list of a given size.
2262  */
2263 static struct virtio_net_data_ll *
2264 alloc_data_ll(uint32_t size)
2265 {
2266 	struct virtio_net_data_ll *ll_new;
2267 	uint32_t i;
2268 
2269 	/* Malloc and then chain the linked list. */
2270 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2271 	if (ll_new == NULL) {
2272 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2273 		return NULL;
2274 	}
2275 
2276 	for (i = 0; i < size - 1; i++) {
2277 		ll_new[i].vdev = NULL;
2278 		ll_new[i].next = &ll_new[i+1];
2279 	}
2280 	ll_new[i].next = NULL;
2281 
2282 	return (ll_new);
2283 }
2284 
2285 /*
2286  * Create the main linked list along with each individual core's linked list. A used and a free list
2287  * are created to manage entries.
2288  */
2289 static int
2290 init_data_ll (void)
2291 {
2292 	int lcore;
2293 
2294 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2295 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2296 		if (lcore_info[lcore].lcore_ll == NULL) {
2297 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2298 			return -1;
2299 		}
2300 
2301 		lcore_info[lcore].lcore_ll->device_num = 0;
2302 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2303 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2304 		if (num_devices % num_switching_cores)
2305 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2306 		else
2307 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2308 	}
2309 
2310 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2311 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2312 
2313 	return 0;
2314 }
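
/*
 * Illustrative sketch (not part of the original code): the per-core list
 * sizing in init_data_ll() is a ceiling division; assuming a non-zero core
 * count, the same result can be written as a single expression.
 */
static inline uint32_t
ll_entries_per_core_sketch(uint32_t devices, uint32_t cores)
{
	return (devices + cores - 1) / cores;
}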
2315 
2316 /*
2317  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2318  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2319  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2320  */
2321 static void
2322 destroy_device (volatile struct virtio_net *dev)
2323 {
2324 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2325 	struct virtio_net_data_ll *ll_main_dev_cur;
2326 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2327 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2328 	struct vhost_dev *vdev;
2329 	int lcore;
2330 
2331 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2332 
2333 	vdev = (struct vhost_dev *)dev->priv;
2334 	/* Set the remove flag. */
2335 	vdev->remove = 1;
2336 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2337 		rte_pause();
2338 	}
2339 
2340 	/* Search for entry to be removed from lcore ll */
2341 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2342 	while (ll_lcore_dev_cur != NULL) {
2343 		if (ll_lcore_dev_cur->vdev == vdev) {
2344 			break;
2345 		} else {
2346 			ll_lcore_dev_last = ll_lcore_dev_cur;
2347 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2348 		}
2349 	}
2350 
2351 	if (ll_lcore_dev_cur == NULL) {
2352 		RTE_LOG(ERR, VHOST_CONFIG,
2353 			"(%"PRIu64") Failed to find the dev to be destroy.\n",
2354 			dev->device_fh);
2355 		return;
2356 	}
2357 
2358 	/* Search for entry to be removed from main ll */
2359 	ll_main_dev_cur = ll_root_used;
2360 	ll_main_dev_last = NULL;
2361 	while (ll_main_dev_cur != NULL) {
2362 		if (ll_main_dev_cur->vdev == vdev) {
2363 			break;
2364 		} else {
2365 			ll_main_dev_last = ll_main_dev_cur;
2366 			ll_main_dev_cur = ll_main_dev_cur->next;
2367 		}
2368 	}
2369 
2370 	/* Remove entries from the lcore and main ll. */
2371 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2372 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2373 
2374 	/* Set the dev_removal_flag on each lcore. */
2375 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2376 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2377 	}
2378 
2379 	/*
2380 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2381 	 * they can no longer access the device removed from the linked lists and that the devices
2382 	 * are no longer in use.
2383 	 */
2384 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2385 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2386 			rte_pause();
2387 		}
2388 	}
2389 
2390 	/* Add the entries back to the lcore and main free ll.*/
2391 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2392 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2393 
2394 	/* Decrement number of device on the lcore. */
2395 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2396 
2397 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2398 
2399 	if (zero_copy) {
2400 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2401 
2402 		/* Stop the RX queue. */
2403 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2404 			LOG_DEBUG(VHOST_CONFIG,
2405 				"(%"PRIu64") In destroy_device: Failed to stop "
2406 				"rx queue:%d\n",
2407 				dev->device_fh,
2408 				vdev->vmdq_rx_q);
2409 		}
2410 
2411 		LOG_DEBUG(VHOST_CONFIG,
2412 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2413 			"mempool back to ring for RX queue: %d\n",
2414 			dev->device_fh, vdev->vmdq_rx_q);
2415 
2416 		mbuf_destroy_zcp(vpool);
2417 
2418 		/* Stop the TX queue. */
2419 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2420 			LOG_DEBUG(VHOST_CONFIG,
2421 				"(%"PRIu64") In destroy_device: Failed to "
2422 				"stop tx queue:%d\n",
2423 				dev->device_fh, vdev->vmdq_rx_q);
2424 		}
2425 
2426 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2427 
2428 		LOG_DEBUG(VHOST_CONFIG,
2429 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2430 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2431 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2432 			dev->device_fh);
2433 
2434 		mbuf_destroy_zcp(vpool);
2435 		rte_free(vdev->regions_hpa);
2436 	}
2437 	rte_free(vdev);
2438 
2439 }
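
/*
 * Illustrative sketch (not part of the original code): the removal handshake
 * used by destroy_device(), reduced to its core — request removal on every
 * worker lcore, then spin until each worker acknowledges that it has left the
 * linked list.
 */
static inline void
wait_for_dev_removal_ack_sketch(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;

	RTE_LCORE_FOREACH_SLAVE(lcore)
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
}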
2440 
2441 /*
2442  * Calculate the number of physically contiguous sub-regions within one
2443  * particular region whose vhost virtual address range is contiguous. The
2444  * region starts at vva_start and spans 'size' bytes.
2445  */
2446 static uint32_t
2447 check_hpa_regions(uint64_t vva_start, uint64_t size)
2448 {
2449 	uint32_t i, nregions = 0, page_size = getpagesize();
2450 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2451 	if (vva_start % page_size) {
2452 		LOG_DEBUG(VHOST_CONFIG,
2453 			"in check_countinous: vva start(%p) mod page_size(%d) "
2454 			"has remainder\n",
2455 			(void *)(uintptr_t)vva_start, page_size);
2456 		return 0;
2457 	}
2458 	if (size % page_size) {
2459 		LOG_DEBUG(VHOST_CONFIG,
2460 			"in check_countinous: "
2461 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2462 			size, page_size);
2463 		return 0;
2464 	}
2465 	for (i = 0; i < size - page_size; i = i + page_size) {
2466 		cur_phys_addr
2467 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2468 		next_phys_addr = rte_mem_virt2phy(
2469 			(void *)(uintptr_t)(vva_start + i + page_size));
2470 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2471 			++nregions;
2472 			LOG_DEBUG(VHOST_CONFIG,
2473 				"in check_continuous: hva addr:(%p) is not "
2474 				"continuous with hva addr:(%p), diff:%d\n",
2475 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2476 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2477 				+ page_size), page_size);
2478 			LOG_DEBUG(VHOST_CONFIG,
2479 				"in check_continuous: hpa addr:(%p) is not "
2480 				"continuous with hpa addr:(%p), "
2481 				"diff:(%"PRIu64")\n",
2482 				(void *)(uintptr_t)cur_phys_addr,
2483 				(void *)(uintptr_t)next_phys_addr,
2484 				(next_phys_addr-cur_phys_addr));
2485 		}
2486 	}
2487 	return nregions;
2488 }
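
/*
 * Illustrative sketch (not part of the original code): the contiguity test
 * check_hpa_regions() applies at every page boundary — two virtually adjacent
 * pages are physically contiguous iff their physical addresses differ by
 * exactly one page.
 */
static inline int
pages_phys_contiguous_sketch(uint64_t vva, uint32_t page_size)
{
	phys_addr_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
	phys_addr_t next = rte_mem_virt2phy((void *)(uintptr_t)(vva + page_size));

	return (cur + page_size) == next;
}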
2489 
2490 /*
2491  * Divide each region whose vhost virtual address range is contiguous into
2492  * sub-regions so that the physical addresses within each sub-region are
2493  * contiguous, and fill the offset (to GPA), size and other information of
2494  * each sub-region into regions_hpa.
2495  */
2496 static uint32_t
2497 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2498 {
2499 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2500 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2501 
2502 	if (mem_region_hpa == NULL)
2503 		return 0;
2504 
2505 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2506 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2507 			virtio_memory->regions[regionidx].address_offset;
2508 		mem_region_hpa[regionidx_hpa].guest_phys_address
2509 			= virtio_memory->regions[regionidx].guest_phys_address;
2510 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2511 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2512 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2513 		LOG_DEBUG(VHOST_CONFIG,
2514 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2515 			regionidx_hpa,
2516 			(void *)(uintptr_t)
2517 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2518 		LOG_DEBUG(VHOST_CONFIG,
2519 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2520 			regionidx_hpa,
2521 			(void *)(uintptr_t)
2522 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2523 		for (i = 0, k = 0;
2524 			i < virtio_memory->regions[regionidx].memory_size -
2525 				page_size;
2526 			i += page_size) {
2527 			cur_phys_addr = rte_mem_virt2phy(
2528 					(void *)(uintptr_t)(vva_start + i));
2529 			next_phys_addr = rte_mem_virt2phy(
2530 					(void *)(uintptr_t)(vva_start +
2531 					i + page_size));
2532 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2533 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2534 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2535 					k + page_size;
2536 				mem_region_hpa[regionidx_hpa].memory_size
2537 					= k + page_size;
2538 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2539 					"phys addr end  [%d]:(%p)\n",
2540 					regionidx_hpa,
2541 					(void *)(uintptr_t)
2542 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2543 				LOG_DEBUG(VHOST_CONFIG,
2544 					"in fill_hpa_regions: guest phys addr "
2545 					"size [%d]:(%p)\n",
2546 					regionidx_hpa,
2547 					(void *)(uintptr_t)
2548 					(mem_region_hpa[regionidx_hpa].memory_size));
2549 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2550 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2551 				++regionidx_hpa;
2552 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2553 					next_phys_addr -
2554 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2555 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2556 					" phys addr start[%d]:(%p)\n",
2557 					regionidx_hpa,
2558 					(void *)(uintptr_t)
2559 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2560 				LOG_DEBUG(VHOST_CONFIG,
2561 					"in fill_hpa_regions: host  phys addr "
2562 					"start[%d]:(%p)\n",
2563 					regionidx_hpa,
2564 					(void *)(uintptr_t)
2565 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2566 				k = 0;
2567 			} else {
2568 				k += page_size;
2569 			}
2570 		}
2571 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2572 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2573 			+ k + page_size;
2574 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2575 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2576 			"[%d]:(%p)\n", regionidx_hpa,
2577 			(void *)(uintptr_t)
2578 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2579 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2580 			"[%d]:(%p)\n", regionidx_hpa,
2581 			(void *)(uintptr_t)
2582 			(mem_region_hpa[regionidx_hpa].memory_size));
2583 		++regionidx_hpa;
2584 	}
2585 	return regionidx_hpa;
2586 }
2587 
2588 /*
2589  * A new device is added to a data core. First the device is added to the main linked list
2590  * and then allocated to a specific data core.
2591  */
2592 static int
2593 new_device (struct virtio_net *dev)
2594 {
2595 	struct virtio_net_data_ll *ll_dev;
2596 	int lcore, core_add = 0;
2597 	uint32_t device_num_min = num_devices;
2598 	struct vhost_dev *vdev;
2599 	uint32_t regionidx;
2600 
2601 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2602 	if (vdev == NULL) {
2603 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2604 			dev->device_fh);
2605 		return -1;
2606 	}
2607 	vdev->dev = dev;
2608 	dev->priv = vdev;
2609 
2610 	if (zero_copy) {
2611 		vdev->nregions_hpa = dev->mem->nregions;
2612 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2613 			vdev->nregions_hpa
2614 				+= check_hpa_regions(
2615 					dev->mem->regions[regionidx].guest_phys_address
2616 					+ dev->mem->regions[regionidx].address_offset,
2617 					dev->mem->regions[regionidx].memory_size);
2618 
2619 		}
2620 
2621 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2622 					       vdev->nregions_hpa,
2623 					       sizeof(struct virtio_memory_regions_hpa),
2624 					       RTE_CACHE_LINE_SIZE);
2625 		if (vdev->regions_hpa == NULL) {
2626 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2627 			rte_free(vdev);
2628 			return -1;
2629 		}
2630 
2631 
2632 		if (fill_hpa_memory_regions(
2633 			vdev->regions_hpa, dev->mem
2634 			) != vdev->nregions_hpa) {
2635 
2636 			RTE_LOG(ERR, VHOST_CONFIG,
2637 				"hpa memory regions number mismatch: "
2638 				"[%d]\n", vdev->nregions_hpa);
2639 			rte_free(vdev->regions_hpa);
2640 			rte_free(vdev);
2641 			return -1;
2642 		}
2643 	}
2644 
2645 
2646 	/* Add device to main ll */
2647 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2648 	if (ll_dev == NULL) {
2649 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2650 			"of %d devices per core has been reached\n",
2651 			dev->device_fh, num_devices);
2652 		if (vdev->regions_hpa)
2653 			rte_free(vdev->regions_hpa);
2654 		rte_free(vdev);
2655 		return -1;
2656 	}
2657 	ll_dev->vdev = vdev;
2658 	add_data_ll_entry(&ll_root_used, ll_dev);
2659 	vdev->vmdq_rx_q
2660 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2661 
2662 	if (zero_copy) {
2663 		uint32_t index = vdev->vmdq_rx_q;
2664 		uint32_t count_in_ring, i;
2665 		struct mbuf_table *tx_q;
2666 
2667 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2668 
2669 		LOG_DEBUG(VHOST_CONFIG,
2670 			"(%"PRIu64") in new_device: mbuf count in mempool "
2671 			"before attach is: %d\n",
2672 			dev->device_fh,
2673 			rte_mempool_count(vpool_array[index].pool));
2674 		LOG_DEBUG(VHOST_CONFIG,
2675 			"(%"PRIu64") in new_device: mbuf count in  ring "
2676 			"before attach  is : %d\n",
2677 			dev->device_fh, count_in_ring);
2678 
2679 		/*
2680 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2681 		 */
2682 		for (i = 0; i < count_in_ring; i++)
2683 			attach_rxmbuf_zcp(dev);
2684 
2685 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2686 			"mempool after attach is: %d\n",
2687 			dev->device_fh,
2688 			rte_mempool_count(vpool_array[index].pool));
2689 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2690 			"ring after attach  is : %d\n",
2691 			dev->device_fh,
2692 			rte_ring_count(vpool_array[index].ring));
2693 
2694 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2695 		tx_q->txq_id = vdev->vmdq_rx_q;
2696 
2697 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2698 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2699 
2700 			LOG_DEBUG(VHOST_CONFIG,
2701 				"(%"PRIu64") In new_device: Failed to start "
2702 				"tx queue:%d\n",
2703 				dev->device_fh, vdev->vmdq_rx_q);
2704 
2705 			mbuf_destroy_zcp(vpool);
2706 			rte_free(vdev->regions_hpa);
2707 			rte_free(vdev);
2708 			return -1;
2709 		}
2710 
2711 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2712 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2713 
2714 			LOG_DEBUG(VHOST_CONFIG,
2715 				"(%"PRIu64") In new_device: Failed to start "
2716 				"rx queue:%d\n",
2717 				dev->device_fh, vdev->vmdq_rx_q);
2718 
2719 			/* Stop the TX queue. */
2720 			if (rte_eth_dev_tx_queue_stop(ports[0],
2721 				vdev->vmdq_rx_q) != 0) {
2722 				LOG_DEBUG(VHOST_CONFIG,
2723 					"(%"PRIu64") In new_device: Failed to "
2724 					"stop tx queue:%d\n",
2725 					dev->device_fh, vdev->vmdq_rx_q);
2726 			}
2727 
2728 			mbuf_destroy_zcp(vpool);
2729 			rte_free(vdev->regions_hpa);
2730 			rte_free(vdev);
2731 			return -1;
2732 		}
2733 
2734 	}
2735 
2736 	/*reset ready flag*/
2737 	vdev->ready = DEVICE_MAC_LEARNING;
2738 	vdev->remove = 0;
2739 
2740 	/* Find a suitable lcore to add the device. */
2741 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2742 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2743 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2744 			core_add = lcore;
2745 		}
2746 	}
2747 	/* Add device to lcore ll */
2748 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2749 	if (ll_dev == NULL) {
2750 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2751 		vdev->ready = DEVICE_SAFE_REMOVE;
2752 		destroy_device(dev);
2753 		rte_free(vdev->regions_hpa);
2754 		rte_free(vdev);
2755 		return -1;
2756 	}
2757 	ll_dev->vdev = vdev;
2758 	vdev->coreid = core_add;
2759 
2760 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2761 
2762 	/* Initialize device stats */
2763 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2764 
2765 	/* Disable notifications. */
2766 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2767 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2768 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2769 	dev->flags |= VIRTIO_DEV_RUNNING;
2770 
2771 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2772 
2773 	return 0;
2774 }
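
/*
 * Illustrative sketch (not part of the original code): the VMDq RX queue
 * picked for a new device in new_device() is simply the first queue of the
 * VMDq pool assigned to that device, using the same globals as above.
 */
static inline uint16_t
vmdq_rx_queue_for_device_sketch(uint64_t device_fh)
{
	return (uint16_t)(device_fh * queues_per_pool + vmdq_queue_base);
}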
2775 
2776 /*
2777  * These callbacks allow devices to be added to the data core when
2778  * configuration has been fully completed.
2779  */
2780 static const struct virtio_net_device_ops virtio_net_device_ops =
2781 {
2782 	.new_device =  new_device,
2783 	.destroy_device = destroy_device,
2784 };
2785 
2786 /*
2787  * This thread wakes up periodically to print statistics if the user has
2788  * enabled them.
2789  */
2790 static void
2791 print_stats(void)
2792 {
2793 	struct virtio_net_data_ll *dev_ll;
2794 	uint64_t tx_dropped, rx_dropped;
2795 	uint64_t tx, tx_total, rx, rx_total;
2796 	uint32_t device_fh;
2797 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2798 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2799 
2800 	while(1) {
2801 		sleep(enable_stats);
2802 
2803 		/* Clear screen and move to top left */
2804 		printf("%s%s", clr, top_left);
2805 
2806 		printf("\nDevice statistics ====================================");
2807 
2808 		dev_ll = ll_root_used;
2809 		while (dev_ll != NULL) {
2810 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2811 			tx_total = dev_statistics[device_fh].tx_total;
2812 			tx = dev_statistics[device_fh].tx;
2813 			tx_dropped = tx_total - tx;
2814 			if (zero_copy == 0) {
2815 				rx_total = rte_atomic64_read(
2816 					&dev_statistics[device_fh].rx_total_atomic);
2817 				rx = rte_atomic64_read(
2818 					&dev_statistics[device_fh].rx_atomic);
2819 			} else {
2820 				rx_total = dev_statistics[device_fh].rx_total;
2821 				rx = dev_statistics[device_fh].rx;
2822 			}
2823 			rx_dropped = rx_total - rx;
2824 
2825 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2826 					"\nTX total: 		%"PRIu64""
2827 					"\nTX dropped: 		%"PRIu64""
2828 					"\nTX successful: 		%"PRIu64""
2829 					"\nRX total: 		%"PRIu64""
2830 					"\nRX dropped: 		%"PRIu64""
2831 					"\nRX successful: 		%"PRIu64"",
2832 					device_fh,
2833 					tx_total,
2834 					tx_dropped,
2835 					tx,
2836 					rx_total,
2837 					rx_dropped,
2838 					rx);
2839 
2840 			dev_ll = dev_ll->next;
2841 		}
2842 		printf("\n======================================================\n");
2843 	}
2844 }
2845 
2846 static void
2847 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2848 	char *ring_name, uint32_t nb_mbuf)
2849 {
2850 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2851 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2852 	if (vpool_array[index].pool != NULL) {
2853 		vpool_array[index].ring
2854 			= rte_ring_create(ring_name,
2855 				rte_align32pow2(nb_mbuf + 1),
2856 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2857 		if (likely(vpool_array[index].ring != NULL)) {
2858 			LOG_DEBUG(VHOST_CONFIG,
2859 				"in setup_mempool_tbl: mbuf count in "
2860 				"mempool is: %d\n",
2861 				rte_mempool_count(vpool_array[index].pool));
2862 			LOG_DEBUG(VHOST_CONFIG,
2863 				"in setup_mempool_tbl: mbuf count in "
2864 				"ring   is: %d\n",
2865 				rte_ring_count(vpool_array[index].ring));
2866 		} else {
2867 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2868 				ring_name);
2869 		}
2870 
2871 		/* Need to consider headroom. */
2872 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2873 	} else {
2874 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2875 	}
2876 }
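
/*
 * Illustrative sketch (not part of the original code): rte_ring_create()
 * expects a power-of-two count, which is why setup_mempool_tbl() rounds
 * nb_mbuf + 1 up with rte_align32pow2(). A hypothetical sizing helper:
 */
static inline unsigned
ring_size_for_mbufs_sketch(uint32_t nb_mbuf)
{
	/* +1 because a ring of size N can hold at most N - 1 entries. */
	return rte_align32pow2(nb_mbuf + 1);
}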
2877 
2878 /* When we receive an INT signal, unregister the vhost driver. */
2879 static void
2880 sigint_handler(__rte_unused int signum)
2881 {
2882 	/* Unregister vhost driver. */
2883 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2884 	if (ret != 0)
2885 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2886 	exit(0);
2887 }
2888 
2889 /*
2890  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2891  * device is also registered here to handle the IOCTLs.
2892  */
2893 int
2894 main(int argc, char *argv[])
2895 {
2896 	struct rte_mempool *mbuf_pool = NULL;
2897 	unsigned lcore_id, core_id = 0;
2898 	unsigned nb_ports, valid_num_ports;
2899 	int ret;
2900 	uint8_t portid;
2901 	uint16_t queue_id;
2902 	static pthread_t tid;
2903 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2904 
2905 	signal(SIGINT, sigint_handler);
2906 
2907 	/* init EAL */
2908 	ret = rte_eal_init(argc, argv);
2909 	if (ret < 0)
2910 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2911 	argc -= ret;
2912 	argv += ret;
2913 
2914 	/* parse app arguments */
2915 	ret = us_vhost_parse_args(argc, argv);
2916 	if (ret < 0)
2917 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2918 
2919 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2920 		if (rte_lcore_is_enabled(lcore_id))
2921 			lcore_ids[core_id ++] = lcore_id;
2922 
2923 	if (rte_lcore_count() > RTE_MAX_LCORE)
2924 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2925 
2926 	/* Set the number of switching cores available. */
2927 	num_switching_cores = rte_lcore_count()-1;
2928 
2929 	/* Get the number of physical ports. */
2930 	nb_ports = rte_eth_dev_count();
2931 	if (nb_ports > RTE_MAX_ETHPORTS)
2932 		nb_ports = RTE_MAX_ETHPORTS;
2933 
2934 	/*
2935 	 * Update the global var NUM_PORTS and global array PORTS
2936 	 * and get the value of VALID_NUM_PORTS according to the number of system ports
2937 	 */
2938 	valid_num_ports = check_ports_num(nb_ports);
2939 
2940 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2941 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2942 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2943 		return -1;
2944 	}
2945 
2946 	if (zero_copy == 0) {
2947 		/* Create the mbuf pool. */
2948 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2949 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2950 			0, MBUF_DATA_SIZE, rte_socket_id());
2951 		if (mbuf_pool == NULL)
2952 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2953 
2954 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2955 			vpool_array[queue_id].pool = mbuf_pool;
2956 
2957 		if (vm2vm_mode == VM2VM_HARDWARE) {
2958 			/* Enable VT loop back to let L2 switch to do it. */
2959 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2960 			LOG_DEBUG(VHOST_CONFIG,
2961 				"Enable loop back for L2 switch in vmdq.\n");
2962 		}
2963 	} else {
2964 		uint32_t nb_mbuf;
2965 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2966 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2967 
2968 		nb_mbuf = num_rx_descriptor
2969 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2970 			+ num_switching_cores * MAX_PKT_BURST;
2971 
2972 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2973 			snprintf(pool_name, sizeof(pool_name),
2974 				"rxmbuf_pool_%u", queue_id);
2975 			snprintf(ring_name, sizeof(ring_name),
2976 				"rxmbuf_ring_%u", queue_id);
2977 			setup_mempool_tbl(rte_socket_id(), queue_id,
2978 				pool_name, ring_name, nb_mbuf);
2979 		}
2980 
2981 		nb_mbuf = num_tx_descriptor
2982 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2983 				+ num_switching_cores * MAX_PKT_BURST;
2984 
2985 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2986 			snprintf(pool_name, sizeof(pool_name),
2987 				"txmbuf_pool_%u", queue_id);
2988 			snprintf(ring_name, sizeof(ring_name),
2989 				"txmbuf_ring_%u", queue_id);
2990 			setup_mempool_tbl(rte_socket_id(),
2991 				(queue_id + MAX_QUEUES),
2992 				pool_name, ring_name, nb_mbuf);
2993 		}
2994 
2995 		if (vm2vm_mode == VM2VM_HARDWARE) {
2996 			/* Enable VT loop back to let L2 switch to do it. */
2997 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2998 			LOG_DEBUG(VHOST_CONFIG,
2999 				"Enable loop back for L2 switch in vmdq.\n");
3000 		}
3001 	}
3002 	/* Set log level. */
3003 	rte_set_log_level(LOG_LEVEL);
3004 
3005 	/* initialize all ports */
3006 	for (portid = 0; portid < nb_ports; portid++) {
3007 		/* skip ports that are not enabled */
3008 		if ((enabled_port_mask & (1 << portid)) == 0) {
3009 			RTE_LOG(INFO, VHOST_PORT,
3010 				"Skipping disabled port %d\n", portid);
3011 			continue;
3012 		}
3013 		if (port_init(portid) != 0)
3014 			rte_exit(EXIT_FAILURE,
3015 				"Cannot initialize network ports\n");
3016 	}
3017 
3018 	/* Initialise all linked lists. */
3019 	if (init_data_ll() == -1)
3020 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3021 
3022 	/* Initialize device stats */
3023 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3024 
3025 	/* Enable stats if the user option is set. */
3026 	if (enable_stats) {
3027 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3028 		if (ret != 0)
3029 			rte_exit(EXIT_FAILURE,
3030 				"Cannot create print-stats thread\n");
3031 
3032 		/* Set thread_name for aid in debugging.  */
3033 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3034 		ret = rte_thread_setname(tid, thread_name);
3035 		if (ret != 0)
3036 			RTE_LOG(ERR, VHOST_CONFIG,
3037 				"Cannot set print-stats name\n");
3038 	}
3039 
3040 	/* Launch all data cores. */
3041 	if (zero_copy == 0) {
3042 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3043 			rte_eal_remote_launch(switch_worker,
3044 				mbuf_pool, lcore_id);
3045 		}
3046 	} else {
3047 		uint32_t count_in_mempool, index, i;
3048 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3049 			/* For all RX and TX queues. */
3050 			count_in_mempool
3051 				= rte_mempool_count(vpool_array[index].pool);
3052 
3053 			/*
3054 			 * Transfer all unattached mbufs from vpool.pool
3055 			 * to vpool.ring.
3056 			 */
3057 			for (i = 0; i < count_in_mempool; i++) {
3058 				struct rte_mbuf *mbuf
3059 					= __rte_mbuf_raw_alloc(
3060 						vpool_array[index].pool);
3061 				rte_ring_sp_enqueue(vpool_array[index].ring,
3062 						(void *)mbuf);
3063 			}
3064 
3065 			LOG_DEBUG(VHOST_CONFIG,
3066 				"in main: mbuf count in mempool at initial "
3067 				"is: %d\n", count_in_mempool);
3068 			LOG_DEBUG(VHOST_CONFIG,
3069 				"in main: mbuf count in  ring at initial  is :"
3070 				" %d\n",
3071 				rte_ring_count(vpool_array[index].ring));
3072 		}
3073 
3074 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3075 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3076 				lcore_id);
3077 	}
3078 
3079 	if (mergeable == 0)
3080 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3081 
3082 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3083 	ret = rte_vhost_driver_register((char *)&dev_basename);
3084 	if (ret != 0)
3085 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3086 
3087 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3088 
3089 	/* Start CUSE session. */
3090 	rte_vhost_driver_session_start();
3091 	return 0;
3092 
3093 }
3094