xref: /dpdk/examples/vhost/main.c (revision 176582b8a4868822b4e827a63af3db169fe03d4a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
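/*
 * Note: NUM_MBUFS_PER_PORT expands to a runtime expression (it references the
 * num_switching_cores variable), so it is only meaningful once the number of
 * switching cores has been determined.
 */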
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
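/*
 * 0x2600 is 9728 bytes; used as max_rx_pkt_len when mergeable buffers are
 * enabled (see the --mergeable option handling below).
 */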
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These 2 macros need refining for the legacy and DPDK based front ends:
105  * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
106  * then adjusted to a power of 2.
107  */
108 /*
109  * For legacy front end, 128 descriptors,
110  * half for virtio header, another half for mbuf.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
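/*
 * The zero copy path stashes the virtio descriptor index of the attached
 * guest buffer in this word so it can later be written back to the used ring.
 */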
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
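/* Note: x == 0 also passes this test. */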
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
141 
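/*
 * True when the mbuf's data buffer is not the one embedded directly after the
 * mbuf header, i.e. a guest buffer attached by the zero copy path.
 */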
142 #define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
143 
144 /* mask of enabled ports */
145 static uint32_t enabled_port_mask = 0;
146 
147 /* Promiscuous mode */
148 static uint32_t promiscuous;
149 
150 /* Number of switching cores enabled */
151 static uint32_t num_switching_cores = 0;
152 
153 /* number of devices/queues to support*/
154 static uint32_t num_queues = 0;
155 static uint32_t num_devices;
156 
157 /*
158  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
159  * descriptors; disabled by default.
160  */
161 static uint32_t zero_copy;
162 static int mergeable;
163 
164 /* Do VLAN strip on the host, enabled by default */
165 static uint32_t vlan_strip = 1;
166 
167 /* number of descriptors to apply*/
168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
170 
171 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
172 #define MAX_RING_DESC 4096
173 
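/*
 * Zero copy bookkeeping per queue: a mempool of mbuf headers, a ring of
 * currently detached mbufs waiting to be attached to guest buffers, and the
 * buffer size expected when attaching.
 */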
174 struct vpool {
175 	struct rte_mempool *pool;
176 	struct rte_ring *ring;
177 	uint32_t buf_size;
178 } vpool_array[MAX_QUEUES+MAX_QUEUES];
179 
180 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
181 typedef enum {
182 	VM2VM_DISABLED = 0,
183 	VM2VM_SOFTWARE = 1,
184 	VM2VM_HARDWARE = 2,
185 	VM2VM_LAST
186 } vm2vm_type;
187 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
188 
189 /* The type of host physical address translated from guest physical address. */
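/*
 * PHYS_ADDR_CONTINUOUS: the buffer lies entirely within one contiguous
 * host-physical sub-region; PHYS_ADDR_CROSS_SUBREG: it starts in a sub-region
 * but runs past its end, so it cannot be used as a single zero copy buffer.
 */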
190 typedef enum {
191 	PHYS_ADDR_CONTINUOUS = 0,
192 	PHYS_ADDR_CROSS_SUBREG = 1,
193 	PHYS_ADDR_INVALID = 2,
194 	PHYS_ADDR_LAST
195 } hpa_type;
196 
197 /* Enable stats. */
198 static uint32_t enable_stats = 0;
199 /* Enable retries on RX. */
200 static uint32_t enable_retry = 1;
201 /* Specify timeout (in microseconds) between retries on RX. */
202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
203 /* Specify the number of retries on RX. */
204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
205 
206 /* Character device basename. Can be set by user. */
207 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
208 
209 /* Empty VMDQ configuration structure. Filled in programmatically. */
210 static struct rte_eth_conf vmdq_conf_default = {
211 	.rxmode = {
212 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
213 		.split_hdr_size = 0,
214 		.header_split   = 0, /**< Header Split disabled */
215 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
216 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
217 		/*
218 		 * This is necessary for 1G NICs such as the I350;
219 		 * it fixes a bug where IPv4 forwarding in the guest can't
220 		 * forward packets from one virtio dev to another virtio dev.
221 		 */
222 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
223 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
224 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
225 	},
226 
227 	.txmode = {
228 		.mq_mode = ETH_MQ_TX_NONE,
229 	},
230 	.rx_adv_conf = {
231 		/*
232 		 * should be overridden separately in code with
233 		 * appropriate values
234 		 */
235 		.vmdq_rx_conf = {
236 			.nb_queue_pools = ETH_8_POOLS,
237 			.enable_default_pool = 0,
238 			.default_pool = 0,
239 			.nb_pool_maps = 0,
240 			.pool_map = {{0, 0},},
241 		},
242 	},
243 };
244 
245 static unsigned lcore_ids[RTE_MAX_LCORE];
246 static uint8_t ports[RTE_MAX_ETHPORTS];
247 static unsigned num_ports = 0; /**< The number of ports specified in command line */
248 static uint16_t num_pf_queues, num_vmdq_queues;
249 static uint16_t vmdq_pool_base, vmdq_queue_base;
250 static uint16_t queues_per_pool;
251 
252 static const uint16_t external_pkt_default_vlan_tag = 2000;
253 const uint16_t vlan_tags[] = {
254 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
255 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
256 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
257 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
258 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
259 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
260 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
261 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
262 };
263 
264 /* ethernet addresses of ports */
265 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
266 
267 /* heads for the main used and free linked lists for the data path. */
268 static struct virtio_net_data_ll *ll_root_used = NULL;
269 static struct virtio_net_data_ll *ll_root_free = NULL;
270 
271 /* Array of data core structures containing information on individual core linked lists. */
272 static struct lcore_info lcore_info[RTE_MAX_LCORE];
273 
274 /* Used for queueing bursts of TX packets. */
275 struct mbuf_table {
276 	unsigned len;
277 	unsigned txq_id;
278 	struct rte_mbuf *m_table[MAX_PKT_BURST];
279 };
280 
281 /* TX queue for each data core. */
282 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
283 
284 /* TX queue for each virtio device for zero copy. */
285 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
286 
287 /* Vlan header struct used to insert vlan tags on TX. */
288 struct vlan_ethhdr {
289 	unsigned char   h_dest[ETH_ALEN];
290 	unsigned char   h_source[ETH_ALEN];
291 	__be16          h_vlan_proto;
292 	__be16          h_vlan_TCI;
293 	__be16          h_vlan_encapsulated_proto;
294 };
295 
296 /* IPv4 Header */
297 struct ipv4_hdr {
298 	uint8_t  version_ihl;		/**< version and header length */
299 	uint8_t  type_of_service;	/**< type of service */
300 	uint16_t total_length;		/**< length of packet */
301 	uint16_t packet_id;		/**< packet ID */
302 	uint16_t fragment_offset;	/**< fragmentation offset */
303 	uint8_t  time_to_live;		/**< time to live */
304 	uint8_t  next_proto_id;		/**< protocol ID */
305 	uint16_t hdr_checksum;		/**< header checksum */
306 	uint32_t src_addr;		/**< source address */
307 	uint32_t dst_addr;		/**< destination address */
308 } __attribute__((__packed__));
309 
310 /* Header lengths. */
311 #define VLAN_HLEN       4
312 #define VLAN_ETH_HLEN   18
313 
314 /* Per-device statistics struct */
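/*
 * The RX counters can be updated from more than one core (the core polling
 * the physical port and any core doing VM2VM software forwarding), so they
 * are atomics; the TX counters are only touched by the transmitting core.
 */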
315 struct device_statistics {
316 	uint64_t tx_total;
317 	rte_atomic64_t rx_total_atomic;
318 	uint64_t rx_total;
319 	uint64_t tx;
320 	rte_atomic64_t rx_atomic;
321 	uint64_t rx;
322 } __rte_cache_aligned;
323 struct device_statistics dev_statistics[MAX_DEVICES];
324 
325 /*
326  * Builds up the correct configuration for VMDQ VLAN pool map
327  * according to the pool & queue limits.
328  */
329 static inline int
330 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
331 {
332 	struct rte_eth_vmdq_rx_conf conf;
333 	struct rte_eth_vmdq_rx_conf *def_conf =
334 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
335 	unsigned i;
336 
337 	memset(&conf, 0, sizeof(conf));
338 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
339 	conf.nb_pool_maps = num_devices;
340 	conf.enable_loop_back = def_conf->enable_loop_back;
341 	conf.rx_mode = def_conf->rx_mode;
342 
343 	for (i = 0; i < conf.nb_pool_maps; i++) {
344 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
345 		conf.pool_map[i].pools = (1UL << i);
346 	}
347 
348 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
349 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
350 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
351 	return 0;
352 }
353 
354 /*
355  * Validate the device number against the max pool number obtained from
356  * dev_info. If the device number is invalid, give the error message and
357  * return -1. Each device must have its own pool.
358  */
359 static inline int
360 validate_num_devices(uint32_t max_nb_devices)
361 {
362 	if (num_devices > max_nb_devices) {
363 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
364 		return -1;
365 	}
366 	return 0;
367 }
368 
369 /*
370  * Initialises a given port using global settings and with the rx buffers
371  * coming from the mbuf_pool passed as parameter
372  */
373 static inline int
374 port_init(uint8_t port)
375 {
376 	struct rte_eth_dev_info dev_info;
377 	struct rte_eth_conf port_conf;
378 	struct rte_eth_rxconf *rxconf;
379 	struct rte_eth_txconf *txconf;
380 	int16_t rx_rings, tx_rings;
381 	uint16_t rx_ring_size, tx_ring_size;
382 	int retval;
383 	uint16_t q;
384 
385 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
386 	rte_eth_dev_info_get (port, &dev_info);
387 
388 	if (dev_info.max_rx_queues > MAX_QUEUES) {
389 		rte_exit(EXIT_FAILURE,
390 			"please define MAX_QUEUES no less than %u in %s\n",
391 			dev_info.max_rx_queues, __FILE__);
392 	}
393 
394 	rxconf = &dev_info.default_rxconf;
395 	txconf = &dev_info.default_txconf;
396 	rxconf->rx_drop_en = 1;
397 
398 	/* Enable vlan offload */
399 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
400 
401 	/*
402 	 * Zero copy defers queue RX/TX start to the time when guest
403 	 * finishes its startup and packet buffers from that guest are
404 	 * available.
405 	 */
406 	if (zero_copy) {
407 		rxconf->rx_deferred_start = 1;
408 		rxconf->rx_drop_en = 0;
409 		txconf->tx_deferred_start = 1;
410 	}
411 
412 	/* Configure the number of supported virtio devices based on VMDQ limits. */
413 	num_devices = dev_info.max_vmdq_pools;
414 
415 	if (zero_copy) {
416 		rx_ring_size = num_rx_descriptor;
417 		tx_ring_size = num_tx_descriptor;
418 		tx_rings = dev_info.max_tx_queues;
419 	} else {
420 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
421 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
422 		tx_rings = (uint16_t)rte_lcore_count();
423 	}
424 
425 	retval = validate_num_devices(MAX_DEVICES);
426 	if (retval < 0)
427 		return retval;
428 
429 	/* Get port configuration. */
430 	retval = get_eth_conf(&port_conf, num_devices);
431 	if (retval < 0)
432 		return retval;
433 	/* NIC queues are divided into pf queues and vmdq queues.  */
434 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
435 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
436 	num_vmdq_queues = num_devices * queues_per_pool;
437 	num_queues = num_pf_queues + num_vmdq_queues;
438 	vmdq_queue_base = dev_info.vmdq_queue_base;
439 	vmdq_pool_base  = dev_info.vmdq_pool_base;
440 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
441 		num_pf_queues, num_devices, queues_per_pool);
442 
443 	if (port >= rte_eth_dev_count()) return -1;
444 
445 	rx_rings = (uint16_t)dev_info.max_rx_queues;
446 	/* Configure ethernet device. */
447 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
448 	if (retval != 0)
449 		return retval;
450 
451 	/* Setup the queues. */
452 	for (q = 0; q < rx_rings; q ++) {
453 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
454 						rte_eth_dev_socket_id(port),
455 						rxconf,
456 						vpool_array[q].pool);
457 		if (retval < 0)
458 			return retval;
459 	}
460 	for (q = 0; q < tx_rings; q ++) {
461 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 						rte_eth_dev_socket_id(port),
463 						txconf);
464 		if (retval < 0)
465 			return retval;
466 	}
467 
468 	/* Start the device. */
469 	retval  = rte_eth_dev_start(port);
470 	if (retval < 0) {
471 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
472 		return retval;
473 	}
474 
475 	if (promiscuous)
476 		rte_eth_promiscuous_enable(port);
477 
478 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
479 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
480 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
481 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
482 			(unsigned)port,
483 			vmdq_ports_eth_addr[port].addr_bytes[0],
484 			vmdq_ports_eth_addr[port].addr_bytes[1],
485 			vmdq_ports_eth_addr[port].addr_bytes[2],
486 			vmdq_ports_eth_addr[port].addr_bytes[3],
487 			vmdq_ports_eth_addr[port].addr_bytes[4],
488 			vmdq_ports_eth_addr[port].addr_bytes[5]);
489 
490 	return 0;
491 }
492 
493 /*
494  * Set character device basename.
495  */
496 static int
497 us_vhost_parse_basename(const char *q_arg)
498 {
499 	/* parse basename string */
500 
501 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
502 		return -1;
503 	else
504 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
505 
506 	return 0;
507 }
508 
509 /*
510  * Parse the portmask provided at run time.
511  */
512 static int
513 parse_portmask(const char *portmask)
514 {
515 	char *end = NULL;
516 	unsigned long pm;
517 
518 	errno = 0;
519 
520 	/* parse hexadecimal string */
521 	pm = strtoul(portmask, &end, 16);
522 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
523 		return -1;
524 
525 	if (pm == 0)
526 		return -1;
527 
528 	return pm;
529 
530 }
531 
532 /*
533  * Parse num options at run time.
534  */
535 static int
536 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
537 {
538 	char *end = NULL;
539 	unsigned long num;
540 
541 	errno = 0;
542 
543 	/* parse unsigned int string */
544 	num = strtoul(q_arg, &end, 10);
545 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
546 		return -1;
547 
548 	if (num > max_valid_value)
549 		return -1;
550 
551 	return num;
552 
553 }
554 
555 /*
556  * Display usage
557  */
558 static void
559 us_vhost_usage(const char *prgname)
560 {
561 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
562 	"		--vm2vm [0|1|2]\n"
563 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
564 	"		--dev-basename <name>\n"
565 	"		--nb-devices ND\n"
566 	"		-p PORTMASK: Set mask for ports to be used by application\n"
567 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
568 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
569 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
570 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
571 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
572 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
573 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574 	"		--dev-basename: The basename to be used for the character device.\n"
575 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
576 			"zero copy\n"
577 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
578 			"used only when zero copy is enabled.\n"
579 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
580 			"used only when zero copy is enabled.\n",
581 	       prgname);
582 }
583 
584 /*
585  * Parse the arguments given in the command line of the application.
586  */
587 static int
588 us_vhost_parse_args(int argc, char **argv)
589 {
590 	int opt, ret;
591 	int option_index;
592 	unsigned i;
593 	const char *prgname = argv[0];
594 	static struct option long_option[] = {
595 		{"vm2vm", required_argument, NULL, 0},
596 		{"rx-retry", required_argument, NULL, 0},
597 		{"rx-retry-delay", required_argument, NULL, 0},
598 		{"rx-retry-num", required_argument, NULL, 0},
599 		{"mergeable", required_argument, NULL, 0},
600 		{"vlan-strip", required_argument, NULL, 0},
601 		{"stats", required_argument, NULL, 0},
602 		{"dev-basename", required_argument, NULL, 0},
603 		{"zero-copy", required_argument, NULL, 0},
604 		{"rx-desc-num", required_argument, NULL, 0},
605 		{"tx-desc-num", required_argument, NULL, 0},
606 		{NULL, 0, 0, 0},
607 	};
608 
609 	/* Parse command line */
610 	while ((opt = getopt_long(argc, argv, "p:P",
611 			long_option, &option_index)) != EOF) {
612 		switch (opt) {
613 		/* Portmask */
614 		case 'p':
615 			enabled_port_mask = parse_portmask(optarg);
616 			if (enabled_port_mask == 0) {
617 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
618 				us_vhost_usage(prgname);
619 				return -1;
620 			}
621 			break;
622 
623 		case 'P':
624 			promiscuous = 1;
625 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
626 				ETH_VMDQ_ACCEPT_BROADCAST |
627 				ETH_VMDQ_ACCEPT_MULTICAST;
628 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
629 
630 			break;
631 
632 		case 0:
633 			/* Enable/disable vm2vm comms. */
634 			if (!strncmp(long_option[option_index].name, "vm2vm",
635 				MAX_LONG_OPT_SZ)) {
636 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
637 				if (ret == -1) {
638 					RTE_LOG(INFO, VHOST_CONFIG,
639 						"Invalid argument for "
640 						"vm2vm [0|1|2]\n");
641 					us_vhost_usage(prgname);
642 					return -1;
643 				} else {
644 					vm2vm_mode = (vm2vm_type)ret;
645 				}
646 			}
647 
648 			/* Enable/disable retries on RX. */
649 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
650 				ret = parse_num_opt(optarg, 1);
651 				if (ret == -1) {
652 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
653 					us_vhost_usage(prgname);
654 					return -1;
655 				} else {
656 					enable_retry = ret;
657 				}
658 			}
659 
660 			/* Specify the retries delay time (in useconds) on RX. */
661 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
662 				ret = parse_num_opt(optarg, INT32_MAX);
663 				if (ret == -1) {
664 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
665 					us_vhost_usage(prgname);
666 					return -1;
667 				} else {
668 					burst_rx_delay_time = ret;
669 				}
670 			}
671 
672 			/* Specify the retries number on RX. */
673 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
674 				ret = parse_num_opt(optarg, INT32_MAX);
675 				if (ret == -1) {
676 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
677 					us_vhost_usage(prgname);
678 					return -1;
679 				} else {
680 					burst_rx_retry_num = ret;
681 				}
682 			}
683 
684 			/* Enable/disable RX mergeable buffers. */
685 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
686 				ret = parse_num_opt(optarg, 1);
687 				if (ret == -1) {
688 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
689 					us_vhost_usage(prgname);
690 					return -1;
691 				} else {
692 					mergeable = !!ret;
693 					if (ret) {
694 						vmdq_conf_default.rxmode.jumbo_frame = 1;
695 						vmdq_conf_default.rxmode.max_rx_pkt_len
696 							= JUMBO_FRAME_MAX_SIZE;
697 					}
698 				}
699 			}
700 
701 			/* Enable/disable RX VLAN strip on host. */
702 			if (!strncmp(long_option[option_index].name,
703 				"vlan-strip", MAX_LONG_OPT_SZ)) {
704 				ret = parse_num_opt(optarg, 1);
705 				if (ret == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG,
707 						"Invalid argument for VLAN strip [0|1]\n");
708 					us_vhost_usage(prgname);
709 					return -1;
710 				} else {
711 					vlan_strip = !!ret;
712 					vmdq_conf_default.rxmode.hw_vlan_strip =
713 						vlan_strip;
714 				}
715 			}
716 
717 			/* Enable/disable stats. */
718 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
719 				ret = parse_num_opt(optarg, INT32_MAX);
720 				if (ret == -1) {
721 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
722 					us_vhost_usage(prgname);
723 					return -1;
724 				} else {
725 					enable_stats = ret;
726 				}
727 			}
728 
729 			/* Set character device basename. */
730 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
731 				if (us_vhost_parse_basename(optarg) == -1) {
732 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
733 					us_vhost_usage(prgname);
734 					return -1;
735 				}
736 			}
737 
738 			/* Enable/disable rx/tx zero copy. */
739 			if (!strncmp(long_option[option_index].name,
740 				"zero-copy", MAX_LONG_OPT_SZ)) {
741 				ret = parse_num_opt(optarg, 1);
742 				if (ret == -1) {
743 					RTE_LOG(INFO, VHOST_CONFIG,
744 						"Invalid argument"
745 						" for zero-copy [0|1]\n");
746 					us_vhost_usage(prgname);
747 					return -1;
748 				} else
749 					zero_copy = ret;
750 			}
751 
752 			/* Specify the descriptor number on RX. */
753 			if (!strncmp(long_option[option_index].name,
754 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
755 				ret = parse_num_opt(optarg, MAX_RING_DESC);
756 				if ((ret == -1) || (!POWEROF2(ret))) {
757 					RTE_LOG(INFO, VHOST_CONFIG,
758 					"Invalid argument for rx-desc-num [0-N], "
759 					"power of 2 required.\n");
760 					us_vhost_usage(prgname);
761 					return -1;
762 				} else {
763 					num_rx_descriptor = ret;
764 				}
765 			}
766 
767 			/* Specify the descriptor number on TX. */
768 			if (!strncmp(long_option[option_index].name,
769 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
770 				ret = parse_num_opt(optarg, MAX_RING_DESC);
771 				if ((ret == -1) || (!POWEROF2(ret))) {
772 					RTE_LOG(INFO, VHOST_CONFIG,
773 					"Invalid argument for tx-desc-num [0-N], "
774 					"power of 2 required.\n");
775 					us_vhost_usage(prgname);
776 					return -1;
777 				} else {
778 					num_tx_descriptor = ret;
779 				}
780 			}
781 
782 			break;
783 
784 			/* Invalid option - print options. */
785 		default:
786 			us_vhost_usage(prgname);
787 			return -1;
788 		}
789 	}
790 
791 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
792 		if (enabled_port_mask & (1 << i))
793 			ports[num_ports++] = (uint8_t)i;
794 	}
795 
796 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
797 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
798 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
799 		return -1;
800 	}
801 
802 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
803 		RTE_LOG(INFO, VHOST_PORT,
804 			"Vhost zero copy doesn't support software vm2vm, "
805 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
806 		return -1;
807 	}
808 
809 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
810 		RTE_LOG(INFO, VHOST_PORT,
811 			"Vhost zero copy doesn't support jumbo frames, "
812 			"please specify '--mergeable 0' to disable the "
813 			"mergeable feature.\n");
814 		return -1;
815 	}
816 
817 	return 0;
818 }
819 
820 /*
821  * Update the global var num_ports and the array ports according to the number of
822  * system ports, and return the number of valid ports.
823  */
824 static unsigned check_ports_num(unsigned nb_ports)
825 {
826 	unsigned valid_num_ports = num_ports;
827 	unsigned portid;
828 
829 	if (num_ports > nb_ports) {
830 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
831 			num_ports, nb_ports);
832 		num_ports = nb_ports;
833 	}
834 
835 	for (portid = 0; portid < num_ports; portid ++) {
836 		if (ports[portid] >= nb_ports) {
837 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
838 				ports[portid], (nb_ports - 1));
839 			ports[portid] = INVALID_PORT_ID;
840 			valid_num_ports--;
841 		}
842 	}
843 	return valid_num_ports;
844 }
845 
846 /*
847  * Macro to print out packet contents. Wrapped in debug define so that the
848  * data path is not affected when debug is disabled.
849  */
850 #ifdef DEBUG
851 #define PRINT_PACKET(device, addr, size, header) do {																\
852 	char *pkt_addr = (char*)(addr);																					\
853 	unsigned int index;																								\
854 	char packet[MAX_PRINT_BUFF];																					\
855 																													\
856 	if ((header))																									\
857 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
858 	else																											\
859 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
860 	for (index = 0; index < (size); index++) {																		\
861 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
862 			"%02hhx ", pkt_addr[index]);																			\
863 	}																												\
864 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
865 																													\
866 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
867 } while(0)
868 #else
869 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
870 #endif
871 
872 /*
873  * Function to convert guest physical addresses to vhost physical addresses.
874  * This is used to convert virtio buffer addresses.
875  */
876 static inline uint64_t __attribute__((always_inline))
877 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
878 	uint32_t buf_len, hpa_type *addr_type)
879 {
880 	struct virtio_memory_regions_hpa *region;
881 	uint32_t regionidx;
882 	uint64_t vhost_pa = 0;
883 
884 	*addr_type = PHYS_ADDR_INVALID;
885 
886 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
887 		region = &vdev->regions_hpa[regionidx];
888 		if ((guest_pa >= region->guest_phys_address) &&
889 			(guest_pa <= region->guest_phys_address_end)) {
890 			vhost_pa = region->host_phys_addr_offset + guest_pa;
891 			if (likely((guest_pa + buf_len - 1)
892 				<= region->guest_phys_address_end))
893 				*addr_type = PHYS_ADDR_CONTINUOUS;
894 			else
895 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
896 			break;
897 		}
898 	}
899 
900 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
901 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
902 		(void *)(uintptr_t)vhost_pa);
903 
904 	return vhost_pa;
905 }
906 
907 /*
908  * Compares a packet destination MAC address to a device MAC address.
909  */
910 static inline int __attribute__((always_inline))
911 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
912 {
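	/*
	 * Compares the low 48 bits of a 64 bit load from each address; the
	 * MAC_ADDR_CMP mask assumes a little endian CPU, where those bits hold
	 * the 6 MAC bytes.
	 */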
913 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
914 }
915 
916 /*
917  * This function learns the MAC address of the device and registers this along with a
918  * vlan tag to a VMDQ.
919  */
920 static int
921 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
922 {
923 	struct ether_hdr *pkt_hdr;
924 	struct virtio_net_data_ll *dev_ll;
925 	struct virtio_net *dev = vdev->dev;
926 	int i, ret;
927 
928 	/* Learn MAC address of guest device from packet */
929 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
930 
931 	dev_ll = ll_root_used;
932 
933 	while (dev_ll != NULL) {
934 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
935 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
936 			return -1;
937 		}
938 		dev_ll = dev_ll->next;
939 	}
940 
941 	for (i = 0; i < ETHER_ADDR_LEN; i++)
942 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
943 
944 	/* vlan_tag currently uses the device_id. */
945 	vdev->vlan_tag = vlan_tags[dev->device_fh];
946 
947 	/* Print out VMDQ registration info. */
948 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
949 		dev->device_fh,
950 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
951 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
952 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
953 		vdev->vlan_tag);
954 
955 	/* Register the MAC address. */
956 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
957 				(uint32_t)dev->device_fh + vmdq_pool_base);
958 	if (ret)
959 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
960 					dev->device_fh);
961 
962 	/* Enable stripping of the vlan tag as we handle routing. */
963 	if (vlan_strip)
964 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
965 			(uint16_t)vdev->vmdq_rx_q, 1);
966 
967 	/* Set device as ready for RX. */
968 	vdev->ready = DEVICE_RX;
969 
970 	return 0;
971 }
972 
973 /*
974  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
975  * queue before disabling RX on the device.
976  */
977 static inline void
978 unlink_vmdq(struct vhost_dev *vdev)
979 {
980 	unsigned i = 0;
981 	unsigned rx_count;
982 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
983 
984 	if (vdev->ready == DEVICE_RX) {
985 		/*clear MAC and VLAN settings*/
986 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
987 		for (i = 0; i < 6; i++)
988 			vdev->mac_address.addr_bytes[i] = 0;
989 
990 		vdev->vlan_tag = 0;
991 
992 		/*Clear out the receive buffers*/
993 		rx_count = rte_eth_rx_burst(ports[0],
994 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
995 
996 		while (rx_count) {
997 			for (i = 0; i < rx_count; i++)
998 				rte_pktmbuf_free(pkts_burst[i]);
999 
1000 			rx_count = rte_eth_rx_burst(ports[0],
1001 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1002 		}
1003 
1004 		vdev->ready = DEVICE_MAC_LEARNING;
1005 	}
1006 }
1007 
1008 /*
1009  * Check if the packet destination MAC address is for a local device. If so then put
1010  * the packet on that devices RX queue. If not then return.
1011  */
1012 static inline int __attribute__((always_inline))
1013 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1014 {
1015 	struct virtio_net_data_ll *dev_ll;
1016 	struct ether_hdr *pkt_hdr;
1017 	uint64_t ret = 0;
1018 	struct virtio_net *dev = vdev->dev;
1019 	struct virtio_net *tdev; /* destination virtio device */
1020 
1021 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1022 
1023 	/*get the used devices list*/
1024 	dev_ll = ll_root_used;
1025 
1026 	while (dev_ll != NULL) {
1027 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1028 				          &dev_ll->vdev->mac_address)) {
1029 
1030 			/* Drop the packet if the TX packet is destined for the TX device. */
1031 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1032 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1033 							dev->device_fh);
1034 				return 0;
1035 			}
1036 			tdev = dev_ll->vdev->dev;
1037 
1038 
1039 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1040 
1041 			if (unlikely(dev_ll->vdev->remove)) {
1042 				/*drop the packet if the device is marked for removal*/
1043 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1044 			} else {
1045 				/*send the packet to the local virtio device*/
1046 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1047 				if (enable_stats) {
1048 					rte_atomic64_add(
1049 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1050 					1);
1051 					rte_atomic64_add(
1052 					&dev_statistics[tdev->device_fh].rx_atomic,
1053 					ret);
1054 					dev_statistics[tdev->device_fh].tx_total++;
1055 					dev_statistics[tdev->device_fh].tx += ret;
1056 				}
1057 			}
1058 
1059 			return 0;
1060 		}
1061 		dev_ll = dev_ll->next;
1062 	}
1063 
1064 	return -1;
1065 }
1066 
1067 /*
1068  * Check if the destination MAC of a packet belongs to a local VM,
1069  * and if so, get its vlan tag and offset.
1070  */
1071 static inline int __attribute__((always_inline))
1072 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1073 	uint32_t *offset, uint16_t *vlan_tag)
1074 {
1075 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1076 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1077 
1078 	while (dev_ll != NULL) {
1079 		if ((dev_ll->vdev->ready == DEVICE_RX)
1080 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1081 		&dev_ll->vdev->mac_address)) {
1082 			/*
1083 			 * Drop the packet if the TX packet is
1084 			 * destined for the TX device.
1085 			 */
1086 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1087 				LOG_DEBUG(VHOST_DATA,
1088 				"(%"PRIu64") TX: Source and destination"
1089 				" MAC addresses are the same. Dropping "
1090 				"packet.\n",
1091 				dev_ll->vdev->dev->device_fh);
1092 				return -1;
1093 			}
1094 
1095 			/*
1096 			 * HW vlan strip will reduce the packet length
1097 			 * by the length of the vlan tag, so we need to
1098 			 * restore the packet length by adding it back.
1099 			 */
1100 			*offset = VLAN_HLEN;
1101 			*vlan_tag =
1102 			(uint16_t)
1103 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1104 
1105 			LOG_DEBUG(VHOST_DATA,
1106 			"(%"PRIu64") TX: pkt to local VM device id:"
1107 			"(%"PRIu64") vlan tag: %d.\n",
1108 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1109 			*vlan_tag);
1110 
1111 			break;
1112 		}
1113 		dev_ll = dev_ll->next;
1114 	}
1115 	return 0;
1116 }
1117 
1118 /*
1119  * This function routes the TX packet to the correct interface. This may be a local device
1120  * or the physical port.
1121  */
1122 static inline void __attribute__((always_inline))
1123 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1124 {
1125 	struct mbuf_table *tx_q;
1126 	struct rte_mbuf **m_table;
1127 	unsigned len, ret, offset = 0;
1128 	const uint16_t lcore_id = rte_lcore_id();
1129 	struct virtio_net *dev = vdev->dev;
1130 	struct ether_hdr *nh;
1131 
1132 	/*check if destination is local VM*/
1133 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1134 		rte_pktmbuf_free(m);
1135 		return;
1136 	}
1137 
1138 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1139 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1140 			rte_pktmbuf_free(m);
1141 			return;
1142 		}
1143 	}
1144 
1145 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1146 
1147 	/*Add packet to the port tx queue*/
1148 	tx_q = &lcore_tx_queue[lcore_id];
1149 	len = tx_q->len;
1150 
1151 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1152 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1153 		/* Guest has inserted the vlan tag. */
1154 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1155 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1156 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1157 			(vh->vlan_tci != vlan_tag_be))
1158 			vh->vlan_tci = vlan_tag_be;
1159 	} else {
1160 		m->ol_flags = PKT_TX_VLAN_PKT;
1161 
1162 		/*
1163 		 * Find the right seg to adjust the data len when offset is
1164 		 * bigger than tail room size.
1165 		 */
1166 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1167 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1168 				m->data_len += offset;
1169 			else {
1170 				struct rte_mbuf *seg = m;
1171 
1172 				while ((seg->next != NULL) &&
1173 					(offset > rte_pktmbuf_tailroom(seg)))
1174 					seg = seg->next;
1175 
1176 				seg->data_len += offset;
1177 			}
1178 			m->pkt_len += offset;
1179 		}
1180 
1181 		m->vlan_tci = vlan_tag;
1182 	}
1183 
1184 	tx_q->m_table[len] = m;
1185 	len++;
1186 	if (enable_stats) {
1187 		dev_statistics[dev->device_fh].tx_total++;
1188 		dev_statistics[dev->device_fh].tx++;
1189 	}
1190 
1191 	if (unlikely(len == MAX_PKT_BURST)) {
1192 		m_table = (struct rte_mbuf **)tx_q->m_table;
1193 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1194 		/* Free any buffers not handled by TX and update the port stats. */
1195 		if (unlikely(ret < len)) {
1196 			do {
1197 				rte_pktmbuf_free(m_table[ret]);
1198 			} while (++ret < len);
1199 		}
1200 
1201 		len = 0;
1202 	}
1203 
1204 	tx_q->len = len;
1205 	return;
1206 }
1207 /*
1208  * This function is called by each data core. It handles all RX/TX registered with the
1209  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1210  * with all devices in the main linked list.
1211  */
1212 static int
1213 switch_worker(void *arg)
1214 {
1215 	struct rte_mempool *mbuf_pool = arg;
1216 	struct virtio_net *dev = NULL;
1217 	struct vhost_dev *vdev = NULL;
1218 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1219 	struct virtio_net_data_ll *dev_ll;
1220 	struct mbuf_table *tx_q;
1221 	volatile struct lcore_ll_info *lcore_ll;
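	/* Number of TSC cycles in roughly BURST_TX_DRAIN_US microseconds. */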
1222 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1223 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1224 	unsigned ret, i;
1225 	const uint16_t lcore_id = rte_lcore_id();
1226 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1227 	uint16_t rx_count = 0;
1228 	uint16_t tx_count;
1229 	uint32_t retry = 0;
1230 
1231 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1232 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1233 	prev_tsc = 0;
1234 
1235 	tx_q = &lcore_tx_queue[lcore_id];
1236 	for (i = 0; i < num_cores; i ++) {
1237 		if (lcore_ids[i] == lcore_id) {
1238 			tx_q->txq_id = i;
1239 			break;
1240 		}
1241 	}
1242 
1243 	while(1) {
1244 		cur_tsc = rte_rdtsc();
1245 		/*
1246 		 * TX burst queue drain
1247 		 */
1248 		diff_tsc = cur_tsc - prev_tsc;
1249 		if (unlikely(diff_tsc > drain_tsc)) {
1250 
1251 			if (tx_q->len) {
1252 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1253 
1254 				/*Tx any packets in the queue*/
1255 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1256 									   (struct rte_mbuf **)tx_q->m_table,
1257 									   (uint16_t)tx_q->len);
1258 				if (unlikely(ret < tx_q->len)) {
1259 					do {
1260 						rte_pktmbuf_free(tx_q->m_table[ret]);
1261 					} while (++ret < tx_q->len);
1262 				}
1263 
1264 				tx_q->len = 0;
1265 			}
1266 
1267 			prev_tsc = cur_tsc;
1268 
1269 		}
1270 
1271 		rte_prefetch0(lcore_ll->ll_root_used);
1272 		/*
1273 		 * Inform the configuration core that we have exited the linked list and that no devices are
1274 		 * in use if requested.
1275 		 */
1276 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1277 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1278 
1279 		/*
1280 		 * Process devices
1281 		 */
1282 		dev_ll = lcore_ll->ll_root_used;
1283 
1284 		while (dev_ll != NULL) {
1285 			/*get virtio device ID*/
1286 			vdev = dev_ll->vdev;
1287 			dev = vdev->dev;
1288 
1289 			if (unlikely(vdev->remove)) {
1290 				dev_ll = dev_ll->next;
1291 				unlink_vmdq(vdev);
1292 				vdev->ready = DEVICE_SAFE_REMOVE;
1293 				continue;
1294 			}
1295 			if (likely(vdev->ready == DEVICE_RX)) {
1296 				/*Handle guest RX*/
1297 				rx_count = rte_eth_rx_burst(ports[0],
1298 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1299 
1300 				if (rx_count) {
1301 					/*
1302 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1303 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1304 					*/
1305 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1306 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1307 							rte_delay_us(burst_rx_delay_time);
1308 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1309 								break;
1310 						}
1311 					}
1312 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1313 					if (enable_stats) {
1314 						rte_atomic64_add(
1315 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1316 						rx_count);
1317 						rte_atomic64_add(
1318 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1319 					}
1320 					while (likely(rx_count)) {
1321 						rx_count--;
1322 						rte_pktmbuf_free(pkts_burst[rx_count]);
1323 					}
1324 
1325 				}
1326 			}
1327 
1328 			if (likely(!vdev->remove)) {
1329 				/* Handle guest TX*/
1330 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1331 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1332 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1333 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1334 						while (tx_count)
1335 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1336 					}
1337 				}
1338 				while (tx_count)
1339 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1340 			}
1341 
1342 			/*move to the next device in the list*/
1343 			dev_ll = dev_ll->next;
1344 		}
1345 	}
1346 
1347 	return 0;
1348 }
1349 
1350 /*
1351  * This function gets the number of available ring entries for zero copy rx.
1352  * Only one thread will call this function for a particular virtio device,
1353  * so it is designed as a non-thread-safe function.
1354  */
1355 static inline uint32_t __attribute__((always_inline))
1356 get_available_ring_num_zcp(struct virtio_net *dev)
1357 {
1358 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1359 	uint16_t avail_idx;
1360 
1361 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1362 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1363 }
1364 
1365 /*
1366  * This function gets the available ring index for zero copy rx;
1367  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1368  * Only one thread will call this function for a particular virtio device,
1369  * so it is designed as a non-thread-safe function.
1370  */
1371 static inline uint32_t __attribute__((always_inline))
1372 get_available_ring_index_zcp(struct virtio_net *dev,
1373 	uint16_t *res_base_idx, uint32_t count)
1374 {
1375 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1376 	uint16_t avail_idx;
1377 	uint32_t retry = 0;
1378 	uint16_t free_entries;
1379 
1380 	*res_base_idx = vq->last_used_idx_res;
1381 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1382 	free_entries = (avail_idx - *res_base_idx);
1383 
1384 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1385 			"avail idx: %d, "
1386 			"res base idx:%d, free entries:%d\n",
1387 			dev->device_fh, avail_idx, *res_base_idx,
1388 			free_entries);
1389 
1390 	/*
1391 	 * If retry is enabled and the queue is full then we wait
1392 	 * and retry to avoid packet loss.
1393 	 */
1394 	if (enable_retry && unlikely(count > free_entries)) {
1395 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1396 			rte_delay_us(burst_rx_delay_time);
1397 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1398 			free_entries = (avail_idx - *res_base_idx);
1399 			if (count <= free_entries)
1400 				break;
1401 		}
1402 	}
1403 
1404 	/*check that we have enough buffers*/
1405 	if (unlikely(count > free_entries))
1406 		count = free_entries;
1407 
1408 	if (unlikely(count == 0)) {
1409 		LOG_DEBUG(VHOST_DATA,
1410 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1411 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1412 			dev->device_fh, avail_idx,
1413 			*res_base_idx, free_entries);
1414 		return 0;
1415 	}
1416 
1417 	vq->last_used_idx_res = *res_base_idx + count;
1418 
1419 	return count;
1420 }
1421 
1422 /*
1423  * This function puts a descriptor back onto the used list.
1424  */
1425 static inline void __attribute__((always_inline))
1426 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1427 {
1428 	uint16_t res_cur_idx = vq->last_used_idx;
1429 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1430 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1431 	rte_compiler_barrier();
1432 	*(volatile uint16_t *)&vq->used->idx += 1;
1433 	vq->last_used_idx += 1;
1434 
1435 	/* Kick the guest if necessary. */
1436 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1437 		eventfd_write((int)vq->callfd, 1);
1438 }
1439 
1440 /*
1441  * This function gets an available descriptor from the virtio vring and an
1442  * un-attached mbuf from vpool->ring, then attaches them together. It needs to
1443  * adjust the offset for buff_addr and phys_addr according to the PMD, or the
1444  * frame data may be put at the wrong location in the mbuf.
1445  */
1446 static inline void __attribute__((always_inline))
1447 attach_rxmbuf_zcp(struct virtio_net *dev)
1448 {
1449 	uint16_t res_base_idx, desc_idx;
1450 	uint64_t buff_addr, phys_addr;
1451 	struct vhost_virtqueue *vq;
1452 	struct vring_desc *desc;
1453 	struct rte_mbuf *mbuf = NULL;
1454 	struct vpool *vpool;
1455 	hpa_type addr_type;
1456 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1457 
1458 	vpool = &vpool_array[vdev->vmdq_rx_q];
1459 	vq = dev->virtqueue[VIRTIO_RXQ];
1460 
1461 	do {
1462 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1463 				1) != 1))
1464 			return;
1465 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1466 
1467 		desc = &vq->desc[desc_idx];
1468 		if (desc->flags & VRING_DESC_F_NEXT) {
1469 			desc = &vq->desc[desc->next];
1470 			buff_addr = gpa_to_vva(dev, desc->addr);
1471 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1472 					&addr_type);
1473 		} else {
1474 			buff_addr = gpa_to_vva(dev,
1475 					desc->addr + vq->vhost_hlen);
1476 			phys_addr = gpa_to_hpa(vdev,
1477 					desc->addr + vq->vhost_hlen,
1478 					desc->len, &addr_type);
1479 		}
1480 
1481 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1482 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1483 				" address found when attaching RX frame buffer"
1484 				" address!\n", dev->device_fh);
1485 			put_desc_to_used_list_zcp(vq, desc_idx);
1486 			continue;
1487 		}
1488 
1489 		/*
1490 		 * Check if the frame buffer address from guest crosses
1491 		 * sub-region or not.
1492 		 */
1493 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1494 			RTE_LOG(ERR, VHOST_DATA,
1495 				"(%"PRIu64") Frame buffer address cross "
1496 				"sub-regioin found when attaching RX frame "
1497 				"sub-region found when attaching RX frame "
1498 				dev->device_fh);
1499 			put_desc_to_used_list_zcp(vq, desc_idx);
1500 			continue;
1501 		}
1502 	} while (unlikely(phys_addr == 0));
1503 
1504 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1505 	if (unlikely(mbuf == NULL)) {
1506 		LOG_DEBUG(VHOST_DATA,
1507 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1508 			"ring_sc_dequeue fail.\n",
1509 			dev->device_fh);
1510 		put_desc_to_used_list_zcp(vq, desc_idx);
1511 		return;
1512 	}
1513 
1514 	if (unlikely(vpool->buf_size > desc->len)) {
1515 		LOG_DEBUG(VHOST_DATA,
1516 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1517 			"length(%d) of descriptor idx: %d less than room "
1518 			"size required: %d\n",
1519 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1520 		put_desc_to_used_list_zcp(vq, desc_idx);
1521 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1522 		return;
1523 	}
1524 
1525 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1526 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1527 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1528 	mbuf->data_len = desc->len;
1529 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1530 
1531 	LOG_DEBUG(VHOST_DATA,
1532 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1533 		"descriptor idx:%d\n",
1534 		dev->device_fh, res_base_idx, desc_idx);
1535 
1536 	__rte_mbuf_raw_free(mbuf);
1537 
1538 	return;
1539 }
1540 
1541 /*
1542  * Detach an attched packet mbuf -
1543  *  - restore original mbuf address and length values.
1544  *  - reset pktmbuf data and data_len to their default values.
1545  *  All other fields of the given packet mbuf will be left intact.
1546  *
1547  * @param m
1548  *   The attached packet mbuf.
1549  */
1550 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1551 {
1552 	const struct rte_mempool *mp = m->pool;
1553 	void *buf = RTE_MBUF_TO_BADDR(m);
1554 	uint32_t buf_ofs;
1555 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1556 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1557 
1558 	m->buf_addr = buf;
1559 	m->buf_len = (uint16_t)buf_len;
1560 
1561 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1562 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1563 	m->data_off = buf_ofs;
1564 
1565 	m->data_len = 0;
1566 }
1567 
1568 /*
1569  * This function is called after packets have been transmitted. It fetches mbufs
1570  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1571  * the used index and kicks the guest if necessary.
1572  */
1573 static inline uint32_t __attribute__((always_inline))
1574 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1575 {
1576 	struct rte_mbuf *mbuf;
1577 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1578 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1579 	uint32_t index = 0;
1580 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1581 
1582 	LOG_DEBUG(VHOST_DATA,
1583 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1584 		"clean is: %d\n",
1585 		dev->device_fh, mbuf_count);
1586 	LOG_DEBUG(VHOST_DATA,
1587 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1588 		"clean  is : %d\n",
1589 		dev->device_fh, rte_ring_count(vpool->ring));
1590 
1591 	for (index = 0; index < mbuf_count; index++) {
1592 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1593 		if (likely(MBUF_EXT_MEM(mbuf)))
1594 			pktmbuf_detach_zcp(mbuf);
1595 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1596 
1597 		/* Update used index buffer information. */
1598 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1599 		vq->used->ring[used_idx].len = 0;
1600 
1601 		used_idx = (used_idx + 1) & (vq->size - 1);
1602 	}
1603 
1604 	LOG_DEBUG(VHOST_DATA,
1605 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1606 		"clean is: %d\n",
1607 		dev->device_fh, rte_mempool_count(vpool->pool));
1608 	LOG_DEBUG(VHOST_DATA,
1609 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1610 		"clean  is : %d\n",
1611 		dev->device_fh, rte_ring_count(vpool->ring));
1612 	LOG_DEBUG(VHOST_DATA,
1613 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1614 		"vq->last_used_idx:%d\n",
1615 		dev->device_fh, vq->last_used_idx);
1616 
1617 	vq->last_used_idx += mbuf_count;
1618 
1619 	LOG_DEBUG(VHOST_DATA,
1620 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1621 		"vq->last_used_idx:%d\n",
1622 		dev->device_fh, vq->last_used_idx);
1623 
1624 	rte_compiler_barrier();
1625 
1626 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1627 
1628 	/* Kick guest if required. */
1629 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1630 		eventfd_write((int)vq->callfd, 1);
1631 
1632 	return 0;
1633 }
1634 
1635 /*
1636  * This function is called when a virtio device is destroyed.
1637  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1638  */
1639 static void mbuf_destroy_zcp(struct vpool *vpool)
1640 {
1641 	struct rte_mbuf *mbuf = NULL;
1642 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1643 
1644 	LOG_DEBUG(VHOST_CONFIG,
1645 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1646 		"mbuf_destroy_zcp is: %d\n",
1647 		mbuf_count);
1648 	LOG_DEBUG(VHOST_CONFIG,
1649 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1650 		"mbuf_destroy_zcp  is : %d\n",
1651 		rte_ring_count(vpool->ring));
1652 
1653 	for (index = 0; index < mbuf_count; index++) {
1654 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1655 		if (likely(mbuf != NULL)) {
1656 			if (likely(MBUF_EXT_MEM(mbuf)))
1657 				pktmbuf_detach_zcp(mbuf);
1658 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1659 		}
1660 	}
1661 
1662 	LOG_DEBUG(VHOST_CONFIG,
1663 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1664 		"mbuf_destroy_zcp is: %d\n",
1665 		rte_mempool_count(vpool->pool));
1666 	LOG_DEBUG(VHOST_CONFIG,
1667 		"in mbuf_destroy_zcp: mbuf count in ring after "
1668 		"mbuf_destroy_zcp is : %d\n",
1669 		rte_ring_count(vpool->ring));
1670 }
1671 
1672 /*
1673  * This function updates the used ring and the used index counter for zero copy rx.
1674  */
1675 static inline uint32_t __attribute__((always_inline))
1676 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1677 	uint32_t count)
1678 {
1679 	struct vhost_virtqueue *vq;
1680 	struct vring_desc *desc;
1681 	struct rte_mbuf *buff;
1682 	/* The virtio_hdr is initialised to 0. */
1683 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1684 		= {{0, 0, 0, 0, 0, 0}, 0};
1685 	uint64_t buff_hdr_addr = 0;
1686 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1687 	uint32_t head_idx, packet_success = 0;
1688 	uint16_t res_cur_idx;
1689 
1690 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1691 
1692 	if (count == 0)
1693 		return 0;
1694 
1695 	vq = dev->virtqueue[VIRTIO_RXQ];
1696 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1697 
1698 	res_cur_idx = vq->last_used_idx;
1699 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1700 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1701 
1702 	/* Retrieve all of the head indexes first to avoid caching issues. */
1703 	for (head_idx = 0; head_idx < count; head_idx++)
1704 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1705 
1706 	/* Prefetch descriptor index. */
1707 	rte_prefetch0(&vq->desc[head[packet_success]]);
1708 
1709 	while (packet_success != count) {
1710 		/* Get descriptor from available ring */
1711 		desc = &vq->desc[head[packet_success]];
1712 
1713 		buff = pkts[packet_success];
1714 		LOG_DEBUG(VHOST_DATA,
1715 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1716 			"pkt[%d] descriptor idx: %d\n",
1717 			dev->device_fh, packet_success,
1718 			MBUF_HEADROOM_UINT32(buff));
1719 
1720 		PRINT_PACKET(dev,
1721 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1722 			+ RTE_PKTMBUF_HEADROOM),
1723 			rte_pktmbuf_data_len(buff), 0);
1724 
1725 		/* Buffer address translation for virtio header. */
1726 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1727 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1728 
1729 		/*
1730 		 * If the descriptors are chained the header and data are
1731 		 * placed in separate buffers.
1732 		 */
1733 		if (desc->flags & VRING_DESC_F_NEXT) {
1734 			desc->len = vq->vhost_hlen;
1735 			desc = &vq->desc[desc->next];
1736 			desc->len = rte_pktmbuf_data_len(buff);
1737 		} else {
1738 			desc->len = packet_len;
1739 		}
1740 
1741 		/* Update used ring with desc information */
1742 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1743 			= head[packet_success];
1744 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1745 			= packet_len;
1746 		res_cur_idx++;
1747 		packet_success++;
1748 
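		/*
		 * Zero copy: the payload is already in the guest buffer that the
		 * mbuf was attached to, so only the virtio header needs to be
		 * written into guest memory.
		 */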
1749 		/* A header is required per buffer. */
1750 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1751 			(const void *)&virtio_hdr, vq->vhost_hlen);
1752 
1753 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1754 
1755 		if (likely(packet_success < count)) {
1756 			/* Prefetch descriptor index. */
1757 			rte_prefetch0(&vq->desc[head[packet_success]]);
1758 		}
1759 	}
1760 
1761 	rte_compiler_barrier();
1762 
1763 	LOG_DEBUG(VHOST_DATA,
1764 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1765 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1766 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1767 
1768 	*(volatile uint16_t *)&vq->used->idx += count;
1769 	vq->last_used_idx += count;
1770 
1771 	LOG_DEBUG(VHOST_DATA,
1772 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1773 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1774 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1775 
1776 	/* Kick the guest if necessary. */
1777 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1778 		eventfd_write((int)vq->callfd, 1);
1779 
1780 	return count;
1781 }
1782 
1783 /*
1784  * This function routes the TX packet to the correct interface.
1785  * This may be a local device or the physical port.
1786  */
1787 static inline void __attribute__((always_inline))
1788 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1789 	uint32_t desc_idx, uint8_t need_copy)
1790 {
1791 	struct mbuf_table *tx_q;
1792 	struct rte_mbuf **m_table;
1793 	struct rte_mbuf *mbuf = NULL;
1794 	unsigned len, ret, offset = 0;
1795 	struct vpool *vpool;
1796 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1797 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1798 
1799 	/* Add packet to the port TX queue. */
1800 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1801 	len = tx_q->len;
1802 
1803 	/* Allocate an mbuf and populate the structure. */
1804 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1805 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1806 	if (unlikely(mbuf == NULL)) {
1807 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1808 		RTE_LOG(ERR, VHOST_DATA,
1809 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1810 			dev->device_fh);
1811 		put_desc_to_used_list_zcp(vq, desc_idx);
1812 		return;
1813 	}
1814 
1815 	if (vm2vm_mode == VM2VM_HARDWARE) {
1816 		/* Do not reuse a VM's VLAN tag (e.g. vlan_tags[dev->device_fh])
1817 		 * for an external packet, otherwise pool selection becomes
1818 		 * ambiguous: the MAC address marks it as an external packet
1819 		 * that should go out to the network, while the VLAN tag marks
1820 		 * it as a VM2VM packet to be forwarded to another VM. The
1821 		 * hardware cannot resolve the conflict and the packet is lost.
1822 		 */
1823 		vlan_tag = external_pkt_default_vlan_tag;
1824 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1825 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1826 			__rte_mbuf_raw_free(mbuf);
1827 			return;
1828 		}
1829 	}
1830 
1831 	mbuf->nb_segs = m->nb_segs;
1832 	mbuf->next = m->next;
1833 	mbuf->data_len = m->data_len + offset;
1834 	mbuf->pkt_len = mbuf->data_len;
1835 	if (unlikely(need_copy)) {
1836 		/* Copy the packet contents to the mbuf. */
1837 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1838 			rte_pktmbuf_mtod(m, void *),
1839 			m->data_len);
1840 	} else {
1841 		mbuf->data_off = m->data_off;
1842 		mbuf->buf_physaddr = m->buf_physaddr;
1843 		mbuf->buf_addr = m->buf_addr;
1844 	}
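	/*
	 * In the no-copy path above, the mbuf points straight at the guest
	 * buffer, so the NIC transmits directly from guest memory. The guest
	 * descriptor index is stashed in the mbuf headroom below so that
	 * txmbuf_clean_zcp() can later return it to the used ring.
	 */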
1845 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1846 	mbuf->vlan_tci = vlan_tag;
1847 	mbuf->l2_len = sizeof(struct ether_hdr);
1848 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1849 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1850 
1851 	tx_q->m_table[len] = mbuf;
1852 	len++;
1853 
1854 	LOG_DEBUG(VHOST_DATA,
1855 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1856 		dev->device_fh,
1857 		mbuf->nb_segs,
1858 		(mbuf->next == NULL) ? "null" : "non-null");
1859 
1860 	if (enable_stats) {
1861 		dev_statistics[dev->device_fh].tx_total++;
1862 		dev_statistics[dev->device_fh].tx++;
1863 	}
1864 
1865 	if (unlikely(len == MAX_PKT_BURST)) {
1866 		m_table = (struct rte_mbuf **)tx_q->m_table;
1867 		ret = rte_eth_tx_burst(ports[0],
1868 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1869 
1870 		/*
1871 		 * Free any buffers not handled by TX and update
1872 		 * the port stats.
1873 		 */
1874 		if (unlikely(ret < len)) {
1875 			do {
1876 				rte_pktmbuf_free(m_table[ret]);
1877 			} while (++ret < len);
1878 		}
1879 
1880 		len = 0;
1881 		txmbuf_clean_zcp(dev, vpool);
1882 	}
1883 
1884 	tx_q->len = len;
1885 
1886 	return;
1887 }
1888 
1889 /*
1890  * This function transmits all available packets in the virtio TX queue of one
1891  * virtio-net device. On the first packet it learns the MAC address and sets up
1892  * the VMDQ queue.
1893  */
1894 static inline void __attribute__((always_inline))
1895 virtio_dev_tx_zcp(struct virtio_net *dev)
1896 {
1897 	struct rte_mbuf m;
1898 	struct vhost_virtqueue *vq;
1899 	struct vring_desc *desc;
1900 	uint64_t buff_addr = 0, phys_addr;
1901 	uint32_t head[MAX_PKT_BURST];
1902 	uint32_t i;
1903 	uint16_t free_entries, packet_success = 0;
1904 	uint16_t avail_idx;
1905 	uint8_t need_copy = 0;
1906 	hpa_type addr_type;
1907 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1908 
1909 	vq = dev->virtqueue[VIRTIO_TXQ];
1910 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1911 
1912 	/* If there are no available buffers then return. */
1913 	if (vq->last_used_idx_res == avail_idx)
1914 		return;
1915 
1916 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1917 
1918 	/* Prefetch available ring to retrieve head indexes. */
1919 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1920 
1921 	/* Get the number of free entries in the ring */
1922 	free_entries = (avail_idx - vq->last_used_idx_res);
1923 
1924 	/* Limit to MAX_PKT_BURST. */
1925 	free_entries
1926 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1927 
1928 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1929 		dev->device_fh, free_entries);
1930 
1931 	/* Retrieve all of the head indexes first to avoid caching issues. */
1932 	for (i = 0; i < free_entries; i++)
1933 		head[i]
1934 			= vq->avail->ring[(vq->last_used_idx_res + i)
1935 			& (vq->size - 1)];
1936 
1937 	vq->last_used_idx_res += free_entries;
1938 
1939 	/* Prefetch descriptor index. */
1940 	rte_prefetch0(&vq->desc[head[packet_success]]);
1941 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1942 
1943 	while (packet_success < free_entries) {
1944 		desc = &vq->desc[head[packet_success]];
1945 
1946 		/* Discard first buffer as it is the virtio header */
1947 		desc = &vq->desc[desc->next];
1948 
1949 		/* Buffer address translation. */
1950 		buff_addr = gpa_to_vva(dev, desc->addr);
1951 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1952 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1953 			&addr_type);
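		/*
		 * addr_type reports whether the guest buffer maps to a single
		 * physically contiguous sub-region: an invalid address is
		 * dropped below, and a buffer crossing sub-regions is copied
		 * instead of attached.
		 */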
1954 
1955 		if (likely(packet_success < (free_entries - 1)))
1956 			/* Prefetch descriptor index. */
1957 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1958 
1959 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1960 			RTE_LOG(ERR, VHOST_DATA,
1961 				"(%"PRIu64") Invalid frame buffer address found "
1962 				"when transmitting packets!\n",
1963 				dev->device_fh);
1964 			packet_success++;
1965 			continue;
1966 		}
1967 
1968 		/* Prefetch buffer address. */
1969 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1970 
1971 		/*
1972 		 * Setup dummy mbuf. This is copied to a real mbuf if
1973 		 * transmitted out the physical port.
1974 		 */
1975 		m.data_len = desc->len;
1976 		m.nb_segs = 1;
1977 		m.next = NULL;
1978 		m.data_off = 0;
1979 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1980 		m.buf_physaddr = phys_addr;
1981 
1982 		/*
1983 		 * Check if the frame buffer address from guest crosses
1984 		 * sub-region or not.
1985 		 */
1986 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1987 			RTE_LOG(ERR, VHOST_DATA,
1988 				"(%"PRIu64") Frame buffer address crossing "
1989 				"sub-region found when attaching TX frame "
1990 				"buffer address!\n",
1991 				dev->device_fh);
1992 			need_copy = 1;
1993 		} else
1994 			need_copy = 0;
1995 
1996 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1997 
1998 		/*
1999 		 * If this is the first received packet we need to learn
2000 		 * the MAC and setup VMDQ
2001 		 */
2002 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2003 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2004 				/*
2005 				 * Discard frame if device is scheduled for
2006 				 * removal or a duplicate MAC address is found.
2007 				 */
2008 				packet_success += free_entries;
2009 				vq->last_used_idx += packet_success;
2010 				break;
2011 			}
2012 		}
2013 
2014 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2015 		packet_success++;
2016 	}
2017 }
2018 
2019 /*
2020  * This function is called by each data core. It handles all RX/TX registered
2021  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2022  * addresses are compared with all devices in the main linked list.
2023  */
2024 static int
2025 switch_worker_zcp(__attribute__((unused)) void *arg)
2026 {
2027 	struct virtio_net *dev = NULL;
2028 	struct vhost_dev  *vdev = NULL;
2029 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2030 	struct virtio_net_data_ll *dev_ll;
2031 	struct mbuf_table *tx_q;
2032 	volatile struct lcore_ll_info *lcore_ll;
2033 	const uint64_t drain_tsc
2034 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2035 		* BURST_TX_DRAIN_US;
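	/*
	 * drain_tsc is BURST_TX_DRAIN_US converted to TSC cycles; e.g. for a
	 * (hypothetical) 2 GHz TSC this comes to roughly 200,000 cycles.
	 */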
2036 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2037 	unsigned ret;
2038 	const uint16_t lcore_id = rte_lcore_id();
2039 	uint16_t count_in_ring, rx_count = 0;
2040 
2041 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2042 
2043 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2044 	prev_tsc = 0;
2045 
2046 	while (1) {
2047 		cur_tsc = rte_rdtsc();
2048 
2049 		/* TX burst queue drain */
2050 		diff_tsc = cur_tsc - prev_tsc;
2051 		if (unlikely(diff_tsc > drain_tsc)) {
2052 			/*
2053 			 * Get mbuf from vpool.pool and detach mbuf and
2054 			 * put back into vpool.ring.
2055 			 */
2056 			dev_ll = lcore_ll->ll_root_used;
2057 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2058 				/* Get virtio device ID */
2059 				vdev = dev_ll->vdev;
2060 				dev = vdev->dev;
2061 
2062 				if (likely(!vdev->remove)) {
2063 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2064 					if (tx_q->len) {
2065 						LOG_DEBUG(VHOST_DATA,
2066 						"TX queue drained after timeout"
2067 						" with burst size %u\n",
2068 						tx_q->len);
2069 
2070 						/*
2071 						 * Tx any packets in the queue
2072 						 */
2073 						ret = rte_eth_tx_burst(
2074 							ports[0],
2075 							(uint16_t)tx_q->txq_id,
2076 							(struct rte_mbuf **)
2077 							tx_q->m_table,
2078 							(uint16_t)tx_q->len);
2079 						if (unlikely(ret < tx_q->len)) {
2080 							do {
2081 								rte_pktmbuf_free(
2082 									tx_q->m_table[ret]);
2083 							} while (++ret < tx_q->len);
2084 						}
2085 						tx_q->len = 0;
2086 
2087 						txmbuf_clean_zcp(dev,
2088 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2089 					}
2090 				}
2091 				dev_ll = dev_ll->next;
2092 			}
2093 			prev_tsc = cur_tsc;
2094 		}
2095 
2096 		rte_prefetch0(lcore_ll->ll_root_used);
2097 
2098 		/*
2099 		 * Inform the configuration core that we have exited the linked
2100 		 * list and that no devices are in use if requested.
2101 		 */
2102 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2103 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2104 
2105 		/* Process devices */
2106 		dev_ll = lcore_ll->ll_root_used;
2107 
2108 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2109 			vdev = dev_ll->vdev;
2110 			dev  = vdev->dev;
2111 			if (unlikely(vdev->remove)) {
2112 				dev_ll = dev_ll->next;
2113 				unlink_vmdq(vdev);
2114 				vdev->ready = DEVICE_SAFE_REMOVE;
2115 				continue;
2116 			}
2117 
2118 			if (likely(vdev->ready == DEVICE_RX)) {
2119 				uint32_t index = vdev->vmdq_rx_q;
2120 				uint16_t i;
2121 				count_in_ring
2122 				= rte_ring_count(vpool_array[index].ring);
2123 				uint16_t free_entries
2124 				= (uint16_t)get_available_ring_num_zcp(dev);
2125 
2126 				/*
2127 				 * Attach all mbufs in vpool.ring and put back
2128 				 * into vpool.pool.
2129 				 */
2130 				for (i = 0;
2131 				i < RTE_MIN(free_entries,
2132 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2133 				i++)
2134 					attach_rxmbuf_zcp(dev);
2135 
2136 				/* Handle guest RX */
2137 				rx_count = rte_eth_rx_burst(ports[0],
2138 					vdev->vmdq_rx_q, pkts_burst,
2139 					MAX_PKT_BURST);
2140 
2141 				if (rx_count) {
2142 					ret_count = virtio_dev_rx_zcp(dev,
2143 							pkts_burst, rx_count);
2144 					if (enable_stats) {
2145 						dev_statistics[dev->device_fh].rx_total
2146 							+= rx_count;
2147 						dev_statistics[dev->device_fh].rx
2148 							+= ret_count;
2149 					}
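					/*
					 * virtio_dev_rx_zcp() has handed the
					 * guest buffers to the used ring, so
					 * detach the mbufs and recycle them to
					 * vpool.ring for re-attachment.
					 */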
2150 					while (likely(rx_count)) {
2151 						rx_count--;
2152 						pktmbuf_detach_zcp(
2153 							pkts_burst[rx_count]);
2154 						rte_ring_sp_enqueue(
2155 							vpool_array[index].ring,
2156 							(void *)pkts_burst[rx_count]);
2157 					}
2158 				}
2159 			}
2160 
2161 			if (likely(!vdev->remove))
2162 				/* Handle guest TX */
2163 				virtio_dev_tx_zcp(dev);
2164 
2165 			/* Move to the next device in the list */
2166 			dev_ll = dev_ll->next;
2167 		}
2168 	}
2169 
2170 	return 0;
2171 }
2172 
2173 
2174 /*
2175  * Add an entry to a used linked list. A free entry must first be found
2176  * in the free linked list using get_data_ll_free_entry();
2177  */
2178 static void
2179 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2180 	struct virtio_net_data_ll *ll_dev)
2181 {
2182 	struct virtio_net_data_ll *ll = *ll_root_addr;
2183 
2184 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2185 	ll_dev->next = NULL;
2186 	rte_compiler_barrier();
2187 
2188 	/* If ll == NULL then this is the first device. */
2189 	if (ll) {
2190 		/* Increment to the tail of the linked list. */
2191 		while (ll->next != NULL)
2192 			ll = ll->next;
2193 
2194 		ll->next = ll_dev;
2195 	} else {
2196 		*ll_root_addr = ll_dev;
2197 	}
2198 }
2199 
2200 /*
2201  * Remove an entry from a used linked list. The entry must then be added to
2202  * the free linked list using put_data_ll_free_entry().
2203  */
2204 static void
2205 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2206 	struct virtio_net_data_ll *ll_dev,
2207 	struct virtio_net_data_ll *ll_dev_last)
2208 {
2209 	struct virtio_net_data_ll *ll = *ll_root_addr;
2210 
2211 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2212 		return;
2213 
2214 	if (ll_dev == ll)
2215 		*ll_root_addr = ll_dev->next;
2216 	else
2217 		if (likely(ll_dev_last != NULL))
2218 			ll_dev_last->next = ll_dev->next;
2219 		else
2220 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2221 }
2222 
2223 /*
2224  * Find and return an entry from the free linked list.
2225  */
2226 static struct virtio_net_data_ll *
2227 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2228 {
2229 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2230 	struct virtio_net_data_ll *ll_dev;
2231 
2232 	if (ll_free == NULL)
2233 		return NULL;
2234 
2235 	ll_dev = ll_free;
2236 	*ll_root_addr = ll_free->next;
2237 
2238 	return ll_dev;
2239 }
2240 
2241 /*
2242  * Place an entry back on to the free linked list.
2243  */
2244 static void
2245 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2246 	struct virtio_net_data_ll *ll_dev)
2247 {
2248 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2249 
2250 	if (ll_dev == NULL)
2251 		return;
2252 
2253 	ll_dev->next = ll_free;
2254 	*ll_root_addr = ll_dev;
2255 }
2256 
2257 /*
2258  * Creates a linked list of a given size.
2259  */
2260 static struct virtio_net_data_ll *
2261 alloc_data_ll(uint32_t size)
2262 {
2263 	struct virtio_net_data_ll *ll_new;
2264 	uint32_t i;
2265 
2266 	/* Malloc and then chain the linked list. */
2267 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2268 	if (ll_new == NULL) {
2269 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2270 		return NULL;
2271 	}
2272 
2273 	for (i = 0; i < size - 1; i++) {
2274 		ll_new[i].vdev = NULL;
2275 		ll_new[i].next = &ll_new[i+1];
2276 	}
2277 	ll_new[i].next = NULL;
2278 
2279 	return (ll_new);
2280 }
2281 
2282 /*
2283  * Create the main linked list along with each individual core's linked list. A used and a free list
2284  * are created to manage entries.
2285  */
2286 static int
2287 init_data_ll (void)
2288 {
2289 	int lcore;
2290 
2291 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2292 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2293 		if (lcore_info[lcore].lcore_ll == NULL) {
2294 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2295 			return -1;
2296 		}
2297 
2298 		lcore_info[lcore].lcore_ll->device_num = 0;
2299 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2300 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2301 		if (num_devices % num_switching_cores)
2302 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2303 		else
2304 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2305 	}
2306 
2307 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2308 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2309 
2310 	return 0;
2311 }
2312 
2313 /*
2314  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2315  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2316  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2317  */
2318 static void
2319 destroy_device (volatile struct virtio_net *dev)
2320 {
2321 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2322 	struct virtio_net_data_ll *ll_main_dev_cur;
2323 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2324 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2325 	struct vhost_dev *vdev;
2326 	int lcore;
2327 
2328 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2329 
2330 	vdev = (struct vhost_dev *)dev->priv;
2331 	/* Set the remove flag. */
2332 	vdev->remove = 1;
2333 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2334 		rte_pause();
2335 	}
2336 
2337 	/* Search for entry to be removed from lcore ll */
2338 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2339 	while (ll_lcore_dev_cur != NULL) {
2340 		if (ll_lcore_dev_cur->vdev == vdev) {
2341 			break;
2342 		} else {
2343 			ll_lcore_dev_last = ll_lcore_dev_cur;
2344 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2345 		}
2346 	}
2347 
2348 	if (ll_lcore_dev_cur == NULL) {
2349 		RTE_LOG(ERR, VHOST_CONFIG,
2350 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2351 			dev->device_fh);
2352 		return;
2353 	}
2354 
2355 	/* Search for entry to be removed from main ll */
2356 	ll_main_dev_cur = ll_root_used;
2357 	ll_main_dev_last = NULL;
2358 	while (ll_main_dev_cur != NULL) {
2359 		if (ll_main_dev_cur->vdev == vdev) {
2360 			break;
2361 		} else {
2362 			ll_main_dev_last = ll_main_dev_cur;
2363 			ll_main_dev_cur = ll_main_dev_cur->next;
2364 		}
2365 	}
2366 
2367 	/* Remove entries from the lcore and main ll. */
2368 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2369 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2370 
2371 	/* Set the dev_removal_flag on each lcore. */
2372 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2373 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2374 	}
2375 
2376 	/*
2377 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2378 	 * they can no longer access the device removed from the linked lists and that the devices
2379 	 * are no longer in use.
2380 	 */
2381 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2382 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2383 			rte_pause();
2384 		}
2385 	}
2386 
2387 	/* Add the entries back to the lcore and main free ll.*/
2388 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2389 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2390 
2391 	/* Decrement number of device on the lcore. */
2392 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2393 
2394 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2395 
2396 	if (zero_copy) {
2397 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2398 
2399 		/* Stop the RX queue. */
2400 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2401 			LOG_DEBUG(VHOST_CONFIG,
2402 				"(%"PRIu64") In destroy_device: Failed to stop "
2403 				"rx queue:%d\n",
2404 				dev->device_fh,
2405 				vdev->vmdq_rx_q);
2406 		}
2407 
2408 		LOG_DEBUG(VHOST_CONFIG,
2409 			"(%"PRIu64") in destroy_device: start putting mbufs from "
2410 			"mempool back to ring for RX queue: %d\n",
2411 			dev->device_fh, vdev->vmdq_rx_q);
2412 
2413 		mbuf_destroy_zcp(vpool);
2414 
2415 		/* Stop the TX queue. */
2416 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2417 			LOG_DEBUG(VHOST_CONFIG,
2418 				"(%"PRIu64") In destroy_device: Failed to "
2419 				"stop tx queue:%d\n",
2420 				dev->device_fh, vdev->vmdq_rx_q);
2421 		}
2422 
2423 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2424 
2425 		LOG_DEBUG(VHOST_CONFIG,
2426 			"(%"PRIu64") destroy_device: start putting mbufs from mempool "
2427 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2428 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2429 			dev->device_fh);
2430 
2431 		mbuf_destroy_zcp(vpool);
2432 		rte_free(vdev->regions_hpa);
2433 	}
2434 	rte_free(vdev);
2435 
2436 }
2437 
2438 /*
2439  * Calculate the count of physically contiguous regions within one particular
2440  * region whose vhost virtual address range is contiguous. The region starts
2441  * at vva_start, with a size of 'size' bytes.
2442  */
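/*
 * The value returned is the number of physical discontinuities found;
 * new_device() adds it to the base region count when sizing the HPA region
 * table.
 */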
2443 static uint32_t
2444 check_hpa_regions(uint64_t vva_start, uint64_t size)
2445 {
2446 	uint32_t i, nregions = 0, page_size = getpagesize();
2447 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2448 	if (vva_start % page_size) {
2449 		LOG_DEBUG(VHOST_CONFIG,
2450 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2451 			"has remainder\n",
2452 			(void *)(uintptr_t)vva_start, page_size);
2453 		return 0;
2454 	}
2455 	if (size % page_size) {
2456 		LOG_DEBUG(VHOST_CONFIG,
2457 			"in check_hpa_regions: "
2458 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2459 			size, page_size);
2460 		return 0;
2461 	}
2462 	for (i = 0; i < size - page_size; i = i + page_size) {
2463 		cur_phys_addr
2464 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2465 		next_phys_addr = rte_mem_virt2phy(
2466 			(void *)(uintptr_t)(vva_start + i + page_size));
2467 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2468 			++nregions;
2469 			LOG_DEBUG(VHOST_CONFIG,
2470 				"in check_hpa_regions: hva addr:(%p) is not "
2471 				"continuous with hva addr:(%p), diff:%d\n",
2472 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2473 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2474 				+ page_size), page_size);
2475 			LOG_DEBUG(VHOST_CONFIG,
2476 				"in check_hpa_regions: hpa addr:(%p) is not "
2477 				"continuous with hpa addr:(%p), "
2478 				"diff:(%"PRIu64")\n",
2479 				(void *)(uintptr_t)cur_phys_addr,
2480 				(void *)(uintptr_t)next_phys_addr,
2481 				(next_phys_addr-cur_phys_addr));
2482 		}
2483 	}
2484 	return nregions;
2485 }
2486 
2487 /*
2488  * Divide each region whose vhost virtual address range is contiguous into
2489  * sub-regions such that the physical addresses within each sub-region are
2490  * contiguous, and fill the offset (to GPA), size and other information of
2491  * each sub-region into regions_hpa.
2492  */
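/*
 * Sketch of the walk: scan each region page by page; whenever the host
 * physical address stops being contiguous, close the current sub-region
 * (recording its end address and size) and open a new one at the next page.
 */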
2493 static uint32_t
2494 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2495 {
2496 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2497 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2498 
2499 	if (mem_region_hpa == NULL)
2500 		return 0;
2501 
2502 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2503 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2504 			virtio_memory->regions[regionidx].address_offset;
2505 		mem_region_hpa[regionidx_hpa].guest_phys_address
2506 			= virtio_memory->regions[regionidx].guest_phys_address;
2507 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2508 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2509 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2510 		LOG_DEBUG(VHOST_CONFIG,
2511 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2512 			regionidx_hpa,
2513 			(void *)(uintptr_t)
2514 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2515 		LOG_DEBUG(VHOST_CONFIG,
2516 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2517 			regionidx_hpa,
2518 			(void *)(uintptr_t)
2519 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2520 		for (i = 0, k = 0;
2521 			i < virtio_memory->regions[regionidx].memory_size -
2522 				page_size;
2523 			i += page_size) {
2524 			cur_phys_addr = rte_mem_virt2phy(
2525 					(void *)(uintptr_t)(vva_start + i));
2526 			next_phys_addr = rte_mem_virt2phy(
2527 					(void *)(uintptr_t)(vva_start +
2528 					i + page_size));
2529 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2530 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2531 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2532 					k + page_size;
2533 				mem_region_hpa[regionidx_hpa].memory_size
2534 					= k + page_size;
2535 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2536 					"phys addr end  [%d]:(%p)\n",
2537 					regionidx_hpa,
2538 					(void *)(uintptr_t)
2539 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2540 				LOG_DEBUG(VHOST_CONFIG,
2541 					"in fill_hpa_regions: guest phys addr "
2542 					"size [%d]:(%p)\n",
2543 					regionidx_hpa,
2544 					(void *)(uintptr_t)
2545 					(mem_region_hpa[regionidx_hpa].memory_size));
2546 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2547 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2548 				++regionidx_hpa;
2549 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2550 					next_phys_addr -
2551 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2552 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2553 					" phys addr start[%d]:(%p)\n",
2554 					regionidx_hpa,
2555 					(void *)(uintptr_t)
2556 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2557 				LOG_DEBUG(VHOST_CONFIG,
2558 					"in fill_hpa_regions: host  phys addr "
2559 					"start[%d]:(%p)\n",
2560 					regionidx_hpa,
2561 					(void *)(uintptr_t)
2562 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2563 				k = 0;
2564 			} else {
2565 				k += page_size;
2566 			}
2567 		}
2568 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2569 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2570 			+ k + page_size;
2571 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2572 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2573 			"[%d]:(%p)\n", regionidx_hpa,
2574 			(void *)(uintptr_t)
2575 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2576 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2577 			"[%d]:(%p)\n", regionidx_hpa,
2578 			(void *)(uintptr_t)
2579 			(mem_region_hpa[regionidx_hpa].memory_size));
2580 		++regionidx_hpa;
2581 	}
2582 	return regionidx_hpa;
2583 }
2584 
2585 /*
2586  * A new device is added to a data core. First the device is added to the main linked list
2587  * and then allocated to a specific data core.
2588  */
2589 static int
2590 new_device (struct virtio_net *dev)
2591 {
2592 	struct virtio_net_data_ll *ll_dev;
2593 	int lcore, core_add = 0;
2594 	uint32_t device_num_min = num_devices;
2595 	struct vhost_dev *vdev;
2596 	uint32_t regionidx;
2597 
2598 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2599 	if (vdev == NULL) {
2600 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2601 			dev->device_fh);
2602 		return -1;
2603 	}
2604 	vdev->dev = dev;
2605 	dev->priv = vdev;
2606 
2607 	if (zero_copy) {
2608 		vdev->nregions_hpa = dev->mem->nregions;
2609 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2610 			vdev->nregions_hpa
2611 				+= check_hpa_regions(
2612 					dev->mem->regions[regionidx].guest_phys_address
2613 					+ dev->mem->regions[regionidx].address_offset,
2614 					dev->mem->regions[regionidx].memory_size);
2615 
2616 		}
2617 
2618 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2619 					       vdev->nregions_hpa,
2620 					       sizeof(struct virtio_memory_regions_hpa),
2621 					       RTE_CACHE_LINE_SIZE);
2622 		if (vdev->regions_hpa == NULL) {
2623 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2624 			rte_free(vdev);
2625 			return -1;
2626 		}
2627 
2628 
2629 		if (fill_hpa_memory_regions(
2630 			vdev->regions_hpa, dev->mem
2631 			) != vdev->nregions_hpa) {
2632 
2633 			RTE_LOG(ERR, VHOST_CONFIG,
2634 				"hpa memory regions number mismatch: "
2635 				"[%d]\n", vdev->nregions_hpa);
2636 			rte_free(vdev->regions_hpa);
2637 			rte_free(vdev);
2638 			return -1;
2639 		}
2640 	}
2641 
2642 
2643 	/* Add device to main ll */
2644 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2645 	if (ll_dev == NULL) {
2646 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2647 			"of %d devices per core has been reached\n",
2648 			dev->device_fh, num_devices);
2649 		if (vdev->regions_hpa)
2650 			rte_free(vdev->regions_hpa);
2651 		rte_free(vdev);
2652 		return -1;
2653 	}
2654 	ll_dev->vdev = vdev;
2655 	add_data_ll_entry(&ll_root_used, ll_dev);
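	/*
	 * Each vhost device is assigned its own VMDQ RX queue, derived from its
	 * device handle.
	 */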
2656 	vdev->vmdq_rx_q
2657 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2658 
2659 	if (zero_copy) {
2660 		uint32_t index = vdev->vmdq_rx_q;
2661 		uint32_t count_in_ring, i;
2662 		struct mbuf_table *tx_q;
2663 
2664 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2665 
2666 		LOG_DEBUG(VHOST_CONFIG,
2667 			"(%"PRIu64") in new_device: mbuf count in mempool "
2668 			"before attach is: %d\n",
2669 			dev->device_fh,
2670 			rte_mempool_count(vpool_array[index].pool));
2671 		LOG_DEBUG(VHOST_CONFIG,
2672 			"(%"PRIu64") in new_device: mbuf count in ring "
2673 			"before attach is: %d\n",
2674 			dev->device_fh, count_in_ring);
2675 
2676 		/*
2677 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2678 		 */
2679 		for (i = 0; i < count_in_ring; i++)
2680 			attach_rxmbuf_zcp(dev);
2681 
2682 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2683 			"mempool after attach is: %d\n",
2684 			dev->device_fh,
2685 			rte_mempool_count(vpool_array[index].pool));
2686 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2687 			"ring after attach is: %d\n",
2688 			dev->device_fh,
2689 			rte_ring_count(vpool_array[index].ring));
2690 
2691 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2692 		tx_q->txq_id = vdev->vmdq_rx_q;
2693 
2694 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2695 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2696 
2697 			LOG_DEBUG(VHOST_CONFIG,
2698 				"(%"PRIu64") In new_device: Failed to start "
2699 				"tx queue:%d\n",
2700 				dev->device_fh, vdev->vmdq_rx_q);
2701 
2702 			mbuf_destroy_zcp(vpool);
2703 			rte_free(vdev->regions_hpa);
2704 			rte_free(vdev);
2705 			return -1;
2706 		}
2707 
2708 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2709 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2710 
2711 			LOG_DEBUG(VHOST_CONFIG,
2712 				"(%"PRIu64") In new_device: Failed to start "
2713 				"rx queue:%d\n",
2714 				dev->device_fh, vdev->vmdq_rx_q);
2715 
2716 			/* Stop the TX queue. */
2717 			if (rte_eth_dev_tx_queue_stop(ports[0],
2718 				vdev->vmdq_rx_q) != 0) {
2719 				LOG_DEBUG(VHOST_CONFIG,
2720 					"(%"PRIu64") In new_device: Failed to "
2721 					"stop tx queue:%d\n",
2722 					dev->device_fh, vdev->vmdq_rx_q);
2723 			}
2724 
2725 			mbuf_destroy_zcp(vpool);
2726 			rte_free(vdev->regions_hpa);
2727 			rte_free(vdev);
2728 			return -1;
2729 		}
2730 
2731 	}
2732 
2733 	/* Reset the ready flag. */
2734 	vdev->ready = DEVICE_MAC_LEARNING;
2735 	vdev->remove = 0;
2736 
2737 	/* Find a suitable lcore to add the device. */
2738 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2739 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2740 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2741 			core_add = lcore;
2742 		}
2743 	}
2744 	/* Add device to lcore ll */
2745 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2746 	if (ll_dev == NULL) {
2747 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2748 		vdev->ready = DEVICE_SAFE_REMOVE;
2749 		destroy_device(dev);
2750 		rte_free(vdev->regions_hpa);
2751 		rte_free(vdev);
2752 		return -1;
2753 	}
2754 	ll_dev->vdev = vdev;
2755 	vdev->coreid = core_add;
2756 
2757 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2758 
2759 	/* Initialize device stats */
2760 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2761 
2762 	/* Disable notifications. */
2763 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2764 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2765 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2766 	dev->flags |= VIRTIO_DEV_RUNNING;
2767 
2768 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2769 
2770 	return 0;
2771 }
2772 
2773 /*
2774  * These callbacks allow devices to be added to the data core when configuration
2775  * has fully completed.
2776  */
2777 static const struct virtio_net_device_ops virtio_net_device_ops =
2778 {
2779 	.new_device =  new_device,
2780 	.destroy_device = destroy_device,
2781 };
2782 
2783 /*
2784  * This is a thread that wakes up periodically to print statistics if the user
2785  * has enabled them.
2786  */
2787 static void
2788 print_stats(void)
2789 {
2790 	struct virtio_net_data_ll *dev_ll;
2791 	uint64_t tx_dropped, rx_dropped;
2792 	uint64_t tx, tx_total, rx, rx_total;
2793 	uint32_t device_fh;
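	/* ANSI escape sequences: clear the screen and move the cursor to (1,1). */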
2794 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2795 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2796 
2797 	while(1) {
2798 		sleep(enable_stats);
2799 
2800 		/* Clear screen and move to top left */
2801 		printf("%s%s", clr, top_left);
2802 
2803 		printf("\nDevice statistics ====================================");
2804 
2805 		dev_ll = ll_root_used;
2806 		while (dev_ll != NULL) {
2807 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2808 			tx_total = dev_statistics[device_fh].tx_total;
2809 			tx = dev_statistics[device_fh].tx;
2810 			tx_dropped = tx_total - tx;
2811 			if (zero_copy == 0) {
2812 				rx_total = rte_atomic64_read(
2813 					&dev_statistics[device_fh].rx_total_atomic);
2814 				rx = rte_atomic64_read(
2815 					&dev_statistics[device_fh].rx_atomic);
2816 			} else {
2817 				rx_total = dev_statistics[device_fh].rx_total;
2818 				rx = dev_statistics[device_fh].rx;
2819 			}
2820 			rx_dropped = rx_total - rx;
2821 
2822 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2823 					"\nTX total: 		%"PRIu64""
2824 					"\nTX dropped: 		%"PRIu64""
2825 					"\nTX successful: 		%"PRIu64""
2826 					"\nRX total: 		%"PRIu64""
2827 					"\nRX dropped: 		%"PRIu64""
2828 					"\nRX successful: 		%"PRIu64"",
2829 					device_fh,
2830 					tx_total,
2831 					tx_dropped,
2832 					tx,
2833 					rx_total,
2834 					rx_dropped,
2835 					rx);
2836 
2837 			dev_ll = dev_ll->next;
2838 		}
2839 		printf("\n======================================================\n");
2840 	}
2841 }
2842 
2843 static void
2844 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2845 	char *ring_name, uint32_t nb_mbuf)
2846 {
2847 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
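	/*
	 * roomsize is passed to rte_pktmbuf_pool_init() below as the mbuf data
	 * room size: one guest descriptor's worth of data plus headroom.
	 */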
2848 	vpool_array[index].pool
2849 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2850 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2851 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2852 		rte_pktmbuf_init, NULL, socket, 0);
2853 	if (vpool_array[index].pool != NULL) {
2854 		vpool_array[index].ring
2855 			= rte_ring_create(ring_name,
2856 				rte_align32pow2(nb_mbuf + 1),
2857 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2858 		if (likely(vpool_array[index].ring != NULL)) {
2859 			LOG_DEBUG(VHOST_CONFIG,
2860 				"in setup_mempool_tbl: mbuf count in "
2861 				"mempool is: %d\n",
2862 				rte_mempool_count(vpool_array[index].pool));
2863 			LOG_DEBUG(VHOST_CONFIG,
2864 				"in setup_mempool_tbl: mbuf count in "
2865 				"ring is: %d\n",
2866 				rte_ring_count(vpool_array[index].ring));
2867 		} else {
2868 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2869 				ring_name);
2870 		}
2871 
2872 		/* Need consider head room. */
2873 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2874 	} else {
2875 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2876 	}
2877 }
2878 
2879 
2880 /*
2881  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2882  * device is also registered here to handle the IOCTLs.
2883  */
2884 int
2885 main(int argc, char *argv[])
2886 {
2887 	struct rte_mempool *mbuf_pool = NULL;
2888 	unsigned lcore_id, core_id = 0;
2889 	unsigned nb_ports, valid_num_ports;
2890 	int ret;
2891 	uint8_t portid;
2892 	uint16_t queue_id;
2893 	static pthread_t tid;
2894 
2895 	/* init EAL */
2896 	ret = rte_eal_init(argc, argv);
2897 	if (ret < 0)
2898 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2899 	argc -= ret;
2900 	argv += ret;
2901 
2902 	/* parse app arguments */
2903 	ret = us_vhost_parse_args(argc, argv);
2904 	if (ret < 0)
2905 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2906 
2907 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2908 		if (rte_lcore_is_enabled(lcore_id))
2909 			lcore_ids[core_id ++] = lcore_id;
2910 
2911 	if (rte_lcore_count() > RTE_MAX_LCORE)
2912 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2913 
2914 	/* Set the number of switching cores available. */
2915 	num_switching_cores = rte_lcore_count()-1;
2916 
2917 	/* Get the number of physical ports. */
2918 	nb_ports = rte_eth_dev_count();
2919 	if (nb_ports > RTE_MAX_ETHPORTS)
2920 		nb_ports = RTE_MAX_ETHPORTS;
2921 
2922 	/*
2923 	 * Update the global var NUM_PORTS and global array PORTS
2924 	 * and get value of var VALID_NUM_PORTS according to system ports number
2925 	 */
2926 	valid_num_ports = check_ports_num(nb_ports);
2927 
2928 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2929 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2930 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2931 		return -1;
2932 	}
2933 
2934 	if (zero_copy == 0) {
2935 		/* Create the mbuf pool. */
2936 		mbuf_pool = rte_mempool_create(
2937 				"MBUF_POOL",
2938 				NUM_MBUFS_PER_PORT
2939 				* valid_num_ports,
2940 				MBUF_SIZE, MBUF_CACHE_SIZE,
2941 				sizeof(struct rte_pktmbuf_pool_private),
2942 				rte_pktmbuf_pool_init, NULL,
2943 				rte_pktmbuf_init, NULL,
2944 				rte_socket_id(), 0);
2945 		if (mbuf_pool == NULL)
2946 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2947 
2948 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2949 			vpool_array[queue_id].pool = mbuf_pool;
2950 
2951 		if (vm2vm_mode == VM2VM_HARDWARE) {
2952 			/* Enable VT loop back to let L2 switch to do it. */
2953 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2954 			LOG_DEBUG(VHOST_CONFIG,
2955 				"Enable loop back for L2 switch in vmdq.\n");
2956 		}
2957 	} else {
2958 		uint32_t nb_mbuf;
2959 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2960 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2961 
2962 		nb_mbuf = num_rx_descriptor
2963 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2964 			+ num_switching_cores * MAX_PKT_BURST;
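		/*
		 * Per-queue RX pool size: one mbuf per RX descriptor plus
		 * per-core cache and burst headroom.
		 */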
2965 
2966 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2967 			snprintf(pool_name, sizeof(pool_name),
2968 				"rxmbuf_pool_%u", queue_id);
2969 			snprintf(ring_name, sizeof(ring_name),
2970 				"rxmbuf_ring_%u", queue_id);
2971 			setup_mempool_tbl(rte_socket_id(), queue_id,
2972 				pool_name, ring_name, nb_mbuf);
2973 		}
2974 
2975 		nb_mbuf = num_tx_descriptor
2976 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2977 				+ num_switching_cores * MAX_PKT_BURST;
2978 
2979 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2980 			snprintf(pool_name, sizeof(pool_name),
2981 				"txmbuf_pool_%u", queue_id);
2982 			snprintf(ring_name, sizeof(ring_name),
2983 				"txmbuf_ring_%u", queue_id);
2984 			setup_mempool_tbl(rte_socket_id(),
2985 				(queue_id + MAX_QUEUES),
2986 				pool_name, ring_name, nb_mbuf);
2987 		}
2988 
2989 		if (vm2vm_mode == VM2VM_HARDWARE) {
2990 			/* Enable VT loop back to let L2 switch to do it. */
2991 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2992 			LOG_DEBUG(VHOST_CONFIG,
2993 				"Enable loop back for L2 switch in vmdq.\n");
2994 		}
2995 	}
2996 	/* Set log level. */
2997 	rte_set_log_level(LOG_LEVEL);
2998 
2999 	/* initialize all ports */
3000 	for (portid = 0; portid < nb_ports; portid++) {
3001 		/* skip ports that are not enabled */
3002 		if ((enabled_port_mask & (1 << portid)) == 0) {
3003 			RTE_LOG(INFO, VHOST_PORT,
3004 				"Skipping disabled port %d\n", portid);
3005 			continue;
3006 		}
3007 		if (port_init(portid) != 0)
3008 			rte_exit(EXIT_FAILURE,
3009 				"Cannot initialize network ports\n");
3010 	}
3011 
3012 	/* Initialise all linked lists. */
3013 	if (init_data_ll() == -1)
3014 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3015 
3016 	/* Initialize device stats */
3017 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3018 
3019 	/* Enable stats if the user option is set. */
3020 	if (enable_stats)
3021 		pthread_create(&tid, NULL, (void *)print_stats, NULL);
3022 
3023 	/* Launch all data cores. */
3024 	if (zero_copy == 0) {
3025 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3026 			rte_eal_remote_launch(switch_worker,
3027 				mbuf_pool, lcore_id);
3028 		}
3029 	} else {
3030 		uint32_t count_in_mempool, index, i;
3031 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3032 			/* For all RX and TX queues. */
3033 			count_in_mempool
3034 				= rte_mempool_count(vpool_array[index].pool);
3035 
3036 			/*
3037 			 * Transfer all un-attached mbufs from vpool.pool
3038 			 * to vpool.ring.
3039 			 */
3040 			for (i = 0; i < count_in_mempool; i++) {
3041 				struct rte_mbuf *mbuf
3042 					= __rte_mbuf_raw_alloc(
3043 						vpool_array[index].pool);
3044 				rte_ring_sp_enqueue(vpool_array[index].ring,
3045 						(void *)mbuf);
3046 			}
3047 
3048 			LOG_DEBUG(VHOST_CONFIG,
3049 				"in main: mbuf count in mempool at initial "
3050 				"is: %d\n", count_in_mempool);
3051 			LOG_DEBUG(VHOST_CONFIG,
3052 				"in main: mbuf count in ring at initial is:"
3053 				" %d\n",
3054 				rte_ring_count(vpool_array[index].ring));
3055 		}
3056 
3057 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3058 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3059 				lcore_id);
3060 	}
3061 
3062 	if (mergeable == 0)
3063 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3064 
3065 	/* Register CUSE device to handle IOCTLs. */
3066 	ret = rte_vhost_driver_register((char *)&dev_basename);
3067 	if (ret != 0)
3068 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3069 
3070 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3071 
3072 	/* Start CUSE session. */
3073 	rte_vhost_driver_session_start();
3074 	return 0;
3075 
3076 }
3077 
3078