xref: /dpdk/examples/vhost/main.c (revision ceb1ccd5d50c1a89ba8bdd97cc199e7f07422b98)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
69 							(num_switching_cores*MAX_PKT_BURST) +  			\
70 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
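/*
 * Worked example (assuming 8 switching cores and the default sizes below):
 * 128*1024 + 8*32 + 8*512 + 9*128 = 136576 mbufs per port.
 */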
72 
73 #define MBUF_CACHE_SIZE	128
74 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
75 
76 /*
77  * No frame data buffers allocated by the host are required for the zero
78  * copy implementation; the guest allocates the frame data buffers and
79  * vhost uses them directly.
80  */
81 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
82 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
83 #define MBUF_CACHE_SIZE_ZCP 0
84 
85 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
86 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
87 
88 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
89 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
90 
91 #define JUMBO_FRAME_MAX_SIZE    0x2600
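/* 0x2600 = 9728 bytes; large enough for a 9000-byte jumbo payload plus Ethernet/VLAN headers. */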
92 
93 /* State of virtio device. */
94 #define DEVICE_MAC_LEARNING 0
95 #define DEVICE_RX			1
96 #define DEVICE_SAFE_REMOVE	2
97 
98 /* Config_core_flag status definitions. */
99 #define REQUEST_DEV_REMOVAL 1
100 #define ACK_DEV_REMOVAL 0
101 
102 /* Configurable number of RX/TX ring descriptors */
103 #define RTE_TEST_RX_DESC_DEFAULT 1024
104 #define RTE_TEST_TX_DESC_DEFAULT 512
105 
106 /*
107  * These two macros need refinement for the legacy and DPDK-based front ends:
108  * take the max vring avail descriptors/entries from the guest minus
109  * MAX_PKT_BURST, then adjust to a power of 2.
110  */
111 /*
112  * For the legacy front end: 128 descriptors,
113  * half for virtio headers, the other half for mbufs.
114  */
115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
117 
118 /* Get first 4 bytes in mbuf headroom. */
119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
120 		+ sizeof(struct rte_mbuf)))
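/*
 * The zero copy paths stash the guest descriptor index in these 4 bytes
 * (see attach_rxmbuf_zcp() and txmbuf_clean_zcp() below).
 */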
121 
122 /* true if x is a power of 2 */
123 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
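/* For example, POWEROF2(64) is true and POWEROF2(48) is false; note it also accepts 0. */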
124 
125 #define INVALID_PORT_ID 0xFF
126 
127 /* Max number of devices. Limited by vmdq. */
128 #define MAX_DEVICES 64
129 
130 /* Size of buffers used for snprintfs. */
131 #define MAX_PRINT_BUFF 6072
132 
133 /* Maximum character device basename size. */
134 #define MAX_BASENAME_SZ 10
135 
136 /* Maximum long option length for option parsing. */
137 #define MAX_LONG_OPT_SZ 64
138 
139 /* Used to compare MAC addresses. */
140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
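/* 48-bit mask: ether_addr_cmp() compares only the 6 MAC bytes of a 64-bit load. */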
141 
142 /* Number of descriptors per cacheline. */
143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
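/* With a 64-byte cache line and a 16-byte struct vring_desc this evaluates to 4. */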
144 
145 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
146 
147 /* mask of enabled ports */
148 static uint32_t enabled_port_mask = 0;
149 
150 /* Promiscuous mode */
151 static uint32_t promiscuous;
152 
153 /* Number of switching cores enabled */
154 static uint32_t num_switching_cores = 0;
155 
156 /* Number of devices/queues to support */
157 static uint32_t num_queues = 0;
158 static uint32_t num_devices;
159 
160 /*
161  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
162  * descriptors; disabled by default.
163  */
164 static uint32_t zero_copy;
165 static int mergeable;
166 
167 /* Do VLAN strip on the host, enabled by default */
168 static uint32_t vlan_strip = 1;
169 
170 /* Number of descriptors to use */
171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
173 
174 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
175 #define MAX_RING_DESC 4096
176 
177 struct vpool {
178 	struct rte_mempool *pool;
179 	struct rte_ring *ring;
180 	uint32_t buf_size;
181 } vpool_array[MAX_QUEUES+MAX_QUEUES];
182 
183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
184 typedef enum {
185 	VM2VM_DISABLED = 0,
186 	VM2VM_SOFTWARE = 1,
187 	VM2VM_HARDWARE = 2,
188 	VM2VM_LAST
189 } vm2vm_type;
190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
191 
192 /* The type of host physical address translated from guest physical address. */
193 typedef enum {
194 	PHYS_ADDR_CONTINUOUS = 0,
195 	PHYS_ADDR_CROSS_SUBREG = 1,
196 	PHYS_ADDR_INVALID = 2,
197 	PHYS_ADDR_LAST
198 } hpa_type;
199 
200 /* Enable stats. */
201 static uint32_t enable_stats = 0;
202 /* Enable retries on RX. */
203 static uint32_t enable_retry = 1;
204 
205 /* Disable TX checksum offload */
206 static uint32_t enable_tx_csum;
207 
208 /* Disable TSO offload */
209 static uint32_t enable_tso;
210 
211 /* Specify the timeout (in microseconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215 
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218 
219 /* Empty VMDQ configuration structure. Filled in programmatically */
220 static struct rte_eth_conf vmdq_conf_default = {
221 	.rxmode = {
222 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
223 		.split_hdr_size = 0,
224 		.header_split   = 0, /**< Header Split disabled */
225 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
226 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
227 		/*
228 		 * This is necessary for 1G NICs such as the I350;
229 		 * it fixes a bug where IPv4 forwarding in the guest cannot
230 		 * forward packets from one virtio device to another.
231 		 */
232 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
233 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
234 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
235 	},
236 
237 	.txmode = {
238 		.mq_mode = ETH_MQ_TX_NONE,
239 	},
240 	.rx_adv_conf = {
241 		/*
242 		 * should be overridden separately in code with
243 		 * appropriate values
244 		 */
245 		.vmdq_rx_conf = {
246 			.nb_queue_pools = ETH_8_POOLS,
247 			.enable_default_pool = 0,
248 			.default_pool = 0,
249 			.nb_pool_maps = 0,
250 			.pool_map = {{0, 0},},
251 		},
252 	},
253 };
254 
255 static unsigned lcore_ids[RTE_MAX_LCORE];
256 static uint8_t ports[RTE_MAX_ETHPORTS];
257 static unsigned num_ports = 0; /**< The number of ports specified in command line */
258 static uint16_t num_pf_queues, num_vmdq_queues;
259 static uint16_t vmdq_pool_base, vmdq_queue_base;
260 static uint16_t queues_per_pool;
261 
262 static const uint16_t external_pkt_default_vlan_tag = 2000;
263 const uint16_t vlan_tags[] = {
264 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
265 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
266 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
267 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
268 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
269 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
270 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
271 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
272 };
273 
274 /* ethernet addresses of ports */
275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
276 
277 /* heads for the main used and free linked lists for the data path. */
278 static struct virtio_net_data_ll *ll_root_used = NULL;
279 static struct virtio_net_data_ll *ll_root_free = NULL;
280 
281 /* Array of data core structures containing information on individual core linked lists. */
282 static struct lcore_info lcore_info[RTE_MAX_LCORE];
283 
284 /* Used for queueing bursts of TX packets. */
285 struct mbuf_table {
286 	unsigned len;
287 	unsigned txq_id;
288 	struct rte_mbuf *m_table[MAX_PKT_BURST];
289 };
290 
291 /* TX queue for each data core. */
292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
293 
294 /* TX queue for each virtio device for zero copy. */
295 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
296 
297 /* Vlan header struct used to insert vlan tags on TX. */
298 struct vlan_ethhdr {
299 	unsigned char   h_dest[ETH_ALEN];
300 	unsigned char   h_source[ETH_ALEN];
301 	__be16          h_vlan_proto;
302 	__be16          h_vlan_TCI;
303 	__be16          h_vlan_encapsulated_proto;
304 };
305 
306 /* Header lengths. */
307 #define VLAN_HLEN       4
308 #define VLAN_ETH_HLEN   18
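/* VLAN_ETH_HLEN = 14-byte Ethernet header + 4-byte 802.1Q tag. */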
309 
310 /* Per-device statistics struct */
311 struct device_statistics {
312 	uint64_t tx_total;
313 	rte_atomic64_t rx_total_atomic;
314 	uint64_t rx_total;
315 	uint64_t tx;
316 	rte_atomic64_t rx_atomic;
317 	uint64_t rx;
318 } __rte_cache_aligned;
319 struct device_statistics dev_statistics[MAX_DEVICES];
320 
321 /*
322  * Builds up the correct configuration for VMDQ VLAN pool map
323  * according to the pool & queue limits.
324  */
325 static inline int
326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
327 {
328 	struct rte_eth_vmdq_rx_conf conf;
329 	struct rte_eth_vmdq_rx_conf *def_conf =
330 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
331 	unsigned i;
332 
333 	memset(&conf, 0, sizeof(conf));
334 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
335 	conf.nb_pool_maps = num_devices;
336 	conf.enable_loop_back = def_conf->enable_loop_back;
337 	conf.rx_mode = def_conf->rx_mode;
338 
339 	for (i = 0; i < conf.nb_pool_maps; i++) {
340 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
341 		conf.pool_map[i].pools = (1UL << i);
342 	}
343 
344 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
345 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
346 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
347 	return 0;
348 }
349 
350 /*
351  * Validate the device number against the max pool number obtained from
352  * dev_info. If the device number is invalid, print an error message and
353  * return -1. Each device must have its own pool.
354  */
355 static inline int
356 validate_num_devices(uint32_t max_nb_devices)
357 {
358 	if (num_devices > max_nb_devices) {
359 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
360 		return -1;
361 	}
362 	return 0;
363 }
364 
365 /*
366  * Initialises a given port using global settings and with the rx buffers
367  * coming from the mbuf_pool passed as a parameter.
368  */
369 static inline int
370 port_init(uint8_t port)
371 {
372 	struct rte_eth_dev_info dev_info;
373 	struct rte_eth_conf port_conf;
374 	struct rte_eth_rxconf *rxconf;
375 	struct rte_eth_txconf *txconf;
376 	int16_t rx_rings, tx_rings;
377 	uint16_t rx_ring_size, tx_ring_size;
378 	int retval;
379 	uint16_t q;
380 
381 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
382 	rte_eth_dev_info_get (port, &dev_info);
383 
384 	if (dev_info.max_rx_queues > MAX_QUEUES) {
385 		rte_exit(EXIT_FAILURE,
386 			"please define MAX_QUEUES no less than %u in %s\n",
387 			dev_info.max_rx_queues, __FILE__);
388 	}
389 
390 	rxconf = &dev_info.default_rxconf;
391 	txconf = &dev_info.default_txconf;
392 	rxconf->rx_drop_en = 1;
393 
394 	/* Enable vlan offload */
395 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
396 
397 	/*
398 	 * Zero copy defers queue RX/TX start to the time when guest
399 	 * finishes its startup and packet buffers from that guest are
400 	 * available.
401 	 */
402 	if (zero_copy) {
403 		rxconf->rx_deferred_start = 1;
404 		rxconf->rx_drop_en = 0;
405 		txconf->tx_deferred_start = 1;
406 	}
407 
408 	/* Configure the number of supported virtio devices based on VMDQ limits */
409 	num_devices = dev_info.max_vmdq_pools;
410 
411 	if (zero_copy) {
412 		rx_ring_size = num_rx_descriptor;
413 		tx_ring_size = num_tx_descriptor;
414 		tx_rings = dev_info.max_tx_queues;
415 	} else {
416 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
417 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
418 		tx_rings = (uint16_t)rte_lcore_count();
419 	}
420 
421 	retval = validate_num_devices(MAX_DEVICES);
422 	if (retval < 0)
423 		return retval;
424 
425 	/* Get port configuration. */
426 	retval = get_eth_conf(&port_conf, num_devices);
427 	if (retval < 0)
428 		return retval;
429 	/* NIC queues are divided into pf queues and vmdq queues.  */
430 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
431 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
432 	num_vmdq_queues = num_devices * queues_per_pool;
433 	num_queues = num_pf_queues + num_vmdq_queues;
434 	vmdq_queue_base = dev_info.vmdq_queue_base;
435 	vmdq_pool_base  = dev_info.vmdq_pool_base;
436 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
437 		num_pf_queues, num_devices, queues_per_pool);
438 
439 	if (port >= rte_eth_dev_count()) return -1;
440 
441 	if (enable_tx_csum == 0)
442 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
443 
444 	if (enable_tso == 0) {
445 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
446 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
447 	}
448 
449 	rx_rings = (uint16_t)dev_info.max_rx_queues;
450 	/* Configure ethernet device. */
451 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 	if (retval != 0)
453 		return retval;
454 
455 	/* Setup the queues. */
456 	for (q = 0; q < rx_rings; q ++) {
457 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 						rte_eth_dev_socket_id(port),
459 						rxconf,
460 						vpool_array[q].pool);
461 		if (retval < 0)
462 			return retval;
463 	}
464 	for (q = 0; q < tx_rings; q ++) {
465 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
466 						rte_eth_dev_socket_id(port),
467 						txconf);
468 		if (retval < 0)
469 			return retval;
470 	}
471 
472 	/* Start the device. */
473 	retval  = rte_eth_dev_start(port);
474 	if (retval < 0) {
475 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476 		return retval;
477 	}
478 
479 	if (promiscuous)
480 		rte_eth_promiscuous_enable(port);
481 
482 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
483 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
484 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
485 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
486 			(unsigned)port,
487 			vmdq_ports_eth_addr[port].addr_bytes[0],
488 			vmdq_ports_eth_addr[port].addr_bytes[1],
489 			vmdq_ports_eth_addr[port].addr_bytes[2],
490 			vmdq_ports_eth_addr[port].addr_bytes[3],
491 			vmdq_ports_eth_addr[port].addr_bytes[4],
492 			vmdq_ports_eth_addr[port].addr_bytes[5]);
493 
494 	return 0;
495 }
496 
497 /*
498  * Set character device basename.
499  */
500 static int
501 us_vhost_parse_basename(const char *q_arg)
502 {
503 	/* parse the basename string */
504 
505 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
506 		return -1;
507 	else
508 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509 
510 	return 0;
511 }
512 
513 /*
514  * Parse the portmask provided at run time.
515  */
516 static int
517 parse_portmask(const char *portmask)
518 {
519 	char *end = NULL;
520 	unsigned long pm;
521 
522 	errno = 0;
523 
524 	/* parse hexadecimal string */
525 	pm = strtoul(portmask, &end, 16);
526 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
527 		return -1;
528 
529 	if (pm == 0)
530 		return -1;
531 
532 	return pm;
533 
534 }
535 
536 /*
537  * Parse num options at run time.
538  */
539 static int
540 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 {
542 	char *end = NULL;
543 	unsigned long num;
544 
545 	errno = 0;
546 
547 	/* parse unsigned int string */
548 	num = strtoul(q_arg, &end, 10);
549 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
550 		return -1;
551 
552 	if (num > max_valid_value)
553 		return -1;
554 
555 	return num;
556 
557 }
558 
559 /*
560  * Display usage
561  */
562 static void
563 us_vhost_usage(const char *prgname)
564 {
565 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
566 	"		--vm2vm [0|1|2]\n"
567 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
568 	"		--dev-basename <name>\n"
569 	"		--nb-devices ND\n"
570 	"		-p PORTMASK: Set mask for ports to be used by application\n"
571 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
572 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
573 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
574 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
575 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
576 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
577 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
578 	"		--dev-basename: The basename to be used for the character device.\n"
579 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
580 			"zero copy\n"
581 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
582 			"used only when zero copy is enabled.\n"
583 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
584 			"used only when zero copy is enabled.\n"
585 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
586 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n",
587 	       prgname);
588 }
589 
590 /*
591  * Parse the arguments given in the command line of the application.
592  */
593 static int
594 us_vhost_parse_args(int argc, char **argv)
595 {
596 	int opt, ret;
597 	int option_index;
598 	unsigned i;
599 	const char *prgname = argv[0];
600 	static struct option long_option[] = {
601 		{"vm2vm", required_argument, NULL, 0},
602 		{"rx-retry", required_argument, NULL, 0},
603 		{"rx-retry-delay", required_argument, NULL, 0},
604 		{"rx-retry-num", required_argument, NULL, 0},
605 		{"mergeable", required_argument, NULL, 0},
606 		{"vlan-strip", required_argument, NULL, 0},
607 		{"stats", required_argument, NULL, 0},
608 		{"dev-basename", required_argument, NULL, 0},
609 		{"zero-copy", required_argument, NULL, 0},
610 		{"rx-desc-num", required_argument, NULL, 0},
611 		{"tx-desc-num", required_argument, NULL, 0},
612 		{"tx-csum", required_argument, NULL, 0},
613 		{"tso", required_argument, NULL, 0},
614 		{NULL, 0, 0, 0},
615 	};
616 
617 	/* Parse command line */
618 	while ((opt = getopt_long(argc, argv, "p:P",
619 			long_option, &option_index)) != EOF) {
620 		switch (opt) {
621 		/* Portmask */
622 		case 'p':
623 			enabled_port_mask = parse_portmask(optarg);
624 			if (enabled_port_mask == 0) {
625 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
626 				us_vhost_usage(prgname);
627 				return -1;
628 			}
629 			break;
630 
631 		case 'P':
632 			promiscuous = 1;
633 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
634 				ETH_VMDQ_ACCEPT_BROADCAST |
635 				ETH_VMDQ_ACCEPT_MULTICAST;
636 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
637 
638 			break;
639 
640 		case 0:
641 			/* Enable/disable vm2vm comms. */
642 			if (!strncmp(long_option[option_index].name, "vm2vm",
643 				MAX_LONG_OPT_SZ)) {
644 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
645 				if (ret == -1) {
646 					RTE_LOG(INFO, VHOST_CONFIG,
647 						"Invalid argument for "
648 						"vm2vm [0|1|2]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					vm2vm_mode = (vm2vm_type)ret;
653 				}
654 			}
655 
656 			/* Enable/disable retries on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, 1);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					enable_retry = ret;
665 				}
666 			}
667 
668 			/* Enable/disable TX checksum offload. */
669 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else
676 					enable_tx_csum = ret;
677 			}
678 
679 			/* Enable/disable TSO offload. */
680 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
681 				ret = parse_num_opt(optarg, 1);
682 				if (ret == -1) {
683 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
684 					us_vhost_usage(prgname);
685 					return -1;
686 				} else
687 					enable_tso = ret;
688 			}
689 
690 			/* Specify the retry delay time (in microseconds) on RX. */
691 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, INT32_MAX);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
695 					us_vhost_usage(prgname);
696 					return -1;
697 				} else {
698 					burst_rx_delay_time = ret;
699 				}
700 			}
701 
702 			/* Specify the retries number on RX. */
703 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
704 				ret = parse_num_opt(optarg, INT32_MAX);
705 				if (ret == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
707 					us_vhost_usage(prgname);
708 					return -1;
709 				} else {
710 					burst_rx_retry_num = ret;
711 				}
712 			}
713 
714 			/* Enable/disable RX mergeable buffers. */
715 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
716 				ret = parse_num_opt(optarg, 1);
717 				if (ret == -1) {
718 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
719 					us_vhost_usage(prgname);
720 					return -1;
721 				} else {
722 					mergeable = !!ret;
723 					if (ret) {
724 						vmdq_conf_default.rxmode.jumbo_frame = 1;
725 						vmdq_conf_default.rxmode.max_rx_pkt_len
726 							= JUMBO_FRAME_MAX_SIZE;
727 					}
728 				}
729 			}
730 
731 			/* Enable/disable RX VLAN strip on host. */
732 			if (!strncmp(long_option[option_index].name,
733 				"vlan-strip", MAX_LONG_OPT_SZ)) {
734 				ret = parse_num_opt(optarg, 1);
735 				if (ret == -1) {
736 					RTE_LOG(INFO, VHOST_CONFIG,
737 						"Invalid argument for VLAN strip [0|1]\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					vlan_strip = !!ret;
742 					vmdq_conf_default.rxmode.hw_vlan_strip =
743 						vlan_strip;
744 				}
745 			}
746 
747 			/* Enable/disable stats. */
748 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
749 				ret = parse_num_opt(optarg, INT32_MAX);
750 				if (ret == -1) {
751 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
752 					us_vhost_usage(prgname);
753 					return -1;
754 				} else {
755 					enable_stats = ret;
756 				}
757 			}
758 
759 			/* Set character device basename. */
760 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
761 				if (us_vhost_parse_basename(optarg) == -1) {
762 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
763 					us_vhost_usage(prgname);
764 					return -1;
765 				}
766 			}
767 
768 			/* Enable/disable rx/tx zero copy. */
769 			if (!strncmp(long_option[option_index].name,
770 				"zero-copy", MAX_LONG_OPT_SZ)) {
771 				ret = parse_num_opt(optarg, 1);
772 				if (ret == -1) {
773 					RTE_LOG(INFO, VHOST_CONFIG,
774 						"Invalid argument"
775 						" for zero-copy [0|1]\n");
776 					us_vhost_usage(prgname);
777 					return -1;
778 				} else
779 					zero_copy = ret;
780 			}
781 
782 			/* Specify the descriptor number on RX. */
783 			if (!strncmp(long_option[option_index].name,
784 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
785 				ret = parse_num_opt(optarg, MAX_RING_DESC);
786 				if ((ret == -1) || (!POWEROF2(ret))) {
787 					RTE_LOG(INFO, VHOST_CONFIG,
788 					"Invalid argument for rx-desc-num[0-N],"
789 					"power of 2 required.\n");
790 					us_vhost_usage(prgname);
791 					return -1;
792 				} else {
793 					num_rx_descriptor = ret;
794 				}
795 			}
796 
797 			/* Specify the descriptor number on TX. */
798 			if (!strncmp(long_option[option_index].name,
799 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
800 				ret = parse_num_opt(optarg, MAX_RING_DESC);
801 				if ((ret == -1) || (!POWEROF2(ret))) {
802 					RTE_LOG(INFO, VHOST_CONFIG,
803 					"Invalid argument for tx-desc-num [0-N],"
804 					"power of 2 required.\n");
805 					us_vhost_usage(prgname);
806 					return -1;
807 				} else {
808 					num_tx_descriptor = ret;
809 				}
810 			}
811 
812 			break;
813 
814 			/* Invalid option - print options. */
815 		default:
816 			us_vhost_usage(prgname);
817 			return -1;
818 		}
819 	}
820 
821 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
822 		if (enabled_port_mask & (1 << i))
823 			ports[num_ports++] = (uint8_t)i;
824 	}
825 
826 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
827 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
828 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
829 		return -1;
830 	}
831 
832 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
833 		RTE_LOG(INFO, VHOST_PORT,
834 			"Vhost zero copy doesn't support software vm2vm, "
835 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
836 		return -1;
837 	}
838 
839 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
840 		RTE_LOG(INFO, VHOST_PORT,
841 			"Vhost zero copy doesn't support jumbo frames, "
842 			"please specify '--mergeable 0' to disable the "
843 			"mergeable feature.\n");
844 		return -1;
845 	}
846 
847 	return 0;
848 }
849 
850 /*
851  * Update the global variable num_ports and the array ports according to the
852  * number of system ports, and return the number of valid ports.
853  */
854 static unsigned check_ports_num(unsigned nb_ports)
855 {
856 	unsigned valid_num_ports = num_ports;
857 	unsigned portid;
858 
859 	if (num_ports > nb_ports) {
860 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
861 			num_ports, nb_ports);
862 		num_ports = nb_ports;
863 	}
864 
865 	for (portid = 0; portid < num_ports; portid ++) {
866 		if (ports[portid] >= nb_ports) {
867 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
868 				ports[portid], (nb_ports - 1));
869 			ports[portid] = INVALID_PORT_ID;
870 			valid_num_ports--;
871 		}
872 	}
873 	return valid_num_ports;
874 }
875 
876 /*
877  * Macro to print out packet contents. Wrapped in debug define so that the
878  * data path is not affected when debug is disabled.
879  */
880 #ifdef DEBUG
881 #define PRINT_PACKET(device, addr, size, header) do {																\
882 	char *pkt_addr = (char*)(addr);																					\
883 	unsigned int index;																								\
884 	char packet[MAX_PRINT_BUFF];																					\
885 																													\
886 	if ((header))																									\
887 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
888 	else																											\
889 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
890 	for (index = 0; index < (size); index++) {																		\
891 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
892 			"%02hhx ", pkt_addr[index]);																			\
893 	}																												\
894 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
895 																													\
896 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
897 } while(0)
898 #else
899 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
900 #endif
901 
902 /*
903  * Function to convert guest physical addresses to vhost physical addresses.
904  * This is used to convert virtio buffer addresses.
905  */
906 static inline uint64_t __attribute__((always_inline))
907 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
908 	uint32_t buf_len, hpa_type *addr_type)
909 {
910 	struct virtio_memory_regions_hpa *region;
911 	uint32_t regionidx;
912 	uint64_t vhost_pa = 0;
913 
914 	*addr_type = PHYS_ADDR_INVALID;
915 
916 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
917 		region = &vdev->regions_hpa[regionidx];
918 		if ((guest_pa >= region->guest_phys_address) &&
919 			(guest_pa <= region->guest_phys_address_end)) {
920 			vhost_pa = region->host_phys_addr_offset + guest_pa;
921 			if (likely((guest_pa + buf_len - 1)
922 				<= region->guest_phys_address_end))
923 				*addr_type = PHYS_ADDR_CONTINUOUS;
924 			else
925 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
926 			break;
927 		}
928 	}
929 
930 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
931 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
932 		(void *)(uintptr_t)vhost_pa);
933 
934 	return vhost_pa;
935 }
936 
937 /*
938  * Compares a packet destination MAC address to a device MAC address.
939  */
940 static inline int __attribute__((always_inline))
941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
942 {
943 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
944 }
945 
946 /*
947  * This function learns the MAC address of the device and registers it,
948  * along with a VLAN tag, with a VMDQ pool.
949  */
950 static int
951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
952 {
953 	struct ether_hdr *pkt_hdr;
954 	struct virtio_net_data_ll *dev_ll;
955 	struct virtio_net *dev = vdev->dev;
956 	int i, ret;
957 
958 	/* Learn MAC address of guest device from packet */
959 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960 
961 	dev_ll = ll_root_used;
962 
963 	while (dev_ll != NULL) {
964 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
965 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
966 			return -1;
967 		}
968 		dev_ll = dev_ll->next;
969 	}
970 
971 	for (i = 0; i < ETHER_ADDR_LEN; i++)
972 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
973 
974 	/* vlan_tag currently uses the device_id. */
975 	vdev->vlan_tag = vlan_tags[dev->device_fh];
976 
977 	/* Print out VMDQ registration info. */
978 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
979 		dev->device_fh,
980 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
981 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
982 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
983 		vdev->vlan_tag);
984 
985 	/* Register the MAC address. */
986 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
987 				(uint32_t)dev->device_fh + vmdq_pool_base);
988 	if (ret)
989 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
990 					dev->device_fh);
991 
992 	/* Enable stripping of the vlan tag as we handle routing. */
993 	if (vlan_strip)
994 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
995 			(uint16_t)vdev->vmdq_rx_q, 1);
996 
997 	/* Set device as ready for RX. */
998 	vdev->ready = DEVICE_RX;
999 
1000 	return 0;
1001 }
1002 
1003 /*
1004  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1005  * queue before disabling RX on the device.
1006  */
1007 static inline void
1008 unlink_vmdq(struct vhost_dev *vdev)
1009 {
1010 	unsigned i = 0;
1011 	unsigned rx_count;
1012 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1013 
1014 	if (vdev->ready == DEVICE_RX) {
1015 		/*clear MAC and VLAN settings*/
1016 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1017 		for (i = 0; i < 6; i++)
1018 			vdev->mac_address.addr_bytes[i] = 0;
1019 
1020 		vdev->vlan_tag = 0;
1021 
1022 		/*Clear out the receive buffers*/
1023 		rx_count = rte_eth_rx_burst(ports[0],
1024 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1025 
1026 		while (rx_count) {
1027 			for (i = 0; i < rx_count; i++)
1028 				rte_pktmbuf_free(pkts_burst[i]);
1029 
1030 			rx_count = rte_eth_rx_burst(ports[0],
1031 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1032 		}
1033 
1034 		vdev->ready = DEVICE_MAC_LEARNING;
1035 	}
1036 }
1037 
1038 /*
1039  * Check if the packet destination MAC address is for a local device. If so then put
1040  * the packet on that device's RX queue. If not then return.
1041  */
1042 static inline int __attribute__((always_inline))
1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1044 {
1045 	struct virtio_net_data_ll *dev_ll;
1046 	struct ether_hdr *pkt_hdr;
1047 	uint64_t ret = 0;
1048 	struct virtio_net *dev = vdev->dev;
1049 	struct virtio_net *tdev; /* destination virtio device */
1050 
1051 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052 
1053 	/*get the used devices list*/
1054 	dev_ll = ll_root_used;
1055 
1056 	while (dev_ll != NULL) {
1057 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1058 				          &dev_ll->vdev->mac_address)) {
1059 
1060 			/* Drop the packet if the TX packet is destined for the TX device. */
1061 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1063 							dev->device_fh);
1064 				return 0;
1065 			}
1066 			tdev = dev_ll->vdev->dev;
1067 
1068 
1069 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1070 
1071 			if (unlikely(dev_ll->vdev->remove)) {
1072 				/*drop the packet if the device is marked for removal*/
1073 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1074 			} else {
1075 				/*send the packet to the local virtio device*/
1076 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1077 				if (enable_stats) {
1078 					rte_atomic64_add(
1079 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1080 					1);
1081 					rte_atomic64_add(
1082 					&dev_statistics[tdev->device_fh].rx_atomic,
1083 					ret);
1084 					dev_statistics[dev->device_fh].tx_total++;
1085 					dev_statistics[dev->device_fh].tx += ret;
1086 				}
1087 			}
1088 
1089 			return 0;
1090 		}
1091 		dev_ll = dev_ll->next;
1092 	}
1093 
1094 	return -1;
1095 }
1096 
1097 /*
1098  * Check if the destination MAC of a packet belongs to a local VM,
1099  * and if so get its VLAN tag and offset.
1100  */
1101 static inline int __attribute__((always_inline))
1102 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1103 	uint32_t *offset, uint16_t *vlan_tag)
1104 {
1105 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1106 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1107 
1108 	while (dev_ll != NULL) {
1109 		if ((dev_ll->vdev->ready == DEVICE_RX)
1110 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1111 		&dev_ll->vdev->mac_address)) {
1112 			/*
1113 			 * Drop the packet if the TX packet is
1114 			 * destined for the TX device.
1115 			 */
1116 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1117 				LOG_DEBUG(VHOST_DATA,
1118 				"(%"PRIu64") TX: Source and destination"
1119 				" MAC addresses are the same. Dropping "
1120 				"packet.\n",
1121 				dev_ll->vdev->dev->device_fh);
1122 				return -1;
1123 			}
1124 
1125 			/*
1126 			 * HW VLAN strip will reduce the packet length
1127 			 * by the length of the VLAN tag, so we need to
1128 			 * restore the packet length by adding it back.
1129 			 */
1130 			*offset = VLAN_HLEN;
1131 			*vlan_tag =
1132 			(uint16_t)
1133 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1134 
1135 			LOG_DEBUG(VHOST_DATA,
1136 			"(%"PRIu64") TX: pkt to local VM device id:"
1137 			"(%"PRIu64") vlan tag: %d.\n",
1138 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1139 			(int)*vlan_tag);
1140 
1141 			break;
1142 		}
1143 		dev_ll = dev_ll->next;
1144 	}
1145 	return 0;
1146 }
1147 
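/*
 * Return the pseudo-header checksum for the given L3 header; virtio_tx_offload()
 * below seeds the TCP checksum field with it, as DPDK TX offloads expect.
 */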
1148 static uint16_t
1149 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1150 {
1151 	if (ol_flags & PKT_TX_IPV4)
1152 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1153 	else /* assume ethertype == ETHER_TYPE_IPv6 */
1154 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1155 }
1156 
1157 static void virtio_tx_offload(struct rte_mbuf *m)
1158 {
1159 	void *l3_hdr;
1160 	struct ipv4_hdr *ipv4_hdr = NULL;
1161 	struct tcp_hdr *tcp_hdr = NULL;
1162 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163 
1164 	l3_hdr = (char *)eth_hdr + m->l2_len;
1165 
1166 	if (m->ol_flags & PKT_TX_IPV4) {
1167 		ipv4_hdr = l3_hdr;
1168 		ipv4_hdr->hdr_checksum = 0;
1169 		m->ol_flags |= PKT_TX_IP_CKSUM;
1170 	}
1171 
1172 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1173 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1174 }
1175 
1176 /*
1177  * This function routes the TX packet to the correct interface. This may be a local device
1178  * or the physical port.
1179  */
1180 static inline void __attribute__((always_inline))
1181 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1182 {
1183 	struct mbuf_table *tx_q;
1184 	struct rte_mbuf **m_table;
1185 	unsigned len, ret, offset = 0;
1186 	const uint16_t lcore_id = rte_lcore_id();
1187 	struct virtio_net *dev = vdev->dev;
1188 	struct ether_hdr *nh;
1189 
1190 	/*check if destination is local VM*/
1191 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1192 		rte_pktmbuf_free(m);
1193 		return;
1194 	}
1195 
1196 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1197 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1198 			rte_pktmbuf_free(m);
1199 			return;
1200 		}
1201 	}
1202 
1203 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1204 
1205 	/*Add packet to the port tx queue*/
1206 	tx_q = &lcore_tx_queue[lcore_id];
1207 	len = tx_q->len;
1208 
1209 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1210 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1211 		/* Guest has inserted the vlan tag. */
1212 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1213 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1214 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1215 			(vh->vlan_tci != vlan_tag_be))
1216 			vh->vlan_tci = vlan_tag_be;
1217 	} else {
1218 		m->ol_flags |= PKT_TX_VLAN_PKT;
1219 
1220 		/*
1221 		 * Find the right seg to adjust the data len when offset is
1222 		 * bigger than tail room size.
1223 		 */
1224 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1225 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1226 				m->data_len += offset;
1227 			else {
1228 				struct rte_mbuf *seg = m;
1229 
1230 				while ((seg->next != NULL) &&
1231 					(offset > rte_pktmbuf_tailroom(seg)))
1232 					seg = seg->next;
1233 
1234 				seg->data_len += offset;
1235 			}
1236 			m->pkt_len += offset;
1237 		}
1238 
1239 		m->vlan_tci = vlan_tag;
1240 	}
1241 
1242 	if (m->ol_flags & PKT_TX_TCP_SEG)
1243 		virtio_tx_offload(m);
1244 
1245 	tx_q->m_table[len] = m;
1246 	len++;
1247 	if (enable_stats) {
1248 		dev_statistics[dev->device_fh].tx_total++;
1249 		dev_statistics[dev->device_fh].tx++;
1250 	}
1251 
1252 	if (unlikely(len == MAX_PKT_BURST)) {
1253 		m_table = (struct rte_mbuf **)tx_q->m_table;
1254 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1255 		/* Free any buffers not handled by TX and update the port stats. */
1256 		if (unlikely(ret < len)) {
1257 			do {
1258 				rte_pktmbuf_free(m_table[ret]);
1259 			} while (++ret < len);
1260 		}
1261 
1262 		len = 0;
1263 	}
1264 
1265 	tx_q->len = len;
1266 	return;
1267 }
1268 /*
1269  * This function is called by each data core. It handles all RX/TX registered with the
1270  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1271  * with all devices in the main linked list.
1272  */
1273 static int
1274 switch_worker(__attribute__((unused)) void *arg)
1275 {
1276 	struct rte_mempool *mbuf_pool = arg;
1277 	struct virtio_net *dev = NULL;
1278 	struct vhost_dev *vdev = NULL;
1279 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1280 	struct virtio_net_data_ll *dev_ll;
1281 	struct mbuf_table *tx_q;
1282 	volatile struct lcore_ll_info *lcore_ll;
1283 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1284 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1285 	unsigned ret, i;
1286 	const uint16_t lcore_id = rte_lcore_id();
1287 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1288 	uint16_t rx_count = 0;
1289 	uint16_t tx_count;
1290 	uint32_t retry = 0;
1291 
1292 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1293 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1294 	prev_tsc = 0;
1295 
1296 	tx_q = &lcore_tx_queue[lcore_id];
1297 	for (i = 0; i < num_cores; i ++) {
1298 		if (lcore_ids[i] == lcore_id) {
1299 			tx_q->txq_id = i;
1300 			break;
1301 		}
1302 	}
1303 
1304 	while(1) {
1305 		cur_tsc = rte_rdtsc();
1306 		/*
1307 		 * TX burst queue drain
1308 		 */
1309 		diff_tsc = cur_tsc - prev_tsc;
1310 		if (unlikely(diff_tsc > drain_tsc)) {
1311 
1312 			if (tx_q->len) {
1313 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1314 
1315 				/*Tx any packets in the queue*/
1316 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1317 									   (struct rte_mbuf **)tx_q->m_table,
1318 									   (uint16_t)tx_q->len);
1319 				if (unlikely(ret < tx_q->len)) {
1320 					do {
1321 						rte_pktmbuf_free(tx_q->m_table[ret]);
1322 					} while (++ret < tx_q->len);
1323 				}
1324 
1325 				tx_q->len = 0;
1326 			}
1327 
1328 			prev_tsc = cur_tsc;
1329 
1330 		}
1331 
1332 		rte_prefetch0(lcore_ll->ll_root_used);
1333 		/*
1334 		 * Inform the configuration core that we have exited the linked list and that no devices are
1335 		 * in use if requested.
1336 		 */
1337 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1338 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1339 
1340 		/*
1341 		 * Process devices
1342 		 */
1343 		dev_ll = lcore_ll->ll_root_used;
1344 
1345 		while (dev_ll != NULL) {
1346 			/*get virtio device ID*/
1347 			vdev = dev_ll->vdev;
1348 			dev = vdev->dev;
1349 
1350 			if (unlikely(vdev->remove)) {
1351 				dev_ll = dev_ll->next;
1352 				unlink_vmdq(vdev);
1353 				vdev->ready = DEVICE_SAFE_REMOVE;
1354 				continue;
1355 			}
1356 			if (likely(vdev->ready == DEVICE_RX)) {
1357 				/*Handle guest RX*/
1358 				rx_count = rte_eth_rx_burst(ports[0],
1359 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1360 
1361 				if (rx_count) {
1362 					/*
1363 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1364 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1365 					*/
1366 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1367 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1368 							rte_delay_us(burst_rx_delay_time);
1369 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1370 								break;
1371 						}
1372 					}
1373 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1374 					if (enable_stats) {
1375 						rte_atomic64_add(
1376 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1377 						rx_count);
1378 						rte_atomic64_add(
1379 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1380 					}
1381 					while (likely(rx_count)) {
1382 						rx_count--;
1383 						rte_pktmbuf_free(pkts_burst[rx_count]);
1384 					}
1385 
1386 				}
1387 			}
1388 
1389 			if (likely(!vdev->remove)) {
1390 				/* Handle guest TX*/
1391 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1392 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1393 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1394 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1395 						while (tx_count)
1396 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1397 					}
1398 				}
1399 				for (i = 0; i < tx_count; ++i) {
1400 					virtio_tx_route(vdev, pkts_burst[i],
1401 						vlan_tags[(uint16_t)dev->device_fh]);
1402 				}
1403 			}
1404 
1405 			/*move to the next device in the list*/
1406 			dev_ll = dev_ll->next;
1407 		}
1408 	}
1409 
1410 	return 0;
1411 }
1412 
1413 /*
1414  * This function gets the number of available ring entries for zero copy RX.
1415  * Only one thread will call this function for a particular virtio device,
1416  * so it is designed as a non-thread-safe function.
1417  */
1418 static inline uint32_t __attribute__((always_inline))
1419 get_available_ring_num_zcp(struct virtio_net *dev)
1420 {
1421 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1422 	uint16_t avail_idx;
1423 
1424 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1425 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1426 }
1427 
1428 /*
1429  * This function gets an available ring index for zero copy RX;
1430  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1431  * Only one thread will call this function for a particular virtio device,
1432  * so it is designed as a non-thread-safe function.
1433  */
1434 static inline uint32_t __attribute__((always_inline))
1435 get_available_ring_index_zcp(struct virtio_net *dev,
1436 	uint16_t *res_base_idx, uint32_t count)
1437 {
1438 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1439 	uint16_t avail_idx;
1440 	uint32_t retry = 0;
1441 	uint16_t free_entries;
1442 
1443 	*res_base_idx = vq->last_used_idx_res;
1444 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1445 	free_entries = (avail_idx - *res_base_idx);
1446 
1447 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1448 			"avail idx: %d, "
1449 			"res base idx:%d, free entries:%d\n",
1450 			dev->device_fh, avail_idx, *res_base_idx,
1451 			free_entries);
1452 
1453 	/*
1454 	 * If retry is enabled and the queue is full then we wait
1455 	 * and retry to avoid packet loss.
1456 	 */
1457 	if (enable_retry && unlikely(count > free_entries)) {
1458 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1459 			rte_delay_us(burst_rx_delay_time);
1460 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1461 			free_entries = (avail_idx - *res_base_idx);
1462 			if (count <= free_entries)
1463 				break;
1464 		}
1465 	}
1466 
1467 	/*check that we have enough buffers*/
1468 	if (unlikely(count > free_entries))
1469 		count = free_entries;
1470 
1471 	if (unlikely(count == 0)) {
1472 		LOG_DEBUG(VHOST_DATA,
1473 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1474 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1475 			dev->device_fh, avail_idx,
1476 			*res_base_idx, free_entries);
1477 		return 0;
1478 	}
1479 
1480 	vq->last_used_idx_res = *res_base_idx + count;
1481 
1482 	return count;
1483 }
1484 
1485 /*
1486  * This function puts a descriptor back on the used list.
1487  */
1488 static inline void __attribute__((always_inline))
1489 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1490 {
1491 	uint16_t res_cur_idx = vq->last_used_idx;
1492 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1493 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1494 	rte_compiler_barrier();
1495 	*(volatile uint16_t *)&vq->used->idx += 1;
1496 	vq->last_used_idx += 1;
1497 
1498 	/* Kick the guest if necessary. */
1499 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1500 		eventfd_write(vq->callfd, (eventfd_t)1);
1501 }
1502 
1503 /*
1504  * This function gets an available descriptor from the virtio vring and an
1505  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1506  * to adjust the offset for buff_addr and phys_addr according to the PMD
1507  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1508  */
1509 static inline void __attribute__((always_inline))
1510 attach_rxmbuf_zcp(struct virtio_net *dev)
1511 {
1512 	uint16_t res_base_idx, desc_idx;
1513 	uint64_t buff_addr, phys_addr;
1514 	struct vhost_virtqueue *vq;
1515 	struct vring_desc *desc;
1516 	void *obj = NULL;
1517 	struct rte_mbuf *mbuf;
1518 	struct vpool *vpool;
1519 	hpa_type addr_type;
1520 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1521 
1522 	vpool = &vpool_array[vdev->vmdq_rx_q];
1523 	vq = dev->virtqueue[VIRTIO_RXQ];
1524 
1525 	do {
1526 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1527 				1) != 1))
1528 			return;
1529 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1530 
1531 		desc = &vq->desc[desc_idx];
1532 		if (desc->flags & VRING_DESC_F_NEXT) {
1533 			desc = &vq->desc[desc->next];
1534 			buff_addr = gpa_to_vva(dev, desc->addr);
1535 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1536 					&addr_type);
1537 		} else {
1538 			buff_addr = gpa_to_vva(dev,
1539 					desc->addr + vq->vhost_hlen);
1540 			phys_addr = gpa_to_hpa(vdev,
1541 					desc->addr + vq->vhost_hlen,
1542 					desc->len, &addr_type);
1543 		}
1544 
1545 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1546 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1547 				" address found when attaching RX frame buffer"
1548 				" address!\n", dev->device_fh);
1549 			put_desc_to_used_list_zcp(vq, desc_idx);
1550 			continue;
1551 		}
1552 
1553 		/*
1554 		 * Check if the frame buffer address from guest crosses
1555 		 * sub-region or not.
1556 		 */
1557 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1558 			RTE_LOG(ERR, VHOST_DATA,
1559 				"(%"PRIu64") Frame buffer address crossing a "
1560 				"sub-region found when attaching RX frame "
1561 				"buffer address!\n",
1562 				dev->device_fh);
1563 			put_desc_to_used_list_zcp(vq, desc_idx);
1564 			continue;
1565 		}
1566 	} while (unlikely(phys_addr == 0));
1567 
1568 	rte_ring_sc_dequeue(vpool->ring, &obj);
1569 	mbuf = obj;
1570 	if (unlikely(mbuf == NULL)) {
1571 		LOG_DEBUG(VHOST_DATA,
1572 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1573 			"ring_sc_dequeue fail.\n",
1574 			dev->device_fh);
1575 		put_desc_to_used_list_zcp(vq, desc_idx);
1576 		return;
1577 	}
1578 
1579 	if (unlikely(vpool->buf_size > desc->len)) {
1580 		LOG_DEBUG(VHOST_DATA,
1581 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1582 			"length(%d) of descriptor idx: %d less than room "
1583 			"size required: %d\n",
1584 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1585 		put_desc_to_used_list_zcp(vq, desc_idx);
1586 		rte_ring_sp_enqueue(vpool->ring, obj);
1587 		return;
1588 	}
1589 
1590 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1591 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1592 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1593 	mbuf->data_len = desc->len;
1594 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1595 
1596 	LOG_DEBUG(VHOST_DATA,
1597 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1598 		"descriptor idx:%d\n",
1599 		dev->device_fh, res_base_idx, desc_idx);
1600 
1601 	__rte_mbuf_raw_free(mbuf);
1602 
1603 	return;
1604 }
1605 
1606 /*
1607  * Detach an attached packet mbuf -
1608  *  - restore original mbuf address and length values.
1609  *  - reset pktmbuf data and data_len to their default values.
1610  *  All other fields of the given packet mbuf will be left intact.
1611  *
1612  * @param m
1613  *   The attached packet mbuf.
1614  */
1615 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1616 {
1617 	const struct rte_mempool *mp = m->pool;
1618 	void *buf = rte_mbuf_to_baddr(m);
1619 	uint32_t buf_ofs;
1620 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1621 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1622 
1623 	m->buf_addr = buf;
1624 	m->buf_len = (uint16_t)buf_len;
1625 
1626 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1627 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1628 	m->data_off = buf_ofs;
1629 
1630 	m->data_len = 0;
1631 }
1632 
1633 /*
1634  * This function is called after packets have been transmitted. It fetches mbufs
1635  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1636  * the used index and kicks the guest if necessary.
1637  */
1638 static inline uint32_t __attribute__((always_inline))
1639 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1640 {
1641 	struct rte_mbuf *mbuf;
1642 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1643 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1644 	uint32_t index = 0;
1645 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1646 
1647 	LOG_DEBUG(VHOST_DATA,
1648 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1649 		"clean is: %d\n",
1650 		dev->device_fh, mbuf_count);
1651 	LOG_DEBUG(VHOST_DATA,
1652 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1653 		"clean  is : %d\n",
1654 		dev->device_fh, rte_ring_count(vpool->ring));
1655 
1656 	for (index = 0; index < mbuf_count; index++) {
1657 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1658 		if (likely(MBUF_EXT_MEM(mbuf)))
1659 			pktmbuf_detach_zcp(mbuf);
1660 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1661 
1662 		/* Update used index buffer information. */
1663 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1664 		vq->used->ring[used_idx].len = 0;
1665 
1666 		used_idx = (used_idx + 1) & (vq->size - 1);
1667 	}
1668 
1669 	LOG_DEBUG(VHOST_DATA,
1670 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1671 		"clean is: %d\n",
1672 		dev->device_fh, rte_mempool_count(vpool->pool));
1673 	LOG_DEBUG(VHOST_DATA,
1674 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1675 		"clean  is : %d\n",
1676 		dev->device_fh, rte_ring_count(vpool->ring));
1677 	LOG_DEBUG(VHOST_DATA,
1678 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1679 		"vq->last_used_idx:%d\n",
1680 		dev->device_fh, vq->last_used_idx);
1681 
1682 	vq->last_used_idx += mbuf_count;
1683 
1684 	LOG_DEBUG(VHOST_DATA,
1685 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1686 		"vq->last_used_idx:%d\n",
1687 		dev->device_fh, vq->last_used_idx);
1688 
1689 	rte_compiler_barrier();
1690 
1691 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1692 
1693 	/* Kick guest if required. */
1694 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1695 		eventfd_write(vq->callfd, (eventfd_t)1);
1696 
1697 	return 0;
1698 }
1699 
1700 /*
1701  * This function is called when a virtio device is destroyed. It fetches
1702  * mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1703  */
1704 static void mbuf_destroy_zcp(struct vpool *vpool)
1705 {
1706 	struct rte_mbuf *mbuf = NULL;
1707 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1708 
1709 	LOG_DEBUG(VHOST_CONFIG,
1710 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1711 		"mbuf_destroy_zcp is: %d\n",
1712 		mbuf_count);
1713 	LOG_DEBUG(VHOST_CONFIG,
1714 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1715 		"mbuf_destroy_zcp  is : %d\n",
1716 		rte_ring_count(vpool->ring));
1717 
1718 	for (index = 0; index < mbuf_count; index++) {
1719 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1720 		if (likely(mbuf != NULL)) {
1721 			if (likely(MBUF_EXT_MEM(mbuf)))
1722 				pktmbuf_detach_zcp(mbuf);
1723 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1724 		}
1725 	}
1726 
1727 	LOG_DEBUG(VHOST_CONFIG,
1728 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1729 		"mbuf_destroy_zcp is: %d\n",
1730 		rte_mempool_count(vpool->pool));
1731 	LOG_DEBUG(VHOST_CONFIG,
1732 		"in mbuf_destroy_zcp: mbuf count in ring after "
1733 		"mbuf_destroy_zcp is : %d\n",
1734 		rte_ring_count(vpool->ring));
1735 }
1736 
1737 /*
1738  * This function updates the used ring and used index for zero-copy RX packets.
1739  */
1740 static inline uint32_t __attribute__((always_inline))
1741 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1742 	uint32_t count)
1743 {
1744 	struct vhost_virtqueue *vq;
1745 	struct vring_desc *desc;
1746 	struct rte_mbuf *buff;
1747 	/* The virtio_hdr is initialised to 0. */
1748 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1749 		= {{0, 0, 0, 0, 0, 0}, 0};
1750 	uint64_t buff_hdr_addr = 0;
1751 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1752 	uint32_t head_idx, packet_success = 0;
1753 	uint16_t res_cur_idx;
1754 
1755 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1756 
1757 	if (count == 0)
1758 		return 0;
1759 
1760 	vq = dev->virtqueue[VIRTIO_RXQ];
1761 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1762 
1763 	res_cur_idx = vq->last_used_idx;
1764 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1765 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1766 
1767 	/* Retrieve all of the head indexes first to avoid caching issues. */
1768 	for (head_idx = 0; head_idx < count; head_idx++)
1769 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1770 
1771 	/*Prefetch descriptor index. */
1772 	rte_prefetch0(&vq->desc[head[packet_success]]);
1773 
1774 	while (packet_success != count) {
1775 		/* Get descriptor from available ring */
1776 		desc = &vq->desc[head[packet_success]];
1777 
1778 		buff = pkts[packet_success];
1779 		LOG_DEBUG(VHOST_DATA,
1780 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1781 			"pkt[%d] descriptor idx: %d\n",
1782 			dev->device_fh, packet_success,
1783 			MBUF_HEADROOM_UINT32(buff));
1784 
1785 		PRINT_PACKET(dev,
1786 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1787 			+ RTE_PKTMBUF_HEADROOM),
1788 			rte_pktmbuf_data_len(buff), 0);
1789 
1790 		/* Buffer address translation for virtio header. */
1791 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1792 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1793 
1794 		/*
1795 		 * If the descriptors are chained the header and data are
1796 		 * placed in separate buffers.
1797 		 */
1798 		if (desc->flags & VRING_DESC_F_NEXT) {
1799 			desc->len = vq->vhost_hlen;
1800 			desc = &vq->desc[desc->next];
1801 			desc->len = rte_pktmbuf_data_len(buff);
1802 		} else {
1803 			desc->len = packet_len;
1804 		}
1805 
1806 		/* Update used ring with desc information */
1807 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1808 			= head[packet_success];
1809 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1810 			= packet_len;
1811 		res_cur_idx++;
1812 		packet_success++;
1813 
1814 		/* A header is required per buffer. */
1815 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1816 			(const void *)&virtio_hdr, vq->vhost_hlen);
1817 
1818 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1819 
1820 		if (likely(packet_success < count)) {
1821 			/* Prefetch descriptor index. */
1822 			rte_prefetch0(&vq->desc[head[packet_success]]);
1823 		}
1824 	}
1825 
1826 	rte_compiler_barrier();
1827 
1828 	LOG_DEBUG(VHOST_DATA,
1829 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1830 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1831 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1832 
1833 	*(volatile uint16_t *)&vq->used->idx += count;
1834 	vq->last_used_idx += count;
1835 
1836 	LOG_DEBUG(VHOST_DATA,
1837 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1838 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1839 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1840 
1841 	/* Kick the guest if necessary. */
1842 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1843 		eventfd_write(vq->callfd, (eventfd_t)1);
1844 
1845 	return count;
1846 }
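
/*
 * Illustrative sketch (not part of the original source): the publish-and-kick
 * sequence used at the end of txmbuf_clean_zcp() and virtio_dev_rx_zcp()
 * above.  The compiler barrier ensures the used-ring entries are written
 * before the used index becomes visible to the guest.
 */
static inline void __attribute__((unused))
publish_used_and_kick_sketch(struct vhost_virtqueue *vq, uint16_t count)
{
	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	/* Only interrupt the guest if it has not suppressed notifications. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
}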
1847 
1848 /*
1849  * This function routes the TX packet to the correct interface.
1850  * This may be a local device or the physical port.
1851  */
1852 static inline void __attribute__((always_inline))
1853 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1854 	uint32_t desc_idx, uint8_t need_copy)
1855 {
1856 	struct mbuf_table *tx_q;
1857 	struct rte_mbuf **m_table;
1858 	void *obj = NULL;
1859 	struct rte_mbuf *mbuf;
1860 	unsigned len, ret, offset = 0;
1861 	struct vpool *vpool;
1862 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1863 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1864 
1865 	/*Add packet to the port tx queue*/
1866 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1867 	len = tx_q->len;
1868 
1869 	/* Allocate an mbuf and populate the structure. */
1870 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1871 	rte_ring_sc_dequeue(vpool->ring, &obj);
1872 	mbuf = obj;
1873 	if (unlikely(mbuf == NULL)) {
1874 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1875 		RTE_LOG(ERR, VHOST_DATA,
1876 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1877 			dev->device_fh);
1878 		put_desc_to_used_list_zcp(vq, desc_idx);
1879 		return;
1880 	}
1881 
1882 	if (vm2vm_mode == VM2VM_HARDWARE) {
1883 		/* Avoid using a vlan tag from any vm for an external pkt,
1884 		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1885 		 * with pool selection: the MAC address marks it as an external
1886 		 * pkt that should go to the network, while the vlan tag marks
1887 		 * it as a vm2vm pkt to be forwarded to another vm. The hardware
1888 		 * cannot resolve this ambiguity, so the pkt would be lost.
1889 		 */
1890 		vlan_tag = external_pkt_default_vlan_tag;
1891 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1892 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1893 			__rte_mbuf_raw_free(mbuf);
1894 			return;
1895 		}
1896 	}
1897 
1898 	mbuf->nb_segs = m->nb_segs;
1899 	mbuf->next = m->next;
1900 	mbuf->data_len = m->data_len + offset;
1901 	mbuf->pkt_len = mbuf->data_len;
1902 	if (unlikely(need_copy)) {
1903 		/* Copy the packet contents to the mbuf. */
1904 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1905 			rte_pktmbuf_mtod(m, void *),
1906 			m->data_len);
1907 	} else {
1908 		mbuf->data_off = m->data_off;
1909 		mbuf->buf_physaddr = m->buf_physaddr;
1910 		mbuf->buf_addr = m->buf_addr;
1911 	}
1912 	mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1913 	mbuf->vlan_tci = vlan_tag;
1914 	mbuf->l2_len = sizeof(struct ether_hdr);
1915 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1916 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1917 
1918 	tx_q->m_table[len] = mbuf;
1919 	len++;
1920 
1921 	LOG_DEBUG(VHOST_DATA,
1922 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1923 		dev->device_fh,
1924 		mbuf->nb_segs,
1925 		(mbuf->next == NULL) ? "null" : "non-null");
1926 
1927 	if (enable_stats) {
1928 		dev_statistics[dev->device_fh].tx_total++;
1929 		dev_statistics[dev->device_fh].tx++;
1930 	}
1931 
1932 	if (unlikely(len == MAX_PKT_BURST)) {
1933 		m_table = (struct rte_mbuf **)tx_q->m_table;
1934 		ret = rte_eth_tx_burst(ports[0],
1935 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1936 
1937 		/*
1938 		 * Free any buffers not handled by TX and update
1939 		 * the port stats.
1940 		 */
1941 		if (unlikely(ret < len)) {
1942 			do {
1943 				rte_pktmbuf_free(m_table[ret]);
1944 			} while (++ret < len);
1945 		}
1946 
1947 		len = 0;
1948 		txmbuf_clean_zcp(dev, vpool);
1949 	}
1950 
1951 	tx_q->len = len;
1952 
1953 	return;
1954 }
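
/*
 * Illustrative sketch (not part of the original source): flush a zero-copy TX
 * queue to the physical port and free whatever the NIC did not accept.  This
 * is the same pattern virtio_tx_route_zcp() uses when the burst table fills
 * up and switch_worker_zcp() uses when the drain timer expires.
 */
static inline void __attribute__((unused))
flush_tx_queue_zcp_sketch(struct mbuf_table *tx_q)
{
	unsigned sent;

	if (tx_q->len == 0)
		return;

	sent = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
		(struct rte_mbuf **)tx_q->m_table, (uint16_t)tx_q->len);

	/* Packets the NIC did not take must be freed by the application. */
	while (sent < tx_q->len)
		rte_pktmbuf_free(tx_q->m_table[sent++]);

	tx_q->len = 0;
}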
1955 
1956 /*
1957  * This function transmits all available packets in the virtio TX queue for
1958  * one virtio-net device. If it is the first packet, it learns the MAC
1959  * address and sets up VMDQ.
1960  */
1961 static inline void __attribute__((always_inline))
1962 virtio_dev_tx_zcp(struct virtio_net *dev)
1963 {
1964 	struct rte_mbuf m;
1965 	struct vhost_virtqueue *vq;
1966 	struct vring_desc *desc;
1967 	uint64_t buff_addr = 0, phys_addr;
1968 	uint32_t head[MAX_PKT_BURST];
1969 	uint32_t i;
1970 	uint16_t free_entries, packet_success = 0;
1971 	uint16_t avail_idx;
1972 	uint8_t need_copy = 0;
1973 	hpa_type addr_type;
1974 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1975 
1976 	vq = dev->virtqueue[VIRTIO_TXQ];
1977 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1978 
1979 	/* If there are no available buffers then return. */
1980 	if (vq->last_used_idx_res == avail_idx)
1981 		return;
1982 
1983 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1984 
1985 	/* Prefetch available ring to retrieve head indexes. */
1986 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1987 
1988 	/* Get the number of free entries in the ring */
1989 	free_entries = (avail_idx - vq->last_used_idx_res);
1990 
1991 	/* Limit to MAX_PKT_BURST. */
1992 	free_entries
1993 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1994 
1995 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1996 		dev->device_fh, free_entries);
1997 
1998 	/* Retrieve all of the head indexes first to avoid caching issues. */
1999 	for (i = 0; i < free_entries; i++)
2000 		head[i]
2001 			= vq->avail->ring[(vq->last_used_idx_res + i)
2002 			& (vq->size - 1)];
2003 
2004 	vq->last_used_idx_res += free_entries;
2005 
2006 	/* Prefetch descriptor index. */
2007 	rte_prefetch0(&vq->desc[head[packet_success]]);
2008 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2009 
2010 	while (packet_success < free_entries) {
2011 		desc = &vq->desc[head[packet_success]];
2012 
2013 		/* Discard first buffer as it is the virtio header */
2014 		desc = &vq->desc[desc->next];
2015 
2016 		/* Buffer address translation. */
2017 		buff_addr = gpa_to_vva(dev, desc->addr);
2018 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
2019 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2020 			&addr_type);
2021 
2022 		if (likely(packet_success < (free_entries - 1)))
2023 			/* Prefetch descriptor index. */
2024 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2025 
2026 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2027 			RTE_LOG(ERR, VHOST_DATA,
2028 				"(%"PRIu64") Invalid frame buffer address "
2029 				"found when transmitting packets!\n",
2030 				dev->device_fh);
2031 			packet_success++;
2032 			continue;
2033 		}
2034 
2035 		/* Prefetch buffer address. */
2036 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2037 
2038 		/*
2039 		 * Setup dummy mbuf. This is copied to a real mbuf if
2040 		 * transmitted out the physical port.
2041 		 */
2042 		m.data_len = desc->len;
2043 		m.nb_segs = 1;
2044 		m.next = NULL;
2045 		m.data_off = 0;
2046 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2047 		m.buf_physaddr = phys_addr;
2048 
2049 		/*
2050 		 * Check if the frame buffer address from guest crosses
2051 		 * sub-region or not.
2052 		 */
2053 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2054 			RTE_LOG(ERR, VHOST_DATA,
2055 				"(%"PRIu64") Frame buffer address crossing "
2056 				"sub-region found when attaching TX frame "
2057 				"buffer address!\n",
2058 				dev->device_fh);
2059 			need_copy = 1;
2060 		} else
2061 			need_copy = 0;
2062 
2063 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2064 
2065 		/*
2066 		 * If this is the first received packet we need to learn
2067 		 * the MAC address and set up VMDQ
2068 		 */
2069 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2070 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2071 				/*
2072 				 * Discard frame if device is scheduled for
2073 				 * removal or a duplicate MAC address is found.
2074 				 */
2075 				packet_success += free_entries;
2076 				vq->last_used_idx += packet_success;
2077 				break;
2078 			}
2079 		}
2080 
2081 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2082 		packet_success++;
2083 	}
2084 }
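
/*
 * Illustrative sketch (not part of the original source): the avail-ring
 * harvesting pattern used by virtio_dev_tx_zcp() above - read the (volatile)
 * avail index once, clamp the batch size, and copy out the head descriptor
 * indexes before touching the descriptors themselves.
 */
static inline uint16_t __attribute__((unused))
harvest_avail_heads_sketch(struct vhost_virtqueue *vq, uint32_t *heads,
	uint16_t max_burst)
{
	uint16_t avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	uint16_t count = (uint16_t)(avail_idx - vq->last_used_idx_res);
	uint16_t i;

	if (count > max_burst)
		count = max_burst;

	for (i = 0; i < count; i++)
		heads[i] = vq->avail->ring[(vq->last_used_idx_res + i)
			& (vq->size - 1)];

	/* Reserve these entries so the next poll starts after them. */
	vq->last_used_idx_res += count;

	return count;
}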
2085 
2086 /*
2087  * This function is called by each data core. It handles all RX/TX registered
2088  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2089  * addresses are compared with all devices in the main linked list.
2090  */
2091 static int
2092 switch_worker_zcp(__attribute__((unused)) void *arg)
2093 {
2094 	struct virtio_net *dev = NULL;
2095 	struct vhost_dev  *vdev = NULL;
2096 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2097 	struct virtio_net_data_ll *dev_ll;
2098 	struct mbuf_table *tx_q;
2099 	volatile struct lcore_ll_info *lcore_ll;
2100 	const uint64_t drain_tsc
2101 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2102 		* BURST_TX_DRAIN_US;
2103 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2104 	unsigned ret;
2105 	const uint16_t lcore_id = rte_lcore_id();
2106 	uint16_t count_in_ring, rx_count = 0;
2107 
2108 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2109 
2110 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2111 	prev_tsc = 0;
2112 
2113 	while (1) {
2114 		cur_tsc = rte_rdtsc();
2115 
2116 		/* TX burst queue drain */
2117 		diff_tsc = cur_tsc - prev_tsc;
2118 		if (unlikely(diff_tsc > drain_tsc)) {
2119 			/*
2120 			 * Get mbufs from vpool.pool, detach them and
2121 			 * put them back into vpool.ring.
2122 			 */
2123 			dev_ll = lcore_ll->ll_root_used;
2124 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2125 				/* Get virtio device ID */
2126 				vdev = dev_ll->vdev;
2127 				dev = vdev->dev;
2128 
2129 				if (likely(!vdev->remove)) {
2130 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2131 					if (tx_q->len) {
2132 						LOG_DEBUG(VHOST_DATA,
2133 						"TX queue drained after timeout"
2134 						" with burst size %u\n",
2135 						tx_q->len);
2136 
2137 						/*
2138 						 * Tx any packets in the queue
2139 						 */
2140 						ret = rte_eth_tx_burst(
2141 							ports[0],
2142 							(uint16_t)tx_q->txq_id,
2143 							(struct rte_mbuf **)
2144 							tx_q->m_table,
2145 							(uint16_t)tx_q->len);
2146 						if (unlikely(ret < tx_q->len)) {
2147 							do {
2148 								rte_pktmbuf_free(
2149 									tx_q->m_table[ret]);
2150 							} while (++ret < tx_q->len);
2151 						}
2152 						tx_q->len = 0;
2153 
2154 						txmbuf_clean_zcp(dev,
2155 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2156 					}
2157 				}
2158 				dev_ll = dev_ll->next;
2159 			}
2160 			prev_tsc = cur_tsc;
2161 		}
2162 
2163 		rte_prefetch0(lcore_ll->ll_root_used);
2164 
2165 		/*
2166 		 * Inform the configuration core that we have exited the linked
2167 		 * list and that no devices are in use if requested.
2168 		 */
2169 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2170 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2171 
2172 		/* Process devices */
2173 		dev_ll = lcore_ll->ll_root_used;
2174 
2175 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2176 			vdev = dev_ll->vdev;
2177 			dev  = vdev->dev;
2178 			if (unlikely(vdev->remove)) {
2179 				dev_ll = dev_ll->next;
2180 				unlink_vmdq(vdev);
2181 				vdev->ready = DEVICE_SAFE_REMOVE;
2182 				continue;
2183 			}
2184 
2185 			if (likely(vdev->ready == DEVICE_RX)) {
2186 				uint32_t index = vdev->vmdq_rx_q;
2187 				uint16_t i;
2188 				count_in_ring
2189 				= rte_ring_count(vpool_array[index].ring);
2190 				uint16_t free_entries
2191 				= (uint16_t)get_available_ring_num_zcp(dev);
2192 
2193 				/*
2194 				 * Attach all mbufs in vpool.ring and put back
2195 				 * into vpool.pool.
2196 				 */
2197 				for (i = 0;
2198 				i < RTE_MIN(free_entries,
2199 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2200 				i++)
2201 					attach_rxmbuf_zcp(dev);
2202 
2203 				/* Handle guest RX */
2204 				rx_count = rte_eth_rx_burst(ports[0],
2205 					vdev->vmdq_rx_q, pkts_burst,
2206 					MAX_PKT_BURST);
2207 
2208 				if (rx_count) {
2209 					ret_count = virtio_dev_rx_zcp(dev,
2210 							pkts_burst, rx_count);
2211 					if (enable_stats) {
2212 						dev_statistics[dev->device_fh].rx_total
2213 							+= rx_count;
2214 						dev_statistics[dev->device_fh].rx
2215 							+= ret_count;
2216 					}
2217 					while (likely(rx_count)) {
2218 						rx_count--;
2219 						pktmbuf_detach_zcp(
2220 							pkts_burst[rx_count]);
2221 						rte_ring_sp_enqueue(
2222 							vpool_array[index].ring,
2223 							(void *)pkts_burst[rx_count]);
2224 					}
2225 				}
2226 			}
2227 
2228 			if (likely(!vdev->remove))
2229 				/* Handle guest TX */
2230 				virtio_dev_tx_zcp(dev);
2231 
2232 			/* Move to the next device in the list */
2233 			dev_ll = dev_ll->next;
2234 		}
2235 	}
2236 
2237 	return 0;
2238 }
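
/*
 * Illustrative sketch (not part of the original source): the TSC-based drain
 * timer used by switch_worker_zcp() above, reduced to its core.  It returns 1
 * once at least BURST_TX_DRAIN_US microseconds have elapsed since *prev_tsc
 * and resets the timestamp.
 */
static inline int __attribute__((unused))
drain_timer_expired_sketch(uint64_t *prev_tsc)
{
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
		/ US_PER_S * BURST_TX_DRAIN_US;
	uint64_t cur_tsc = rte_rdtsc();

	if (cur_tsc - *prev_tsc > drain_tsc) {
		*prev_tsc = cur_tsc;
		return 1;
	}

	return 0;
}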
2239 
2240 
2241 /*
2242  * Add an entry to a used linked list. A free entry must first be found
2243  * in the free linked list using get_data_ll_free_entry();
2244  */
2245 static void
2246 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2247 	struct virtio_net_data_ll *ll_dev)
2248 {
2249 	struct virtio_net_data_ll *ll = *ll_root_addr;
2250 
2251 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2252 	ll_dev->next = NULL;
2253 	rte_compiler_barrier();
2254 
2255 	/* If ll == NULL then this is the first device. */
2256 	if (ll) {
2257 		/* Increment to the tail of the linked list. */
2258 		while ((ll->next != NULL) )
2259 			ll = ll->next;
2260 
2261 		ll->next = ll_dev;
2262 	} else {
2263 		*ll_root_addr = ll_dev;
2264 	}
2265 }
2266 
2267 /*
2268  * Remove an entry from a used linked list. The entry must then be added to
2269  * the free linked list using put_data_ll_free_entry().
2270  */
2271 static void
2272 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2273 	struct virtio_net_data_ll *ll_dev,
2274 	struct virtio_net_data_ll *ll_dev_last)
2275 {
2276 	struct virtio_net_data_ll *ll = *ll_root_addr;
2277 
2278 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2279 		return;
2280 
2281 	if (ll_dev == ll)
2282 		*ll_root_addr = ll_dev->next;
2283 	else
2284 		if (likely(ll_dev_last != NULL))
2285 			ll_dev_last->next = ll_dev->next;
2286 		else
2287 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2288 }
2289 
2290 /*
2291  * Find and return an entry from the free linked list.
2292  */
2293 static struct virtio_net_data_ll *
2294 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2295 {
2296 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2297 	struct virtio_net_data_ll *ll_dev;
2298 
2299 	if (ll_free == NULL)
2300 		return NULL;
2301 
2302 	ll_dev = ll_free;
2303 	*ll_root_addr = ll_free->next;
2304 
2305 	return ll_dev;
2306 }
2307 
2308 /*
2309  * Place an entry back on to the free linked list.
2310  */
2311 static void
2312 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2313 	struct virtio_net_data_ll *ll_dev)
2314 {
2315 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2316 
2317 	if (ll_dev == NULL)
2318 		return;
2319 
2320 	ll_dev->next = ll_free;
2321 	*ll_root_addr = ll_dev;
2322 }
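
/*
 * Illustrative sketch (not part of the original source): typical life cycle
 * of an entry using the linked-list helpers above.  'my_vdev' stands in for
 * a struct vhost_dev the caller already owns.
 */
static inline int __attribute__((unused))
ll_entry_lifecycle_sketch(struct virtio_net_data_ll **free_root,
	struct virtio_net_data_ll **used_root, struct vhost_dev *my_vdev)
{
	struct virtio_net_data_ll *entry, *cur, *prev = NULL;

	/* 1. Take an entry from the free list. */
	entry = get_data_ll_free_entry(free_root);
	if (entry == NULL)
		return -1;

	/* 2. Fill it in and publish it on the used list. */
	entry->vdev = my_vdev;
	add_data_ll_entry(used_root, entry);

	/*
	 * 3. Later: walk the used list to find the previous entry, unlink
	 * this one and return it to the free list (this mirrors what
	 * destroy_device() does).
	 */
	for (cur = *used_root; (cur != NULL) && (cur != entry); cur = cur->next)
		prev = cur;
	rm_data_ll_entry(used_root, entry, prev);
	put_data_ll_free_entry(free_root, entry);

	return 0;
}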
2323 
2324 /*
2325  * Creates a linked list of a given size.
2326  */
2327 static struct virtio_net_data_ll *
2328 alloc_data_ll(uint32_t size)
2329 {
2330 	struct virtio_net_data_ll *ll_new;
2331 	uint32_t i;
2332 
2333 	/* Malloc and then chain the linked list. */
2334 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2335 	if (ll_new == NULL) {
2336 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2337 		return NULL;
2338 	}
2339 
2340 	for (i = 0; i < size - 1; i++) {
2341 		ll_new[i].vdev = NULL;
2342 		ll_new[i].next = &ll_new[i+1];
2343 	}
2344 	ll_new[i].next = NULL;
2345 
2346 	return ll_new;
2347 }
2348 
2349 /*
2350  * Create the main linked list along with each individual core's linked list. A used and a free list
2351  * are created to manage entries.
2352  */
2353 static int
2354 init_data_ll (void)
2355 {
2356 	int lcore;
2357 
2358 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2359 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2360 		if (lcore_info[lcore].lcore_ll == NULL) {
2361 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2362 			return -1;
2363 		}
2364 
2365 		lcore_info[lcore].lcore_ll->device_num = 0;
2366 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2367 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2368 		if (num_devices % num_switching_cores)
2369 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2370 		else
2371 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2372 	}
2373 
2374 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2375 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2376 
2377 	return 0;
2378 }
2379 
2380 /*
2381  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2382  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2383  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2384  */
2385 static void
2386 destroy_device (volatile struct virtio_net *dev)
2387 {
2388 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2389 	struct virtio_net_data_ll *ll_main_dev_cur;
2390 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2391 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2392 	struct vhost_dev *vdev;
2393 	int lcore;
2394 
2395 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2396 
2397 	vdev = (struct vhost_dev *)dev->priv;
2398 	/*set the remove flag. */
2399 	vdev->remove = 1;
2400 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2401 		rte_pause();
2402 	}
2403 
2404 	/* Search for entry to be removed from lcore ll */
2405 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2406 	while (ll_lcore_dev_cur != NULL) {
2407 		if (ll_lcore_dev_cur->vdev == vdev) {
2408 			break;
2409 		} else {
2410 			ll_lcore_dev_last = ll_lcore_dev_cur;
2411 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2412 		}
2413 	}
2414 
2415 	if (ll_lcore_dev_cur == NULL) {
2416 		RTE_LOG(ERR, VHOST_CONFIG,
2417 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2418 			dev->device_fh);
2419 		return;
2420 	}
2421 
2422 	/* Search for entry to be removed from main ll */
2423 	ll_main_dev_cur = ll_root_used;
2424 	ll_main_dev_last = NULL;
2425 	while (ll_main_dev_cur != NULL) {
2426 		if (ll_main_dev_cur->vdev == vdev) {
2427 			break;
2428 		} else {
2429 			ll_main_dev_last = ll_main_dev_cur;
2430 			ll_main_dev_cur = ll_main_dev_cur->next;
2431 		}
2432 	}
2433 
2434 	/* Remove entries from the lcore and main ll. */
2435 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2436 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2437 
2438 	/* Set the dev_removal_flag on each lcore. */
2439 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2440 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2441 	}
2442 
2443 	/*
2444 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2445 	 * they can no longer access the device removed from the linked lists and that the devices
2446 	 * are no longer in use.
2447 	 */
2448 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2449 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2450 			rte_pause();
2451 		}
2452 	}
2453 
2454 	/* Add the entries back to the lcore and main free ll.*/
2455 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2456 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2457 
2458 	/* Decrement number of device on the lcore. */
2459 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2460 
2461 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2462 
2463 	if (zero_copy) {
2464 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2465 
2466 		/* Stop the RX queue. */
2467 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2468 			LOG_DEBUG(VHOST_CONFIG,
2469 				"(%"PRIu64") In destroy_device: Failed to stop "
2470 				"rx queue:%d\n",
2471 				dev->device_fh,
2472 				vdev->vmdq_rx_q);
2473 		}
2474 
2475 		LOG_DEBUG(VHOST_CONFIG,
2476 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2477 			"mempool back to ring for RX queue: %d\n",
2478 			dev->device_fh, vdev->vmdq_rx_q);
2479 
2480 		mbuf_destroy_zcp(vpool);
2481 
2482 		/* Stop the TX queue. */
2483 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2484 			LOG_DEBUG(VHOST_CONFIG,
2485 				"(%"PRIu64") In destroy_device: Failed to "
2486 				"stop tx queue:%d\n",
2487 				dev->device_fh, vdev->vmdq_rx_q);
2488 		}
2489 
2490 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2491 
2492 		LOG_DEBUG(VHOST_CONFIG,
2493 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2494 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2495 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2496 			dev->device_fh);
2497 
2498 		mbuf_destroy_zcp(vpool);
2499 		rte_free(vdev->regions_hpa);
2500 	}
2501 	rte_free(vdev);
2502 
2503 }
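
/*
 * Illustrative sketch (not part of the original source): the two sides of the
 * dev_removal_flag handshake used by destroy_device() above.  The config core
 * requests removal on every lcore and spins until each worker, on its next
 * pass through the device list, has acknowledged (see switch_worker_zcp()).
 */
static inline void __attribute__((unused))
wait_for_removal_ack_sketch(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag =
			REQUEST_DEV_REMOVAL;
	}

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag
			!= ACK_DEV_REMOVAL)
			rte_pause();
	}
}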
2504 
2505 /*
2506  * Calculate the count of physically contiguous sub-regions for one
2507  * particular region whose vhost virtual address range is contiguous. The
2508  * region starts at vva_start, with a size of 'size' bytes.
2509  */
2510 static uint32_t
2511 check_hpa_regions(uint64_t vva_start, uint64_t size)
2512 {
2513 	uint32_t i, nregions = 0, page_size = getpagesize();
2514 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2515 	if (vva_start % page_size) {
2516 		LOG_DEBUG(VHOST_CONFIG,
2517 			"in check_continuous: vva start(%p) mod page_size(%d) "
2518 			"has remainder\n",
2519 			(void *)(uintptr_t)vva_start, page_size);
2520 		return 0;
2521 	}
2522 	if (size % page_size) {
2523 		LOG_DEBUG(VHOST_CONFIG,
2524 			"in check_continuous: "
2525 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2526 			size, page_size);
2527 		return 0;
2528 	}
2529 	for (i = 0; i < size - page_size; i = i + page_size) {
2530 		cur_phys_addr
2531 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2532 		next_phys_addr = rte_mem_virt2phy(
2533 			(void *)(uintptr_t)(vva_start + i + page_size));
2534 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2535 			++nregions;
2536 			LOG_DEBUG(VHOST_CONFIG,
2537 				"in check_continuous: hva addr:(%p) is not "
2538 				"continuous with hva addr:(%p), diff:%d\n",
2539 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2540 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2541 				+ page_size), page_size);
2542 			LOG_DEBUG(VHOST_CONFIG,
2543 				"in check_continuous: hpa addr:(%p) is not "
2544 				"continuous with hpa addr:(%p), "
2545 				"diff:(%"PRIu64")\n",
2546 				(void *)(uintptr_t)cur_phys_addr,
2547 				(void *)(uintptr_t)next_phys_addr,
2548 				(next_phys_addr-cur_phys_addr));
2549 		}
2550 	}
2551 	return nregions;
2552 }
2553 
2554 /*
2555  * Divide each region whose vhost virtual address range is contiguous into
2556  * sub-regions in which the physical addresses are also contiguous, and fill
2557  * the offset (to GPA), size and other information of each sub-region into
2558  * regions_hpa.
2559  */
2560 static uint32_t
2561 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2562 {
2563 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2564 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2565 
2566 	if (mem_region_hpa == NULL)
2567 		return 0;
2568 
2569 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2570 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2571 			virtio_memory->regions[regionidx].address_offset;
2572 		mem_region_hpa[regionidx_hpa].guest_phys_address
2573 			= virtio_memory->regions[regionidx].guest_phys_address;
2574 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2575 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2576 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2577 		LOG_DEBUG(VHOST_CONFIG,
2578 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2579 			regionidx_hpa,
2580 			(void *)(uintptr_t)
2581 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2582 		LOG_DEBUG(VHOST_CONFIG,
2583 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2584 			regionidx_hpa,
2585 			(void *)(uintptr_t)
2586 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2587 		for (i = 0, k = 0;
2588 			i < virtio_memory->regions[regionidx].memory_size -
2589 				page_size;
2590 			i += page_size) {
2591 			cur_phys_addr = rte_mem_virt2phy(
2592 					(void *)(uintptr_t)(vva_start + i));
2593 			next_phys_addr = rte_mem_virt2phy(
2594 					(void *)(uintptr_t)(vva_start +
2595 					i + page_size));
2596 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2597 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2598 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2599 					k + page_size;
2600 				mem_region_hpa[regionidx_hpa].memory_size
2601 					= k + page_size;
2602 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2603 					"phys addr end  [%d]:(%p)\n",
2604 					regionidx_hpa,
2605 					(void *)(uintptr_t)
2606 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2607 				LOG_DEBUG(VHOST_CONFIG,
2608 					"in fill_hpa_regions: guest phys addr "
2609 					"size [%d]:(%p)\n",
2610 					regionidx_hpa,
2611 					(void *)(uintptr_t)
2612 					(mem_region_hpa[regionidx_hpa].memory_size));
2613 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2614 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2615 				++regionidx_hpa;
2616 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2617 					next_phys_addr -
2618 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2619 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2620 					" phys addr start[%d]:(%p)\n",
2621 					regionidx_hpa,
2622 					(void *)(uintptr_t)
2623 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2624 				LOG_DEBUG(VHOST_CONFIG,
2625 					"in fill_hpa_regions: host  phys addr "
2626 					"start[%d]:(%p)\n",
2627 					regionidx_hpa,
2628 					(void *)(uintptr_t)
2629 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2630 				k = 0;
2631 			} else {
2632 				k += page_size;
2633 			}
2634 		}
2635 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2636 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2637 			+ k + page_size;
2638 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2639 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2640 			"[%d]:(%p)\n", regionidx_hpa,
2641 			(void *)(uintptr_t)
2642 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2643 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2644 			"[%d]:(%p)\n", regionidx_hpa,
2645 			(void *)(uintptr_t)
2646 			(mem_region_hpa[regionidx_hpa].memory_size));
2647 		++regionidx_hpa;
2648 	}
2649 	return regionidx_hpa;
2650 }
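
/*
 * Illustrative sketch (not part of the original source): how the regions_hpa
 * table produced above can be used to translate a guest physical address to
 * a host physical address.  The real translation is done by gpa_to_hpa();
 * this linear scan only illustrates the table layout.
 */
static inline uint64_t __attribute__((unused))
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions,
	uint32_t nregions, uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < nregions; i++) {
		if ((guest_pa >= regions[i].guest_phys_address) &&
			(guest_pa < regions[i].guest_phys_address_end))
			return guest_pa + regions[i].host_phys_addr_offset;
	}

	/* Not found: the guest physical address is invalid. */
	return 0;
}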
2651 
2652 /*
2653  * A new device is added to a data core. First the device is added to the main linked list
2654  * and then allocated to a specific data core.
2655  */
2656 static int
2657 new_device (struct virtio_net *dev)
2658 {
2659 	struct virtio_net_data_ll *ll_dev;
2660 	int lcore, core_add = 0;
2661 	uint32_t device_num_min = num_devices;
2662 	struct vhost_dev *vdev;
2663 	uint32_t regionidx;
2664 
2665 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2666 	if (vdev == NULL) {
2667 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2668 			dev->device_fh);
2669 		return -1;
2670 	}
2671 	vdev->dev = dev;
2672 	dev->priv = vdev;
2673 
2674 	if (zero_copy) {
2675 		vdev->nregions_hpa = dev->mem->nregions;
2676 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2677 			vdev->nregions_hpa
2678 				+= check_hpa_regions(
2679 					dev->mem->regions[regionidx].guest_phys_address
2680 					+ dev->mem->regions[regionidx].address_offset,
2681 					dev->mem->regions[regionidx].memory_size);
2682 
2683 		}
2684 
2685 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2686 					       vdev->nregions_hpa,
2687 					       sizeof(struct virtio_memory_regions_hpa),
2688 					       RTE_CACHE_LINE_SIZE);
2689 		if (vdev->regions_hpa == NULL) {
2690 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2691 			rte_free(vdev);
2692 			return -1;
2693 		}
2694 
2695 
2696 		if (fill_hpa_memory_regions(
2697 			vdev->regions_hpa, dev->mem
2698 			) != vdev->nregions_hpa) {
2699 
2700 			RTE_LOG(ERR, VHOST_CONFIG,
2701 				"hpa memory regions number mismatch: "
2702 				"[%d]\n", vdev->nregions_hpa);
2703 			rte_free(vdev->regions_hpa);
2704 			rte_free(vdev);
2705 			return -1;
2706 		}
2707 	}
2708 
2709 
2710 	/* Add device to main ll */
2711 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2712 	if (ll_dev == NULL) {
2713 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2714 			"of %d devices per core has been reached\n",
2715 			dev->device_fh, num_devices);
2716 		if (vdev->regions_hpa)
2717 			rte_free(vdev->regions_hpa);
2718 		rte_free(vdev);
2719 		return -1;
2720 	}
2721 	ll_dev->vdev = vdev;
2722 	add_data_ll_entry(&ll_root_used, ll_dev);
2723 	vdev->vmdq_rx_q
2724 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2725 
2726 	if (zero_copy) {
2727 		uint32_t index = vdev->vmdq_rx_q;
2728 		uint32_t count_in_ring, i;
2729 		struct mbuf_table *tx_q;
2730 
2731 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2732 
2733 		LOG_DEBUG(VHOST_CONFIG,
2734 			"(%"PRIu64") in new_device: mbuf count in mempool "
2735 			"before attach is: %d\n",
2736 			dev->device_fh,
2737 			rte_mempool_count(vpool_array[index].pool));
2738 		LOG_DEBUG(VHOST_CONFIG,
2739 			"(%"PRIu64") in new_device: mbuf count in  ring "
2740 			"before attach  is : %d\n",
2741 			dev->device_fh, count_in_ring);
2742 
2743 		/*
2744 		 * Attach all mbufs in vpool.ring and put back into vpool.pool.
2745 		 */
2746 		for (i = 0; i < count_in_ring; i++)
2747 			attach_rxmbuf_zcp(dev);
2748 
2749 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2750 			"mempool after attach is: %d\n",
2751 			dev->device_fh,
2752 			rte_mempool_count(vpool_array[index].pool));
2753 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2754 			"ring after attach  is : %d\n",
2755 			dev->device_fh,
2756 			rte_ring_count(vpool_array[index].ring));
2757 
2758 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2759 		tx_q->txq_id = vdev->vmdq_rx_q;
2760 
2761 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2762 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2763 
2764 			LOG_DEBUG(VHOST_CONFIG,
2765 				"(%"PRIu64") In new_device: Failed to start "
2766 				"tx queue:%d\n",
2767 				dev->device_fh, vdev->vmdq_rx_q);
2768 
2769 			mbuf_destroy_zcp(vpool);
2770 			rte_free(vdev->regions_hpa);
2771 			rte_free(vdev);
2772 			return -1;
2773 		}
2774 
2775 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2776 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2777 
2778 			LOG_DEBUG(VHOST_CONFIG,
2779 				"(%"PRIu64") In new_device: Failed to start "
2780 				"rx queue:%d\n",
2781 				dev->device_fh, vdev->vmdq_rx_q);
2782 
2783 			/* Stop the TX queue. */
2784 			if (rte_eth_dev_tx_queue_stop(ports[0],
2785 				vdev->vmdq_rx_q) != 0) {
2786 				LOG_DEBUG(VHOST_CONFIG,
2787 					"(%"PRIu64") In new_device: Failed to "
2788 					"stop tx queue:%d\n",
2789 					dev->device_fh, vdev->vmdq_rx_q);
2790 			}
2791 
2792 			mbuf_destroy_zcp(vpool);
2793 			rte_free(vdev->regions_hpa);
2794 			rte_free(vdev);
2795 			return -1;
2796 		}
2797 
2798 	}
2799 
2800 	/*reset ready flag*/
2801 	vdev->ready = DEVICE_MAC_LEARNING;
2802 	vdev->remove = 0;
2803 
2804 	/* Find a suitable lcore to add the device. */
2805 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2806 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2807 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2808 			core_add = lcore;
2809 		}
2810 	}
2811 	/* Add device to lcore ll */
2812 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2813 	if (ll_dev == NULL) {
2814 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2815 		vdev->ready = DEVICE_SAFE_REMOVE;
2816 		destroy_device(dev);
2817 		rte_free(vdev->regions_hpa);
2818 		rte_free(vdev);
2819 		return -1;
2820 	}
2821 	ll_dev->vdev = vdev;
2822 	vdev->coreid = core_add;
2823 
2824 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2825 
2826 	/* Initialize device stats */
2827 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2828 
2829 	/* Disable notifications. */
2830 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2831 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2832 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2833 	dev->flags |= VIRTIO_DEV_RUNNING;
2834 
2835 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2836 
2837 	return 0;
2838 }
2839 
2840 /*
2841  * These callbacks allow devices to be added to the data core when
2842  * configuration has fully completed.
2843  */
2844 static const struct virtio_net_device_ops virtio_net_device_ops =
2845 {
2846 	.new_device =  new_device,
2847 	.destroy_device = destroy_device,
2848 };
2849 
2850 /*
2851  * This thread will wake up after a period to print stats if the user has
2852  * enabled them.
2853  */
2854 static void
2855 print_stats(void)
2856 {
2857 	struct virtio_net_data_ll *dev_ll;
2858 	uint64_t tx_dropped, rx_dropped;
2859 	uint64_t tx, tx_total, rx, rx_total;
2860 	uint32_t device_fh;
2861 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2862 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2863 
2864 	while(1) {
2865 		sleep(enable_stats);
2866 
2867 		/* Clear screen and move to top left */
2868 		printf("%s%s", clr, top_left);
2869 
2870 		printf("\nDevice statistics ====================================");
2871 
2872 		dev_ll = ll_root_used;
2873 		while (dev_ll != NULL) {
2874 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2875 			tx_total = dev_statistics[device_fh].tx_total;
2876 			tx = dev_statistics[device_fh].tx;
2877 			tx_dropped = tx_total - tx;
2878 			if (zero_copy == 0) {
2879 				rx_total = rte_atomic64_read(
2880 					&dev_statistics[device_fh].rx_total_atomic);
2881 				rx = rte_atomic64_read(
2882 					&dev_statistics[device_fh].rx_atomic);
2883 			} else {
2884 				rx_total = dev_statistics[device_fh].rx_total;
2885 				rx = dev_statistics[device_fh].rx;
2886 			}
2887 			rx_dropped = rx_total - rx;
2888 
2889 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2890 					"\nTX total: 		%"PRIu64""
2891 					"\nTX dropped: 		%"PRIu64""
2892 					"\nTX successful: 		%"PRIu64""
2893 					"\nRX total: 		%"PRIu64""
2894 					"\nRX dropped: 		%"PRIu64""
2895 					"\nRX successful: 		%"PRIu64"",
2896 					device_fh,
2897 					tx_total,
2898 					tx_dropped,
2899 					tx,
2900 					rx_total,
2901 					rx_dropped,
2902 					rx);
2903 
2904 			dev_ll = dev_ll->next;
2905 		}
2906 		printf("\n======================================================\n");
2907 	}
2908 }
2909 
2910 static void
2911 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2912 	char *ring_name, uint32_t nb_mbuf)
2913 {
2914 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2915 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2916 	if (vpool_array[index].pool != NULL) {
2917 		vpool_array[index].ring
2918 			= rte_ring_create(ring_name,
2919 				rte_align32pow2(nb_mbuf + 1),
2920 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2921 		if (likely(vpool_array[index].ring != NULL)) {
2922 			LOG_DEBUG(VHOST_CONFIG,
2923 				"in setup_mempool_tbl: mbuf count in "
2924 				"mempool is: %d\n",
2925 				rte_mempool_count(vpool_array[index].pool));
2926 			LOG_DEBUG(VHOST_CONFIG,
2927 				"in setup_mempool_tbl: mbuf count in "
2928 				"ring   is: %d\n",
2929 				rte_ring_count(vpool_array[index].ring));
2930 		} else {
2931 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2932 				ring_name);
2933 		}
2934 
2935 		/* Need to consider the headroom. */
2936 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2937 	} else {
2938 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2939 	}
2940 }
2941 
2942 /* When we receive an INT signal, unregister the vhost driver */
2943 static void
2944 sigint_handler(__rte_unused int signum)
2945 {
2946 	/* Unregister vhost driver. */
2947 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2948 	if (ret != 0)
2949 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2950 	exit(0);
2951 }
2952 
2953 /*
2954  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2955  * device is also registered here to handle the IOCTLs.
2956  */
2957 int
2958 main(int argc, char *argv[])
2959 {
2960 	struct rte_mempool *mbuf_pool = NULL;
2961 	unsigned lcore_id, core_id = 0;
2962 	unsigned nb_ports, valid_num_ports;
2963 	int ret;
2964 	uint8_t portid;
2965 	uint16_t queue_id;
2966 	static pthread_t tid;
2967 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2968 
2969 	signal(SIGINT, sigint_handler);
2970 
2971 	/* init EAL */
2972 	ret = rte_eal_init(argc, argv);
2973 	if (ret < 0)
2974 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2975 	argc -= ret;
2976 	argv += ret;
2977 
2978 	/* parse app arguments */
2979 	ret = us_vhost_parse_args(argc, argv);
2980 	if (ret < 0)
2981 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2982 
2983 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2984 		if (rte_lcore_is_enabled(lcore_id))
2985 			lcore_ids[core_id ++] = lcore_id;
2986 
2987 	if (rte_lcore_count() > RTE_MAX_LCORE)
2988 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2989 
2990 	/* Set the number of switching cores available. */
2991 	num_switching_cores = rte_lcore_count()-1;
2992 
2993 	/* Get the number of physical ports. */
2994 	nb_ports = rte_eth_dev_count();
2995 	if (nb_ports > RTE_MAX_ETHPORTS)
2996 		nb_ports = RTE_MAX_ETHPORTS;
2997 
2998 	/*
2999 	 * Update the global var NUM_PORTS and global array PORTS
3000 	 * and get the value of VALID_NUM_PORTS according to the number of system ports
3001 	 */
3002 	valid_num_ports = check_ports_num(nb_ports);
3003 
3004 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3005 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
3006 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
3007 		return -1;
3008 	}
3009 
3010 	if (zero_copy == 0) {
3011 		/* Create the mbuf pool. */
3012 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3013 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3014 			0, MBUF_DATA_SIZE, rte_socket_id());
3015 		if (mbuf_pool == NULL)
3016 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3017 
3018 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3019 			vpool_array[queue_id].pool = mbuf_pool;
3020 
3021 		if (vm2vm_mode == VM2VM_HARDWARE) {
3022 			/* Enable VT loopback so the NIC's L2 switch handles vm2vm traffic. */
3023 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3024 			LOG_DEBUG(VHOST_CONFIG,
3025 				"Enable loop back for L2 switch in vmdq.\n");
3026 		}
3027 	} else {
3028 		uint32_t nb_mbuf;
3029 		char pool_name[RTE_MEMPOOL_NAMESIZE];
3030 		char ring_name[RTE_MEMPOOL_NAMESIZE];
3031 
3032 		nb_mbuf = num_rx_descriptor
3033 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3034 			+ num_switching_cores * MAX_PKT_BURST;
3035 
3036 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3037 			snprintf(pool_name, sizeof(pool_name),
3038 				"rxmbuf_pool_%u", queue_id);
3039 			snprintf(ring_name, sizeof(ring_name),
3040 				"rxmbuf_ring_%u", queue_id);
3041 			setup_mempool_tbl(rte_socket_id(), queue_id,
3042 				pool_name, ring_name, nb_mbuf);
3043 		}
3044 
3045 		nb_mbuf = num_tx_descriptor
3046 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3047 				+ num_switching_cores * MAX_PKT_BURST;
3048 
3049 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3050 			snprintf(pool_name, sizeof(pool_name),
3051 				"txmbuf_pool_%u", queue_id);
3052 			snprintf(ring_name, sizeof(ring_name),
3053 				"txmbuf_ring_%u", queue_id);
3054 			setup_mempool_tbl(rte_socket_id(),
3055 				(queue_id + MAX_QUEUES),
3056 				pool_name, ring_name, nb_mbuf);
3057 		}
3058 
3059 		if (vm2vm_mode == VM2VM_HARDWARE) {
3060 			/* Enable VT loopback so the NIC's L2 switch handles vm2vm traffic. */
3061 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3062 			LOG_DEBUG(VHOST_CONFIG,
3063 				"Enable loop back for L2 switch in vmdq.\n");
3064 		}
3065 	}
3066 	/* Set log level. */
3067 	rte_set_log_level(LOG_LEVEL);
3068 
3069 	/* initialize all ports */
3070 	for (portid = 0; portid < nb_ports; portid++) {
3071 		/* skip ports that are not enabled */
3072 		if ((enabled_port_mask & (1 << portid)) == 0) {
3073 			RTE_LOG(INFO, VHOST_PORT,
3074 				"Skipping disabled port %d\n", portid);
3075 			continue;
3076 		}
3077 		if (port_init(portid) != 0)
3078 			rte_exit(EXIT_FAILURE,
3079 				"Cannot initialize network ports\n");
3080 	}
3081 
3082 	/* Initialise all linked lists. */
3083 	if (init_data_ll() == -1)
3084 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3085 
3086 	/* Initialize device stats */
3087 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3088 
3089 	/* Enable stats if the user option is set. */
3090 	if (enable_stats) {
3091 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3092 		if (ret != 0)
3093 			rte_exit(EXIT_FAILURE,
3094 				"Cannot create print-stats thread\n");
3095 
3096 		/* Set thread_name for aid in debugging.  */
3097 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3098 		ret = rte_thread_setname(tid, thread_name);
3099 		if (ret != 0)
3100 			RTE_LOG(ERR, VHOST_CONFIG,
3101 				"Cannot set print-stats name\n");
3102 	}
3103 
3104 	/* Launch all data cores. */
3105 	if (zero_copy == 0) {
3106 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3107 			rte_eal_remote_launch(switch_worker,
3108 				mbuf_pool, lcore_id);
3109 		}
3110 	} else {
3111 		uint32_t count_in_mempool, index, i;
3112 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3113 			/* For all RX and TX queues. */
3114 			count_in_mempool
3115 				= rte_mempool_count(vpool_array[index].pool);
3116 
3117 			/*
3118 			 * Transfer all un-attached mbufs from vpool.pool
3119 			 * to vpool.ring.
3120 			 */
3121 			for (i = 0; i < count_in_mempool; i++) {
3122 				struct rte_mbuf *mbuf
3123 					= __rte_mbuf_raw_alloc(
3124 						vpool_array[index].pool);
3125 				rte_ring_sp_enqueue(vpool_array[index].ring,
3126 						(void *)mbuf);
3127 			}
3128 
3129 			LOG_DEBUG(VHOST_CONFIG,
3130 				"in main: mbuf count in mempool at initial "
3131 				"is: %d\n", count_in_mempool);
3132 			LOG_DEBUG(VHOST_CONFIG,
3133 				"in main: mbuf count in  ring at initial  is :"
3134 				" %d\n",
3135 				rte_ring_count(vpool_array[index].ring));
3136 		}
3137 
3138 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3139 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3140 				lcore_id);
3141 	}
3142 
3143 	if (mergeable == 0)
3144 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3145 
3146 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3147 	ret = rte_vhost_driver_register((char *)&dev_basename);
3148 	if (ret != 0)
3149 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3150 
3151 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3152 
3153 	/* Start CUSE session. */
3154 	rte_vhost_driver_session_start();
3155 	return 0;
3156 
3157 }
3158