xref: /dpdk/examples/vhost/main.c (revision 5674dad222d62130ae3ba59fe818caaa4843957c)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 /*
66  * Calculate the number of buffers needed per port: RX descriptors for every queue, plus a packet burst, TX descriptors and a mempool cache worth of buffers for each switching core.
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
69 							(num_switching_cores*MAX_PKT_BURST) +  			\
70 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
72 
73 #define MBUF_CACHE_SIZE	128
74 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
75 
76 /*
77  * No frame data buffers allocated on the host are required for the zero copy
78  * implementation; the guest allocates the frame data buffers and vhost uses
79  * them directly.
80  */
81 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
82 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
83 #define MBUF_CACHE_SIZE_ZCP 0
84 
85 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
86 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
87 
88 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
89 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
90 
91 #define JUMBO_FRAME_MAX_SIZE    0x2600
92 
93 /* State of virtio device. */
94 #define DEVICE_MAC_LEARNING 0
95 #define DEVICE_RX			1
96 #define DEVICE_SAFE_REMOVE	2
97 
98 /* Config_core_flag status definitions. */
99 #define REQUEST_DEV_REMOVAL 1
100 #define ACK_DEV_REMOVAL 0
101 
102 /* Configurable number of RX/TX ring descriptors */
103 #define RTE_TEST_RX_DESC_DEFAULT 1024
104 #define RTE_TEST_TX_DESC_DEFAULT 512
105 
106 /*
107  * These two macros need refining for the legacy and DPDK-based front ends:
108  * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
109  * then adjust to a power of 2.
110  */
111 /*
112  * For the legacy front end with 128 descriptors:
113  * half are used for the virtio header, the other half for the mbuf data.
114  */
115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
117 
118 /* Get the first 4 bytes of the mbuf headroom; the zero copy path stashes the vring descriptor index here. */
119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
120 		+ sizeof(struct rte_mbuf)))
121 
122 /* true if x is a power of 2 */
123 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
124 
125 #define INVALID_PORT_ID 0xFF
126 
127 /* Max number of devices. Limited by vmdq. */
128 #define MAX_DEVICES 64
129 
130 /* Size of buffers used for snprintfs. */
131 #define MAX_PRINT_BUFF 6072
132 
133 /* Maximum character device basename size. */
134 #define MAX_BASENAME_SZ 10
135 
136 /* Maximum long option length for option parsing. */
137 #define MAX_LONG_OPT_SZ 64
138 
139 /* Mask of the low 48 bits, used to compare 6-byte MAC addresses loaded as 64-bit words. */
140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
141 
142 /* Number of descriptors per cacheline. */
143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
144 
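/*
 * True when the mbuf's data buffer does not immediately follow the mbuf
 * itself, i.e. the buffer is external (for zero copy it points into guest
 * memory) and must be detached before the mbuf is reused.
 */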
145 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
146 
147 /* mask of enabled ports */
148 static uint32_t enabled_port_mask = 0;
149 
150 /* Promiscuous mode */
151 static uint32_t promiscuous;
152 
153 /* Number of switching cores enabled */
154 static uint32_t num_switching_cores = 0;
155 
156 /* number of devices/queues to support*/
157 static uint32_t num_queues = 0;
158 static uint32_t num_devices;
159 
160 /*
161  * Enable zero copy: packet buffers are DMA'd directly to/from the guest buffers
162  * referenced by the HW descriptors. Disabled by default.
163  */
164 static uint32_t zero_copy;
165 static int mergeable;
166 
167 /* Do VLAN strip on the host, enabled by default */
168 static uint32_t vlan_strip = 1;
169 
170 /* Number of RX/TX descriptors to use (zero copy mode) */
171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
173 
174 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
175 #define MAX_RING_DESC 4096
176 
177 struct vpool {
178 	struct rte_mempool *pool;
179 	struct rte_ring *ring;
180 	uint32_t buf_size;
181 } vpool_array[MAX_QUEUES+MAX_QUEUES];
182 
183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
184 typedef enum {
185 	VM2VM_DISABLED = 0,
186 	VM2VM_SOFTWARE = 1,
187 	VM2VM_HARDWARE = 2,
188 	VM2VM_LAST
189 } vm2vm_type;
190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
191 
192 /* The type of host physical address translated from guest physical address. */
193 typedef enum {
194 	PHYS_ADDR_CONTINUOUS = 0,
195 	PHYS_ADDR_CROSS_SUBREG = 1,
196 	PHYS_ADDR_INVALID = 2,
197 	PHYS_ADDR_LAST
198 } hpa_type;
199 
200 /* Enable stats. */
201 static uint32_t enable_stats = 0;
202 /* Enable retries on RX. */
203 static uint32_t enable_retry = 1;
204 
205 /* Enable TX checksum offload; disabled by default */
206 static uint32_t enable_tx_csum;
207 
208 /* Enable TSO offload; disabled by default */
209 static uint32_t enable_tso;
210 
211 /* Specify timeout (in microseconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215 
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218 
219 /* Empty VMDQ configuration structure. Filled in programmatically. */
220 static struct rte_eth_conf vmdq_conf_default = {
221 	.rxmode = {
222 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
223 		.split_hdr_size = 0,
224 		.header_split   = 0, /**< Header Split disabled */
225 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
226 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
227 		/*
228 		 * This is necessary for 1G NICs such as the I350;
229 		 * it fixes a bug where IPv4 forwarding in the guest could not
230 		 * forward packets from one virtio dev to another virtio dev.
231 		 */
232 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
233 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
234 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
235 	},
236 
237 	.txmode = {
238 		.mq_mode = ETH_MQ_TX_NONE,
239 	},
240 	.rx_adv_conf = {
241 		/*
242 		 * should be overridden separately in code with
243 		 * appropriate values
244 		 */
245 		.vmdq_rx_conf = {
246 			.nb_queue_pools = ETH_8_POOLS,
247 			.enable_default_pool = 0,
248 			.default_pool = 0,
249 			.nb_pool_maps = 0,
250 			.pool_map = {{0, 0},},
251 		},
252 	},
253 };
254 
255 static unsigned lcore_ids[RTE_MAX_LCORE];
256 static uint8_t ports[RTE_MAX_ETHPORTS];
257 static unsigned num_ports = 0; /**< The number of ports specified in command line */
258 static uint16_t num_pf_queues, num_vmdq_queues;
259 static uint16_t vmdq_pool_base, vmdq_queue_base;
260 static uint16_t queues_per_pool;
261 
262 static const uint16_t external_pkt_default_vlan_tag = 2000;
263 const uint16_t vlan_tags[] = {
264 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
265 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
266 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
267 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
268 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
269 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
270 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
271 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
272 };
273 
274 /* ethernet addresses of ports */
275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
276 
277 /* heads for the main used and free linked lists for the data path. */
278 static struct virtio_net_data_ll *ll_root_used = NULL;
279 static struct virtio_net_data_ll *ll_root_free = NULL;
280 
281 /* Array of data core structures containing information on individual core linked lists. */
282 static struct lcore_info lcore_info[RTE_MAX_LCORE];
283 
284 /* Used for queueing bursts of TX packets. */
285 struct mbuf_table {
286 	unsigned len;
287 	unsigned txq_id;
288 	struct rte_mbuf *m_table[MAX_PKT_BURST];
289 };
290 
291 /* TX queue for each data core. */
292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
293 
294 /* TX queue for each virtio device for zero copy. */
295 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
296 
297 /* Vlan header struct used to insert vlan tags on TX. */
298 struct vlan_ethhdr {
299 	unsigned char   h_dest[ETH_ALEN];
300 	unsigned char   h_source[ETH_ALEN];
301 	__be16          h_vlan_proto;
302 	__be16          h_vlan_TCI;
303 	__be16          h_vlan_encapsulated_proto;
304 };
305 
306 /* Header lengths. */
307 #define VLAN_HLEN       4
308 #define VLAN_ETH_HLEN   18
309 
310 /* Per-device statistics. RX counters are atomic because any core may update them when forwarding VM-to-VM; TX counters are only updated by the owning core. */
311 struct device_statistics {
312 	uint64_t tx_total;
313 	rte_atomic64_t rx_total_atomic;
314 	uint64_t rx_total;
315 	uint64_t tx;
316 	rte_atomic64_t rx_atomic;
317 	uint64_t rx;
318 } __rte_cache_aligned;
319 struct device_statistics dev_statistics[MAX_DEVICES];
320 
321 /*
322  * Builds up the correct configuration for VMDQ VLAN pool map
323  * according to the pool & queue limits.
324  */
325 static inline int
326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
327 {
328 	struct rte_eth_vmdq_rx_conf conf;
329 	struct rte_eth_vmdq_rx_conf *def_conf =
330 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
331 	unsigned i;
332 
333 	memset(&conf, 0, sizeof(conf));
334 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
335 	conf.nb_pool_maps = num_devices;
336 	conf.enable_loop_back = def_conf->enable_loop_back;
337 	conf.rx_mode = def_conf->rx_mode;
338 
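	/* Map each VMDQ pool 1:1 to a VLAN tag: pool i accepts traffic tagged with vlan_tags[i]. */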
339 	for (i = 0; i < conf.nb_pool_maps; i++) {
340 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
341 		conf.pool_map[i].pools = (1UL << i);
342 	}
343 
344 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
345 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
346 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
347 	return 0;
348 }
349 
350 /*
351  * Validate the number of devices against the max pool number obtained from
352  * dev_info. If the device number is invalid, print an error message and
353  * return -1. Each device must have its own pool.
354  */
355 static inline int
356 validate_num_devices(uint32_t max_nb_devices)
357 {
358 	if (num_devices > max_nb_devices) {
359 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
360 		return -1;
361 	}
362 	return 0;
363 }
364 
365 /*
366  * Initialises a given port using global settings, with the RX buffers
367  * coming from the per-queue mempools in vpool_array.
368  */
369 static inline int
370 port_init(uint8_t port)
371 {
372 	struct rte_eth_dev_info dev_info;
373 	struct rte_eth_conf port_conf;
374 	struct rte_eth_rxconf *rxconf;
375 	struct rte_eth_txconf *txconf;
376 	int16_t rx_rings, tx_rings;
377 	uint16_t rx_ring_size, tx_ring_size;
378 	int retval;
379 	uint16_t q;
380 
381 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
382 	rte_eth_dev_info_get (port, &dev_info);
383 
384 	if (dev_info.max_rx_queues > MAX_QUEUES) {
385 		rte_exit(EXIT_FAILURE,
386 			"please define MAX_QUEUES no less than %u in %s\n",
387 			dev_info.max_rx_queues, __FILE__);
388 	}
389 
390 	rxconf = &dev_info.default_rxconf;
391 	txconf = &dev_info.default_txconf;
392 	rxconf->rx_drop_en = 1;
393 
394 	/* Enable vlan offload */
395 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
396 
397 	/*
398 	 * Zero copy defers queue RX/TX start to the time when guest
399 	 * finishes its startup and packet buffers from that guest are
400 	 * available.
401 	 */
402 	if (zero_copy) {
403 		rxconf->rx_deferred_start = 1;
404 		rxconf->rx_drop_en = 0;
405 		txconf->tx_deferred_start = 1;
406 	}
407 
408 	/* Configure the number of supported virtio devices based on VMDQ limits */
409 	num_devices = dev_info.max_vmdq_pools;
410 
411 	if (zero_copy) {
412 		rx_ring_size = num_rx_descriptor;
413 		tx_ring_size = num_tx_descriptor;
414 		tx_rings = dev_info.max_tx_queues;
415 	} else {
416 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
417 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
418 		tx_rings = (uint16_t)rte_lcore_count();
419 	}
420 
421 	retval = validate_num_devices(MAX_DEVICES);
422 	if (retval < 0)
423 		return retval;
424 
425 	/* Get port configuration. */
426 	retval = get_eth_conf(&port_conf, num_devices);
427 	if (retval < 0)
428 		return retval;
429 	/* NIC queues are divided into pf queues and vmdq queues.  */
430 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
431 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
432 	num_vmdq_queues = num_devices * queues_per_pool;
433 	num_queues = num_pf_queues + num_vmdq_queues;
434 	vmdq_queue_base = dev_info.vmdq_queue_base;
435 	vmdq_pool_base  = dev_info.vmdq_pool_base;
436 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
437 		num_pf_queues, num_devices, queues_per_pool);
438 
439 	if (port >= rte_eth_dev_count()) return -1;
440 
441 	if (enable_tx_csum == 0)
442 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
443 
444 	if (enable_tso == 0) {
445 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
446 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
447 	}
448 
449 	rx_rings = (uint16_t)dev_info.max_rx_queues;
450 	/* Configure ethernet device. */
451 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 	if (retval != 0)
453 		return retval;
454 
455 	/* Setup the queues. */
456 	for (q = 0; q < rx_rings; q ++) {
457 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 						rte_eth_dev_socket_id(port),
459 						rxconf,
460 						vpool_array[q].pool);
461 		if (retval < 0)
462 			return retval;
463 	}
464 	for (q = 0; q < tx_rings; q ++) {
465 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
466 						rte_eth_dev_socket_id(port),
467 						txconf);
468 		if (retval < 0)
469 			return retval;
470 	}
471 
472 	/* Start the device. */
473 	retval  = rte_eth_dev_start(port);
474 	if (retval < 0) {
475 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476 		return retval;
477 	}
478 
479 	if (promiscuous)
480 		rte_eth_promiscuous_enable(port);
481 
482 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
483 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
484 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
485 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
486 			(unsigned)port,
487 			vmdq_ports_eth_addr[port].addr_bytes[0],
488 			vmdq_ports_eth_addr[port].addr_bytes[1],
489 			vmdq_ports_eth_addr[port].addr_bytes[2],
490 			vmdq_ports_eth_addr[port].addr_bytes[3],
491 			vmdq_ports_eth_addr[port].addr_bytes[4],
492 			vmdq_ports_eth_addr[port].addr_bytes[5]);
493 
494 	return 0;
495 }
496 
497 /*
498  * Set character device basename.
499  */
500 static int
501 us_vhost_parse_basename(const char *q_arg)
502 {
503 	/* Reject basenames that do not fit in dev_basename (including the NUL). */
504 
505 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
506 		return -1;
507 	else
508 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509 
510 	return 0;
511 }
512 
513 /*
514  * Parse the portmask provided at run time.
515  */
516 static int
517 parse_portmask(const char *portmask)
518 {
519 	char *end = NULL;
520 	unsigned long pm;
521 
522 	errno = 0;
523 
524 	/* parse hexadecimal string */
525 	pm = strtoul(portmask, &end, 16);
526 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
527 		return -1;
528 
529 	if (pm == 0)
530 		return -1;
531 
532 	return pm;
533 
534 }
535 
536 /*
537  * Parse num options at run time.
538  */
539 static int
540 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 {
542 	char *end = NULL;
543 	unsigned long num;
544 
545 	errno = 0;
546 
547 	/* parse unsigned int string */
548 	num = strtoul(q_arg, &end, 10);
549 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
550 		return -1;
551 
552 	if (num > max_valid_value)
553 		return -1;
554 
555 	return num;
556 
557 }
558 
559 /*
560  * Display usage
561  */
562 static void
563 us_vhost_usage(const char *prgname)
564 {
565 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
566 	"		--vm2vm [0|1|2]\n"
567 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
568 	"		--dev-basename <name>\n"
569 	"		--nb-devices ND\n"
570 	"		-p PORTMASK: Set mask for ports to be used by application\n"
571 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
572 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
573 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if rx retries are enabled\n"
574 	"		--rx-retry-num [0-N]: the number of retries on rx. This takes effect only if rx retries are enabled\n"
575 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
576 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
577 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
578 	"		--dev-basename: The basename to be used for the character device.\n"
579 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
580 			"zero copy\n"
581 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
582 			"used only when zero copy is enabled.\n"
583 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
584 			"used only when zero copy is enabled.\n"
585 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
586 	"		--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n",
587 	       prgname);
588 }
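
/*
 * Illustrative invocation (EAL options and the binary name depend on the build
 * and target system; the application options match the usage text above):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 \
 *       --stats 2 --dev-basename vhost-net
 */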
589 
590 /*
591  * Parse the arguments given in the command line of the application.
592  */
593 static int
594 us_vhost_parse_args(int argc, char **argv)
595 {
596 	int opt, ret;
597 	int option_index;
598 	unsigned i;
599 	const char *prgname = argv[0];
600 	static struct option long_option[] = {
601 		{"vm2vm", required_argument, NULL, 0},
602 		{"rx-retry", required_argument, NULL, 0},
603 		{"rx-retry-delay", required_argument, NULL, 0},
604 		{"rx-retry-num", required_argument, NULL, 0},
605 		{"mergeable", required_argument, NULL, 0},
606 		{"vlan-strip", required_argument, NULL, 0},
607 		{"stats", required_argument, NULL, 0},
608 		{"dev-basename", required_argument, NULL, 0},
609 		{"zero-copy", required_argument, NULL, 0},
610 		{"rx-desc-num", required_argument, NULL, 0},
611 		{"tx-desc-num", required_argument, NULL, 0},
612 		{"tx-csum", required_argument, NULL, 0},
613 		{"tso", required_argument, NULL, 0},
614 		{NULL, 0, 0, 0},
615 	};
616 
617 	/* Parse command line */
618 	while ((opt = getopt_long(argc, argv, "p:P",
619 			long_option, &option_index)) != EOF) {
620 		switch (opt) {
621 		/* Portmask */
622 		case 'p':
623 			enabled_port_mask = parse_portmask(optarg);
624 			if (enabled_port_mask == 0) {
625 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
626 				us_vhost_usage(prgname);
627 				return -1;
628 			}
629 			break;
630 
631 		case 'P':
632 			promiscuous = 1;
633 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
634 				ETH_VMDQ_ACCEPT_BROADCAST |
635 				ETH_VMDQ_ACCEPT_MULTICAST;
636 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
637 
638 			break;
639 
640 		case 0:
641 			/* Enable/disable vm2vm comms. */
642 			if (!strncmp(long_option[option_index].name, "vm2vm",
643 				MAX_LONG_OPT_SZ)) {
644 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
645 				if (ret == -1) {
646 					RTE_LOG(INFO, VHOST_CONFIG,
647 						"Invalid argument for "
648 						"vm2vm [0|1|2]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					vm2vm_mode = (vm2vm_type)ret;
653 				}
654 			}
655 
656 			/* Enable/disable retries on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, 1);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					enable_retry = ret;
665 				}
666 			}
667 
668 			/* Enable/disable TX checksum offload. */
669 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else
676 					enable_tx_csum = ret;
677 			}
678 
679 			/* Enable/disable TSO offload. */
680 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
681 				ret = parse_num_opt(optarg, 1);
682 				if (ret == -1) {
683 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
684 					us_vhost_usage(prgname);
685 					return -1;
686 				} else
687 					enable_tso = ret;
688 			}
689 
690 			/* Specify the retry delay time (in microseconds) on RX. */
691 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, INT32_MAX);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
695 					us_vhost_usage(prgname);
696 					return -1;
697 				} else {
698 					burst_rx_delay_time = ret;
699 				}
700 			}
701 
702 			/* Specify the retries number on RX. */
703 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
704 				ret = parse_num_opt(optarg, INT32_MAX);
705 				if (ret == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
707 					us_vhost_usage(prgname);
708 					return -1;
709 				} else {
710 					burst_rx_retry_num = ret;
711 				}
712 			}
713 
714 			/* Enable/disable RX mergeable buffers. */
715 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
716 				ret = parse_num_opt(optarg, 1);
717 				if (ret == -1) {
718 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
719 					us_vhost_usage(prgname);
720 					return -1;
721 				} else {
722 					mergeable = !!ret;
723 					if (ret) {
724 						vmdq_conf_default.rxmode.jumbo_frame = 1;
725 						vmdq_conf_default.rxmode.max_rx_pkt_len
726 							= JUMBO_FRAME_MAX_SIZE;
727 					}
728 				}
729 			}
730 
731 			/* Enable/disable RX VLAN strip on host. */
732 			if (!strncmp(long_option[option_index].name,
733 				"vlan-strip", MAX_LONG_OPT_SZ)) {
734 				ret = parse_num_opt(optarg, 1);
735 				if (ret == -1) {
736 					RTE_LOG(INFO, VHOST_CONFIG,
737 						"Invalid argument for VLAN strip [0|1]\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					vlan_strip = !!ret;
742 					vmdq_conf_default.rxmode.hw_vlan_strip =
743 						vlan_strip;
744 				}
745 			}
746 
747 			/* Enable/disable stats. */
748 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
749 				ret = parse_num_opt(optarg, INT32_MAX);
750 				if (ret == -1) {
751 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
752 					us_vhost_usage(prgname);
753 					return -1;
754 				} else {
755 					enable_stats = ret;
756 				}
757 			}
758 
759 			/* Set character device basename. */
760 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
761 				if (us_vhost_parse_basename(optarg) == -1) {
762 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
763 					us_vhost_usage(prgname);
764 					return -1;
765 				}
766 			}
767 
768 			/* Enable/disable rx/tx zero copy. */
769 			if (!strncmp(long_option[option_index].name,
770 				"zero-copy", MAX_LONG_OPT_SZ)) {
771 				ret = parse_num_opt(optarg, 1);
772 				if (ret == -1) {
773 					RTE_LOG(INFO, VHOST_CONFIG,
774 						"Invalid argument"
775 						" for zero-copy [0|1]\n");
776 					us_vhost_usage(prgname);
777 					return -1;
778 				} else
779 					zero_copy = ret;
780 			}
781 
782 			/* Specify the descriptor number on RX. */
783 			if (!strncmp(long_option[option_index].name,
784 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
785 				ret = parse_num_opt(optarg, MAX_RING_DESC);
786 				if ((ret == -1) || (!POWEROF2(ret))) {
787 					RTE_LOG(INFO, VHOST_CONFIG,
788 					"Invalid argument for rx-desc-num[0-N],"
789 					"power of 2 required.\n");
790 					us_vhost_usage(prgname);
791 					return -1;
792 				} else {
793 					num_rx_descriptor = ret;
794 				}
795 			}
796 
797 			/* Specify the descriptor number on TX. */
798 			if (!strncmp(long_option[option_index].name,
799 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
800 				ret = parse_num_opt(optarg, MAX_RING_DESC);
801 				if ((ret == -1) || (!POWEROF2(ret))) {
802 					RTE_LOG(INFO, VHOST_CONFIG,
803 					"Invalid argument for tx-desc-num [0-N],"
804 					"power of 2 required.\n");
805 					us_vhost_usage(prgname);
806 					return -1;
807 				} else {
808 					num_tx_descriptor = ret;
809 				}
810 			}
811 
812 			break;
813 
814 			/* Invalid option - print options. */
815 		default:
816 			us_vhost_usage(prgname);
817 			return -1;
818 		}
819 	}
820 
821 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
822 		if (enabled_port_mask & (1 << i))
823 			ports[num_ports++] = (uint8_t)i;
824 	}
825 
826 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
827 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
828 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
829 		return -1;
830 	}
831 
832 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
833 		RTE_LOG(INFO, VHOST_PORT,
834 			"Vhost zero copy doesn't support software vm2vm, "
835 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
836 		return -1;
837 	}
838 
839 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
840 		RTE_LOG(INFO, VHOST_PORT,
841 			"Vhost zero copy doesn't support jumbo frame, "
842 			"please specify '--mergeable 0' to disable the "
843 			"mergeable feature.\n");
844 		return -1;
845 	}
846 
847 	return 0;
848 }
849 
850 /*
851  * Update the global variable num_ports and the ports array according to the
852  * number of ports on the system, and return the number of valid ports.
853  */
854 static unsigned check_ports_num(unsigned nb_ports)
855 {
856 	unsigned valid_num_ports = num_ports;
857 	unsigned portid;
858 
859 	if (num_ports > nb_ports) {
860 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
861 			num_ports, nb_ports);
862 		num_ports = nb_ports;
863 	}
864 
865 	for (portid = 0; portid < num_ports; portid ++) {
866 		if (ports[portid] >= nb_ports) {
867 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
868 				ports[portid], (nb_ports - 1));
869 			ports[portid] = INVALID_PORT_ID;
870 			valid_num_ports--;
871 		}
872 	}
873 	return valid_num_ports;
874 }
875 
876 /*
877  * Macro to print out packet contents. Wrapped in debug define so that the
878  * data path is not affected when debug is disabled.
879  */
880 #ifdef DEBUG
881 #define PRINT_PACKET(device, addr, size, header) do {																\
882 	char *pkt_addr = (char*)(addr);																					\
883 	unsigned int index;																								\
884 	char packet[MAX_PRINT_BUFF];																					\
885 																													\
886 	if ((header))																									\
887 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
888 	else																											\
889 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
890 	for (index = 0; index < (size); index++) {																		\
891 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
892 			"%02hhx ", pkt_addr[index]);																			\
893 	}																												\
894 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
895 																													\
896 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
897 } while(0)
898 #else
899 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
900 #endif
901 
902 /*
903  * Function to convert guest physical addresses to host physical addresses.
904  * This is used to convert virtio buffer addresses.
905  */
906 static inline uint64_t __attribute__((always_inline))
907 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
908 	uint32_t buf_len, hpa_type *addr_type)
909 {
910 	struct virtio_memory_regions_hpa *region;
911 	uint32_t regionidx;
912 	uint64_t vhost_pa = 0;
913 
914 	*addr_type = PHYS_ADDR_INVALID;
915 
916 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
917 		region = &vdev->regions_hpa[regionidx];
918 		if ((guest_pa >= region->guest_phys_address) &&
919 			(guest_pa <= region->guest_phys_address_end)) {
920 			vhost_pa = region->host_phys_addr_offset + guest_pa;
921 			if (likely((guest_pa + buf_len - 1)
922 				<= region->guest_phys_address_end))
923 				*addr_type = PHYS_ADDR_CONTINUOUS;
924 			else
925 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
926 			break;
927 		}
928 	}
929 
930 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
931 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
932 		(void *)(uintptr_t)vhost_pa);
933 
934 	return vhost_pa;
935 }
936 
937 /*
938  * Compares a packet destination MAC address to a device MAC address.
939  */
940 static inline int __attribute__((always_inline))
941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
942 {
943 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
944 }
945 
946 /*
947  * This function learns the MAC address of the device and registers it, along
948  * with a VLAN tag, with the VMDQ pool for that device.
949  */
950 static int
951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
952 {
953 	struct ether_hdr *pkt_hdr;
954 	struct virtio_net_data_ll *dev_ll;
955 	struct virtio_net *dev = vdev->dev;
956 	int i, ret;
957 
958 	/* Learn MAC address of guest device from packet */
959 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960 
961 	dev_ll = ll_root_used;
962 
963 	while (dev_ll != NULL) {
964 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
965 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
966 			return -1;
967 		}
968 		dev_ll = dev_ll->next;
969 	}
970 
971 	for (i = 0; i < ETHER_ADDR_LEN; i++)
972 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
973 
974 	/* vlan_tag currently uses the device_id. */
975 	vdev->vlan_tag = vlan_tags[dev->device_fh];
976 
977 	/* Print out VMDQ registration info. */
978 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
979 		dev->device_fh,
980 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
981 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
982 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
983 		vdev->vlan_tag);
984 
985 	/* Register the MAC address. */
986 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
987 				(uint32_t)dev->device_fh + vmdq_pool_base);
988 	if (ret)
989 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
990 					dev->device_fh);
991 
992 	/* Enable stripping of the vlan tag as we handle routing. */
993 	if (vlan_strip)
994 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
995 			(uint16_t)vdev->vmdq_rx_q, 1);
996 
997 	/* Set device as ready for RX. */
998 	vdev->ready = DEVICE_RX;
999 
1000 	return 0;
1001 }
1002 
1003 /*
1004  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1005  * queue before disabling RX on the device.
1006  */
1007 static inline void
1008 unlink_vmdq(struct vhost_dev *vdev)
1009 {
1010 	unsigned i = 0;
1011 	unsigned rx_count;
1012 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1013 
1014 	if (vdev->ready == DEVICE_RX) {
1015 		/*clear MAC and VLAN settings*/
1016 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1017 		for (i = 0; i < 6; i++)
1018 			vdev->mac_address.addr_bytes[i] = 0;
1019 
1020 		vdev->vlan_tag = 0;
1021 
1022 		/*Clear out the receive buffers*/
1023 		rx_count = rte_eth_rx_burst(ports[0],
1024 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1025 
1026 		while (rx_count) {
1027 			for (i = 0; i < rx_count; i++)
1028 				rte_pktmbuf_free(pkts_burst[i]);
1029 
1030 			rx_count = rte_eth_rx_burst(ports[0],
1031 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1032 		}
1033 
1034 		vdev->ready = DEVICE_MAC_LEARNING;
1035 	}
1036 }
1037 
1038 /*
1039  * Check if the packet destination MAC address is for a local device. If so then put
1040  * the packet on that device's RX queue. If not then return.
1041  */
1042 static inline int __attribute__((always_inline))
1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1044 {
1045 	struct virtio_net_data_ll *dev_ll;
1046 	struct ether_hdr *pkt_hdr;
1047 	uint64_t ret = 0;
1048 	struct virtio_net *dev = vdev->dev;
1049 	struct virtio_net *tdev; /* destination virtio device */
1050 
1051 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052 
1053 	/*get the used devices list*/
1054 	dev_ll = ll_root_used;
1055 
1056 	while (dev_ll != NULL) {
1057 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1058 				          &dev_ll->vdev->mac_address)) {
1059 
1060 			/* Drop the packet if the TX packet is destined for the TX device. */
1061 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1063 							dev->device_fh);
1064 				return 0;
1065 			}
1066 			tdev = dev_ll->vdev->dev;
1067 
1068 
1069 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1070 
1071 			if (unlikely(dev_ll->vdev->remove)) {
1072 				/*drop the packet if the device is marked for removal*/
1073 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1074 			} else {
1075 				/*send the packet to the local virtio device*/
1076 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1077 				if (enable_stats) {
1078 					rte_atomic64_add(
1079 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1080 					1);
1081 					rte_atomic64_add(
1082 					&dev_statistics[tdev->device_fh].rx_atomic,
1083 					ret);
1084 					dev_statistics[dev->device_fh].tx_total++;
1085 					dev_statistics[dev->device_fh].tx += ret;
1086 				}
1087 			}
1088 
1089 			return 0;
1090 		}
1091 		dev_ll = dev_ll->next;
1092 	}
1093 
1094 	return -1;
1095 }
1096 
1097 /*
1098  * Check if the destination MAC of a packet belongs to a local VM,
1099  * and if so get its VLAN tag and the length offset.
1100  */
1101 static inline int __attribute__((always_inline))
1102 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1103 	uint32_t *offset, uint16_t *vlan_tag)
1104 {
1105 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1106 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1107 
1108 	while (dev_ll != NULL) {
1109 		if ((dev_ll->vdev->ready == DEVICE_RX)
1110 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1111 		&dev_ll->vdev->mac_address)) {
1112 			/*
1113 			 * Drop the packet if the TX packet is
1114 			 * destined for the TX device.
1115 			 */
1116 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1117 				LOG_DEBUG(VHOST_DATA,
1118 				"(%"PRIu64") TX: Source and destination"
1119 				" MAC addresses are the same. Dropping "
1120 				"packet.\n",
1121 				dev_ll->vdev->dev->device_fh);
1122 				return -1;
1123 			}
1124 
1125 			/*
1126 			 * HW VLAN strip reduces the packet length by the
1127 			 * length of the VLAN tag, so restore the packet
1128 			 * length by adding it back.
1129 			 */
1130 			*offset = VLAN_HLEN;
1131 			*vlan_tag =
1132 			(uint16_t)
1133 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1134 
1135 			LOG_DEBUG(VHOST_DATA,
1136 			"(%"PRIu64") TX: pkt to local VM device id:"
1137 			"(%"PRIu64") vlan tag: %d.\n",
1138 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1139 			(int)*vlan_tag);
1140 
1141 			break;
1142 		}
1143 		dev_ll = dev_ll->next;
1144 	}
1145 	return 0;
1146 }
1147 
1148 static uint16_t
1149 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1150 {
1151 	if (ol_flags & PKT_TX_IPV4)
1152 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1153 	else /* assume ethertype == ETHER_TYPE_IPv6 */
1154 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1155 }
1156 
1157 static void virtio_tx_offload(struct rte_mbuf *m)
1158 {
1159 	void *l3_hdr;
1160 	struct ipv4_hdr *ipv4_hdr = NULL;
1161 	struct tcp_hdr *tcp_hdr = NULL;
1162 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163 
1164 	l3_hdr = (char *)eth_hdr + m->l2_len;
1165 
1166 	ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
1167 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1168 	m->ol_flags |= PKT_TX_IP_CKSUM;
1169 	ipv4_hdr->hdr_checksum = 0;
1170 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1171 }
1172 
1173 /*
1174  * This function routes the TX packet to the correct interface. This may be a local device
1175  * or the physical port.
1176  */
1177 static inline void __attribute__((always_inline))
1178 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1179 {
1180 	struct mbuf_table *tx_q;
1181 	struct rte_mbuf **m_table;
1182 	unsigned len, ret, offset = 0;
1183 	const uint16_t lcore_id = rte_lcore_id();
1184 	struct virtio_net *dev = vdev->dev;
1185 	struct ether_hdr *nh;
1186 
1187 	/*check if destination is local VM*/
1188 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1189 		rte_pktmbuf_free(m);
1190 		return;
1191 	}
1192 
1193 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1194 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1195 			rte_pktmbuf_free(m);
1196 			return;
1197 		}
1198 	}
1199 
1200 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1201 
1202 	/*Add packet to the port tx queue*/
1203 	tx_q = &lcore_tx_queue[lcore_id];
1204 	len = tx_q->len;
1205 
1206 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1207 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1208 		/* Guest has inserted the vlan tag. */
1209 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1210 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1211 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1212 			(vh->vlan_tci != vlan_tag_be))
1213 			vh->vlan_tci = vlan_tag_be;
1214 	} else {
1215 		m->ol_flags |= PKT_TX_VLAN_PKT;
1216 
1217 		/*
1218 		 * Find the right seg to adjust the data len when offset is
1219 		 * bigger than tail room size.
1220 		 */
1221 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1222 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1223 				m->data_len += offset;
1224 			else {
1225 				struct rte_mbuf *seg = m;
1226 
1227 				while ((seg->next != NULL) &&
1228 					(offset > rte_pktmbuf_tailroom(seg)))
1229 					seg = seg->next;
1230 
1231 				seg->data_len += offset;
1232 			}
1233 			m->pkt_len += offset;
1234 		}
1235 
1236 		m->vlan_tci = vlan_tag;
1237 	}
1238 
1239 	if (m->ol_flags & PKT_TX_TCP_SEG)
1240 		virtio_tx_offload(m);
1241 
1242 	tx_q->m_table[len] = m;
1243 	len++;
1244 	if (enable_stats) {
1245 		dev_statistics[dev->device_fh].tx_total++;
1246 		dev_statistics[dev->device_fh].tx++;
1247 	}
1248 
1249 	if (unlikely(len == MAX_PKT_BURST)) {
1250 		m_table = (struct rte_mbuf **)tx_q->m_table;
1251 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1252 		/* Free any buffers not handled by TX and update the port stats. */
1253 		if (unlikely(ret < len)) {
1254 			do {
1255 				rte_pktmbuf_free(m_table[ret]);
1256 			} while (++ret < len);
1257 		}
1258 
1259 		len = 0;
1260 	}
1261 
1262 	tx_q->len = len;
1263 	return;
1264 }
1265 /*
1266  * This function is called by each data core. It handles all RX/TX registered with the
1267  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1268  * with all devices in the main linked list.
1269  */
1270 static int
1271 switch_worker(void *arg)
1272 {
1273 	struct rte_mempool *mbuf_pool = arg;
1274 	struct virtio_net *dev = NULL;
1275 	struct vhost_dev *vdev = NULL;
1276 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1277 	struct virtio_net_data_ll *dev_ll;
1278 	struct mbuf_table *tx_q;
1279 	volatile struct lcore_ll_info *lcore_ll;
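	/* TX drain interval (BURST_TX_DRAIN_US) converted to TSC cycles. */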
1280 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1281 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1282 	unsigned ret, i;
1283 	const uint16_t lcore_id = rte_lcore_id();
1284 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1285 	uint16_t rx_count = 0;
1286 	uint16_t tx_count;
1287 	uint32_t retry = 0;
1288 
1289 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1290 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1291 	prev_tsc = 0;
1292 
1293 	tx_q = &lcore_tx_queue[lcore_id];
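	/* Each enabled lcore uses the NIC TX queue matching its index in lcore_ids. */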
1294 	for (i = 0; i < num_cores; i ++) {
1295 		if (lcore_ids[i] == lcore_id) {
1296 			tx_q->txq_id = i;
1297 			break;
1298 		}
1299 	}
1300 
1301 	while(1) {
1302 		cur_tsc = rte_rdtsc();
1303 		/*
1304 		 * TX burst queue drain
1305 		 */
1306 		diff_tsc = cur_tsc - prev_tsc;
1307 		if (unlikely(diff_tsc > drain_tsc)) {
1308 
1309 			if (tx_q->len) {
1310 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1311 
1312 				/*Tx any packets in the queue*/
1313 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1314 									   (struct rte_mbuf **)tx_q->m_table,
1315 									   (uint16_t)tx_q->len);
1316 				if (unlikely(ret < tx_q->len)) {
1317 					do {
1318 						rte_pktmbuf_free(tx_q->m_table[ret]);
1319 					} while (++ret < tx_q->len);
1320 				}
1321 
1322 				tx_q->len = 0;
1323 			}
1324 
1325 			prev_tsc = cur_tsc;
1326 
1327 		}
1328 
1329 		rte_prefetch0(lcore_ll->ll_root_used);
1330 		/*
1331 		 * Inform the configuration core that we have exited the linked list and that no devices are
1332 		 * in use if requested.
1333 		 */
1334 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1335 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1336 
1337 		/*
1338 		 * Process devices
1339 		 */
1340 		dev_ll = lcore_ll->ll_root_used;
1341 
1342 		while (dev_ll != NULL) {
1343 			/*get virtio device ID*/
1344 			vdev = dev_ll->vdev;
1345 			dev = vdev->dev;
1346 
1347 			if (unlikely(vdev->remove)) {
1348 				dev_ll = dev_ll->next;
1349 				unlink_vmdq(vdev);
1350 				vdev->ready = DEVICE_SAFE_REMOVE;
1351 				continue;
1352 			}
1353 			if (likely(vdev->ready == DEVICE_RX)) {
1354 				/*Handle guest RX*/
1355 				rx_count = rte_eth_rx_burst(ports[0],
1356 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1357 
1358 				if (rx_count) {
1359 					/*
1360 					* If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1361 					* Note that MAX_PKT_BURST must be less than the virtio queue size.
1362 					*/
1363 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1364 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1365 							rte_delay_us(burst_rx_delay_time);
1366 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1367 								break;
1368 						}
1369 					}
1370 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1371 					if (enable_stats) {
1372 						rte_atomic64_add(
1373 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1374 						rx_count);
1375 						rte_atomic64_add(
1376 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1377 					}
1378 					while (likely(rx_count)) {
1379 						rx_count--;
1380 						rte_pktmbuf_free(pkts_burst[rx_count]);
1381 					}
1382 
1383 				}
1384 			}
1385 
1386 			if (likely(!vdev->remove)) {
1387 				/* Handle guest TX*/
1388 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1389 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1390 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1391 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1392 						while (tx_count)
1393 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1394 					}
1395 				}
1396 				for (i = 0; i < tx_count; ++i)
1397 					virtio_tx_route(vdev, pkts_burst[i], (uint16_t)dev->device_fh);
1398 			}
1399 
1400 			/*move to the next device in the list*/
1401 			dev_ll = dev_ll->next;
1402 		}
1403 	}
1404 
1405 	return 0;
1406 }
1407 
1408 /*
1409  * This function gets the number of available ring entries for zero copy rx.
1410  * Only one thread will call this function for a particular virtio device,
1411  * so it is designed as a non-thread-safe function.
1412  */
1413 static inline uint32_t __attribute__((always_inline))
1414 get_available_ring_num_zcp(struct virtio_net *dev)
1415 {
1416 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1417 	uint16_t avail_idx;
1418 
1419 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1420 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1421 }
1422 
1423 /*
1424  * This function gets an available ring index for zero copy rx;
1425  * it retries 'burst_rx_retry_num' times until it gets enough ring entries.
1426  * Only one thread will call this function for a particular virtio device,
1427  * so it is designed as a non-thread-safe function.
1428  */
1429 static inline uint32_t __attribute__((always_inline))
1430 get_available_ring_index_zcp(struct virtio_net *dev,
1431 	uint16_t *res_base_idx, uint32_t count)
1432 {
1433 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1434 	uint16_t avail_idx;
1435 	uint32_t retry = 0;
1436 	uint16_t free_entries;
1437 
1438 	*res_base_idx = vq->last_used_idx_res;
1439 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1440 	free_entries = (avail_idx - *res_base_idx);
1441 
1442 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1443 			"avail idx: %d, "
1444 			"res base idx:%d, free entries:%d\n",
1445 			dev->device_fh, avail_idx, *res_base_idx,
1446 			free_entries);
1447 
1448 	/*
1449 	 * If retry is enabled and the queue is full then we wait
1450 	 * and retry to avoid packet loss.
1451 	 */
1452 	if (enable_retry && unlikely(count > free_entries)) {
1453 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1454 			rte_delay_us(burst_rx_delay_time);
1455 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1456 			free_entries = (avail_idx - *res_base_idx);
1457 			if (count <= free_entries)
1458 				break;
1459 		}
1460 	}
1461 
1462 	/*check that we have enough buffers*/
1463 	if (unlikely(count > free_entries))
1464 		count = free_entries;
1465 
1466 	if (unlikely(count == 0)) {
1467 		LOG_DEBUG(VHOST_DATA,
1468 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1469 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1470 			dev->device_fh, avail_idx,
1471 			*res_base_idx, free_entries);
1472 		return 0;
1473 	}
1474 
1475 	vq->last_used_idx_res = *res_base_idx + count;
1476 
1477 	return count;
1478 }
1479 
1480 /*
1481  * This function puts a descriptor back into the used list.
1482  */
1483 static inline void __attribute__((always_inline))
1484 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1485 {
1486 	uint16_t res_cur_idx = vq->last_used_idx;
1487 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1488 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1489 	rte_compiler_barrier();
1490 	*(volatile uint16_t *)&vq->used->idx += 1;
1491 	vq->last_used_idx += 1;
1492 
1493 	/* Kick the guest if necessary. */
1494 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1495 		eventfd_write(vq->callfd, (eventfd_t)1);
1496 }
1497 
1498 /*
1499  * This function gets an available descriptor from the virtio vring and an
1500  * unattached mbuf from vpool->ring, and attaches them together. It needs to
1501  * adjust the offsets of buff_addr and phys_addr according to the PMD
1502  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1503  */
1504 static inline void __attribute__((always_inline))
1505 attach_rxmbuf_zcp(struct virtio_net *dev)
1506 {
1507 	uint16_t res_base_idx, desc_idx;
1508 	uint64_t buff_addr, phys_addr;
1509 	struct vhost_virtqueue *vq;
1510 	struct vring_desc *desc;
1511 	void *obj = NULL;
1512 	struct rte_mbuf *mbuf;
1513 	struct vpool *vpool;
1514 	hpa_type addr_type;
1515 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1516 
1517 	vpool = &vpool_array[vdev->vmdq_rx_q];
1518 	vq = dev->virtqueue[VIRTIO_RXQ];
1519 
1520 	do {
1521 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1522 				1) != 1))
1523 			return;
1524 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1525 
1526 		desc = &vq->desc[desc_idx];
1527 		if (desc->flags & VRING_DESC_F_NEXT) {
1528 			desc = &vq->desc[desc->next];
1529 			buff_addr = gpa_to_vva(dev, desc->addr);
1530 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1531 					&addr_type);
1532 		} else {
1533 			buff_addr = gpa_to_vva(dev,
1534 					desc->addr + vq->vhost_hlen);
1535 			phys_addr = gpa_to_hpa(vdev,
1536 					desc->addr + vq->vhost_hlen,
1537 					desc->len, &addr_type);
1538 		}
1539 
1540 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1541 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1542 				" address found when attaching RX frame buffer"
1543 				" address!\n", dev->device_fh);
1544 			put_desc_to_used_list_zcp(vq, desc_idx);
1545 			continue;
1546 		}
1547 
1548 		/*
1549 		 * Check if the frame buffer address from guest crosses
1550 		 * sub-region or not.
1551 		 */
1552 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1553 			RTE_LOG(ERR, VHOST_DATA,
1554 				"(%"PRIu64") Frame buffer address crossing a "
1555 				"sub-region found when attaching RX frame "
1556 				"buffer address!\n",
1557 				dev->device_fh);
1558 			put_desc_to_used_list_zcp(vq, desc_idx);
1559 			continue;
1560 		}
1561 	} while (unlikely(phys_addr == 0));
1562 
1563 	rte_ring_sc_dequeue(vpool->ring, &obj);
1564 	mbuf = obj;
1565 	if (unlikely(mbuf == NULL)) {
1566 		LOG_DEBUG(VHOST_DATA,
1567 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1568 			"ring_sc_dequeue fail.\n",
1569 			dev->device_fh);
1570 		put_desc_to_used_list_zcp(vq, desc_idx);
1571 		return;
1572 	}
1573 
1574 	if (unlikely(vpool->buf_size > desc->len)) {
1575 		LOG_DEBUG(VHOST_DATA,
1576 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1577 			"length(%d) of descriptor idx: %d less than room "
1578 			"size required: %d\n",
1579 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1580 		put_desc_to_used_list_zcp(vq, desc_idx);
1581 		rte_ring_sp_enqueue(vpool->ring, obj);
1582 		return;
1583 	}
1584 
1585 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1586 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1587 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1588 	mbuf->data_len = desc->len;
1589 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1590 
1591 	LOG_DEBUG(VHOST_DATA,
1592 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1593 		"descriptor idx:%d\n",
1594 		dev->device_fh, res_base_idx, desc_idx);
1595 
1596 	__rte_mbuf_raw_free(mbuf);
1597 
1598 	return;
1599 }
1600 
1601 /*
1602  * Detach an attached packet mbuf -
1603  *  - restore original mbuf address and length values.
1604  *  - reset pktmbuf data and data_len to their default values.
1605  *  All other fields of the given packet mbuf will be left intact.
1606  *
1607  * @param m
1608  *   The attached packet mbuf.
1609  */
1610 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1611 {
1612 	const struct rte_mempool *mp = m->pool;
1613 	void *buf = rte_mbuf_to_baddr(m);
1614 	uint32_t buf_ofs;
1615 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1616 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1617 
1618 	m->buf_addr = buf;
1619 	m->buf_len = (uint16_t)buf_len;
1620 
1621 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1622 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1623 	m->data_off = buf_ofs;
1624 
1625 	m->data_len = 0;
1626 }
1627 
1628 /*
1629  * This function is called after packets have been transmitted. It fetches mbufs
1630  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1631  * the used index and kicks the guest if necessary.
1632  */
1633 static inline uint32_t __attribute__((always_inline))
1634 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1635 {
1636 	struct rte_mbuf *mbuf;
1637 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1638 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1639 	uint32_t index = 0;
1640 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1641 
1642 	LOG_DEBUG(VHOST_DATA,
1643 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1644 		"clean is: %d\n",
1645 		dev->device_fh, mbuf_count);
1646 	LOG_DEBUG(VHOST_DATA,
1647 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1648 		"clean  is : %d\n",
1649 		dev->device_fh, rte_ring_count(vpool->ring));
1650 
1651 	for (index = 0; index < mbuf_count; index++) {
1652 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1653 		if (likely(MBUF_EXT_MEM(mbuf)))
1654 			pktmbuf_detach_zcp(mbuf);
1655 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1656 
1657 		/* Update used index buffer information. */
1658 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1659 		vq->used->ring[used_idx].len = 0;
1660 
1661 		used_idx = (used_idx + 1) & (vq->size - 1);
1662 	}
1663 
1664 	LOG_DEBUG(VHOST_DATA,
1665 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1666 		"clean is: %d\n",
1667 		dev->device_fh, rte_mempool_count(vpool->pool));
1668 	LOG_DEBUG(VHOST_DATA,
1669 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1670 		"clean  is : %d\n",
1671 		dev->device_fh, rte_ring_count(vpool->ring));
1672 	LOG_DEBUG(VHOST_DATA,
1673 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1674 		"vq->last_used_idx:%d\n",
1675 		dev->device_fh, vq->last_used_idx);
1676 
1677 	vq->last_used_idx += mbuf_count;
1678 
1679 	LOG_DEBUG(VHOST_DATA,
1680 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1681 		"vq->last_used_idx:%d\n",
1682 		dev->device_fh, vq->last_used_idx);
1683 
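	/*
	 * Make sure the used ring entries written above are visible before the
	 * used index is published to the guest below.
	 */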
1684 	rte_compiler_barrier();
1685 
1686 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1687 
1688 	/* Kick guest if required. */
1689 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1690 		eventfd_write(vq->callfd, (eventfd_t)1);
1691 
1692 	return 0;
1693 }
1694 
1695 /*
1696  * This function is called when a virtio device is destroyed. It fetches each
1697  * mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
1698  */
1699 static void mbuf_destroy_zcp(struct vpool *vpool)
1700 {
1701 	struct rte_mbuf *mbuf = NULL;
1702 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1703 
1704 	LOG_DEBUG(VHOST_CONFIG,
1705 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1706 		"mbuf_destroy_zcp is: %d\n",
1707 		mbuf_count);
1708 	LOG_DEBUG(VHOST_CONFIG,
1709 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1710 		"mbuf_destroy_zcp  is : %d\n",
1711 		rte_ring_count(vpool->ring));
1712 
1713 	for (index = 0; index < mbuf_count; index++) {
1714 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1715 		if (likely(mbuf != NULL)) {
1716 			if (likely(MBUF_EXT_MEM(mbuf)))
1717 				pktmbuf_detach_zcp(mbuf);
1718 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1719 		}
1720 	}
1721 
1722 	LOG_DEBUG(VHOST_CONFIG,
1723 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1724 		"mbuf_destroy_zcp is: %d\n",
1725 		rte_mempool_count(vpool->pool));
1726 	LOG_DEBUG(VHOST_CONFIG,
1727 		"in mbuf_destroy_zcp: mbuf count in ring after "
1728 		"mbuf_destroy_zcp is : %d\n",
1729 		rte_ring_count(vpool->ring));
1730 }
1731 
1732 /*
1733  * Zero-copy guest RX: write the virtio header for each packet and update the used ring and counters.
1734  */
1735 static inline uint32_t __attribute__((always_inline))
1736 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1737 	uint32_t count)
1738 {
1739 	struct vhost_virtqueue *vq;
1740 	struct vring_desc *desc;
1741 	struct rte_mbuf *buff;
1742 	/* The virtio_hdr is initialised to 0. */
1743 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1744 		= {{0, 0, 0, 0, 0, 0}, 0};
1745 	uint64_t buff_hdr_addr = 0;
1746 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1747 	uint32_t head_idx, packet_success = 0;
1748 	uint16_t res_cur_idx;
1749 
1750 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1751 
1752 	if (count == 0)
1753 		return 0;
1754 
1755 	vq = dev->virtqueue[VIRTIO_RXQ];
1756 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1757 
1758 	res_cur_idx = vq->last_used_idx;
1759 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1760 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1761 
1762 	/* Retrieve all of the head indexes first to avoid caching issues. */
1763 	for (head_idx = 0; head_idx < count; head_idx++)
1764 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
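	/*
	 * Note: in the zero-copy path these indexes were recorded in the mbuf
	 * headroom by attach_rxmbuf_zcp() rather than read from the avail ring.
	 */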
1765 
1766 	/* Prefetch descriptor index. */
1767 	rte_prefetch0(&vq->desc[head[packet_success]]);
1768 
1769 	while (packet_success != count) {
1770 		/* Get descriptor from available ring */
1771 		desc = &vq->desc[head[packet_success]];
1772 
1773 		buff = pkts[packet_success];
1774 		LOG_DEBUG(VHOST_DATA,
1775 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1776 			"pkt[%d] descriptor idx: %d\n",
1777 			dev->device_fh, packet_success,
1778 			MBUF_HEADROOM_UINT32(buff));
1779 
1780 		PRINT_PACKET(dev,
1781 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1782 			+ RTE_PKTMBUF_HEADROOM),
1783 			rte_pktmbuf_data_len(buff), 0);
1784 
1785 		/* Buffer address translation for virtio header. */
1786 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1787 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1788 
1789 		/*
1790 		 * If the descriptors are chained the header and data are
1791 		 * placed in separate buffers.
1792 		 */
1793 		if (desc->flags & VRING_DESC_F_NEXT) {
1794 			desc->len = vq->vhost_hlen;
1795 			desc = &vq->desc[desc->next];
1796 			desc->len = rte_pktmbuf_data_len(buff);
1797 		} else {
1798 			desc->len = packet_len;
1799 		}
1800 
1801 		/* Update used ring with desc information */
1802 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1803 			= head[packet_success];
1804 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1805 			= packet_len;
1806 		res_cur_idx++;
1807 		packet_success++;
1808 
1809 		/* A header is required per buffer. */
1810 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1811 			(const void *)&virtio_hdr, vq->vhost_hlen);
1812 
1813 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1814 
1815 		if (likely(packet_success < count)) {
1816 			/* Prefetch descriptor index. */
1817 			rte_prefetch0(&vq->desc[head[packet_success]]);
1818 		}
1819 	}
1820 
1821 	rte_compiler_barrier();
1822 
1823 	LOG_DEBUG(VHOST_DATA,
1824 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1825 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1826 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1827 
1828 	*(volatile uint16_t *)&vq->used->idx += count;
1829 	vq->last_used_idx += count;
1830 
1831 	LOG_DEBUG(VHOST_DATA,
1832 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1833 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1834 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1835 
1836 	/* Kick the guest if necessary. */
1837 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1838 		eventfd_write(vq->callfd, (eventfd_t)1);
1839 
1840 	return count;
1841 }
1842 
1843 /*
1844  * This function routes the TX packet to the correct interface.
1845  * This may be a local device or the physical port.
1846  */
1847 static inline void __attribute__((always_inline))
1848 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1849 	uint32_t desc_idx, uint8_t need_copy)
1850 {
1851 	struct mbuf_table *tx_q;
1852 	struct rte_mbuf **m_table;
1853 	void *obj = NULL;
1854 	struct rte_mbuf *mbuf;
1855 	unsigned len, ret, offset = 0;
1856 	struct vpool *vpool;
1857 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1858 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1859 
1860 	/* Add packet to the port TX queue. */
1861 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1862 	len = tx_q->len;
1863 
1864 	/* Allocate an mbuf and populate the structure. */
1865 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1866 	rte_ring_sc_dequeue(vpool->ring, &obj);
1867 	mbuf = obj;
1868 	if (unlikely(mbuf == NULL)) {
1869 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1870 		RTE_LOG(ERR, VHOST_DATA,
1871 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1872 			dev->device_fh);
1873 		put_desc_to_used_list_zcp(vq, desc_idx);
1874 		return;
1875 	}
1876 
1877 	if (vm2vm_mode == VM2VM_HARDWARE) {
1878 		/* Do not use a VLAN tag taken from any VM (such as
1879 		 * vlan_tags[dev->device_fh]) for an external packet; otherwise it
1880 		 * conflicts with pool selection: the MAC address marks it as an
1881 		 * external packet that should go out to the network, while the VLAN
1882 		 * tag marks it as a VM2VM packet that should be forwarded to another
1883 		 * VM. The hardware cannot resolve this ambiguity and the packet is lost.
1884 		 */
1885 		vlan_tag = external_pkt_default_vlan_tag;
1886 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1887 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1888 			__rte_mbuf_raw_free(mbuf);
1889 			return;
1890 		}
1891 	}
1892 
1893 	mbuf->nb_segs = m->nb_segs;
1894 	mbuf->next = m->next;
1895 	mbuf->data_len = m->data_len + offset;
1896 	mbuf->pkt_len = mbuf->data_len;
1897 	if (unlikely(need_copy)) {
1898 		/* Copy the packet contents to the mbuf. */
1899 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1900 			rte_pktmbuf_mtod(m, void *),
1901 			m->data_len);
1902 	} else {
1903 		mbuf->data_off = m->data_off;
1904 		mbuf->buf_physaddr = m->buf_physaddr;
1905 		mbuf->buf_addr = m->buf_addr;
1906 	}
1907 	mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1908 	mbuf->vlan_tci = vlan_tag;
1909 	mbuf->l2_len = sizeof(struct ether_hdr);
1910 	mbuf->l3_len = sizeof(struct ipv4_hdr);
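	/*
	 * Record the guest descriptor index so txmbuf_clean_zcp() can return it
	 * to the used ring once the NIC has finished with this mbuf.
	 */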
1911 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1912 
1913 	tx_q->m_table[len] = mbuf;
1914 	len++;
1915 
1916 	LOG_DEBUG(VHOST_DATA,
1917 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1918 		dev->device_fh,
1919 		mbuf->nb_segs,
1920 		(mbuf->next == NULL) ? "null" : "non-null");
1921 
1922 	if (enable_stats) {
1923 		dev_statistics[dev->device_fh].tx_total++;
1924 		dev_statistics[dev->device_fh].tx++;
1925 	}
1926 
1927 	if (unlikely(len == MAX_PKT_BURST)) {
1928 		m_table = (struct rte_mbuf **)tx_q->m_table;
1929 		ret = rte_eth_tx_burst(ports[0],
1930 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1931 
1932 		/*
1933 		 * Free any buffers not handled by TX and update
1934 		 * the port stats.
1935 		 */
1936 		if (unlikely(ret < len)) {
1937 			do {
1938 				rte_pktmbuf_free(m_table[ret]);
1939 			} while (++ret < len);
1940 		}
1941 
1942 		len = 0;
1943 		txmbuf_clean_zcp(dev, vpool);
1944 	}
1945 
1946 	tx_q->len = len;
1947 
1948 	return;
1949 }
1950 
1951 /*
1952  * This function transmits all available packets in the virtio TX queue of one
1953  * virtio-net device. For the first packet, it learns the MAC address and
1954  * sets up VMDq.
1955  */
1956 static inline void __attribute__((always_inline))
1957 virtio_dev_tx_zcp(struct virtio_net *dev)
1958 {
1959 	struct rte_mbuf m;
1960 	struct vhost_virtqueue *vq;
1961 	struct vring_desc *desc;
1962 	uint64_t buff_addr = 0, phys_addr;
1963 	uint32_t head[MAX_PKT_BURST];
1964 	uint32_t i;
1965 	uint16_t free_entries, packet_success = 0;
1966 	uint16_t avail_idx;
1967 	uint8_t need_copy = 0;
1968 	hpa_type addr_type;
1969 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1970 
1971 	vq = dev->virtqueue[VIRTIO_TXQ];
1972 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1973 
1974 	/* If there are no available buffers then return. */
1975 	if (vq->last_used_idx_res == avail_idx)
1976 		return;
1977 
1978 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1979 
1980 	/* Prefetch available ring to retrieve head indexes. */
1981 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1982 
1983 	/* Get the number of free entries in the ring */
1984 	free_entries = (avail_idx - vq->last_used_idx_res);
1985 
1986 	/* Limit to MAX_PKT_BURST. */
1987 	free_entries
1988 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1989 
1990 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1991 		dev->device_fh, free_entries);
1992 
1993 	/* Retrieve all of the head indexes first to avoid caching issues. */
1994 	for (i = 0; i < free_entries; i++)
1995 		head[i]
1996 			= vq->avail->ring[(vq->last_used_idx_res + i)
1997 			& (vq->size - 1)];
1998 
1999 	vq->last_used_idx_res += free_entries;
2000 
2001 	/* Prefetch descriptor index. */
2002 	rte_prefetch0(&vq->desc[head[packet_success]]);
2003 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2004 
2005 	while (packet_success < free_entries) {
2006 		desc = &vq->desc[head[packet_success]];
2007 
2008 		/* Discard first buffer as it is the virtio header */
2009 		desc = &vq->desc[desc->next];
2010 
2011 		/* Buffer address translation. */
2012 		buff_addr = gpa_to_vva(dev, desc->addr);
2013 		/* Need to check an extra VLAN_HLEN bytes for inserting the VLAN tag. */
2014 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2015 			&addr_type);
2016 
2017 		if (likely(packet_success < (free_entries - 1)))
2018 			/* Prefetch descriptor index. */
2019 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2020 
2021 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2022 			RTE_LOG(ERR, VHOST_DATA,
2023 				"(%"PRIu64") Invalid frame buffer address found"
2024 				" when transmitting packets!\n",
2025 				dev->device_fh);
2026 			packet_success++;
2027 			continue;
2028 		}
2029 
2030 		/* Prefetch buffer address. */
2031 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2032 
2033 		/*
2034 		 * Set up a dummy mbuf. It is copied to a real mbuf if the packet
2035 		 * is transmitted out of the physical port.
2036 		 */
2037 		m.data_len = desc->len;
2038 		m.nb_segs = 1;
2039 		m.next = NULL;
2040 		m.data_off = 0;
2041 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2042 		m.buf_physaddr = phys_addr;
2043 
2044 		/*
2045 		 * Check whether the frame buffer address from the guest crosses
2046 		 * a sub-region boundary or not.
2047 		 */
2048 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2049 			RTE_LOG(ERR, VHOST_DATA,
2050 				"(%"PRIu64") Frame buffer address crossing a "
2051 				"sub-region found when attaching TX frame "
2052 				"buffer address!\n",
2053 				dev->device_fh);
2054 			need_copy = 1;
2055 		} else
2056 			need_copy = 0;
2057 
2058 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2059 
2060 		/*
2061 		 * If this is the first received packet we need to learn
2062 		 * the MAC and setup VMDQ
2063 		 */
2064 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2065 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2066 				/*
2067 				 * Discard frame if device is scheduled for
2068 				 * removal or a duplicate MAC address is found.
2069 				 */
2070 				packet_success += free_entries;
2071 				vq->last_used_idx += packet_success;
2072 				break;
2073 			}
2074 		}
2075 
2076 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2077 		packet_success++;
2078 	}
2079 }
2080 
2081 /*
2082  * This function is called by each data core. It handles all RX/TX registered
2083  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2084  * addresses are compared with all devices in the main linked list.
2085  */
2086 static int
2087 switch_worker_zcp(__attribute__((unused)) void *arg)
2088 {
2089 	struct virtio_net *dev = NULL;
2090 	struct vhost_dev  *vdev = NULL;
2091 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2092 	struct virtio_net_data_ll *dev_ll;
2093 	struct mbuf_table *tx_q;
2094 	volatile struct lcore_ll_info *lcore_ll;
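	/*
	 * Convert BURST_TX_DRAIN_US (~100 us) into TSC cycles, rounding the
	 * cycles-per-microsecond value up.
	 */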
2095 	const uint64_t drain_tsc
2096 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2097 		* BURST_TX_DRAIN_US;
2098 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2099 	unsigned ret;
2100 	const uint16_t lcore_id = rte_lcore_id();
2101 	uint16_t count_in_ring, rx_count = 0;
2102 
2103 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2104 
2105 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2106 	prev_tsc = 0;
2107 
2108 	while (1) {
2109 		cur_tsc = rte_rdtsc();
2110 
2111 		/* TX burst queue drain */
2112 		diff_tsc = cur_tsc - prev_tsc;
2113 		if (unlikely(diff_tsc > drain_tsc)) {
2114 			/*
2115 			 * Get each mbuf from vpool.pool, detach it and
2116 			 * put it back into vpool.ring.
2117 			 */
2118 			dev_ll = lcore_ll->ll_root_used;
2119 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2120 				/* Get the vhost device and its virtio_net handle. */
2121 				vdev = dev_ll->vdev;
2122 				dev = vdev->dev;
2123 
2124 				if (likely(!vdev->remove)) {
2125 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2126 					if (tx_q->len) {
2127 						LOG_DEBUG(VHOST_DATA,
2128 						"TX queue drained after timeout"
2129 						" with burst size %u\n",
2130 						tx_q->len);
2131 
2132 						/*
2133 						 * Tx any packets in the queue
2134 						 */
2135 						ret = rte_eth_tx_burst(
2136 							ports[0],
2137 							(uint16_t)tx_q->txq_id,
2138 							(struct rte_mbuf **)
2139 							tx_q->m_table,
2140 							(uint16_t)tx_q->len);
2141 						if (unlikely(ret < tx_q->len)) {
2142 							do {
2143 								rte_pktmbuf_free(
2144 									tx_q->m_table[ret]);
2145 							} while (++ret < tx_q->len);
2146 						}
2147 						tx_q->len = 0;
2148 
2149 						txmbuf_clean_zcp(dev,
2150 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2151 					}
2152 				}
2153 				dev_ll = dev_ll->next;
2154 			}
2155 			prev_tsc = cur_tsc;
2156 		}
2157 
2158 		rte_prefetch0(lcore_ll->ll_root_used);
2159 
2160 		/*
2161 		 * Inform the configuration core that we have exited the linked
2162 		 * list and that no devices are in use if requested.
2163 		 */
2164 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2165 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2166 
2167 		/* Process devices */
2168 		dev_ll = lcore_ll->ll_root_used;
2169 
2170 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2171 			vdev = dev_ll->vdev;
2172 			dev  = vdev->dev;
2173 			if (unlikely(vdev->remove)) {
2174 				dev_ll = dev_ll->next;
2175 				unlink_vmdq(vdev);
2176 				vdev->ready = DEVICE_SAFE_REMOVE;
2177 				continue;
2178 			}
2179 
2180 			if (likely(vdev->ready == DEVICE_RX)) {
2181 				uint32_t index = vdev->vmdq_rx_q;
2182 				uint16_t i;
2183 				count_in_ring
2184 				= rte_ring_count(vpool_array[index].ring);
2185 				uint16_t free_entries
2186 				= (uint16_t)get_available_ring_num_zcp(dev);
2187 
2188 				/*
2189 				 * Attach all mbufs in vpool.ring and put back
2190 				 * into vpool.pool.
2191 				 */
2192 				for (i = 0;
2193 				i < RTE_MIN(free_entries,
2194 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2195 				i++)
2196 					attach_rxmbuf_zcp(dev);
2197 
2198 				/* Handle guest RX */
2199 				rx_count = rte_eth_rx_burst(ports[0],
2200 					vdev->vmdq_rx_q, pkts_burst,
2201 					MAX_PKT_BURST);
2202 
2203 				if (rx_count) {
2204 					ret_count = virtio_dev_rx_zcp(dev,
2205 							pkts_burst, rx_count);
2206 					if (enable_stats) {
2207 						dev_statistics[dev->device_fh].rx_total
2208 							+= rx_count;
2209 						dev_statistics[dev->device_fh].rx
2210 							+= ret_count;
2211 					}
2212 					while (likely(rx_count)) {
2213 						rx_count--;
2214 						pktmbuf_detach_zcp(
2215 							pkts_burst[rx_count]);
2216 						rte_ring_sp_enqueue(
2217 							vpool_array[index].ring,
2218 							(void *)pkts_burst[rx_count]);
2219 					}
2220 				}
2221 			}
2222 
2223 			if (likely(!vdev->remove))
2224 				/* Handle guest TX */
2225 				virtio_dev_tx_zcp(dev);
2226 
2227 			/* Move to the next device in the list */
2228 			dev_ll = dev_ll->next;
2229 		}
2230 	}
2231 
2232 	return 0;
2233 }
2234 
2235 
2236 /*
2237  * Add an entry to a used linked list. A free entry must first be found
2238  * in the free linked list using get_data_ll_free_entry();
2239  */
2240 static void
2241 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2242 	struct virtio_net_data_ll *ll_dev)
2243 {
2244 	struct virtio_net_data_ll *ll = *ll_root_addr;
2245 
2246 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2247 	ll_dev->next = NULL;
2248 	rte_compiler_barrier();
2249 
2250 	/* If ll == NULL then this is the first device. */
2251 	if (ll) {
2252 		/* Increment to the tail of the linked list. */
2253 		while (ll->next != NULL)
2254 			ll = ll->next;
2255 
2256 		ll->next = ll_dev;
2257 	} else {
2258 		*ll_root_addr = ll_dev;
2259 	}
2260 }
2261 
2262 /*
2263  * Remove an entry from a used linked list. The entry must then be added to
2264  * the free linked list using put_data_ll_free_entry().
2265  */
2266 static void
2267 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2268 	struct virtio_net_data_ll *ll_dev,
2269 	struct virtio_net_data_ll *ll_dev_last)
2270 {
2271 	struct virtio_net_data_ll *ll = *ll_root_addr;
2272 
2273 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2274 		return;
2275 
2276 	if (ll_dev == ll)
2277 		*ll_root_addr = ll_dev->next;
2278 	else
2279 		if (likely(ll_dev_last != NULL))
2280 			ll_dev_last->next = ll_dev->next;
2281 		else
2282 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2283 }
2284 
2285 /*
2286  * Find and return an entry from the free linked list.
2287  */
2288 static struct virtio_net_data_ll *
2289 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2290 {
2291 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2292 	struct virtio_net_data_ll *ll_dev;
2293 
2294 	if (ll_free == NULL)
2295 		return NULL;
2296 
2297 	ll_dev = ll_free;
2298 	*ll_root_addr = ll_free->next;
2299 
2300 	return ll_dev;
2301 }
2302 
2303 /*
2304  * Place an entry back on to the free linked list.
2305  */
2306 static void
2307 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2308 	struct virtio_net_data_ll *ll_dev)
2309 {
2310 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2311 
2312 	if (ll_dev == NULL)
2313 		return;
2314 
2315 	ll_dev->next = ll_free;
2316 	*ll_root_addr = ll_dev;
2317 }
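
/*
 * Typical life cycle of a list entry (a sketch of how new_device() and
 * destroy_device() below use these helpers):
 *
 *	ll_dev = get_data_ll_free_entry(&ll_root_free);
 *	ll_dev->vdev = vdev;
 *	add_data_ll_entry(&ll_root_used, ll_dev);
 *	...
 *	rm_data_ll_entry(&ll_root_used, ll_dev, ll_dev_last);
 *	put_data_ll_free_entry(&ll_root_free, ll_dev);
 */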
2318 
2319 /*
2320  * Creates a linked list of a given size.
2321  */
2322 static struct virtio_net_data_ll *
2323 alloc_data_ll(uint32_t size)
2324 {
2325 	struct virtio_net_data_ll *ll_new;
2326 	uint32_t i;
2327 
2328 	/* Malloc and then chain the linked list. */
2329 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2330 	if (ll_new == NULL) {
2331 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2332 		return NULL;
2333 	}
2334 
2335 	for (i = 0; i < size - 1; i++) {
2336 		ll_new[i].vdev = NULL;
2337 		ll_new[i].next = &ll_new[i+1];
2338 	}
2339 	ll_new[i].next = NULL;
2340 
2341 	return ll_new;
2342 }
2343 
2344 /*
2345  * Create the main linked list along with each individual core's linked list. A used and a free list
2346  * are created to manage entries.
2347  */
2348 static int
2349 init_data_ll (void)
2350 {
2351 	int lcore;
2352 
2353 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2354 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2355 		if (lcore_info[lcore].lcore_ll == NULL) {
2356 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2357 			return -1;
2358 		}
2359 
2360 		lcore_info[lcore].lcore_ll->device_num = 0;
2361 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2362 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2363 		if (num_devices % num_switching_cores)
2364 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2365 		else
2366 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2367 	}
2368 
2369 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2370 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2371 
2372 	return 0;
2373 }
2374 
2375 /*
2376  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2377  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2378  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2379  */
2380 static void
2381 destroy_device (volatile struct virtio_net *dev)
2382 {
2383 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2384 	struct virtio_net_data_ll *ll_main_dev_cur;
2385 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2386 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2387 	struct vhost_dev *vdev;
2388 	int lcore;
2389 
2390 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2391 
2392 	vdev = (struct vhost_dev *)dev->priv;
2393 	/* Set the remove flag. */
2394 	vdev->remove = 1;
2395 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2396 		rte_pause();
2397 	}
2398 
2399 	/* Search for entry to be removed from lcore ll */
2400 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2401 	while (ll_lcore_dev_cur != NULL) {
2402 		if (ll_lcore_dev_cur->vdev == vdev) {
2403 			break;
2404 		} else {
2405 			ll_lcore_dev_last = ll_lcore_dev_cur;
2406 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2407 		}
2408 	}
2409 
2410 	if (ll_lcore_dev_cur == NULL) {
2411 		RTE_LOG(ERR, VHOST_CONFIG,
2412 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2413 			dev->device_fh);
2414 		return;
2415 	}
2416 
2417 	/* Search for entry to be removed from main ll */
2418 	ll_main_dev_cur = ll_root_used;
2419 	ll_main_dev_last = NULL;
2420 	while (ll_main_dev_cur != NULL) {
2421 		if (ll_main_dev_cur->vdev == vdev) {
2422 			break;
2423 		} else {
2424 			ll_main_dev_last = ll_main_dev_cur;
2425 			ll_main_dev_cur = ll_main_dev_cur->next;
2426 		}
2427 	}
2428 
2429 	/* Remove entries from the lcore and main ll. */
2430 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2431 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2432 
2433 	/* Set the dev_removal_flag on each lcore. */
2434 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2435 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2436 	}
2437 
2438 	/*
2439 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2440 	 * they can no longer access the device removed from the linked lists and that the devices
2441 	 * are no longer in use.
2442 	 */
2443 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2444 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2445 			rte_pause();
2446 		}
2447 	}
2448 
2449 	/* Add the entries back to the lcore and main free ll.*/
2450 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2451 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2452 
2453 	/* Decrement number of device on the lcore. */
2454 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2455 
2456 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2457 
2458 	if (zero_copy) {
2459 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2460 
2461 		/* Stop the RX queue. */
2462 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2463 			LOG_DEBUG(VHOST_CONFIG,
2464 				"(%"PRIu64") In destroy_device: Failed to stop "
2465 				"rx queue:%d\n",
2466 				dev->device_fh,
2467 				vdev->vmdq_rx_q);
2468 		}
2469 
2470 		LOG_DEBUG(VHOST_CONFIG,
2471 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2472 			"mempool back to ring for RX queue: %d\n",
2473 			dev->device_fh, vdev->vmdq_rx_q);
2474 
2475 		mbuf_destroy_zcp(vpool);
2476 
2477 		/* Stop the TX queue. */
2478 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2479 			LOG_DEBUG(VHOST_CONFIG,
2480 				"(%"PRIu64") In destroy_device: Failed to "
2481 				"stop tx queue:%d\n",
2482 				dev->device_fh, vdev->vmdq_rx_q);
2483 		}
2484 
2485 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2486 
2487 		LOG_DEBUG(VHOST_CONFIG,
2488 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2489 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2490 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2491 			dev->device_fh);
2492 
2493 		mbuf_destroy_zcp(vpool);
2494 		rte_free(vdev->regions_hpa);
2495 	}
2496 	rte_free(vdev);
2497 
2498 }
2499 
2500 /*
2501  * Calculate the number of physically contiguous regions within one particular
2502  * region whose vhost virtual address range is contiguous. The region
2503  * starts at vva_start and is 'size' bytes long.
2504  */
2505 static uint32_t
2506 check_hpa_regions(uint64_t vva_start, uint64_t size)
2507 {
2508 	uint32_t i, nregions = 0, page_size = getpagesize();
2509 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2510 	if (vva_start % page_size) {
2511 		LOG_DEBUG(VHOST_CONFIG,
2512 			"in check_continuous: vva start(%p) mod page_size(%d) "
2513 			"has remainder\n",
2514 			(void *)(uintptr_t)vva_start, page_size);
2515 		return 0;
2516 	}
2517 	if (size % page_size) {
2518 		LOG_DEBUG(VHOST_CONFIG,
2519 			"in check_continuous: "
2520 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2521 			size, page_size);
2522 		return 0;
2523 	}
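	/*
	 * Walk the region one page at a time; every physical discontinuity
	 * between adjacent pages means one more sub-region will be needed.
	 */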
2524 	for (i = 0; i < size - page_size; i = i + page_size) {
2525 		cur_phys_addr
2526 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2527 		next_phys_addr = rte_mem_virt2phy(
2528 			(void *)(uintptr_t)(vva_start + i + page_size));
2529 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2530 			++nregions;
2531 			LOG_DEBUG(VHOST_CONFIG,
2532 				"in check_continuous: hva addr:(%p) is not "
2533 				"continuous with hva addr:(%p), diff:%d\n",
2534 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2535 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2536 				+ page_size), page_size);
2537 			LOG_DEBUG(VHOST_CONFIG,
2538 				"in check_continuous: hpa addr:(%p) is not "
2539 				"continuous with hpa addr:(%p), "
2540 				"diff:(%"PRIu64")\n",
2541 				(void *)(uintptr_t)cur_phys_addr,
2542 				(void *)(uintptr_t)next_phys_addr,
2543 				(next_phys_addr-cur_phys_addr));
2544 		}
2545 	}
2546 	return nregions;
2547 }
2548 
2549 /*
2550  * Divide each region whose vhost virtual address range is contiguous into
2551  * sub-regions, making sure the physical addresses within each sub-region are
2552  * contiguous, and fill the offset (to GPA), size and other information of
2553  * each sub-region into regions_hpa.
2554  */
2555 static uint32_t
2556 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2557 {
2558 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2559 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2560 
2561 	if (mem_region_hpa == NULL)
2562 		return 0;
2563 
2564 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2565 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2566 			virtio_memory->regions[regionidx].address_offset;
2567 		mem_region_hpa[regionidx_hpa].guest_phys_address
2568 			= virtio_memory->regions[regionidx].guest_phys_address;
2569 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2570 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2571 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2572 		LOG_DEBUG(VHOST_CONFIG,
2573 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2574 			regionidx_hpa,
2575 			(void *)(uintptr_t)
2576 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2577 		LOG_DEBUG(VHOST_CONFIG,
2578 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2579 			regionidx_hpa,
2580 			(void *)(uintptr_t)
2581 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2582 		for (i = 0, k = 0;
2583 			i < virtio_memory->regions[regionidx].memory_size -
2584 				page_size;
2585 			i += page_size) {
2586 			cur_phys_addr = rte_mem_virt2phy(
2587 					(void *)(uintptr_t)(vva_start + i));
2588 			next_phys_addr = rte_mem_virt2phy(
2589 					(void *)(uintptr_t)(vva_start +
2590 					i + page_size));
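			/*
			 * A physical discontinuity closes the current sub-region and
			 * starts the next one at the same guest physical address.
			 */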
2591 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2592 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2593 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2594 					k + page_size;
2595 				mem_region_hpa[regionidx_hpa].memory_size
2596 					= k + page_size;
2597 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2598 					"phys addr end  [%d]:(%p)\n",
2599 					regionidx_hpa,
2600 					(void *)(uintptr_t)
2601 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2602 				LOG_DEBUG(VHOST_CONFIG,
2603 					"in fill_hpa_regions: guest phys addr "
2604 					"size [%d]:(%p)\n",
2605 					regionidx_hpa,
2606 					(void *)(uintptr_t)
2607 					(mem_region_hpa[regionidx_hpa].memory_size));
2608 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2609 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2610 				++regionidx_hpa;
2611 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2612 					next_phys_addr -
2613 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2614 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2615 					" phys addr start[%d]:(%p)\n",
2616 					regionidx_hpa,
2617 					(void *)(uintptr_t)
2618 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2619 				LOG_DEBUG(VHOST_CONFIG,
2620 					"in fill_hpa_regions: host  phys addr "
2621 					"start[%d]:(%p)\n",
2622 					regionidx_hpa,
2623 					(void *)(uintptr_t)
2624 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2625 				k = 0;
2626 			} else {
2627 				k += page_size;
2628 			}
2629 		}
2630 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2631 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2632 			+ k + page_size;
2633 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2634 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2635 			"[%d]:(%p)\n", regionidx_hpa,
2636 			(void *)(uintptr_t)
2637 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2638 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2639 			"[%d]:(%p)\n", regionidx_hpa,
2640 			(void *)(uintptr_t)
2641 			(mem_region_hpa[regionidx_hpa].memory_size));
2642 		++regionidx_hpa;
2643 	}
2644 	return regionidx_hpa;
2645 }
2646 
2647 /*
2648  * A new device is added to a data core. First the device is added to the main linked list
2649  * and then allocated to a specific data core.
2650  */
2651 static int
2652 new_device (struct virtio_net *dev)
2653 {
2654 	struct virtio_net_data_ll *ll_dev;
2655 	int lcore, core_add = 0;
2656 	uint32_t device_num_min = num_devices;
2657 	struct vhost_dev *vdev;
2658 	uint32_t regionidx;
2659 
2660 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2661 	if (vdev == NULL) {
2662 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2663 			dev->device_fh);
2664 		return -1;
2665 	}
2666 	vdev->dev = dev;
2667 	dev->priv = vdev;
2668 
2669 	if (zero_copy) {
2670 		vdev->nregions_hpa = dev->mem->nregions;
2671 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2672 			vdev->nregions_hpa
2673 				+= check_hpa_regions(
2674 					dev->mem->regions[regionidx].guest_phys_address
2675 					+ dev->mem->regions[regionidx].address_offset,
2676 					dev->mem->regions[regionidx].memory_size);
2677 
2678 		}
2679 
2680 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2681 					       vdev->nregions_hpa,
2682 					       sizeof(struct virtio_memory_regions_hpa),
2683 					       RTE_CACHE_LINE_SIZE);
2684 		if (vdev->regions_hpa == NULL) {
2685 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2686 			rte_free(vdev);
2687 			return -1;
2688 		}
2689 
2690 
2691 		if (fill_hpa_memory_regions(
2692 			vdev->regions_hpa, dev->mem
2693 			) != vdev->nregions_hpa) {
2694 
2695 			RTE_LOG(ERR, VHOST_CONFIG,
2696 				"hpa memory regions number mismatch: "
2697 				"[%d]\n", vdev->nregions_hpa);
2698 			rte_free(vdev->regions_hpa);
2699 			rte_free(vdev);
2700 			return -1;
2701 		}
2702 	}
2703 
2704 
2705 	/* Add device to main ll */
2706 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2707 	if (ll_dev == NULL) {
2708 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2709 			"of %d devices per core has been reached\n",
2710 			dev->device_fh, num_devices);
2711 		if (vdev->regions_hpa)
2712 			rte_free(vdev->regions_hpa);
2713 		rte_free(vdev);
2714 		return -1;
2715 	}
2716 	ll_dev->vdev = vdev;
2717 	add_data_ll_entry(&ll_root_used, ll_dev);
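	/*
	 * Map the device to the first RX queue of its own VMDq pool; presumably
	 * device_fh strides the pools and vmdq_queue_base is the first VMDq queue.
	 */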
2718 	vdev->vmdq_rx_q
2719 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2720 
2721 	if (zero_copy) {
2722 		uint32_t index = vdev->vmdq_rx_q;
2723 		uint32_t count_in_ring, i;
2724 		struct mbuf_table *tx_q;
2725 
2726 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2727 
2728 		LOG_DEBUG(VHOST_CONFIG,
2729 			"(%"PRIu64") in new_device: mbuf count in mempool "
2730 			"before attach is: %d\n",
2731 			dev->device_fh,
2732 			rte_mempool_count(vpool_array[index].pool));
2733 		LOG_DEBUG(VHOST_CONFIG,
2734 			"(%"PRIu64") in new_device: mbuf count in  ring "
2735 			"before attach  is : %d\n",
2736 			dev->device_fh, count_in_ring);
2737 
2738 		/*
2739 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2740 		 */
2741 		for (i = 0; i < count_in_ring; i++)
2742 			attach_rxmbuf_zcp(dev);
2743 
2744 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2745 			"mempool after attach is: %d\n",
2746 			dev->device_fh,
2747 			rte_mempool_count(vpool_array[index].pool));
2748 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2749 			"ring after attach  is : %d\n",
2750 			dev->device_fh,
2751 			rte_ring_count(vpool_array[index].ring));
2752 
2753 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2754 		tx_q->txq_id = vdev->vmdq_rx_q;
2755 
2756 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2757 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2758 
2759 			LOG_DEBUG(VHOST_CONFIG,
2760 				"(%"PRIu64") In new_device: Failed to start "
2761 				"tx queue:%d\n",
2762 				dev->device_fh, vdev->vmdq_rx_q);
2763 
2764 			mbuf_destroy_zcp(vpool);
2765 			rte_free(vdev->regions_hpa);
2766 			rte_free(vdev);
2767 			return -1;
2768 		}
2769 
2770 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2771 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2772 
2773 			LOG_DEBUG(VHOST_CONFIG,
2774 				"(%"PRIu64") In new_device: Failed to start "
2775 				"rx queue:%d\n",
2776 				dev->device_fh, vdev->vmdq_rx_q);
2777 
2778 			/* Stop the TX queue. */
2779 			if (rte_eth_dev_tx_queue_stop(ports[0],
2780 				vdev->vmdq_rx_q) != 0) {
2781 				LOG_DEBUG(VHOST_CONFIG,
2782 					"(%"PRIu64") In new_device: Failed to "
2783 					"stop tx queue:%d\n",
2784 					dev->device_fh, vdev->vmdq_rx_q);
2785 			}
2786 
2787 			mbuf_destroy_zcp(vpool);
2788 			rte_free(vdev->regions_hpa);
2789 			rte_free(vdev);
2790 			return -1;
2791 		}
2792 
2793 	}
2794 
2795 	/* Reset the ready flag. */
2796 	vdev->ready = DEVICE_MAC_LEARNING;
2797 	vdev->remove = 0;
2798 
2799 	/* Find a suitable lcore to add the device. */
2800 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2801 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2802 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2803 			core_add = lcore;
2804 		}
2805 	}
2806 	/* Add device to lcore ll */
2807 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2808 	if (ll_dev == NULL) {
2809 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2810 		vdev->ready = DEVICE_SAFE_REMOVE;
2811 		destroy_device(dev);
2812 		rte_free(vdev->regions_hpa);
2813 		rte_free(vdev);
2814 		return -1;
2815 	}
2816 	ll_dev->vdev = vdev;
2817 	vdev->coreid = core_add;
2818 
2819 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2820 
2821 	/* Initialize device stats */
2822 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2823 
2824 	/* Disable notifications. */
2825 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2826 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2827 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2828 	dev->flags |= VIRTIO_DEV_RUNNING;
2829 
2830 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2831 
2832 	return 0;
2833 }
2834 
2835 /*
2836  * These callbacks allow devices to be added to the data cores when configuration
2837  * has been fully completed.
2838  */
2839 static const struct virtio_net_device_ops virtio_net_device_ops =
2840 {
2841 	.new_device =  new_device,
2842 	.destroy_device = destroy_device,
2843 };
2844 
2845 /*
2846  * This thread wakes up periodically to print stats if the user has
2847  * enabled them.
2848  */
2849 static void
2850 print_stats(void)
2851 {
2852 	struct virtio_net_data_ll *dev_ll;
2853 	uint64_t tx_dropped, rx_dropped;
2854 	uint64_t tx, tx_total, rx, rx_total;
2855 	uint32_t device_fh;
2856 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2857 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2858 
2859 	while(1) {
2860 		sleep(enable_stats);
2861 
2862 		/* Clear screen and move to top left */
2863 		printf("%s%s", clr, top_left);
2864 
2865 		printf("\nDevice statistics ====================================");
2866 
2867 		dev_ll = ll_root_used;
2868 		while (dev_ll != NULL) {
2869 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2870 			tx_total = dev_statistics[device_fh].tx_total;
2871 			tx = dev_statistics[device_fh].tx;
2872 			tx_dropped = tx_total - tx;
2873 			if (zero_copy == 0) {
2874 				rx_total = rte_atomic64_read(
2875 					&dev_statistics[device_fh].rx_total_atomic);
2876 				rx = rte_atomic64_read(
2877 					&dev_statistics[device_fh].rx_atomic);
2878 			} else {
2879 				rx_total = dev_statistics[device_fh].rx_total;
2880 				rx = dev_statistics[device_fh].rx;
2881 			}
2882 			rx_dropped = rx_total - rx;
2883 
2884 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2885 					"\nTX total: 		%"PRIu64""
2886 					"\nTX dropped: 		%"PRIu64""
2887 					"\nTX successful: 		%"PRIu64""
2888 					"\nRX total: 		%"PRIu64""
2889 					"\nRX dropped: 		%"PRIu64""
2890 					"\nRX successful: 		%"PRIu64"",
2891 					device_fh,
2892 					tx_total,
2893 					tx_dropped,
2894 					tx,
2895 					rx_total,
2896 					rx_dropped,
2897 					rx);
2898 
2899 			dev_ll = dev_ll->next;
2900 		}
2901 		printf("\n======================================================\n");
2902 	}
2903 }
2904 
2905 static void
2906 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2907 	char *ring_name, uint32_t nb_mbuf)
2908 {
2909 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2910 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2911 	if (vpool_array[index].pool != NULL) {
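		/*
		 * rte_ring sizes must be a power of two and one slot always stays
		 * empty, hence nb_mbuf + 1 rounded up to the next power of two.
		 */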
2912 		vpool_array[index].ring
2913 			= rte_ring_create(ring_name,
2914 				rte_align32pow2(nb_mbuf + 1),
2915 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2916 		if (likely(vpool_array[index].ring != NULL)) {
2917 			LOG_DEBUG(VHOST_CONFIG,
2918 				"in setup_mempool_tbl: mbuf count in "
2919 				"mempool is: %d\n",
2920 				rte_mempool_count(vpool_array[index].pool));
2921 			LOG_DEBUG(VHOST_CONFIG,
2922 				"in setup_mempool_tbl: mbuf count in "
2923 				"ring   is: %d\n",
2924 				rte_ring_count(vpool_array[index].ring));
2925 		} else {
2926 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2927 				ring_name);
2928 		}
2929 
2930 		/* Need to take the headroom into account. */
2931 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2932 	} else {
2933 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2934 	}
2935 }
2936 
2937 /* When we receive a SIGINT, unregister the vhost driver. */
2938 static void
2939 sigint_handler(__rte_unused int signum)
2940 {
2941 	/* Unregister vhost driver. */
2942 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2943 	if (ret != 0)
2944 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2945 	exit(0);
2946 }
2947 
2948 /*
2949  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2950  * device is also registered here to handle the IOCTLs.
2951  */
2952 int
2953 main(int argc, char *argv[])
2954 {
2955 	struct rte_mempool *mbuf_pool = NULL;
2956 	unsigned lcore_id, core_id = 0;
2957 	unsigned nb_ports, valid_num_ports;
2958 	int ret;
2959 	uint8_t portid;
2960 	uint16_t queue_id;
2961 	static pthread_t tid;
2962 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2963 
2964 	signal(SIGINT, sigint_handler);
2965 
2966 	/* init EAL */
2967 	ret = rte_eal_init(argc, argv);
2968 	if (ret < 0)
2969 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2970 	argc -= ret;
2971 	argv += ret;
2972 
2973 	/* parse app arguments */
2974 	ret = us_vhost_parse_args(argc, argv);
2975 	if (ret < 0)
2976 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2977 
2978 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2979 		if (rte_lcore_is_enabled(lcore_id))
2980 			lcore_ids[core_id ++] = lcore_id;
2981 
2982 	if (rte_lcore_count() > RTE_MAX_LCORE)
2983 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2984 
2985 	/* Set the number of switching cores available. */
2986 	num_switching_cores = rte_lcore_count()-1;
2987 
2988 	/* Get the number of physical ports. */
2989 	nb_ports = rte_eth_dev_count();
2990 	if (nb_ports > RTE_MAX_ETHPORTS)
2991 		nb_ports = RTE_MAX_ETHPORTS;
2992 
2993 	/*
2994 	 * Update the global var NUM_PORTS and global array PORTS
2995 	 * and get the value of VALID_NUM_PORTS according to the number of ports in the system
2996 	 */
2997 	valid_num_ports = check_ports_num(nb_ports);
2998 
2999 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3000 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
3001 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
3002 		return -1;
3003 	}
3004 
3005 	if (zero_copy == 0) {
3006 		/* Create the mbuf pool. */
3007 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3008 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3009 			0, MBUF_DATA_SIZE, rte_socket_id());
3010 		if (mbuf_pool == NULL)
3011 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3012 
3013 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3014 			vpool_array[queue_id].pool = mbuf_pool;
3015 
3016 		if (vm2vm_mode == VM2VM_HARDWARE) {
3017 			/* Enable VT loopback to let the L2 switch do the forwarding. */
3018 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3019 			LOG_DEBUG(VHOST_CONFIG,
3020 				"Enable loop back for L2 switch in vmdq.\n");
3021 		}
3022 	} else {
3023 		uint32_t nb_mbuf;
3024 		char pool_name[RTE_MEMPOOL_NAMESIZE];
3025 		char ring_name[RTE_MEMPOOL_NAMESIZE];
3026 
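		/*
		 * Size each per-queue zero-copy RX pool to cover the RX descriptor
		 * ring plus per-core burst and cache slack; one pool and one ring
		 * are created per VMDq queue below.
		 */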
3027 		nb_mbuf = num_rx_descriptor
3028 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3029 			+ num_switching_cores * MAX_PKT_BURST;
3030 
3031 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3032 			snprintf(pool_name, sizeof(pool_name),
3033 				"rxmbuf_pool_%u", queue_id);
3034 			snprintf(ring_name, sizeof(ring_name),
3035 				"rxmbuf_ring_%u", queue_id);
3036 			setup_mempool_tbl(rte_socket_id(), queue_id,
3037 				pool_name, ring_name, nb_mbuf);
3038 		}
3039 
3040 		nb_mbuf = num_tx_descriptor
3041 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3042 				+ num_switching_cores * MAX_PKT_BURST;
3043 
3044 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3045 			snprintf(pool_name, sizeof(pool_name),
3046 				"txmbuf_pool_%u", queue_id);
3047 			snprintf(ring_name, sizeof(ring_name),
3048 				"txmbuf_ring_%u", queue_id);
3049 			setup_mempool_tbl(rte_socket_id(),
3050 				(queue_id + MAX_QUEUES),
3051 				pool_name, ring_name, nb_mbuf);
3052 		}
3053 
3054 		if (vm2vm_mode == VM2VM_HARDWARE) {
3055 			/* Enable VT loopback to let the L2 switch do the forwarding. */
3056 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3057 			LOG_DEBUG(VHOST_CONFIG,
3058 				"Enable loop back for L2 switch in vmdq.\n");
3059 		}
3060 	}
3061 	/* Set log level. */
3062 	rte_set_log_level(LOG_LEVEL);
3063 
3064 	/* initialize all ports */
3065 	for (portid = 0; portid < nb_ports; portid++) {
3066 		/* skip ports that are not enabled */
3067 		if ((enabled_port_mask & (1 << portid)) == 0) {
3068 			RTE_LOG(INFO, VHOST_PORT,
3069 				"Skipping disabled port %d\n", portid);
3070 			continue;
3071 		}
3072 		if (port_init(portid) != 0)
3073 			rte_exit(EXIT_FAILURE,
3074 				"Cannot initialize network ports\n");
3075 	}
3076 
3077 	/* Initialise all linked lists. */
3078 	if (init_data_ll() == -1)
3079 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3080 
3081 	/* Initialize device stats */
3082 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3083 
3084 	/* Enable stats if the user option is set. */
3085 	if (enable_stats) {
3086 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3087 		if (ret != 0)
3088 			rte_exit(EXIT_FAILURE,
3089 				"Cannot create print-stats thread\n");
3090 
3091 		/* Set thread_name for aid in debugging.  */
3092 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3093 		ret = rte_thread_setname(tid, thread_name);
3094 		if (ret != 0)
3095 			RTE_LOG(ERR, VHOST_CONFIG,
3096 				"Cannot set print-stats name\n");
3097 	}
3098 
3099 	/* Launch all data cores. */
3100 	if (zero_copy == 0) {
3101 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3102 			rte_eal_remote_launch(switch_worker,
3103 				mbuf_pool, lcore_id);
3104 		}
3105 	} else {
3106 		uint32_t count_in_mempool, index, i;
3107 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3108 			/* For all RX and TX queues. */
3109 			count_in_mempool
3110 				= rte_mempool_count(vpool_array[index].pool);
3111 
3112 			/*
3113 			 * Transfer all un-attached mbufs from vpool.pool
3114 			 * to vpool.ring.
3115 			 */
3116 			for (i = 0; i < count_in_mempool; i++) {
3117 				struct rte_mbuf *mbuf
3118 					= __rte_mbuf_raw_alloc(
3119 						vpool_array[index].pool);
3120 				rte_ring_sp_enqueue(vpool_array[index].ring,
3121 						(void *)mbuf);
3122 			}
3123 
3124 			LOG_DEBUG(VHOST_CONFIG,
3125 				"in main: mbuf count in mempool at initial "
3126 				"is: %d\n", count_in_mempool);
3127 			LOG_DEBUG(VHOST_CONFIG,
3128 				"in main: mbuf count in  ring at initial  is :"
3129 				" %d\n",
3130 				rte_ring_count(vpool_array[index].ring));
3131 		}
3132 
3133 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3134 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3135 				lcore_id);
3136 	}
3137 
3138 	if (mergeable == 0)
3139 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3140 
3141 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3142 	ret = rte_vhost_driver_register((char *)&dev_basename);
3143 	if (ret != 0)
3144 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3145 
3146 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3147 
3148 	/* Start CUSE session. */
3149 	rte_vhost_driver_session_start();
3150 	return 0;
3151 
3152 }
3153