xref: /dpdk/examples/vhost/main.c (revision 67b6d3039e9edbc4624c878c6930be5e126e8b58)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
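/*
 * Sizing illustration (a sketch only, assuming num_switching_cores = 8 and the
 * default descriptor counts defined below): 512*1024 + 8*32 + 8*512 + 8*128
 * = 524288 + 256 + 4096 + 1024 = 529664 mbufs per port.
 */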
68 
69 #define MBUF_CACHE_SIZE	128
70 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
78 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
79 #define MBUF_CACHE_SIZE_ZCP 0
80 
81 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
82 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
83 
84 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
85 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
86 
87 #define JUMBO_FRAME_MAX_SIZE    0x2600
88 
89 /* State of virtio device. */
90 #define DEVICE_MAC_LEARNING 0
91 #define DEVICE_RX			1
92 #define DEVICE_SAFE_REMOVE	2
93 
94 /* Config_core_flag status definitions. */
95 #define REQUEST_DEV_REMOVAL 1
96 #define ACK_DEV_REMOVAL 0
97 
98 /* Configurable number of RX/TX ring descriptors */
99 #define RTE_TEST_RX_DESC_DEFAULT 1024
100 #define RTE_TEST_TX_DESC_DEFAULT 512
101 
102 /*
103  * These two macros need refining for the legacy and DPDK-based front ends:
104  * take the max vring avail descriptors/entries from the guest, subtract
105  * MAX_PKT_BURST, then round to a power of 2.
106  */
107 /*
108  * For the legacy front end, 128 descriptors:
109  * half for the virtio header, the other half for the mbuf.
110  */
111 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
112 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
113 
114 /* Get first 4 bytes in mbuf headroom. */
115 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
116 		+ sizeof(struct rte_mbuf)))
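/*
 * The zero copy path stashes the vring descriptor index in these 4 bytes
 * (see attach_rxmbuf_zcp()), so the descriptor can later be returned to the
 * used ring when the mbuf is recycled.
 */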
117 
118 /* true if x is a power of 2 */
119 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
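/* E.g. POWEROF2(64) holds since (63 & 64) == 0, while POWEROF2(48) fails since (47 & 48) == 32. */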
120 
121 #define INVALID_PORT_ID 0xFF
122 
123 /* Max number of devices. Limited by vmdq. */
124 #define MAX_DEVICES 64
125 
126 /* Size of buffers used for snprintfs. */
127 #define MAX_PRINT_BUFF 6072
128 
129 /* Maximum character device basename size. */
130 #define MAX_BASENAME_SZ 10
131 
132 /* Maximum long option length for option parsing. */
133 #define MAX_LONG_OPT_SZ 64
134 
135 /* Used to compare MAC addresses. */
136 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
137 
138 /* Number of descriptors per cacheline. */
139 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
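/* With a 64-byte cache line and a 16-byte struct vring_desc this is 4 (platform dependent). */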
140 
141 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
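/*
 * True when the mbuf's data buffer does not sit directly after the mbuf header,
 * i.e. an external buffer (such as a guest frame buffer in the zero copy path)
 * has been attached and must be detached before the mbuf is reused.
 */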
142 
143 /* mask of enabled ports */
144 static uint32_t enabled_port_mask = 0;
145 
146 /* Promiscuous mode */
147 static uint32_t promiscuous;
148 
149 /* Number of switching cores enabled */
150 static uint32_t num_switching_cores = 0;
151 
152 /* Number of devices/queues to support */
153 static uint32_t num_queues = 0;
154 static uint32_t num_devices;
155 
156 /*
157  * Enable zero copy; packet buffers are DMA'd directly to/from the HW
158  * descriptors. Disabled by default.
159  */
160 static uint32_t zero_copy;
161 static int mergeable;
162 
163 /* Do VLAN strip on the host, enabled by default */
164 static uint32_t vlan_strip = 1;
165 
166 /* Number of descriptors to apply */
167 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
168 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
169 
170 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
171 #define MAX_RING_DESC 4096
172 
173 struct vpool {
174 	struct rte_mempool *pool;
175 	struct rte_ring *ring;
176 	uint32_t buf_size;
177 } vpool_array[MAX_QUEUES+MAX_QUEUES];
178 
179 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
180 typedef enum {
181 	VM2VM_DISABLED = 0,
182 	VM2VM_SOFTWARE = 1,
183 	VM2VM_HARDWARE = 2,
184 	VM2VM_LAST
185 } vm2vm_type;
186 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
187 
188 /* The type of host physical address translated from guest physical address. */
189 typedef enum {
190 	PHYS_ADDR_CONTINUOUS = 0,
191 	PHYS_ADDR_CROSS_SUBREG = 1,
192 	PHYS_ADDR_INVALID = 2,
193 	PHYS_ADDR_LAST
194 } hpa_type;
195 
196 /* Enable stats. */
197 static uint32_t enable_stats = 0;
198 /* Enable retries on RX. */
199 static uint32_t enable_retry = 1;
200 /* Specify timeout (in microseconds) between retries on RX. */
201 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
202 /* Specify the number of retries on RX. */
203 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
204 
205 /* Character device basename. Can be set by user. */
206 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
207 
208 /* Empty VMDQ configuration structure. Filled in programmatically. */
209 static struct rte_eth_conf vmdq_conf_default = {
210 	.rxmode = {
211 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
212 		.split_hdr_size = 0,
213 		.header_split   = 0, /**< Header Split disabled */
214 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
215 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
216 		/*
217 		 * This is necessary for 1G NICs such as the I350; it fixes a
218 		 * bug where IPv4 forwarding in the guest cannot forward
219 		 * packets from one virtio device to another.
220 		 */
221 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
222 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
223 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
224 	},
225 
226 	.txmode = {
227 		.mq_mode = ETH_MQ_TX_NONE,
228 	},
229 	.rx_adv_conf = {
230 		/*
231 		 * should be overridden separately in code with
232 		 * appropriate values
233 		 */
234 		.vmdq_rx_conf = {
235 			.nb_queue_pools = ETH_8_POOLS,
236 			.enable_default_pool = 0,
237 			.default_pool = 0,
238 			.nb_pool_maps = 0,
239 			.pool_map = {{0, 0},},
240 		},
241 	},
242 };
243 
244 static unsigned lcore_ids[RTE_MAX_LCORE];
245 static uint8_t ports[RTE_MAX_ETHPORTS];
246 static unsigned num_ports = 0; /**< The number of ports specified in command line */
247 static uint16_t num_pf_queues, num_vmdq_queues;
248 static uint16_t vmdq_pool_base, vmdq_queue_base;
249 static uint16_t queues_per_pool;
250 
251 static const uint16_t external_pkt_default_vlan_tag = 2000;
252 const uint16_t vlan_tags[] = {
253 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
254 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
255 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
256 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
257 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
258 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
259 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
260 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
261 };
262 
263 /* ethernet addresses of ports */
264 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
265 
266 /* heads for the main used and free linked lists for the data path. */
267 static struct virtio_net_data_ll *ll_root_used = NULL;
268 static struct virtio_net_data_ll *ll_root_free = NULL;
269 
270 /* Array of data core structures containing information on individual core linked lists. */
271 static struct lcore_info lcore_info[RTE_MAX_LCORE];
272 
273 /* Used for queueing bursts of TX packets. */
274 struct mbuf_table {
275 	unsigned len;
276 	unsigned txq_id;
277 	struct rte_mbuf *m_table[MAX_PKT_BURST];
278 };
279 
280 /* TX queue for each data core. */
281 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
282 
283 /* TX queue for each virtio device for zero copy. */
284 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
285 
286 /* Vlan header struct used to insert vlan tags on TX. */
287 struct vlan_ethhdr {
288 	unsigned char   h_dest[ETH_ALEN];
289 	unsigned char   h_source[ETH_ALEN];
290 	__be16          h_vlan_proto;
291 	__be16          h_vlan_TCI;
292 	__be16          h_vlan_encapsulated_proto;
293 };
294 
295 /* IPv4 Header */
296 struct ipv4_hdr {
297 	uint8_t  version_ihl;		/**< version and header length */
298 	uint8_t  type_of_service;	/**< type of service */
299 	uint16_t total_length;		/**< length of packet */
300 	uint16_t packet_id;		/**< packet ID */
301 	uint16_t fragment_offset;	/**< fragmentation offset */
302 	uint8_t  time_to_live;		/**< time to live */
303 	uint8_t  next_proto_id;		/**< protocol ID */
304 	uint16_t hdr_checksum;		/**< header checksum */
305 	uint32_t src_addr;		/**< source address */
306 	uint32_t dst_addr;		/**< destination address */
307 } __attribute__((__packed__));
308 
309 /* Header lengths. */
310 #define VLAN_HLEN       4
311 #define VLAN_ETH_HLEN   18
312 
313 /* Per-device statistics struct */
314 struct device_statistics {
315 	uint64_t tx_total;
316 	rte_atomic64_t rx_total_atomic;
317 	uint64_t rx_total;
318 	uint64_t tx;
319 	rte_atomic64_t rx_atomic;
320 	uint64_t rx;
321 } __rte_cache_aligned;
322 struct device_statistics dev_statistics[MAX_DEVICES];
323 
324 /*
325  * Builds up the correct configuration for VMDQ VLAN pool map
326  * according to the pool & queue limits.
327  */
328 static inline int
329 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
330 {
331 	struct rte_eth_vmdq_rx_conf conf;
332 	struct rte_eth_vmdq_rx_conf *def_conf =
333 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
334 	unsigned i;
335 
336 	memset(&conf, 0, sizeof(conf));
337 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
338 	conf.nb_pool_maps = num_devices;
339 	conf.enable_loop_back = def_conf->enable_loop_back;
340 	conf.rx_mode = def_conf->rx_mode;
341 
342 	for (i = 0; i < conf.nb_pool_maps; i++) {
343 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
344 		conf.pool_map[i].pools = (1UL << i);
345 	}
346 
347 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
348 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
349 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
350 	return 0;
351 }
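/*
 * Illustration: with num_devices == 8 the loop above maps vlan_tags[0..7]
 * (1000..1007) one-to-one onto VMDQ pools 0..7, so each virtio device gets a
 * dedicated pool selected by the VLAN tag of its traffic.
 */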
352 
353 /*
354  * Validate the device number against the max pool number obtained from
355  * dev_info. If the device number is invalid, print an error message and
356  * return -1. Each device must have its own pool.
357  */
358 static inline int
359 validate_num_devices(uint32_t max_nb_devices)
360 {
361 	if (num_devices > max_nb_devices) {
362 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
363 		return -1;
364 	}
365 	return 0;
366 }
367 
368 /*
369  * Initialises a given port using global settings and with the rx buffers
370  * coming from the mbuf_pool passed as parameter
371  */
372 static inline int
373 port_init(uint8_t port)
374 {
375 	struct rte_eth_dev_info dev_info;
376 	struct rte_eth_conf port_conf;
377 	struct rte_eth_rxconf *rxconf;
378 	struct rte_eth_txconf *txconf;
379 	int16_t rx_rings, tx_rings;
380 	uint16_t rx_ring_size, tx_ring_size;
381 	int retval;
382 	uint16_t q;
383 
384 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
385 	rte_eth_dev_info_get (port, &dev_info);
386 
387 	if (dev_info.max_rx_queues > MAX_QUEUES) {
388 		rte_exit(EXIT_FAILURE,
389 			"please define MAX_QUEUES no less than %u in %s\n",
390 			dev_info.max_rx_queues, __FILE__);
391 	}
392 
393 	rxconf = &dev_info.default_rxconf;
394 	txconf = &dev_info.default_txconf;
395 	rxconf->rx_drop_en = 1;
396 
397 	/* Enable vlan offload */
398 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
399 
400 	/*
401 	 * Zero copy defers queue RX/TX start to the time when guest
402 	 * finishes its startup and packet buffers from that guest are
403 	 * available.
404 	 */
405 	if (zero_copy) {
406 		rxconf->rx_deferred_start = 1;
407 		rxconf->rx_drop_en = 0;
408 		txconf->tx_deferred_start = 1;
409 	}
410 
411 	/* Configure the number of supported virtio devices based on VMDQ limits */
412 	num_devices = dev_info.max_vmdq_pools;
413 
414 	if (zero_copy) {
415 		rx_ring_size = num_rx_descriptor;
416 		tx_ring_size = num_tx_descriptor;
417 		tx_rings = dev_info.max_tx_queues;
418 	} else {
419 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
420 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
421 		tx_rings = (uint16_t)rte_lcore_count();
422 	}
423 
424 	retval = validate_num_devices(MAX_DEVICES);
425 	if (retval < 0)
426 		return retval;
427 
428 	/* Get port configuration. */
429 	retval = get_eth_conf(&port_conf, num_devices);
430 	if (retval < 0)
431 		return retval;
432 	/* NIC queues are divided into pf queues and vmdq queues.  */
433 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
434 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
435 	num_vmdq_queues = num_devices * queues_per_pool;
436 	num_queues = num_pf_queues + num_vmdq_queues;
437 	vmdq_queue_base = dev_info.vmdq_queue_base;
438 	vmdq_pool_base  = dev_info.vmdq_pool_base;
439 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
440 		num_pf_queues, num_devices, queues_per_pool);
441 
442 	if (port >= rte_eth_dev_count()) return -1;
443 
444 	rx_rings = (uint16_t)dev_info.max_rx_queues;
445 	/* Configure ethernet device. */
446 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
447 	if (retval != 0)
448 		return retval;
449 
450 	/* Setup the queues. */
451 	for (q = 0; q < rx_rings; q ++) {
452 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
453 						rte_eth_dev_socket_id(port),
454 						rxconf,
455 						vpool_array[q].pool);
456 		if (retval < 0)
457 			return retval;
458 	}
459 	for (q = 0; q < tx_rings; q ++) {
460 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
461 						rte_eth_dev_socket_id(port),
462 						txconf);
463 		if (retval < 0)
464 			return retval;
465 	}
466 
467 	/* Start the device. */
468 	retval  = rte_eth_dev_start(port);
469 	if (retval < 0) {
470 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471 		return retval;
472 	}
473 
474 	if (promiscuous)
475 		rte_eth_promiscuous_enable(port);
476 
477 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481 			(unsigned)port,
482 			vmdq_ports_eth_addr[port].addr_bytes[0],
483 			vmdq_ports_eth_addr[port].addr_bytes[1],
484 			vmdq_ports_eth_addr[port].addr_bytes[2],
485 			vmdq_ports_eth_addr[port].addr_bytes[3],
486 			vmdq_ports_eth_addr[port].addr_bytes[4],
487 			vmdq_ports_eth_addr[port].addr_bytes[5]);
488 
489 	return 0;
490 }
491 
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498 	/* reject basenames that do not fit (strnlen caps at MAX_BASENAME_SZ) */
499 
500 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501 		return -1;
502 	else
503 		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504 
505 	return 0;
506 }
507 
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514 	char *end = NULL;
515 	unsigned long pm;
516 
517 	errno = 0;
518 
519 	/* parse hexadecimal string */
520 	pm = strtoul(portmask, &end, 16);
521 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522 		return -1;
523 
524 	if (pm == 0)
525 		return -1;
526 
527 	return pm;
528 
529 }
530 
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537 	char *end = NULL;
538 	unsigned long num;
539 
540 	errno = 0;
541 
542 	/* parse unsigned int string */
543 	num = strtoul(q_arg, &end, 10);
544 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545 		return -1;
546 
547 	if (num > max_valid_value)
548 		return -1;
549 
550 	return num;
551 
552 }
553 
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561 	"		--vm2vm [0|1|2]\n"
562 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 	"		--dev-basename <name>\n"
564 	"		--nb-devices ND\n"
565 	"		-p PORTMASK: Set mask for ports to be used by application\n"
566 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
568 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
569 	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
570 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
572 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
573 	"		--dev-basename: The basename to be used for the character device.\n"
574 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
575 			"zero copy\n"
576 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
577 			"used only when zero copy is enabled.\n"
578 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
579 			"used only when zero copy is enabled.\n",
580 	       prgname);
581 }
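/*
 * Illustrative invocation only (binary name and EAL core/memory options depend
 * on the local build and platform):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 --stats 2
 */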
582 
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589 	int opt, ret;
590 	int option_index;
591 	unsigned i;
592 	const char *prgname = argv[0];
593 	static struct option long_option[] = {
594 		{"vm2vm", required_argument, NULL, 0},
595 		{"rx-retry", required_argument, NULL, 0},
596 		{"rx-retry-delay", required_argument, NULL, 0},
597 		{"rx-retry-num", required_argument, NULL, 0},
598 		{"mergeable", required_argument, NULL, 0},
599 		{"vlan-strip", required_argument, NULL, 0},
600 		{"stats", required_argument, NULL, 0},
601 		{"dev-basename", required_argument, NULL, 0},
602 		{"zero-copy", required_argument, NULL, 0},
603 		{"rx-desc-num", required_argument, NULL, 0},
604 		{"tx-desc-num", required_argument, NULL, 0},
605 		{NULL, 0, 0, 0},
606 	};
607 
608 	/* Parse command line */
609 	while ((opt = getopt_long(argc, argv, "p:P",
610 			long_option, &option_index)) != EOF) {
611 		switch (opt) {
612 		/* Portmask */
613 		case 'p':
614 			enabled_port_mask = parse_portmask(optarg);
615 			if (enabled_port_mask == 0) {
616 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617 				us_vhost_usage(prgname);
618 				return -1;
619 			}
620 			break;
621 
622 		case 'P':
623 			promiscuous = 1;
624 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625 				ETH_VMDQ_ACCEPT_BROADCAST |
626 				ETH_VMDQ_ACCEPT_MULTICAST;
627 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
628 
629 			break;
630 
631 		case 0:
632 			/* Enable/disable vm2vm comms. */
633 			if (!strncmp(long_option[option_index].name, "vm2vm",
634 				MAX_LONG_OPT_SZ)) {
635 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
636 				if (ret == -1) {
637 					RTE_LOG(INFO, VHOST_CONFIG,
638 						"Invalid argument for "
639 						"vm2vm [0|1|2]\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				} else {
643 					vm2vm_mode = (vm2vm_type)ret;
644 				}
645 			}
646 
647 			/* Enable/disable retries on RX. */
648 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649 				ret = parse_num_opt(optarg, 1);
650 				if (ret == -1) {
651 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652 					us_vhost_usage(prgname);
653 					return -1;
654 				} else {
655 					enable_retry = ret;
656 				}
657 			}
658 
659 			/* Specify the retry delay time (in microseconds) on RX. */
660 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661 				ret = parse_num_opt(optarg, INT32_MAX);
662 				if (ret == -1) {
663 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664 					us_vhost_usage(prgname);
665 					return -1;
666 				} else {
667 					burst_rx_delay_time = ret;
668 				}
669 			}
670 
671 			/* Specify the number of retries on RX. */
672 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673 				ret = parse_num_opt(optarg, INT32_MAX);
674 				if (ret == -1) {
675 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676 					us_vhost_usage(prgname);
677 					return -1;
678 				} else {
679 					burst_rx_retry_num = ret;
680 				}
681 			}
682 
683 			/* Enable/disable RX mergeable buffers. */
684 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685 				ret = parse_num_opt(optarg, 1);
686 				if (ret == -1) {
687 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688 					us_vhost_usage(prgname);
689 					return -1;
690 				} else {
691 					mergeable = !!ret;
692 					if (ret) {
693 						vmdq_conf_default.rxmode.jumbo_frame = 1;
694 						vmdq_conf_default.rxmode.max_rx_pkt_len
695 							= JUMBO_FRAME_MAX_SIZE;
696 					}
697 				}
698 			}
699 
700 			/* Enable/disable RX VLAN strip on host. */
701 			if (!strncmp(long_option[option_index].name,
702 				"vlan-strip", MAX_LONG_OPT_SZ)) {
703 				ret = parse_num_opt(optarg, 1);
704 				if (ret == -1) {
705 					RTE_LOG(INFO, VHOST_CONFIG,
706 						"Invalid argument for VLAN strip [0|1]\n");
707 					us_vhost_usage(prgname);
708 					return -1;
709 				} else {
710 					vlan_strip = !!ret;
711 					vmdq_conf_default.rxmode.hw_vlan_strip =
712 						vlan_strip;
713 				}
714 			}
715 
716 			/* Enable/disable stats. */
717 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
718 				ret = parse_num_opt(optarg, INT32_MAX);
719 				if (ret == -1) {
720 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
721 					us_vhost_usage(prgname);
722 					return -1;
723 				} else {
724 					enable_stats = ret;
725 				}
726 			}
727 
728 			/* Set character device basename. */
729 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
730 				if (us_vhost_parse_basename(optarg) == -1) {
731 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
732 					us_vhost_usage(prgname);
733 					return -1;
734 				}
735 			}
736 
737 			/* Enable/disable rx/tx zero copy. */
738 			if (!strncmp(long_option[option_index].name,
739 				"zero-copy", MAX_LONG_OPT_SZ)) {
740 				ret = parse_num_opt(optarg, 1);
741 				if (ret == -1) {
742 					RTE_LOG(INFO, VHOST_CONFIG,
743 						"Invalid argument"
744 						" for zero-copy [0|1]\n");
745 					us_vhost_usage(prgname);
746 					return -1;
747 				} else
748 					zero_copy = ret;
749 			}
750 
751 			/* Specify the descriptor number on RX. */
752 			if (!strncmp(long_option[option_index].name,
753 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
754 				ret = parse_num_opt(optarg, MAX_RING_DESC);
755 				if ((ret == -1) || (!POWEROF2(ret))) {
756 					RTE_LOG(INFO, VHOST_CONFIG,
757 					"Invalid argument for rx-desc-num[0-N],"
758 					"power of 2 required.\n");
759 					us_vhost_usage(prgname);
760 					return -1;
761 				} else {
762 					num_rx_descriptor = ret;
763 				}
764 			}
765 
766 			/* Specify the descriptor number on TX. */
767 			if (!strncmp(long_option[option_index].name,
768 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
769 				ret = parse_num_opt(optarg, MAX_RING_DESC);
770 				if ((ret == -1) || (!POWEROF2(ret))) {
771 					RTE_LOG(INFO, VHOST_CONFIG,
772 					"Invalid argument for tx-desc-num [0-N],"
773 					"power of 2 required.\n");
774 					us_vhost_usage(prgname);
775 					return -1;
776 				} else {
777 					num_tx_descriptor = ret;
778 				}
779 			}
780 
781 			break;
782 
783 			/* Invalid option - print options. */
784 		default:
785 			us_vhost_usage(prgname);
786 			return -1;
787 		}
788 	}
789 
790 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
791 		if (enabled_port_mask & (1 << i))
792 			ports[num_ports++] = (uint8_t)i;
793 	}
794 
795 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
796 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
797 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
798 		return -1;
799 	}
800 
801 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
802 		RTE_LOG(INFO, VHOST_PORT,
803 			"Vhost zero copy doesn't support software vm2vm, "
804 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
805 		return -1;
806 	}
807 
808 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
809 		RTE_LOG(INFO, VHOST_PORT,
810 			"Vhost zero copy doesn't support jumbo frame, "
811 			"please specify '--mergeable 0' to disable the "
812 			"mergeable feature.\n");
813 		return -1;
814 	}
815 
816 	return 0;
817 }
818 
819 /*
820  * Update the global variable num_ports and the ports array according to the
821  * number of system ports, and return the number of valid ports.
822  */
823 static unsigned check_ports_num(unsigned nb_ports)
824 {
825 	unsigned valid_num_ports = num_ports;
826 	unsigned portid;
827 
828 	if (num_ports > nb_ports) {
829 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
830 			num_ports, nb_ports);
831 		num_ports = nb_ports;
832 	}
833 
834 	for (portid = 0; portid < num_ports; portid ++) {
835 		if (ports[portid] >= nb_ports) {
836 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
837 				ports[portid], (nb_ports - 1));
838 			ports[portid] = INVALID_PORT_ID;
839 			valid_num_ports--;
840 		}
841 	}
842 	return valid_num_ports;
843 }
844 
845 /*
846  * Macro to print out packet contents. Wrapped in debug define so that the
847  * data path is not affected when debug is disabled.
848  */
849 #ifdef DEBUG
850 #define PRINT_PACKET(device, addr, size, header) do {																\
851 	char *pkt_addr = (char*)(addr);																					\
852 	unsigned int index;																								\
853 	char packet[MAX_PRINT_BUFF];																					\
854 																													\
855 	if ((header))																									\
856 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
857 	else																											\
858 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
859 	for (index = 0; index < (size); index++) {																		\
860 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
861 			"%02hhx ", pkt_addr[index]);																			\
862 	}																												\
863 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
864 																													\
865 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
866 } while(0)
867 #else
868 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
869 #endif
870 
871 /*
872  * Function to convert guest physical addresses to vhost physical addresses.
873  * This is used to convert virtio buffer addresses.
874  */
875 static inline uint64_t __attribute__((always_inline))
876 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
877 	uint32_t buf_len, hpa_type *addr_type)
878 {
879 	struct virtio_memory_regions_hpa *region;
880 	uint32_t regionidx;
881 	uint64_t vhost_pa = 0;
882 
883 	*addr_type = PHYS_ADDR_INVALID;
884 
885 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
886 		region = &vdev->regions_hpa[regionidx];
887 		if ((guest_pa >= region->guest_phys_address) &&
888 			(guest_pa <= region->guest_phys_address_end)) {
889 			vhost_pa = region->host_phys_addr_offset + guest_pa;
890 			if (likely((guest_pa + buf_len - 1)
891 				<= region->guest_phys_address_end))
892 				*addr_type = PHYS_ADDR_CONTINUOUS;
893 			else
894 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
895 			break;
896 		}
897 	}
898 
899 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
900 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
901 		(void *)(uintptr_t)vhost_pa);
902 
903 	return vhost_pa;
904 }
905 
906 /*
907  * Compares a packet destination MAC address to a device MAC address.
908  */
909 static inline int __attribute__((always_inline))
910 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
911 {
912 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
913 }
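/*
 * Note: this loads 8 bytes starting at each 6-byte address; MAC_ADDR_CMP masks
 * off the two extra bytes (little-endian byte order assumed).
 */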
914 
915 /*
916  * This function learns the MAC address of the device and registers this along with a
917  * vlan tag to a VMDQ.
918  */
919 static int
920 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
921 {
922 	struct ether_hdr *pkt_hdr;
923 	struct virtio_net_data_ll *dev_ll;
924 	struct virtio_net *dev = vdev->dev;
925 	int i, ret;
926 
927 	/* Learn MAC address of guest device from packet */
928 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
929 
930 	dev_ll = ll_root_used;
931 
932 	while (dev_ll != NULL) {
933 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
934 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
935 			return -1;
936 		}
937 		dev_ll = dev_ll->next;
938 	}
939 
940 	for (i = 0; i < ETHER_ADDR_LEN; i++)
941 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
942 
943 	/* vlan_tag currently uses the device_id. */
944 	vdev->vlan_tag = vlan_tags[dev->device_fh];
945 
946 	/* Print out VMDQ registration info. */
947 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
948 		dev->device_fh,
949 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
950 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
951 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
952 		vdev->vlan_tag);
953 
954 	/* Register the MAC address. */
955 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
956 				(uint32_t)dev->device_fh + vmdq_pool_base);
957 	if (ret)
958 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
959 					dev->device_fh);
960 
961 	/* Enable stripping of the vlan tag as we handle routing. */
962 	if (vlan_strip)
963 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
964 			(uint16_t)vdev->vmdq_rx_q, 1);
965 
966 	/* Set device as ready for RX. */
967 	vdev->ready = DEVICE_RX;
968 
969 	return 0;
970 }
971 
972 /*
973  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
974  * queue before disabling RX on the device.
975  */
976 static inline void
977 unlink_vmdq(struct vhost_dev *vdev)
978 {
979 	unsigned i = 0;
980 	unsigned rx_count;
981 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
982 
983 	if (vdev->ready == DEVICE_RX) {
984 		/*clear MAC and VLAN settings*/
985 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
986 		for (i = 0; i < 6; i++)
987 			vdev->mac_address.addr_bytes[i] = 0;
988 
989 		vdev->vlan_tag = 0;
990 
991 		/*Clear out the receive buffers*/
992 		rx_count = rte_eth_rx_burst(ports[0],
993 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
994 
995 		while (rx_count) {
996 			for (i = 0; i < rx_count; i++)
997 				rte_pktmbuf_free(pkts_burst[i]);
998 
999 			rx_count = rte_eth_rx_burst(ports[0],
1000 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1001 		}
1002 
1003 		vdev->ready = DEVICE_MAC_LEARNING;
1004 	}
1005 }
1006 
1007 /*
1008  * Check if the packet destination MAC address is for a local device. If so then put
1009  * the packet on that device's RX queue. If not then return.
1010  */
1011 static inline int __attribute__((always_inline))
1012 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1013 {
1014 	struct virtio_net_data_ll *dev_ll;
1015 	struct ether_hdr *pkt_hdr;
1016 	uint64_t ret = 0;
1017 	struct virtio_net *dev = vdev->dev;
1018 	struct virtio_net *tdev; /* destination virtio device */
1019 
1020 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1021 
1022 	/*get the used devices list*/
1023 	dev_ll = ll_root_used;
1024 
1025 	while (dev_ll != NULL) {
1026 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1027 				          &dev_ll->vdev->mac_address)) {
1028 
1029 			/* Drop the packet if the TX packet is destined for the TX device. */
1030 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1031 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1032 							dev->device_fh);
1033 				return 0;
1034 			}
1035 			tdev = dev_ll->vdev->dev;
1036 
1037 
1038 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1039 
1040 			if (unlikely(dev_ll->vdev->remove)) {
1041 				/*drop the packet if the device is marked for removal*/
1042 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1043 			} else {
1044 				/*send the packet to the local virtio device*/
1045 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1046 				if (enable_stats) {
1047 					rte_atomic64_add(
1048 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1049 					1);
1050 					rte_atomic64_add(
1051 					&dev_statistics[tdev->device_fh].rx_atomic,
1052 					ret);
1053 					dev_statistics[tdev->device_fh].tx_total++;
1054 					dev_statistics[tdev->device_fh].tx += ret;
1055 				}
1056 			}
1057 
1058 			return 0;
1059 		}
1060 		dev_ll = dev_ll->next;
1061 	}
1062 
1063 	return -1;
1064 }
1065 
1066 /*
1067  * Check if the destination MAC of a packet belongs to a local VM,
1068  * and if so get its VLAN tag and offset.
1069  */
1070 static inline int __attribute__((always_inline))
1071 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1072 	uint32_t *offset, uint16_t *vlan_tag)
1073 {
1074 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1075 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1076 
1077 	while (dev_ll != NULL) {
1078 		if ((dev_ll->vdev->ready == DEVICE_RX)
1079 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1080 		&dev_ll->vdev->mac_address)) {
1081 			/*
1082 			 * Drop the packet if the TX packet is
1083 			 * destined for the TX device.
1084 			 */
1085 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1086 				LOG_DEBUG(VHOST_DATA,
1087 				"(%"PRIu64") TX: Source and destination"
1088 				" MAC addresses are the same. Dropping "
1089 				"packet.\n",
1090 				dev_ll->vdev->dev->device_fh);
1091 				return -1;
1092 			}
1093 
1094 			/*
1095 			 * HW VLAN strip reduces the packet length by the
1096 			 * length of the VLAN tag, so the packet length must
1097 			 * be restored by adding it back.
1098 			 */
1099 			*offset = VLAN_HLEN;
1100 			*vlan_tag =
1101 			(uint16_t)
1102 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1103 
1104 			LOG_DEBUG(VHOST_DATA,
1105 			"(%"PRIu64") TX: pkt to local VM device id:"
1106 			"(%"PRIu64") vlan tag: %d.\n",
1107 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1108 			(int)*vlan_tag);
1109 
1110 			break;
1111 		}
1112 		dev_ll = dev_ll->next;
1113 	}
1114 	return 0;
1115 }
1116 
1117 /*
1118  * This function routes the TX packet to the correct interface. This may be a local device
1119  * or the physical port.
1120  */
1121 static inline void __attribute__((always_inline))
1122 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1123 {
1124 	struct mbuf_table *tx_q;
1125 	struct rte_mbuf **m_table;
1126 	unsigned len, ret, offset = 0;
1127 	const uint16_t lcore_id = rte_lcore_id();
1128 	struct virtio_net *dev = vdev->dev;
1129 	struct ether_hdr *nh;
1130 
1131 	/*check if destination is local VM*/
1132 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1133 		rte_pktmbuf_free(m);
1134 		return;
1135 	}
1136 
1137 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1138 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1139 			rte_pktmbuf_free(m);
1140 			return;
1141 		}
1142 	}
1143 
1144 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1145 
1146 	/*Add packet to the port tx queue*/
1147 	tx_q = &lcore_tx_queue[lcore_id];
1148 	len = tx_q->len;
1149 
1150 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1151 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1152 		/* Guest has inserted the vlan tag. */
1153 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1154 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1155 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1156 			(vh->vlan_tci != vlan_tag_be))
1157 			vh->vlan_tci = vlan_tag_be;
1158 	} else {
1159 		m->ol_flags = PKT_TX_VLAN_PKT;
1160 
1161 		/*
1162 		 * Find the right seg to adjust the data len when offset is
1163 		 * bigger than tail room size.
1164 		 */
1165 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1166 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1167 				m->data_len += offset;
1168 			else {
1169 				struct rte_mbuf *seg = m;
1170 
1171 				while ((seg->next != NULL) &&
1172 					(offset > rte_pktmbuf_tailroom(seg)))
1173 					seg = seg->next;
1174 
1175 				seg->data_len += offset;
1176 			}
1177 			m->pkt_len += offset;
1178 		}
1179 
1180 		m->vlan_tci = vlan_tag;
1181 	}
1182 
1183 	tx_q->m_table[len] = m;
1184 	len++;
1185 	if (enable_stats) {
1186 		dev_statistics[dev->device_fh].tx_total++;
1187 		dev_statistics[dev->device_fh].tx++;
1188 	}
1189 
1190 	if (unlikely(len == MAX_PKT_BURST)) {
1191 		m_table = (struct rte_mbuf **)tx_q->m_table;
1192 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1193 		/* Free any buffers not handled by TX and update the port stats. */
1194 		if (unlikely(ret < len)) {
1195 			do {
1196 				rte_pktmbuf_free(m_table[ret]);
1197 			} while (++ret < len);
1198 		}
1199 
1200 		len = 0;
1201 	}
1202 
1203 	tx_q->len = len;
1204 	return;
1205 }
1206 /*
1207  * This function is called by each data core. It handles all RX/TX registered with the
1208  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1209  * with all devices in the main linked list.
1210  */
1211 static int
1212 switch_worker(void *arg)
1213 {
1214 	struct rte_mempool *mbuf_pool = arg;
1215 	struct virtio_net *dev = NULL;
1216 	struct vhost_dev *vdev = NULL;
1217 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1218 	struct virtio_net_data_ll *dev_ll;
1219 	struct mbuf_table *tx_q;
1220 	volatile struct lcore_ll_info *lcore_ll;
1221 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1222 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1223 	unsigned ret, i;
1224 	const uint16_t lcore_id = rte_lcore_id();
1225 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1226 	uint16_t rx_count = 0;
1227 	uint16_t tx_count;
1228 	uint32_t retry = 0;
1229 
1230 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1231 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1232 	prev_tsc = 0;
1233 
1234 	tx_q = &lcore_tx_queue[lcore_id];
1235 	for (i = 0; i < num_cores; i ++) {
1236 		if (lcore_ids[i] == lcore_id) {
1237 			tx_q->txq_id = i;
1238 			break;
1239 		}
1240 	}
1241 
1242 	while(1) {
1243 		cur_tsc = rte_rdtsc();
1244 		/*
1245 		 * TX burst queue drain
1246 		 */
1247 		diff_tsc = cur_tsc - prev_tsc;
1248 		if (unlikely(diff_tsc > drain_tsc)) {
1249 
1250 			if (tx_q->len) {
1251 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1252 
1253 				/*Tx any packets in the queue*/
1254 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1255 									   (struct rte_mbuf **)tx_q->m_table,
1256 									   (uint16_t)tx_q->len);
1257 				if (unlikely(ret < tx_q->len)) {
1258 					do {
1259 						rte_pktmbuf_free(tx_q->m_table[ret]);
1260 					} while (++ret < tx_q->len);
1261 				}
1262 
1263 				tx_q->len = 0;
1264 			}
1265 
1266 			prev_tsc = cur_tsc;
1267 
1268 		}
1269 
1270 		rte_prefetch0(lcore_ll->ll_root_used);
1271 		/*
1272 		 * Inform the configuration core that we have exited the linked list and that no devices are
1273 		 * in use if requested.
1274 		 */
1275 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1276 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1277 
1278 		/*
1279 		 * Process devices
1280 		 */
1281 		dev_ll = lcore_ll->ll_root_used;
1282 
1283 		while (dev_ll != NULL) {
1284 			/*get virtio device ID*/
1285 			vdev = dev_ll->vdev;
1286 			dev = vdev->dev;
1287 
1288 			if (unlikely(vdev->remove)) {
1289 				dev_ll = dev_ll->next;
1290 				unlink_vmdq(vdev);
1291 				vdev->ready = DEVICE_SAFE_REMOVE;
1292 				continue;
1293 			}
1294 			if (likely(vdev->ready == DEVICE_RX)) {
1295 				/*Handle guest RX*/
1296 				rx_count = rte_eth_rx_burst(ports[0],
1297 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298 
1299 				if (rx_count) {
1300 					/*
1301 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1302 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1303 					*/
1304 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1305 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1306 							rte_delay_us(burst_rx_delay_time);
1307 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1308 								break;
1309 						}
1310 					}
1311 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1312 					if (enable_stats) {
1313 						rte_atomic64_add(
1314 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1315 						rx_count);
1316 						rte_atomic64_add(
1317 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1318 					}
1319 					while (likely(rx_count)) {
1320 						rx_count--;
1321 						rte_pktmbuf_free(pkts_burst[rx_count]);
1322 					}
1323 
1324 				}
1325 			}
1326 
1327 			if (likely(!vdev->remove)) {
1328 				/* Handle guest TX*/
1329 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1330 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1331 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1332 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1333 						while (tx_count)
1334 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1335 					}
1336 				}
1337 				while (tx_count)
1338 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1339 			}
1340 
1341 			/*move to the next device in the list*/
1342 			dev_ll = dev_ll->next;
1343 		}
1344 	}
1345 
1346 	return 0;
1347 }
1348 
1349 /*
1350  * This function gets the number of available ring entries for zero copy RX.
1351  * Only one thread will call this function for a particular virtio device,
1352  * so it is designed as a non-thread-safe function.
1353  */
1354 static inline uint32_t __attribute__((always_inline))
1355 get_available_ring_num_zcp(struct virtio_net *dev)
1356 {
1357 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1358 	uint16_t avail_idx;
1359 
1360 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1362 }
1363 
1364 /*
1365  * This function gets the available ring index for zero copy RX;
1366  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1367  * Only one thread will call this function for a particular virtio device,
1368  * so it is designed as a non-thread-safe function.
1369  */
1370 static inline uint32_t __attribute__((always_inline))
1371 get_available_ring_index_zcp(struct virtio_net *dev,
1372 	uint16_t *res_base_idx, uint32_t count)
1373 {
1374 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1375 	uint16_t avail_idx;
1376 	uint32_t retry = 0;
1377 	uint16_t free_entries;
1378 
1379 	*res_base_idx = vq->last_used_idx_res;
1380 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1381 	free_entries = (avail_idx - *res_base_idx);
1382 
1383 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1384 			"avail idx: %d, "
1385 			"res base idx:%d, free entries:%d\n",
1386 			dev->device_fh, avail_idx, *res_base_idx,
1387 			free_entries);
1388 
1389 	/*
1390 	 * If retry is enabled and the queue is full then we wait
1391 	 * and retry to avoid packet loss.
1392 	 */
1393 	if (enable_retry && unlikely(count > free_entries)) {
1394 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1395 			rte_delay_us(burst_rx_delay_time);
1396 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1397 			free_entries = (avail_idx - *res_base_idx);
1398 			if (count <= free_entries)
1399 				break;
1400 		}
1401 	}
1402 
1403 	/*check that we have enough buffers*/
1404 	if (unlikely(count > free_entries))
1405 		count = free_entries;
1406 
1407 	if (unlikely(count == 0)) {
1408 		LOG_DEBUG(VHOST_DATA,
1409 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1410 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1411 			dev->device_fh, avail_idx,
1412 			*res_base_idx, free_entries);
1413 		return 0;
1414 	}
1415 
1416 	vq->last_used_idx_res = *res_base_idx + count;
1417 
1418 	return count;
1419 }
1420 
1421 /*
1422  * This function puts a descriptor back on the used list.
1423  */
1424 static inline void __attribute__((always_inline))
1425 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1426 {
1427 	uint16_t res_cur_idx = vq->last_used_idx;
1428 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1429 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1430 	rte_compiler_barrier();
1431 	*(volatile uint16_t *)&vq->used->idx += 1;
1432 	vq->last_used_idx += 1;
1433 
1434 	/* Kick the guest if necessary. */
1435 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1436 		eventfd_write(vq->callfd, (eventfd_t)1);
1437 }
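/*
 * Used-ring update protocol, as implemented above: write the id/len entry,
 * issue a compiler barrier so the entry is visible before the index update,
 * bump used->idx, then kick the guest through the callfd eventfd unless the
 * guest suppressed interrupts with VRING_AVAIL_F_NO_INTERRUPT.
 */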
1438 
1439 /*
1440  * This function gets an available descriptor from the virtio vring and an
1441  * unattached mbuf from vpool->ring, and then attaches them together. It must
1442  * adjust the offset for buff_addr and phys_addr according to the PMD
1443  * implementation, otherwise the frame data may be put in the wrong mbuf location.
1444  */
1445 static inline void __attribute__((always_inline))
1446 attach_rxmbuf_zcp(struct virtio_net *dev)
1447 {
1448 	uint16_t res_base_idx, desc_idx;
1449 	uint64_t buff_addr, phys_addr;
1450 	struct vhost_virtqueue *vq;
1451 	struct vring_desc *desc;
1452 	struct rte_mbuf *mbuf = NULL;
1453 	struct vpool *vpool;
1454 	hpa_type addr_type;
1455 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1456 
1457 	vpool = &vpool_array[vdev->vmdq_rx_q];
1458 	vq = dev->virtqueue[VIRTIO_RXQ];
1459 
1460 	do {
1461 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1462 				1) != 1))
1463 			return;
1464 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1465 
1466 		desc = &vq->desc[desc_idx];
1467 		if (desc->flags & VRING_DESC_F_NEXT) {
1468 			desc = &vq->desc[desc->next];
1469 			buff_addr = gpa_to_vva(dev, desc->addr);
1470 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1471 					&addr_type);
1472 		} else {
1473 			buff_addr = gpa_to_vva(dev,
1474 					desc->addr + vq->vhost_hlen);
1475 			phys_addr = gpa_to_hpa(vdev,
1476 					desc->addr + vq->vhost_hlen,
1477 					desc->len, &addr_type);
1478 		}
1479 
1480 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1481 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1482 				" address found when attaching RX frame buffer"
1483 				" address!\n", dev->device_fh);
1484 			put_desc_to_used_list_zcp(vq, desc_idx);
1485 			continue;
1486 		}
1487 
1488 		/*
1489 		 * Check if the frame buffer address from guest crosses
1490 		 * sub-region or not.
1491 		 */
1492 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1493 			RTE_LOG(ERR, VHOST_DATA,
1494 				"(%"PRIu64") Frame buffer address crossing "
1495 				"sub-region found when attaching RX frame "
1496 				"buffer address!\n",
1497 				dev->device_fh);
1498 			put_desc_to_used_list_zcp(vq, desc_idx);
1499 			continue;
1500 		}
1501 	} while (unlikely(phys_addr == 0));
1502 
1503 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1504 	if (unlikely(mbuf == NULL)) {
1505 		LOG_DEBUG(VHOST_DATA,
1506 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1507 			"ring_sc_dequeue fail.\n",
1508 			dev->device_fh);
1509 		put_desc_to_used_list_zcp(vq, desc_idx);
1510 		return;
1511 	}
1512 
1513 	if (unlikely(vpool->buf_size > desc->len)) {
1514 		LOG_DEBUG(VHOST_DATA,
1515 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1516 			"length(%d) of descriptor idx: %d less than room "
1517 			"size required: %d\n",
1518 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1519 		put_desc_to_used_list_zcp(vq, desc_idx);
1520 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1521 		return;
1522 	}
1523 
1524 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1525 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1526 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1527 	mbuf->data_len = desc->len;
1528 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1529 
1530 	LOG_DEBUG(VHOST_DATA,
1531 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1532 		"descriptor idx:%d\n",
1533 		dev->device_fh, res_base_idx, desc_idx);
1534 
1535 	__rte_mbuf_raw_free(mbuf);
1536 
1537 	return;
1538 }
1539 
1540 /*
1541  * Detach an attached packet mbuf -
1542  *  - restore original mbuf address and length values.
1543  *  - reset pktmbuf data and data_len to their default values.
1544  *  All other fields of the given packet mbuf will be left intact.
1545  *
1546  * @param m
1547  *   The attached packet mbuf.
1548  */
1549 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1550 {
1551 	const struct rte_mempool *mp = m->pool;
1552 	void *buf = rte_mbuf_to_baddr(m);
1553 	uint32_t buf_ofs;
1554 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1555 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1556 
1557 	m->buf_addr = buf;
1558 	m->buf_len = (uint16_t)buf_len;
1559 
1560 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1561 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1562 	m->data_off = buf_ofs;
1563 
1564 	m->data_len = 0;
1565 }
1566 
1567 /*
1568  * This function is called after packets have been transmitted. It fetches mbufs
1569  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1570  * the used index and kicks the guest if necessary.
1571  */
1572 static inline uint32_t __attribute__((always_inline))
1573 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1574 {
1575 	struct rte_mbuf *mbuf;
1576 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1577 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1578 	uint32_t index = 0;
1579 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1580 
1581 	LOG_DEBUG(VHOST_DATA,
1582 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1583 		"clean is: %d\n",
1584 		dev->device_fh, mbuf_count);
1585 	LOG_DEBUG(VHOST_DATA,
1586 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1587 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1588 		"clean is: %d\n",
1589 
1590 	for (index = 0; index < mbuf_count; index++) {
1591 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1592 		if (likely(MBUF_EXT_MEM(mbuf)))
1593 			pktmbuf_detach_zcp(mbuf);
1594 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1595 
1596 		/* Update used index buffer information. */
1597 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1598 		vq->used->ring[used_idx].len = 0;
1599 
1600 		used_idx = (used_idx + 1) & (vq->size - 1);
1601 	}
1602 
1603 	LOG_DEBUG(VHOST_DATA,
1604 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1605 		"clean is: %d\n",
1606 		dev->device_fh, rte_mempool_count(vpool->pool));
1607 	LOG_DEBUG(VHOST_DATA,
1608 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1609 		"clean is: %d\n",
1610 		dev->device_fh, rte_ring_count(vpool->ring));
1611 	LOG_DEBUG(VHOST_DATA,
1612 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1613 		"vq->last_used_idx:%d\n",
1614 		dev->device_fh, vq->last_used_idx);
1615 
1616 	vq->last_used_idx += mbuf_count;
1617 
1618 	LOG_DEBUG(VHOST_DATA,
1619 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1620 		"vq->last_used_idx:%d\n",
1621 		dev->device_fh, vq->last_used_idx);
1622 
1623 	rte_compiler_barrier();
1624 
1625 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1626 
1627 	/* Kick guest if required. */
1628 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1629 		eventfd_write(vq->callfd, (eventfd_t)1);
1630 
1631 	return 0;
1632 }
1633 
1634 /*
1635  * This function is called when a virtio device is destroyed. It fetches mbufs
1636  * from vpool->pool, detaches them, and puts them into vpool->ring.
1637  */
1638 static void mbuf_destroy_zcp(struct vpool *vpool)
1639 {
1640 	struct rte_mbuf *mbuf = NULL;
1641 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1642 
1643 	LOG_DEBUG(VHOST_CONFIG,
1644 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1645 		"mbuf_destroy_zcp is: %d\n",
1646 		mbuf_count);
1647 	LOG_DEBUG(VHOST_CONFIG,
1648 		"in mbuf_destroy_zcp: mbuf count in ring before "
1649 		"mbuf_destroy_zcp is: %d\n",
1650 		rte_ring_count(vpool->ring));
1651 
1652 	for (index = 0; index < mbuf_count; index++) {
1653 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1654 		if (likely(mbuf != NULL)) {
1655 			if (likely(MBUF_EXT_MEM(mbuf)))
1656 				pktmbuf_detach_zcp(mbuf);
1657 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1658 		}
1659 	}
1660 
1661 	LOG_DEBUG(VHOST_CONFIG,
1662 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1663 		"mbuf_destroy_zcp is: %d\n",
1664 		rte_mempool_count(vpool->pool));
1665 	LOG_DEBUG(VHOST_CONFIG,
1666 		"in mbuf_destroy_zcp: mbuf count in ring after "
1667 		"mbuf_destroy_zcp is : %d\n",
1668 		rte_ring_count(vpool->ring));
1669 }
1670 
1671 /*
1672  * This function updates the used ring and counter for zero copy RX.
1673  */
1674 static inline uint32_t __attribute__((always_inline))
1675 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1676 	uint32_t count)
1677 {
1678 	struct vhost_virtqueue *vq;
1679 	struct vring_desc *desc;
1680 	struct rte_mbuf *buff;
1681 	/* The virtio_hdr is initialised to 0. */
1682 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1683 		= {{0, 0, 0, 0, 0, 0}, 0};
1684 	uint64_t buff_hdr_addr = 0;
1685 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1686 	uint32_t head_idx, packet_success = 0;
1687 	uint16_t res_cur_idx;
1688 
1689 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1690 
1691 	if (count == 0)
1692 		return 0;
1693 
1694 	vq = dev->virtqueue[VIRTIO_RXQ];
1695 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1696 
1697 	res_cur_idx = vq->last_used_idx;
1698 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1699 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1700 
1701 	/* Retrieve all of the head indexes first to avoid caching issues. */
1702 	for (head_idx = 0; head_idx < count; head_idx++)
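	/* MBUF_HEADROOM_UINT32() reads back the descriptor index stored in the mbuf headroom. */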
1703 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1704 
1705 	/* Prefetch descriptor index. */
1706 	rte_prefetch0(&vq->desc[head[packet_success]]);
1707 
1708 	while (packet_success != count) {
1709 		/* Get descriptor from available ring */
1710 		desc = &vq->desc[head[packet_success]];
1711 
1712 		buff = pkts[packet_success];
1713 		LOG_DEBUG(VHOST_DATA,
1714 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1715 			"pkt[%d] descriptor idx: %d\n",
1716 			dev->device_fh, packet_success,
1717 			MBUF_HEADROOM_UINT32(buff));
1718 
1719 		PRINT_PACKET(dev,
1720 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1721 			+ RTE_PKTMBUF_HEADROOM),
1722 			rte_pktmbuf_data_len(buff), 0);
1723 
1724 		/* Buffer address translation for virtio header. */
1725 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1726 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1727 
1728 		/*
1729 		 * If the descriptors are chained the header and data are
1730 		 * placed in separate buffers.
1731 		 */
1732 		if (desc->flags & VRING_DESC_F_NEXT) {
1733 			desc->len = vq->vhost_hlen;
1734 			desc = &vq->desc[desc->next];
1735 			desc->len = rte_pktmbuf_data_len(buff);
1736 		} else {
1737 			desc->len = packet_len;
1738 		}
1739 
1740 		/* Update used ring with desc information */
1741 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1742 			= head[packet_success];
1743 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1744 			= packet_len;
1745 		res_cur_idx++;
1746 		packet_success++;
1747 
1748 		/* A header is required per buffer. */
1749 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1750 			(const void *)&virtio_hdr, vq->vhost_hlen);
1751 
1752 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1753 
1754 		if (likely(packet_success < count)) {
1755 			/* Prefetch descriptor index. */
1756 			rte_prefetch0(&vq->desc[head[packet_success]]);
1757 		}
1758 	}
1759 
1760 	rte_compiler_barrier();
1761 
1762 	LOG_DEBUG(VHOST_DATA,
1763 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1764 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1765 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1766 
1767 	*(volatile uint16_t *)&vq->used->idx += count;
1768 	vq->last_used_idx += count;
1769 
1770 	LOG_DEBUG(VHOST_DATA,
1771 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1772 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1773 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1774 
1775 	/* Kick the guest if necessary. */
1776 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1777 		eventfd_write(vq->callfd, (eventfd_t)1);
1778 
1779 	return count;
1780 }
1781 
1782 /*
1783  * This function routes the TX packet to the correct interface.
1784  * This may be a local device or the physical port.
1785  */
1786 static inline void __attribute__((always_inline))
1787 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1788 	uint32_t desc_idx, uint8_t need_copy)
1789 {
1790 	struct mbuf_table *tx_q;
1791 	struct rte_mbuf **m_table;
1792 	struct rte_mbuf *mbuf = NULL;
1793 	unsigned len, ret, offset = 0;
1794 	struct vpool *vpool;
1795 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1796 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1797 
1798 	/* Add packet to the port tx queue. */
1799 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1800 	len = tx_q->len;
1801 
1802 	/* Allocate an mbuf and populate the structure. */
1803 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1804 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1805 	if (unlikely(mbuf == NULL)) {
1806 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1807 		RTE_LOG(ERR, VHOST_DATA,
1808 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1809 			dev->device_fh);
1810 		put_desc_to_used_list_zcp(vq, desc_idx);
1811 		return;
1812 	}
1813 
1814 	if (vm2vm_mode == VM2VM_HARDWARE) {
1815 		/* Avoid using a VLAN tag from any VM for an external packet,
1816 		 * e.g. vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1817 		 * selection: the MAC address identifies it as an external packet
1818 		 * that should go out to the network, while the VLAN tag identifies
1819 		 * it as a VM2VM packet to be forwarded to another VM. The hardware
1820 		 * cannot resolve this ambiguity, so the packet would be lost.
1821 		 */
1822 		vlan_tag = external_pkt_default_vlan_tag;
1823 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1824 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1825 			__rte_mbuf_raw_free(mbuf);
1826 			return;
1827 		}
1828 	}
1829 
1830 	mbuf->nb_segs = m->nb_segs;
1831 	mbuf->next = m->next;
1832 	mbuf->data_len = m->data_len + offset;
1833 	mbuf->pkt_len = mbuf->data_len;
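	/*
	 * Either copy the guest data into the new mbuf, or, in the zero-copy
	 * case, point the mbuf directly at the guest buffer.
	 */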
1834 	if (unlikely(need_copy)) {
1835 		/* Copy the packet contents to the mbuf. */
1836 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1837 			rte_pktmbuf_mtod(m, void *),
1838 			m->data_len);
1839 	} else {
1840 		mbuf->data_off = m->data_off;
1841 		mbuf->buf_physaddr = m->buf_physaddr;
1842 		mbuf->buf_addr = m->buf_addr;
1843 	}
1844 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1845 	mbuf->vlan_tci = vlan_tag;
1846 	mbuf->l2_len = sizeof(struct ether_hdr);
1847 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1848 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1849 
1850 	tx_q->m_table[len] = mbuf;
1851 	len++;
1852 
1853 	LOG_DEBUG(VHOST_DATA,
1854 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1855 		dev->device_fh,
1856 		mbuf->nb_segs,
1857 		(mbuf->next == NULL) ? "null" : "non-null");
1858 
1859 	if (enable_stats) {
1860 		dev_statistics[dev->device_fh].tx_total++;
1861 		dev_statistics[dev->device_fh].tx++;
1862 	}
1863 
1864 	if (unlikely(len == MAX_PKT_BURST)) {
1865 		m_table = (struct rte_mbuf **)tx_q->m_table;
1866 		ret = rte_eth_tx_burst(ports[0],
1867 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1868 
1869 		/*
1870 		 * Free any buffers not handled by TX and update
1871 		 * the port stats.
1872 		 */
1873 		if (unlikely(ret < len)) {
1874 			do {
1875 				rte_pktmbuf_free(m_table[ret]);
1876 			} while (++ret < len);
1877 		}
1878 
1879 		len = 0;
1880 		txmbuf_clean_zcp(dev, vpool);
1881 	}
1882 
1883 	tx_q->len = len;
1884 
1885 	return;
1886 }
1887 
1888 /*
1889  * This function transmits all available packets in the virtio TX queue for
1890  * one virtio-net device. If it is the first packet, it learns the MAC
1891  * address and sets up the VMDq queue.
1892  */
1893 static inline void __attribute__((always_inline))
1894 virtio_dev_tx_zcp(struct virtio_net *dev)
1895 {
1896 	struct rte_mbuf m;
1897 	struct vhost_virtqueue *vq;
1898 	struct vring_desc *desc;
1899 	uint64_t buff_addr = 0, phys_addr;
1900 	uint32_t head[MAX_PKT_BURST];
1901 	uint32_t i;
1902 	uint16_t free_entries, packet_success = 0;
1903 	uint16_t avail_idx;
1904 	uint8_t need_copy = 0;
1905 	hpa_type addr_type;
1906 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1907 
1908 	vq = dev->virtqueue[VIRTIO_TXQ];
1909 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1910 
1911 	/* If there are no available buffers then return. */
1912 	if (vq->last_used_idx_res == avail_idx)
1913 		return;
1914 
1915 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1916 
1917 	/* Prefetch available ring to retrieve head indexes. */
1918 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1919 
1920 	/* Get the number of free entries in the ring */
1921 	free_entries = (avail_idx - vq->last_used_idx_res);
1922 
1923 	/* Limit to MAX_PKT_BURST. */
1924 	free_entries
1925 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1926 
1927 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1928 		dev->device_fh, free_entries);
1929 
1930 	/* Retrieve all of the head indexes first to avoid caching issues. */
1931 	for (i = 0; i < free_entries; i++)
1932 		head[i]
1933 			= vq->avail->ring[(vq->last_used_idx_res + i)
1934 			& (vq->size - 1)];
1935 
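	/* Reserve the entries just read so they are not processed twice. */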
1936 	vq->last_used_idx_res += free_entries;
1937 
1938 	/* Prefetch descriptor index. */
1939 	rte_prefetch0(&vq->desc[head[packet_success]]);
1940 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1941 
1942 	while (packet_success < free_entries) {
1943 		desc = &vq->desc[head[packet_success]];
1944 
1945 		/* Discard first buffer as it is the virtio header */
1946 		desc = &vq->desc[desc->next];
1947 
1948 		/* Buffer address translation. */
1949 		buff_addr = gpa_to_vva(dev, desc->addr);
1950 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1951 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1952 			&addr_type);
1953 
1954 		if (likely(packet_success < (free_entries - 1)))
1955 			/* Prefetch descriptor index. */
1956 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1957 
1958 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1959 			RTE_LOG(ERR, VHOST_DATA,
1960 				"(%"PRIu64") Invalid frame buffer address found "
1961 				"when transmitting packets!\n",
1962 				dev->device_fh);
1963 			packet_success++;
1964 			continue;
1965 		}
1966 
1967 		/* Prefetch buffer address. */
1968 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1969 
1970 		/*
1971 		 * Setup dummy mbuf. This is copied to a real mbuf if
1972 		 * transmitted out the physical port.
1973 		 */
1974 		m.data_len = desc->len;
1975 		m.nb_segs = 1;
1976 		m.next = NULL;
1977 		m.data_off = 0;
1978 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1979 		m.buf_physaddr = phys_addr;
1980 
1981 		/*
1982 		 * Check if the frame buffer address from guest crosses
1983 		 * sub-region or not.
1984 		 */
1985 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1986 			RTE_LOG(ERR, VHOST_DATA,
1987 				"(%"PRIu64") Frame buffer address crossing "
1988 				"sub-region found when attaching TX frame "
1989 				"buffer address!\n",
1990 				dev->device_fh);
1991 			need_copy = 1;
1992 		} else
1993 			need_copy = 0;
1994 
1995 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1996 
1997 		/*
1998 		 * If this is the first received packet we need to learn
1999 		 * the MAC and setup VMDQ
2000 		 */
2001 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2002 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2003 				/*
2004 				 * Discard frame if device is scheduled for
2005 				 * removal or a duplicate MAC address is found.
2006 				 */
2007 				packet_success += free_entries;
2008 				vq->last_used_idx += packet_success;
2009 				break;
2010 			}
2011 		}
2012 
2013 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2014 		packet_success++;
2015 	}
2016 }
2017 
2018 /*
2019  * This function is called by each data core. It handles all RX/TX registered
2020  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2021  * addresses are compared with all devices in the main linked list.
2022  */
2023 static int
2024 switch_worker_zcp(__attribute__((unused)) void *arg)
2025 {
2026 	struct virtio_net *dev = NULL;
2027 	struct vhost_dev  *vdev = NULL;
2028 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2029 	struct virtio_net_data_ll *dev_ll;
2030 	struct mbuf_table *tx_q;
2031 	volatile struct lcore_ll_info *lcore_ll;
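	/* TSC cycles between TX drain passes (roughly BURST_TX_DRAIN_US microseconds). */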
2032 	const uint64_t drain_tsc
2033 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2034 		* BURST_TX_DRAIN_US;
2035 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2036 	unsigned ret;
2037 	const uint16_t lcore_id = rte_lcore_id();
2038 	uint16_t count_in_ring, rx_count = 0;
2039 
2040 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2041 
2042 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2043 	prev_tsc = 0;
2044 
2045 	while (1) {
2046 		cur_tsc = rte_rdtsc();
2047 
2048 		/* TX burst queue drain */
2049 		diff_tsc = cur_tsc - prev_tsc;
2050 		if (unlikely(diff_tsc > drain_tsc)) {
2051 			/*
2052 			 * Get mbufs from vpool.pool, detach them, and
2053 			 * put them back into vpool.ring.
2054 			 */
2055 			dev_ll = lcore_ll->ll_root_used;
2056 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2057 				/* Get virtio device ID */
2058 				vdev = dev_ll->vdev;
2059 				dev = vdev->dev;
2060 
2061 				if (likely(!vdev->remove)) {
2062 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2063 					if (tx_q->len) {
2064 						LOG_DEBUG(VHOST_DATA,
2065 						"TX queue drained after timeout"
2066 						" with burst size %u\n",
2067 						tx_q->len);
2068 
2069 						/*
2070 						 * Tx any packets in the queue
2071 						 */
2072 						ret = rte_eth_tx_burst(
2073 							ports[0],
2074 							(uint16_t)tx_q->txq_id,
2075 							(struct rte_mbuf **)
2076 							tx_q->m_table,
2077 							(uint16_t)tx_q->len);
2078 						if (unlikely(ret < tx_q->len)) {
2079 							do {
2080 								rte_pktmbuf_free(
2081 									tx_q->m_table[ret]);
2082 							} while (++ret < tx_q->len);
2083 						}
2084 						tx_q->len = 0;
2085 
2086 						txmbuf_clean_zcp(dev,
2087 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2088 					}
2089 				}
2090 				dev_ll = dev_ll->next;
2091 			}
2092 			prev_tsc = cur_tsc;
2093 		}
2094 
2095 		rte_prefetch0(lcore_ll->ll_root_used);
2096 
2097 		/*
2098 		 * Inform the configuration core that we have exited the linked
2099 		 * list and that no devices are in use if requested.
2100 		 */
2101 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2102 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2103 
2104 		/* Process devices */
2105 		dev_ll = lcore_ll->ll_root_used;
2106 
2107 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2108 			vdev = dev_ll->vdev;
2109 			dev  = vdev->dev;
2110 			if (unlikely(vdev->remove)) {
2111 				dev_ll = dev_ll->next;
2112 				unlink_vmdq(vdev);
2113 				vdev->ready = DEVICE_SAFE_REMOVE;
2114 				continue;
2115 			}
2116 
2117 			if (likely(vdev->ready == DEVICE_RX)) {
2118 				uint32_t index = vdev->vmdq_rx_q;
2119 				uint16_t i;
2120 				count_in_ring
2121 				= rte_ring_count(vpool_array[index].ring);
2122 				uint16_t free_entries
2123 				= (uint16_t)get_available_ring_num_zcp(dev);
2124 
2125 				/*
2126 				 * Attach all mbufs in vpool.ring and put back
2127 				 * into vpool.pool.
2128 				 */
2129 				for (i = 0;
2130 				i < RTE_MIN(free_entries,
2131 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2132 				i++)
2133 					attach_rxmbuf_zcp(dev);
2134 
2135 				/* Handle guest RX */
2136 				rx_count = rte_eth_rx_burst(ports[0],
2137 					vdev->vmdq_rx_q, pkts_burst,
2138 					MAX_PKT_BURST);
2139 
2140 				if (rx_count) {
2141 					ret_count = virtio_dev_rx_zcp(dev,
2142 							pkts_burst, rx_count);
2143 					if (enable_stats) {
2144 						dev_statistics[dev->device_fh].rx_total
2145 							+= rx_count;
2146 						dev_statistics[dev->device_fh].rx
2147 							+= ret_count;
2148 					}
2149 					while (likely(rx_count)) {
2150 						rx_count--;
2151 						pktmbuf_detach_zcp(
2152 							pkts_burst[rx_count]);
2153 						rte_ring_sp_enqueue(
2154 							vpool_array[index].ring,
2155 							(void *)pkts_burst[rx_count]);
2156 					}
2157 				}
2158 			}
2159 
2160 			if (likely(!vdev->remove))
2161 				/* Handle guest TX */
2162 				virtio_dev_tx_zcp(dev);
2163 
2164 			/* Move to the next device in the list */
2165 			dev_ll = dev_ll->next;
2166 		}
2167 	}
2168 
2169 	return 0;
2170 }
2171 
2172 
2173 /*
2174  * Add an entry to a used linked list. A free entry must first be found
2175  * in the free linked list using get_data_ll_free_entry();
2176  */
2177 static void
2178 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2179 	struct virtio_net_data_ll *ll_dev)
2180 {
2181 	struct virtio_net_data_ll *ll = *ll_root_addr;
2182 
2183 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2184 	ll_dev->next = NULL;
2185 	rte_compiler_barrier();
2186 
2187 	/* If ll == NULL then this is the first device. */
2188 	if (ll) {
2189 		/* Increment to the tail of the linked list. */
2190 		while (ll->next != NULL)
2191 			ll = ll->next;
2192 
2193 		ll->next = ll_dev;
2194 	} else {
2195 		*ll_root_addr = ll_dev;
2196 	}
2197 }
2198 
2199 /*
2200  * Remove an entry from a used linked list. The entry must then be added to
2201  * the free linked list using put_data_ll_free_entry().
2202  */
2203 static void
2204 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2205 	struct virtio_net_data_ll *ll_dev,
2206 	struct virtio_net_data_ll *ll_dev_last)
2207 {
2208 	struct virtio_net_data_ll *ll = *ll_root_addr;
2209 
2210 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2211 		return;
2212 
2213 	if (ll_dev == ll)
2214 		*ll_root_addr = ll_dev->next;
2215 	else
2216 		if (likely(ll_dev_last != NULL))
2217 			ll_dev_last->next = ll_dev->next;
2218 		else
2219 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2220 }
2221 
2222 /*
2223  * Find and return an entry from the free linked list.
2224  */
2225 static struct virtio_net_data_ll *
2226 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2227 {
2228 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2229 	struct virtio_net_data_ll *ll_dev;
2230 
2231 	if (ll_free == NULL)
2232 		return NULL;
2233 
2234 	ll_dev = ll_free;
2235 	*ll_root_addr = ll_free->next;
2236 
2237 	return ll_dev;
2238 }
2239 
2240 /*
2241  * Place an entry back on to the free linked list.
2242  */
2243 static void
2244 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2245 	struct virtio_net_data_ll *ll_dev)
2246 {
2247 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2248 
2249 	if (ll_dev == NULL)
2250 		return;
2251 
2252 	ll_dev->next = ll_free;
2253 	*ll_root_addr = ll_dev;
2254 }
2255 
2256 /*
2257  * Creates a linked list of a given size.
2258  */
2259 static struct virtio_net_data_ll *
2260 alloc_data_ll(uint32_t size)
2261 {
2262 	struct virtio_net_data_ll *ll_new;
2263 	uint32_t i;
2264 
2265 	/* Malloc and then chain the linked list. */
2266 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2267 	if (ll_new == NULL) {
2268 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2269 		return NULL;
2270 	}
2271 
2272 	for (i = 0; i < size - 1; i++) {
2273 		ll_new[i].vdev = NULL;
2274 		ll_new[i].next = &ll_new[i+1];
2275 	}
2276 	ll_new[i].next = NULL;
2277 
2278 	return (ll_new);
2279 }
2280 
2281 /*
2282  * Create the main linked list along with each individual core's linked list. A used and a free list
2283  * are created to manage entries.
2284  */
2285 static int
2286 init_data_ll (void)
2287 {
2288 	int lcore;
2289 
2290 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2291 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2292 		if (lcore_info[lcore].lcore_ll == NULL) {
2293 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2294 			return -1;
2295 		}
2296 
2297 		lcore_info[lcore].lcore_ll->device_num = 0;
2298 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2299 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2300 		if (num_devices % num_switching_cores)
2301 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2302 		else
2303 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2304 	}
2305 
2306 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2307 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2308 
2309 	return 0;
2310 }
2311 
2312 /*
2313  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2314  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2315  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
2316  */
2317 static void
2318 destroy_device (volatile struct virtio_net *dev)
2319 {
2320 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2321 	struct virtio_net_data_ll *ll_main_dev_cur;
2322 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2323 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2324 	struct vhost_dev *vdev;
2325 	int lcore;
2326 
2327 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2328 
2329 	vdev = (struct vhost_dev *)dev->priv;
2330 	/* Set the remove flag. */
2331 	vdev->remove = 1;
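	/* Wait until the data core has marked the device safe to remove. */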
2332 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2333 		rte_pause();
2334 	}
2335 
2336 	/* Search for entry to be removed from lcore ll */
2337 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2338 	while (ll_lcore_dev_cur != NULL) {
2339 		if (ll_lcore_dev_cur->vdev == vdev) {
2340 			break;
2341 		} else {
2342 			ll_lcore_dev_last = ll_lcore_dev_cur;
2343 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2344 		}
2345 	}
2346 
2347 	if (ll_lcore_dev_cur == NULL) {
2348 		RTE_LOG(ERR, VHOST_CONFIG,
2349 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2350 			dev->device_fh);
2351 		return;
2352 	}
2353 
2354 	/* Search for entry to be removed from main ll */
2355 	ll_main_dev_cur = ll_root_used;
2356 	ll_main_dev_last = NULL;
2357 	while (ll_main_dev_cur != NULL) {
2358 		if (ll_main_dev_cur->vdev == vdev) {
2359 			break;
2360 		} else {
2361 			ll_main_dev_last = ll_main_dev_cur;
2362 			ll_main_dev_cur = ll_main_dev_cur->next;
2363 		}
2364 	}
2365 
2366 	/* Remove entries from the lcore and main ll. */
2367 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2368 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2369 
2370 	/* Set the dev_removal_flag on each lcore. */
2371 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2372 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2373 	}
2374 
2375 	/*
2376 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2377 	 * they can no longer access the device removed from the linked lists and that the devices
2378 	 * are no longer in use.
2379 	 */
2380 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2381 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2382 			rte_pause();
2383 		}
2384 	}
2385 
2386 	/* Add the entries back to the lcore and main free ll.*/
2387 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2388 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2389 
2390 	/* Decrement number of device on the lcore. */
2391 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2392 
2393 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2394 
2395 	if (zero_copy) {
2396 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2397 
2398 		/* Stop the RX queue. */
2399 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2400 			LOG_DEBUG(VHOST_CONFIG,
2401 				"(%"PRIu64") In destroy_device: Failed to stop "
2402 				"rx queue:%d\n",
2403 				dev->device_fh,
2404 				vdev->vmdq_rx_q);
2405 		}
2406 
2407 		LOG_DEBUG(VHOST_CONFIG,
2408 			"(%"PRIu64") in destroy_device: Start putting mbufs in "
2409 			"mempool back into the ring for RX queue: %d\n",
2410 			dev->device_fh, vdev->vmdq_rx_q);
2411 
2412 		mbuf_destroy_zcp(vpool);
2413 
2414 		/* Stop the TX queue. */
2415 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2416 			LOG_DEBUG(VHOST_CONFIG,
2417 				"(%"PRIu64") In destroy_device: Failed to "
2418 				"stop tx queue:%d\n",
2419 				dev->device_fh, vdev->vmdq_rx_q);
2420 		}
2421 
2422 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2423 
2424 		LOG_DEBUG(VHOST_CONFIG,
2425 			"(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2426 			"back into the ring for TX queue: %d, dev:(%"PRIu64")\n",
2427 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2428 			dev->device_fh);
2429 
2430 		mbuf_destroy_zcp(vpool);
2431 		rte_free(vdev->regions_hpa);
2432 	}
2433 	rte_free(vdev);
2434 
2435 }
2436 
2437 /*
2438  * Count the physical-address breaks (and hence the extra sub-regions needed)
2439  * within one particular region whose vhost virtual address range is
2440  * contiguous. The region starts at vva_start and spans 'size' bytes.
2441  */
2442 static uint32_t
2443 check_hpa_regions(uint64_t vva_start, uint64_t size)
2444 {
2445 	uint32_t i, nregions = 0, page_size = getpagesize();
2446 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2447 	if (vva_start % page_size) {
2448 		LOG_DEBUG(VHOST_CONFIG,
2449 			"in check_continuous: vva start(%p) mod page_size(%d) "
2450 			"has remainder\n",
2451 			(void *)(uintptr_t)vva_start, page_size);
2452 		return 0;
2453 	}
2454 	if (size % page_size) {
2455 		LOG_DEBUG(VHOST_CONFIG,
2456 			"in check_continuous: "
2457 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2458 			size, page_size);
2459 		return 0;
2460 	}
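	/*
	 * Walk the region page by page; each physical-address discontinuity
	 * between neighbouring pages starts a new sub-region.
	 */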
2461 	for (i = 0; i < size - page_size; i = i + page_size) {
2462 		cur_phys_addr
2463 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2464 		next_phys_addr = rte_mem_virt2phy(
2465 			(void *)(uintptr_t)(vva_start + i + page_size));
2466 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2467 			++nregions;
2468 			LOG_DEBUG(VHOST_CONFIG,
2469 				"in check_continuous: hva addr:(%p) is not "
2470 				"continuous with hva addr:(%p), diff:%d\n",
2471 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2472 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2473 				+ page_size), page_size);
2474 			LOG_DEBUG(VHOST_CONFIG,
2475 				"in check_continuous: hpa addr:(%p) is not "
2476 				"continuous with hpa addr:(%p), "
2477 				"diff:(%"PRIu64")\n",
2478 				(void *)(uintptr_t)cur_phys_addr,
2479 				(void *)(uintptr_t)next_phys_addr,
2480 				(next_phys_addr-cur_phys_addr));
2481 		}
2482 	}
2483 	return nregions;
2484 }
2485 
2486 /*
2487  * Divide each region whose vhost virtual address range is contiguous into
2488  * sub-regions such that the physical addresses within each sub-region are
2489  * contiguous, and fill the offset (to GPA), size and other information of
2490  * each sub-region into regions_hpa.
2491  */
2492 static uint32_t
2493 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2494 {
2495 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2496 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2497 
2498 	if (mem_region_hpa == NULL)
2499 		return 0;
2500 
2501 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2502 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2503 			virtio_memory->regions[regionidx].address_offset;
2504 		mem_region_hpa[regionidx_hpa].guest_phys_address
2505 			= virtio_memory->regions[regionidx].guest_phys_address;
2506 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2507 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2508 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2509 		LOG_DEBUG(VHOST_CONFIG,
2510 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2511 			regionidx_hpa,
2512 			(void *)(uintptr_t)
2513 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2514 		LOG_DEBUG(VHOST_CONFIG,
2515 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2516 			regionidx_hpa,
2517 			(void *)(uintptr_t)
2518 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2519 		for (i = 0, k = 0;
2520 			i < virtio_memory->regions[regionidx].memory_size -
2521 				page_size;
2522 			i += page_size) {
2523 			cur_phys_addr = rte_mem_virt2phy(
2524 					(void *)(uintptr_t)(vva_start + i));
2525 			next_phys_addr = rte_mem_virt2phy(
2526 					(void *)(uintptr_t)(vva_start +
2527 					i + page_size));
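			/*
			 * A physical-address discontinuity closes the current
			 * sub-region and opens the next one.
			 */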
2528 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2529 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2530 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2531 					k + page_size;
2532 				mem_region_hpa[regionidx_hpa].memory_size
2533 					= k + page_size;
2534 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2535 					"phys addr end  [%d]:(%p)\n",
2536 					regionidx_hpa,
2537 					(void *)(uintptr_t)
2538 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2539 				LOG_DEBUG(VHOST_CONFIG,
2540 					"in fill_hpa_regions: guest phys addr "
2541 					"size [%d]:(%p)\n",
2542 					regionidx_hpa,
2543 					(void *)(uintptr_t)
2544 					(mem_region_hpa[regionidx_hpa].memory_size));
2545 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2546 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2547 				++regionidx_hpa;
2548 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2549 					next_phys_addr -
2550 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2551 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2552 					" phys addr start[%d]:(%p)\n",
2553 					regionidx_hpa,
2554 					(void *)(uintptr_t)
2555 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2556 				LOG_DEBUG(VHOST_CONFIG,
2557 					"in fill_hpa_regions: host  phys addr "
2558 					"start[%d]:(%p)\n",
2559 					regionidx_hpa,
2560 					(void *)(uintptr_t)
2561 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2562 				k = 0;
2563 			} else {
2564 				k += page_size;
2565 			}
2566 		}
2567 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2568 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2569 			+ k + page_size;
2570 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2571 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2572 			"[%d]:(%p)\n", regionidx_hpa,
2573 			(void *)(uintptr_t)
2574 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2575 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2576 			"[%d]:(%p)\n", regionidx_hpa,
2577 			(void *)(uintptr_t)
2578 			(mem_region_hpa[regionidx_hpa].memory_size));
2579 		++regionidx_hpa;
2580 	}
2581 	return regionidx_hpa;
2582 }
2583 
2584 /*
2585  * A new device is added to a data core. First the device is added to the main linked list
2586  * and then allocated to a specific data core.
2587  */
2588 static int
2589 new_device (struct virtio_net *dev)
2590 {
2591 	struct virtio_net_data_ll *ll_dev;
2592 	int lcore, core_add = 0;
2593 	uint32_t device_num_min = num_devices;
2594 	struct vhost_dev *vdev;
2595 	uint32_t regionidx;
2596 
2597 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2598 	if (vdev == NULL) {
2599 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2600 			dev->device_fh);
2601 		return -1;
2602 	}
2603 	vdev->dev = dev;
2604 	dev->priv = vdev;
2605 
2606 	if (zero_copy) {
2607 		vdev->nregions_hpa = dev->mem->nregions;
2608 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2609 			vdev->nregions_hpa
2610 				+= check_hpa_regions(
2611 					dev->mem->regions[regionidx].guest_phys_address
2612 					+ dev->mem->regions[regionidx].address_offset,
2613 					dev->mem->regions[regionidx].memory_size);
2614 
2615 		}
2616 
2617 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2618 					       vdev->nregions_hpa,
2619 					       sizeof(struct virtio_memory_regions_hpa),
2620 					       RTE_CACHE_LINE_SIZE);
2621 		if (vdev->regions_hpa == NULL) {
2622 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2623 			rte_free(vdev);
2624 			return -1;
2625 		}
2626 
2627 
2628 		if (fill_hpa_memory_regions(
2629 			vdev->regions_hpa, dev->mem
2630 			) != vdev->nregions_hpa) {
2631 
2632 			RTE_LOG(ERR, VHOST_CONFIG,
2633 				"hpa memory regions number mismatch: "
2634 				"[%d]\n", vdev->nregions_hpa);
2635 			rte_free(vdev->regions_hpa);
2636 			rte_free(vdev);
2637 			return -1;
2638 		}
2639 	}
2640 
2641 
2642 	/* Add device to main ll */
2643 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2644 	if (ll_dev == NULL) {
2645 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2646 			"of %d devices per core has been reached\n",
2647 			dev->device_fh, num_devices);
2648 		if (vdev->regions_hpa)
2649 			rte_free(vdev->regions_hpa);
2650 		rte_free(vdev);
2651 		return -1;
2652 	}
2653 	ll_dev->vdev = vdev;
2654 	add_data_ll_entry(&ll_root_used, ll_dev);
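	/* Map this device to the base RX queue of its own VMDq pool. */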
2655 	vdev->vmdq_rx_q
2656 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2657 
2658 	if (zero_copy) {
2659 		uint32_t index = vdev->vmdq_rx_q;
2660 		uint32_t count_in_ring, i;
2661 		struct mbuf_table *tx_q;
2662 
2663 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2664 
2665 		LOG_DEBUG(VHOST_CONFIG,
2666 			"(%"PRIu64") in new_device: mbuf count in mempool "
2667 			"before attach is: %d\n",
2668 			dev->device_fh,
2669 			rte_mempool_count(vpool_array[index].pool));
2670 		LOG_DEBUG(VHOST_CONFIG,
2671 			"(%"PRIu64") in new_device: mbuf count in  ring "
2672 			"before attach  is : %d\n",
2673 			dev->device_fh, count_in_ring);
2674 
2675 		/*
2676 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2677 		 */
2678 		for (i = 0; i < count_in_ring; i++)
2679 			attach_rxmbuf_zcp(dev);
2680 
2681 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2682 			"mempool after attach is: %d\n",
2683 			dev->device_fh,
2684 			rte_mempool_count(vpool_array[index].pool));
2685 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2686 			"ring after attach  is : %d\n",
2687 			dev->device_fh,
2688 			rte_ring_count(vpool_array[index].ring));
2689 
2690 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2691 		tx_q->txq_id = vdev->vmdq_rx_q;
2692 
2693 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2694 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2695 
2696 			LOG_DEBUG(VHOST_CONFIG,
2697 				"(%"PRIu64") In new_device: Failed to start "
2698 				"tx queue:%d\n",
2699 				dev->device_fh, vdev->vmdq_rx_q);
2700 
2701 			mbuf_destroy_zcp(vpool);
2702 			rte_free(vdev->regions_hpa);
2703 			rte_free(vdev);
2704 			return -1;
2705 		}
2706 
2707 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2708 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2709 
2710 			LOG_DEBUG(VHOST_CONFIG,
2711 				"(%"PRIu64") In new_device: Failed to start "
2712 				"rx queue:%d\n",
2713 				dev->device_fh, vdev->vmdq_rx_q);
2714 
2715 			/* Stop the TX queue. */
2716 			if (rte_eth_dev_tx_queue_stop(ports[0],
2717 				vdev->vmdq_rx_q) != 0) {
2718 				LOG_DEBUG(VHOST_CONFIG,
2719 					"(%"PRIu64") In new_device: Failed to "
2720 					"stop tx queue:%d\n",
2721 					dev->device_fh, vdev->vmdq_rx_q);
2722 			}
2723 
2724 			mbuf_destroy_zcp(vpool);
2725 			rte_free(vdev->regions_hpa);
2726 			rte_free(vdev);
2727 			return -1;
2728 		}
2729 
2730 	}
2731 
2732 	/* Reset the ready flag. */
2733 	vdev->ready = DEVICE_MAC_LEARNING;
2734 	vdev->remove = 0;
2735 
2736 	/* Find a suitable lcore to add the device. */
2737 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2738 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2739 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2740 			core_add = lcore;
2741 		}
2742 	}
2743 	/* Add device to lcore ll */
2744 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2745 	if (ll_dev == NULL) {
2746 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2747 		vdev->ready = DEVICE_SAFE_REMOVE;
2748 		destroy_device(dev);
2749 		rte_free(vdev->regions_hpa);
2750 		rte_free(vdev);
2751 		return -1;
2752 	}
2753 	ll_dev->vdev = vdev;
2754 	vdev->coreid = core_add;
2755 
2756 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2757 
2758 	/* Initialize device stats */
2759 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2760 
2761 	/* Disable notifications. */
2762 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2763 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2764 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2765 	dev->flags |= VIRTIO_DEV_RUNNING;
2766 
2767 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2768 
2769 	return 0;
2770 }
2771 
2772 /*
2773  * These callbacks allow devices to be added to the data core when configuration
2774  * has fully completed.
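 * The callbacks are invoked by the vhost library when a virtio device is
 * created or destroyed.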
2775  */
2776 static const struct virtio_net_device_ops virtio_net_device_ops =
2777 {
2778 	.new_device =  new_device,
2779 	.destroy_device = destroy_device,
2780 };
2781 
2782 /*
2783  * This is a thread that wakes up periodically to print statistics if the user
2784  * has enabled them.
2785  */
2786 static void
2787 print_stats(void)
2788 {
2789 	struct virtio_net_data_ll *dev_ll;
2790 	uint64_t tx_dropped, rx_dropped;
2791 	uint64_t tx, tx_total, rx, rx_total;
2792 	uint32_t device_fh;
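	/* ANSI escape sequences: clear the screen and home the cursor. */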
2793 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2794 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2795 
2796 	while(1) {
2797 		sleep(enable_stats);
2798 
2799 		/* Clear screen and move to top left */
2800 		printf("%s%s", clr, top_left);
2801 
2802 		printf("\nDevice statistics ====================================");
2803 
2804 		dev_ll = ll_root_used;
2805 		while (dev_ll != NULL) {
2806 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2807 			tx_total = dev_statistics[device_fh].tx_total;
2808 			tx = dev_statistics[device_fh].tx;
2809 			tx_dropped = tx_total - tx;
2810 			if (zero_copy == 0) {
2811 				rx_total = rte_atomic64_read(
2812 					&dev_statistics[device_fh].rx_total_atomic);
2813 				rx = rte_atomic64_read(
2814 					&dev_statistics[device_fh].rx_atomic);
2815 			} else {
2816 				rx_total = dev_statistics[device_fh].rx_total;
2817 				rx = dev_statistics[device_fh].rx;
2818 			}
2819 			rx_dropped = rx_total - rx;
2820 
2821 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2822 					"\nTX total: 		%"PRIu64""
2823 					"\nTX dropped: 		%"PRIu64""
2824 					"\nTX successful: 		%"PRIu64""
2825 					"\nRX total: 		%"PRIu64""
2826 					"\nRX dropped: 		%"PRIu64""
2827 					"\nRX successful: 		%"PRIu64"",
2828 					device_fh,
2829 					tx_total,
2830 					tx_dropped,
2831 					tx,
2832 					rx_total,
2833 					rx_dropped,
2834 					rx);
2835 
2836 			dev_ll = dev_ll->next;
2837 		}
2838 		printf("\n======================================================\n");
2839 	}
2840 }
2841 
2842 static void
2843 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2844 	char *ring_name, uint32_t nb_mbuf)
2845 {
2846 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2847 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2848 	if (vpool_array[index].pool != NULL) {
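		/*
		 * The companion ring holds detached mbufs; rte_ring sizes must
		 * be a power of two, hence rte_align32pow2(nb_mbuf + 1).
		 */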
2849 		vpool_array[index].ring
2850 			= rte_ring_create(ring_name,
2851 				rte_align32pow2(nb_mbuf + 1),
2852 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2853 		if (likely(vpool_array[index].ring != NULL)) {
2854 			LOG_DEBUG(VHOST_CONFIG,
2855 				"in setup_mempool_tbl: mbuf count in "
2856 				"mempool is: %d\n",
2857 				rte_mempool_count(vpool_array[index].pool));
2858 			LOG_DEBUG(VHOST_CONFIG,
2859 				"in setup_mempool_tbl: mbuf count in "
2860 				"ring   is: %d\n",
2861 				rte_ring_count(vpool_array[index].ring));
2862 		} else {
2863 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2864 				ring_name);
2865 		}
2866 
2867 		/* Need to take the mbuf headroom into account. */
2868 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2869 	} else {
2870 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2871 	}
2872 }
2873 
2874 /* When we receive an INT signal, unregister the vhost driver. */
2875 static void
2876 sigint_handler(__rte_unused int signum)
2877 {
2878 	/* Unregister vhost driver. */
2879 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2880 	if (ret != 0)
2881 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2882 	exit(0);
2883 }
2884 
2885 /*
2886  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2887  * device is also registered here to handle the IOCTLs.
2888  */
2889 int
2890 main(int argc, char *argv[])
2891 {
2892 	struct rte_mempool *mbuf_pool = NULL;
2893 	unsigned lcore_id, core_id = 0;
2894 	unsigned nb_ports, valid_num_ports;
2895 	int ret;
2896 	uint8_t portid;
2897 	uint16_t queue_id;
2898 	static pthread_t tid;
2899 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2900 
2901 	signal(SIGINT, sigint_handler);
2902 
2903 	/* init EAL */
2904 	ret = rte_eal_init(argc, argv);
2905 	if (ret < 0)
2906 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2907 	argc -= ret;
2908 	argv += ret;
2909 
2910 	/* parse app arguments */
2911 	ret = us_vhost_parse_args(argc, argv);
2912 	if (ret < 0)
2913 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2914 
2915 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2916 		if (rte_lcore_is_enabled(lcore_id))
2917 			lcore_ids[core_id ++] = lcore_id;
2918 
2919 	if (rte_lcore_count() > RTE_MAX_LCORE)
2920 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2921 
2922 	/* Set the number of switching cores available. */
2923 	num_switching_cores = rte_lcore_count()-1;
2924 
2925 	/* Get the number of physical ports. */
2926 	nb_ports = rte_eth_dev_count();
2927 	if (nb_ports > RTE_MAX_ETHPORTS)
2928 		nb_ports = RTE_MAX_ETHPORTS;
2929 
2930 	/*
2931 	 * Update the global variable num_ports and the global array ports[],
2932 	 * and get valid_num_ports according to the number of ports in the system.
2933 	 */
2934 	valid_num_ports = check_ports_num(nb_ports);
2935 
2936 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2937 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2938 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2939 		return -1;
2940 	}
2941 
2942 	if (zero_copy == 0) {
2943 		/* Create the mbuf pool. */
2944 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2945 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2946 			0, MBUF_DATA_SIZE, rte_socket_id());
2947 		if (mbuf_pool == NULL)
2948 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2949 
2950 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2951 			vpool_array[queue_id].pool = mbuf_pool;
2952 
2953 		if (vm2vm_mode == VM2VM_HARDWARE) {
2954 			/* Enable VT loop back to let L2 switch to do it. */
2955 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2956 			LOG_DEBUG(VHOST_CONFIG,
2957 				"Enable loop back for L2 switch in vmdq.\n");
2958 		}
2959 	} else {
2960 		uint32_t nb_mbuf;
2961 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2962 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2963 
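		/*
		 * Size each zero-copy RX pool for the RX descriptors plus
		 * per-core cache and burst headroom.
		 */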
2964 		nb_mbuf = num_rx_descriptor
2965 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2966 			+ num_switching_cores * MAX_PKT_BURST;
2967 
2968 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2969 			snprintf(pool_name, sizeof(pool_name),
2970 				"rxmbuf_pool_%u", queue_id);
2971 			snprintf(ring_name, sizeof(ring_name),
2972 				"rxmbuf_ring_%u", queue_id);
2973 			setup_mempool_tbl(rte_socket_id(), queue_id,
2974 				pool_name, ring_name, nb_mbuf);
2975 		}
2976 
2977 		nb_mbuf = num_tx_descriptor
2978 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2979 				+ num_switching_cores * MAX_PKT_BURST;
2980 
2981 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2982 			snprintf(pool_name, sizeof(pool_name),
2983 				"txmbuf_pool_%u", queue_id);
2984 			snprintf(ring_name, sizeof(ring_name),
2985 				"txmbuf_ring_%u", queue_id);
2986 			setup_mempool_tbl(rte_socket_id(),
2987 				(queue_id + MAX_QUEUES),
2988 				pool_name, ring_name, nb_mbuf);
2989 		}
2990 
2991 		if (vm2vm_mode == VM2VM_HARDWARE) {
2992 			/* Enable VT loop back to let L2 switch to do it. */
2993 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2994 			LOG_DEBUG(VHOST_CONFIG,
2995 				"Enable loop back for L2 switch in vmdq.\n");
2996 		}
2997 	}
2998 	/* Set log level. */
2999 	rte_set_log_level(LOG_LEVEL);
3000 
3001 	/* initialize all ports */
3002 	for (portid = 0; portid < nb_ports; portid++) {
3003 		/* skip ports that are not enabled */
3004 		if ((enabled_port_mask & (1 << portid)) == 0) {
3005 			RTE_LOG(INFO, VHOST_PORT,
3006 				"Skipping disabled port %d\n", portid);
3007 			continue;
3008 		}
3009 		if (port_init(portid) != 0)
3010 			rte_exit(EXIT_FAILURE,
3011 				"Cannot initialize network ports\n");
3012 	}
3013 
3014 	/* Initialise all linked lists. */
3015 	if (init_data_ll() == -1)
3016 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3017 
3018 	/* Initialize device stats */
3019 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3020 
3021 	/* Enable stats if the user option is set. */
3022 	if (enable_stats) {
3023 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3024 		if (ret != 0)
3025 			rte_exit(EXIT_FAILURE,
3026 				"Cannot create print-stats thread\n");
3027 
3028 		/* Set thread_name for aid in debugging.  */
3029 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3030 		ret = pthread_setname_np(tid, thread_name);
3031 		if (ret != 0)
3032 			RTE_LOG(ERR, VHOST_CONFIG,
3033 				"Cannot set print-stats name\n");
3034 	}
3035 
3036 	/* Launch all data cores. */
3037 	if (zero_copy == 0) {
3038 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3039 			rte_eal_remote_launch(switch_worker,
3040 				mbuf_pool, lcore_id);
3041 		}
3042 	} else {
3043 		uint32_t count_in_mempool, index, i;
3044 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3045 			/* For all RX and TX queues. */
3046 			count_in_mempool
3047 				= rte_mempool_count(vpool_array[index].pool);
3048 
3049 			/*
3050 			 * Transfer all un-attached mbufs from vpool.pool
3051 			 * to vpool.ring.
3052 			 */
3053 			for (i = 0; i < count_in_mempool; i++) {
3054 				struct rte_mbuf *mbuf
3055 					= __rte_mbuf_raw_alloc(
3056 						vpool_array[index].pool);
3057 				rte_ring_sp_enqueue(vpool_array[index].ring,
3058 						(void *)mbuf);
3059 			}
3060 
3061 			LOG_DEBUG(VHOST_CONFIG,
3062 				"in main: mbuf count in mempool at initial "
3063 				"is: %d\n", count_in_mempool);
3064 			LOG_DEBUG(VHOST_CONFIG,
3065 				"in main: mbuf count in  ring at initial  is :"
3066 				" %d\n",
3067 				rte_ring_count(vpool_array[index].ring));
3068 		}
3069 
3070 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3071 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3072 				lcore_id);
3073 	}
3074 
3075 	if (mergeable == 0)
3076 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3077 
3078 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3079 	ret = rte_vhost_driver_register((char *)&dev_basename);
3080 	if (ret != 0)
3081 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3082 
3083 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3084 
3085 	/* Start CUSE session. */
3086 	rte_vhost_driver_session_start();
3087 	return 0;
3088 
3089 }
3090