1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56 
57 #define MAX_QUEUES 128
58 
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61 
62 /*
63  * Calculate the number of buffers needed per port
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
66 							(num_switching_cores*MAX_PKT_BURST) +  			\
67 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68 							(num_switching_cores*MBUF_CACHE_SIZE))
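/*
 * Worked example (illustrative only, assuming num_switching_cores = 4 at
 * runtime): with MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024,
 * MAX_PKT_BURST = 32, RTE_TEST_TX_DESC_DEFAULT = 512 and
 * MBUF_CACHE_SIZE = 128, the macro evaluates to
 *   (128 * 1024) + (4 * 32) + (4 * 512) + (4 * 128)
 *   = 131072 + 128 + 2048 + 512 = 133760 mbufs per port.
 */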
69 
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72 
73 /*
74  * No frame data buffers allocated by the host are required for the zero
75  * copy implementation; the guest allocates the frame data buffers and
76  * vhost uses them directly.
77  */
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80 	+ RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82 
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92 
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101 
102 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16 	/* Max burst size for mergeable buffers. */
104 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
105 
106 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
108 
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX			1
112 #define DEVICE_SAFE_REMOVE	2
113 
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117 
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121 
122 /*
123  * These two macros need refining for the legacy and DPDK-based front ends:
124  * take the max vring avail descriptors/entries from the guest minus
125  * MAX_PKT_BURST, then round up to a power of 2.
126  */
127 /*
128  * For the legacy front end there are 128 descriptors:
129  * half for virtio headers, the other half for mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133 
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 		+ sizeof(struct rte_mbuf)))
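/*
 * Usage sketch: the zero copy path stashes the vring descriptor index in the
 * first four bytes of the mbuf headroom (which starts right after the
 * struct rte_mbuf itself), e.g. in attach_rxmbuf_zcp():
 *
 *   MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
 *
 * so the descriptor index can later be recovered from the mbuf.
 */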
137 
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140 
141 #define INVALID_PORT_ID 0xFF
142 
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145 
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148 
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151 
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154 
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157 
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
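/*
 * Worked example: struct vring_desc is 16 bytes (u64 addr, u32 len, u16 flags,
 * u16 next), so on a build with 64-byte cache lines this evaluates to
 * 64 / 16 = 4 descriptors per cache line.
 */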
160 
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163 
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166 
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 uint32_t num_devices = 0;
170 
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
173  * descriptors. Disabled by default.
174  */
175 static uint32_t zero_copy;
176 
177 /* Number of descriptors to use. */
178 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
179 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
180 
181 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
182 #define MAX_RING_DESC 4096
183 
184 struct vpool {
185 	struct rte_mempool *pool;
186 	struct rte_ring *ring;
187 	uint32_t buf_size;
188 } vpool_array[MAX_QUEUES+MAX_QUEUES];
189 
190 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
191 typedef enum {
192 	VM2VM_DISABLED = 0,
193 	VM2VM_SOFTWARE = 1,
194 	VM2VM_HARDWARE = 2,
195 	VM2VM_LAST
196 } vm2vm_type;
197 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
198 
199 /* The type of host physical address translated from guest physical address. */
200 typedef enum {
201 	PHYS_ADDR_CONTINUOUS = 0,
202 	PHYS_ADDR_CROSS_SUBREG = 1,
203 	PHYS_ADDR_INVALID = 2,
204 	PHYS_ADDR_LAST
205 } hpa_type;
206 
207 /* Enable stats. */
208 static uint32_t enable_stats = 0;
209 /* Enable retries on RX. */
210 static uint32_t enable_retry = 1;
211 /* Specify timeout (in microseconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215 
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218 
219 /* Character device index. Can be set by user. */
220 static uint32_t dev_index = 0;
221 
222 /* This can be set by the user so it is made available here. */
223 extern uint64_t VHOST_FEATURES;
224 
225 /* Default configuration for rx and tx thresholds etc. */
226 static struct rte_eth_rxconf rx_conf_default = {
227 	.rx_thresh = {
228 		.pthresh = RX_PTHRESH,
229 		.hthresh = RX_HTHRESH,
230 		.wthresh = RX_WTHRESH,
231 	},
232 	.rx_drop_en = 1,
233 };
234 
235 /*
236  * These default values are optimized for use with the Intel(R) 82599 10 GbE
237  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
238  * network controllers and/or network drivers.
239  */
240 static struct rte_eth_txconf tx_conf_default = {
241 	.tx_thresh = {
242 		.pthresh = TX_PTHRESH,
243 		.hthresh = TX_HTHRESH,
244 		.wthresh = TX_WTHRESH,
245 	},
246 	.tx_free_thresh = 0, /* Use PMD default values */
247 	.tx_rs_thresh = 0, /* Use PMD default values */
248 };
249 
250 /* Empty VMDQ configuration structure. Filled in programmatically. */
251 static struct rte_eth_conf vmdq_conf_default = {
252 	.rxmode = {
253 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
254 		.split_hdr_size = 0,
255 		.header_split   = 0, /**< Header Split disabled */
256 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
257 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
258 		/*
259 		 * This is necessary for 1G NICs such as the I350;
260 		 * it fixes a bug where IPv4 forwarding in the guest cannot
261 		 * forward packets from one virtio device to another.
262 		 */
263 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
264 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
265 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
266 	},
267 
268 	.txmode = {
269 		.mq_mode = ETH_MQ_TX_NONE,
270 	},
271 	.rx_adv_conf = {
272 		/*
273 		 * should be overridden separately in code with
274 		 * appropriate values
275 		 */
276 		.vmdq_rx_conf = {
277 			.nb_queue_pools = ETH_8_POOLS,
278 			.enable_default_pool = 0,
279 			.default_pool = 0,
280 			.nb_pool_maps = 0,
281 			.pool_map = {{0, 0},},
282 		},
283 	},
284 };
285 
286 static unsigned lcore_ids[RTE_MAX_LCORE];
287 static uint8_t ports[RTE_MAX_ETHPORTS];
288 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
289 
290 static const uint16_t external_pkt_default_vlan_tag = 2000;
291 const uint16_t vlan_tags[] = {
292 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
293 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
294 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
295 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
296 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
297 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
298 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
299 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
300 };
301 
302 /* ethernet addresses of ports */
303 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
304 
305 /* heads for the main used and free linked lists for the data path. */
306 static struct virtio_net_data_ll *ll_root_used = NULL;
307 static struct virtio_net_data_ll *ll_root_free = NULL;
308 
309 /* Array of data core structures containing information on individual core linked lists. */
310 static struct lcore_info lcore_info[RTE_MAX_LCORE];
311 
312 /* Used for queueing bursts of TX packets. */
313 struct mbuf_table {
314 	unsigned len;
315 	unsigned txq_id;
316 	struct rte_mbuf *m_table[MAX_PKT_BURST];
317 };
318 
319 /* TX queue for each data core. */
320 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
321 
322 /* TX queue for each virtio device for zero copy. */
323 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
324 
325 /* Vlan header struct used to insert vlan tags on TX. */
326 struct vlan_ethhdr {
327 	unsigned char   h_dest[ETH_ALEN];
328 	unsigned char   h_source[ETH_ALEN];
329 	__be16          h_vlan_proto;
330 	__be16          h_vlan_TCI;
331 	__be16          h_vlan_encapsulated_proto;
332 };
333 
334 /* IPv4 Header */
335 struct ipv4_hdr {
336 	uint8_t  version_ihl;		/**< version and header length */
337 	uint8_t  type_of_service;	/**< type of service */
338 	uint16_t total_length;		/**< length of packet */
339 	uint16_t packet_id;		/**< packet ID */
340 	uint16_t fragment_offset;	/**< fragmentation offset */
341 	uint8_t  time_to_live;		/**< time to live */
342 	uint8_t  next_proto_id;		/**< protocol ID */
343 	uint16_t hdr_checksum;		/**< header checksum */
344 	uint32_t src_addr;		/**< source address */
345 	uint32_t dst_addr;		/**< destination address */
346 } __attribute__((__packed__));
347 
348 /* Header lengths. */
349 #define VLAN_HLEN       4
350 #define VLAN_ETH_HLEN   18
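/*
 * Layout sketch matching struct vlan_ethhdr above: 6 (dest MAC) + 6 (src MAC)
 * + 2 (TPID, 0x8100) + 2 (TCI) + 2 (encapsulated proto) = 18 = VLAN_ETH_HLEN.
 * The inserted 802.1Q tag itself is the 4-byte VLAN_HLEN added to the packet
 * length in virtio_tx_route() when the header is built.
 */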
351 
352 /* Per-device statistics struct */
353 struct device_statistics {
354 	uint64_t tx_total;
355 	rte_atomic64_t rx_total_atomic;
356 	uint64_t rx_total;
357 	uint64_t tx;
358 	rte_atomic64_t rx_atomic;
359 	uint64_t rx;
360 } __rte_cache_aligned;
361 struct device_statistics dev_statistics[MAX_DEVICES];
362 
363 /*
364  * Builds up the correct configuration for VMDQ VLAN pool map
365  * according to the pool & queue limits.
366  */
367 static inline int
368 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
369 {
370 	struct rte_eth_vmdq_rx_conf conf;
371 	unsigned i;
372 
373 	memset(&conf, 0, sizeof(conf));
374 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
375 	conf.nb_pool_maps = num_devices;
376 	conf.enable_loop_back =
377 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
378 
379 	for (i = 0; i < conf.nb_pool_maps; i++) {
380 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
381 		conf.pool_map[i].pools = (1UL << i);
382 	}
383 
384 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
385 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
386 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
387 	return 0;
388 }
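/*
 * Illustrative result (assuming num_devices = 8 as reported by the NIC):
 * nb_queue_pools = 8 and the VLAN-to-pool map becomes
 *   pool_map[0] = { .vlan_id = 1000, .pools = 1 << 0 }
 *   pool_map[1] = { .vlan_id = 1001, .pools = 1 << 1 }
 *   ...
 *   pool_map[7] = { .vlan_id = 1007, .pools = 1 << 7 }
 * i.e. each virtio device gets its own VMDQ pool keyed by its VLAN tag.
 */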
389 
390 /*
391  * Validate the device number against the max pool number obtained from
392  * dev_info. If the device number is invalid, give the error message and
393  * return -1. Each device must have its own pool.
394  */
395 static inline int
396 validate_num_devices(uint32_t max_nb_devices)
397 {
398 	if (num_devices > max_nb_devices) {
399 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
400 		return -1;
401 	}
402 	return 0;
403 }
404 
405 /*
406  * Initialises a given port using global settings and with the rx buffers
407  * coming from the mbuf_pool passed as parameter
408  */
409 static inline int
410 port_init(uint8_t port)
411 {
412 	struct rte_eth_dev_info dev_info;
413 	struct rte_eth_conf port_conf;
414 	uint16_t rx_rings, tx_rings;
415 	uint16_t rx_ring_size, tx_ring_size;
416 	int retval;
417 	uint16_t q;
418 
419 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
420 	rte_eth_dev_info_get (port, &dev_info);
421 
422 	/*configure the number of supported virtio devices based on VMDQ limits */
423 	num_devices = dev_info.max_vmdq_pools;
424 	num_queues = dev_info.max_rx_queues;
425 
426 	if (zero_copy) {
427 		rx_ring_size = num_rx_descriptor;
428 		tx_ring_size = num_tx_descriptor;
429 		tx_rings = dev_info.max_tx_queues;
430 	} else {
431 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
432 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
433 		tx_rings = (uint16_t)rte_lcore_count();
434 	}
435 
436 	retval = validate_num_devices(MAX_DEVICES);
437 	if (retval < 0)
438 		return retval;
439 
440 	/* Get port configuration. */
441 	retval = get_eth_conf(&port_conf, num_devices);
442 	if (retval < 0)
443 		return retval;
444 
445 	if (port >= rte_eth_dev_count()) return -1;
446 
447 	rx_rings = (uint16_t)num_queues;
448 	/* Configure ethernet device. */
449 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
450 	if (retval != 0)
451 		return retval;
452 
453 	/* Setup the queues. */
454 	for (q = 0; q < rx_rings; q ++) {
455 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
456 						rte_eth_dev_socket_id(port), &rx_conf_default,
457 						vpool_array[q].pool);
458 		if (retval < 0)
459 			return retval;
460 	}
461 	for (q = 0; q < tx_rings; q ++) {
462 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
463 						rte_eth_dev_socket_id(port), &tx_conf_default);
464 		if (retval < 0)
465 			return retval;
466 	}
467 
468 	/* Start the device. */
469 	retval  = rte_eth_dev_start(port);
470 	if (retval < 0) {
471 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
472 		return retval;
473 	}
474 
475 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
476 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
477 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
478 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
479 			(unsigned)port,
480 			vmdq_ports_eth_addr[port].addr_bytes[0],
481 			vmdq_ports_eth_addr[port].addr_bytes[1],
482 			vmdq_ports_eth_addr[port].addr_bytes[2],
483 			vmdq_ports_eth_addr[port].addr_bytes[3],
484 			vmdq_ports_eth_addr[port].addr_bytes[4],
485 			vmdq_ports_eth_addr[port].addr_bytes[5]);
486 
487 	return 0;
488 }
489 
490 /*
491  * Set character device basename.
492  */
493 static int
494 us_vhost_parse_basename(const char *q_arg)
495 {
496 	/* parse the basename string */
497 
498 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
499 		return -1;
500 	else
501 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
502 
503 	return 0;
504 }
505 
506 /*
507  * Parse the portmask provided at run time.
508  */
509 static int
510 parse_portmask(const char *portmask)
511 {
512 	char *end = NULL;
513 	unsigned long pm;
514 
515 	errno = 0;
516 
517 	/* parse hexadecimal string */
518 	pm = strtoul(portmask, &end, 16);
519 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
520 		return -1;
521 
522 	if (pm == 0)
523 		return -1;
524 
525 	return pm;
526 
527 }
528 
529 /*
530  * Parse num options at run time.
531  */
532 static int
533 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
534 {
535 	char *end = NULL;
536 	unsigned long num;
537 
538 	errno = 0;
539 
540 	/* parse unsigned int string */
541 	num = strtoul(q_arg, &end, 10);
542 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
543 		return -1;
544 
545 	if (num > max_valid_value)
546 		return -1;
547 
548 	return num;
549 
550 }
551 
552 /*
553  * Display usage
554  */
555 static void
556 us_vhost_usage(const char *prgname)
557 {
558 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
559 	"		--vm2vm [0|1|2]\n"
560 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
561 	"		--dev-basename <name> --dev-index [0-N]\n"
562 	"		--nb-devices ND\n"
563 	"		-p PORTMASK: Set mask for ports to be used by application\n"
564 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
565 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
566 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
567 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
568 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
569 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
570 	"		--dev-basename: The basename to be used for the character device.\n"
571 	"		--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
572 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
573 			"zero copy\n"
574 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
575 			"used only when zero copy is enabled.\n"
576 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
577 			"used only when zero copy is enabled.\n",
578 	       prgname);
579 }
580 
581 /*
582  * Parse the arguments given in the command line of the application.
583  */
584 static int
585 us_vhost_parse_args(int argc, char **argv)
586 {
587 	int opt, ret;
588 	int option_index;
589 	unsigned i;
590 	const char *prgname = argv[0];
591 	static struct option long_option[] = {
592 		{"vm2vm", required_argument, NULL, 0},
593 		{"rx-retry", required_argument, NULL, 0},
594 		{"rx-retry-delay", required_argument, NULL, 0},
595 		{"rx-retry-num", required_argument, NULL, 0},
596 		{"mergeable", required_argument, NULL, 0},
597 		{"stats", required_argument, NULL, 0},
598 		{"dev-basename", required_argument, NULL, 0},
599 		{"dev-index", required_argument, NULL, 0},
600 		{"zero-copy", required_argument, NULL, 0},
601 		{"rx-desc-num", required_argument, NULL, 0},
602 		{"tx-desc-num", required_argument, NULL, 0},
603 		{NULL, 0, 0, 0},
604 	};
605 
606 	/* Parse command line */
607 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
608 		switch (opt) {
609 		/* Portmask */
610 		case 'p':
611 			enabled_port_mask = parse_portmask(optarg);
612 			if (enabled_port_mask == 0) {
613 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
614 				us_vhost_usage(prgname);
615 				return -1;
616 			}
617 			break;
618 
619 		case 0:
620 			/* Enable/disable vm2vm comms. */
621 			if (!strncmp(long_option[option_index].name, "vm2vm",
622 				MAX_LONG_OPT_SZ)) {
623 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
624 				if (ret == -1) {
625 					RTE_LOG(INFO, VHOST_CONFIG,
626 						"Invalid argument for "
627 						"vm2vm [0|1|2]\n");
628 					us_vhost_usage(prgname);
629 					return -1;
630 				} else {
631 					vm2vm_mode = (vm2vm_type)ret;
632 				}
633 			}
634 
635 			/* Enable/disable retries on RX. */
636 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
637 				ret = parse_num_opt(optarg, 1);
638 				if (ret == -1) {
639 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
640 					us_vhost_usage(prgname);
641 					return -1;
642 				} else {
643 					enable_retry = ret;
644 				}
645 			}
646 
647 			/* Specify the retry delay time (in microseconds) on RX. */
648 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
649 				ret = parse_num_opt(optarg, INT32_MAX);
650 				if (ret == -1) {
651 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
652 					us_vhost_usage(prgname);
653 					return -1;
654 				} else {
655 					burst_rx_delay_time = ret;
656 				}
657 			}
658 
659 			/* Specify the number of retries on RX. */
660 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
661 				ret = parse_num_opt(optarg, INT32_MAX);
662 				if (ret == -1) {
663 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
664 					us_vhost_usage(prgname);
665 					return -1;
666 				} else {
667 					burst_rx_retry_num = ret;
668 				}
669 			}
670 
671 			/* Enable/disable RX mergeable buffers. */
672 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
673 				ret = parse_num_opt(optarg, 1);
674 				if (ret == -1) {
675 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
676 					us_vhost_usage(prgname);
677 					return -1;
678 				} else {
679 					if (ret)
680 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
681 				}
682 			}
683 
684 			/* Enable/disable stats. */
685 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
686 				ret = parse_num_opt(optarg, INT32_MAX);
687 				if (ret == -1) {
688 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
689 					us_vhost_usage(prgname);
690 					return -1;
691 				} else {
692 					enable_stats = ret;
693 				}
694 			}
695 
696 			/* Set character device basename. */
697 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
698 				if (us_vhost_parse_basename(optarg) == -1) {
699 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
700 					us_vhost_usage(prgname);
701 					return -1;
702 				}
703 			}
704 
705 			/* Set character device index. */
706 			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
707 				ret = parse_num_opt(optarg, INT32_MAX);
708 				if (ret == -1) {
709 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
710 					us_vhost_usage(prgname);
711 					return -1;
712 				} else
713 					dev_index = ret;
714 			}
715 
716 			/* Enable/disable rx/tx zero copy. */
717 			if (!strncmp(long_option[option_index].name,
718 				"zero-copy", MAX_LONG_OPT_SZ)) {
719 				ret = parse_num_opt(optarg, 1);
720 				if (ret == -1) {
721 					RTE_LOG(INFO, VHOST_CONFIG,
722 						"Invalid argument"
723 						" for zero-copy [0|1]\n");
724 					us_vhost_usage(prgname);
725 					return -1;
726 				} else
727 					zero_copy = ret;
728 
729 				if (zero_copy) {
730 #ifdef RTE_MBUF_SCATTER_GATHER
731 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
732 					"zero copy vhost APP, please "
733 					"disable RTE_MBUF_SCATTER_GATHER\n"
734 					"in config file and then rebuild DPDK "
735 					"core lib!\n"
736 					"Otherwise please disable zero copy "
737 					"flag in command line!\n");
738 					return -1;
739 #endif
740 				}
741 			}
742 
743 			/* Specify the descriptor number on RX. */
744 			if (!strncmp(long_option[option_index].name,
745 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
746 				ret = parse_num_opt(optarg, MAX_RING_DESC);
747 				if ((ret == -1) || (!POWEROF2(ret))) {
748 					RTE_LOG(INFO, VHOST_CONFIG,
749 					"Invalid argument for rx-desc-num[0-N],"
750 					"power of 2 required.\n");
751 					us_vhost_usage(prgname);
752 					return -1;
753 				} else {
754 					num_rx_descriptor = ret;
755 				}
756 			}
757 
758 			/* Specify the descriptor number on TX. */
759 			if (!strncmp(long_option[option_index].name,
760 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
761 				ret = parse_num_opt(optarg, MAX_RING_DESC);
762 				if ((ret == -1) || (!POWEROF2(ret))) {
763 					RTE_LOG(INFO, VHOST_CONFIG,
764 					"Invalid argument for tx-desc-num [0-N],"
765 					"power of 2 required.\n");
766 					us_vhost_usage(prgname);
767 					return -1;
768 				} else {
769 					num_tx_descriptor = ret;
770 				}
771 			}
772 
773 			break;
774 
775 			/* Invalid option - print options. */
776 		default:
777 			us_vhost_usage(prgname);
778 			return -1;
779 		}
780 	}
781 
782 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
783 		if (enabled_port_mask & (1 << i))
784 			ports[num_ports++] = (uint8_t)i;
785 	}
786 
787 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
788 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
789 			"but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
790 		return -1;
791 	}
792 
793 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
794 		RTE_LOG(INFO, VHOST_PORT,
795 			"Vhost zero copy doesn't support software vm2vm; "
796 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
797 		return -1;
798 	}
799 
800 	return 0;
801 }
802 
803 /*
804  * Update the global variable num_ports and the ports array according to the
805  * number of system ports and return the number of valid ports.
806  */
807 static unsigned check_ports_num(unsigned nb_ports)
808 {
809 	unsigned valid_num_ports = num_ports;
810 	unsigned portid;
811 
812 	if (num_ports > nb_ports) {
813 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
814 			num_ports, nb_ports);
815 		num_ports = nb_ports;
816 	}
817 
818 	for (portid = 0; portid < num_ports; portid ++) {
819 		if (ports[portid] >= nb_ports) {
820 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
821 				ports[portid], (nb_ports - 1));
822 			ports[portid] = INVALID_PORT_ID;
823 			valid_num_ports--;
824 		}
825 	}
826 	return valid_num_ports;
827 }
828 
829 /*
830  * Macro to print out packet contents. Wrapped in debug define so that the
831  * data path is not affected when debug is disabled.
832  */
833 #ifdef DEBUG
834 #define PRINT_PACKET(device, addr, size, header) do {																\
835 	char *pkt_addr = (char*)(addr);																					\
836 	unsigned int index;																								\
837 	char packet[MAX_PRINT_BUFF];																					\
838 																													\
839 	if ((header))																									\
840 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
841 	else																											\
842 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
843 	for (index = 0; index < (size); index++) {																		\
844 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
845 			"%02hhx ", pkt_addr[index]);																			\
846 	}																												\
847 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
848 																													\
849 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
850 } while(0)
851 #else
852 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
853 #endif
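/*
 * Usage sketch (only active when DEBUG is defined): the data path below calls,
 * for example,
 *
 *   PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
 *   PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
 *
 * where the last argument selects the "Header size" vs "Packet size" prefix in
 * the hex dump.
 */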
854 
855 /*
856  * Function to convert guest physical addresses to vhost virtual addresses. This
857  * is used to convert virtio buffer addresses.
858  */
859 static inline uint64_t __attribute__((always_inline))
860 gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
861 {
862 	struct virtio_memory_regions *region;
863 	uint32_t regionidx;
864 	uint64_t vhost_va = 0;
865 
866 	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
867 		region = &dev->mem->regions[regionidx];
868 		if ((guest_pa >= region->guest_phys_address) &&
869 			(guest_pa <= region->guest_phys_address_end)) {
870 			vhost_va = region->address_offset + guest_pa;
871 			break;
872 		}
873 	}
874 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n",
875 		dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
876 
877 	return vhost_va;
878 }
879 
880 /*
881  * Function to convert guest physical addresses to vhost physical addresses.
882  * This is used to convert virtio buffer addresses.
883  */
884 static inline uint64_t __attribute__((always_inline))
885 gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
886 	uint32_t buf_len, hpa_type *addr_type)
887 {
888 	struct virtio_memory_regions_hpa *region;
889 	uint32_t regionidx;
890 	uint64_t vhost_pa = 0;
891 
892 	*addr_type = PHYS_ADDR_INVALID;
893 
894 	for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
895 		region = &dev->mem->regions_hpa[regionidx];
896 		if ((guest_pa >= region->guest_phys_address) &&
897 			(guest_pa <= region->guest_phys_address_end)) {
898 			vhost_pa = region->host_phys_addr_offset + guest_pa;
899 			if (likely((guest_pa + buf_len - 1)
900 				<= region->guest_phys_address_end))
901 				*addr_type = PHYS_ADDR_CONTINUOUS;
902 			else
903 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
904 			break;
905 		}
906 	}
907 
908 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
909 		dev->device_fh, (void *)(uintptr_t)guest_pa,
910 		(void *)(uintptr_t)vhost_pa);
911 
912 	return vhost_pa;
913 }
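/*
 * Classification sketch (hypothetical numbers): for a host-physical sub-region
 * covering guest physical 0x100000..0x1FFFFF, a 2 KB buffer starting at
 * 0x1FFC00 would end at 0x2003FF, past the region end, so it is reported as
 * PHYS_ADDR_CROSS_SUBREG; the zero copy RX path below, for example, then puts
 * the descriptor straight back on the used ring (see attach_rxmbuf_zcp()).
 */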
914 
915 /*
916  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
917  * be received from the physical port or from another virtio device. A packet
918  * count is returned to indicate the number of packets that were successfully
919  * added to the RX queue.
920  */
921 static inline uint32_t __attribute__((always_inline))
922 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
923 {
924 	struct vhost_virtqueue *vq;
925 	struct vring_desc *desc;
926 	struct rte_mbuf *buff;
927 	/* The virtio_hdr is initialised to 0. */
928 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
929 	uint64_t buff_addr = 0;
930 	uint64_t buff_hdr_addr = 0;
931 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
932 	uint32_t head_idx, packet_success = 0;
933 	uint32_t mergeable, mrg_count = 0;
934 	uint32_t retry = 0;
935 	uint16_t avail_idx, res_cur_idx;
936 	uint16_t res_base_idx, res_end_idx;
937 	uint16_t free_entries;
938 	uint8_t success = 0;
939 
940 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
941 	vq = dev->virtqueue[VIRTIO_RXQ];
942 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
943 	/* As many data cores may want access to available buffers, they need to be reserved. */
944 	do {
945 		res_base_idx = vq->last_used_idx_res;
946 		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
947 
948 		free_entries = (avail_idx - res_base_idx);
949 		/* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
950 		if (enable_retry && unlikely(count > free_entries)) {
951 			for (retry = 0; retry < burst_rx_retry_num; retry++) {
952 				rte_delay_us(burst_rx_delay_time);
953 				avail_idx =
954 					*((volatile uint16_t *)&vq->avail->idx);
955 				free_entries = (avail_idx - res_base_idx);
956 				if (count <= free_entries)
957 					break;
958 			}
959 		}
960 
961 		/*check that we have enough buffers*/
962 		if (unlikely(count > free_entries))
963 			count = free_entries;
964 
965 		if (count == 0)
966 			return 0;
967 
968 		res_end_idx = res_base_idx + count;
969 		/* vq->last_used_idx_res is atomically updated. */
970 		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
971 									res_end_idx);
972 	} while (unlikely(success == 0));
973 	res_cur_idx = res_base_idx;
974 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
975 
976 	/* Prefetch available ring to retrieve indexes. */
977 	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
978 
979 	/* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
980 	mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
981 
982 	/* Retrieve all of the head indexes first to avoid caching issues. */
983 	for (head_idx = 0; head_idx < count; head_idx++)
984 		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
985 
986 	/*Prefetch descriptor index. */
987 	rte_prefetch0(&vq->desc[head[packet_success]]);
988 
989 	while (res_cur_idx != res_end_idx) {
990 		/* Get descriptor from available ring */
991 		desc = &vq->desc[head[packet_success]];
992 
993 		buff = pkts[packet_success];
994 
995 		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
996 		buff_addr = gpa_to_vva(dev, desc->addr);
997 		/* Prefetch buffer address. */
998 		rte_prefetch0((void*)(uintptr_t)buff_addr);
999 
1000 		if (mergeable && (mrg_count != 0)) {
1001 			desc->len = packet_len = rte_pktmbuf_data_len(buff);
1002 		} else {
1003 			/* Copy virtio_hdr to packet and increment buffer address */
1004 			buff_hdr_addr = buff_addr;
1005 			packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1006 
1007 			/*
1008 			 * If the descriptors are chained the header and data are placed in
1009 			 * separate buffers.
1010 			 */
1011 			if (desc->flags & VRING_DESC_F_NEXT) {
1012 				desc->len = vq->vhost_hlen;
1013 				desc = &vq->desc[desc->next];
1014 				/* Buffer address translation. */
1015 				buff_addr = gpa_to_vva(dev, desc->addr);
1016 				desc->len = rte_pktmbuf_data_len(buff);
1017 			} else {
1018 				buff_addr += vq->vhost_hlen;
1019 				desc->len = packet_len;
1020 			}
1021 		}
1022 
1023 		PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
1024 
1025 		/* Update used ring with desc information */
1026 		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
1027 		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
1028 
1029 		/* Copy mbuf data to buffer */
1030 		rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
1031 
1032 		res_cur_idx++;
1033 		packet_success++;
1034 
1035 		/* If mergeable is disabled then a header is required per buffer. */
1036 		if (!mergeable) {
1037 			rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
1038 			PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1039 		} else {
1040 			mrg_count++;
1041 			/* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
1042 			if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
1043 				virtio_hdr.num_buffers = mrg_count;
1044 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
1045 				rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
1046 				PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1047 				mrg_count = 0;
1048 			}
1049 		}
1050 		if (res_cur_idx < res_end_idx) {
1051 			/* Prefetch descriptor index. */
1052 			rte_prefetch0(&vq->desc[head[packet_success]]);
1053 		}
1054 	}
1055 
1056 	rte_compiler_barrier();
1057 
1058 	/* Wait until it's our turn to add our buffer to the used ring. */
1059 	while (unlikely(vq->last_used_idx != res_base_idx))
1060 		rte_pause();
1061 
1062 	*(volatile uint16_t *)&vq->used->idx += count;
1063 	vq->last_used_idx = res_end_idx;
1064 
1065 	/* Kick the guest if necessary. */
1066 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1067 		eventfd_write((int)vq->kickfd, 1);
1068 	return count;
1069 }
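/*
 * Concurrency sketch of the enqueue above: a core first reserves a slice of
 * the available ring with rte_atomic16_cmpset() on vq->last_used_idx_res
 * (range res_base_idx..res_end_idx), copies its packets into the reserved
 * descriptors, and only then spins until vq->last_used_idx reaches its own
 * res_base_idx. This forces cores to publish their used-ring updates in
 * reservation order, so vq->used->idx never exposes partially filled entries.
 */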
1070 
1071 /*
1072  * Compares a packet destination MAC address to a device MAC address.
1073  */
1074 static inline int __attribute__((always_inline))
1075 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
1076 {
1077 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
1078 }
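/*
 * Comparison sketch: both ether_addr pointers are read as 64-bit words, so on
 * a little-endian host the six address bytes land in the low 48 bits and the
 * two bytes that follow the address in memory land in the high 16 bits.
 * XOR-ing the words and masking with MAC_ADDR_CMP (0x0000FFFFFFFFFFFF)
 * therefore compares only the MAC bytes; a zero result means the addresses
 * match.
 */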
1079 
1080 /*
1081  * This function learns the MAC address of the device and registers this along with a
1082  * vlan tag to a VMDQ.
1083  */
1084 static int
1085 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
1086 {
1087 	struct ether_hdr *pkt_hdr;
1088 	struct virtio_net_data_ll *dev_ll;
1089 	int i, ret;
1090 
1091 	/* Learn MAC address of guest device from packet */
1092 	pkt_hdr = (struct ether_hdr *)m->pkt.data;
1093 
1094 	dev_ll = ll_root_used;
1095 
1096 	while (dev_ll != NULL) {
1097 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
1098 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
1099 			return -1;
1100 		}
1101 		dev_ll = dev_ll->next;
1102 	}
1103 
1104 	for (i = 0; i < ETHER_ADDR_LEN; i++)
1105 		dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
1106 
1107 	/* vlan_tag currently uses the device_id. */
1108 	dev->vlan_tag = vlan_tags[dev->device_fh];
1109 
1110 	/* Print out VMDQ registration info. */
1111 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
1112 		dev->device_fh,
1113 		dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
1114 		dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
1115 		dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
1116 		dev->vlan_tag);
1117 
1118 	/* Register the MAC address. */
1119 	ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
1120 	if (ret)
1121 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
1122 					dev->device_fh);
1123 
1124 	/* Enable stripping of the vlan tag as we handle routing. */
1125 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
1126 
1127 	/* Set device as ready for RX. */
1128 	dev->ready = DEVICE_RX;
1129 
1130 	return 0;
1131 }
1132 
1133 /*
1134  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1135  * queue before disabling RX on the device.
1136  */
1137 static inline void
1138 unlink_vmdq(struct virtio_net *dev)
1139 {
1140 	unsigned i = 0;
1141 	unsigned rx_count;
1142 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1143 
1144 	if (dev->ready == DEVICE_RX) {
1145 		/*clear MAC and VLAN settings*/
1146 		rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
1147 		for (i = 0; i < 6; i++)
1148 			dev->mac_address.addr_bytes[i] = 0;
1149 
1150 		dev->vlan_tag = 0;
1151 
1152 		/*Clear out the receive buffers*/
1153 		rx_count = rte_eth_rx_burst(ports[0],
1154 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1155 
1156 		while (rx_count) {
1157 			for (i = 0; i < rx_count; i++)
1158 				rte_pktmbuf_free(pkts_burst[i]);
1159 
1160 			rx_count = rte_eth_rx_burst(ports[0],
1161 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1162 		}
1163 
1164 		dev->ready = DEVICE_MAC_LEARNING;
1165 	}
1166 }
1167 
1168 /*
1169  * Check if the packet destination MAC address is for a local device. If so then put
1170  * the packet on that device's RX queue. If not then return.
1171  */
1172 static inline unsigned __attribute__((always_inline))
1173 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1174 {
1175 	struct virtio_net_data_ll *dev_ll;
1176 	struct ether_hdr *pkt_hdr;
1177 	uint64_t ret = 0;
1178 
1179 	pkt_hdr = (struct ether_hdr *)m->pkt.data;
1180 
1181 	/*get the used devices list*/
1182 	dev_ll = ll_root_used;
1183 
1184 	while (dev_ll != NULL) {
1185 		if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1186 				          &dev_ll->dev->mac_address)) {
1187 
1188 			/* Drop the packet if the TX packet is destined for the TX device. */
1189 			if (dev_ll->dev->device_fh == dev->device_fh) {
1190 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1191 							dev_ll->dev->device_fh);
1192 				return 0;
1193 			}
1194 
1195 
1196 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1197 
1198 			if (dev_ll->dev->remove) {
1199 				/*drop the packet if the device is marked for removal*/
1200 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1201 			} else {
1202 				/*send the packet to the local virtio device*/
1203 				ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1204 				if (enable_stats) {
1205 					rte_atomic64_add(
1206 					&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1207 					1);
1208 					rte_atomic64_add(
1209 					&dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1210 					ret);
1211 					dev_statistics[dev->device_fh].tx_total++;
1212 					dev_statistics[dev->device_fh].tx += ret;
1213 				}
1214 			}
1215 
1216 			return 0;
1217 		}
1218 		dev_ll = dev_ll->next;
1219 	}
1220 
1221 	return -1;
1222 }
1223 
1224 /*
1225  * This function routes the TX packet to the correct interface. This may be a local device
1226  * or the physical port.
1227  */
1228 static inline void __attribute__((always_inline))
1229 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1230 {
1231 	struct mbuf_table *tx_q;
1232 	struct vlan_ethhdr *vlan_hdr;
1233 	struct rte_mbuf **m_table;
1234 	struct rte_mbuf *mbuf;
1235 	unsigned len, ret, offset = 0;
1236 	const uint16_t lcore_id = rte_lcore_id();
1237 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1238 	struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
1239 
1240 	/*check if destination is local VM*/
1241 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1242 		return;
1243 
1244 	if (vm2vm_mode == VM2VM_HARDWARE) {
1245 		while (dev_ll != NULL) {
1246 			if ((dev_ll->dev->ready == DEVICE_RX)
1247 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1248 				&dev_ll->dev->mac_address)) {
1249 				/*
1250 				 * Drop the packet if the TX packet is
1251 				 * destined for the TX device.
1252 				 */
1253 				if (dev_ll->dev->device_fh == dev->device_fh) {
1254 					LOG_DEBUG(VHOST_DATA,
1255 					"(%"PRIu64") TX: Source and destination"
1256 					" MAC addresses are the same. Dropping "
1257 					"packet.\n",
1258 					dev_ll->dev->device_fh);
1259 					return;
1260 				}
1261 				offset = 4;
1262 				vlan_tag =
1263 				(uint16_t)
1264 				vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1265 
1266 				LOG_DEBUG(VHOST_DATA,
1267 				"(%"PRIu64") TX: pkt to local VM device id:"
1268 				"(%"PRIu64") vlan tag: %d.\n",
1269 				dev->device_fh, dev_ll->dev->device_fh,
1270 				vlan_tag);
1271 
1272 				break;
1273 			}
1274 			dev_ll = dev_ll->next;
1275 		}
1276 	}
1277 
1278 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1279 
1280 	/*Add packet to the port tx queue*/
1281 	tx_q = &lcore_tx_queue[lcore_id];
1282 	len = tx_q->len;
1283 
1284 	/* Allocate an mbuf and populate the structure. */
1285 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1286 	if (unlikely(mbuf == NULL)) {
1287 		RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
1288 		return;
1289 	}
1290 
1291 	mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
1292 	mbuf->pkt.pkt_len = mbuf->pkt.data_len;
1293 
1294 	/* Copy ethernet header to mbuf. */
1295 	rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
1296 
1297 
1298 	/* Setup vlan header. Bytes need to be re-ordered to network byte order with htons(). */
1299 	vlan_hdr = (struct vlan_ethhdr *) mbuf->pkt.data;
1300 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1301 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1302 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1303 
1304 	/* Copy the remaining packet contents to the mbuf. */
1305 	rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
1306 		(const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
1307 	tx_q->m_table[len] = mbuf;
1308 	len++;
1309 	if (enable_stats) {
1310 		dev_statistics[dev->device_fh].tx_total++;
1311 		dev_statistics[dev->device_fh].tx++;
1312 	}
1313 
1314 	if (unlikely(len == MAX_PKT_BURST)) {
1315 		m_table = (struct rte_mbuf **)tx_q->m_table;
1316 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1317 		/* Free any buffers not handled by TX and update the port stats. */
1318 		if (unlikely(ret < len)) {
1319 			do {
1320 				rte_pktmbuf_free(m_table[ret]);
1321 			} while (++ret < len);
1322 		}
1323 
1324 		len = 0;
1325 	}
1326 
1327 	tx_q->len = len;
1328 	return;
1329 }
1330 
1331 static inline void __attribute__((always_inline))
1332 virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
1333 {
1334 	struct rte_mbuf m;
1335 	struct vhost_virtqueue *vq;
1336 	struct vring_desc *desc;
1337 	uint64_t buff_addr = 0;
1338 	uint32_t head[MAX_PKT_BURST];
1339 	uint32_t used_idx;
1340 	uint32_t i;
1341 	uint16_t free_entries, packet_success = 0;
1342 	uint16_t avail_idx;
1343 
1344 	vq = dev->virtqueue[VIRTIO_TXQ];
1345 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1346 
1347 	/* If there are no available buffers then return. */
1348 	if (vq->last_used_idx == avail_idx)
1349 		return;
1350 
1351 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1352 
1353 	/* Prefetch available ring to retrieve head indexes. */
1354 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1355 
1356 	/*get the number of free entries in the ring*/
1357 	free_entries = (avail_idx - vq->last_used_idx);
1358 
1359 	/* Limit to MAX_PKT_BURST. */
1360 	if (free_entries > MAX_PKT_BURST)
1361 		free_entries = MAX_PKT_BURST;
1362 
1363 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
1364 	/* Retrieve all of the head indexes first to avoid caching issues. */
1365 	for (i = 0; i < free_entries; i++)
1366 		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1367 
1368 	/* Prefetch descriptor index. */
1369 	rte_prefetch0(&vq->desc[head[packet_success]]);
1370 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1371 
1372 	while (packet_success < free_entries) {
1373 		desc = &vq->desc[head[packet_success]];
1374 
1375 		/* Discard first buffer as it is the virtio header */
1376 		desc = &vq->desc[desc->next];
1377 
1378 		/* Buffer address translation. */
1379 		buff_addr = gpa_to_vva(dev, desc->addr);
1380 		/* Prefetch buffer address. */
1381 		rte_prefetch0((void*)(uintptr_t)buff_addr);
1382 
1383 		used_idx = vq->last_used_idx & (vq->size - 1);
1384 
1385 		if (packet_success < (free_entries - 1)) {
1386 			/* Prefetch descriptor index. */
1387 			rte_prefetch0(&vq->desc[head[packet_success+1]]);
1388 			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1389 		}
1390 
1391 		/* Update used index buffer information. */
1392 		vq->used->ring[used_idx].id = head[packet_success];
1393 		vq->used->ring[used_idx].len = 0;
1394 
1395 		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
1396 		m.pkt.data_len = desc->len;
1397 		m.pkt.data = (void*)(uintptr_t)buff_addr;
1398 
1399 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1400 
1401 		/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1402 		if (dev->ready == DEVICE_MAC_LEARNING) {
1403 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
1404 				/*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
1405 				packet_success += free_entries;
1406 				vq->last_used_idx += packet_success;
1407 				break;
1408 			}
1409 		}
1410 		virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
1411 
1412 		vq->last_used_idx++;
1413 		packet_success++;
1414 	}
1415 
1416 	rte_compiler_barrier();
1417 	vq->used->idx += packet_success;
1418 	/* Kick guest if required. */
1419 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1420 		eventfd_write((int)vq->kickfd, 1);
1421 }
1422 
1423 /*
1424  * This function is called by each data core. It handles all RX/TX registered with the
1425  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1426  * with all devices in the main linked list.
1427  */
1428 static int
1429 switch_worker(__attribute__((unused)) void *arg)
1430 {
1431 	struct rte_mempool *mbuf_pool = arg;
1432 	struct virtio_net *dev = NULL;
1433 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1434 	struct virtio_net_data_ll *dev_ll;
1435 	struct mbuf_table *tx_q;
1436 	volatile struct lcore_ll_info *lcore_ll;
1437 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1438 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1439 	unsigned ret, i;
1440 	const uint16_t lcore_id = rte_lcore_id();
1441 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1442 	uint16_t rx_count = 0;
1443 
1444 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1445 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1446 	prev_tsc = 0;
1447 
1448 	tx_q = &lcore_tx_queue[lcore_id];
1449 	for (i = 0; i < num_cores; i ++) {
1450 		if (lcore_ids[i] == lcore_id) {
1451 			tx_q->txq_id = i;
1452 			break;
1453 		}
1454 	}
1455 
1456 	while(1) {
1457 		cur_tsc = rte_rdtsc();
1458 		/*
1459 		 * TX burst queue drain
1460 		 */
1461 		diff_tsc = cur_tsc - prev_tsc;
1462 		if (unlikely(diff_tsc > drain_tsc)) {
1463 
1464 			if (tx_q->len) {
1465 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1466 
1467 				/*Tx any packets in the queue*/
1468 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1469 									   (struct rte_mbuf **)tx_q->m_table,
1470 									   (uint16_t)tx_q->len);
1471 				if (unlikely(ret < tx_q->len)) {
1472 					do {
1473 						rte_pktmbuf_free(tx_q->m_table[ret]);
1474 					} while (++ret < tx_q->len);
1475 				}
1476 
1477 				tx_q->len = 0;
1478 			}
1479 
1480 			prev_tsc = cur_tsc;
1481 
1482 		}
1483 
1484 		rte_prefetch0(lcore_ll->ll_root_used);
1485 		/*
1486 		 * Inform the configuration core that we have exited the linked list and that no devices are
1487 		 * in use if requested.
1488 		 */
1489 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1490 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1491 
1492 		/*
1493 		 * Process devices
1494 		 */
1495 		dev_ll = lcore_ll->ll_root_used;
1496 
1497 		while (dev_ll != NULL) {
1498 			/*get virtio device ID*/
1499 			dev = dev_ll->dev;
1500 
1501 			if (dev->remove) {
1502 				dev_ll = dev_ll->next;
1503 				unlink_vmdq(dev);
1504 				dev->ready = DEVICE_SAFE_REMOVE;
1505 				continue;
1506 			}
1507 			if (likely(dev->ready == DEVICE_RX)) {
1508 				/*Handle guest RX*/
1509 				rx_count = rte_eth_rx_burst(ports[0],
1510 					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1511 
1512 				if (rx_count) {
1513 					ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
1514 					if (enable_stats) {
1515 						rte_atomic64_add(
1516 						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1517 						rx_count);
1518 						rte_atomic64_add(
1519 						&dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
1520 					}
1521 					while (likely(rx_count)) {
1522 						rx_count--;
1523 						rte_pktmbuf_free_seg(pkts_burst[rx_count]);
1524 					}
1525 
1526 				}
1527 			}
1528 
1529 			if (!dev->remove)
1530 				/*Handle guest TX*/
1531 				virtio_dev_tx(dev, mbuf_pool);
1532 
1533 			/*move to the next device in the list*/
1534 			dev_ll = dev_ll->next;
1535 		}
1536 	}
1537 
1538 	return 0;
1539 }
1540 
1541 /*
1542  * This function gets the number of available ring entries for zero copy RX.
1543  * Only one thread will call this function for a particular virtio device,
1544  * so it is designed as a non-thread-safe function.
1545  */
1546 static inline uint32_t __attribute__((always_inline))
1547 get_available_ring_num_zcp(struct virtio_net *dev)
1548 {
1549 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1550 	uint16_t avail_idx;
1551 
1552 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1553 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1554 }
1555 
1556 /*
1557  * This function gets an available ring index for zero copy RX;
1558  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1559  * Only one thread will call this function for a particular virtio device,
1560  * so it is designed as a non-thread-safe function.
1561  */
1562 static inline uint32_t __attribute__((always_inline))
1563 get_available_ring_index_zcp(struct virtio_net *dev,
1564 	uint16_t *res_base_idx, uint32_t count)
1565 {
1566 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1567 	uint16_t avail_idx;
1568 	uint32_t retry = 0;
1569 	uint16_t free_entries;
1570 
1571 	*res_base_idx = vq->last_used_idx_res;
1572 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1573 	free_entries = (avail_idx - *res_base_idx);
1574 
1575 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1576 			"avail idx: %d, "
1577 			"res base idx:%d, free entries:%d\n",
1578 			dev->device_fh, avail_idx, *res_base_idx,
1579 			free_entries);
1580 
1581 	/*
1582 	 * If retry is enabled and the queue is full then we wait
1583 	 * and retry to avoid packet loss.
1584 	 */
1585 	if (enable_retry && unlikely(count > free_entries)) {
1586 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1587 			rte_delay_us(burst_rx_delay_time);
1588 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1589 			free_entries = (avail_idx - *res_base_idx);
1590 			if (count <= free_entries)
1591 				break;
1592 		}
1593 	}
1594 
1595 	/*check that we have enough buffers*/
1596 	if (unlikely(count > free_entries))
1597 		count = free_entries;
1598 
1599 	if (unlikely(count == 0)) {
1600 		LOG_DEBUG(VHOST_DATA,
1601 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1602 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1603 			dev->device_fh, avail_idx,
1604 			*res_base_idx, free_entries);
1605 		return 0;
1606 	}
1607 
1608 	vq->last_used_idx_res = *res_base_idx + count;
1609 
1610 	return count;
1611 }
1612 
1613 /*
1614  * This function puts a descriptor back on the used list.
1615  */
1616 static inline void __attribute__((always_inline))
1617 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1618 {
1619 	uint16_t res_cur_idx = vq->last_used_idx;
1620 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1621 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1622 	rte_compiler_barrier();
1623 	*(volatile uint16_t *)&vq->used->idx += 1;
1624 	vq->last_used_idx += 1;
1625 
1626 	/* Kick the guest if necessary. */
1627 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1628 		eventfd_write((int)vq->kickfd, 1);
1629 }
1630 
1631 /*
1632  * This function gets an available descriptor from the virtio vring and an
1633  * un-attached mbuf from vpool->ring, and attaches them together. It must adjust
1634  * the offsets for buff_addr and phys_addr according to the PMD implementation,
1635  * otherwise the frame data may be put in the wrong location in the mbuf.
1636  */
1637 static inline void __attribute__((always_inline))
1638 attach_rxmbuf_zcp(struct virtio_net *dev)
1639 {
1640 	uint16_t res_base_idx, desc_idx;
1641 	uint64_t buff_addr, phys_addr;
1642 	struct vhost_virtqueue *vq;
1643 	struct vring_desc *desc;
1644 	struct rte_mbuf *mbuf = NULL;
1645 	struct vpool *vpool;
1646 	hpa_type addr_type;
1647 
1648 	vpool = &vpool_array[dev->vmdq_rx_q];
1649 	vq = dev->virtqueue[VIRTIO_RXQ];
1650 
1651 	do {
1652 		if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
1653 				1) != 1))
1654 			return;
1655 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1656 
1657 		desc = &vq->desc[desc_idx];
1658 		if (desc->flags & VRING_DESC_F_NEXT) {
1659 			desc = &vq->desc[desc->next];
1660 			buff_addr = gpa_to_vva(dev, desc->addr);
1661 			phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
1662 					&addr_type);
1663 		} else {
1664 			buff_addr = gpa_to_vva(dev,
1665 					desc->addr + vq->vhost_hlen);
1666 			phys_addr = gpa_to_hpa(dev,
1667 					desc->addr + vq->vhost_hlen,
1668 					desc->len, &addr_type);
1669 		}
1670 
1671 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1672 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1673 				" address found when attaching RX frame buffer"
1674 				" address!\n", dev->device_fh);
1675 			put_desc_to_used_list_zcp(vq, desc_idx);
1676 			continue;
1677 		}
1678 
1679 		/*
1680 		 * Check if the frame buffer address from guest crosses
1681 		 * sub-region or not.
1682 		 */
1683 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1684 			RTE_LOG(ERR, VHOST_DATA,
1685 				"(%"PRIu64") Frame buffer address crossing "
1686 				"sub-region found when attaching RX frame "
1687 				"buffer address!\n",
1688 				dev->device_fh);
1689 			put_desc_to_used_list_zcp(vq, desc_idx);
1690 			continue;
1691 		}
1692 	} while (unlikely(phys_addr == 0));
1693 
1694 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1695 	if (unlikely(mbuf == NULL)) {
1696 		LOG_DEBUG(VHOST_DATA,
1697 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1698 			"ring_sc_dequeue fail.\n",
1699 			dev->device_fh);
1700 		put_desc_to_used_list_zcp(vq, desc_idx);
1701 		return;
1702 	}
1703 
1704 	if (unlikely(vpool->buf_size > desc->len)) {
1705 		LOG_DEBUG(VHOST_DATA,
1706 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1707 			"length(%d) of descriptor idx: %d less than room "
1708 			"size required: %d\n",
1709 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1710 		put_desc_to_used_list_zcp(vq, desc_idx);
1711 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1712 		return;
1713 	}
1714 
1715 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1716 	mbuf->pkt.data = (void *)(uintptr_t)(buff_addr);
1717 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1718 	mbuf->pkt.data_len = desc->len;
1719 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1720 
1721 	LOG_DEBUG(VHOST_DATA,
1722 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1723 		"descriptor idx:%d\n",
1724 		dev->device_fh, res_base_idx, desc_idx);
1725 
1726 	__rte_mbuf_raw_free(mbuf);
1727 
1728 	return;
1729 }
1730 
1731 /*
1732  * Detach an attached packet mbuf -
1733  *  - restore original mbuf address and length values.
1734  *  - reset pktmbuf data and data_len to their default values.
1735  *  All other fields of the given packet mbuf will be left intact.
1736  *
1737  * @param m
1738  *   The attached packet mbuf.
1739  */
1740 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1741 {
1742 	const struct rte_mempool *mp = m->pool;
1743 	void *buf = RTE_MBUF_TO_BADDR(m);
1744 	uint32_t buf_ofs;
1745 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1746 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1747 
1748 	m->buf_addr = buf;
1749 	m->buf_len = (uint16_t)buf_len;
1750 
1751 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1752 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1753 	m->pkt.data = (char *) m->buf_addr + buf_ofs;
1754 
1755 	m->pkt.data_len = 0;
1756 }
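
/*
 * Hedged sketch of the recycle step used by txmbuf_clean_zcp() and
 * mbuf_destroy_zcp() below: pull one raw mbuf from the pool, detach any
 * attached guest buffer and hand the mbuf back to the vpool ring. This
 * hypothetical helper is illustrative only and is never called.
 */
static inline void
recycle_one_mbuf_example(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = __rte_mbuf_raw_alloc(vpool->pool);

	if (unlikely(mbuf == NULL))
		return;
	if (likely(RTE_MBUF_INDIRECT(mbuf)))
		pktmbuf_detach_zcp(mbuf);
	rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
}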
1757 
1758 /*
1759  * This function is called after packets have been transmitted. It fetches
1760  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1761  * also updates the used index and kicks the guest if necessary.
1762  */
1763 static inline uint32_t __attribute__((always_inline))
1764 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1765 {
1766 	struct rte_mbuf *mbuf;
1767 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1768 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1769 	uint32_t index = 0;
1770 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1771 
1772 	LOG_DEBUG(VHOST_DATA,
1773 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1774 		"clean is: %d\n",
1775 		dev->device_fh, mbuf_count);
1776 	LOG_DEBUG(VHOST_DATA,
1777 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1778 		"clean  is : %d\n",
1779 		dev->device_fh, rte_ring_count(vpool->ring));
1780 
1781 	for (index = 0; index < mbuf_count; index++) {
1782 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1783 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1784 			pktmbuf_detach_zcp(mbuf);
1785 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1786 
1787 		/* Update used index buffer information. */
1788 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1789 		vq->used->ring[used_idx].len = 0;
1790 
1791 		used_idx = (used_idx + 1) & (vq->size - 1);
1792 	}
1793 
1794 	LOG_DEBUG(VHOST_DATA,
1795 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1796 		"clean is: %d\n",
1797 		dev->device_fh, rte_mempool_count(vpool->pool));
1798 	LOG_DEBUG(VHOST_DATA,
1799 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1800 		"clean  is : %d\n",
1801 		dev->device_fh, rte_ring_count(vpool->ring));
1802 	LOG_DEBUG(VHOST_DATA,
1803 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1804 		"vq->last_used_idx:%d\n",
1805 		dev->device_fh, vq->last_used_idx);
1806 
1807 	vq->last_used_idx += mbuf_count;
1808 
1809 	LOG_DEBUG(VHOST_DATA,
1810 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1811 		"vq->last_used_idx:%d\n",
1812 		dev->device_fh, vq->last_used_idx);
1813 
1814 	rte_compiler_barrier();
1815 
1816 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1817 
1818 	/* Kick guest if required. */
1819 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1820 		eventfd_write((int)vq->kickfd, 1);
1821 
1822 	return 0;
1823 }
1824 
1825 /*
1826  * This function is called when a virtio device is destroyed.
1827  * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1828  */
1829 static void mbuf_destroy_zcp(struct vpool *vpool)
1830 {
1831 	struct rte_mbuf *mbuf = NULL;
1832 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1833 
1834 	LOG_DEBUG(VHOST_CONFIG,
1835 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1836 		"mbuf_destroy_zcp is: %d\n",
1837 		mbuf_count);
1838 	LOG_DEBUG(VHOST_CONFIG,
1839 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1840 		"mbuf_destroy_zcp  is : %d\n",
1841 		rte_ring_count(vpool->ring));
1842 
1843 	for (index = 0; index < mbuf_count; index++) {
1844 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1845 		if (likely(mbuf != NULL)) {
1846 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1847 				pktmbuf_detach_zcp(mbuf);
1848 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1849 		}
1850 	}
1851 
1852 	LOG_DEBUG(VHOST_CONFIG,
1853 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1854 		"mbuf_destroy_zcp is: %d\n",
1855 		rte_mempool_count(vpool->pool));
1856 	LOG_DEBUG(VHOST_CONFIG,
1857 		"in mbuf_destroy_zcp: mbuf count in ring after "
1858 		"mbuf_destroy_zcp is : %d\n",
1859 		rte_ring_count(vpool->ring));
1860 }
1861 
1862 /*
1863  * This function fills in the virtio headers and updates the used ring for zero copy RX.
1864  */
1865 static inline uint32_t __attribute__((always_inline))
1866 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1867 	uint32_t count)
1868 {
1869 	struct vhost_virtqueue *vq;
1870 	struct vring_desc *desc;
1871 	struct rte_mbuf *buff;
1872 	/* The virtio_hdr is initialised to 0. */
1873 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1874 		= {{0, 0, 0, 0, 0, 0}, 0};
1875 	uint64_t buff_hdr_addr = 0;
1876 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1877 	uint32_t head_idx, packet_success = 0;
1878 	uint16_t res_cur_idx;
1879 
1880 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1881 
1882 	if (count == 0)
1883 		return 0;
1884 
1885 	vq = dev->virtqueue[VIRTIO_RXQ];
1886 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1887 
1888 	res_cur_idx = vq->last_used_idx;
1889 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1890 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1891 
1892 	/* Retrieve all of the head indexes first to avoid caching issues. */
1893 	for (head_idx = 0; head_idx < count; head_idx++)
1894 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1895 
1896 	/* Prefetch descriptor index. */
1897 	rte_prefetch0(&vq->desc[head[packet_success]]);
1898 
1899 	while (packet_success != count) {
1900 		/* Get descriptor from available ring */
1901 		desc = &vq->desc[head[packet_success]];
1902 
1903 		buff = pkts[packet_success];
1904 		LOG_DEBUG(VHOST_DATA,
1905 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1906 			"pkt[%d] descriptor idx: %d\n",
1907 			dev->device_fh, packet_success,
1908 			MBUF_HEADROOM_UINT32(buff));
1909 
1910 		PRINT_PACKET(dev,
1911 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1912 			+ RTE_PKTMBUF_HEADROOM),
1913 			rte_pktmbuf_data_len(buff), 0);
1914 
1915 		/* Buffer address translation for virtio header. */
1916 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1917 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1918 
1919 		/*
1920 		 * If the descriptors are chained the header and data are
1921 		 * placed in separate buffers.
1922 		 */
1923 		if (desc->flags & VRING_DESC_F_NEXT) {
1924 			desc->len = vq->vhost_hlen;
1925 			desc = &vq->desc[desc->next];
1926 			desc->len = rte_pktmbuf_data_len(buff);
1927 		} else {
1928 			desc->len = packet_len;
1929 		}
1930 
1931 		/* Update used ring with desc information */
1932 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1933 			= head[packet_success];
1934 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1935 			= packet_len;
1936 		res_cur_idx++;
1937 		packet_success++;
1938 
1939 		/* A header is required per buffer. */
1940 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1941 			(const void *)&virtio_hdr, vq->vhost_hlen);
1942 
1943 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1944 
1945 		if (likely(packet_success < count)) {
1946 			/* Prefetch descriptor index. */
1947 			rte_prefetch0(&vq->desc[head[packet_success]]);
1948 		}
1949 	}
1950 
1951 	rte_compiler_barrier();
1952 
1953 	LOG_DEBUG(VHOST_DATA,
1954 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1955 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1956 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1957 
1958 	*(volatile uint16_t *)&vq->used->idx += count;
1959 	vq->last_used_idx += count;
1960 
1961 	LOG_DEBUG(VHOST_DATA,
1962 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1963 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1964 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1965 
1966 	/* Kick the guest if necessary. */
1967 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1968 		eventfd_write((int)vq->kickfd, 1);
1969 
1970 	return count;
1971 }
1972 
1973 /*
1974  * This function routes the TX packet to the correct interface.
1975  * This may be a local device or the physical port.
1976  */
1977 static inline void __attribute__((always_inline))
1978 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1979 	uint32_t desc_idx, uint8_t need_copy)
1980 {
1981 	struct mbuf_table *tx_q;
1982 	struct rte_mbuf **m_table;
1983 	struct rte_mbuf *mbuf = NULL;
1984 	unsigned len, ret, offset = 0;
1985 	struct vpool *vpool;
1986 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1987 	struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
1988 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1989 
1990 	/*Add packet to the port tx queue*/
1991 	tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
1992 	len = tx_q->len;
1993 
1994 	/* Allocate an mbuf and populate the structure. */
1995 	vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
1996 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1997 	if (unlikely(mbuf == NULL)) {
1998 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1999 		RTE_LOG(ERR, VHOST_DATA,
2000 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
2001 			dev->device_fh);
2002 		put_desc_to_used_list_zcp(vq, desc_idx);
2003 		return;
2004 	}
2005 
2006 	if (vm2vm_mode == VM2VM_HARDWARE) {
2007 		/* Avoid using a vlan tag from any VM for an external packet, such as
2008 		 * vlan_tags[dev->device_fh], otherwise it conflicts with pool
2009 		 * selection: the MAC address marks it as an external packet that
2010 		 * should go to the network, while the vlan tag marks it as a vm2vm
2011 		 * packet that should be forwarded to another VM. The hardware cannot
2012 		 * resolve such an ambiguous situation, so the packet would be lost.
2013 		 */
2014 		vlan_tag = external_pkt_default_vlan_tag;
2015 		while (dev_ll != NULL) {
2016 			if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2017 				ether_addr_cmp(&(pkt_hdr->d_addr),
2018 				&dev_ll->dev->mac_address)) {
2019 
2020 				/*
2021 				 * Drop the packet if the TX packet is destined
2022 				 * for the TX device.
2023 				 */
2024 				if (unlikely(dev_ll->dev->device_fh
2025 					== dev->device_fh)) {
2026 					LOG_DEBUG(VHOST_DATA,
2027 					"(%"PRIu64") TX: Source and destination "
2028 					"MAC addresses are the same. Dropping "
2029 					"packet.\n",
2030 					dev_ll->dev->device_fh);
2031 					MBUF_HEADROOM_UINT32(mbuf)
2032 						= (uint32_t)desc_idx;
2033 					__rte_mbuf_raw_free(mbuf);
2034 					return;
2035 				}
2036 
2037 				/*
2038 				 * Packet length offset 4 bytes for HW vlan
2039 				 * strip when L2 switch back.
2040 				 */
2041 				offset = 4;
2042 				vlan_tag =
2043 				(uint16_t)
2044 				vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2045 
2046 				LOG_DEBUG(VHOST_DATA,
2047 				"(%"PRIu64") TX: pkt to local VM device id:"
2048 				"(%"PRIu64") vlan tag: %d.\n",
2049 				dev->device_fh, dev_ll->dev->device_fh,
2050 				vlan_tag);
2051 
2052 				break;
2053 			}
2054 			dev_ll = dev_ll->next;
2055 		}
2056 	}
2057 
2058 	mbuf->pkt.nb_segs = m->pkt.nb_segs;
2059 	mbuf->pkt.next = m->pkt.next;
2060 	mbuf->pkt.data_len = m->pkt.data_len + offset;
2061 	mbuf->pkt.pkt_len = mbuf->pkt.data_len;
2062 	if (unlikely(need_copy)) {
2063 		/* Copy the packet contents to the mbuf. */
2064 		rte_memcpy((void *)((uint8_t *)mbuf->pkt.data),
2065 			(const void *) ((uint8_t *)m->pkt.data),
2066 			m->pkt.data_len);
2067 	} else {
2068 		mbuf->pkt.data = m->pkt.data;
2069 		mbuf->buf_physaddr = m->buf_physaddr;
2070 		mbuf->buf_addr = m->buf_addr;
2071 	}
2072 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
2073 	mbuf->pkt.vlan_macip.f.vlan_tci = vlan_tag;
2074 	mbuf->pkt.vlan_macip.f.l2_len = sizeof(struct ether_hdr);
2075 	mbuf->pkt.vlan_macip.f.l3_len = sizeof(struct ipv4_hdr);
2076 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2077 
2078 	tx_q->m_table[len] = mbuf;
2079 	len++;
2080 
2081 	LOG_DEBUG(VHOST_DATA,
2082 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2083 		dev->device_fh,
2084 		mbuf->pkt.nb_segs,
2085 		(mbuf->pkt.next == NULL) ? "null" : "non-null");
2086 
2087 	if (enable_stats) {
2088 		dev_statistics[dev->device_fh].tx_total++;
2089 		dev_statistics[dev->device_fh].tx++;
2090 	}
2091 
2092 	if (unlikely(len == MAX_PKT_BURST)) {
2093 		m_table = (struct rte_mbuf **)tx_q->m_table;
2094 		ret = rte_eth_tx_burst(ports[0],
2095 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2096 
2097 		/*
2098 		 * Free any buffers not handled by TX and update
2099 		 * the port stats.
2100 		 */
2101 		if (unlikely(ret < len)) {
2102 			do {
2103 				rte_pktmbuf_free(m_table[ret]);
2104 			} while (++ret < len);
2105 		}
2106 
2107 		len = 0;
2108 		txmbuf_clean_zcp(dev, vpool);
2109 	}
2110 
2111 	tx_q->len = len;
2112 
2113 	return;
2114 }
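
/*
 * Hedged sketch of the drain pattern used above and in switch_worker_zcp()
 * below: transmit a table of mbufs on the physical port and free whatever
 * the PMD did not accept. 'port' and the table are caller-supplied; this
 * hypothetical helper is illustrative only and is never called.
 */
static inline void
drain_tx_table_example(uint8_t port, uint16_t txq,
	struct rte_mbuf **table, uint16_t n)
{
	uint16_t sent = rte_eth_tx_burst(port, txq, table, n);

	while (unlikely(sent < n))
		rte_pktmbuf_free(table[sent++]);
}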
2115 
2116 /*
2117  * This function transmits all available packets in the virtio TX queue for
2118  * one virtio-net device. If it is the first packet, it learns the MAC
2119  * address and sets up VMDQ.
2120  */
2121 static inline void __attribute__((always_inline))
2122 virtio_dev_tx_zcp(struct virtio_net *dev)
2123 {
2124 	struct rte_mbuf m;
2125 	struct vhost_virtqueue *vq;
2126 	struct vring_desc *desc;
2127 	uint64_t buff_addr = 0, phys_addr;
2128 	uint32_t head[MAX_PKT_BURST];
2129 	uint32_t i;
2130 	uint16_t free_entries, packet_success = 0;
2131 	uint16_t avail_idx;
2132 	uint8_t need_copy = 0;
2133 	hpa_type addr_type;
2134 
2135 	vq = dev->virtqueue[VIRTIO_TXQ];
2136 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2137 
2138 	/* If there are no available buffers then return. */
2139 	if (vq->last_used_idx_res == avail_idx)
2140 		return;
2141 
2142 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
2143 
2144 	/* Prefetch available ring to retrieve head indexes. */
2145 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2146 
2147 	/* Get the number of free entries in the ring */
2148 	free_entries = (avail_idx - vq->last_used_idx_res);
2149 
2150 	/* Limit to MAX_PKT_BURST. */
2151 	free_entries
2152 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2153 
2154 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2155 		dev->device_fh, free_entries);
2156 
2157 	/* Retrieve all of the head indexes first to avoid caching issues. */
2158 	for (i = 0; i < free_entries; i++)
2159 		head[i]
2160 			= vq->avail->ring[(vq->last_used_idx_res + i)
2161 			& (vq->size - 1)];
2162 
2163 	vq->last_used_idx_res += free_entries;
2164 
2165 	/* Prefetch descriptor index. */
2166 	rte_prefetch0(&vq->desc[head[packet_success]]);
2167 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2168 
2169 	while (packet_success < free_entries) {
2170 		desc = &vq->desc[head[packet_success]];
2171 
2172 		/* Discard first buffer as it is the virtio header */
2173 		desc = &vq->desc[desc->next];
2174 
2175 		/* Buffer address translation. */
2176 		buff_addr = gpa_to_vva(dev, desc->addr);
2177 		phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2178 
2179 		if (likely(packet_success < (free_entries - 1)))
2180 			/* Prefetch descriptor index. */
2181 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2182 
2183 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2184 			RTE_LOG(ERR, VHOST_DATA,
2185 				"(%"PRIu64") Invalid frame buffer address found "
2186 				"when transmitting packets!\n",
2187 				dev->device_fh);
2188 			packet_success++;
2189 			continue;
2190 		}
2191 
2192 		/* Prefetch buffer address. */
2193 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2194 
2195 		/*
2196 		 * Setup dummy mbuf. This is copied to a real mbuf if
2197 		 * transmitted out the physical port.
2198 		 */
2199 		m.pkt.data_len = desc->len;
2200 		m.pkt.nb_segs = 1;
2201 		m.pkt.next = NULL;
2202 		m.pkt.data = (void *)(uintptr_t)buff_addr;
2203 		m.buf_addr = m.pkt.data;
2204 		m.buf_physaddr = phys_addr;
2205 
2206 		/*
2207 		 * Check if the frame buffer address from guest crosses
2208 		 * sub-region or not.
2209 		 */
2210 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2211 			RTE_LOG(ERR, VHOST_DATA,
2212 				"(%"PRIu64") Frame buffer address crossing "
2213 				"sub-region found when attaching TX frame "
2214 				"buffer address!\n",
2215 				dev->device_fh);
2216 			need_copy = 1;
2217 		} else
2218 			need_copy = 0;
2219 
2220 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2221 
2222 		/*
2223 		 * If this is the first received packet we need to learn
2224 		 * the MAC and setup VMDQ
2225 		 */
2226 		if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2227 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2228 				/*
2229 				 * Discard frame if device is scheduled for
2230 				 * removal or a duplicate MAC address is found.
2231 				 */
2232 				packet_success += free_entries;
2233 				vq->last_used_idx += packet_success;
2234 				break;
2235 			}
2236 		}
2237 
2238 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2239 		packet_success++;
2240 	}
2241 }
2242 
2243 /*
2244  * This function is called by each data core. It handles all RX/TX registered
2245  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2246  * addresses are compared with all devices in the main linked list.
2247  */
2248 static int
2249 switch_worker_zcp(__attribute__((unused)) void *arg)
2250 {
2251 	struct virtio_net *dev = NULL;
2252 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2253 	struct virtio_net_data_ll *dev_ll;
2254 	struct mbuf_table *tx_q;
2255 	volatile struct lcore_ll_info *lcore_ll;
2256 	const uint64_t drain_tsc
2257 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2258 		* BURST_TX_DRAIN_US;
2259 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2260 	unsigned ret;
2261 	const uint16_t lcore_id = rte_lcore_id();
2262 	uint16_t count_in_ring, rx_count = 0;
2263 
2264 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2265 
2266 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2267 	prev_tsc = 0;
2268 
2269 	while (1) {
2270 		cur_tsc = rte_rdtsc();
2271 
2272 		/* TX burst queue drain */
2273 		diff_tsc = cur_tsc - prev_tsc;
2274 		if (unlikely(diff_tsc > drain_tsc)) {
2275 			/*
2276 			 * Get mbufs from vpool.pool, detach them and put
2277 			 * them back into vpool.ring.
2278 			 */
2279 			dev_ll = lcore_ll->ll_root_used;
2280 			while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2281 				/* Get virtio device ID */
2282 				dev = dev_ll->dev;
2283 
2284 				if (likely(!dev->remove)) {
2285 					tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2286 					if (tx_q->len) {
2287 						LOG_DEBUG(VHOST_DATA,
2288 						"TX queue drained after timeout"
2289 						" with burst size %u\n",
2290 						tx_q->len);
2291 
2292 						/*
2293 						 * Tx any packets in the queue
2294 						 */
2295 						ret = rte_eth_tx_burst(
2296 							ports[0],
2297 							(uint16_t)tx_q->txq_id,
2298 							(struct rte_mbuf **)
2299 							tx_q->m_table,
2300 							(uint16_t)tx_q->len);
2301 						if (unlikely(ret < tx_q->len)) {
2302 							do {
2303 								rte_pktmbuf_free(
2304 									tx_q->m_table[ret]);
2305 							} while (++ret < tx_q->len);
2306 						}
2307 						tx_q->len = 0;
2308 
2309 						txmbuf_clean_zcp(dev,
2310 							&vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2311 					}
2312 				}
2313 				dev_ll = dev_ll->next;
2314 			}
2315 			prev_tsc = cur_tsc;
2316 		}
2317 
2318 		rte_prefetch0(lcore_ll->ll_root_used);
2319 
2320 		/*
2321 		 * Inform the configuration core that we have exited the linked
2322 		 * list and that no devices are in use if requested.
2323 		 */
2324 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2325 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2326 
2327 		/* Process devices */
2328 		dev_ll = lcore_ll->ll_root_used;
2329 
2330 		while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2331 			dev = dev_ll->dev;
2332 			if (unlikely(dev->remove)) {
2333 				dev_ll = dev_ll->next;
2334 				unlink_vmdq(dev);
2335 				dev->ready = DEVICE_SAFE_REMOVE;
2336 				continue;
2337 			}
2338 
2339 			if (likely(dev->ready == DEVICE_RX)) {
2340 				uint32_t index = dev->vmdq_rx_q;
2341 				uint16_t i;
2342 				count_in_ring
2343 				= rte_ring_count(vpool_array[index].ring);
2344 				uint16_t free_entries
2345 				= (uint16_t)get_available_ring_num_zcp(dev);
2346 
2347 				/*
2348 				 * Attach all mbufs in vpool.ring and put back
2349 				 * into vpool.pool.
2350 				 */
2351 				for (i = 0;
2352 				i < RTE_MIN(free_entries,
2353 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2354 				i++)
2355 					attach_rxmbuf_zcp(dev);
2356 
2357 				/* Handle guest RX */
2358 				rx_count = rte_eth_rx_burst(ports[0],
2359 					(uint16_t)dev->vmdq_rx_q, pkts_burst,
2360 					MAX_PKT_BURST);
2361 
2362 				if (rx_count) {
2363 					ret_count = virtio_dev_rx_zcp(dev,
2364 							pkts_burst, rx_count);
2365 					if (enable_stats) {
2366 						dev_statistics[dev->device_fh].rx_total
2367 							+= rx_count;
2368 						dev_statistics[dev->device_fh].rx
2369 							+= ret_count;
2370 					}
2371 					while (likely(rx_count)) {
2372 						rx_count--;
2373 						pktmbuf_detach_zcp(
2374 							pkts_burst[rx_count]);
2375 						rte_ring_sp_enqueue(
2376 							vpool_array[index].ring,
2377 							(void *)pkts_burst[rx_count]);
2378 					}
2379 				}
2380 			}
2381 
2382 			if (likely(!dev->remove))
2383 				/* Handle guest TX */
2384 				virtio_dev_tx_zcp(dev);
2385 
2386 			/* Move to the next device in the list */
2387 			dev_ll = dev_ll->next;
2388 		}
2389 	}
2390 
2391 	return 0;
2392 }
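
/*
 * Hedged sketch of the timeout arithmetic used for drain_tsc above: convert
 * a period in microseconds into TSC cycles, rounding the per-microsecond
 * cycle count up. This hypothetical helper is illustrative only and is
 * never called.
 */
static inline uint64_t
us_to_tsc_cycles_example(uint64_t us)
{
	return (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * us;
}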
2393 
2394 
2395 /*
2396  * Add an entry to a used linked list. A free entry must first be found
2397  * in the free linked list using get_data_ll_free_entry();
2398  */
2399 static void
2400 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2401 	struct virtio_net_data_ll *ll_dev)
2402 {
2403 	struct virtio_net_data_ll *ll = *ll_root_addr;
2404 
2405 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2406 	ll_dev->next = NULL;
2407 	rte_compiler_barrier();
2408 
2409 	/* If ll == NULL then this is the first device. */
2410 	if (ll) {
2411 		/* Increment to the tail of the linked list. */
2412 		while (ll->next != NULL)
2413 			ll = ll->next;
2414 
2415 		ll->next = ll_dev;
2416 	} else {
2417 		*ll_root_addr = ll_dev;
2418 	}
2419 }
2420 
2421 /*
2422  * Remove an entry from a used linked list. The entry must then be added to
2423  * the free linked list using put_data_ll_free_entry().
2424  */
2425 static void
2426 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2427 	struct virtio_net_data_ll *ll_dev,
2428 	struct virtio_net_data_ll *ll_dev_last)
2429 {
2430 	struct virtio_net_data_ll *ll = *ll_root_addr;
2431 
2432 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2433 		return;
2434 
2435 	if (ll_dev == ll)
2436 		*ll_root_addr = ll_dev->next;
2437 	else
2438 		if (likely(ll_dev_last != NULL))
2439 			ll_dev_last->next = ll_dev->next;
2440 		else
2441 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2442 }
2443 
2444 /*
2445  * Find and return an entry from the free linked list.
2446  */
2447 static struct virtio_net_data_ll *
2448 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2449 {
2450 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2451 	struct virtio_net_data_ll *ll_dev;
2452 
2453 	if (ll_free == NULL)
2454 		return NULL;
2455 
2456 	ll_dev = ll_free;
2457 	*ll_root_addr = ll_free->next;
2458 
2459 	return ll_dev;
2460 }
2461 
2462 /*
2463  * Place an entry back on to the free linked list.
2464  */
2465 static void
2466 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2467 	struct virtio_net_data_ll *ll_dev)
2468 {
2469 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2470 
2471 	if (ll_dev == NULL)
2472 		return;
2473 
2474 	ll_dev->next = ll_free;
2475 	*ll_root_addr = ll_dev;
2476 }
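
/*
 * Hedged usage sketch of the free/used list pairing provided by the helpers
 * above; it mirrors what new_device() and destroy_device() below do.
 * 'some_dev' is a hypothetical device pointer and this helper is never
 * called by the example.
 */
static inline void
ll_lifecycle_example(struct virtio_net *some_dev)
{
	struct virtio_net_data_ll *entry = get_data_ll_free_entry(&ll_root_free);

	if (entry == NULL)
		return;
	entry->dev = some_dev;
	add_data_ll_entry(&ll_root_used, entry);
	/*
	 * ... later, remove it again; NULL is valid as the predecessor only
	 * while 'entry' is at the head of the used list.
	 */
	rm_data_ll_entry(&ll_root_used, entry, NULL);
	put_data_ll_free_entry(&ll_root_free, entry);
}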
2477 
2478 /*
2479  * Creates a linked list of a given size.
2480  */
2481 static struct virtio_net_data_ll *
2482 alloc_data_ll(uint32_t size)
2483 {
2484 	struct virtio_net_data_ll *ll_new;
2485 	uint32_t i;
2486 
2487 	/* Malloc and then chain the linked list. */
2488 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2489 	if (ll_new == NULL) {
2490 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2491 		return NULL;
2492 	}
2493 
2494 	for (i = 0; i < size - 1; i++) {
2495 		ll_new[i].dev = NULL;
2496 		ll_new[i].next = &ll_new[i+1];
2497 	}
2498 	ll_new[i].next = NULL;
2499 
2500 	return (ll_new);
2501 }
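
/*
 * Hedged arithmetic sketch for the per-core sizing used by init_data_ll()
 * below: every data core receives ceil(num_devices / num_switching_cores)
 * linked-list entries. This hypothetical helper is illustrative only and
 * is never called.
 */
static inline uint32_t
ll_entries_per_core_example(uint32_t devices, uint32_t cores)
{
	return (devices + cores - 1) / cores;
}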
2502 
2503 /*
2504  * Create the main linked list along with each individual core's linked list. A used and a free list
2505  * are created to manage entries.
2506  */
2507 static int
2508 init_data_ll (void)
2509 {
2510 	int lcore;
2511 
2512 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2513 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2514 		if (lcore_info[lcore].lcore_ll == NULL) {
2515 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2516 			return -1;
2517 		}
2518 
2519 		lcore_info[lcore].lcore_ll->device_num = 0;
2520 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2521 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2522 		if (num_devices % num_switching_cores)
2523 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2524 		else
2525 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2526 	}
2527 
2528 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2529 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2530 
2531 	return 0;
2532 }
2533 
2534 /*
2535  * Set virtqueue flags so that we do not receive interrupts.
2536  */
2537 static void
2538 set_irq_status (struct virtio_net *dev)
2539 {
2540 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2541 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2542 }
2543 
2544 /*
2545  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2546  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2547  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2548  */
2549 static void
2550 destroy_device (volatile struct virtio_net *dev)
2551 {
2552 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2553 	struct virtio_net_data_ll *ll_main_dev_cur;
2554 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2555 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2556 	int lcore;
2557 
2558 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2559 
2560 	/*set the remove flag. */
2561 	dev->remove = 1;
2562 
2563 	while(dev->ready != DEVICE_SAFE_REMOVE) {
2564 		rte_pause();
2565 	}
2566 
2567 	/* Search for entry to be removed from lcore ll */
2568 	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
2569 	while (ll_lcore_dev_cur != NULL) {
2570 		if (ll_lcore_dev_cur->dev == dev) {
2571 			break;
2572 		} else {
2573 			ll_lcore_dev_last = ll_lcore_dev_cur;
2574 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2575 		}
2576 	}
2577 
2578 	if (ll_lcore_dev_cur == NULL) {
2579 		RTE_LOG(ERR, VHOST_CONFIG,
2580 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2581 			dev->device_fh);
2582 		return;
2583 	}
2584 
2585 	/* Search for entry to be removed from main ll */
2586 	ll_main_dev_cur = ll_root_used;
2587 	ll_main_dev_last = NULL;
2588 	while (ll_main_dev_cur != NULL) {
2589 		if (ll_main_dev_cur->dev == dev) {
2590 			break;
2591 		} else {
2592 			ll_main_dev_last = ll_main_dev_cur;
2593 			ll_main_dev_cur = ll_main_dev_cur->next;
2594 		}
2595 	}
2596 
2597 	/* Remove entries from the lcore and main ll. */
2598 	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2599 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2600 
2601 	/* Set the dev_removal_flag on each lcore. */
2602 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2603 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2604 	}
2605 
2606 	/*
2607 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2608 	 * they can no longer access the device removed from the linked lists and that the devices
2609 	 * are no longer in use.
2610 	 */
2611 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2612 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2613 			rte_pause();
2614 		}
2615 	}
2616 
2617 	/* Add the entries back to the lcore and main free ll.*/
2618 	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2619 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2620 
2621 	/* Decrement number of device on the lcore. */
2622 	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
2623 
2624 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2625 
2626 	if (zero_copy) {
2627 		struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2628 
2629 		/* Stop the RX queue. */
2630 		if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2631 			LOG_DEBUG(VHOST_CONFIG,
2632 				"(%"PRIu64") In destroy_device: Failed to stop "
2633 				"rx queue:%d\n",
2634 				dev->device_fh,
2635 				dev->vmdq_rx_q);
2636 		}
2637 
2638 		LOG_DEBUG(VHOST_CONFIG,
2639 			"(%"PRIu64") in destroy_device: Start putting mbufs from "
2640 			"mempool back into ring for RX queue: %d\n",
2641 			dev->device_fh, dev->vmdq_rx_q);
2642 
2643 		mbuf_destroy_zcp(vpool);
2644 
2645 		/* Stop the TX queue. */
2646 		if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2647 			LOG_DEBUG(VHOST_CONFIG,
2648 				"(%"PRIu64") In destroy_device: Failed to "
2649 				"stop tx queue:%d\n",
2650 				dev->device_fh, dev->vmdq_rx_q);
2651 		}
2652 
2653 		vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
2654 
2655 		LOG_DEBUG(VHOST_CONFIG,
2656 			"(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2657 			"back into ring for TX queue: %d, dev:(%"PRIu64")\n",
2658 			dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
2659 			dev->device_fh);
2660 
2661 		mbuf_destroy_zcp(vpool);
2662 	}
2663 
2664 }
2665 
2666 /*
2667  * A new device is added to a data core. First the device is added to the main linked list
2668  * and then allocated to a specific data core.
2669  */
2670 static int
2671 new_device (struct virtio_net *dev)
2672 {
2673 	struct virtio_net_data_ll *ll_dev;
2674 	int lcore, core_add = 0;
2675 	uint32_t device_num_min = num_devices;
2676 
2677 	/* Add device to main ll */
2678 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2679 	if (ll_dev == NULL) {
2680 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2681 			"of %d devices per core has been reached\n",
2682 			dev->device_fh, num_devices);
2683 		return -1;
2684 	}
2685 	ll_dev->dev = dev;
2686 	add_data_ll_entry(&ll_root_used, ll_dev);
2687 	ll_dev->dev->vmdq_rx_q
2688 		= ll_dev->dev->device_fh * (num_queues / num_devices);
2689 
2690 	if (zero_copy) {
2691 		uint32_t index = ll_dev->dev->vmdq_rx_q;
2692 		uint32_t count_in_ring, i;
2693 		struct mbuf_table *tx_q;
2694 
2695 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2696 
2697 		LOG_DEBUG(VHOST_CONFIG,
2698 			"(%"PRIu64") in new_device: mbuf count in mempool "
2699 			"before attach is: %d\n",
2700 			dev->device_fh,
2701 			rte_mempool_count(vpool_array[index].pool));
2702 		LOG_DEBUG(VHOST_CONFIG,
2703 			"(%"PRIu64") in new_device: mbuf count in  ring "
2704 			"before attach  is : %d\n",
2705 			dev->device_fh, count_in_ring);
2706 
2707 		/*
2708 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2709 		 */
2710 		for (i = 0; i < count_in_ring; i++)
2711 			attach_rxmbuf_zcp(dev);
2712 
2713 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2714 			"mempool after attach is: %d\n",
2715 			dev->device_fh,
2716 			rte_mempool_count(vpool_array[index].pool));
2717 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2718 			"ring after attach  is : %d\n",
2719 			dev->device_fh,
2720 			rte_ring_count(vpool_array[index].ring));
2721 
2722 		tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2723 		tx_q->txq_id = dev->vmdq_rx_q;
2724 
2725 		if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2726 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2727 
2728 			LOG_DEBUG(VHOST_CONFIG,
2729 				"(%"PRIu64") In new_device: Failed to start "
2730 				"tx queue:%d\n",
2731 				dev->device_fh, dev->vmdq_rx_q);
2732 
2733 			mbuf_destroy_zcp(vpool);
2734 			return -1;
2735 		}
2736 
2737 		if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2738 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2739 
2740 			LOG_DEBUG(VHOST_CONFIG,
2741 				"(%"PRIu64") In new_device: Failed to start "
2742 				"rx queue:%d\n",
2743 				dev->device_fh, dev->vmdq_rx_q);
2744 
2745 			/* Stop the TX queue. */
2746 			if (rte_eth_dev_tx_queue_stop(ports[0],
2747 				dev->vmdq_rx_q) != 0) {
2748 				LOG_DEBUG(VHOST_CONFIG,
2749 					"(%"PRIu64") In new_device: Failed to "
2750 					"stop tx queue:%d\n",
2751 					dev->device_fh, dev->vmdq_rx_q);
2752 			}
2753 
2754 			mbuf_destroy_zcp(vpool);
2755 			return -1;
2756 		}
2757 
2758 	}
2759 
2760 	/*reset ready flag*/
2761 	dev->ready = DEVICE_MAC_LEARNING;
2762 	dev->remove = 0;
2763 
2764 	/* Find a suitable lcore to add the device. */
2765 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2766 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2767 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2768 			core_add = lcore;
2769 		}
2770 	}
2771 	/* Add device to lcore ll */
2772 	ll_dev->dev->coreid = core_add;
2773 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2774 	if (ll_dev == NULL) {
2775 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2776 		dev->ready = DEVICE_SAFE_REMOVE;
2777 		destroy_device(dev);
2778 		return -1;
2779 	}
2780 	ll_dev->dev = dev;
2781 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2782 
2783 	/* Initialize device stats */
2784 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2785 
2786 	/* Disable notifications. */
2787 	set_irq_status(dev);
2788 	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
2789 	dev->flags |= VIRTIO_DEV_RUNNING;
2790 
2791 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
2792 
2793 	return 0;
2794 }
2795 
2796 /*
2797  * These callbacks allow devices to be added to the data core when
2798  * configuration has been fully completed.
2799  */
2800 static const struct virtio_net_device_ops virtio_net_device_ops =
2801 {
2802 	.new_device =  new_device,
2803 	.destroy_device = destroy_device,
2804 };
2805 
2806 /*
2807  * This is a thread that will wake up periodically to print stats if the user has
2808  * enabled them.
2809  */
2810 static void
2811 print_stats(void)
2812 {
2813 	struct virtio_net_data_ll *dev_ll;
2814 	uint64_t tx_dropped, rx_dropped;
2815 	uint64_t tx, tx_total, rx, rx_total;
2816 	uint32_t device_fh;
2817 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2818 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2819 
2820 	while(1) {
2821 		sleep(enable_stats);
2822 
2823 		/* Clear screen and move to top left */
2824 		printf("%s%s", clr, top_left);
2825 
2826 		printf("\nDevice statistics ====================================");
2827 
2828 		dev_ll = ll_root_used;
2829 		while (dev_ll != NULL) {
2830 			device_fh = (uint32_t)dev_ll->dev->device_fh;
2831 			tx_total = dev_statistics[device_fh].tx_total;
2832 			tx = dev_statistics[device_fh].tx;
2833 			tx_dropped = tx_total - tx;
2834 			if (zero_copy == 0) {
2835 				rx_total = rte_atomic64_read(
2836 					&dev_statistics[device_fh].rx_total_atomic);
2837 				rx = rte_atomic64_read(
2838 					&dev_statistics[device_fh].rx_atomic);
2839 			} else {
2840 				rx_total = dev_statistics[device_fh].rx_total;
2841 				rx = dev_statistics[device_fh].rx;
2842 			}
2843 			rx_dropped = rx_total - rx;
2844 
2845 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2846 					"\nTX total: 		%"PRIu64""
2847 					"\nTX dropped: 		%"PRIu64""
2848 					"\nTX successful: 		%"PRIu64""
2849 					"\nRX total: 		%"PRIu64""
2850 					"\nRX dropped: 		%"PRIu64""
2851 					"\nRX successful: 		%"PRIu64"",
2852 					device_fh,
2853 					tx_total,
2854 					tx_dropped,
2855 					tx,
2856 					rx_total,
2857 					rx_dropped,
2858 					rx);
2859 
2860 			dev_ll = dev_ll->next;
2861 		}
2862 		printf("\n======================================================\n");
2863 	}
2864 }
2865 
2866 static void
2867 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2868 	char *ring_name, uint32_t nb_mbuf)
2869 {
2870 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2871 	vpool_array[index].pool
2872 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2873 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2874 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2875 		rte_pktmbuf_init, NULL, socket, 0);
2876 	if (vpool_array[index].pool != NULL) {
2877 		vpool_array[index].ring
2878 			= rte_ring_create(ring_name,
2879 				rte_align32pow2(nb_mbuf + 1),
2880 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2881 		if (likely(vpool_array[index].ring != NULL)) {
2882 			LOG_DEBUG(VHOST_CONFIG,
2883 				"in setup_mempool_tbl: mbuf count in "
2884 				"mempool is: %d\n",
2885 				rte_mempool_count(vpool_array[index].pool));
2886 			LOG_DEBUG(VHOST_CONFIG,
2887 				"in setup_mempool_tbl: mbuf count in "
2888 				"ring   is: %d\n",
2889 				rte_ring_count(vpool_array[index].ring));
2890 		} else {
2891 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2892 				ring_name);
2893 		}
2894 
2895 		/* Need consider head room. */
2896 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2897 	} else {
2898 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2899 	}
2900 }
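
/*
 * Hedged sketch of the pool-to-ring prefill that MAIN() performs for the
 * zero copy path: move every free mbuf from vpool.pool into vpool.ring so
 * that attach_rxmbuf_zcp() can dequeue them later. This hypothetical helper
 * is illustrative only and is never called.
 */
static inline void
prefill_vpool_ring_example(struct vpool *vpool)
{
	uint32_t i;
	const uint32_t count = rte_mempool_count(vpool->pool);

	for (i = 0; i < count; i++) {
		struct rte_mbuf *mbuf = __rte_mbuf_raw_alloc(vpool->pool);

		if (unlikely(mbuf == NULL))
			break;
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
	}
}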
2901 
2902 
2903 /*
2904  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2905  * device is also registered here to handle the IOCTLs.
2906  */
2907 int
2908 MAIN(int argc, char *argv[])
2909 {
2910 	struct rte_mempool *mbuf_pool = NULL;
2911 	unsigned lcore_id, core_id = 0;
2912 	unsigned nb_ports, valid_num_ports;
2913 	int ret;
2914 	uint8_t portid, queue_id = 0;
2915 	static pthread_t tid;
2916 
2917 	/* init EAL */
2918 	ret = rte_eal_init(argc, argv);
2919 	if (ret < 0)
2920 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2921 	argc -= ret;
2922 	argv += ret;
2923 
2924 	/* parse app arguments */
2925 	ret = us_vhost_parse_args(argc, argv);
2926 	if (ret < 0)
2927 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2928 
2929 	if (rte_eal_pci_probe() != 0)
2930 		rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n");
2931 
2932 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2933 		if (rte_lcore_is_enabled(lcore_id))
2934 			lcore_ids[core_id ++] = lcore_id;
2935 
2936 	if (rte_lcore_count() > RTE_MAX_LCORE)
2937 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2938 
2939 	/* Set the number of switching cores available. */
2940 	num_switching_cores = rte_lcore_count()-1;
2941 
2942 	/* Get the number of physical ports. */
2943 	nb_ports = rte_eth_dev_count();
2944 	if (nb_ports > RTE_MAX_ETHPORTS)
2945 		nb_ports = RTE_MAX_ETHPORTS;
2946 
2947 	/*
2948 	 * Update the global var NUM_PORTS and global array PORTS
2949 	 * and get the value of VALID_NUM_PORTS according to the system port number.
2950 	 */
2951 	valid_num_ports = check_ports_num(nb_ports);
2952 
2953 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2954 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2955 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2956 		return -1;
2957 	}
2958 
2959 	if (zero_copy == 0) {
2960 		/* Create the mbuf pool. */
2961 		mbuf_pool = rte_mempool_create(
2962 				"MBUF_POOL",
2963 				NUM_MBUFS_PER_PORT
2964 				* valid_num_ports,
2965 				MBUF_SIZE, MBUF_CACHE_SIZE,
2966 				sizeof(struct rte_pktmbuf_pool_private),
2967 				rte_pktmbuf_pool_init, NULL,
2968 				rte_pktmbuf_init, NULL,
2969 				rte_socket_id(), 0);
2970 		if (mbuf_pool == NULL)
2971 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2972 
2973 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2974 			vpool_array[queue_id].pool = mbuf_pool;
2975 
2976 		if (vm2vm_mode == VM2VM_HARDWARE) {
2977 			/* Enable VT loop back to let L2 switch to do it. */
2978 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2979 			LOG_DEBUG(VHOST_CONFIG,
2980 				"Enable loop back for L2 switch in vmdq.\n");
2981 		}
2982 	} else {
2983 		uint32_t nb_mbuf;
2984 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2985 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2986 
2987 		rx_conf_default.start_rx_per_q = (uint8_t)zero_copy;
2988 		rx_conf_default.rx_drop_en = 0;
2989 		tx_conf_default.start_tx_per_q = (uint8_t)zero_copy;
2990 		nb_mbuf = num_rx_descriptor
2991 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2992 			+ num_switching_cores * MAX_PKT_BURST;
2993 
2994 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2995 			snprintf(pool_name, sizeof(pool_name),
2996 				"rxmbuf_pool_%u", queue_id);
2997 			snprintf(ring_name, sizeof(ring_name),
2998 				"rxmbuf_ring_%u", queue_id);
2999 			setup_mempool_tbl(rte_socket_id(), queue_id,
3000 				pool_name, ring_name, nb_mbuf);
3001 		}
3002 
3003 		nb_mbuf = num_tx_descriptor
3004 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3005 				+ num_switching_cores * MAX_PKT_BURST;
3006 
3007 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3008 			snprintf(pool_name, sizeof(pool_name),
3009 				"txmbuf_pool_%u", queue_id);
3010 			snprintf(ring_name, sizeof(ring_name),
3011 				"txmbuf_ring_%u", queue_id);
3012 			setup_mempool_tbl(rte_socket_id(),
3013 				(queue_id + MAX_QUEUES),
3014 				pool_name, ring_name, nb_mbuf);
3015 		}
3016 
3017 		if (vm2vm_mode == VM2VM_HARDWARE) {
3018 			/* Enable VT loop back to let L2 switch to do it. */
3019 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3020 			LOG_DEBUG(VHOST_CONFIG,
3021 				"Enable loop back for L2 switch in vmdq.\n");
3022 		}
3023 	}
3024 	/* Set log level. */
3025 	rte_set_log_level(LOG_LEVEL);
3026 
3027 	/* initialize all ports */
3028 	for (portid = 0; portid < nb_ports; portid++) {
3029 		/* skip ports that are not enabled */
3030 		if ((enabled_port_mask & (1 << portid)) == 0) {
3031 			RTE_LOG(INFO, VHOST_PORT,
3032 				"Skipping disabled port %d\n", portid);
3033 			continue;
3034 		}
3035 		if (port_init(portid) != 0)
3036 			rte_exit(EXIT_FAILURE,
3037 				"Cannot initialize network ports\n");
3038 	}
3039 
3040 	/* Initialise all linked lists. */
3041 	if (init_data_ll() == -1)
3042 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3043 
3044 	/* Initialize device stats */
3045 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3046 
3047 	/* Enable stats if the user option is set. */
3048 	if (enable_stats)
3049 		pthread_create(&tid, NULL, (void *)print_stats, NULL);
3050 
3051 	/* Launch all data cores. */
3052 	if (zero_copy == 0) {
3053 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3054 			rte_eal_remote_launch(switch_worker,
3055 				mbuf_pool, lcore_id);
3056 		}
3057 	} else {
3058 		uint32_t count_in_mempool, index, i;
3059 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3060 			/* For all RX and TX queues. */
3061 			count_in_mempool
3062 				= rte_mempool_count(vpool_array[index].pool);
3063 
3064 			/*
3065 			 * Transfer all un-attached mbufs from vpool.pool
3066 			 * to vpool.ring.
3067 			 */
3068 			for (i = 0; i < count_in_mempool; i++) {
3069 				struct rte_mbuf *mbuf
3070 					= __rte_mbuf_raw_alloc(
3071 						vpool_array[index].pool);
3072 				rte_ring_sp_enqueue(vpool_array[index].ring,
3073 						(void *)mbuf);
3074 			}
3075 
3076 			LOG_DEBUG(VHOST_CONFIG,
3077 				"in MAIN: mbuf count in mempool at initial "
3078 				"is: %d\n", count_in_mempool);
3079 			LOG_DEBUG(VHOST_CONFIG,
3080 				"in MAIN: mbuf count in  ring at initial  is :"
3081 				" %d\n",
3082 				rte_ring_count(vpool_array[index].ring));
3083 		}
3084 
3085 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3086 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3087 				lcore_id);
3088 	}
3089 
3090 	/* Register CUSE device to handle IOCTLs. */
3091 	ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
3092 	if (ret != 0)
3093 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3094 
3095 	init_virtio_net(&virtio_net_device_ops);
3096 
3097 	/* Start CUSE session. */
3098 	start_cuse_session_loop();
3099 	return 0;
3100 
3101 }
3102 
3103