xref: /dpdk/examples/vhost/main.c (revision 5cf2714469fcbc619d15c61d6677d1766611ccc8)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 128
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
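/*
 * Worked example: assuming num_switching_cores == 4, NUM_MBUFS_PER_PORT
 * evaluates to (128 * 1024) + (4 * 32) + (4 * 512) + (4 * 128) = 133760
 * mbufs, i.e. enough for full RX rings plus per-core burst, TX-ring and
 * mempool-cache headroom.
 */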
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91 
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100 
101 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
102 #define MAX_MRG_PKT_BURST 16 	/* Max burst for merge buffers. Set to 1 due to performance issue. */
103 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
104 
105 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
106 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
107 
108 #define JUMBO_FRAME_MAX_SIZE    0x2600
109 
110 /* State of virtio device. */
111 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_RX			1
113 #define DEVICE_SAFE_REMOVE	2
114 
115 /* Config_core_flag status definitions. */
116 #define REQUEST_DEV_REMOVAL 1
117 #define ACK_DEV_REMOVAL 0
118 
119 /* Configurable number of RX/TX ring descriptors */
120 #define RTE_TEST_RX_DESC_DEFAULT 1024
121 #define RTE_TEST_TX_DESC_DEFAULT 512
122 
123 /*
124  * These two macros need refining for the legacy and DPDK-based front ends:
125  * take the max vring avail descriptors/entries from the guest minus
126  * MAX_PKT_BURST, then adjust to a power of 2.
127  */
128 /*
129  * For the legacy front end there are 128 descriptors:
130  * half for the virtio headers, the other half for the mbufs.
131  */
132 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
133 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
134 
135 /* Get first 4 bytes in mbuf headroom. */
136 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
137 		+ sizeof(struct rte_mbuf)))
138 
139 /* true if x is a power of 2 */
140 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
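/*
 * For example, POWEROF2(64) is true (63 & 64 == 0) while POWEROF2(48) is
 * false (47 & 48 == 32). Note that POWEROF2(0) also evaluates to true, so
 * zero must be rejected separately where it is not a valid value.
 */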
141 
142 #define INVALID_PORT_ID 0xFF
143 
144 /* Max number of devices. Limited by vmdq. */
145 #define MAX_DEVICES 64
146 
147 /* Size of buffers used for snprintfs. */
148 #define MAX_PRINT_BUFF 6072
149 
150 /* Maximum character device basename size. */
151 #define MAX_BASENAME_SZ 10
152 
153 /* Maximum long option length for option parsing. */
154 #define MAX_LONG_OPT_SZ 64
155 
156 /* Used to compare MAC addresses. */
157 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
158 
159 /* Number of descriptors per cacheline. */
160 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
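/*
 * With a typical 64-byte cache line and the 16-byte struct vring_desc
 * (u64 addr, u32 len, u16 flags, u16 next) this works out to 4 descriptors
 * per cache line.
 */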
161 
162 /* mask of enabled ports */
163 static uint32_t enabled_port_mask = 0;
164 
165 /*Number of switching cores enabled*/
166 static uint32_t num_switching_cores = 0;
167 
168 /* number of devices/queues to support*/
169 static uint32_t num_queues = 0;
170 uint32_t num_devices = 0;
171 
172 /*
173  * Enable zero copy: packet buffers are DMA'd directly to/from the guest
174  * frame buffers via the HW descriptors. Disabled by default.
175  */
176 static uint32_t zero_copy;
177 
178 /* number of descriptors to apply*/
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181 
182 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
183 #define MAX_RING_DESC 4096
184 
185 struct vpool {
186 	struct rte_mempool *pool;
187 	struct rte_ring *ring;
188 	uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190 
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193 	VM2VM_DISABLED = 0,
194 	VM2VM_SOFTWARE = 1,
195 	VM2VM_HARDWARE = 2,
196 	VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199 
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202 	PHYS_ADDR_CONTINUOUS = 0,
203 	PHYS_ADDR_CROSS_SUBREG = 1,
204 	PHYS_ADDR_INVALID = 2,
205 	PHYS_ADDR_LAST
206 } hpa_type;
207 
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216 
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219 
220 
221 /* This can be set by the user so it is made available here. */
222 extern uint64_t VHOST_FEATURES;
223 
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
226 	.rx_thresh = {
227 		.pthresh = RX_PTHRESH,
228 		.hthresh = RX_HTHRESH,
229 		.wthresh = RX_WTHRESH,
230 	},
231 	.rx_drop_en = 1,
232 };
233 
234 /*
235  * These default values are optimized for use with the Intel(R) 82599 10 GbE
236  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237  * network controllers and/or network drivers.
238  */
239 static struct rte_eth_txconf tx_conf_default = {
240 	.tx_thresh = {
241 		.pthresh = TX_PTHRESH,
242 		.hthresh = TX_HTHRESH,
243 		.wthresh = TX_WTHRESH,
244 	},
245 	.tx_free_thresh = 0, /* Use PMD default values */
246 	.tx_rs_thresh = 0, /* Use PMD default values */
247 };
248 
249 /* Empty VMDQ configuration structure. Filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
251 	.rxmode = {
252 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
253 		.split_hdr_size = 0,
254 		.header_split   = 0, /**< Header Split disabled */
255 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
256 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
257 		/*
258 		 * This is necessary for 1G NICs such as the I350;
259 		 * it fixes a bug where IPv4 forwarding in the guest could not
260 		 * forward packets from one virtio device to another.
261 		 */
262 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
263 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
264 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
265 	},
266 
267 	.txmode = {
268 		.mq_mode = ETH_MQ_TX_NONE,
269 	},
270 	.rx_adv_conf = {
271 		/*
272 		 * should be overridden separately in code with
273 		 * appropriate values
274 		 */
275 		.vmdq_rx_conf = {
276 			.nb_queue_pools = ETH_8_POOLS,
277 			.enable_default_pool = 0,
278 			.default_pool = 0,
279 			.nb_pool_maps = 0,
280 			.pool_map = {{0, 0},},
281 		},
282 	},
283 };
284 
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288 
289 static const uint16_t external_pkt_default_vlan_tag = 2000;
290 const uint16_t vlan_tags[] = {
291 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
292 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
293 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
294 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
295 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
296 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
297 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
298 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
299 };
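/*
 * The table above holds 64 tags (1000..1063), matching MAX_DEVICES; each
 * vhost device is assigned vlan_tags[device_fh] when it is linked to its
 * VMDQ pool (see link_vmdq() below).
 */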
300 
301 /* ethernet addresses of ports */
302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
303 
304 /* heads for the main used and free linked lists for the data path. */
305 static struct virtio_net_data_ll *ll_root_used = NULL;
306 static struct virtio_net_data_ll *ll_root_free = NULL;
307 
308 /* Array of data core structures containing information on individual core linked lists. */
309 static struct lcore_info lcore_info[RTE_MAX_LCORE];
310 
311 /* Used for queueing bursts of TX packets. */
312 struct mbuf_table {
313 	unsigned len;
314 	unsigned txq_id;
315 	struct rte_mbuf *m_table[MAX_PKT_BURST];
316 };
317 
318 /* TX queue for each data core. */
319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
320 
321 /* TX queue for each virtio device for zero copy. */
322 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
323 
324 /* Vlan header struct used to insert vlan tags on TX. */
325 struct vlan_ethhdr {
326 	unsigned char   h_dest[ETH_ALEN];
327 	unsigned char   h_source[ETH_ALEN];
328 	__be16          h_vlan_proto;
329 	__be16          h_vlan_TCI;
330 	__be16          h_vlan_encapsulated_proto;
331 };
332 
333 /* IPv4 Header */
334 struct ipv4_hdr {
335 	uint8_t  version_ihl;		/**< version and header length */
336 	uint8_t  type_of_service;	/**< type of service */
337 	uint16_t total_length;		/**< length of packet */
338 	uint16_t packet_id;		/**< packet ID */
339 	uint16_t fragment_offset;	/**< fragmentation offset */
340 	uint8_t  time_to_live;		/**< time to live */
341 	uint8_t  next_proto_id;		/**< protocol ID */
342 	uint16_t hdr_checksum;		/**< header checksum */
343 	uint32_t src_addr;		/**< source address */
344 	uint32_t dst_addr;		/**< destination address */
345 } __attribute__((__packed__));
346 
347 /* Header lengths. */
348 #define VLAN_HLEN       4
349 #define VLAN_ETH_HLEN   18
350 
351 /* Per-device statistics struct */
352 struct device_statistics {
353 	uint64_t tx_total;
354 	rte_atomic64_t rx_total_atomic;
355 	uint64_t rx_total;
356 	uint64_t tx;
357 	rte_atomic64_t rx_atomic;
358 	uint64_t rx;
359 } __rte_cache_aligned;
360 struct device_statistics dev_statistics[MAX_DEVICES];
361 
362 /*
363  * Builds up the correct configuration for VMDQ VLAN pool map
364  * according to the pool & queue limits.
365  */
366 static inline int
367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
368 {
369 	struct rte_eth_vmdq_rx_conf conf;
370 	unsigned i;
371 
372 	memset(&conf, 0, sizeof(conf));
373 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
374 	conf.nb_pool_maps = num_devices;
375 	conf.enable_loop_back =
376 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
377 
378 	for (i = 0; i < conf.nb_pool_maps; i++) {
379 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
380 		conf.pool_map[i].pools = (1UL << i);
381 	}
382 
383 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
384 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
385 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
386 	return 0;
387 }
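/*
 * Illustrative example: with num_devices = 8 the configuration built above is
 *   conf.nb_queue_pools = 8 (ETH_8_POOLS);
 *   conf.pool_map[0]    = { .vlan_id = 1000, .pools = 0x01 };
 *   ...
 *   conf.pool_map[7]    = { .vlan_id = 1007, .pools = 0x80 };
 * so each virtio device gets its own VMDQ pool, selected by its VLAN tag.
 */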
388 
389 /*
390  * Validate the device number against the max pool number obtained from
391  * dev_info. If the device number is invalid, print an error message and
392  * return -1. Each device must have its own pool.
393  */
394 static inline int
395 validate_num_devices(uint32_t max_nb_devices)
396 {
397 	if (num_devices > max_nb_devices) {
398 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
399 		return -1;
400 	}
401 	return 0;
402 }
403 
404 /*
405  * Initialises a given port using global settings and with the rx buffers
406  * coming from the mbuf_pool passed as parameter
407  */
408 static inline int
409 port_init(uint8_t port)
410 {
411 	struct rte_eth_dev_info dev_info;
412 	struct rte_eth_conf port_conf;
413 	uint16_t rx_rings, tx_rings;
414 	uint16_t rx_ring_size, tx_ring_size;
415 	int retval;
416 	uint16_t q;
417 
418 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
419 	rte_eth_dev_info_get (port, &dev_info);
420 
421 	/*configure the number of supported virtio devices based on VMDQ limits */
422 	num_devices = dev_info.max_vmdq_pools;
423 	num_queues = dev_info.max_rx_queues;
424 
425 	if (zero_copy) {
426 		rx_ring_size = num_rx_descriptor;
427 		tx_ring_size = num_tx_descriptor;
428 		tx_rings = dev_info.max_tx_queues;
429 	} else {
430 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
431 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
432 		tx_rings = (uint16_t)rte_lcore_count();
433 	}
434 
435 	retval = validate_num_devices(MAX_DEVICES);
436 	if (retval < 0)
437 		return retval;
438 
439 	/* Get port configuration. */
440 	retval = get_eth_conf(&port_conf, num_devices);
441 	if (retval < 0)
442 		return retval;
443 
444 	if (port >= rte_eth_dev_count()) return -1;
445 
446 	rx_rings = (uint16_t)num_queues;
447 	/* Configure ethernet device. */
448 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449 	if (retval != 0)
450 		return retval;
451 
452 	/* Setup the queues. */
453 	for (q = 0; q < rx_rings; q ++) {
454 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455 						rte_eth_dev_socket_id(port), &rx_conf_default,
456 						vpool_array[q].pool);
457 		if (retval < 0)
458 			return retval;
459 	}
460 	for (q = 0; q < tx_rings; q ++) {
461 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 						rte_eth_dev_socket_id(port), &tx_conf_default);
463 		if (retval < 0)
464 			return retval;
465 	}
466 
467 	/* Start the device. */
468 	retval  = rte_eth_dev_start(port);
469 	if (retval < 0) {
470 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471 		return retval;
472 	}
473 
474 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
475 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
476 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
477 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
478 			(unsigned)port,
479 			vmdq_ports_eth_addr[port].addr_bytes[0],
480 			vmdq_ports_eth_addr[port].addr_bytes[1],
481 			vmdq_ports_eth_addr[port].addr_bytes[2],
482 			vmdq_ports_eth_addr[port].addr_bytes[3],
483 			vmdq_ports_eth_addr[port].addr_bytes[4],
484 			vmdq_ports_eth_addr[port].addr_bytes[5]);
485 
486 	return 0;
487 }
488 
489 /*
490  * Set character device basename.
491  */
492 static int
493 us_vhost_parse_basename(const char *q_arg)
494 {
495 	/* Check the basename length and copy it. */
496 
497 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
498 		return -1;
499 	else
500 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
501 
502 	return 0;
503 }
504 
505 /*
506  * Parse the portmask provided at run time.
507  */
508 static int
509 parse_portmask(const char *portmask)
510 {
511 	char *end = NULL;
512 	unsigned long pm;
513 
514 	errno = 0;
515 
516 	/* parse hexadecimal string */
517 	pm = strtoul(portmask, &end, 16);
518 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
519 		return -1;
520 
521 	if (pm == 0)
522 		return -1;
523 
524 	return pm;
525 
526 }
527 
528 /*
529  * Parse num options at run time.
530  */
531 static int
532 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
533 {
534 	char *end = NULL;
535 	unsigned long num;
536 
537 	errno = 0;
538 
539 	/* parse unsigned int string */
540 	num = strtoul(q_arg, &end, 10);
541 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
542 		return -1;
543 
544 	if (num > max_valid_value)
545 		return -1;
546 
547 	return num;
548 
549 }
550 
551 /*
552  * Display usage
553  */
554 static void
555 us_vhost_usage(const char *prgname)
556 {
557 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
558 	"		--vm2vm [0|1|2]\n"
559 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
560 	"		--dev-basename <name>\n"
561 	"		--nb-devices ND\n"
562 	"		-p PORTMASK: Set mask for ports to be used by application\n"
563 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
564 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
565 	"		--rx-retry-delay [0-N]: timeout(in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
566 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
567 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
568 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
569 	"		--dev-basename: The basename to be used for the character device.\n"
570 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
571 			"zero copy\n"
572 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
573 			"used only when zero copy is enabled.\n"
574 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
575 			"used only when zero copy is enabled.\n",
576 	       prgname);
577 }
578 
579 /*
580  * Parse the arguments given in the command line of the application.
581  */
582 static int
583 us_vhost_parse_args(int argc, char **argv)
584 {
585 	int opt, ret;
586 	int option_index;
587 	unsigned i;
588 	const char *prgname = argv[0];
589 	static struct option long_option[] = {
590 		{"vm2vm", required_argument, NULL, 0},
591 		{"rx-retry", required_argument, NULL, 0},
592 		{"rx-retry-delay", required_argument, NULL, 0},
593 		{"rx-retry-num", required_argument, NULL, 0},
594 		{"mergeable", required_argument, NULL, 0},
595 		{"stats", required_argument, NULL, 0},
596 		{"dev-basename", required_argument, NULL, 0},
597 		{"zero-copy", required_argument, NULL, 0},
598 		{"rx-desc-num", required_argument, NULL, 0},
599 		{"tx-desc-num", required_argument, NULL, 0},
600 		{NULL, 0, 0, 0},
601 	};
602 
603 	/* Parse command line */
604 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
605 		switch (opt) {
606 		/* Portmask */
607 		case 'p':
608 			enabled_port_mask = parse_portmask(optarg);
609 			if (enabled_port_mask == 0) {
610 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611 				us_vhost_usage(prgname);
612 				return -1;
613 			}
614 			break;
615 
616 		case 0:
617 			/* Enable/disable vm2vm comms. */
618 			if (!strncmp(long_option[option_index].name, "vm2vm",
619 				MAX_LONG_OPT_SZ)) {
620 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
621 				if (ret == -1) {
622 					RTE_LOG(INFO, VHOST_CONFIG,
623 						"Invalid argument for "
624 						"vm2vm [0|1|2]\n");
625 					us_vhost_usage(prgname);
626 					return -1;
627 				} else {
628 					vm2vm_mode = (vm2vm_type)ret;
629 				}
630 			}
631 
632 			/* Enable/disable retries on RX. */
633 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
634 				ret = parse_num_opt(optarg, 1);
635 				if (ret == -1) {
636 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
637 					us_vhost_usage(prgname);
638 					return -1;
639 				} else {
640 					enable_retry = ret;
641 				}
642 			}
643 
644 			/* Specify the retry delay time (in microseconds) on RX. */
645 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
646 				ret = parse_num_opt(optarg, INT32_MAX);
647 				if (ret == -1) {
648 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					burst_rx_delay_time = ret;
653 				}
654 			}
655 
656 			/* Specify the number of retries on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, INT32_MAX);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					burst_rx_retry_num = ret;
665 				}
666 			}
667 
668 			/* Enable/disable RX mergeable buffers. */
669 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else {
676 					if (ret) {
677 						vmdq_conf_default.rxmode.jumbo_frame = 1;
678 						vmdq_conf_default.rxmode.max_rx_pkt_len
679 							= JUMBO_FRAME_MAX_SIZE;
680 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
681 					}
682 				}
683 			}
684 
685 			/* Enable/disable stats. */
686 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
687 				ret = parse_num_opt(optarg, INT32_MAX);
688 				if (ret == -1) {
689 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
690 					us_vhost_usage(prgname);
691 					return -1;
692 				} else {
693 					enable_stats = ret;
694 				}
695 			}
696 
697 			/* Set character device basename. */
698 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
699 				if (us_vhost_parse_basename(optarg) == -1) {
700 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
701 					us_vhost_usage(prgname);
702 					return -1;
703 				}
704 			}
705 
706 			/* Enable/disable rx/tx zero copy. */
707 			if (!strncmp(long_option[option_index].name,
708 				"zero-copy", MAX_LONG_OPT_SZ)) {
709 				ret = parse_num_opt(optarg, 1);
710 				if (ret == -1) {
711 					RTE_LOG(INFO, VHOST_CONFIG,
712 						"Invalid argument"
713 						" for zero-copy [0|1]\n");
714 					us_vhost_usage(prgname);
715 					return -1;
716 				} else
717 					zero_copy = ret;
718 
719 				if (zero_copy) {
720 #ifdef RTE_MBUF_REFCNT
721 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
722 					"zero copy vhost APP, please "
723 					"disable RTE_MBUF_REFCNT\n"
724 					"in config file and then rebuild DPDK "
725 					"core lib!\n"
726 					"Otherwise please disable zero copy "
727 					"flag in command line!\n");
728 					return -1;
729 #endif
730 				}
731 			}
732 
733 			/* Specify the descriptor number on RX. */
734 			if (!strncmp(long_option[option_index].name,
735 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
736 				ret = parse_num_opt(optarg, MAX_RING_DESC);
737 				if ((ret == -1) || (!POWEROF2(ret))) {
738 					RTE_LOG(INFO, VHOST_CONFIG,
739 					"Invalid argument for rx-desc-num[0-N],"
740 					"power of 2 required.\n");
741 					us_vhost_usage(prgname);
742 					return -1;
743 				} else {
744 					num_rx_descriptor = ret;
745 				}
746 			}
747 
748 			/* Specify the descriptor number on TX. */
749 			if (!strncmp(long_option[option_index].name,
750 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
751 				ret = parse_num_opt(optarg, MAX_RING_DESC);
752 				if ((ret == -1) || (!POWEROF2(ret))) {
753 					RTE_LOG(INFO, VHOST_CONFIG,
754 					"Invalid argument for tx-desc-num [0-N],"
755 					"power of 2 required.\n");
756 					us_vhost_usage(prgname);
757 					return -1;
758 				} else {
759 					num_tx_descriptor = ret;
760 				}
761 			}
762 
763 			break;
764 
765 			/* Invalid option - print options. */
766 		default:
767 			us_vhost_usage(prgname);
768 			return -1;
769 		}
770 	}
771 
772 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
773 		if (enabled_port_mask & (1 << i))
774 			ports[num_ports++] = (uint8_t)i;
775 	}
776 
777 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
778 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
779 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
780 		return -1;
781 	}
782 
783 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
784 		RTE_LOG(INFO, VHOST_PORT,
785 			"Vhost zero copy doesn't support software vm2vm, "
786 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
787 		return -1;
788 	}
789 
790 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
791 		RTE_LOG(INFO, VHOST_PORT,
792 			"Vhost zero copy doesn't support jumbo frame, "
793 			"please specify '--mergeable 0' to disable the "
794 			"mergeable feature.\n");
795 		return -1;
796 	}
797 
798 	return 0;
799 }
800 
801 /*
802  * Update the global variable num_ports and the array ports according to the
803  * number of system ports, and return the number of valid ports.
804  */
805 static unsigned check_ports_num(unsigned nb_ports)
806 {
807 	unsigned valid_num_ports = num_ports;
808 	unsigned portid;
809 
810 	if (num_ports > nb_ports) {
811 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
812 			num_ports, nb_ports);
813 		num_ports = nb_ports;
814 	}
815 
816 	for (portid = 0; portid < num_ports; portid ++) {
817 		if (ports[portid] >= nb_ports) {
818 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
819 				ports[portid], (nb_ports - 1));
820 			ports[portid] = INVALID_PORT_ID;
821 			valid_num_ports--;
822 		}
823 	}
824 	return valid_num_ports;
825 }
826 
827 /*
828  * Macro to print out packet contents. Wrapped in debug define so that the
829  * data path is not affected when debug is disabled.
830  */
831 #ifdef DEBUG
832 #define PRINT_PACKET(device, addr, size, header) do {																\
833 	char *pkt_addr = (char*)(addr);																					\
834 	unsigned int index;																								\
835 	char packet[MAX_PRINT_BUFF];																					\
836 																													\
837 	if ((header))																									\
838 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
839 	else																											\
840 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
841 	for (index = 0; index < (size); index++) {																		\
842 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
843 			"%02hhx ", pkt_addr[index]);																			\
844 	}																												\
845 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
846 																													\
847 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
848 } while(0)
849 #else
850 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
851 #endif
852 
853 /*
854  * Function to convert guest physical addresses to vhost physical addresses.
855  * This is used to convert virtio buffer addresses.
856  */
857 static inline uint64_t __attribute__((always_inline))
858 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
859 	uint32_t buf_len, hpa_type *addr_type)
860 {
861 	struct virtio_memory_regions_hpa *region;
862 	uint32_t regionidx;
863 	uint64_t vhost_pa = 0;
864 
865 	*addr_type = PHYS_ADDR_INVALID;
866 
867 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
868 		region = &vdev->regions_hpa[regionidx];
869 		if ((guest_pa >= region->guest_phys_address) &&
870 			(guest_pa <= region->guest_phys_address_end)) {
871 			vhost_pa = region->host_phys_addr_offset + guest_pa;
872 			if (likely((guest_pa + buf_len - 1)
873 				<= region->guest_phys_address_end))
874 				*addr_type = PHYS_ADDR_CONTINUOUS;
875 			else
876 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
877 			break;
878 		}
879 	}
880 
881 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
882 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
883 		(void *)(uintptr_t)vhost_pa);
884 
885 	return vhost_pa;
886 }
887 
888 /*
889  * Compares a packet destination MAC address to a device MAC address.
890  */
891 static inline int __attribute__((always_inline))
892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
893 {
894 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
895 }
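/*
 * Both 6-byte addresses are loaded as 64-bit words and XORed; on a
 * little-endian CPU the mask MAC_ADDR_CMP (0x0000FFFFFFFFFFFF) keeps only
 * the 48 address bits, discarding the two bytes read past each ether_addr.
 * This assumes those trailing bytes are safely readable.
 */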
896 
897 /*
898  * This function learns the MAC address of the device and registers this along with a
899  * vlan tag to a VMDQ.
900  */
901 static int
902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
903 {
904 	struct ether_hdr *pkt_hdr;
905 	struct virtio_net_data_ll *dev_ll;
906 	struct virtio_net *dev = vdev->dev;
907 	int i, ret;
908 
909 	/* Learn MAC address of guest device from packet */
910 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
911 
912 	dev_ll = ll_root_used;
913 
914 	while (dev_ll != NULL) {
915 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
916 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
917 			return -1;
918 		}
919 		dev_ll = dev_ll->next;
920 	}
921 
922 	for (i = 0; i < ETHER_ADDR_LEN; i++)
923 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
924 
925 	/* vlan_tag currently uses the device_id. */
926 	vdev->vlan_tag = vlan_tags[dev->device_fh];
927 
928 	/* Print out VMDQ registration info. */
929 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
930 		dev->device_fh,
931 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
932 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
933 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
934 		vdev->vlan_tag);
935 
936 	/* Register the MAC address. */
937 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
938 	if (ret)
939 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
940 					dev->device_fh);
941 
942 	/* Enable stripping of the vlan tag as we handle routing. */
943 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
944 
945 	/* Set device as ready for RX. */
946 	vdev->ready = DEVICE_RX;
947 
948 	return 0;
949 }
950 
951 /*
952  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
953  * queue before disabling RX on the device.
954  */
955 static inline void
956 unlink_vmdq(struct vhost_dev *vdev)
957 {
958 	unsigned i = 0;
959 	unsigned rx_count;
960 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
961 
962 	if (vdev->ready == DEVICE_RX) {
963 		/*clear MAC and VLAN settings*/
964 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
965 		for (i = 0; i < 6; i++)
966 			vdev->mac_address.addr_bytes[i] = 0;
967 
968 		vdev->vlan_tag = 0;
969 
970 		/*Clear out the receive buffers*/
971 		rx_count = rte_eth_rx_burst(ports[0],
972 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
973 
974 		while (rx_count) {
975 			for (i = 0; i < rx_count; i++)
976 				rte_pktmbuf_free(pkts_burst[i]);
977 
978 			rx_count = rte_eth_rx_burst(ports[0],
979 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980 		}
981 
982 		vdev->ready = DEVICE_MAC_LEARNING;
983 	}
984 }
985 
986 /*
987  * Check if the packet destination MAC address is for a local device. If so then put
988  * the packet on that device's RX queue. If not then return.
989  */
990 static inline unsigned __attribute__((always_inline))
991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
992 {
993 	struct virtio_net_data_ll *dev_ll;
994 	struct ether_hdr *pkt_hdr;
995 	uint64_t ret = 0;
996 	struct virtio_net *dev = vdev->dev;
997 	struct virtio_net *tdev; /* destination virtio device */
998 
999 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1000 
1001 	/*get the used devices list*/
1002 	dev_ll = ll_root_used;
1003 
1004 	while (dev_ll != NULL) {
1005 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1006 				          &dev_ll->vdev->mac_address)) {
1007 
1008 			/* Drop the packet if the TX packet is destined for the TX device. */
1009 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1010 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1011 							dev->device_fh);
1012 				return 0;
1013 			}
1014 			tdev = dev_ll->vdev->dev;
1015 
1016 
1017 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1018 
1019 			if (dev_ll->vdev->remove) {
1020 				/*drop the packet if the device is marked for removal*/
1021 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1022 			} else {
1023 				uint32_t mergeable =
1024 					tdev->features &
1025 					(1 << VIRTIO_NET_F_MRG_RXBUF);
1026 
1027 				/*send the packet to the local virtio device*/
1028 				if (likely(mergeable == 0))
1029 					ret = virtio_dev_rx(tdev, &m, 1);
1030 				else
1031 					ret = virtio_dev_merge_rx(tdev,
1032 						&m, 1);
1033 
1034 				if (enable_stats) {
1035 					rte_atomic64_add(
1036 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1037 					1);
1038 					rte_atomic64_add(
1039 					&dev_statistics[tdev->device_fh].rx_atomic,
1040 					ret);
1041 					dev_statistics[tdev->device_fh].tx_total++;
1042 					dev_statistics[tdev->device_fh].tx += ret;
1043 				}
1044 			}
1045 
1046 			return 0;
1047 		}
1048 		dev_ll = dev_ll->next;
1049 	}
1050 
1051 	return -1;
1052 }
1053 
1054 /*
1055  * This function routes the TX packet to the correct interface. This may be a local device
1056  * or the physical port.
1057  */
1058 static inline void __attribute__((always_inline))
1059 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1060 {
1061 	struct mbuf_table *tx_q;
1062 	struct vlan_ethhdr *vlan_hdr;
1063 	struct rte_mbuf **m_table;
1064 	struct rte_mbuf *mbuf, *prev;
1065 	unsigned len, ret, offset = 0;
1066 	const uint16_t lcore_id = rte_lcore_id();
1067 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1068 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1069 	struct virtio_net *dev = vdev->dev;
1070 
1071 	/*check if destination is local VM*/
1072 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1073 		return;
1074 
1075 	if (vm2vm_mode == VM2VM_HARDWARE) {
1076 		while (dev_ll != NULL) {
1077 			if ((dev_ll->vdev->ready == DEVICE_RX)
1078 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1079 				&dev_ll->vdev->mac_address)) {
1080 				/*
1081 				 * Drop the packet if the TX packet is
1082 				 * destined for the TX device.
1083 				 */
1084 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1085 					LOG_DEBUG(VHOST_DATA,
1086 					"(%"PRIu64") TX: Source and destination"
1087 					" MAC addresses are the same. Dropping "
1088 					"packet.\n",
1089 					dev_ll->vdev->dev->device_fh);
1090 					return;
1091 				}
1092 				offset = 4;
1093 				vlan_tag =
1094 				(uint16_t)
1095 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1096 
1097 				LOG_DEBUG(VHOST_DATA,
1098 				"(%"PRIu64") TX: pkt to local VM device id:"
1099 				"(%"PRIu64") vlan tag: %d.\n",
1100 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1101 				vlan_tag);
1102 
1103 				break;
1104 			}
1105 			dev_ll = dev_ll->next;
1106 		}
1107 	}
1108 
1109 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1110 
1111 	/*Add packet to the port tx queue*/
1112 	tx_q = &lcore_tx_queue[lcore_id];
1113 	len = tx_q->len;
1114 
1115 	/* Allocate an mbuf and populate the structure. */
1116 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1117 	if (unlikely(mbuf == NULL)) {
1118 		RTE_LOG(ERR, VHOST_DATA,
1119 			"Failed to allocate memory for mbuf.\n");
1120 		return;
1121 	}
1122 
1123 	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1124 	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1125 	mbuf->nb_segs = m->nb_segs;
1126 
1127 	/* Copy ethernet header to mbuf. */
1128 	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1129 		rte_pktmbuf_mtod(m, const void *),
1130 		ETH_HLEN);
1131 
1132 
1133 	/* Setup vlan header. Bytes need to be re-ordered for network with htons()*/
1134 	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1135 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1136 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1137 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1138 
1139 	/* Copy the remaining packet contents to the mbuf. */
1140 	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1141 		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1142 		(m->data_len - ETH_HLEN));
1143 
1144 	/* Copy the remaining segments for the whole packet. */
1145 	prev = mbuf;
1146 	while (m->next) {
1147 		/* Allocate an mbuf and populate the structure. */
1148 		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1149 		if (unlikely(next_mbuf == NULL)) {
1150 			rte_pktmbuf_free(mbuf);
1151 			RTE_LOG(ERR, VHOST_DATA,
1152 				"Failed to allocate memory for mbuf.\n");
1153 			return;
1154 		}
1155 
1156 		m = m->next;
1157 		prev->next = next_mbuf;
1158 		prev = next_mbuf;
1159 		next_mbuf->data_len = m->data_len;
1160 
1161 		/* Copy data to next mbuf. */
1162 		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1163 			rte_pktmbuf_mtod(m, const void *), m->data_len);
1164 	}
1165 
1166 	tx_q->m_table[len] = mbuf;
1167 	len++;
1168 	if (enable_stats) {
1169 		dev_statistics[dev->device_fh].tx_total++;
1170 		dev_statistics[dev->device_fh].tx++;
1171 	}
1172 
1173 	if (unlikely(len == MAX_PKT_BURST)) {
1174 		m_table = (struct rte_mbuf **)tx_q->m_table;
1175 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1176 		/* Free any buffers not handled by TX and update the port stats. */
1177 		if (unlikely(ret < len)) {
1178 			do {
1179 				rte_pktmbuf_free(m_table[ret]);
1180 			} while (++ret < len);
1181 		}
1182 
1183 		len = 0;
1184 	}
1185 
1186 	tx_q->len = len;
1187 	return;
1188 }
1189 /*
1190  * This function is called by each data core. It handles all RX/TX registered with the
1191  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1192  * with all devices in the main linked list.
1193  */
1194 static int
1195 switch_worker(__attribute__((unused)) void *arg)
1196 {
1197 	struct rte_mempool *mbuf_pool = arg;
1198 	struct virtio_net *dev = NULL;
1199 	struct vhost_dev *vdev = NULL;
1200 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1201 	struct virtio_net_data_ll *dev_ll;
1202 	struct mbuf_table *tx_q;
1203 	volatile struct lcore_ll_info *lcore_ll;
1204 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1205 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1206 	unsigned ret, i;
1207 	const uint16_t lcore_id = rte_lcore_id();
1208 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1209 	uint16_t rx_count = 0;
1210 	uint32_t mergeable = 0;
1211 
1212 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1213 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1214 	prev_tsc = 0;
1215 
1216 	tx_q = &lcore_tx_queue[lcore_id];
1217 	for (i = 0; i < num_cores; i ++) {
1218 		if (lcore_ids[i] == lcore_id) {
1219 			tx_q->txq_id = i;
1220 			break;
1221 		}
1222 	}
1223 
1224 	while(1) {
1225 		cur_tsc = rte_rdtsc();
1226 		/*
1227 		 * TX burst queue drain
1228 		 */
1229 		diff_tsc = cur_tsc - prev_tsc;
1230 		if (unlikely(diff_tsc > drain_tsc)) {
1231 
1232 			if (tx_q->len) {
1233 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1234 
1235 				/*Tx any packets in the queue*/
1236 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1237 									   (struct rte_mbuf **)tx_q->m_table,
1238 									   (uint16_t)tx_q->len);
1239 				if (unlikely(ret < tx_q->len)) {
1240 					do {
1241 						rte_pktmbuf_free(tx_q->m_table[ret]);
1242 					} while (++ret < tx_q->len);
1243 				}
1244 
1245 				tx_q->len = 0;
1246 			}
1247 
1248 			prev_tsc = cur_tsc;
1249 
1250 		}
1251 
1252 		rte_prefetch0(lcore_ll->ll_root_used);
1253 		/*
1254 		 * Inform the configuration core that we have exited the linked list and that no devices are
1255 		 * in use if requested.
1256 		 */
1257 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1258 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1259 
1260 		/*
1261 		 * Process devices
1262 		 */
1263 		dev_ll = lcore_ll->ll_root_used;
1264 
1265 		while (dev_ll != NULL) {
1266 			/*get virtio device ID*/
1267 			vdev = dev_ll->vdev;
1268 			dev = vdev->dev;
1269 			mergeable =
1270 				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
1271 
1272 			if (vdev->remove) {
1273 				dev_ll = dev_ll->next;
1274 				unlink_vmdq(vdev);
1275 				vdev->ready = DEVICE_SAFE_REMOVE;
1276 				continue;
1277 			}
1278 			if (likely(vdev->ready == DEVICE_RX)) {
1279 				/*Handle guest RX*/
1280 				rx_count = rte_eth_rx_burst(ports[0],
1281 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1282 
1283 				if (rx_count) {
1284 					if (likely(mergeable == 0))
1285 						ret_count =
1286 							virtio_dev_rx(dev,
1287 							pkts_burst, rx_count);
1288 					else
1289 						ret_count =
1290 							virtio_dev_merge_rx(dev,
1291 							pkts_burst, rx_count);
1292 
1293 					if (enable_stats) {
1294 						rte_atomic64_add(
1295 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1296 						rx_count);
1297 						rte_atomic64_add(
1298 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1299 					}
1300 					while (likely(rx_count)) {
1301 						rx_count--;
1302 						rte_pktmbuf_free(pkts_burst[rx_count]);
1303 					}
1304 
1305 				}
1306 			}
1307 
1308 			if (!vdev->remove) {
1309 				/*Handle guest TX*/
1310 				if (likely(mergeable == 0))
1311 					virtio_dev_tx(dev, mbuf_pool);
1312 				else
1313 					virtio_dev_merge_tx(dev, mbuf_pool);
1314 			}
1315 
1316 			/*move to the next device in the list*/
1317 			dev_ll = dev_ll->next;
1318 		}
1319 	}
1320 
1321 	return 0;
1322 }
1323 
1324 /*
1325  * This function gets the number of available ring entries for zero copy rx.
1326  * Only one thread will call this function for a particular virtio device,
1327  * so it is designed as a non-thread-safe function.
1328  */
1329 static inline uint32_t __attribute__((always_inline))
1330 get_available_ring_num_zcp(struct virtio_net *dev)
1331 {
1332 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1333 	uint16_t avail_idx;
1334 
1335 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1336 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1337 }
1338 
1339 /*
1340  * This function gets the available ring index for zero copy rx;
1341  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_index_zcp(struct virtio_net *dev,
1347 	uint16_t *res_base_idx, uint32_t count)
1348 {
1349 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1350 	uint16_t avail_idx;
1351 	uint32_t retry = 0;
1352 	uint16_t free_entries;
1353 
1354 	*res_base_idx = vq->last_used_idx_res;
1355 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1356 	free_entries = (avail_idx - *res_base_idx);
1357 
1358 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1359 			"avail idx: %d, "
1360 			"res base idx:%d, free entries:%d\n",
1361 			dev->device_fh, avail_idx, *res_base_idx,
1362 			free_entries);
1363 
1364 	/*
1365 	 * If retry is enabled and the queue is full then we wait
1366 	 * and retry to avoid packet loss.
1367 	 */
1368 	if (enable_retry && unlikely(count > free_entries)) {
1369 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1370 			rte_delay_us(burst_rx_delay_time);
1371 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372 			free_entries = (avail_idx - *res_base_idx);
1373 			if (count <= free_entries)
1374 				break;
1375 		}
1376 	}
1377 
1378 	/*check that we have enough buffers*/
1379 	if (unlikely(count > free_entries))
1380 		count = free_entries;
1381 
1382 	if (unlikely(count == 0)) {
1383 		LOG_DEBUG(VHOST_DATA,
1384 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1385 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1386 			dev->device_fh, avail_idx,
1387 			*res_base_idx, free_entries);
1388 		return 0;
1389 	}
1390 
1391 	vq->last_used_idx_res = *res_base_idx + count;
1392 
1393 	return count;
1394 }
1395 
1396 /*
1397  * This function puts a descriptor back on the used list.
1398  */
1399 static inline void __attribute__((always_inline))
1400 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1401 {
1402 	uint16_t res_cur_idx = vq->last_used_idx;
1403 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1404 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1405 	rte_compiler_barrier();
1406 	*(volatile uint16_t *)&vq->used->idx += 1;
1407 	vq->last_used_idx += 1;
1408 
1409 	/* Kick the guest if necessary. */
1410 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1411 		eventfd_write((int)vq->kickfd, 1);
1412 }
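/*
 * In put_desc_to_used_list_zcp() above, the compiler barrier ensures the
 * used-ring entry is written before the used index is published to the
 * guest; the eventfd write then kicks the guest unless it has requested
 * no interrupts (VRING_AVAIL_F_NO_INTERRUPT).
 */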
1413 
1414 /*
1415  * This function gets an available descriptor from the virtio vring and an
1416  * unattached mbuf from vpool->ring, and attaches them together. It must
1417  * adjust the offset for buff_addr and phys_addr according to the PMD
1418  * implementation, otherwise the frame data may land at the wrong mbuf offset.
1419  */
1420 static inline void __attribute__((always_inline))
1421 attach_rxmbuf_zcp(struct virtio_net *dev)
1422 {
1423 	uint16_t res_base_idx, desc_idx;
1424 	uint64_t buff_addr, phys_addr;
1425 	struct vhost_virtqueue *vq;
1426 	struct vring_desc *desc;
1427 	struct rte_mbuf *mbuf = NULL;
1428 	struct vpool *vpool;
1429 	hpa_type addr_type;
1430 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1431 
1432 	vpool = &vpool_array[vdev->vmdq_rx_q];
1433 	vq = dev->virtqueue[VIRTIO_RXQ];
1434 
1435 	do {
1436 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1437 				1) != 1))
1438 			return;
1439 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1440 
1441 		desc = &vq->desc[desc_idx];
1442 		if (desc->flags & VRING_DESC_F_NEXT) {
1443 			desc = &vq->desc[desc->next];
1444 			buff_addr = gpa_to_vva(dev, desc->addr);
1445 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1446 					&addr_type);
1447 		} else {
1448 			buff_addr = gpa_to_vva(dev,
1449 					desc->addr + vq->vhost_hlen);
1450 			phys_addr = gpa_to_hpa(vdev,
1451 					desc->addr + vq->vhost_hlen,
1452 					desc->len, &addr_type);
1453 		}
1454 
1455 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1456 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1457 				" address found when attaching RX frame buffer"
1458 				" address!\n", dev->device_fh);
1459 			put_desc_to_used_list_zcp(vq, desc_idx);
1460 			continue;
1461 		}
1462 
1463 		/*
1464 		 * Check if the frame buffer address from guest crosses
1465 		 * sub-region or not.
1466 		 */
1467 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1468 			RTE_LOG(ERR, VHOST_DATA,
1469 				"(%"PRIu64") Frame buffer address cross "
1470 				"sub-region found when attaching RX frame "
1471 				"buffer address!\n",
1472 				dev->device_fh);
1473 			put_desc_to_used_list_zcp(vq, desc_idx);
1474 			continue;
1475 		}
1476 	} while (unlikely(phys_addr == 0));
1477 
1478 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1479 	if (unlikely(mbuf == NULL)) {
1480 		LOG_DEBUG(VHOST_DATA,
1481 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1482 			"ring_sc_dequeue fail.\n",
1483 			dev->device_fh);
1484 		put_desc_to_used_list_zcp(vq, desc_idx);
1485 		return;
1486 	}
1487 
1488 	if (unlikely(vpool->buf_size > desc->len)) {
1489 		LOG_DEBUG(VHOST_DATA,
1490 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1491 			"length(%d) of descriptor idx: %d less than room "
1492 			"size required: %d\n",
1493 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1494 		put_desc_to_used_list_zcp(vq, desc_idx);
1495 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1496 		return;
1497 	}
1498 
1499 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1500 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1501 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1502 	mbuf->data_len = desc->len;
1503 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1504 
1505 	LOG_DEBUG(VHOST_DATA,
1506 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1507 		"descriptor idx:%d\n",
1508 		dev->device_fh, res_base_idx, desc_idx);
1509 
1510 	__rte_mbuf_raw_free(mbuf);
1511 
1512 	return;
1513 }
1514 
1515 /*
1516  * Detach an attached packet mbuf -
1517  *  - restore original mbuf address and length values.
1518  *  - reset pktmbuf data and data_len to their default values.
1519  *  All other fields of the given packet mbuf will be left intact.
1520  *
1521  * @param m
1522  *   The attached packet mbuf.
1523  */
1524 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1525 {
1526 	const struct rte_mempool *mp = m->pool;
1527 	void *buf = RTE_MBUF_TO_BADDR(m);
1528 	uint32_t buf_ofs;
1529 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1530 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1531 
1532 	m->buf_addr = buf;
1533 	m->buf_len = (uint16_t)buf_len;
1534 
1535 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1536 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1537 	m->data_off = buf_ofs;
1538 
1539 	m->data_len = 0;
1540 }
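/*
 * This is needed because the zero-copy paths point an mbuf's buffer at
 * guest memory; detaching restores the mbuf's own buffer address, length
 * and headroom so it can be recycled safely through vpool->ring.
 */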
1541 
1542 /*
1543  * This function is called after packets have been transmitted. It fetches
1544  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1545  * also updates the used index and kicks the guest if necessary.
1546  */
1547 static inline uint32_t __attribute__((always_inline))
1548 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1549 {
1550 	struct rte_mbuf *mbuf;
1551 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1552 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1553 	uint32_t index = 0;
1554 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1555 
1556 	LOG_DEBUG(VHOST_DATA,
1557 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1558 		"clean is: %d\n",
1559 		dev->device_fh, mbuf_count);
1560 	LOG_DEBUG(VHOST_DATA,
1561 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1562 		"clean  is : %d\n",
1563 		dev->device_fh, rte_ring_count(vpool->ring));
1564 
1565 	for (index = 0; index < mbuf_count; index++) {
1566 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1567 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1568 			pktmbuf_detach_zcp(mbuf);
1569 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1570 
1571 		/* Update used index buffer information. */
1572 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1573 		vq->used->ring[used_idx].len = 0;
1574 
1575 		used_idx = (used_idx + 1) & (vq->size - 1);
1576 	}
1577 
1578 	LOG_DEBUG(VHOST_DATA,
1579 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1580 		"clean is: %d\n",
1581 		dev->device_fh, rte_mempool_count(vpool->pool));
1582 	LOG_DEBUG(VHOST_DATA,
1583 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1584 		"clean  is : %d\n",
1585 		dev->device_fh, rte_ring_count(vpool->ring));
1586 	LOG_DEBUG(VHOST_DATA,
1587 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1588 		"vq->last_used_idx:%d\n",
1589 		dev->device_fh, vq->last_used_idx);
1590 
1591 	vq->last_used_idx += mbuf_count;
1592 
1593 	LOG_DEBUG(VHOST_DATA,
1594 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1595 		"vq->last_used_idx:%d\n",
1596 		dev->device_fh, vq->last_used_idx);
1597 
1598 	rte_compiler_barrier();
1599 
1600 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1601 
1602 	/* Kick guest if required. */
1603 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1604 		eventfd_write((int)vq->kickfd, 1);
1605 
1606 	return 0;
1607 }
1608 
1609 /*
1610  * This function is called when a virtio device is destroyed.
1611  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1612  */
1613 static void mbuf_destroy_zcp(struct vpool *vpool)
1614 {
1615 	struct rte_mbuf *mbuf = NULL;
1616 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1617 
1618 	LOG_DEBUG(VHOST_CONFIG,
1619 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1620 		"mbuf_destroy_zcp is: %d\n",
1621 		mbuf_count);
1622 	LOG_DEBUG(VHOST_CONFIG,
1623 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1624 		"mbuf_destroy_zcp  is : %d\n",
1625 		rte_ring_count(vpool->ring));
1626 
1627 	for (index = 0; index < mbuf_count; index++) {
1628 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1629 		if (likely(mbuf != NULL)) {
1630 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1631 				pktmbuf_detach_zcp(mbuf);
1632 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1633 		}
1634 	}
1635 
1636 	LOG_DEBUG(VHOST_CONFIG,
1637 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1638 		"mbuf_destroy_zcp is: %d\n",
1639 		rte_mempool_count(vpool->pool));
1640 	LOG_DEBUG(VHOST_CONFIG,
1641 		"in mbuf_destroy_zcp: mbuf count in ring after "
1642 		"mbuf_destroy_zcp is : %d\n",
1643 		rte_ring_count(vpool->ring));
1644 }
1645 
1646 /*
1647  * This function updates the used ring and counters for zero copy RX.
1648  */
1649 static inline uint32_t __attribute__((always_inline))
1650 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1651 	uint32_t count)
1652 {
1653 	struct vhost_virtqueue *vq;
1654 	struct vring_desc *desc;
1655 	struct rte_mbuf *buff;
1656 	/* The virtio_hdr is initialised to 0. */
1657 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1658 		= {{0, 0, 0, 0, 0, 0}, 0};
1659 	uint64_t buff_hdr_addr = 0;
1660 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1661 	uint32_t head_idx, packet_success = 0;
1662 	uint16_t res_cur_idx;
1663 
1664 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1665 
1666 	if (count == 0)
1667 		return 0;
1668 
1669 	vq = dev->virtqueue[VIRTIO_RXQ];
1670 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1671 
1672 	res_cur_idx = vq->last_used_idx;
1673 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1674 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1675 
1676 	/* Retrieve all of the head indexes first to avoid caching issues. */
1677 	for (head_idx = 0; head_idx < count; head_idx++)
1678 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1679 
1680 	/* Prefetch descriptor index. */
1681 	rte_prefetch0(&vq->desc[head[packet_success]]);
1682 
1683 	while (packet_success != count) {
1684 		/* Get descriptor from available ring */
1685 		desc = &vq->desc[head[packet_success]];
1686 
1687 		buff = pkts[packet_success];
1688 		LOG_DEBUG(VHOST_DATA,
1689 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1690 			"pkt[%d] descriptor idx: %d\n",
1691 			dev->device_fh, packet_success,
1692 			MBUF_HEADROOM_UINT32(buff));
1693 
1694 		PRINT_PACKET(dev,
1695 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1696 			+ RTE_PKTMBUF_HEADROOM),
1697 			rte_pktmbuf_data_len(buff), 0);
1698 
1699 		/* Buffer address translation for virtio header. */
1700 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1701 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1702 
1703 		/*
1704 		 * If the descriptors are chained the header and data are
1705 		 * placed in separate buffers.
1706 		 */
1707 		if (desc->flags & VRING_DESC_F_NEXT) {
1708 			desc->len = vq->vhost_hlen;
1709 			desc = &vq->desc[desc->next];
1710 			desc->len = rte_pktmbuf_data_len(buff);
1711 		} else {
1712 			desc->len = packet_len;
1713 		}
1714 
1715 		/* Update used ring with desc information */
1716 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1717 			= head[packet_success];
1718 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1719 			= packet_len;
1720 		res_cur_idx++;
1721 		packet_success++;
1722 
1723 		/* A header is required per buffer. */
1724 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1725 			(const void *)&virtio_hdr, vq->vhost_hlen);
1726 
1727 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1728 
1729 		if (likely(packet_success < count)) {
1730 			/* Prefetch descriptor index. */
1731 			rte_prefetch0(&vq->desc[head[packet_success]]);
1732 		}
1733 	}
1734 
1735 	rte_compiler_barrier();
1736 
1737 	LOG_DEBUG(VHOST_DATA,
1738 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1739 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1740 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1741 
1742 	*(volatile uint16_t *)&vq->used->idx += count;
1743 	vq->last_used_idx += count;
1744 
1745 	LOG_DEBUG(VHOST_DATA,
1746 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1747 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1748 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1749 
1750 	/* Kick the guest if necessary. */
1751 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1752 		eventfd_write((int)vq->kickfd, 1);
1753 
1754 	return count;
1755 }
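
/*
 * Illustrative sketch, not part of the original example: when a guest RX
 * descriptor is chained (VRING_DESC_F_NEXT), the virtio-net header and the
 * packet data live in separate buffers, so two descriptor lengths must be
 * written; otherwise one descriptor carries both. The helper name
 * zcp_set_desc_len is an assumption used only for illustration.
 */
static inline void __attribute__((unused))
zcp_set_desc_len(struct vhost_virtqueue *vq, struct vring_desc *desc,
	uint32_t hdr_len, uint32_t data_len)
{
	if (desc->flags & VRING_DESC_F_NEXT) {
		/* Header and data are placed in separate buffers. */
		desc->len = hdr_len;
		desc = &vq->desc[desc->next];
		desc->len = data_len;
	} else {
		/* A single buffer holds both header and data. */
		desc->len = hdr_len + data_len;
	}
}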
1756 
1757 /*
1758  * This function routes the TX packet to the correct interface.
1759  * This may be a local device or the physical port.
1760  */
1761 static inline void __attribute__((always_inline))
1762 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1763 	uint32_t desc_idx, uint8_t need_copy)
1764 {
1765 	struct mbuf_table *tx_q;
1766 	struct rte_mbuf **m_table;
1767 	struct rte_mbuf *mbuf = NULL;
1768 	unsigned len, ret, offset = 0;
1769 	struct vpool *vpool;
1770 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1771 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1772 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1773 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1774 
1775 	/* Add packet to the port tx queue. */
1776 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1777 	len = tx_q->len;
1778 
1779 	/* Allocate an mbuf and populate the structure. */
1780 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1781 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1782 	if (unlikely(mbuf == NULL)) {
1783 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1784 		RTE_LOG(ERR, VHOST_DATA,
1785 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1786 			dev->device_fh);
1787 		put_desc_to_used_list_zcp(vq, desc_idx);
1788 		return;
1789 	}
1790 
1791 	if (vm2vm_mode == VM2VM_HARDWARE) {
1792 		/* Avoid using a vlan tag from any vm for an external pkt, such
1793 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts during
1794 		 * pool selection: the MAC address marks it as an external pkt
1795 		 * that should go to the network, while the vlan tag marks it
1796 		 * as a vm2vm pkt that should be forwarded to another vm. The
1797 		 * hardware cannot resolve such an ambiguity, so the pkt is lost.
1798 		 */
1799 		vlan_tag = external_pkt_default_vlan_tag;
1800 		while (dev_ll != NULL) {
1801 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1802 				ether_addr_cmp(&(pkt_hdr->d_addr),
1803 				&dev_ll->vdev->mac_address)) {
1804 
1805 				/*
1806 				 * Drop the packet if the TX packet is destined
1807 				 * for the TX device.
1808 				 */
1809 				if (unlikely(dev_ll->vdev->dev->device_fh
1810 					== dev->device_fh)) {
1811 					LOG_DEBUG(VHOST_DATA,
1812 					"(%"PRIu64") TX: Source and destination "
1813 					"MAC addresses are the same. Dropping "
1814 					"packet.\n",
1815 					dev_ll->vdev->dev->device_fh);
1816 					MBUF_HEADROOM_UINT32(mbuf)
1817 						= (uint32_t)desc_idx;
1818 					__rte_mbuf_raw_free(mbuf);
1819 					return;
1820 				}
1821 
1822 				/*
1823 				 * Packet length offset 4 bytes for HW vlan
1824 				 * strip when L2 switch back.
1825 				 */
1826 				offset = 4;
1827 				vlan_tag =
1828 				(uint16_t)
1829 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1830 
1831 				LOG_DEBUG(VHOST_DATA,
1832 				"(%"PRIu64") TX: pkt to local VM device id:"
1833 				"(%"PRIu64") vlan tag: %d.\n",
1834 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1835 				vlan_tag);
1836 
1837 				break;
1838 			}
1839 			dev_ll = dev_ll->next;
1840 		}
1841 	}
1842 
1843 	mbuf->nb_segs = m->nb_segs;
1844 	mbuf->next = m->next;
1845 	mbuf->data_len = m->data_len + offset;
1846 	mbuf->pkt_len = mbuf->data_len;
1847 	if (unlikely(need_copy)) {
1848 		/* Copy the packet contents to the mbuf. */
1849 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1850 			rte_pktmbuf_mtod(m, void *),
1851 			m->data_len);
1852 	} else {
1853 		mbuf->data_off = m->data_off;
1854 		mbuf->buf_physaddr = m->buf_physaddr;
1855 		mbuf->buf_addr = m->buf_addr;
1856 	}
1857 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1858 	mbuf->vlan_tci = vlan_tag;
1859 	mbuf->l2_len = sizeof(struct ether_hdr);
1860 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1861 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1862 
1863 	tx_q->m_table[len] = mbuf;
1864 	len++;
1865 
1866 	LOG_DEBUG(VHOST_DATA,
1867 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1868 		dev->device_fh,
1869 		mbuf->nb_segs,
1870 		(mbuf->next == NULL) ? "null" : "non-null");
1871 
1872 	if (enable_stats) {
1873 		dev_statistics[dev->device_fh].tx_total++;
1874 		dev_statistics[dev->device_fh].tx++;
1875 	}
1876 
1877 	if (unlikely(len == MAX_PKT_BURST)) {
1878 		m_table = (struct rte_mbuf **)tx_q->m_table;
1879 		ret = rte_eth_tx_burst(ports[0],
1880 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1881 
1882 		/*
1883 		 * Free any buffers not handled by TX and update
1884 		 * the port stats.
1885 		 */
1886 		if (unlikely(ret < len)) {
1887 			do {
1888 				rte_pktmbuf_free(m_table[ret]);
1889 			} while (++ret < len);
1890 		}
1891 
1892 		len = 0;
1893 		txmbuf_clean_zcp(dev, vpool);
1894 	}
1895 
1896 	tx_q->len = len;
1897 
1898 	return;
1899 }
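
/*
 * Illustrative sketch, not part of the original example: the burst flush
 * above is the usual DPDK TX pattern -- hand the table to rte_eth_tx_burst()
 * and free whatever the driver did not accept. The helper name
 * flush_tx_table is an assumption used only for illustration.
 */
static inline void __attribute__((unused))
flush_tx_table(uint8_t port, uint16_t txq_id,
	struct rte_mbuf **m_table, uint16_t len)
{
	uint16_t sent = rte_eth_tx_burst(port, txq_id, m_table, len);

	/* Free any mbufs the driver could not enqueue. */
	while (sent < len)
		rte_pktmbuf_free(m_table[sent++]);
}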
1900 
1901 /*
1902  * This function transmits all available packets in the virtio TX queue for
1903  * one virtio-net device. On the first packet it learns the MAC address and
1904  * sets up VMDQ.
1905  */
1906 static inline void __attribute__((always_inline))
1907 virtio_dev_tx_zcp(struct virtio_net *dev)
1908 {
1909 	struct rte_mbuf m;
1910 	struct vhost_virtqueue *vq;
1911 	struct vring_desc *desc;
1912 	uint64_t buff_addr = 0, phys_addr;
1913 	uint32_t head[MAX_PKT_BURST];
1914 	uint32_t i;
1915 	uint16_t free_entries, packet_success = 0;
1916 	uint16_t avail_idx;
1917 	uint8_t need_copy = 0;
1918 	hpa_type addr_type;
1919 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1920 
1921 	vq = dev->virtqueue[VIRTIO_TXQ];
1922 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1923 
1924 	/* If there are no available buffers then return. */
1925 	if (vq->last_used_idx_res == avail_idx)
1926 		return;
1927 
1928 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1929 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1930 	/* Prefetch available ring to retrieve head indexes. */
1931 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1932 
1933 	/* Get the number of free entries in the ring */
1934 	free_entries = (avail_idx - vq->last_used_idx_res);
1935 
1936 	/* Limit to MAX_PKT_BURST. */
1937 	free_entries
1938 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1939 
1940 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1941 		dev->device_fh, free_entries);
1942 
1943 	/* Retrieve all of the head indexes first to avoid caching issues. */
1944 	for (i = 0; i < free_entries; i++)
1945 		head[i]
1946 			= vq->avail->ring[(vq->last_used_idx_res + i)
1947 			& (vq->size - 1)];
1948 
1949 	vq->last_used_idx_res += free_entries;
1950 
1951 	/* Prefetch descriptor index. */
1952 	rte_prefetch0(&vq->desc[head[packet_success]]);
1953 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1954 
1955 	while (packet_success < free_entries) {
1956 		desc = &vq->desc[head[packet_success]];
1957 
1958 		/* Discard first buffer as it is the virtio header */
1959 		desc = &vq->desc[desc->next];
1960 
1961 		/* Buffer address translation. */
1962 		buff_addr = gpa_to_vva(dev, desc->addr);
1963 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1964 
1965 		if (likely(packet_success < (free_entries - 1)))
1966 			/* Prefetch descriptor index. */
1967 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1968 
1969 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1970 			RTE_LOG(ERR, VHOST_DATA,
1971 				"(%"PRIu64") Invalid frame buffer address found "
1972 				"when transmitting packets!\n",
1973 				dev->device_fh);
1974 			packet_success++;
1975 			continue;
1976 		}
1977 
1978 		/* Prefetch buffer address. */
1979 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1980 
1981 		/*
1982 		 * Setup dummy mbuf. This is copied to a real mbuf if
1983 		 * transmitted out the physical port.
1984 		 */
1985 		m.data_len = desc->len;
1986 		m.nb_segs = 1;
1987 		m.next = NULL;
1988 		m.data_off = 0;
1989 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1990 		m.buf_physaddr = phys_addr;
1991 
1992 		/*
1993 		 * Check if the frame buffer address from guest crosses
1994 		 * sub-region or not.
1995 		 */
1996 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1997 			RTE_LOG(ERR, VHOST_DATA,
1998 				"(%"PRIu64") Frame buffer address crossing a "
1999 				"sub-region found when attaching TX frame "
2000 				"buffer address!\n",
2001 				dev->device_fh);
2002 			need_copy = 1;
2003 		} else
2004 			need_copy = 0;
2005 
2006 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2007 
2008 		/*
2009 		 * If this is the first received packet we need to learn
2010 		 * the MAC and setup VMDQ
2011 		 */
2012 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2013 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2014 				/*
2015 				 * Discard frame if device is scheduled for
2016 				 * removal or a duplicate MAC address is found.
2017 				 */
2018 				packet_success += free_entries;
2019 				vq->last_used_idx += packet_success;
2020 				break;
2021 			}
2022 		}
2023 
2024 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2025 		packet_success++;
2026 	}
2027 }
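
/*
 * Illustrative sketch, not part of the original example: the number of
 * buffers made available by the guest is computed with plain 16-bit
 * arithmetic, which wraps correctly because both indexes are uint16_t, and
 * is then clamped to the burst size. The helper name avail_burst_size is an
 * assumption used only for illustration.
 */
static inline uint16_t __attribute__((unused))
avail_burst_size(uint16_t avail_idx, uint16_t last_used_idx_res)
{
	uint16_t free_entries = (uint16_t)(avail_idx - last_used_idx_res);

	return (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
}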
2028 
2029 /*
2030  * This function is called by each data core. It handles all RX/TX registered
2031  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2032  * addresses are compared with all devices in the main linked list.
2033  */
2034 static int
2035 switch_worker_zcp(__attribute__((unused)) void *arg)
2036 {
2037 	struct virtio_net *dev = NULL;
2038 	struct vhost_dev  *vdev = NULL;
2039 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2040 	struct virtio_net_data_ll *dev_ll;
2041 	struct mbuf_table *tx_q;
2042 	volatile struct lcore_ll_info *lcore_ll;
2043 	const uint64_t drain_tsc
2044 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2045 		* BURST_TX_DRAIN_US;
2046 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2047 	unsigned ret;
2048 	const uint16_t lcore_id = rte_lcore_id();
2049 	uint16_t count_in_ring, rx_count = 0;
2050 
2051 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2052 
2053 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2054 	prev_tsc = 0;
2055 
2056 	while (1) {
2057 		cur_tsc = rte_rdtsc();
2058 
2059 		/* TX burst queue drain */
2060 		diff_tsc = cur_tsc - prev_tsc;
2061 		if (unlikely(diff_tsc > drain_tsc)) {
2062 			/*
2063 			 * Get mbuf from vpool.pool and detach mbuf and
2064 			 * put back into vpool.ring.
2065 			 */
2066 			dev_ll = lcore_ll->ll_root_used;
2067 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2068 				/* Get virtio device ID */
2069 				vdev = dev_ll->vdev;
2070 				dev = vdev->dev;
2071 
2072 				if (likely(!vdev->remove)) {
2073 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2074 					if (tx_q->len) {
2075 						LOG_DEBUG(VHOST_DATA,
2076 						"TX queue drained after timeout"
2077 						" with burst size %u\n",
2078 						tx_q->len);
2079 
2080 						/*
2081 						 * Tx any packets in the queue
2082 						 */
2083 						ret = rte_eth_tx_burst(
2084 							ports[0],
2085 							(uint16_t)tx_q->txq_id,
2086 							(struct rte_mbuf **)
2087 							tx_q->m_table,
2088 							(uint16_t)tx_q->len);
2089 						if (unlikely(ret < tx_q->len)) {
2090 							do {
2091 								rte_pktmbuf_free(
2092 									tx_q->m_table[ret]);
2093 							} while (++ret < tx_q->len);
2094 						}
2095 						tx_q->len = 0;
2096 
2097 						txmbuf_clean_zcp(dev,
2098 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2099 					}
2100 				}
2101 				dev_ll = dev_ll->next;
2102 			}
2103 			prev_tsc = cur_tsc;
2104 		}
2105 
2106 		rte_prefetch0(lcore_ll->ll_root_used);
2107 
2108 		/*
2109 		 * Inform the configuration core that we have exited the linked
2110 		 * list and that no devices are in use if requested.
2111 		 */
2112 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2113 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2114 
2115 		/* Process devices */
2116 		dev_ll = lcore_ll->ll_root_used;
2117 
2118 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2119 			vdev = dev_ll->vdev;
2120 			dev  = vdev->dev;
2121 			if (unlikely(vdev->remove)) {
2122 				dev_ll = dev_ll->next;
2123 				unlink_vmdq(vdev);
2124 				vdev->ready = DEVICE_SAFE_REMOVE;
2125 				continue;
2126 			}
2127 
2128 			if (likely(vdev->ready == DEVICE_RX)) {
2129 				uint32_t index = vdev->vmdq_rx_q;
2130 				uint16_t i;
2131 				count_in_ring
2132 				= rte_ring_count(vpool_array[index].ring);
2133 				uint16_t free_entries
2134 				= (uint16_t)get_available_ring_num_zcp(dev);
2135 
2136 				/*
2137 				 * Attach all mbufs in vpool.ring and put back
2138 				 * into vpool.pool.
2139 				 */
2140 				for (i = 0;
2141 				i < RTE_MIN(free_entries,
2142 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2143 				i++)
2144 					attach_rxmbuf_zcp(dev);
2145 
2146 				/* Handle guest RX */
2147 				rx_count = rte_eth_rx_burst(ports[0],
2148 					vdev->vmdq_rx_q, pkts_burst,
2149 					MAX_PKT_BURST);
2150 
2151 				if (rx_count) {
2152 					ret_count = virtio_dev_rx_zcp(dev,
2153 							pkts_burst, rx_count);
2154 					if (enable_stats) {
2155 						dev_statistics[dev->device_fh].rx_total
2156 							+= rx_count;
2157 						dev_statistics[dev->device_fh].rx
2158 							+= ret_count;
2159 					}
2160 					while (likely(rx_count)) {
2161 						rx_count--;
2162 						pktmbuf_detach_zcp(
2163 							pkts_burst[rx_count]);
2164 						rte_ring_sp_enqueue(
2165 							vpool_array[index].ring,
2166 							(void *)pkts_burst[rx_count]);
2167 					}
2168 				}
2169 			}
2170 
2171 			if (likely(!vdev->remove))
2172 				/* Handle guest TX */
2173 				virtio_dev_tx_zcp(dev);
2174 
2175 			/* Move to the next device in the list */
2176 			dev_ll = dev_ll->next;
2177 		}
2178 	}
2179 
2180 	return 0;
2181 }
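
/*
 * Illustrative sketch, not part of the original example: the TX drain
 * interval in the loop above converts BURST_TX_DRAIN_US into TSC cycles
 * once, and a simple "elapsed > interval" check on rte_rdtsc() then decides
 * when to flush the queued packets. The helper name tx_drain_due is an
 * assumption used only for illustration.
 */
static inline int __attribute__((unused))
tx_drain_due(uint64_t cur_tsc, uint64_t prev_tsc)
{
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
		/ US_PER_S * BURST_TX_DRAIN_US;

	return (cur_tsc - prev_tsc) > drain_tsc;
}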
2182 
2183 
2184 /*
2185  * Add an entry to a used linked list. A free entry must first be found
2186  * in the free linked list using get_data_ll_free_entry();
2187  */
2188 static void
2189 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2190 	struct virtio_net_data_ll *ll_dev)
2191 {
2192 	struct virtio_net_data_ll *ll = *ll_root_addr;
2193 
2194 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2195 	ll_dev->next = NULL;
2196 	rte_compiler_barrier();
2197 
2198 	/* If ll == NULL then this is the first device. */
2199 	if (ll) {
2200 		/* Increment to the tail of the linked list. */
2201 		while ((ll->next != NULL) )
2202 			ll = ll->next;
2203 
2204 		ll->next = ll_dev;
2205 	} else {
2206 		*ll_root_addr = ll_dev;
2207 	}
2208 }
2209 
2210 /*
2211  * Remove an entry from a used linked list. The entry must then be added to
2212  * the free linked list using put_data_ll_free_entry().
2213  */
2214 static void
2215 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2216 	struct virtio_net_data_ll *ll_dev,
2217 	struct virtio_net_data_ll *ll_dev_last)
2218 {
2219 	struct virtio_net_data_ll *ll = *ll_root_addr;
2220 
2221 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2222 		return;
2223 
2224 	if (ll_dev == ll)
2225 		*ll_root_addr = ll_dev->next;
2226 	else
2227 		if (likely(ll_dev_last != NULL))
2228 			ll_dev_last->next = ll_dev->next;
2229 		else
2230 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2231 }
2232 
2233 /*
2234  * Find and return an entry from the free linked list.
2235  */
2236 static struct virtio_net_data_ll *
2237 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2238 {
2239 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2240 	struct virtio_net_data_ll *ll_dev;
2241 
2242 	if (ll_free == NULL)
2243 		return NULL;
2244 
2245 	ll_dev = ll_free;
2246 	*ll_root_addr = ll_free->next;
2247 
2248 	return ll_dev;
2249 }
2250 
2251 /*
2252  * Place an entry back on to the free linked list.
2253  */
2254 static void
2255 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2256 	struct virtio_net_data_ll *ll_dev)
2257 {
2258 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2259 
2260 	if (ll_dev == NULL)
2261 		return;
2262 
2263 	ll_dev->next = ll_free;
2264 	*ll_root_addr = ll_dev;
2265 }
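
/*
 * Illustrative sketch, not part of the original example: the linked list
 * helpers above are used as a pair of pools -- take an entry from the free
 * list, fill it in, then publish it on the used list, and reverse the steps
 * on removal. The helper name ll_add_vdev is an assumption used only to
 * illustrate the add half of that protocol.
 */
static int __attribute__((unused))
ll_add_vdev(struct virtio_net_data_ll **free_root,
	struct virtio_net_data_ll **used_root, struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *ll_dev = get_data_ll_free_entry(free_root);

	/* The free list is exhausted, no more devices can be added. */
	if (ll_dev == NULL)
		return -1;

	ll_dev->vdev = vdev;
	add_data_ll_entry(used_root, ll_dev);
	return 0;
}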
2266 
2267 /*
2268  * Creates a linked list of a given size.
2269  */
2270 static struct virtio_net_data_ll *
2271 alloc_data_ll(uint32_t size)
2272 {
2273 	struct virtio_net_data_ll *ll_new;
2274 	uint32_t i;
2275 
2276 	/* Malloc and then chain the linked list. */
2277 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2278 	if (ll_new == NULL) {
2279 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2280 		return NULL;
2281 	}
2282 
2283 	for (i = 0; i < size - 1; i++) {
2284 		ll_new[i].vdev = NULL;
2285 		ll_new[i].next = &ll_new[i+1];
2286 	}
2287 	ll_new[i].next = NULL;
2288 
2289 	return (ll_new);
2290 }
2291 
2292 /*
2293  * Create the main linked list along with each individual core's linked list. A used and a free list
2294  * are created to manage entries.
2295  */
2296 static int
2297 init_data_ll (void)
2298 {
2299 	int lcore;
2300 
2301 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2302 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2303 		if (lcore_info[lcore].lcore_ll == NULL) {
2304 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2305 			return -1;
2306 		}
2307 
2308 		lcore_info[lcore].lcore_ll->device_num = 0;
2309 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2310 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2311 		if (num_devices % num_switching_cores)
2312 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2313 		else
2314 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2315 	}
2316 
2317 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2318 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2319 
2320 	return 0;
2321 }
2322 
2323 /*
2324  * Set virtqueue flags so that we do not receive interrupts.
2325  */
2326 static void
2327 set_irq_status (struct virtio_net *dev)
2328 {
2329 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2330 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2331 }
2332 
2333 /*
2334  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2335  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2336  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2337  */
2338 static void
2339 destroy_device (volatile struct virtio_net *dev)
2340 {
2341 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2342 	struct virtio_net_data_ll *ll_main_dev_cur;
2343 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2344 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2345 	struct vhost_dev *vdev;
2346 	int lcore;
2347 
2348 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2349 
2350 	vdev = (struct vhost_dev *)dev->priv;
2351 	/*set the remove flag. */
2352 	vdev->remove = 1;
2353 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2354 		rte_pause();
2355 	}
2356 
2357 	/* Search for entry to be removed from lcore ll */
2358 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2359 	while (ll_lcore_dev_cur != NULL) {
2360 		if (ll_lcore_dev_cur->vdev == vdev) {
2361 			break;
2362 		} else {
2363 			ll_lcore_dev_last = ll_lcore_dev_cur;
2364 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2365 		}
2366 	}
2367 
2368 	if (ll_lcore_dev_cur == NULL) {
2369 		RTE_LOG(ERR, VHOST_CONFIG,
2370 			"(%"PRIu64") Failed to find the device to be destroyed.\n",
2371 			dev->device_fh);
2372 		return;
2373 	}
2374 
2375 	/* Search for entry to be removed from main ll */
2376 	ll_main_dev_cur = ll_root_used;
2377 	ll_main_dev_last = NULL;
2378 	while (ll_main_dev_cur != NULL) {
2379 		if (ll_main_dev_cur->vdev == vdev) {
2380 			break;
2381 		} else {
2382 			ll_main_dev_last = ll_main_dev_cur;
2383 			ll_main_dev_cur = ll_main_dev_cur->next;
2384 		}
2385 	}
2386 
2387 	/* Remove entries from the lcore and main ll. */
2388 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2389 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2390 
2391 	/* Set the dev_removal_flag on each lcore. */
2392 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2393 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2394 	}
2395 
2396 	/*
2397 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2398 	 * they can no longer access the device removed from the linked lists and that the devices
2399 	 * are no longer in use.
2400 	 */
2401 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2402 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2403 			rte_pause();
2404 		}
2405 	}
2406 
2407 	/* Add the entries back to the lcore and main free ll.*/
2408 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2409 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2410 
2411 	/* Decrement number of device on the lcore. */
2412 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2413 
2414 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2415 
2416 	if (zero_copy) {
2417 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2418 
2419 		/* Stop the RX queue. */
2420 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2421 			LOG_DEBUG(VHOST_CONFIG,
2422 				"(%"PRIu64") In destroy_device: Failed to stop "
2423 				"rx queue:%d\n",
2424 				dev->device_fh,
2425 				vdev->vmdq_rx_q);
2426 		}
2427 
2428 		LOG_DEBUG(VHOST_CONFIG,
2429 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2430 			"mempool back to ring for RX queue: %d\n",
2431 			dev->device_fh, vdev->vmdq_rx_q);
2432 
2433 		mbuf_destroy_zcp(vpool);
2434 
2435 		/* Stop the TX queue. */
2436 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2437 			LOG_DEBUG(VHOST_CONFIG,
2438 				"(%"PRIu64") In destroy_device: Failed to "
2439 				"stop tx queue:%d\n",
2440 				dev->device_fh, vdev->vmdq_rx_q);
2441 		}
2442 
2443 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2444 
2445 		LOG_DEBUG(VHOST_CONFIG,
2446 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2447 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2448 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2449 			dev->device_fh);
2450 
2451 		mbuf_destroy_zcp(vpool);
2452 		rte_free(vdev->regions_hpa);
2453 	}
2454 	rte_free(vdev);
2455 
2456 }
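
/*
 * Illustrative sketch, not part of the original example: device removal is
 * synchronised without locks -- the configuration core raises
 * REQUEST_DEV_REMOVAL on every data core and spins until each core, at a
 * point where it holds no linked list pointers, answers with
 * ACK_DEV_REMOVAL. The helper name wait_for_dev_removal_ack is an assumption
 * used only to isolate that handshake.
 */
static void __attribute__((unused))
wait_for_dev_removal_ack(void)
{
	int lcore;

	/* Ask every data core to acknowledge once it leaves the lists. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag
			= REQUEST_DEV_REMOVAL;
	}

	/* Wait until every data core has acknowledged the request. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag
			!= ACK_DEV_REMOVAL)
			rte_pause();
	}
}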
2457 
2458 /*
2459  * Calculate the number of physically contiguous sub-regions within one
2460  * particular region whose vhost virtual address range is contiguous. The
2461  * region starts at vva_start and is 'size' bytes long.
2462  */
2463 static uint32_t
2464 check_hpa_regions(uint64_t vva_start, uint64_t size)
2465 {
2466 	uint32_t i, nregions = 0, page_size = getpagesize();
2467 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2468 	if (vva_start % page_size) {
2469 		LOG_DEBUG(VHOST_CONFIG,
2470 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2471 			"has remainder\n",
2472 			(void *)(uintptr_t)vva_start, page_size);
2473 		return 0;
2474 	}
2475 	if (size % page_size) {
2476 		LOG_DEBUG(VHOST_CONFIG,
2477 			"in check_hpa_regions: "
2478 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2479 			size, page_size);
2480 		return 0;
2481 	}
2482 	for (i = 0; i < size - page_size; i = i + page_size) {
2483 		cur_phys_addr
2484 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2485 		next_phys_addr = rte_mem_virt2phy(
2486 			(void *)(uintptr_t)(vva_start + i + page_size));
2487 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2488 			++nregions;
2489 			LOG_DEBUG(VHOST_CONFIG,
2490 				"in check_hpa_regions: hva addr:(%p) is not "
2491 				"continuous with hva addr:(%p), diff:%d\n",
2492 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2493 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2494 				+ page_size), page_size);
2495 			LOG_DEBUG(VHOST_CONFIG,
2496 				"in check_hpa_regions: hpa addr:(%p) is not "
2497 				"continuous with hpa addr:(%p), "
2498 				"diff:(%"PRIu64")\n",
2499 				(void *)(uintptr_t)cur_phys_addr,
2500 				(void *)(uintptr_t)next_phys_addr,
2501 				(next_phys_addr-cur_phys_addr));
2502 		}
2503 	}
2504 	return nregions;
2505 }
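
/*
 * Illustrative sketch, not part of the original example: two host pages are
 * physically contiguous exactly when the physical address of the second page
 * equals that of the first plus the page size, which is the test the loop
 * above applies per page. The helper name pages_are_contiguous is an
 * assumption used only for illustration.
 */
static inline int __attribute__((unused))
pages_are_contiguous(uint64_t vva, uint32_t page_size)
{
	uint64_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
	uint64_t next = rte_mem_virt2phy(
			(void *)(uintptr_t)(vva + page_size));

	return (cur + page_size) == next;
}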
2506 
2507 /*
2508  * Divide each region whose vhost virtual address range is contiguous into
2509  * sub-regions whose physical addresses are also contiguous, and fill the
2510  * offset (to GPA), size and related information of each sub-region into
2511  * regions_hpa.
2512  */
2513 static uint32_t
2514 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2515 {
2516 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2517 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2518 
2519 	if (mem_region_hpa == NULL)
2520 		return 0;
2521 
2522 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2523 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2524 			virtio_memory->regions[regionidx].address_offset;
2525 		mem_region_hpa[regionidx_hpa].guest_phys_address
2526 			= virtio_memory->regions[regionidx].guest_phys_address;
2527 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2528 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2529 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2530 		LOG_DEBUG(VHOST_CONFIG,
2531 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2532 			regionidx_hpa,
2533 			(void *)(uintptr_t)
2534 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2535 		LOG_DEBUG(VHOST_CONFIG,
2536 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2537 			regionidx_hpa,
2538 			(void *)(uintptr_t)
2539 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2540 		for (i = 0, k = 0;
2541 			i < virtio_memory->regions[regionidx].memory_size -
2542 				page_size;
2543 			i += page_size) {
2544 			cur_phys_addr = rte_mem_virt2phy(
2545 					(void *)(uintptr_t)(vva_start + i));
2546 			next_phys_addr = rte_mem_virt2phy(
2547 					(void *)(uintptr_t)(vva_start +
2548 					i + page_size));
2549 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2550 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2551 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2552 					k + page_size;
2553 				mem_region_hpa[regionidx_hpa].memory_size
2554 					= k + page_size;
2555 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2556 					"phys addr end  [%d]:(%p)\n",
2557 					regionidx_hpa,
2558 					(void *)(uintptr_t)
2559 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2560 				LOG_DEBUG(VHOST_CONFIG,
2561 					"in fill_hpa_regions: guest phys addr "
2562 					"size [%d]:(%p)\n",
2563 					regionidx_hpa,
2564 					(void *)(uintptr_t)
2565 					(mem_region_hpa[regionidx_hpa].memory_size));
2566 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2567 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2568 				++regionidx_hpa;
2569 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2570 					next_phys_addr -
2571 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2572 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2573 					" phys addr start[%d]:(%p)\n",
2574 					regionidx_hpa,
2575 					(void *)(uintptr_t)
2576 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2577 				LOG_DEBUG(VHOST_CONFIG,
2578 					"in fill_hpa_regions: host  phys addr "
2579 					"start[%d]:(%p)\n",
2580 					regionidx_hpa,
2581 					(void *)(uintptr_t)
2582 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2583 				k = 0;
2584 			} else {
2585 				k += page_size;
2586 			}
2587 		}
2588 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2589 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2590 			+ k + page_size;
2591 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2592 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2593 			"[%d]:(%p)\n", regionidx_hpa,
2594 			(void *)(uintptr_t)
2595 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2596 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2597 			"[%d]:(%p)\n", regionidx_hpa,
2598 			(void *)(uintptr_t)
2599 			(mem_region_hpa[regionidx_hpa].memory_size));
2600 		++regionidx_hpa;
2601 	}
2602 	return regionidx_hpa;
2603 }
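
/*
 * Illustrative sketch, not part of the original example: check_hpa_regions()
 * and fill_hpa_memory_regions() are used as a pair -- the first sizes the
 * sub-region table, the second fills it, and new_device() below treats any
 * count mismatch as an error. The helper name count_hpa_regions is an
 * assumption showing the sizing half for a whole guest memory map.
 */
static uint32_t __attribute__((unused))
count_hpa_regions(struct virtio_memory *mem)
{
	uint32_t idx, nregions = mem->nregions;

	/*
	 * Every region contributes at least one sub-region, plus one more
	 * for each physical discontinuity found inside it.
	 */
	for (idx = 0; idx < mem->nregions; idx++)
		nregions += check_hpa_regions(
			mem->regions[idx].guest_phys_address
			+ mem->regions[idx].address_offset,
			mem->regions[idx].memory_size);

	return nregions;
}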
2604 
2605 /*
2606  * A new device is added to a data core. First the device is added to the main linked list
2607  * and then allocated to a specific data core.
2608  */
2609 static int
2610 new_device (struct virtio_net *dev)
2611 {
2612 	struct virtio_net_data_ll *ll_dev;
2613 	int lcore, core_add = 0;
2614 	uint32_t device_num_min = num_devices;
2615 	struct vhost_dev *vdev;
2616 	uint32_t regionidx;
2617 
2618 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2619 	if (vdev == NULL) {
2620 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2621 			dev->device_fh);
2622 		return -1;
2623 	}
2624 	vdev->dev = dev;
2625 	dev->priv = vdev;
2626 
2627 	if (zero_copy) {
2628 		vdev->nregions_hpa = dev->mem->nregions;
2629 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2630 			vdev->nregions_hpa
2631 				+= check_hpa_regions(
2632 					dev->mem->regions[regionidx].guest_phys_address
2633 					+ dev->mem->regions[regionidx].address_offset,
2634 					dev->mem->regions[regionidx].memory_size);
2635 
2636 		}
2637 
2638 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2639 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2640 			CACHE_LINE_SIZE);
2641 		if (vdev->regions_hpa == NULL) {
2642 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2643 			rte_free(vdev);
2644 			return -1;
2645 		}
2646 
2647 
2648 		if (fill_hpa_memory_regions(
2649 			vdev->regions_hpa, dev->mem
2650 			) != vdev->nregions_hpa) {
2651 
2652 			RTE_LOG(ERR, VHOST_CONFIG,
2653 				"hpa memory regions number mismatch: "
2654 				"[%d]\n", vdev->nregions_hpa);
2655 			rte_free(vdev->regions_hpa);
2656 			rte_free(vdev);
2657 			return -1;
2658 		}
2659 	}
2660 
2661 
2662 	/* Add device to main ll */
2663 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2664 	if (ll_dev == NULL) {
2665 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2666 			"of %d devices per core has been reached\n",
2667 			dev->device_fh, num_devices);
2668 		if (vdev->regions_hpa)
2669 			rte_free(vdev->regions_hpa);
2670 		rte_free(vdev);
2671 		return -1;
2672 	}
2673 	ll_dev->vdev = vdev;
2674 	add_data_ll_entry(&ll_root_used, ll_dev);
2675 	vdev->vmdq_rx_q
2676 		= dev->device_fh * (num_queues / num_devices);
2677 
2678 	if (zero_copy) {
2679 		uint32_t index = vdev->vmdq_rx_q;
2680 		uint32_t count_in_ring, i;
2681 		struct mbuf_table *tx_q;
2682 
2683 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2684 
2685 		LOG_DEBUG(VHOST_CONFIG,
2686 			"(%"PRIu64") in new_device: mbuf count in mempool "
2687 			"before attach is: %d\n",
2688 			dev->device_fh,
2689 			rte_mempool_count(vpool_array[index].pool));
2690 		LOG_DEBUG(VHOST_CONFIG,
2691 			"(%"PRIu64") in new_device: mbuf count in ring "
2692 			"before attach is: %d\n",
2693 			dev->device_fh, count_in_ring);
2694 
2695 		/*
2696 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2697 		 */
2698 		for (i = 0; i < count_in_ring; i++)
2699 			attach_rxmbuf_zcp(dev);
2700 
2701 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2702 			"mempool after attach is: %d\n",
2703 			dev->device_fh,
2704 			rte_mempool_count(vpool_array[index].pool));
2705 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2706 			"ring after attach is: %d\n",
2707 			dev->device_fh,
2708 			rte_ring_count(vpool_array[index].ring));
2709 
2710 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2711 		tx_q->txq_id = vdev->vmdq_rx_q;
2712 
2713 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2714 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2715 
2716 			LOG_DEBUG(VHOST_CONFIG,
2717 				"(%"PRIu64") In new_device: Failed to start "
2718 				"tx queue:%d\n",
2719 				dev->device_fh, vdev->vmdq_rx_q);
2720 
2721 			mbuf_destroy_zcp(vpool);
2722 			rte_free(vdev->regions_hpa);
2723 			rte_free(vdev);
2724 			return -1;
2725 		}
2726 
2727 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2728 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2729 
2730 			LOG_DEBUG(VHOST_CONFIG,
2731 				"(%"PRIu64") In new_device: Failed to start "
2732 				"rx queue:%d\n",
2733 				dev->device_fh, vdev->vmdq_rx_q);
2734 
2735 			/* Stop the TX queue. */
2736 			if (rte_eth_dev_tx_queue_stop(ports[0],
2737 				vdev->vmdq_rx_q) != 0) {
2738 				LOG_DEBUG(VHOST_CONFIG,
2739 					"(%"PRIu64") In new_device: Failed to "
2740 					"stop tx queue:%d\n",
2741 					dev->device_fh, vdev->vmdq_rx_q);
2742 			}
2743 
2744 			mbuf_destroy_zcp(vpool);
2745 			rte_free(vdev->regions_hpa);
2746 			rte_free(vdev);
2747 			return -1;
2748 		}
2749 
2750 	}
2751 
2752 	/*reset ready flag*/
2753 	vdev->ready = DEVICE_MAC_LEARNING;
2754 	vdev->remove = 0;
2755 
2756 	/* Find a suitable lcore to add the device. */
2757 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2758 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2759 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2760 			core_add = lcore;
2761 		}
2762 	}
2763 	/* Add device to lcore ll */
2764 	ll_dev->dev->coreid = core_add;
2765 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2766 	if (ll_dev == NULL) {
2767 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2768 		vdev->ready = DEVICE_SAFE_REMOVE;
2769 		destroy_device(dev);
2770 		if (vdev->regions_hpa)
2771 			rte_free(vdev->regions_hpa);
2772 		rte_free(vdev);
2773 		return -1;
2774 	}
2775 	ll_dev->vdev = vdev;
2776 	vdev->coreid = core_add;
2777 
2778 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2779 
2780 	/* Initialize device stats */
2781 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2782 
2783 	/* Disable notifications. */
2784 	set_irq_status(dev);
2785 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2786 	dev->flags |= VIRTIO_DEV_RUNNING;
2787 
2788 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2789 
2790 	return 0;
2791 }
2792 
2793 /*
2794  * These callbacks allow devices to be added to the data core when
2795  * configuration has been fully completed.
2796  */
2797 static const struct virtio_net_device_ops virtio_net_device_ops =
2798 {
2799 	.new_device =  new_device,
2800 	.destroy_device = destroy_device,
2801 };
2802 
2803 /*
2804  * This is a thread that wakes up after a period to print stats if the user has
2805  * enabled them.
2806  */
2807 static void
2808 print_stats(void)
2809 {
2810 	struct virtio_net_data_ll *dev_ll;
2811 	uint64_t tx_dropped, rx_dropped;
2812 	uint64_t tx, tx_total, rx, rx_total;
2813 	uint32_t device_fh;
2814 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2815 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2816 
2817 	while(1) {
2818 		sleep(enable_stats);
2819 
2820 		/* Clear screen and move to top left */
2821 		printf("%s%s", clr, top_left);
2822 
2823 		printf("\nDevice statistics ====================================");
2824 
2825 		dev_ll = ll_root_used;
2826 		while (dev_ll != NULL) {
2827 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2828 			tx_total = dev_statistics[device_fh].tx_total;
2829 			tx = dev_statistics[device_fh].tx;
2830 			tx_dropped = tx_total - tx;
2831 			if (zero_copy == 0) {
2832 				rx_total = rte_atomic64_read(
2833 					&dev_statistics[device_fh].rx_total_atomic);
2834 				rx = rte_atomic64_read(
2835 					&dev_statistics[device_fh].rx_atomic);
2836 			} else {
2837 				rx_total = dev_statistics[device_fh].rx_total;
2838 				rx = dev_statistics[device_fh].rx;
2839 			}
2840 			rx_dropped = rx_total - rx;
2841 
2842 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2843 					"\nTX total: 		%"PRIu64""
2844 					"\nTX dropped: 		%"PRIu64""
2845 					"\nTX successful: 		%"PRIu64""
2846 					"\nRX total: 		%"PRIu64""
2847 					"\nRX dropped: 		%"PRIu64""
2848 					"\nRX successful: 		%"PRIu64"",
2849 					device_fh,
2850 					tx_total,
2851 					tx_dropped,
2852 					tx,
2853 					rx_total,
2854 					rx_dropped,
2855 					rx);
2856 
2857 			dev_ll = dev_ll->next;
2858 		}
2859 		printf("\n======================================================\n");
2860 	}
2861 }
2862 
2863 static void
2864 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2865 	char *ring_name, uint32_t nb_mbuf)
2866 {
2867 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2868 	vpool_array[index].pool
2869 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2870 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2871 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2872 		rte_pktmbuf_init, NULL, socket, 0);
2873 	if (vpool_array[index].pool != NULL) {
2874 		vpool_array[index].ring
2875 			= rte_ring_create(ring_name,
2876 				rte_align32pow2(nb_mbuf + 1),
2877 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2878 		if (likely(vpool_array[index].ring != NULL)) {
2879 			LOG_DEBUG(VHOST_CONFIG,
2880 				"in setup_mempool_tbl: mbuf count in "
2881 				"mempool is: %d\n",
2882 				rte_mempool_count(vpool_array[index].pool));
2883 			LOG_DEBUG(VHOST_CONFIG,
2884 				"in setup_mempool_tbl: mbuf count in "
2885 				"ring   is: %d\n",
2886 				rte_ring_count(vpool_array[index].ring));
2887 		} else {
2888 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2889 				ring_name);
2890 		}
2891 
2892 		/* Need to take the headroom into account. */
2893 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2894 	} else {
2895 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2896 	}
2897 }
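
/*
 * Illustrative sketch, not part of the original example: in zero copy mode
 * each VMDq queue gets two vpool slots -- index q for RX and index
 * q + MAX_QUEUES for TX -- which is the convention used by the data path
 * above and by the zero copy branch of MAIN() below. The helper name
 * setup_zcp_queue_pools is an assumption; the pool/ring name formats mirror
 * the ones used in MAIN().
 */
static void __attribute__((unused))
setup_zcp_queue_pools(int socket, uint32_t q, uint32_t nb_mbuf)
{
	char pool_name[RTE_MEMPOOL_NAMESIZE];
	char ring_name[RTE_MEMPOOL_NAMESIZE];

	/* RX mempool and ring for this queue. */
	snprintf(pool_name, sizeof(pool_name), "rxmbuf_pool_%u", q);
	snprintf(ring_name, sizeof(ring_name), "rxmbuf_ring_%u", q);
	setup_mempool_tbl(socket, q, pool_name, ring_name, nb_mbuf);

	/* TX mempool and ring for the same queue, offset by MAX_QUEUES. */
	snprintf(pool_name, sizeof(pool_name), "txmbuf_pool_%u", q);
	snprintf(ring_name, sizeof(ring_name), "txmbuf_ring_%u", q);
	setup_mempool_tbl(socket, q + MAX_QUEUES, pool_name, ring_name,
		nb_mbuf);
}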
2898 
2899 
2900 /*
2901  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2902  * device is also registered here to handle the IOCTLs.
2903  */
2904 int
2905 MAIN(int argc, char *argv[])
2906 {
2907 	struct rte_mempool *mbuf_pool = NULL;
2908 	unsigned lcore_id, core_id = 0;
2909 	unsigned nb_ports, valid_num_ports;
2910 	int ret;
2911 	uint8_t portid, queue_id = 0;
2912 	static pthread_t tid;
2913 
2914 	/* init EAL */
2915 	ret = rte_eal_init(argc, argv);
2916 	if (ret < 0)
2917 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2918 	argc -= ret;
2919 	argv += ret;
2920 
2921 	/* parse app arguments */
2922 	ret = us_vhost_parse_args(argc, argv);
2923 	if (ret < 0)
2924 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2925 
2926 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2927 		if (rte_lcore_is_enabled(lcore_id))
2928 			lcore_ids[core_id ++] = lcore_id;
2929 
2930 	if (rte_lcore_count() > RTE_MAX_LCORE)
2931 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2932 
2933 	/* Set the number of switching cores available. */
2934 	num_switching_cores = rte_lcore_count()-1;
2935 
2936 	/* Get the number of physical ports. */
2937 	nb_ports = rte_eth_dev_count();
2938 	if (nb_ports > RTE_MAX_ETHPORTS)
2939 		nb_ports = RTE_MAX_ETHPORTS;
2940 
2941 	/*
2942 	 * Update the global var NUM_PORTS and global array PORTS,
2943 	 * and get the value of VALID_NUM_PORTS according to the number of system ports
2944 	 */
2945 	valid_num_ports = check_ports_num(nb_ports);
2946 
2947 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2948 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2949 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2950 		return -1;
2951 	}
2952 
2953 	if (zero_copy == 0) {
2954 		/* Create the mbuf pool. */
2955 		mbuf_pool = rte_mempool_create(
2956 				"MBUF_POOL",
2957 				NUM_MBUFS_PER_PORT
2958 				* valid_num_ports,
2959 				MBUF_SIZE, MBUF_CACHE_SIZE,
2960 				sizeof(struct rte_pktmbuf_pool_private),
2961 				rte_pktmbuf_pool_init, NULL,
2962 				rte_pktmbuf_init, NULL,
2963 				rte_socket_id(), 0);
2964 		if (mbuf_pool == NULL)
2965 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2966 
2967 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2968 			vpool_array[queue_id].pool = mbuf_pool;
2969 
2970 		if (vm2vm_mode == VM2VM_HARDWARE) {
2971 			/* Enable VT loop back to let L2 switch to do it. */
2972 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2973 			LOG_DEBUG(VHOST_CONFIG,
2974 				"Enable loop back for L2 switch in vmdq.\n");
2975 		}
2976 	} else {
2977 		uint32_t nb_mbuf;
2978 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2979 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2980 
2981 		/*
2982 		 * Zero copy defers queue RX/TX start to the time when guest
2983 		 * finishes its startup and packet buffers from that guest are
2984 		 * available.
2985 		 */
2986 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2987 		rx_conf_default.rx_drop_en = 0;
2988 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2989 		nb_mbuf = num_rx_descriptor
2990 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2991 			+ num_switching_cores * MAX_PKT_BURST;
2992 
2993 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2994 			snprintf(pool_name, sizeof(pool_name),
2995 				"rxmbuf_pool_%u", queue_id);
2996 			snprintf(ring_name, sizeof(ring_name),
2997 				"rxmbuf_ring_%u", queue_id);
2998 			setup_mempool_tbl(rte_socket_id(), queue_id,
2999 				pool_name, ring_name, nb_mbuf);
3000 		}
3001 
3002 		nb_mbuf = num_tx_descriptor
3003 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3004 				+ num_switching_cores * MAX_PKT_BURST;
3005 
3006 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3007 			snprintf(pool_name, sizeof(pool_name),
3008 				"txmbuf_pool_%u", queue_id);
3009 			snprintf(ring_name, sizeof(ring_name),
3010 				"txmbuf_ring_%u", queue_id);
3011 			setup_mempool_tbl(rte_socket_id(),
3012 				(queue_id + MAX_QUEUES),
3013 				pool_name, ring_name, nb_mbuf);
3014 		}
3015 
3016 		if (vm2vm_mode == VM2VM_HARDWARE) {
3017 			/* Enable VT loop back to let L2 switch to do it. */
3018 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3019 			LOG_DEBUG(VHOST_CONFIG,
3020 				"Enable loop back for L2 switch in vmdq.\n");
3021 		}
3022 	}
3023 	/* Set log level. */
3024 	rte_set_log_level(LOG_LEVEL);
3025 
3026 	/* initialize all ports */
3027 	for (portid = 0; portid < nb_ports; portid++) {
3028 		/* skip ports that are not enabled */
3029 		if ((enabled_port_mask & (1 << portid)) == 0) {
3030 			RTE_LOG(INFO, VHOST_PORT,
3031 				"Skipping disabled port %d\n", portid);
3032 			continue;
3033 		}
3034 		if (port_init(portid) != 0)
3035 			rte_exit(EXIT_FAILURE,
3036 				"Cannot initialize network ports\n");
3037 	}
3038 
3039 	/* Initialise all linked lists. */
3040 	if (init_data_ll() == -1)
3041 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3042 
3043 	/* Initialize device stats */
3044 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3045 
3046 	/* Enable stats if the user option is set. */
3047 	if (enable_stats)
3048 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3049 
3050 	/* Launch all data cores. */
3051 	if (zero_copy == 0) {
3052 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3053 			rte_eal_remote_launch(switch_worker,
3054 				mbuf_pool, lcore_id);
3055 		}
3056 	} else {
3057 		uint32_t count_in_mempool, index, i;
3058 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3059 			/* For all RX and TX queues. */
3060 			count_in_mempool
3061 				= rte_mempool_count(vpool_array[index].pool);
3062 
3063 			/*
3064 			 * Transfer all un-attached mbufs from vpool.pool
3065 			 * to vpool.ring.
3066 			 */
3067 			for (i = 0; i < count_in_mempool; i++) {
3068 				struct rte_mbuf *mbuf
3069 					= __rte_mbuf_raw_alloc(
3070 						vpool_array[index].pool);
3071 				rte_ring_sp_enqueue(vpool_array[index].ring,
3072 						(void *)mbuf);
3073 			}
3074 
3075 			LOG_DEBUG(VHOST_CONFIG,
3076 				"in MAIN: mbuf count in mempool at initial "
3077 				"is: %d\n", count_in_mempool);
3078 			LOG_DEBUG(VHOST_CONFIG,
3079 				"in MAIN: mbuf count in ring at initial is:"
3080 				" %d\n",
3081 				rte_ring_count(vpool_array[index].ring));
3082 		}
3083 
3084 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3085 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3086 				lcore_id);
3087 	}
3088 
3089 	/* Register CUSE device to handle IOCTLs. */
3090 	ret = rte_vhost_driver_register((char *)&dev_basename);
3091 	if (ret != 0)
3092 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3093 
3094 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3095 
3096 	/* Start CUSE session. */
3097 	rte_vhost_driver_session_start();
3098 	return 0;
3099 
3100 }
3101 
3102