xref: /dpdk/examples/vhost/main.c (revision 9915bb1f212a62ff2e69ffdd4cb4739207fd1c53)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56 
57 #define MAX_QUEUES 128
58 
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61 
62 /*
63  * Calculate the number of buffers needed per port
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
66 							(num_switching_cores*MAX_PKT_BURST) +  			\
67 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68 							(num_switching_cores*MBUF_CACHE_SIZE))
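/*
 * Illustrative sizing only (not part of the original source): with the default
 * descriptor counts defined in this file and, say, 4 switching cores this
 * expands to 128*1024 + 4*32 + 4*512 + 4*128 = 133760 mbufs per port.
 */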
69 
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72 
73 /*
74  * No frame data buffers allocated by the host are required for the zero copy
75  * implementation; the guest allocates the frame data buffers and vhost
76  * uses them directly.
77  */
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80 	+ RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82 
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92 
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101 
102 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16 	/* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
105 
106 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
108 
109 #define JUMBO_FRAME_MAX_SIZE    0x2600
110 
111 /* State of virtio device. */
112 #define DEVICE_MAC_LEARNING 0
113 #define DEVICE_RX			1
114 #define DEVICE_SAFE_REMOVE	2
115 
116 /* Config_core_flag status definitions. */
117 #define REQUEST_DEV_REMOVAL 1
118 #define ACK_DEV_REMOVAL 0
119 
120 /* Configurable number of RX/TX ring descriptors */
121 #define RTE_TEST_RX_DESC_DEFAULT 1024
122 #define RTE_TEST_TX_DESC_DEFAULT 512
123 
124 /*
125  * These two macros need refining for the legacy and DPDK-based front ends:
126  * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
127  * then adjust to a power of 2.
128  */
129 /*
130  * For legacy front end, 128 descriptors,
131  * half for virtio header, another half for mbuf.
132  */
133 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
134 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
135 
136 /* Get first 4 bytes in mbuf headroom. */
137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138 		+ sizeof(struct rte_mbuf)))
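/*
 * In the zero copy path this 32-bit slot stashes the guest descriptor index
 * backing an attached mbuf (see attach_rxmbuf_zcp()), so it can be written
 * back to the used ring later (see txmbuf_clean_zcp() and virtio_dev_rx_zcp()).
 */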
139 
140 /* true if x is a power of 2 */
141 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
142 
143 #define INVALID_PORT_ID 0xFF
144 
145 /* Max number of devices. Limited by vmdq. */
146 #define MAX_DEVICES 64
147 
148 /* Size of buffers used for snprintfs. */
149 #define MAX_PRINT_BUFF 6072
150 
151 /* Maximum character device basename size. */
152 #define MAX_BASENAME_SZ 10
153 
154 /* Maximum long option length for option parsing. */
155 #define MAX_LONG_OPT_SZ 64
156 
157 /* Used to compare MAC addresses. */
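/*
 * Masks the low 48 bits of a 64-bit load so that only the 6 MAC address bytes
 * are compared (assumes a little-endian host; see ether_addr_cmp()).
 */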
158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
159 
160 /* Number of descriptors per cacheline. */
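/* With a 64-byte cache line and a 16-byte struct vring_desc this evaluates to 4. */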
161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
162 
163 /* mask of enabled ports */
164 static uint32_t enabled_port_mask = 0;
165 
166 /* Number of switching cores enabled */
167 static uint32_t num_switching_cores = 0;
168 
169 /* Number of devices/queues to support */
170 static uint32_t num_queues = 0;
171 uint32_t num_devices = 0;
172 
173 /*
174  * Enable zero copy: guest packet buffers are attached directly to the HW
175  * descriptors, so no host-side copy is needed. Disabled by default.
176  */
177 static uint32_t zero_copy;
178 
179 /* Number of RX/TX ring descriptors to use (zero copy path). */
180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182 
183 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
184 #define MAX_RING_DESC 4096
185 
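/*
 * Per-queue buffer pool: one mempool/ring pair is reserved for every possible
 * RX and TX queue (hence the MAX_QUEUES + MAX_QUEUES array below). The ring is
 * used by the zero copy path to hold mbufs that are not currently attached to
 * a guest buffer.
 */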
186 struct vpool {
187 	struct rte_mempool *pool;
188 	struct rte_ring *ring;
189 	uint32_t buf_size;
190 } vpool_array[MAX_QUEUES+MAX_QUEUES];
191 
192 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
193 typedef enum {
194 	VM2VM_DISABLED = 0,
195 	VM2VM_SOFTWARE = 1,
196 	VM2VM_HARDWARE = 2,
197 	VM2VM_LAST
198 } vm2vm_type;
199 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200 
201 /* The type of host physical address translated from guest physical address. */
202 typedef enum {
203 	PHYS_ADDR_CONTINUOUS = 0,
204 	PHYS_ADDR_CROSS_SUBREG = 1,
205 	PHYS_ADDR_INVALID = 2,
206 	PHYS_ADDR_LAST
207 } hpa_type;
208 
209 /* Enable stats. */
210 static uint32_t enable_stats = 0;
211 /* Enable retries on RX. */
212 static uint32_t enable_retry = 1;
213 /* Specify timeout (in microseconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217 
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220 
221 /* Character device index. Can be set by user. */
222 static uint32_t dev_index = 0;
223 
224 /* This can be set by the user so it is made available here. */
225 extern uint64_t VHOST_FEATURES;
226 
227 /* Default configuration for rx and tx thresholds etc. */
228 static struct rte_eth_rxconf rx_conf_default = {
229 	.rx_thresh = {
230 		.pthresh = RX_PTHRESH,
231 		.hthresh = RX_HTHRESH,
232 		.wthresh = RX_WTHRESH,
233 	},
234 	.rx_drop_en = 1,
235 };
236 
237 /*
238  * These default values are optimized for use with the Intel(R) 82599 10 GbE
239  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240  * network controllers and/or network drivers.
241  */
242 static struct rte_eth_txconf tx_conf_default = {
243 	.tx_thresh = {
244 		.pthresh = TX_PTHRESH,
245 		.hthresh = TX_HTHRESH,
246 		.wthresh = TX_WTHRESH,
247 	},
248 	.tx_free_thresh = 0, /* Use PMD default values */
249 	.tx_rs_thresh = 0, /* Use PMD default values */
250 };
251 
252 /* Empty VMDQ configuration structure. Filled in programmatically. */
253 static struct rte_eth_conf vmdq_conf_default = {
254 	.rxmode = {
255 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
256 		.split_hdr_size = 0,
257 		.header_split   = 0, /**< Header Split disabled */
258 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
259 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
260 		/*
261 		 * This is necessary for 1G NICs such as the I350;
262 		 * it fixes a bug where IPv4 forwarding in the guest could not
263 		 * forward packets from one virtio device to another.
264 		 */
265 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
266 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
267 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
268 	},
269 
270 	.txmode = {
271 		.mq_mode = ETH_MQ_TX_NONE,
272 	},
273 	.rx_adv_conf = {
274 		/*
275 		 * should be overridden separately in code with
276 		 * appropriate values
277 		 */
278 		.vmdq_rx_conf = {
279 			.nb_queue_pools = ETH_8_POOLS,
280 			.enable_default_pool = 0,
281 			.default_pool = 0,
282 			.nb_pool_maps = 0,
283 			.pool_map = {{0, 0},},
284 		},
285 	},
286 };
287 
288 static unsigned lcore_ids[RTE_MAX_LCORE];
289 static uint8_t ports[RTE_MAX_ETHPORTS];
290 static unsigned num_ports = 0; /**< The number of ports specified in command line */
291 
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
293 const uint16_t vlan_tags[] = {
294 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
296 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302 };
303 
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306 
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
310 
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
313 
314 /* Used for queueing bursts of TX packets. */
315 struct mbuf_table {
316 	unsigned len;
317 	unsigned txq_id;
318 	struct rte_mbuf *m_table[MAX_PKT_BURST];
319 };
320 
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323 
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326 
327 /* Vlan header struct used to insert vlan tags on TX. */
328 struct vlan_ethhdr {
329 	unsigned char   h_dest[ETH_ALEN];
330 	unsigned char   h_source[ETH_ALEN];
331 	__be16          h_vlan_proto;
332 	__be16          h_vlan_TCI;
333 	__be16          h_vlan_encapsulated_proto;
334 };
335 
336 /* IPv4 Header */
337 struct ipv4_hdr {
338 	uint8_t  version_ihl;		/**< version and header length */
339 	uint8_t  type_of_service;	/**< type of service */
340 	uint16_t total_length;		/**< length of packet */
341 	uint16_t packet_id;		/**< packet ID */
342 	uint16_t fragment_offset;	/**< fragmentation offset */
343 	uint8_t  time_to_live;		/**< time to live */
344 	uint8_t  next_proto_id;		/**< protocol ID */
345 	uint16_t hdr_checksum;		/**< header checksum */
346 	uint32_t src_addr;		/**< source address */
347 	uint32_t dst_addr;		/**< destination address */
348 } __attribute__((__packed__));
349 
350 /* Header lengths. */
351 #define VLAN_HLEN       4
352 #define VLAN_ETH_HLEN   18
353 
354 /* Per-device statistics struct */
355 struct device_statistics {
356 	uint64_t tx_total;
357 	rte_atomic64_t rx_total_atomic;
358 	uint64_t rx_total;
359 	uint64_t tx;
360 	rte_atomic64_t rx_atomic;
361 	uint64_t rx;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
364 
365 /*
366  * Builds up the correct configuration for VMDQ VLAN pool map
367  * according to the pool & queue limits.
368  */
369 static inline int
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371 {
372 	struct rte_eth_vmdq_rx_conf conf;
373 	unsigned i;
374 
375 	memset(&conf, 0, sizeof(conf));
376 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377 	conf.nb_pool_maps = num_devices;
378 	conf.enable_loop_back =
379 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
380 
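	/*
	 * Give every pool its own VLAN tag so that VMDQ steers each
	 * device's traffic into a dedicated queue set.
	 */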
381 	for (i = 0; i < conf.nb_pool_maps; i++) {
382 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
383 		conf.pool_map[i].pools = (1UL << i);
384 	}
385 
386 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
389 	return 0;
390 }
391 
392 /*
393  * Validate the device number against the max pool number obtained from
394  * dev_info. If the device number is invalid, print an error message and
395  * return -1. Each device must have its own pool.
396  */
397 static inline int
398 validate_num_devices(uint32_t max_nb_devices)
399 {
400 	if (num_devices > max_nb_devices) {
401 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402 		return -1;
403 	}
404 	return 0;
405 }
406 
407 /*
408  * Initialises a given port using global settings, with the RX buffers
409  * coming from the per-queue mempools in vpool_array.
410  */
411 static inline int
412 port_init(uint8_t port)
413 {
414 	struct rte_eth_dev_info dev_info;
415 	struct rte_eth_conf port_conf;
416 	uint16_t rx_rings, tx_rings;
417 	uint16_t rx_ring_size, tx_ring_size;
418 	int retval;
419 	uint16_t q;
420 
421 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422 	rte_eth_dev_info_get (port, &dev_info);
423 
424 	/* Configure the number of supported virtio devices based on VMDQ limits. */
425 	num_devices = dev_info.max_vmdq_pools;
426 	num_queues = dev_info.max_rx_queues;
427 
428 	if (zero_copy) {
429 		rx_ring_size = num_rx_descriptor;
430 		tx_ring_size = num_tx_descriptor;
431 		tx_rings = dev_info.max_tx_queues;
432 	} else {
433 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435 		tx_rings = (uint16_t)rte_lcore_count();
436 	}
437 
438 	retval = validate_num_devices(MAX_DEVICES);
439 	if (retval < 0)
440 		return retval;
441 
442 	/* Get port configuration. */
443 	retval = get_eth_conf(&port_conf, num_devices);
444 	if (retval < 0)
445 		return retval;
446 
447 	if (port >= rte_eth_dev_count()) return -1;
448 
449 	rx_rings = (uint16_t)num_queues;
450 	/* Configure ethernet device. */
451 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 	if (retval != 0)
453 		return retval;
454 
455 	/* Setup the queues. */
456 	for (q = 0; q < rx_rings; q ++) {
457 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 						rte_eth_dev_socket_id(port), &rx_conf_default,
459 						vpool_array[q].pool);
460 		if (retval < 0)
461 			return retval;
462 	}
463 	for (q = 0; q < tx_rings; q ++) {
464 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465 						rte_eth_dev_socket_id(port), &tx_conf_default);
466 		if (retval < 0)
467 			return retval;
468 	}
469 
470 	/* Start the device. */
471 	retval  = rte_eth_dev_start(port);
472 	if (retval < 0) {
473 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474 		return retval;
475 	}
476 
477 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481 			(unsigned)port,
482 			vmdq_ports_eth_addr[port].addr_bytes[0],
483 			vmdq_ports_eth_addr[port].addr_bytes[1],
484 			vmdq_ports_eth_addr[port].addr_bytes[2],
485 			vmdq_ports_eth_addr[port].addr_bytes[3],
486 			vmdq_ports_eth_addr[port].addr_bytes[4],
487 			vmdq_ports_eth_addr[port].addr_bytes[5]);
488 
489 	return 0;
490 }
491 
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498 	/* Check the basename length and copy it. */
499 
500 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501 		return -1;
502 	else
503 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504 
505 	return 0;
506 }
507 
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514 	char *end = NULL;
515 	unsigned long pm;
516 
517 	errno = 0;
518 
519 	/* parse hexadecimal string */
520 	pm = strtoul(portmask, &end, 16);
521 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522 		return -1;
523 
524 	if (pm == 0)
525 		return -1;
526 
527 	return pm;
528 
529 }
530 
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537 	char *end = NULL;
538 	unsigned long num;
539 
540 	errno = 0;
541 
542 	/* parse unsigned int string */
543 	num = strtoul(q_arg, &end, 10);
544 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545 		return -1;
546 
547 	if (num > max_valid_value)
548 		return -1;
549 
550 	return num;
551 
552 }
553 
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561 	"		--vm2vm [0|1|2]\n"
562 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 	"		--dev-basename <name> --dev-index [0-N]\n"
564 	"		--nb-devices ND\n"
565 	"		-p PORTMASK: Set mask for ports to be used by application\n"
566 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
568 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
569 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
570 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572 	"		--dev-basename: The basename to be used for the character device.\n"
573 	"		--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
575 			"zero copy\n"
576 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
577 			"used only when zero copy is enabled.\n"
578 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
579 			"used only when zero copy is enabled.\n",
580 	       prgname);
581 }
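/*
 * Example invocation (illustrative only; the binary path, EAL core/memory
 * options and port mask below are assumptions, adjust them for the target
 * machine):
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- \
 *       -p 0x1 --vm2vm 1 --rx-retry 1 --stats 2 \
 *       --dev-basename vhost-net --dev-index 1
 */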
582 
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589 	int opt, ret;
590 	int option_index;
591 	unsigned i;
592 	const char *prgname = argv[0];
593 	static struct option long_option[] = {
594 		{"vm2vm", required_argument, NULL, 0},
595 		{"rx-retry", required_argument, NULL, 0},
596 		{"rx-retry-delay", required_argument, NULL, 0},
597 		{"rx-retry-num", required_argument, NULL, 0},
598 		{"mergeable", required_argument, NULL, 0},
599 		{"stats", required_argument, NULL, 0},
600 		{"dev-basename", required_argument, NULL, 0},
601 		{"dev-index", required_argument, NULL, 0},
602 		{"zero-copy", required_argument, NULL, 0},
603 		{"rx-desc-num", required_argument, NULL, 0},
604 		{"tx-desc-num", required_argument, NULL, 0},
605 		{NULL, 0, 0, 0},
606 	};
607 
608 	/* Parse command line */
609 	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
610 		switch (opt) {
611 		/* Portmask */
612 		case 'p':
613 			enabled_port_mask = parse_portmask(optarg);
614 			if (enabled_port_mask == 0) {
615 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616 				us_vhost_usage(prgname);
617 				return -1;
618 			}
619 			break;
620 
621 		case 0:
622 			/* Enable/disable vm2vm comms. */
623 			if (!strncmp(long_option[option_index].name, "vm2vm",
624 				MAX_LONG_OPT_SZ)) {
625 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
626 				if (ret == -1) {
627 					RTE_LOG(INFO, VHOST_CONFIG,
628 						"Invalid argument for "
629 						"vm2vm [0|1|2]\n");
630 					us_vhost_usage(prgname);
631 					return -1;
632 				} else {
633 					vm2vm_mode = (vm2vm_type)ret;
634 				}
635 			}
636 
637 			/* Enable/disable retries on RX. */
638 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639 				ret = parse_num_opt(optarg, 1);
640 				if (ret == -1) {
641 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642 					us_vhost_usage(prgname);
643 					return -1;
644 				} else {
645 					enable_retry = ret;
646 				}
647 			}
648 
649 			/* Specify the retry delay time (in microseconds) on RX. */
650 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651 				ret = parse_num_opt(optarg, INT32_MAX);
652 				if (ret == -1) {
653 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654 					us_vhost_usage(prgname);
655 					return -1;
656 				} else {
657 					burst_rx_delay_time = ret;
658 				}
659 			}
660 
661 			/* Specify the number of retries on RX. */
662 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663 				ret = parse_num_opt(optarg, INT32_MAX);
664 				if (ret == -1) {
665 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666 					us_vhost_usage(prgname);
667 					return -1;
668 				} else {
669 					burst_rx_retry_num = ret;
670 				}
671 			}
672 
673 			/* Enable/disable RX mergeable buffers. */
674 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675 				ret = parse_num_opt(optarg, 1);
676 				if (ret == -1) {
677 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678 					us_vhost_usage(prgname);
679 					return -1;
680 				} else {
681 					if (ret) {
682 						vmdq_conf_default.rxmode.jumbo_frame = 1;
683 						vmdq_conf_default.rxmode.max_rx_pkt_len
684 							= JUMBO_FRAME_MAX_SIZE;
685 						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
686 					}
687 				}
688 			}
689 
690 			/* Enable/disable stats. */
691 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, INT32_MAX);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695 					us_vhost_usage(prgname);
696 					return -1;
697 				} else {
698 					enable_stats = ret;
699 				}
700 			}
701 
702 			/* Set character device basename. */
703 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704 				if (us_vhost_parse_basename(optarg) == -1) {
705 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706 					us_vhost_usage(prgname);
707 					return -1;
708 				}
709 			}
710 
711 			/* Set character device index. */
712 			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713 				ret = parse_num_opt(optarg, INT32_MAX);
714 				if (ret == -1) {
715 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716 					us_vhost_usage(prgname);
717 					return -1;
718 				} else
719 					dev_index = ret;
720 			}
721 
722 			/* Enable/disable rx/tx zero copy. */
723 			if (!strncmp(long_option[option_index].name,
724 				"zero-copy", MAX_LONG_OPT_SZ)) {
725 				ret = parse_num_opt(optarg, 1);
726 				if (ret == -1) {
727 					RTE_LOG(INFO, VHOST_CONFIG,
728 						"Invalid argument"
729 						" for zero-copy [0|1]\n");
730 					us_vhost_usage(prgname);
731 					return -1;
732 				} else
733 					zero_copy = ret;
734 
735 				if (zero_copy) {
736 #ifdef RTE_MBUF_REFCNT
737 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738 					"zero copy vhost APP, please "
739 					"disable RTE_MBUF_REFCNT\n"
740 					"in config file and then rebuild DPDK "
741 					"core lib!\n"
742 					"Otherwise please disable zero copy "
743 					"flag in command line!\n");
744 					return -1;
745 #endif
746 				}
747 			}
748 
749 			/* Specify the descriptor number on RX. */
750 			if (!strncmp(long_option[option_index].name,
751 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
752 				ret = parse_num_opt(optarg, MAX_RING_DESC);
753 				if ((ret == -1) || (!POWEROF2(ret))) {
754 					RTE_LOG(INFO, VHOST_CONFIG,
755 					"Invalid argument for rx-desc-num[0-N],"
756 					"power of 2 required.\n");
757 					us_vhost_usage(prgname);
758 					return -1;
759 				} else {
760 					num_rx_descriptor = ret;
761 				}
762 			}
763 
764 			/* Specify the descriptor number on TX. */
765 			if (!strncmp(long_option[option_index].name,
766 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
767 				ret = parse_num_opt(optarg, MAX_RING_DESC);
768 				if ((ret == -1) || (!POWEROF2(ret))) {
769 					RTE_LOG(INFO, VHOST_CONFIG,
770 					"Invalid argument for tx-desc-num [0-N],"
771 					"power of 2 required.\n");
772 					us_vhost_usage(prgname);
773 					return -1;
774 				} else {
775 					num_tx_descriptor = ret;
776 				}
777 			}
778 
779 			break;
780 
781 			/* Invalid option - print options. */
782 		default:
783 			us_vhost_usage(prgname);
784 			return -1;
785 		}
786 	}
787 
788 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789 		if (enabled_port_mask & (1 << i))
790 			ports[num_ports++] = (uint8_t)i;
791 	}
792 
793 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
794 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
795 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
796 		return -1;
797 	}
798 
799 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800 		RTE_LOG(INFO, VHOST_PORT,
801 			"Vhost zero copy doesn't support software vm2vm,"
802 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
803 		return -1;
804 	}
805 
806 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807 		RTE_LOG(INFO, VHOST_PORT,
808 			"Vhost zero copy doesn't support jumbo frame,"
809 			"please specify '--mergeable 0' to disable the "
810 			"mergeable feature.\n");
811 		return -1;
812 	}
813 
814 	return 0;
815 }
816 
817 /*
818  * Update the global variable num_ports and the array ports according to the
819  * number of system ports, and return the number of valid ports.
820  */
821 static unsigned check_ports_num(unsigned nb_ports)
822 {
823 	unsigned valid_num_ports = num_ports;
824 	unsigned portid;
825 
826 	if (num_ports > nb_ports) {
827 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828 			num_ports, nb_ports);
829 		num_ports = nb_ports;
830 	}
831 
832 	for (portid = 0; portid < num_ports; portid ++) {
833 		if (ports[portid] >= nb_ports) {
834 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835 				ports[portid], (nb_ports - 1));
836 			ports[portid] = INVALID_PORT_ID;
837 			valid_num_ports--;
838 		}
839 	}
840 	return valid_num_ports;
841 }
842 
843 /*
844  * Macro to print out packet contents. Wrapped in debug define so that the
845  * data path is not affected when debug is disabled.
846  */
847 #ifdef DEBUG
848 #define PRINT_PACKET(device, addr, size, header) do {																\
849 	char *pkt_addr = (char*)(addr);																					\
850 	unsigned int index;																								\
851 	char packet[MAX_PRINT_BUFF];																					\
852 																													\
853 	if ((header))																									\
854 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
855 	else																											\
856 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
857 	for (index = 0; index < (size); index++) {																		\
858 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
859 			"%02hhx ", pkt_addr[index]);																			\
860 	}																												\
861 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
862 																													\
863 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
864 } while(0)
865 #else
866 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
867 #endif
868 
869 /*
870  * Function to convert guest physical addresses to vhost physical addresses.
871  * This is used to convert virtio buffer addresses.
872  */
873 static inline uint64_t __attribute__((always_inline))
874 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
875 	uint32_t buf_len, hpa_type *addr_type)
876 {
877 	struct virtio_memory_regions_hpa *region;
878 	uint32_t regionidx;
879 	uint64_t vhost_pa = 0;
880 
881 	*addr_type = PHYS_ADDR_INVALID;
882 
883 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
884 		region = &vdev->regions_hpa[regionidx];
885 		if ((guest_pa >= region->guest_phys_address) &&
886 			(guest_pa <= region->guest_phys_address_end)) {
887 			vhost_pa = region->host_phys_addr_offset + guest_pa;
888 			if (likely((guest_pa + buf_len - 1)
889 				<= region->guest_phys_address_end))
890 				*addr_type = PHYS_ADDR_CONTINUOUS;
891 			else
892 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
893 			break;
894 		}
895 	}
896 
897 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
898 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
899 		(void *)(uintptr_t)vhost_pa);
900 
901 	return vhost_pa;
902 }
903 
904 /*
905  * Compares a packet destination MAC address to a device MAC address.
906  */
907 static inline int __attribute__((always_inline))
908 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
909 {
910 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
911 }
912 
913 /*
914  * This function learns the MAC address of the device and registers this along with a
915  * vlan tag to a VMDQ.
916  */
917 static int
918 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
919 {
920 	struct ether_hdr *pkt_hdr;
921 	struct virtio_net_data_ll *dev_ll;
922 	struct virtio_net *dev = vdev->dev;
923 	int i, ret;
924 
925 	/* Learn MAC address of guest device from packet */
926 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
927 
928 	dev_ll = ll_root_used;
929 
930 	while (dev_ll != NULL) {
931 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
932 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
933 			return -1;
934 		}
935 		dev_ll = dev_ll->next;
936 	}
937 
938 	for (i = 0; i < ETHER_ADDR_LEN; i++)
939 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
940 
941 	/* vlan_tag currently uses the device_id. */
942 	vdev->vlan_tag = vlan_tags[dev->device_fh];
943 
944 	/* Print out VMDQ registration info. */
945 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
946 		dev->device_fh,
947 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
948 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
949 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
950 		vdev->vlan_tag);
951 
952 	/* Register the MAC address. */
953 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
954 	if (ret)
955 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
956 					dev->device_fh);
957 
958 	/* Enable stripping of the vlan tag as we handle routing. */
959 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
960 
961 	/* Set device as ready for RX. */
962 	vdev->ready = DEVICE_RX;
963 
964 	return 0;
965 }
966 
967 /*
968  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
969  * queue before disabling RX on the device.
970  */
971 static inline void
972 unlink_vmdq(struct vhost_dev *vdev)
973 {
974 	unsigned i = 0;
975 	unsigned rx_count;
976 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
977 
978 	if (vdev->ready == DEVICE_RX) {
979 		/*clear MAC and VLAN settings*/
980 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
981 		for (i = 0; i < 6; i++)
982 			vdev->mac_address.addr_bytes[i] = 0;
983 
984 		vdev->vlan_tag = 0;
985 
986 		/*Clear out the receive buffers*/
987 		rx_count = rte_eth_rx_burst(ports[0],
988 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
989 
990 		while (rx_count) {
991 			for (i = 0; i < rx_count; i++)
992 				rte_pktmbuf_free(pkts_burst[i]);
993 
994 			rx_count = rte_eth_rx_burst(ports[0],
995 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
996 		}
997 
998 		vdev->ready = DEVICE_MAC_LEARNING;
999 	}
1000 }
1001 
1002 /*
1003  * Check if the packet destination MAC address is for a local device. If so then put
1004  * the packet on that device's RX queue. If not then return.
1005  */
1006 static inline unsigned __attribute__((always_inline))
1007 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1008 {
1009 	struct virtio_net_data_ll *dev_ll;
1010 	struct ether_hdr *pkt_hdr;
1011 	uint64_t ret = 0;
1012 	struct virtio_net *dev = vdev->dev;
1013 	struct virtio_net *tdev; /* destination virtio device */
1014 
1015 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1016 
1017 	/*get the used devices list*/
1018 	dev_ll = ll_root_used;
1019 
1020 	while (dev_ll != NULL) {
1021 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1022 				          &dev_ll->vdev->mac_address)) {
1023 
1024 			/* Drop the packet if the TX packet is destined for the TX device. */
1025 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1026 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1027 							dev->device_fh);
1028 				return 0;
1029 			}
1030 			tdev = dev_ll->vdev->dev;
1031 
1032 
1033 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1034 
1035 			if (dev_ll->vdev->remove) {
1036 				/*drop the packet if the device is marked for removal*/
1037 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1038 			} else {
1039 				uint32_t mergeable =
1040 					tdev->features &
1041 					(1 << VIRTIO_NET_F_MRG_RXBUF);
1042 
1043 				/*send the packet to the local virtio device*/
1044 				if (likely(mergeable == 0))
1045 					ret = virtio_dev_rx(tdev, &m, 1);
1046 				else
1047 					ret = virtio_dev_merge_rx(tdev,
1048 						&m, 1);
1049 
1050 				if (enable_stats) {
1051 					rte_atomic64_add(
1052 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1053 					1);
1054 					rte_atomic64_add(
1055 					&dev_statistics[tdev->device_fh].rx_atomic,
1056 					ret);
1057 					dev_statistics[tdev->device_fh].tx_total++;
1058 					dev_statistics[tdev->device_fh].tx += ret;
1059 				}
1060 			}
1061 
1062 			return 0;
1063 		}
1064 		dev_ll = dev_ll->next;
1065 	}
1066 
1067 	return -1;
1068 }
1069 
1070 /*
1071  * This function routes the TX packet to the correct interface. This may be a local device
1072  * or the physical port.
1073  */
1074 static inline void __attribute__((always_inline))
1075 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1076 {
1077 	struct mbuf_table *tx_q;
1078 	struct vlan_ethhdr *vlan_hdr;
1079 	struct rte_mbuf **m_table;
1080 	struct rte_mbuf *mbuf, *prev;
1081 	unsigned len, ret, offset = 0;
1082 	const uint16_t lcore_id = rte_lcore_id();
1083 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1084 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1085 	struct virtio_net *dev = vdev->dev;
1086 
1087 	/*check if destination is local VM*/
1088 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1089 		return;
1090 
1091 	if (vm2vm_mode == VM2VM_HARDWARE) {
1092 		while (dev_ll != NULL) {
1093 			if ((dev_ll->vdev->ready == DEVICE_RX)
1094 				&& ether_addr_cmp(&(pkt_hdr->d_addr),
1095 				&dev_ll->vdev->mac_address)) {
1096 				/*
1097 				 * Drop the packet if the TX packet is
1098 				 * destined for the TX device.
1099 				 */
1100 				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1101 					LOG_DEBUG(VHOST_DATA,
1102 					"(%"PRIu64") TX: Source and destination"
1103 					" MAC addresses are the same. Dropping "
1104 					"packet.\n",
1105 					dev->device_fh);
1106 					return;
1107 				}
1108 				offset = 4;
1109 				vlan_tag =
1110 				(uint16_t)
1111 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1112 
1113 				LOG_DEBUG(VHOST_DATA,
1114 				"(%"PRIu64") TX: pkt to local VM device id:"
1115 				"(%"PRIu64") vlan tag: %d.\n",
1116 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1117 				vlan_tag);
1118 
1119 				break;
1120 			}
1121 			dev_ll = dev_ll->next;
1122 		}
1123 	}
1124 
1125 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1126 
1127 	/*Add packet to the port tx queue*/
1128 	tx_q = &lcore_tx_queue[lcore_id];
1129 	len = tx_q->len;
1130 
1131 	/* Allocate an mbuf and populate the structure. */
1132 	mbuf = rte_pktmbuf_alloc(mbuf_pool);
1133 	if (unlikely(mbuf == NULL)) {
1134 		RTE_LOG(ERR, VHOST_DATA,
1135 			"Failed to allocate memory for mbuf.\n");
1136 		return;
1137 	}
1138 
1139 	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1140 	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1141 	mbuf->nb_segs = m->nb_segs;
1142 
1143 	/* Copy ethernet header to mbuf. */
1144 	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1145 		rte_pktmbuf_mtod(m, const void *),
1146 		ETH_HLEN);
1147 
1148 
1149 	/* Set up the VLAN header. Fields are converted to network byte order with htons(). */
1150 	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1151 	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1152 	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1153 	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1154 
1155 	/* Copy the remaining packet contents to the mbuf. */
1156 	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1157 		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1158 		(m->data_len - ETH_HLEN));
1159 
1160 	/* Copy the remaining segments for the whole packet. */
1161 	prev = mbuf;
1162 	while (m->next) {
1163 		/* Allocate an mbuf and populate the structure. */
1164 		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1165 		if (unlikely(next_mbuf == NULL)) {
1166 			rte_pktmbuf_free(mbuf);
1167 			RTE_LOG(ERR, VHOST_DATA,
1168 				"Failed to allocate memory for mbuf.\n");
1169 			return;
1170 		}
1171 
1172 		m = m->next;
1173 		prev->next = next_mbuf;
1174 		prev = next_mbuf;
1175 		next_mbuf->data_len = m->data_len;
1176 
1177 		/* Copy data to next mbuf. */
1178 		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1179 			rte_pktmbuf_mtod(m, const void *), m->data_len);
1180 	}
1181 
1182 	tx_q->m_table[len] = mbuf;
1183 	len++;
1184 	if (enable_stats) {
1185 		dev_statistics[dev->device_fh].tx_total++;
1186 		dev_statistics[dev->device_fh].tx++;
1187 	}
1188 
1189 	if (unlikely(len == MAX_PKT_BURST)) {
1190 		m_table = (struct rte_mbuf **)tx_q->m_table;
1191 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1192 		/* Free any buffers not handled by TX and update the port stats. */
1193 		if (unlikely(ret < len)) {
1194 			do {
1195 				rte_pktmbuf_free(m_table[ret]);
1196 			} while (++ret < len);
1197 		}
1198 
1199 		len = 0;
1200 	}
1201 
1202 	tx_q->len = len;
1203 	return;
1204 }
1205 /*
1206  * This function is called by each data core. It handles all RX/TX registered with the
1207  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1208  * with all devices in the main linked list.
1209  */
1210 static int
1211 switch_worker(__attribute__((unused)) void *arg)
1212 {
1213 	struct rte_mempool *mbuf_pool = arg;
1214 	struct virtio_net *dev = NULL;
1215 	struct vhost_dev *vdev = NULL;
1216 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1217 	struct virtio_net_data_ll *dev_ll;
1218 	struct mbuf_table *tx_q;
1219 	volatile struct lcore_ll_info *lcore_ll;
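	/* drain_tsc: number of TSC cycles in roughly BURST_TX_DRAIN_US (~100us). */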
1220 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1221 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1222 	unsigned ret, i;
1223 	const uint16_t lcore_id = rte_lcore_id();
1224 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1225 	uint16_t rx_count = 0;
1226 	uint32_t mergeable = 0;
1227 
1228 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1229 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1230 	prev_tsc = 0;
1231 
1232 	tx_q = &lcore_tx_queue[lcore_id];
1233 	for (i = 0; i < num_cores; i ++) {
1234 		if (lcore_ids[i] == lcore_id) {
1235 			tx_q->txq_id = i;
1236 			break;
1237 		}
1238 	}
1239 
1240 	while(1) {
1241 		cur_tsc = rte_rdtsc();
1242 		/*
1243 		 * TX burst queue drain
1244 		 */
1245 		diff_tsc = cur_tsc - prev_tsc;
1246 		if (unlikely(diff_tsc > drain_tsc)) {
1247 
1248 			if (tx_q->len) {
1249 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1250 
1251 				/*Tx any packets in the queue*/
1252 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1253 									   (struct rte_mbuf **)tx_q->m_table,
1254 									   (uint16_t)tx_q->len);
1255 				if (unlikely(ret < tx_q->len)) {
1256 					do {
1257 						rte_pktmbuf_free(tx_q->m_table[ret]);
1258 					} while (++ret < tx_q->len);
1259 				}
1260 
1261 				tx_q->len = 0;
1262 			}
1263 
1264 			prev_tsc = cur_tsc;
1265 
1266 		}
1267 
1268 		rte_prefetch0(lcore_ll->ll_root_used);
1269 		/*
1270 		 * Inform the configuration core that we have exited the linked list and that no devices are
1271 		 * in use if requested.
1272 		 */
1273 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1274 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1275 
1276 		/*
1277 		 * Process devices
1278 		 */
1279 		dev_ll = lcore_ll->ll_root_used;
1280 
1281 		while (dev_ll != NULL) {
1282 			/*get virtio device ID*/
1283 			vdev = dev_ll->vdev;
1284 			dev = vdev->dev;
1285 			mergeable =
1286 				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
1287 
1288 			if (vdev->remove) {
1289 				dev_ll = dev_ll->next;
1290 				unlink_vmdq(vdev);
1291 				vdev->ready = DEVICE_SAFE_REMOVE;
1292 				continue;
1293 			}
1294 			if (likely(vdev->ready == DEVICE_RX)) {
1295 				/*Handle guest RX*/
1296 				rx_count = rte_eth_rx_burst(ports[0],
1297 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298 
1299 				if (rx_count) {
1300 					if (likely(mergeable == 0))
1301 						ret_count =
1302 							virtio_dev_rx(dev,
1303 							pkts_burst, rx_count);
1304 					else
1305 						ret_count =
1306 							virtio_dev_merge_rx(dev,
1307 							pkts_burst, rx_count);
1308 
1309 					if (enable_stats) {
1310 						rte_atomic64_add(
1311 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1312 						rx_count);
1313 						rte_atomic64_add(
1314 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1315 					}
1316 					while (likely(rx_count)) {
1317 						rx_count--;
1318 						rte_pktmbuf_free(pkts_burst[rx_count]);
1319 					}
1320 
1321 				}
1322 			}
1323 
1324 			if (!vdev->remove) {
1325 				/*Handle guest TX*/
1326 				if (likely(mergeable == 0))
1327 					virtio_dev_tx(dev, mbuf_pool);
1328 				else
1329 					virtio_dev_merge_tx(dev, mbuf_pool);
1330 			}
1331 
1332 			/*move to the next device in the list*/
1333 			dev_ll = dev_ll->next;
1334 		}
1335 	}
1336 
1337 	return 0;
1338 }
1339 
1340 /*
1341  * This function gets the number of available ring entries for zero copy rx.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_num_zcp(struct virtio_net *dev)
1347 {
1348 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1349 	uint16_t avail_idx;
1350 
1351 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1352 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1353 }
1354 
1355 /*
1356  * This function gets an available ring index for zero copy rx; it will retry
1357  * 'burst_rx_retry_num' times until it gets enough ring entries.
1358  * Only one thread will call this function for a particular virtio device,
1359  * so it is designed as a non-thread-safe function.
1360  */
1361 static inline uint32_t __attribute__((always_inline))
1362 get_available_ring_index_zcp(struct virtio_net *dev,
1363 	uint16_t *res_base_idx, uint32_t count)
1364 {
1365 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1366 	uint16_t avail_idx;
1367 	uint32_t retry = 0;
1368 	uint16_t free_entries;
1369 
1370 	*res_base_idx = vq->last_used_idx_res;
1371 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372 	free_entries = (avail_idx - *res_base_idx);
1373 
1374 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1375 			"avail idx: %d, "
1376 			"res base idx:%d, free entries:%d\n",
1377 			dev->device_fh, avail_idx, *res_base_idx,
1378 			free_entries);
1379 
1380 	/*
1381 	 * If retry is enabled and the queue is full then we wait
1382 	 * and retry to avoid packet loss.
1383 	 */
1384 	if (enable_retry && unlikely(count > free_entries)) {
1385 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1386 			rte_delay_us(burst_rx_delay_time);
1387 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1388 			free_entries = (avail_idx - *res_base_idx);
1389 			if (count <= free_entries)
1390 				break;
1391 		}
1392 	}
1393 
1394 	/*check that we have enough buffers*/
1395 	if (unlikely(count > free_entries))
1396 		count = free_entries;
1397 
1398 	if (unlikely(count == 0)) {
1399 		LOG_DEBUG(VHOST_DATA,
1400 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1401 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1402 			dev->device_fh, avail_idx,
1403 			*res_base_idx, free_entries);
1404 		return 0;
1405 	}
1406 
1407 	vq->last_used_idx_res = *res_base_idx + count;
1408 
1409 	return count;
1410 }
1411 
1412 /*
1413  * This function puts a descriptor back on the used ring.
1414  */
1415 static inline void __attribute__((always_inline))
1416 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1417 {
1418 	uint16_t res_cur_idx = vq->last_used_idx;
1419 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1420 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
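	/*
	 * Make sure the used ring entry written above is visible before the
	 * used index is published to the guest.
	 */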
1421 	rte_compiler_barrier();
1422 	*(volatile uint16_t *)&vq->used->idx += 1;
1423 	vq->last_used_idx += 1;
1424 
1425 	/* Kick the guest if necessary. */
1426 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1427 		eventfd_write((int)vq->kickfd, 1);
1428 }
1429 
1430 /*
1431  * This function gets an available descriptor from the virtio vring and an
1432  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1433  * to adjust the offsets for buff_addr and phys_addr according to the PMD
1434  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1435  */
1436 static inline void __attribute__((always_inline))
1437 attach_rxmbuf_zcp(struct virtio_net *dev)
1438 {
1439 	uint16_t res_base_idx, desc_idx;
1440 	uint64_t buff_addr, phys_addr;
1441 	struct vhost_virtqueue *vq;
1442 	struct vring_desc *desc;
1443 	struct rte_mbuf *mbuf = NULL;
1444 	struct vpool *vpool;
1445 	hpa_type addr_type;
1446 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1447 
1448 	vpool = &vpool_array[vdev->vmdq_rx_q];
1449 	vq = dev->virtqueue[VIRTIO_RXQ];
1450 
1451 	do {
1452 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1453 				1) != 1))
1454 			return;
1455 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1456 
1457 		desc = &vq->desc[desc_idx];
1458 		if (desc->flags & VRING_DESC_F_NEXT) {
1459 			desc = &vq->desc[desc->next];
1460 			buff_addr = gpa_to_vva(dev, desc->addr);
1461 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1462 					&addr_type);
1463 		} else {
1464 			buff_addr = gpa_to_vva(dev,
1465 					desc->addr + vq->vhost_hlen);
1466 			phys_addr = gpa_to_hpa(vdev,
1467 					desc->addr + vq->vhost_hlen,
1468 					desc->len, &addr_type);
1469 		}
1470 
1471 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1472 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1473 				" address found when attaching RX frame buffer"
1474 				" address!\n", dev->device_fh);
1475 			put_desc_to_used_list_zcp(vq, desc_idx);
1476 			continue;
1477 		}
1478 
1479 		/*
1480 		 * Check if the frame buffer address from guest crosses
1481 		 * sub-region or not.
1482 		 */
1483 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1484 			RTE_LOG(ERR, VHOST_DATA,
1485 				"(%"PRIu64") Frame buffer address crossing a "
1486 				"sub-region found when attaching RX frame "
1487 				"buffer address!\n",
1488 				dev->device_fh);
1489 			put_desc_to_used_list_zcp(vq, desc_idx);
1490 			continue;
1491 		}
1492 	} while (unlikely(phys_addr == 0));
1493 
1494 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1495 	if (unlikely(mbuf == NULL)) {
1496 		LOG_DEBUG(VHOST_DATA,
1497 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1498 			"ring_sc_dequeue fail.\n",
1499 			dev->device_fh);
1500 		put_desc_to_used_list_zcp(vq, desc_idx);
1501 		return;
1502 	}
1503 
1504 	if (unlikely(vpool->buf_size > desc->len)) {
1505 		LOG_DEBUG(VHOST_DATA,
1506 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1507 			"length(%d) of descriptor idx: %d less than room "
1508 			"size required: %d\n",
1509 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1510 		put_desc_to_used_list_zcp(vq, desc_idx);
1511 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1512 		return;
1513 	}
1514 
1515 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1516 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1517 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1518 	mbuf->data_len = desc->len;
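	/* Remember which guest descriptor backs this mbuf; it is read back when filling the used ring. */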
1519 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1520 
1521 	LOG_DEBUG(VHOST_DATA,
1522 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1523 		"descriptor idx:%d\n",
1524 		dev->device_fh, res_base_idx, desc_idx);
1525 
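	/*
	 * Return the attached mbuf to its mempool (this does not free the guest
	 * buffer): the PMD allocates RX mbufs from this same pool, so the guest
	 * buffer just attached will be handed to the NIC as a receive buffer.
	 */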
1526 	__rte_mbuf_raw_free(mbuf);
1527 
1528 	return;
1529 }
1530 
1531 /*
1532  * Detach an attached packet mbuf -
1533  *  - restore original mbuf address and length values.
1534  *  - reset pktmbuf data and data_len to their default values.
1535  *  All other fields of the given packet mbuf will be left intact.
1536  *
1537  * @param m
1538  *   The attached packet mbuf.
1539  */
1540 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1541 {
1542 	const struct rte_mempool *mp = m->pool;
1543 	void *buf = RTE_MBUF_TO_BADDR(m);
1544 	uint32_t buf_ofs;
1545 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1546 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1547 
1548 	m->buf_addr = buf;
1549 	m->buf_len = (uint16_t)buf_len;
1550 
1551 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1552 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1553 	m->data_off = buf_ofs;
1554 
1555 	m->data_len = 0;
1556 }
1557 
1558 /*
1559  * This function is called after packets have been transmitted. It fetches mbufs
1560  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1561  * the used index and kicks the guest if necessary.
1562  */
1563 static inline uint32_t __attribute__((always_inline))
1564 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1565 {
1566 	struct rte_mbuf *mbuf;
1567 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1568 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1569 	uint32_t index = 0;
1570 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1571 
1572 	LOG_DEBUG(VHOST_DATA,
1573 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1574 		"clean is: %d\n",
1575 		dev->device_fh, mbuf_count);
1576 	LOG_DEBUG(VHOST_DATA,
1577 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1578 		"clean  is : %d\n",
1579 		dev->device_fh, rte_ring_count(vpool->ring));
1580 
1581 	for (index = 0; index < mbuf_count; index++) {
1582 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1584 			pktmbuf_detach_zcp(mbuf);
1585 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1586 
1587 		/* Update used index buffer information. */
1588 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1589 		vq->used->ring[used_idx].len = 0;
1590 
1591 		used_idx = (used_idx + 1) & (vq->size - 1);
1592 	}
1593 
1594 	LOG_DEBUG(VHOST_DATA,
1595 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1596 		"clean is: %d\n",
1597 		dev->device_fh, rte_mempool_count(vpool->pool));
1598 	LOG_DEBUG(VHOST_DATA,
1599 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1600 		"clean  is : %d\n",
1601 		dev->device_fh, rte_ring_count(vpool->ring));
1602 	LOG_DEBUG(VHOST_DATA,
1603 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1604 		"vq->last_used_idx:%d\n",
1605 		dev->device_fh, vq->last_used_idx);
1606 
1607 	vq->last_used_idx += mbuf_count;
1608 
1609 	LOG_DEBUG(VHOST_DATA,
1610 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1611 		"vq->last_used_idx:%d\n",
1612 		dev->device_fh, vq->last_used_idx);
1613 
1614 	rte_compiler_barrier();
1615 
1616 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1617 
1618 	/* Kick guest if required. */
1619 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1620 		eventfd_write((int)vq->kickfd, 1);
1621 
1622 	return 0;
1623 }
1624 
1625 /*
1626  * This function is called when a virtio device is destroyed.
1627  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1628  */
1629 static void mbuf_destroy_zcp(struct vpool *vpool)
1630 {
1631 	struct rte_mbuf *mbuf = NULL;
1632 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1633 
1634 	LOG_DEBUG(VHOST_CONFIG,
1635 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1636 		"mbuf_destroy_zcp is: %d\n",
1637 		mbuf_count);
1638 	LOG_DEBUG(VHOST_CONFIG,
1639 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1640 		"mbuf_destroy_zcp  is : %d\n",
1641 		rte_ring_count(vpool->ring));
1642 
1643 	for (index = 0; index < mbuf_count; index++) {
1644 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1645 		if (likely(mbuf != NULL)) {
1646 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1647 				pktmbuf_detach_zcp(mbuf);
1648 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1649 		}
1650 	}
1651 
1652 	LOG_DEBUG(VHOST_CONFIG,
1653 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1654 		"mbuf_destroy_zcp is: %d\n",
1655 		rte_mempool_count(vpool->pool));
1656 	LOG_DEBUG(VHOST_CONFIG,
1657 		"in mbuf_destroy_zcp: mbuf count in ring after "
1658 		"mbuf_destroy_zcp is : %d\n",
1659 		rte_ring_count(vpool->ring));
1660 }
1661 
1662 /*
1663  * This function adds received zero copy packets to the used ring and kicks the guest if necessary.
1664  */
1665 static inline uint32_t __attribute__((always_inline))
1666 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1667 	uint32_t count)
1668 {
1669 	struct vhost_virtqueue *vq;
1670 	struct vring_desc *desc;
1671 	struct rte_mbuf *buff;
1672 	/* The virtio_hdr is initialised to 0. */
1673 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1674 		= {{0, 0, 0, 0, 0, 0}, 0};
1675 	uint64_t buff_hdr_addr = 0;
1676 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1677 	uint32_t head_idx, packet_success = 0;
1678 	uint16_t res_cur_idx;
1679 
1680 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1681 
1682 	if (count == 0)
1683 		return 0;
1684 
1685 	vq = dev->virtqueue[VIRTIO_RXQ];
1686 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1687 
1688 	res_cur_idx = vq->last_used_idx;
1689 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1690 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1691 
1692 	/* Retrieve all of the head indexes first to avoid caching issues. */
1693 	for (head_idx = 0; head_idx < count; head_idx++)
1694 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1695 
1696 	/*Prefetch descriptor index. */
1697 	rte_prefetch0(&vq->desc[head[packet_success]]);
1698 
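	/*
	 * Zero copy RX: the frame data already sits in guest memory (the
	 * mbufs were attached to guest buffers beforehand), so the loop below
	 * only writes the virtio-net header and fills the used ring with the
	 * descriptor indexes saved in each mbuf's headroom.
	 */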
1699 	while (packet_success != count) {
1700 		/* Get descriptor from available ring */
1701 		desc = &vq->desc[head[packet_success]];
1702 
1703 		buff = pkts[packet_success];
1704 		LOG_DEBUG(VHOST_DATA,
1705 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1706 			"pkt[%d] descriptor idx: %d\n",
1707 			dev->device_fh, packet_success,
1708 			MBUF_HEADROOM_UINT32(buff));
1709 
1710 		PRINT_PACKET(dev,
1711 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1712 			+ RTE_PKTMBUF_HEADROOM),
1713 			rte_pktmbuf_data_len(buff), 0);
1714 
1715 		/* Buffer address translation for virtio header. */
1716 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1717 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1718 
1719 		/*
1720 		 * If the descriptors are chained the header and data are
1721 		 * placed in separate buffers.
1722 		 */
1723 		if (desc->flags & VRING_DESC_F_NEXT) {
1724 			desc->len = vq->vhost_hlen;
1725 			desc = &vq->desc[desc->next];
1726 			desc->len = rte_pktmbuf_data_len(buff);
1727 		} else {
1728 			desc->len = packet_len;
1729 		}
1730 
1731 		/* Update used ring with desc information */
1732 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1733 			= head[packet_success];
1734 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1735 			= packet_len;
1736 		res_cur_idx++;
1737 		packet_success++;
1738 
1739 		/* A header is required per buffer. */
1740 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1741 			(const void *)&virtio_hdr, vq->vhost_hlen);
1742 
1743 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1744 
1745 		if (likely(packet_success < count)) {
1746 			/* Prefetch descriptor index. */
1747 			rte_prefetch0(&vq->desc[head[packet_success]]);
1748 		}
1749 	}
1750 
1751 	rte_compiler_barrier();
1752 
1753 	LOG_DEBUG(VHOST_DATA,
1754 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1755 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1756 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1757 
1758 	*(volatile uint16_t *)&vq->used->idx += count;
1759 	vq->last_used_idx += count;
1760 
1761 	LOG_DEBUG(VHOST_DATA,
1762 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1763 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1764 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1765 
1766 	/* Kick the guest if necessary. */
1767 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1768 		eventfd_write((int)vq->kickfd, 1);
1769 
1770 	return count;
1771 }
1772 
1773 /*
1774  * This function routes the TX packet to the correct interface.
1775  * This may be a local device or the physical port.
1776  */
1777 static inline void __attribute__((always_inline))
1778 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1779 	uint32_t desc_idx, uint8_t need_copy)
1780 {
1781 	struct mbuf_table *tx_q;
1782 	struct rte_mbuf **m_table;
1783 	struct rte_mbuf *mbuf = NULL;
1784 	unsigned len, ret, offset = 0;
1785 	struct vpool *vpool;
1786 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1787 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1788 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1789 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1790 
1791 	/* Add packet to the port TX queue. */
1792 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1793 	len = tx_q->len;
1794 
1795 	/* Allocate an mbuf and populate the structure. */
1796 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1797 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1798 	if (unlikely(mbuf == NULL)) {
1799 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1800 		RTE_LOG(ERR, VHOST_DATA,
1801 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1802 			dev->device_fh);
1803 		put_desc_to_used_list_zcp(vq, desc_idx);
1804 		return;
1805 	}
1806 
1807 	if (vm2vm_mode == VM2VM_HARDWARE) {
1808 		/* Avoid using a VLAN tag from any VM for an external packet,
1809 		 * e.g. vlan_tags[dev->device_fh]; otherwise pool selection
1810 		 * conflicts: the MAC address marks it as an external packet
1811 		 * that should go to the network, while the VLAN tag marks it
1812 		 * as a VM-to-VM packet to forward to another VM. The hardware
1813 		 * cannot resolve this ambiguity, so the packet would be lost.
1814 		 */
1815 		vlan_tag = external_pkt_default_vlan_tag;
1816 		while (dev_ll != NULL) {
1817 			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1818 				ether_addr_cmp(&(pkt_hdr->d_addr),
1819 				&dev_ll->vdev->mac_address)) {
1820 
1821 				/*
1822 				 * Drop the packet if the TX packet is destined
1823 				 * for the TX device.
1824 				 */
1825 				if (unlikely(dev_ll->vdev->dev->device_fh
1826 					== dev->device_fh)) {
1827 					LOG_DEBUG(VHOST_DATA,
1828 					"(%"PRIu64") TX: Source and destination "
1829 					"MAC addresses are the same. Dropping "
1830 					"packet.\n",
1831 					dev_ll->vdev->dev->device_fh);
1832 					MBUF_HEADROOM_UINT32(mbuf)
1833 						= (uint32_t)desc_idx;
1834 					__rte_mbuf_raw_free(mbuf);
1835 					return;
1836 				}
1837 
1838 				/*
1839 				 * Offset the packet length by 4 bytes to account
1840 				 * for the HW VLAN strip when L2-switched back.
1841 				 */
1842 				offset = 4;
1843 				vlan_tag =
1844 				(uint16_t)
1845 				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1846 
1847 				LOG_DEBUG(VHOST_DATA,
1848 				"(%"PRIu64") TX: pkt to local VM device id:"
1849 				"(%"PRIu64") vlan tag: %d.\n",
1850 				dev->device_fh, dev_ll->vdev->dev->device_fh,
1851 				vlan_tag);
1852 
1853 				break;
1854 			}
1855 			dev_ll = dev_ll->next;
1856 		}
1857 	}
1858 
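	/*
	 * Build the outgoing mbuf. In the usual zero copy case it simply
	 * points at the guest buffer; a full copy is only made when the
	 * caller set need_copy because the guest buffer crosses a physically
	 * non-contiguous sub-region.
	 */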
1859 	mbuf->nb_segs = m->nb_segs;
1860 	mbuf->next = m->next;
1861 	mbuf->data_len = m->data_len + offset;
1862 	mbuf->pkt_len = mbuf->data_len;
1863 	if (unlikely(need_copy)) {
1864 		/* Copy the packet contents to the mbuf. */
1865 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1866 			rte_pktmbuf_mtod(m, void *),
1867 			m->data_len);
1868 	} else {
1869 		mbuf->data_off = m->data_off;
1870 		mbuf->buf_physaddr = m->buf_physaddr;
1871 		mbuf->buf_addr = m->buf_addr;
1872 	}
1873 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1874 	mbuf->vlan_tci = vlan_tag;
1875 	mbuf->l2_len = sizeof(struct ether_hdr);
1876 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1877 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1878 
1879 	tx_q->m_table[len] = mbuf;
1880 	len++;
1881 
1882 	LOG_DEBUG(VHOST_DATA,
1883 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1884 		dev->device_fh,
1885 		mbuf->nb_segs,
1886 		(mbuf->next == NULL) ? "null" : "non-null");
1887 
1888 	if (enable_stats) {
1889 		dev_statistics[dev->device_fh].tx_total++;
1890 		dev_statistics[dev->device_fh].tx++;
1891 	}
1892 
1893 	if (unlikely(len == MAX_PKT_BURST)) {
1894 		m_table = (struct rte_mbuf **)tx_q->m_table;
1895 		ret = rte_eth_tx_burst(ports[0],
1896 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1897 
1898 		/*
1899 		 * Free any buffers not handled by TX and update
1900 		 * the port stats.
1901 		 */
1902 		if (unlikely(ret < len)) {
1903 			do {
1904 				rte_pktmbuf_free(m_table[ret]);
1905 			} while (++ret < len);
1906 		}
1907 
1908 		len = 0;
1909 		txmbuf_clean_zcp(dev, vpool);
1910 	}
1911 
1912 	tx_q->len = len;
1913 
1914 	return;
1915 }
1916 
1917 /*
1918  * This function transmits all available packets in the virtio TX queue for
1919  * one virtio-net device. If it is the first packet, it learns the MAC
1920  * address and sets up VMDQ.
1921  */
1922 static inline void __attribute__((always_inline))
1923 virtio_dev_tx_zcp(struct virtio_net *dev)
1924 {
1925 	struct rte_mbuf m;
1926 	struct vhost_virtqueue *vq;
1927 	struct vring_desc *desc;
1928 	uint64_t buff_addr = 0, phys_addr;
1929 	uint32_t head[MAX_PKT_BURST];
1930 	uint32_t i;
1931 	uint16_t free_entries, packet_success = 0;
1932 	uint16_t avail_idx;
1933 	uint8_t need_copy = 0;
1934 	hpa_type addr_type;
1935 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1936 
1937 	vq = dev->virtqueue[VIRTIO_TXQ];
1938 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1939 
1940 	/* If there are no available buffers then return. */
1941 	if (vq->last_used_idx_res == avail_idx)
1942 		return;
1943 
1944 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1945 
1946 	/* Prefetch available ring to retrieve head indexes. */
1947 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1948 
1949 	/* Get the number of free entries in the ring */
1950 	free_entries = (avail_idx - vq->last_used_idx_res);
1951 
1952 	/* Limit to MAX_PKT_BURST. */
1953 	free_entries
1954 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1955 
1956 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1957 		dev->device_fh, free_entries);
1958 
1959 	/* Retrieve all of the head indexes first to avoid caching issues. */
1960 	for (i = 0; i < free_entries; i++)
1961 		head[i]
1962 			= vq->avail->ring[(vq->last_used_idx_res + i)
1963 			& (vq->size - 1)];
1964 
1965 	vq->last_used_idx_res += free_entries;
1966 
1967 	/* Prefetch descriptor index. */
1968 	rte_prefetch0(&vq->desc[head[packet_success]]);
1969 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1970 
1971 	while (packet_success < free_entries) {
1972 		desc = &vq->desc[head[packet_success]];
1973 
1974 		/* Discard first buffer as it is the virtio header */
1975 		desc = &vq->desc[desc->next];
1976 
1977 		/* Buffer address translation. */
1978 		buff_addr = gpa_to_vva(dev, desc->addr);
1979 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1980 
1981 		if (likely(packet_success < (free_entries - 1)))
1982 			/* Prefetch descriptor index. */
1983 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1984 
1985 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1986 			RTE_LOG(ERR, VHOST_DATA,
1987 				"(%"PRIu64") Invalid frame buffer address found "
1988 				"when transmitting packets!\n",
1989 				dev->device_fh);
1990 			packet_success++;
1991 			continue;
1992 		}
1993 
1994 		/* Prefetch buffer address. */
1995 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1996 
1997 		/*
1998 		 * Setup dummy mbuf. This is copied to a real mbuf if
1999 		 * transmitted out the physical port.
2000 		 */
2001 		m.data_len = desc->len;
2002 		m.nb_segs = 1;
2003 		m.next = NULL;
2004 		m.data_off = 0;
2005 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2006 		m.buf_physaddr = phys_addr;
2007 
2008 		/*
2009 		 * Check if the frame buffer address from guest crosses
2010 		 * sub-region or not.
2011 		 */
2012 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2013 			RTE_LOG(ERR, VHOST_DATA,
2014 				"(%"PRIu64") Frame buffer address crossing "
2015 				"sub-region found when attaching TX frame "
2016 				"buffer address!\n",
2017 				dev->device_fh);
2018 			need_copy = 1;
2019 		} else
2020 			need_copy = 0;
2021 
2022 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2023 
2024 		/*
2025 		 * If this is the first received packet we need to learn
2026 		 * the MAC address and set up VMDQ.
2027 		 */
2028 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2029 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2030 				/*
2031 				 * Discard frame if device is scheduled for
2032 				 * removal or a duplicate MAC address is found.
2033 				 */
2034 				packet_success += free_entries;
2035 				vq->last_used_idx += packet_success;
2036 				break;
2037 			}
2038 		}
2039 
2040 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2041 		packet_success++;
2042 	}
2043 }
2044 
2045 /*
2046  * This function is called by each data core. It handles all RX/TX registered
2047  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2048  * addresses are compared with all devices in the main linked list.
2049  */
2050 static int
2051 switch_worker_zcp(__attribute__((unused)) void *arg)
2052 {
2053 	struct virtio_net *dev = NULL;
2054 	struct vhost_dev  *vdev = NULL;
2055 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2056 	struct virtio_net_data_ll *dev_ll;
2057 	struct mbuf_table *tx_q;
2058 	volatile struct lcore_ll_info *lcore_ll;
2059 	const uint64_t drain_tsc
2060 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2061 		* BURST_TX_DRAIN_US;
2062 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2063 	unsigned ret;
2064 	const uint16_t lcore_id = rte_lcore_id();
2065 	uint16_t count_in_ring, rx_count = 0;
2066 
2067 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2068 
2069 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2070 	prev_tsc = 0;
2071 
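	/*
	 * Main packet switching loop: periodically drain the per-queue TX
	 * burst tables, then for every device handled by this core attach
	 * free mbufs to guest RX buffers, pass packets received on the VMDq
	 * queue up to the guest, and transmit packets the guest has queued.
	 */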
2072 	while (1) {
2073 		cur_tsc = rte_rdtsc();
2074 
2075 		/* TX burst queue drain */
2076 		diff_tsc = cur_tsc - prev_tsc;
2077 		if (unlikely(diff_tsc > drain_tsc)) {
2078 			/*
2079 			 * Get mbufs from vpool.pool, detach them and put
2080 			 * them back into vpool.ring.
2081 			 */
2082 			dev_ll = lcore_ll->ll_root_used;
2083 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2084 				/* Get virtio device ID */
2085 				vdev = dev_ll->vdev;
2086 				dev = vdev->dev;
2087 
2088 				if (likely(!vdev->remove)) {
2089 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2090 					if (tx_q->len) {
2091 						LOG_DEBUG(VHOST_DATA,
2092 						"TX queue drained after timeout"
2093 						" with burst size %u\n",
2094 						tx_q->len);
2095 
2096 						/*
2097 						 * Tx any packets in the queue
2098 						 */
2099 						ret = rte_eth_tx_burst(
2100 							ports[0],
2101 							(uint16_t)tx_q->txq_id,
2102 							(struct rte_mbuf **)
2103 							tx_q->m_table,
2104 							(uint16_t)tx_q->len);
2105 						if (unlikely(ret < tx_q->len)) {
2106 							do {
2107 								rte_pktmbuf_free(
2108 									tx_q->m_table[ret]);
2109 							} while (++ret < tx_q->len);
2110 						}
2111 						tx_q->len = 0;
2112 
2113 						txmbuf_clean_zcp(dev,
2114 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2115 					}
2116 				}
2117 				dev_ll = dev_ll->next;
2118 			}
2119 			prev_tsc = cur_tsc;
2120 		}
2121 
2122 		rte_prefetch0(lcore_ll->ll_root_used);
2123 
2124 		/*
2125 		 * Inform the configuration core that we have exited the linked
2126 		 * list and that no devices are in use if requested.
2127 		 */
2128 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2129 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2130 
2131 		/* Process devices */
2132 		dev_ll = lcore_ll->ll_root_used;
2133 
2134 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2135 			vdev = dev_ll->vdev;
2136 			dev  = vdev->dev;
2137 			if (unlikely(vdev->remove)) {
2138 				dev_ll = dev_ll->next;
2139 				unlink_vmdq(vdev);
2140 				vdev->ready = DEVICE_SAFE_REMOVE;
2141 				continue;
2142 			}
2143 
2144 			if (likely(vdev->ready == DEVICE_RX)) {
2145 				uint32_t index = vdev->vmdq_rx_q;
2146 				uint16_t i;
2147 				count_in_ring
2148 				= rte_ring_count(vpool_array[index].ring);
2149 				uint16_t free_entries
2150 				= (uint16_t)get_available_ring_num_zcp(dev);
2151 
2152 				/*
2153 				 * Attach all mbufs in vpool.ring and put back
2154 				 * into vpool.pool.
2155 				 */
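				/*
				 * Bound the attach count by the guest's free
				 * RX descriptors, the mbufs available in the
				 * ring, and one burst (MAX_PKT_BURST).
				 */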
2156 				for (i = 0;
2157 				i < RTE_MIN(free_entries,
2158 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2159 				i++)
2160 					attach_rxmbuf_zcp(dev);
2161 
2162 				/* Handle guest RX */
2163 				rx_count = rte_eth_rx_burst(ports[0],
2164 					vdev->vmdq_rx_q, pkts_burst,
2165 					MAX_PKT_BURST);
2166 
2167 				if (rx_count) {
2168 					ret_count = virtio_dev_rx_zcp(dev,
2169 							pkts_burst, rx_count);
2170 					if (enable_stats) {
2171 						dev_statistics[dev->device_fh].rx_total
2172 							+= rx_count;
2173 						dev_statistics[dev->device_fh].rx
2174 							+= ret_count;
2175 					}
2176 					while (likely(rx_count)) {
2177 						rx_count--;
2178 						pktmbuf_detach_zcp(
2179 							pkts_burst[rx_count]);
2180 						rte_ring_sp_enqueue(
2181 							vpool_array[index].ring,
2182 							(void *)pkts_burst[rx_count]);
2183 					}
2184 				}
2185 			}
2186 
2187 			if (likely(!vdev->remove))
2188 				/* Handle guest TX */
2189 				virtio_dev_tx_zcp(dev);
2190 
2191 			/* Move to the next device in the list */
2192 			dev_ll = dev_ll->next;
2193 		}
2194 	}
2195 
2196 	return 0;
2197 }
2198 
2199 
2200 /*
2201  * Add an entry to a used linked list. A free entry must first be found
2202  * in the free linked list using get_data_ll_free_entry();
2203  */
2204 static void
2205 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2206 	struct virtio_net_data_ll *ll_dev)
2207 {
2208 	struct virtio_net_data_ll *ll = *ll_root_addr;
2209 
2210 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2211 	ll_dev->next = NULL;
2212 	rte_compiler_barrier();
2213 
2214 	/* If ll == NULL then this is the first device. */
2215 	if (ll) {
2216 		/* Increment to the tail of the linked list. */
2217 		while (ll->next != NULL)
2218 			ll = ll->next;
2219 
2220 		ll->next = ll_dev;
2221 	} else {
2222 		*ll_root_addr = ll_dev;
2223 	}
2224 }
2225 
2226 /*
2227  * Remove an entry from a used linked list. The entry must then be added to
2228  * the free linked list using put_data_ll_free_entry().
2229  */
2230 static void
2231 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2232 	struct virtio_net_data_ll *ll_dev,
2233 	struct virtio_net_data_ll *ll_dev_last)
2234 {
2235 	struct virtio_net_data_ll *ll = *ll_root_addr;
2236 
2237 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2238 		return;
2239 
2240 	if (ll_dev == ll)
2241 		*ll_root_addr = ll_dev->next;
2242 	else
2243 		if (likely(ll_dev_last != NULL))
2244 			ll_dev_last->next = ll_dev->next;
2245 		else
2246 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2247 }
2248 
2249 /*
2250  * Find and return an entry from the free linked list.
2251  */
2252 static struct virtio_net_data_ll *
2253 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2254 {
2255 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2256 	struct virtio_net_data_ll *ll_dev;
2257 
2258 	if (ll_free == NULL)
2259 		return NULL;
2260 
2261 	ll_dev = ll_free;
2262 	*ll_root_addr = ll_free->next;
2263 
2264 	return ll_dev;
2265 }
2266 
2267 /*
2268  * Place an entry back on to the free linked list.
2269  */
2270 static void
2271 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2272 	struct virtio_net_data_ll *ll_dev)
2273 {
2274 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2275 
2276 	if (ll_dev == NULL)
2277 		return;
2278 
2279 	ll_dev->next = ll_free;
2280 	*ll_root_addr = ll_dev;
2281 }
2282 
2283 /*
2284  * Creates a linked list of a given size.
2285  */
2286 static struct virtio_net_data_ll *
2287 alloc_data_ll(uint32_t size)
2288 {
2289 	struct virtio_net_data_ll *ll_new;
2290 	uint32_t i;
2291 
2292 	/* Malloc and then chain the linked list. */
2293 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2294 	if (ll_new == NULL) {
2295 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2296 		return NULL;
2297 	}
2298 
2299 	for (i = 0; i < size - 1; i++) {
2300 		ll_new[i].vdev = NULL;
2301 		ll_new[i].next = &ll_new[i+1];
2302 	}
2303 	ll_new[i].next = NULL;
2304 
2305 	return (ll_new);
2306 }
2307 
2308 /*
2309  * Create the main linked list along with each individual core's linked list. A used and a free list
2310  * are created to manage entries.
2311  */
2312 static int
2313 init_data_ll (void)
2314 {
2315 	int lcore;
2316 
2317 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2318 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2319 		if (lcore_info[lcore].lcore_ll == NULL) {
2320 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2321 			return -1;
2322 		}
2323 
2324 		lcore_info[lcore].lcore_ll->device_num = 0;
2325 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2326 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
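		/*
		 * Size each core's free list so that all devices can be spread
		 * across the switching cores, rounding up when the division is
		 * not exact.
		 */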
2327 		if (num_devices % num_switching_cores)
2328 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2329 		else
2330 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2331 	}
2332 
2333 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2334 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2335 
2336 	return 0;
2337 }
2338 
2339 /*
2340  * Set virtqueue flags so that we do not receive interrupts.
2341  */
2342 static void
2343 set_irq_status (struct virtio_net *dev)
2344 {
2345 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2346 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2347 }
2348 
2349 /*
2350  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2351  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2352  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2353  */
2354 static void
2355 destroy_device (volatile struct virtio_net *dev)
2356 {
2357 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2358 	struct virtio_net_data_ll *ll_main_dev_cur;
2359 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2360 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2361 	struct vhost_dev *vdev;
2362 	int lcore;
2363 
2364 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2365 
2366 	vdev = (struct vhost_dev *)dev->priv;
2367 	/* Set the remove flag. */
2368 	vdev->remove = 1;
2369 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2370 		rte_pause();
2371 	}
2372 
2373 	/* Search for entry to be removed from lcore ll */
2374 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2375 	while (ll_lcore_dev_cur != NULL) {
2376 		if (ll_lcore_dev_cur->vdev == vdev) {
2377 			break;
2378 		} else {
2379 			ll_lcore_dev_last = ll_lcore_dev_cur;
2380 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2381 		}
2382 	}
2383 
2384 	if (ll_lcore_dev_cur == NULL) {
2385 		RTE_LOG(ERR, VHOST_CONFIG,
2386 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2387 			dev->device_fh);
2388 		return;
2389 	}
2390 
2391 	/* Search for entry to be removed from main ll */
2392 	ll_main_dev_cur = ll_root_used;
2393 	ll_main_dev_last = NULL;
2394 	while (ll_main_dev_cur != NULL) {
2395 		if (ll_main_dev_cur->vdev == vdev) {
2396 			break;
2397 		} else {
2398 			ll_main_dev_last = ll_main_dev_cur;
2399 			ll_main_dev_cur = ll_main_dev_cur->next;
2400 		}
2401 	}
2402 
2403 	/* Remove entries from the lcore and main ll. */
2404 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2405 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2406 
2407 	/* Set the dev_removal_flag on each lcore. */
2408 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2409 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2410 	}
2411 
2412 	/*
2413 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2414 	 * they can no longer access the device removed from the linked lists and that the devices
2415 	 * are no longer in use.
2416 	 */
2417 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2418 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2419 			rte_pause();
2420 		}
2421 	}
2422 
2423 	/* Add the entries back to the lcore and main free ll.*/
2424 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2425 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2426 
2427 	/* Decrement the number of devices on the lcore. */
2428 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2429 
2430 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2431 
2432 	if (zero_copy) {
2433 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2434 
2435 		/* Stop the RX queue. */
2436 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2437 			LOG_DEBUG(VHOST_CONFIG,
2438 				"(%"PRIu64") In destroy_device: Failed to stop "
2439 				"rx queue:%d\n",
2440 				dev->device_fh,
2441 				vdev->vmdq_rx_q);
2442 		}
2443 
2444 		LOG_DEBUG(VHOST_CONFIG,
2445 			"(%"PRIu64") in destroy_device: start putting mbufs from "
2446 			"mempool back to ring for RX queue: %d\n",
2447 			dev->device_fh, vdev->vmdq_rx_q);
2448 
2449 		mbuf_destroy_zcp(vpool);
2450 
2451 		/* Stop the TX queue. */
2452 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2453 			LOG_DEBUG(VHOST_CONFIG,
2454 				"(%"PRIu64") In destroy_device: Failed to "
2455 				"stop tx queue:%d\n",
2456 				dev->device_fh, vdev->vmdq_rx_q);
2457 		}
2458 
2459 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2460 
2461 		LOG_DEBUG(VHOST_CONFIG,
2462 			"(%"PRIu64") destroy_device: start putting mbufs from mempool "
2463 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2464 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2465 			dev->device_fh);
2466 
2467 		mbuf_destroy_zcp(vpool);
2468 		rte_free(vdev->regions_hpa);
2469 	}
2470 	rte_free(vdev);
2471 
2472 }
2473 
2474 /*
2475  * Count the physical address discontinuities (i.e. extra sub-regions) within
2476  * one region whose vhost virtual address range is contiguous. The region
2477  * starts at vva_start and is 'size' bytes long.
2478  */
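/*
 * For example, if the host physical pages backing the region read
 * 0x1000, 0x2000, 0x5000, 0x6000 (page size 0x1000), there is one break
 * between 0x2000 and 0x5000 and the function returns 1.
 */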
2479 static uint32_t
2480 check_hpa_regions(uint64_t vva_start, uint64_t size)
2481 {
2482 	uint32_t i, nregions = 0, page_size = getpagesize();
2483 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2484 	if (vva_start % page_size) {
2485 		LOG_DEBUG(VHOST_CONFIG,
2486 			"in check_continuous: vva start(%p) mod page_size(%d) "
2487 			"has remainder\n",
2488 			(void *)(uintptr_t)vva_start, page_size);
2489 		return 0;
2490 	}
2491 	if (size % page_size) {
2492 		LOG_DEBUG(VHOST_CONFIG,
2493 			"in check_continuous: "
2494 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2495 			size, page_size);
2496 		return 0;
2497 	}
2498 	for (i = 0; i < size - page_size; i = i + page_size) {
2499 		cur_phys_addr
2500 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2501 		next_phys_addr = rte_mem_virt2phy(
2502 			(void *)(uintptr_t)(vva_start + i + page_size));
2503 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2504 			++nregions;
2505 			LOG_DEBUG(VHOST_CONFIG,
2506 				"in check_continuous: hva addr:(%p) is not "
2507 				"continuous with hva addr:(%p), diff:%d\n",
2508 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2509 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2510 				+ page_size), page_size);
2511 			LOG_DEBUG(VHOST_CONFIG,
2512 				"in check_continuous: hpa addr:(%p) is not "
2513 				"continuous with hpa addr:(%p), "
2514 				"diff:(%"PRIu64")\n",
2515 				(void *)(uintptr_t)cur_phys_addr,
2516 				(void *)(uintptr_t)next_phys_addr,
2517 				(next_phys_addr-cur_phys_addr));
2518 		}
2519 	}
2520 	return nregions;
2521 }
2522 
2523 /*
2524  * Divide each region whose vhost virtual address range is contiguous into
2525  * sub-regions such that the physical addresses within each sub-region are
2526  * contiguous, and fill the offset (to GPA), size and other information of
2527  * each sub-region into regions_hpa.
2528  */
2529 static uint32_t
2530 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2531 {
2532 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2533 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2534 
2535 	if (mem_region_hpa == NULL)
2536 		return 0;
2537 
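	/*
	 * Walk each guest region page by page; whenever the host physical
	 * address stops being contiguous, close the current sub-region
	 * (recording its end address and size) and start a new one at the
	 * next page.
	 */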
2538 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2539 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2540 			virtio_memory->regions[regionidx].address_offset;
2541 		mem_region_hpa[regionidx_hpa].guest_phys_address
2542 			= virtio_memory->regions[regionidx].guest_phys_address;
2543 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2544 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2545 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2546 		LOG_DEBUG(VHOST_CONFIG,
2547 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2548 			regionidx_hpa,
2549 			(void *)(uintptr_t)
2550 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2551 		LOG_DEBUG(VHOST_CONFIG,
2552 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2553 			regionidx_hpa,
2554 			(void *)(uintptr_t)
2555 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2556 		for (i = 0, k = 0;
2557 			i < virtio_memory->regions[regionidx].memory_size -
2558 				page_size;
2559 			i += page_size) {
2560 			cur_phys_addr = rte_mem_virt2phy(
2561 					(void *)(uintptr_t)(vva_start + i));
2562 			next_phys_addr = rte_mem_virt2phy(
2563 					(void *)(uintptr_t)(vva_start +
2564 					i + page_size));
2565 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2566 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2567 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2568 					k + page_size;
2569 				mem_region_hpa[regionidx_hpa].memory_size
2570 					= k + page_size;
2571 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2572 					"phys addr end  [%d]:(%p)\n",
2573 					regionidx_hpa,
2574 					(void *)(uintptr_t)
2575 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2576 				LOG_DEBUG(VHOST_CONFIG,
2577 					"in fill_hpa_regions: guest phys addr "
2578 					"size [%d]:(%p)\n",
2579 					regionidx_hpa,
2580 					(void *)(uintptr_t)
2581 					(mem_region_hpa[regionidx_hpa].memory_size));
2582 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2583 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2584 				++regionidx_hpa;
2585 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2586 					next_phys_addr -
2587 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2588 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2589 					" phys addr start[%d]:(%p)\n",
2590 					regionidx_hpa,
2591 					(void *)(uintptr_t)
2592 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2593 				LOG_DEBUG(VHOST_CONFIG,
2594 					"in fill_hpa_regions: host  phys addr "
2595 					"start[%d]:(%p)\n",
2596 					regionidx_hpa,
2597 					(void *)(uintptr_t)
2598 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2599 				k = 0;
2600 			} else {
2601 				k += page_size;
2602 			}
2603 		}
2604 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2605 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2606 			+ k + page_size;
2607 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2608 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2609 			"[%d]:(%p)\n", regionidx_hpa,
2610 			(void *)(uintptr_t)
2611 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2612 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2613 			"[%d]:(%p)\n", regionidx_hpa,
2614 			(void *)(uintptr_t)
2615 			(mem_region_hpa[regionidx_hpa].memory_size));
2616 		++regionidx_hpa;
2617 	}
2618 	return regionidx_hpa;
2619 }
2620 
2621 /*
2622  * A new device is added to a data core. First the device is added to the main linked list
2623  * and then allocated to a specific data core.
2624  */
2625 static int
2626 new_device (struct virtio_net *dev)
2627 {
2628 	struct virtio_net_data_ll *ll_dev;
2629 	int lcore, core_add = 0;
2630 	uint32_t device_num_min = num_devices;
2631 	struct vhost_dev *vdev;
2632 	uint32_t regionidx;
2633 
2634 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2635 	if (vdev == NULL) {
2636 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2637 			dev->device_fh);
2638 		return -1;
2639 	}
2640 	vdev->dev = dev;
2641 	dev->priv = vdev;
2642 
2643 	if (zero_copy) {
2644 		vdev->nregions_hpa = dev->mem->nregions;
2645 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2646 			vdev->nregions_hpa
2647 				+= check_hpa_regions(
2648 					dev->mem->regions[regionidx].guest_phys_address
2649 					+ dev->mem->regions[regionidx].address_offset,
2650 					dev->mem->regions[regionidx].memory_size);
2651 
2652 		}
2653 
2654 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2655 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2656 			CACHE_LINE_SIZE);
2657 		if (vdev->regions_hpa == NULL) {
2658 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2659 			rte_free(vdev);
2660 			return -1;
2661 		}
2662 
2663 
2664 		if (fill_hpa_memory_regions(
2665 			vdev->regions_hpa, dev->mem
2666 			) != vdev->nregions_hpa) {
2667 
2668 			RTE_LOG(ERR, VHOST_CONFIG,
2669 				"hpa memory regions number mismatch: "
2670 				"[%d]\n", vdev->nregions_hpa);
2671 			rte_free(vdev->regions_hpa);
2672 			rte_free(vdev);
2673 			return -1;
2674 		}
2675 	}
2676 
2677 
2678 	/* Add device to main ll */
2679 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2680 	if (ll_dev == NULL) {
2681 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2682 			"of %d devices per core has been reached\n",
2683 			dev->device_fh, num_devices);
2684 		if (vdev->regions_hpa)
2685 			rte_free(vdev->regions_hpa);
2686 		rte_free(vdev);
2687 		return -1;
2688 	}
2689 	ll_dev->vdev = vdev;
2690 	add_data_ll_entry(&ll_root_used, ll_dev);
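	/*
	 * Each device gets its own VMDq RX queue: consecutive device_fh
	 * values are spaced num_queues / num_devices queues apart.
	 */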
2691 	vdev->vmdq_rx_q
2692 		= dev->device_fh * (num_queues / num_devices);
2693 
2694 	if (zero_copy) {
2695 		uint32_t index = vdev->vmdq_rx_q;
2696 		uint32_t count_in_ring, i;
2697 		struct mbuf_table *tx_q;
2698 
2699 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2700 
2701 		LOG_DEBUG(VHOST_CONFIG,
2702 			"(%"PRIu64") in new_device: mbuf count in mempool "
2703 			"before attach is: %d\n",
2704 			dev->device_fh,
2705 			rte_mempool_count(vpool_array[index].pool));
2706 		LOG_DEBUG(VHOST_CONFIG,
2707 			"(%"PRIu64") in new_device: mbuf count in ring "
2708 			"before attach is: %d\n",
2709 			dev->device_fh, count_in_ring);
2710 
2711 		/*
2712 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2713 		 */
2714 		for (i = 0; i < count_in_ring; i++)
2715 			attach_rxmbuf_zcp(dev);
2716 
2717 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2718 			"mempool after attach is: %d\n",
2719 			dev->device_fh,
2720 			rte_mempool_count(vpool_array[index].pool));
2721 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2722 			"ring after attach is: %d\n",
2723 			dev->device_fh,
2724 			rte_ring_count(vpool_array[index].ring));
2725 
2726 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2727 		tx_q->txq_id = vdev->vmdq_rx_q;
2728 
2729 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2730 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2731 
2732 			LOG_DEBUG(VHOST_CONFIG,
2733 				"(%"PRIu64") In new_device: Failed to start "
2734 				"tx queue:%d\n",
2735 				dev->device_fh, vdev->vmdq_rx_q);
2736 
2737 			mbuf_destroy_zcp(vpool);
2738 			rte_free(vdev->regions_hpa);
2739 			rte_free(vdev);
2740 			return -1;
2741 		}
2742 
2743 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2744 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2745 
2746 			LOG_DEBUG(VHOST_CONFIG,
2747 				"(%"PRIu64") In new_device: Failed to start "
2748 				"rx queue:%d\n",
2749 				dev->device_fh, vdev->vmdq_rx_q);
2750 
2751 			/* Stop the TX queue. */
2752 			if (rte_eth_dev_tx_queue_stop(ports[0],
2753 				vdev->vmdq_rx_q) != 0) {
2754 				LOG_DEBUG(VHOST_CONFIG,
2755 					"(%"PRIu64") In new_device: Failed to "
2756 					"stop tx queue:%d\n",
2757 					dev->device_fh, vdev->vmdq_rx_q);
2758 			}
2759 
2760 			mbuf_destroy_zcp(vpool);
2761 			rte_free(vdev->regions_hpa);
2762 			rte_free(vdev);
2763 			return -1;
2764 		}
2765 
2766 	}
2767 
2768 	/*reset ready flag*/
2769 	vdev->ready = DEVICE_MAC_LEARNING;
2770 	vdev->remove = 0;
2771 
2772 	/* Find a suitable lcore to add the device. */
2773 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2774 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2775 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2776 			core_add = lcore;
2777 		}
2778 	}
2779 	/* Add device to lcore ll */
2780 	ll_dev->dev->coreid = core_add;
2781 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2782 	if (ll_dev == NULL) {
2783 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2784 		vdev->ready = DEVICE_SAFE_REMOVE;
2785 		destroy_device(dev);
2786 		if (vdev->regions_hpa)
2787 			rte_free(vdev->regions_hpa);
2788 		rte_free(vdev);
2789 		return -1;
2790 	}
2791 	ll_dev->vdev = vdev;
2792 	vdev->coreid = core_add;
2793 
2794 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2795 
2796 	/* Initialize device stats */
2797 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2798 
2799 	/* Disable notifications. */
2800 	set_irq_status(dev);
2801 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2802 	dev->flags |= VIRTIO_DEV_RUNNING;
2803 
2804 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2805 
2806 	return 0;
2807 }
2808 
2809 /*
2810  * These callbacks allow devices to be added to the data core when
2811  * configuration has been fully completed.
2812  */
2813 static const struct virtio_net_device_ops virtio_net_device_ops =
2814 {
2815 	.new_device =  new_device,
2816 	.destroy_device = destroy_device,
2817 };
2818 
2819 /*
2820  * This is a thread that wakes up periodically to print stats if the user
2821  * has enabled them.
2822  */
2823 static void
2824 print_stats(void)
2825 {
2826 	struct virtio_net_data_ll *dev_ll;
2827 	uint64_t tx_dropped, rx_dropped;
2828 	uint64_t tx, tx_total, rx, rx_total;
2829 	uint32_t device_fh;
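	/* ANSI escape sequences used to clear the screen and home the cursor. */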
2830 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2831 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2832 
2833 	while(1) {
2834 		sleep(enable_stats);
2835 
2836 		/* Clear screen and move to top left */
2837 		printf("%s%s", clr, top_left);
2838 
2839 		printf("\nDevice statistics ====================================");
2840 
2841 		dev_ll = ll_root_used;
2842 		while (dev_ll != NULL) {
2843 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2844 			tx_total = dev_statistics[device_fh].tx_total;
2845 			tx = dev_statistics[device_fh].tx;
2846 			tx_dropped = tx_total - tx;
2847 			if (zero_copy == 0) {
2848 				rx_total = rte_atomic64_read(
2849 					&dev_statistics[device_fh].rx_total_atomic);
2850 				rx = rte_atomic64_read(
2851 					&dev_statistics[device_fh].rx_atomic);
2852 			} else {
2853 				rx_total = dev_statistics[device_fh].rx_total;
2854 				rx = dev_statistics[device_fh].rx;
2855 			}
2856 			rx_dropped = rx_total - rx;
2857 
2858 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2859 					"\nTX total: 		%"PRIu64""
2860 					"\nTX dropped: 		%"PRIu64""
2861 					"\nTX successful: 		%"PRIu64""
2862 					"\nRX total: 		%"PRIu64""
2863 					"\nRX dropped: 		%"PRIu64""
2864 					"\nRX successful: 		%"PRIu64"",
2865 					device_fh,
2866 					tx_total,
2867 					tx_dropped,
2868 					tx,
2869 					rx_total,
2870 					rx_dropped,
2871 					rx);
2872 
2873 			dev_ll = dev_ll->next;
2874 		}
2875 		printf("\n======================================================\n");
2876 	}
2877 }
2878 
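/*
 * Create the mempool/ring pair used by one zero copy queue. The mempool
 * supplies the mbuf headers; the ring is used by the data path to shuttle
 * mbufs between the pool and the zero copy attach/detach logic.
 */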
2879 static void
2880 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2881 	char *ring_name, uint32_t nb_mbuf)
2882 {
2883 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2884 	vpool_array[index].pool
2885 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2886 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2887 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2888 		rte_pktmbuf_init, NULL, socket, 0);
2889 	if (vpool_array[index].pool != NULL) {
2890 		vpool_array[index].ring
2891 			= rte_ring_create(ring_name,
2892 				rte_align32pow2(nb_mbuf + 1),
2893 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2894 		if (likely(vpool_array[index].ring != NULL)) {
2895 			LOG_DEBUG(VHOST_CONFIG,
2896 				"in setup_mempool_tbl: mbuf count in "
2897 				"mempool is: %d\n",
2898 				rte_mempool_count(vpool_array[index].pool));
2899 			LOG_DEBUG(VHOST_CONFIG,
2900 				"in setup_mempool_tbl: mbuf count in "
2901 				"ring   is: %d\n",
2902 				rte_ring_count(vpool_array[index].ring));
2903 		} else {
2904 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2905 				ring_name);
2906 		}
2907 
2908 		/* Need consider head room. */
2909 		/* Need to consider the headroom. */
2910 	} else {
2911 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2912 	}
2913 }
2914 
2915 
2916 /*
2917  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2918  * device is also registered here to handle the IOCTLs.
2919  */
2920 int
2921 MAIN(int argc, char *argv[])
2922 {
2923 	struct rte_mempool *mbuf_pool = NULL;
2924 	unsigned lcore_id, core_id = 0;
2925 	unsigned nb_ports, valid_num_ports;
2926 	int ret;
2927 	uint8_t portid, queue_id = 0;
2928 	static pthread_t tid;
2929 
2930 	/* init EAL */
2931 	ret = rte_eal_init(argc, argv);
2932 	if (ret < 0)
2933 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2934 	argc -= ret;
2935 	argv += ret;
2936 
2937 	/* parse app arguments */
2938 	ret = us_vhost_parse_args(argc, argv);
2939 	if (ret < 0)
2940 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2941 
2942 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2943 		if (rte_lcore_is_enabled(lcore_id))
2944 			lcore_ids[core_id ++] = lcore_id;
2945 
2946 	if (rte_lcore_count() > RTE_MAX_LCORE)
2947 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2948 
2949 	/* Set the number of switching cores available. */
2950 	num_switching_cores = rte_lcore_count()-1;
2951 
2952 	/* Get the number of physical ports. */
2953 	nb_ports = rte_eth_dev_count();
2954 	if (nb_ports > RTE_MAX_ETHPORTS)
2955 		nb_ports = RTE_MAX_ETHPORTS;
2956 
2957 	/*
2958 	 * Update the global var num_ports and the global array ports,
2959 	 * and get valid_num_ports according to the system port count.
2960 	 */
2961 	valid_num_ports = check_ports_num(nb_ports);
2962 
2963 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2964 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2965 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2966 		return -1;
2967 	}
2968 
2969 	if (zero_copy == 0) {
2970 		/* Create the mbuf pool. */
2971 		mbuf_pool = rte_mempool_create(
2972 				"MBUF_POOL",
2973 				NUM_MBUFS_PER_PORT
2974 				* valid_num_ports,
2975 				MBUF_SIZE, MBUF_CACHE_SIZE,
2976 				sizeof(struct rte_pktmbuf_pool_private),
2977 				rte_pktmbuf_pool_init, NULL,
2978 				rte_pktmbuf_init, NULL,
2979 				rte_socket_id(), 0);
2980 		if (mbuf_pool == NULL)
2981 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2982 
2983 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2984 			vpool_array[queue_id].pool = mbuf_pool;
2985 
2986 		if (vm2vm_mode == VM2VM_HARDWARE) {
2987 			/* Enable VT loop back to let L2 switch to do it. */
2988 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2989 			LOG_DEBUG(VHOST_CONFIG,
2990 				"Enable loop back for L2 switch in vmdq.\n");
2991 		}
2992 	} else {
2993 		uint32_t nb_mbuf;
2994 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2995 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2996 
2997 		/*
2998 		 * Zero copy defers queue RX/TX start to the time when guest
2999 		 * finishes its startup and packet buffers from that guest are
3000 		 * available.
3001 		 */
3002 		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
3003 		rx_conf_default.rx_drop_en = 0;
3004 		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
3005 		nb_mbuf = num_rx_descriptor
3006 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3007 			+ num_switching_cores * MAX_PKT_BURST;
3008 
3009 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3010 			snprintf(pool_name, sizeof(pool_name),
3011 				"rxmbuf_pool_%u", queue_id);
3012 			snprintf(ring_name, sizeof(ring_name),
3013 				"rxmbuf_ring_%u", queue_id);
3014 			setup_mempool_tbl(rte_socket_id(), queue_id,
3015 				pool_name, ring_name, nb_mbuf);
3016 		}
3017 
3018 		nb_mbuf = num_tx_descriptor
3019 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3020 				+ num_switching_cores * MAX_PKT_BURST;
3021 
3022 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3023 			snprintf(pool_name, sizeof(pool_name),
3024 				"txmbuf_pool_%u", queue_id);
3025 			snprintf(ring_name, sizeof(ring_name),
3026 				"txmbuf_ring_%u", queue_id);
3027 			setup_mempool_tbl(rte_socket_id(),
3028 				(queue_id + MAX_QUEUES),
3029 				pool_name, ring_name, nb_mbuf);
3030 		}
3031 
3032 		if (vm2vm_mode == VM2VM_HARDWARE) {
3033 			/* Enable VT loop back to let L2 switch to do it. */
3034 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3035 			LOG_DEBUG(VHOST_CONFIG,
3036 				"Enable loop back for L2 switch in vmdq.\n");
3037 		}
3038 	}
3039 	/* Set log level. */
3040 	rte_set_log_level(LOG_LEVEL);
3041 
3042 	/* initialize all ports */
3043 	for (portid = 0; portid < nb_ports; portid++) {
3044 		/* skip ports that are not enabled */
3045 		if ((enabled_port_mask & (1 << portid)) == 0) {
3046 			RTE_LOG(INFO, VHOST_PORT,
3047 				"Skipping disabled port %d\n", portid);
3048 			continue;
3049 		}
3050 		if (port_init(portid) != 0)
3051 			rte_exit(EXIT_FAILURE,
3052 				"Cannot initialize network ports\n");
3053 	}
3054 
3055 	/* Initialise all linked lists. */
3056 	if (init_data_ll() == -1)
3057 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3058 
3059 	/* Initialize device stats */
3060 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3061 
3062 	/* Enable stats if the user option is set. */
3063 	if (enable_stats)
3064 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3065 
3066 	/* Launch all data cores. */
3067 	if (zero_copy == 0) {
3068 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3069 			rte_eal_remote_launch(switch_worker,
3070 				mbuf_pool, lcore_id);
3071 		}
3072 	} else {
3073 		uint32_t count_in_mempool, index, i;
3074 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3075 			/* For all RX and TX queues. */
3076 			count_in_mempool
3077 				= rte_mempool_count(vpool_array[index].pool);
3078 
3079 			/*
3080 			 * Transfer all unattached mbufs from vpool.pool
3081 			 * to vpool.ring.
3082 			 */
3083 			for (i = 0; i < count_in_mempool; i++) {
3084 				struct rte_mbuf *mbuf
3085 					= __rte_mbuf_raw_alloc(
3086 						vpool_array[index].pool);
3087 				rte_ring_sp_enqueue(vpool_array[index].ring,
3088 						(void *)mbuf);
3089 			}
3090 
3091 			LOG_DEBUG(VHOST_CONFIG,
3092 				"in MAIN: mbuf count in mempool at initial "
3093 				"is: %d\n", count_in_mempool);
3094 			LOG_DEBUG(VHOST_CONFIG,
3095 				"in MAIN: mbuf count in ring at initial is: "
3096 				"%d\n",
3097 				rte_ring_count(vpool_array[index].ring));
3098 		}
3099 
3100 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3101 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3102 				lcore_id);
3103 	}
3104 
3105 	/* Register CUSE device to handle IOCTLs. */
3106 	ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
3107 	if (ret != 0)
3108 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3109 
3110 	init_virtio_net(&virtio_net_device_ops);
3111 
3112 	/* Start CUSE session. */
3113 	start_cuse_session_loop();
3114 	return 0;
3115 
3116 }
3117 
3118