xref: /dpdk/examples/vhost/main.c (revision 1f49ec153c8f91ee34c23e58c7443eb87f566b60)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 
56 #include "main.h"
57 
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61 
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64 
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
69 							(num_switching_cores*MAX_PKT_BURST) +  			\
70 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71 							((num_switching_cores+1)*MBUF_CACHE_SIZE))
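/*
 * Worked example (illustrative only, not used by the code): with the values
 * defined in this file (MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024,
 * RTE_TEST_TX_DESC_DEFAULT = 512, MAX_PKT_BURST = 32, MBUF_CACHE_SIZE = 128)
 * and an assumed num_switching_cores of 2, this evaluates to
 * 128*1024 + 2*32 + 2*512 + 3*128 = 132544 mbufs per port.
 */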
72 
73 #define MBUF_CACHE_SIZE	128
74 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
75 
76 /*
77  * No frame data buffers allocated from the host are required for the zero
78  * copy implementation; the guest allocates the frame data buffers and vhost
79  * uses them directly.
80  */
81 #define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
82 #define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
83 #define MBUF_CACHE_SIZE_ZCP 0
84 
85 #define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
86 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
87 
88 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
89 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
90 
91 #define JUMBO_FRAME_MAX_SIZE    0x2600
92 
93 /* State of virtio device. */
94 #define DEVICE_MAC_LEARNING 0
95 #define DEVICE_RX			1
96 #define DEVICE_SAFE_REMOVE	2
97 
98 /* Config_core_flag status definitions. */
99 #define REQUEST_DEV_REMOVAL 1
100 #define ACK_DEV_REMOVAL 0
101 
102 /* Configurable number of RX/TX ring descriptors */
103 #define RTE_TEST_RX_DESC_DEFAULT 1024
104 #define RTE_TEST_TX_DESC_DEFAULT 512
105 
106 /*
107  * These two macros need refining for the legacy and DPDK based front ends:
108  * take the max vring avail descriptors/entries from the guest, subtract
109  * MAX_PKT_BURST, and then round to a power of 2.
110  */
111 /*
112  * For the legacy front end: 128 descriptors,
113  * half for the virtio header, the other half for the mbuf.
114  */
115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
117 
118 /* Get first 4 bytes in mbuf headroom. */
119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
120 		+ sizeof(struct rte_mbuf)))
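/*
 * In the zero copy path the guest descriptor index is stashed in these four
 * bytes of headroom (see attach_rxmbuf_zcp() and txmbuf_clean_zcp() below),
 * so the index can be recovered when the mbuf is returned to the used ring.
 */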
121 
122 /* true if x is a power of 2 */
123 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
124 
125 #define INVALID_PORT_ID 0xFF
126 
127 /* Max number of devices. Limited by vmdq. */
128 #define MAX_DEVICES 64
129 
130 /* Size of buffers used for snprintfs. */
131 #define MAX_PRINT_BUFF 6072
132 
133 /* Maximum character device basename size. */
134 #define MAX_BASENAME_SZ 10
135 
136 /* Maximum long option length for option parsing. */
137 #define MAX_LONG_OPT_SZ 64
138 
139 /* Used to compare MAC addresses. */
140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
141 
142 /* Number of descriptors per cacheline. */
143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
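/*
 * For example, assuming a 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr, 4-byte len, 2-byte flags, 2-byte next), this works out to
 * 4 descriptors per cache line.
 */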
144 
145 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
146 
147 /* mask of enabled ports */
148 static uint32_t enabled_port_mask = 0;
149 
150 /* Promiscuous mode */
151 static uint32_t promiscuous;
152 
153 /* Number of switching cores enabled */
154 static uint32_t num_switching_cores = 0;
155 
156 /* Number of devices/queues to support */
157 static uint32_t num_queues = 0;
158 static uint32_t num_devices;
159 
160 /*
161  * Enable zero copy: packet buffers are DMA'd directly to the HW descriptors.
162  * Disabled by default.
163  */
164 static uint32_t zero_copy;
165 static int mergeable;
166 
167 /* Do VLAN strip on the host, enabled by default */
168 static uint32_t vlan_strip = 1;
169 
170 /* Number of descriptors to apply */
171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
173 
174 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
175 #define MAX_RING_DESC 4096
176 
177 struct vpool {
178 	struct rte_mempool *pool;
179 	struct rte_ring *ring;
180 	uint32_t buf_size;
181 } vpool_array[MAX_QUEUES+MAX_QUEUES];
182 
183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
184 typedef enum {
185 	VM2VM_DISABLED = 0,
186 	VM2VM_SOFTWARE = 1,
187 	VM2VM_HARDWARE = 2,
188 	VM2VM_LAST
189 } vm2vm_type;
190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
191 
192 /* The type of host physical address translated from guest physical address. */
193 typedef enum {
194 	PHYS_ADDR_CONTINUOUS = 0,
195 	PHYS_ADDR_CROSS_SUBREG = 1,
196 	PHYS_ADDR_INVALID = 2,
197 	PHYS_ADDR_LAST
198 } hpa_type;
199 
200 /* Enable stats. */
201 static uint32_t enable_stats = 0;
202 /* Enable retries on RX. */
203 static uint32_t enable_retry = 1;
204 
205 /* Disable TX checksum offload */
206 static uint32_t enable_tx_csum;
207 
208 /* Disable TSO offload */
209 static uint32_t enable_tso;
210 
211 /* Specify the timeout (in microseconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215 
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218 
219 /* Empty VMDQ configuration structure. Filled in programmatically. */
220 static struct rte_eth_conf vmdq_conf_default = {
221 	.rxmode = {
222 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
223 		.split_hdr_size = 0,
224 		.header_split   = 0, /**< Header Split disabled */
225 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
226 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
227 		/*
228 		 * This is necessary for 1G NICs such as the I350; it fixes a bug
229 		 * where IPv4 forwarding in the guest cannot forward packets from
230 		 * one virtio device to another.
231 		 */
232 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
233 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
234 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
235 	},
236 
237 	.txmode = {
238 		.mq_mode = ETH_MQ_TX_NONE,
239 	},
240 	.rx_adv_conf = {
241 		/*
242 		 * should be overridden separately in code with
243 		 * appropriate values
244 		 */
245 		.vmdq_rx_conf = {
246 			.nb_queue_pools = ETH_8_POOLS,
247 			.enable_default_pool = 0,
248 			.default_pool = 0,
249 			.nb_pool_maps = 0,
250 			.pool_map = {{0, 0},},
251 		},
252 	},
253 };
254 
255 static unsigned lcore_ids[RTE_MAX_LCORE];
256 static uint8_t ports[RTE_MAX_ETHPORTS];
257 static unsigned num_ports = 0; /**< The number of ports specified in command line */
258 static uint16_t num_pf_queues, num_vmdq_queues;
259 static uint16_t vmdq_pool_base, vmdq_queue_base;
260 static uint16_t queues_per_pool;
261 
262 static const uint16_t external_pkt_default_vlan_tag = 2000;
263 const uint16_t vlan_tags[] = {
264 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
265 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
266 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
267 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
268 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
269 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
270 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
271 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
272 };
273 
274 /* ethernet addresses of ports */
275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
276 
277 /* heads for the main used and free linked lists for the data path. */
278 static struct virtio_net_data_ll *ll_root_used = NULL;
279 static struct virtio_net_data_ll *ll_root_free = NULL;
280 
281 /* Array of data core structures containing information on individual core linked lists. */
282 static struct lcore_info lcore_info[RTE_MAX_LCORE];
283 
284 /* Used for queueing bursts of TX packets. */
285 struct mbuf_table {
286 	unsigned len;
287 	unsigned txq_id;
288 	struct rte_mbuf *m_table[MAX_PKT_BURST];
289 };
290 
291 /* TX queue for each data core. */
292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
293 
294 /* TX queue for each virtio device for zero copy. */
295 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
296 
297 /* VLAN header struct used to insert VLAN tags on TX. */
298 struct vlan_ethhdr {
299 	unsigned char   h_dest[ETH_ALEN];
300 	unsigned char   h_source[ETH_ALEN];
301 	__be16          h_vlan_proto;
302 	__be16          h_vlan_TCI;
303 	__be16          h_vlan_encapsulated_proto;
304 };
305 
306 /* Header lengths. */
307 #define VLAN_HLEN       4
308 #define VLAN_ETH_HLEN   18
309 
310 /* Per-device statistics struct */
311 struct device_statistics {
312 	uint64_t tx_total;
313 	rte_atomic64_t rx_total_atomic;
314 	uint64_t rx_total;
315 	uint64_t tx;
316 	rte_atomic64_t rx_atomic;
317 	uint64_t rx;
318 } __rte_cache_aligned;
319 struct device_statistics dev_statistics[MAX_DEVICES];
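/*
 * The RX counters are atomic because a device's RX statistics may be updated
 * from another core's TX path (see virtio_tx_local() below); the TX counters
 * are only ever touched by the core that processes the device's TX traffic.
 */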
320 
321 /*
322  * Builds up the correct configuration for VMDQ VLAN pool map
323  * according to the pool & queue limits.
324  */
325 static inline int
326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
327 {
328 	struct rte_eth_vmdq_rx_conf conf;
329 	struct rte_eth_vmdq_rx_conf *def_conf =
330 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
331 	unsigned i;
332 
333 	memset(&conf, 0, sizeof(conf));
334 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
335 	conf.nb_pool_maps = num_devices;
336 	conf.enable_loop_back = def_conf->enable_loop_back;
337 	conf.rx_mode = def_conf->rx_mode;
338 
339 	for (i = 0; i < conf.nb_pool_maps; i++) {
340 		conf.pool_map[i].vlan_id = vlan_tags[i];
341 		conf.pool_map[i].pools = (1UL << i);
342 	}
343 
344 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
345 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
346 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
347 	return 0;
348 }
349 
350 /*
351  * Validate the device number against the max pool number obtained from
352  * dev_info. If the device number is invalid, print an error message and
353  * return -1. Each device must have its own pool.
354  */
355 static inline int
356 validate_num_devices(uint32_t max_nb_devices)
357 {
358 	if (num_devices > max_nb_devices) {
359 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
360 		return -1;
361 	}
362 	return 0;
363 }
364 
365 /*
366  * Initialises a given port using global settings and with the RX buffers
367  * coming from the mbuf_pool passed as a parameter.
368  */
369 static inline int
370 port_init(uint8_t port)
371 {
372 	struct rte_eth_dev_info dev_info;
373 	struct rte_eth_conf port_conf;
374 	struct rte_eth_rxconf *rxconf;
375 	struct rte_eth_txconf *txconf;
376 	int16_t rx_rings, tx_rings;
377 	uint16_t rx_ring_size, tx_ring_size;
378 	int retval;
379 	uint16_t q;
380 
381 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
382 	rte_eth_dev_info_get(port, &dev_info);
383 
384 	if (dev_info.max_rx_queues > MAX_QUEUES) {
385 		rte_exit(EXIT_FAILURE,
386 			"please define MAX_QUEUES no less than %u in %s\n",
387 			dev_info.max_rx_queues, __FILE__);
388 	}
389 
390 	rxconf = &dev_info.default_rxconf;
391 	txconf = &dev_info.default_txconf;
392 	rxconf->rx_drop_en = 1;
393 
394 	/* Enable vlan offload */
395 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
396 
397 	/*
398 	 * Zero copy defers queue RX/TX start to the time when guest
399 	 * finishes its startup and packet buffers from that guest are
400 	 * available.
401 	 */
402 	if (zero_copy) {
403 		rxconf->rx_deferred_start = 1;
404 		rxconf->rx_drop_en = 0;
405 		txconf->tx_deferred_start = 1;
406 	}
407 
408 	/* Configure the number of supported virtio devices based on VMDQ limits */
409 	num_devices = dev_info.max_vmdq_pools;
410 
411 	if (zero_copy) {
412 		rx_ring_size = num_rx_descriptor;
413 		tx_ring_size = num_tx_descriptor;
414 		tx_rings = dev_info.max_tx_queues;
415 	} else {
416 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
417 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
418 		tx_rings = (uint16_t)rte_lcore_count();
419 	}
420 
421 	retval = validate_num_devices(MAX_DEVICES);
422 	if (retval < 0)
423 		return retval;
424 
425 	/* Get port configuration. */
426 	retval = get_eth_conf(&port_conf, num_devices);
427 	if (retval < 0)
428 		return retval;
429 	/* NIC queues are divided into pf queues and vmdq queues.  */
430 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
431 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
432 	num_vmdq_queues = num_devices * queues_per_pool;
433 	num_queues = num_pf_queues + num_vmdq_queues;
434 	vmdq_queue_base = dev_info.vmdq_queue_base;
435 	vmdq_pool_base  = dev_info.vmdq_pool_base;
436 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
437 		num_pf_queues, num_devices, queues_per_pool);
438 
439 	if (port >= rte_eth_dev_count()) return -1;
440 
441 	if (enable_tx_csum == 0)
442 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
443 
444 	if (enable_tso == 0) {
445 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
446 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
447 	}
448 
449 	rx_rings = (uint16_t)dev_info.max_rx_queues;
450 	/* Configure ethernet device. */
451 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 	if (retval != 0)
453 		return retval;
454 
455 	/* Setup the queues. */
456 	for (q = 0; q < rx_rings; q ++) {
457 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 						rte_eth_dev_socket_id(port),
459 						rxconf,
460 						vpool_array[q].pool);
461 		if (retval < 0)
462 			return retval;
463 	}
464 	for (q = 0; q < tx_rings; q ++) {
465 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
466 						rte_eth_dev_socket_id(port),
467 						txconf);
468 		if (retval < 0)
469 			return retval;
470 	}
471 
472 	/* Start the device. */
473 	retval  = rte_eth_dev_start(port);
474 	if (retval < 0) {
475 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476 		return retval;
477 	}
478 
479 	if (promiscuous)
480 		rte_eth_promiscuous_enable(port);
481 
482 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
483 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
484 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
485 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
486 			(unsigned)port,
487 			vmdq_ports_eth_addr[port].addr_bytes[0],
488 			vmdq_ports_eth_addr[port].addr_bytes[1],
489 			vmdq_ports_eth_addr[port].addr_bytes[2],
490 			vmdq_ports_eth_addr[port].addr_bytes[3],
491 			vmdq_ports_eth_addr[port].addr_bytes[4],
492 			vmdq_ports_eth_addr[port].addr_bytes[5]);
493 
494 	return 0;
495 }
496 
497 /*
498  * Set character device basename.
499  */
500 static int
501 us_vhost_parse_basename(const char *q_arg)
502 {
503 	/* Reject basenames that do not fit in the buffer (including the NUL). */
504 
505 	if (strnlen(q_arg, MAX_BASENAME_SZ) == MAX_BASENAME_SZ)
506 		return -1;
507 	else
508 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509 
510 	return 0;
511 }
512 
513 /*
514  * Parse the portmask provided at run time.
515  */
516 static int
517 parse_portmask(const char *portmask)
518 {
519 	char *end = NULL;
520 	unsigned long pm;
521 
522 	errno = 0;
523 
524 	/* parse hexadecimal string */
525 	pm = strtoul(portmask, &end, 16);
526 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
527 		return -1;
528 
529 	if (pm == 0)
530 		return -1;
531 
532 	return pm;
533 
534 }
535 
536 /*
537  * Parse num options at run time.
538  */
539 static int
540 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 {
542 	char *end = NULL;
543 	unsigned long num;
544 
545 	errno = 0;
546 
547 	/* parse unsigned int string */
548 	num = strtoul(q_arg, &end, 10);
549 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
550 		return -1;
551 
552 	if (num > max_valid_value)
553 		return -1;
554 
555 	return num;
556 
557 }
558 
559 /*
560  * Display usage
561  */
562 static void
563 us_vhost_usage(const char *prgname)
564 {
565 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
566 	"		--vm2vm [0|1|2]\n"
567 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
568 	"		--dev-basename <name>\n"
569 	"		--nb-devices ND\n"
570 	"		-p PORTMASK: Set mask for ports to be used by application\n"
571 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
572 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
573 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
574 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
575 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
576 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
577 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
578 	"		--dev-basename: The basename to be used for the character device.\n"
579 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
580 			"zero copy\n"
581 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
582 			"used only when zero copy is enabled.\n"
583 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
584 			"used only when zero copy is enabled.\n"
585 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
586 	"		--tso [0|1] disable/enable TCP segment offload.\n",
587 	       prgname);
588 }
589 
590 /*
591  * Parse the arguments given in the command line of the application.
592  */
593 static int
594 us_vhost_parse_args(int argc, char **argv)
595 {
596 	int opt, ret;
597 	int option_index;
598 	unsigned i;
599 	const char *prgname = argv[0];
600 	static struct option long_option[] = {
601 		{"vm2vm", required_argument, NULL, 0},
602 		{"rx-retry", required_argument, NULL, 0},
603 		{"rx-retry-delay", required_argument, NULL, 0},
604 		{"rx-retry-num", required_argument, NULL, 0},
605 		{"mergeable", required_argument, NULL, 0},
606 		{"vlan-strip", required_argument, NULL, 0},
607 		{"stats", required_argument, NULL, 0},
608 		{"dev-basename", required_argument, NULL, 0},
609 		{"zero-copy", required_argument, NULL, 0},
610 		{"rx-desc-num", required_argument, NULL, 0},
611 		{"tx-desc-num", required_argument, NULL, 0},
612 		{"tx-csum", required_argument, NULL, 0},
613 		{"tso", required_argument, NULL, 0},
614 		{NULL, 0, 0, 0},
615 	};
616 
617 	/* Parse command line */
618 	while ((opt = getopt_long(argc, argv, "p:P",
619 			long_option, &option_index)) != EOF) {
620 		switch (opt) {
621 		/* Portmask */
622 		case 'p':
623 			enabled_port_mask = parse_portmask(optarg);
624 			if (enabled_port_mask == 0) {
625 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
626 				us_vhost_usage(prgname);
627 				return -1;
628 			}
629 			break;
630 
631 		case 'P':
632 			promiscuous = 1;
633 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
634 				ETH_VMDQ_ACCEPT_BROADCAST |
635 				ETH_VMDQ_ACCEPT_MULTICAST;
636 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
637 
638 			break;
639 
640 		case 0:
641 			/* Enable/disable vm2vm comms. */
642 			if (!strncmp(long_option[option_index].name, "vm2vm",
643 				MAX_LONG_OPT_SZ)) {
644 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
645 				if (ret == -1) {
646 					RTE_LOG(INFO, VHOST_CONFIG,
647 						"Invalid argument for "
648 						"vm2vm [0|1|2]\n");
649 					us_vhost_usage(prgname);
650 					return -1;
651 				} else {
652 					vm2vm_mode = (vm2vm_type)ret;
653 				}
654 			}
655 
656 			/* Enable/disable retries on RX. */
657 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
658 				ret = parse_num_opt(optarg, 1);
659 				if (ret == -1) {
660 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
661 					us_vhost_usage(prgname);
662 					return -1;
663 				} else {
664 					enable_retry = ret;
665 				}
666 			}
667 
668 			/* Enable/disable TX checksum offload. */
669 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
670 				ret = parse_num_opt(optarg, 1);
671 				if (ret == -1) {
672 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
673 					us_vhost_usage(prgname);
674 					return -1;
675 				} else
676 					enable_tx_csum = ret;
677 			}
678 
679 			/* Enable/disable TSO offload. */
680 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
681 				ret = parse_num_opt(optarg, 1);
682 				if (ret == -1) {
683 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
684 					us_vhost_usage(prgname);
685 					return -1;
686 				} else
687 					enable_tso = ret;
688 			}
689 
690 			/* Specify the retry delay time (in microseconds) on RX. */
691 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
692 				ret = parse_num_opt(optarg, INT32_MAX);
693 				if (ret == -1) {
694 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
695 					us_vhost_usage(prgname);
696 					return -1;
697 				} else {
698 					burst_rx_delay_time = ret;
699 				}
700 			}
701 
702 			/* Specify the number of retries on RX. */
703 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
704 				ret = parse_num_opt(optarg, INT32_MAX);
705 				if (ret == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
707 					us_vhost_usage(prgname);
708 					return -1;
709 				} else {
710 					burst_rx_retry_num = ret;
711 				}
712 			}
713 
714 			/* Enable/disable RX mergeable buffers. */
715 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
716 				ret = parse_num_opt(optarg, 1);
717 				if (ret == -1) {
718 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
719 					us_vhost_usage(prgname);
720 					return -1;
721 				} else {
722 					mergeable = !!ret;
723 					if (ret) {
724 						vmdq_conf_default.rxmode.jumbo_frame = 1;
725 						vmdq_conf_default.rxmode.max_rx_pkt_len
726 							= JUMBO_FRAME_MAX_SIZE;
727 					}
728 				}
729 			}
730 
731 			/* Enable/disable RX VLAN strip on host. */
732 			if (!strncmp(long_option[option_index].name,
733 				"vlan-strip", MAX_LONG_OPT_SZ)) {
734 				ret = parse_num_opt(optarg, 1);
735 				if (ret == -1) {
736 					RTE_LOG(INFO, VHOST_CONFIG,
737 						"Invalid argument for VLAN strip [0|1]\n");
738 					us_vhost_usage(prgname);
739 					return -1;
740 				} else {
741 					vlan_strip = !!ret;
742 					vmdq_conf_default.rxmode.hw_vlan_strip =
743 						vlan_strip;
744 				}
745 			}
746 
747 			/* Enable/disable stats. */
748 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
749 				ret = parse_num_opt(optarg, INT32_MAX);
750 				if (ret == -1) {
751 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
752 					us_vhost_usage(prgname);
753 					return -1;
754 				} else {
755 					enable_stats = ret;
756 				}
757 			}
758 
759 			/* Set character device basename. */
760 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
761 				if (us_vhost_parse_basename(optarg) == -1) {
762 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
763 					us_vhost_usage(prgname);
764 					return -1;
765 				}
766 			}
767 
768 			/* Enable/disable rx/tx zero copy. */
769 			if (!strncmp(long_option[option_index].name,
770 				"zero-copy", MAX_LONG_OPT_SZ)) {
771 				ret = parse_num_opt(optarg, 1);
772 				if (ret == -1) {
773 					RTE_LOG(INFO, VHOST_CONFIG,
774 						"Invalid argument"
775 						" for zero-copy [0|1]\n");
776 					us_vhost_usage(prgname);
777 					return -1;
778 				} else
779 					zero_copy = ret;
780 			}
781 
782 			/* Specify the descriptor number on RX. */
783 			if (!strncmp(long_option[option_index].name,
784 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
785 				ret = parse_num_opt(optarg, MAX_RING_DESC);
786 				if ((ret == -1) || (!POWEROF2(ret))) {
787 					RTE_LOG(INFO, VHOST_CONFIG,
788 					"Invalid argument for rx-desc-num[0-N],"
789 					"power of 2 required.\n");
790 					us_vhost_usage(prgname);
791 					return -1;
792 				} else {
793 					num_rx_descriptor = ret;
794 				}
795 			}
796 
797 			/* Specify the descriptor number on TX. */
798 			if (!strncmp(long_option[option_index].name,
799 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
800 				ret = parse_num_opt(optarg, MAX_RING_DESC);
801 				if ((ret == -1) || (!POWEROF2(ret))) {
802 					RTE_LOG(INFO, VHOST_CONFIG,
803 					"Invalid argument for tx-desc-num [0-N],"
804 					"power of 2 required.\n");
805 					us_vhost_usage(prgname);
806 					return -1;
807 				} else {
808 					num_tx_descriptor = ret;
809 				}
810 			}
811 
812 			break;
813 
814 			/* Invalid option - print options. */
815 		default:
816 			us_vhost_usage(prgname);
817 			return -1;
818 		}
819 	}
820 
821 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
822 		if (enabled_port_mask & (1 << i))
823 			ports[num_ports++] = (uint8_t)i;
824 	}
825 
826 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
827 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
828 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
829 		return -1;
830 	}
831 
832 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
833 		RTE_LOG(INFO, VHOST_PORT,
834 			"Vhost zero copy doesn't support software vm2vm,"
835 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
836 		return -1;
837 	}
838 
839 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
840 		RTE_LOG(INFO, VHOST_PORT,
841 			"Vhost zero copy doesn't support jumbo frame,"
842 			"please specify '--mergeable 0' to disable the "
843 			"mergeable feature.\n");
844 		return -1;
845 	}
846 
847 	return 0;
848 }
849 
850 /*
851  * Update the global variable num_ports and the array ports according to the
852  * number of system ports, and return the number of valid ports
853  */
854 static unsigned check_ports_num(unsigned nb_ports)
855 {
856 	unsigned valid_num_ports = num_ports;
857 	unsigned portid;
858 
859 	if (num_ports > nb_ports) {
860 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
861 			num_ports, nb_ports);
862 		num_ports = nb_ports;
863 	}
864 
865 	for (portid = 0; portid < num_ports; portid ++) {
866 		if (ports[portid] >= nb_ports) {
867 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
868 				ports[portid], (nb_ports - 1));
869 			ports[portid] = INVALID_PORT_ID;
870 			valid_num_ports--;
871 		}
872 	}
873 	return valid_num_ports;
874 }
875 
876 /*
877  * Macro to print out packet contents. Wrapped in debug define so that the
878  * data path is not affected when debug is disabled.
879  */
880 #if RTE_LOG_LEVEL >= RTE_LOG_DEBUG
881 #define PRINT_PACKET(device, addr, size, header) do {																\
882 	char *pkt_addr = (char*)(addr);																					\
883 	unsigned int index;																								\
884 	char packet[MAX_PRINT_BUFF];																					\
885 																													\
886 	if ((header))																									\
887 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
888 	else																											\
889 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
890 	for (index = 0; index < (size); index++) {																		\
891 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
892 			"%02hhx ", pkt_addr[index]);																			\
893 	}																												\
894 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
895 																													\
896 	RTE_LOG(DEBUG, VHOST_DATA, "%s", packet);																					\
897 } while(0)
898 #else
899 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
900 #endif
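/*
 * Illustrative (hypothetical) use, dumping a received buffer; the actual call
 * sites appear later in this file. Any pointer with a device_fh field, a
 * host-virtual buffer address and a length will do:
 *
 *	PRINT_PACKET(dev, (uintptr_t)buff_addr, (uint32_t)desc->len, 0);
 */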
901 
902 /*
903  * Function to convert guest physical addresses to vhost physical addresses.
904  * This is used to convert virtio buffer addresses.
905  */
906 static inline uint64_t __attribute__((always_inline))
907 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
908 	uint32_t buf_len, hpa_type *addr_type)
909 {
910 	struct virtio_memory_regions_hpa *region;
911 	uint32_t regionidx;
912 	uint64_t vhost_pa = 0;
913 
914 	*addr_type = PHYS_ADDR_INVALID;
915 
916 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
917 		region = &vdev->regions_hpa[regionidx];
918 		if ((guest_pa >= region->guest_phys_address) &&
919 			(guest_pa <= region->guest_phys_address_end)) {
920 			vhost_pa = region->host_phys_addr_offset + guest_pa;
921 			if (likely((guest_pa + buf_len - 1)
922 				<= region->guest_phys_address_end))
923 				*addr_type = PHYS_ADDR_CONTINUOUS;
924 			else
925 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
926 			break;
927 		}
928 	}
929 
930 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") GPA %p| HPA %p\n",
931 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
932 		(void *)(uintptr_t)vhost_pa);
933 
934 	return vhost_pa;
935 }
936 
937 /*
938  * Compares a packet destination MAC address to a device MAC address.
939  */
940 static inline int __attribute__((always_inline))
941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
942 {
943 	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
944 }
945 
946 /*
947  * This function learns the MAC address of the device and registers it, along with a
948  * VLAN tag, to a VMDQ pool.
949  */
950 static int
951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
952 {
953 	struct ether_hdr *pkt_hdr;
954 	struct virtio_net_data_ll *dev_ll;
955 	struct virtio_net *dev = vdev->dev;
956 	int i, ret;
957 
958 	/* Learn MAC address of guest device from packet */
959 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960 
961 	dev_ll = ll_root_used;
962 
963 	while (dev_ll != NULL) {
964 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
965 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
966 			return -1;
967 		}
968 		dev_ll = dev_ll->next;
969 	}
970 
971 	for (i = 0; i < ETHER_ADDR_LEN; i++)
972 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
973 
974 	/* vlan_tag is currently indexed by device_fh. */
975 	vdev->vlan_tag = vlan_tags[dev->device_fh];
976 
977 	/* Print out VMDQ registration info. */
978 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
979 		dev->device_fh,
980 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
981 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
982 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
983 		vdev->vlan_tag);
984 
985 	/* Register the MAC address. */
986 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
987 				(uint32_t)dev->device_fh + vmdq_pool_base);
988 	if (ret)
989 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
990 					dev->device_fh);
991 
992 	/* Enable stripping of the vlan tag as we handle routing. */
993 	if (vlan_strip)
994 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
995 			(uint16_t)vdev->vmdq_rx_q, 1);
996 
997 	/* Set device as ready for RX. */
998 	vdev->ready = DEVICE_RX;
999 
1000 	return 0;
1001 }
1002 
1003 /*
1004  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1005  * queue before disabling RX on the device.
1006  */
1007 static inline void
1008 unlink_vmdq(struct vhost_dev *vdev)
1009 {
1010 	unsigned i = 0;
1011 	unsigned rx_count;
1012 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1013 
1014 	if (vdev->ready == DEVICE_RX) {
1015 		/*clear MAC and VLAN settings*/
1016 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1017 		for (i = 0; i < 6; i++)
1018 			vdev->mac_address.addr_bytes[i] = 0;
1019 
1020 		vdev->vlan_tag = 0;
1021 
1022 		/*Clear out the receive buffers*/
1023 		rx_count = rte_eth_rx_burst(ports[0],
1024 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1025 
1026 		while (rx_count) {
1027 			for (i = 0; i < rx_count; i++)
1028 				rte_pktmbuf_free(pkts_burst[i]);
1029 
1030 			rx_count = rte_eth_rx_burst(ports[0],
1031 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1032 		}
1033 
1034 		vdev->ready = DEVICE_MAC_LEARNING;
1035 	}
1036 }
1037 
1038 /*
1039  * Check if the packet destination MAC address is for a local device. If so then put
1040  * the packet on that device's RX queue. If not then return.
1041  */
1042 static inline int __attribute__((always_inline))
1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1044 {
1045 	struct virtio_net_data_ll *dev_ll;
1046 	struct ether_hdr *pkt_hdr;
1047 	uint64_t ret = 0;
1048 	struct virtio_net *dev = vdev->dev;
1049 	struct virtio_net *tdev; /* destination virtio device */
1050 
1051 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052 
1053 	/*get the used devices list*/
1054 	dev_ll = ll_root_used;
1055 
1056 	while (dev_ll != NULL) {
1057 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1058 				          &dev_ll->vdev->mac_address)) {
1059 
1060 			/* Drop the packet if the TX packet is destined for the TX device. */
1061 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062 				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1063 					"Source and destination MAC addresses are the same. "
1064 					"Dropping packet.\n",
1065 					dev->device_fh);
1066 				return 0;
1067 			}
1068 			tdev = dev_ll->vdev->dev;
1069 
1070 
1071 			RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1072 				"MAC address is local\n", tdev->device_fh);
1073 
1074 			if (unlikely(dev_ll->vdev->remove)) {
1075 				/*drop the packet if the device is marked for removal*/
1076 				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
1077 					"Device is marked for removal\n", tdev->device_fh);
1078 			} else {
1079 				/*send the packet to the local virtio device*/
1080 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1081 				if (enable_stats) {
1082 					rte_atomic64_add(
1083 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1084 					1);
1085 					rte_atomic64_add(
1086 					&dev_statistics[tdev->device_fh].rx_atomic,
1087 					ret);
1088 					dev_statistics[dev->device_fh].tx_total++;
1089 					dev_statistics[dev->device_fh].tx += ret;
1090 				}
1091 			}
1092 
1093 			return 0;
1094 		}
1095 		dev_ll = dev_ll->next;
1096 	}
1097 
1098 	return -1;
1099 }
1100 
1101 /*
1102  * Check if the destination MAC of a packet belongs to a local VM,
1103  * and if so, get its VLAN tag and offset.
1104  */
1105 static inline int __attribute__((always_inline))
1106 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1107 	uint32_t *offset, uint16_t *vlan_tag)
1108 {
1109 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1110 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1111 
1112 	while (dev_ll != NULL) {
1113 		if ((dev_ll->vdev->ready == DEVICE_RX)
1114 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1115 		&dev_ll->vdev->mac_address)) {
1116 			/*
1117 			 * Drop the packet if the TX packet is
1118 			 * destined for the TX device.
1119 			 */
1120 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1121 				RTE_LOG(DEBUG, VHOST_DATA,
1122 				"(%"PRIu64") TX: Source and destination"
1123 				" MAC addresses are the same. Dropping "
1124 				"packet.\n",
1125 				dev_ll->vdev->dev->device_fh);
1126 				return -1;
1127 			}
1128 
1129 			/*
1130 			 * HW VLAN strip reduces the packet length by the
1131 			 * length of the VLAN tag, so the packet length
1132 			 * needs to be restored by adding it back.
1133 			 */
1134 			*offset = VLAN_HLEN;
1135 			*vlan_tag =
1136 			(uint16_t)
1137 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1138 
1139 			RTE_LOG(DEBUG, VHOST_DATA,
1140 			"(%"PRIu64") TX: pkt to local VM device id:"
1141 			"(%"PRIu64") vlan tag: %d.\n",
1142 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1143 			(int)*vlan_tag);
1144 
1145 			break;
1146 		}
1147 		dev_ll = dev_ll->next;
1148 	}
1149 	return 0;
1150 }
1151 
1152 static uint16_t
1153 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1154 {
1155 	if (ol_flags & PKT_TX_IPV4)
1156 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1157 	else /* assume ethertype == ETHER_TYPE_IPv6 */
1158 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1159 }
1160 
1161 static void virtio_tx_offload(struct rte_mbuf *m)
1162 {
1163 	void *l3_hdr;
1164 	struct ipv4_hdr *ipv4_hdr = NULL;
1165 	struct tcp_hdr *tcp_hdr = NULL;
1166 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1167 
1168 	l3_hdr = (char *)eth_hdr + m->l2_len;
1169 
1170 	if (m->ol_flags & PKT_TX_IPV4) {
1171 		ipv4_hdr = l3_hdr;
1172 		ipv4_hdr->hdr_checksum = 0;
1173 		m->ol_flags |= PKT_TX_IP_CKSUM;
1174 	}
1175 
1176 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1177 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1178 }
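/*
 * Note: for hardware TSO the TCP checksum field is expected to carry only the
 * pseudo-header checksum; the NIC computes the rest, which is why
 * get_psd_sum() is used here rather than a full checksum over the payload.
 */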
1179 
1180 /*
1181  * This function routes the TX packet to the correct interface. This may be a local device
1182  * or the physical port.
1183  */
1184 static inline void __attribute__((always_inline))
1185 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1186 {
1187 	struct mbuf_table *tx_q;
1188 	struct rte_mbuf **m_table;
1189 	unsigned len, ret, offset = 0;
1190 	const uint16_t lcore_id = rte_lcore_id();
1191 	struct virtio_net *dev = vdev->dev;
1192 	struct ether_hdr *nh;
1193 
1194 	/*check if destination is local VM*/
1195 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1196 		rte_pktmbuf_free(m);
1197 		return;
1198 	}
1199 
1200 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1201 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1202 			rte_pktmbuf_free(m);
1203 			return;
1204 		}
1205 	}
1206 
1207 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1208 		"MAC address is external\n", dev->device_fh);
1209 
1210 	/*Add packet to the port tx queue*/
1211 	tx_q = &lcore_tx_queue[lcore_id];
1212 	len = tx_q->len;
1213 
1214 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1215 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1216 		/* Guest has inserted the vlan tag. */
1217 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1218 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1219 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1220 			(vh->vlan_tci != vlan_tag_be))
1221 			vh->vlan_tci = vlan_tag_be;
1222 	} else {
1223 		m->ol_flags |= PKT_TX_VLAN_PKT;
1224 
1225 		/*
1226 		 * Find the right seg to adjust the data len when offset is
1227 		 * bigger than tail room size.
1228 		 */
1229 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1230 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1231 				m->data_len += offset;
1232 			else {
1233 				struct rte_mbuf *seg = m;
1234 
1235 				while ((seg->next != NULL) &&
1236 					(offset > rte_pktmbuf_tailroom(seg)))
1237 					seg = seg->next;
1238 
1239 				seg->data_len += offset;
1240 			}
1241 			m->pkt_len += offset;
1242 		}
1243 
1244 		m->vlan_tci = vlan_tag;
1245 	}
1246 
1247 	if (m->ol_flags & PKT_TX_TCP_SEG)
1248 		virtio_tx_offload(m);
1249 
1250 	tx_q->m_table[len] = m;
1251 	len++;
1252 	if (enable_stats) {
1253 		dev_statistics[dev->device_fh].tx_total++;
1254 		dev_statistics[dev->device_fh].tx++;
1255 	}
1256 
1257 	if (unlikely(len == MAX_PKT_BURST)) {
1258 		m_table = (struct rte_mbuf **)tx_q->m_table;
1259 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1260 		/* Free any buffers not handled by TX and update the port stats. */
1261 		if (unlikely(ret < len)) {
1262 			do {
1263 				rte_pktmbuf_free(m_table[ret]);
1264 			} while (++ret < len);
1265 		}
1266 
1267 		len = 0;
1268 	}
1269 
1270 	tx_q->len = len;
1271 	return;
1272 }
1273 /*
1274  * This function is called by each data core. It handles all RX/TX registered with the
1275  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1276  * with all devices in the main linked list.
1277  */
1278 static int
1279 switch_worker(void *arg)
1280 {
1281 	struct rte_mempool *mbuf_pool = arg;
1282 	struct virtio_net *dev = NULL;
1283 	struct vhost_dev *vdev = NULL;
1284 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1285 	struct virtio_net_data_ll *dev_ll;
1286 	struct mbuf_table *tx_q;
1287 	volatile struct lcore_ll_info *lcore_ll;
1288 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1289 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1290 	unsigned ret, i;
1291 	const uint16_t lcore_id = rte_lcore_id();
1292 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1293 	uint16_t rx_count = 0;
1294 	uint16_t tx_count;
1295 	uint32_t retry = 0;
1296 
1297 	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1298 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1299 	prev_tsc = 0;
1300 
1301 	tx_q = &lcore_tx_queue[lcore_id];
1302 	for (i = 0; i < num_cores; i ++) {
1303 		if (lcore_ids[i] == lcore_id) {
1304 			tx_q->txq_id = i;
1305 			break;
1306 		}
1307 	}
1308 
1309 	while(1) {
1310 		cur_tsc = rte_rdtsc();
1311 		/*
1312 		 * TX burst queue drain
1313 		 */
1314 		diff_tsc = cur_tsc - prev_tsc;
1315 		if (unlikely(diff_tsc > drain_tsc)) {
1316 
1317 			if (tx_q->len) {
1318 				RTE_LOG(DEBUG, VHOST_DATA,
1319 					"TX queue drained after timeout with burst size %u\n",
1320 					tx_q->len);
1321 
1322 				/*Tx any packets in the queue*/
1323 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1324 									   (struct rte_mbuf **)tx_q->m_table,
1325 									   (uint16_t)tx_q->len);
1326 				if (unlikely(ret < tx_q->len)) {
1327 					do {
1328 						rte_pktmbuf_free(tx_q->m_table[ret]);
1329 					} while (++ret < tx_q->len);
1330 				}
1331 
1332 				tx_q->len = 0;
1333 			}
1334 
1335 			prev_tsc = cur_tsc;
1336 
1337 		}
1338 
1339 		rte_prefetch0(lcore_ll->ll_root_used);
1340 		/*
1341 		 * Inform the configuration core that we have exited the linked list and that no devices are
1342 		 * in use if requested.
1343 		 */
1344 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1345 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1346 
1347 		/*
1348 		 * Process devices
1349 		 */
1350 		dev_ll = lcore_ll->ll_root_used;
1351 
1352 		while (dev_ll != NULL) {
1353 			/* Get the vhost/virtio device handles */
1354 			vdev = dev_ll->vdev;
1355 			dev = vdev->dev;
1356 
1357 			if (unlikely(vdev->remove)) {
1358 				dev_ll = dev_ll->next;
1359 				unlink_vmdq(vdev);
1360 				vdev->ready = DEVICE_SAFE_REMOVE;
1361 				continue;
1362 			}
1363 			if (likely(vdev->ready == DEVICE_RX)) {
1364 				/*Handle guest RX*/
1365 				rx_count = rte_eth_rx_burst(ports[0],
1366 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1367 
1368 				if (rx_count) {
1369 					/*
1370 					* If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1371 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1372 					*/
1373 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1374 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1375 							rte_delay_us(burst_rx_delay_time);
1376 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1377 								break;
1378 						}
1379 					}
1380 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1381 					if (enable_stats) {
1382 						rte_atomic64_add(
1383 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1384 						rx_count);
1385 						rte_atomic64_add(
1386 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1387 					}
1388 					while (likely(rx_count)) {
1389 						rx_count--;
1390 						rte_pktmbuf_free(pkts_burst[rx_count]);
1391 					}
1392 
1393 				}
1394 			}
1395 
1396 			if (likely(!vdev->remove)) {
1397 				/* Handle guest TX*/
1398 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1399 				/* If this is the first received packet we need to learn the MAC and set up VMDQ */
1400 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1401 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1402 						while (tx_count)
1403 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1404 					}
1405 				}
1406 				for (i = 0; i < tx_count; ++i) {
1407 					virtio_tx_route(vdev, pkts_burst[i],
1408 						vlan_tags[(uint16_t)dev->device_fh]);
1409 				}
1410 			}
1411 
1412 			/*move to the next device in the list*/
1413 			dev_ll = dev_ll->next;
1414 		}
1415 	}
1416 
1417 	return 0;
1418 }
1419 
1420 /*
1421  * This function gets the number of available ring entries for zero copy RX.
1422  * Only one thread will call this function for a particular virtio device,
1423  * so it is designed as a non-thread-safe function.
1424  */
1425 static inline uint32_t __attribute__((always_inline))
1426 get_available_ring_num_zcp(struct virtio_net *dev)
1427 {
1428 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1429 	uint16_t avail_idx;
1430 
1431 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1432 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1433 }
1434 
1435 /*
1436  * This function gets the available ring index for zero copy RX;
1437  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1438  * Only one thread will call this function for a particular virtio device,
1439  * so it is designed as a non-thread-safe function.
1440  */
1441 static inline uint32_t __attribute__((always_inline))
1442 get_available_ring_index_zcp(struct virtio_net *dev,
1443 	uint16_t *res_base_idx, uint32_t count)
1444 {
1445 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1446 	uint16_t avail_idx;
1447 	uint32_t retry = 0;
1448 	uint16_t free_entries;
1449 
1450 	*res_base_idx = vq->last_used_idx_res;
1451 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1452 	free_entries = (avail_idx - *res_base_idx);
1453 
1454 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") in get_available_ring_index_zcp: "
1455 			"avail idx: %d, "
1456 			"res base idx:%d, free entries:%d\n",
1457 			dev->device_fh, avail_idx, *res_base_idx,
1458 			free_entries);
1459 
1460 	/*
1461 	 * If retry is enabled and the queue is full then we wait
1462 	 * and retry to avoid packet loss.
1463 	 */
1464 	if (enable_retry && unlikely(count > free_entries)) {
1465 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1466 			rte_delay_us(burst_rx_delay_time);
1467 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1468 			free_entries = (avail_idx - *res_base_idx);
1469 			if (count <= free_entries)
1470 				break;
1471 		}
1472 	}
1473 
1474 	/*check that we have enough buffers*/
1475 	if (unlikely(count > free_entries))
1476 		count = free_entries;
1477 
1478 	if (unlikely(count == 0)) {
1479 		RTE_LOG(DEBUG, VHOST_DATA,
1480 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1481 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1482 			dev->device_fh, avail_idx,
1483 			*res_base_idx, free_entries);
1484 		return 0;
1485 	}
1486 
1487 	vq->last_used_idx_res = *res_base_idx + count;
1488 
1489 	return count;
1490 }
1491 
1492 /*
1493  * This function puts a descriptor back on the used list.
1494  */
1495 static inline void __attribute__((always_inline))
1496 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1497 {
1498 	uint16_t res_cur_idx = vq->last_used_idx;
1499 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1500 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1501 	rte_compiler_barrier();
1502 	*(volatile uint16_t *)&vq->used->idx += 1;
1503 	vq->last_used_idx += 1;
1504 
1505 	/* Kick the guest if necessary. */
1506 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1507 		eventfd_write(vq->callfd, (eventfd_t)1);
1508 }
1509 
1510 /*
1511  * This function gets an available descriptor from the virtio vring and an
1512  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1513  * to adjust the offset of buff_addr and phys_addr according to the PMD
1514  * implementation, otherwise the frame data may be put at the wrong location in the mbuf.
1515  */
1516 static inline void __attribute__((always_inline))
1517 attach_rxmbuf_zcp(struct virtio_net *dev)
1518 {
1519 	uint16_t res_base_idx, desc_idx;
1520 	uint64_t buff_addr, phys_addr;
1521 	struct vhost_virtqueue *vq;
1522 	struct vring_desc *desc;
1523 	void *obj = NULL;
1524 	struct rte_mbuf *mbuf;
1525 	struct vpool *vpool;
1526 	hpa_type addr_type;
1527 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1528 
1529 	vpool = &vpool_array[vdev->vmdq_rx_q];
1530 	vq = dev->virtqueue[VIRTIO_RXQ];
1531 
1532 	do {
1533 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1534 				1) != 1))
1535 			return;
1536 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1537 
1538 		desc = &vq->desc[desc_idx];
1539 		if (desc->flags & VRING_DESC_F_NEXT) {
1540 			desc = &vq->desc[desc->next];
1541 			buff_addr = gpa_to_vva(dev, desc->addr);
1542 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1543 					&addr_type);
1544 		} else {
1545 			buff_addr = gpa_to_vva(dev,
1546 					desc->addr + vq->vhost_hlen);
1547 			phys_addr = gpa_to_hpa(vdev,
1548 					desc->addr + vq->vhost_hlen,
1549 					desc->len, &addr_type);
1550 		}
1551 
1552 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1553 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1554 				" address found when attaching RX frame buffer"
1555 				" address!\n", dev->device_fh);
1556 			put_desc_to_used_list_zcp(vq, desc_idx);
1557 			continue;
1558 		}
1559 
1560 		/*
1561 		 * Check if the frame buffer address from guest crosses
1562 		 * sub-region or not.
1563 		 */
1564 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1565 			RTE_LOG(ERR, VHOST_DATA,
1566 				"(%"PRIu64") Frame buffer address cross "
1567 				"sub-regioin found when attaching RX frame "
1568 				"buffer address!\n",
1569 				dev->device_fh);
1570 			put_desc_to_used_list_zcp(vq, desc_idx);
1571 			continue;
1572 		}
1573 	} while (unlikely(phys_addr == 0));
1574 
1575 	rte_ring_sc_dequeue(vpool->ring, &obj);
1576 	mbuf = obj;
1577 	if (unlikely(mbuf == NULL)) {
1578 		RTE_LOG(DEBUG, VHOST_DATA,
1579 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1580 			"ring_sc_dequeue fail.\n",
1581 			dev->device_fh);
1582 		put_desc_to_used_list_zcp(vq, desc_idx);
1583 		return;
1584 	}
1585 
1586 	if (unlikely(vpool->buf_size > desc->len)) {
1587 		RTE_LOG(DEBUG, VHOST_DATA,
1588 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1589 			"length(%d) of descriptor idx: %d less than room "
1590 			"size required: %d\n",
1591 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1592 		put_desc_to_used_list_zcp(vq, desc_idx);
1593 		rte_ring_sp_enqueue(vpool->ring, obj);
1594 		return;
1595 	}
1596 
1597 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1598 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1599 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1600 	mbuf->data_len = desc->len;
1601 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1602 
1603 	RTE_LOG(DEBUG, VHOST_DATA,
1604 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1605 		"descriptor idx:%d\n",
1606 		dev->device_fh, res_base_idx, desc_idx);
1607 
1608 	__rte_mbuf_raw_free(mbuf);
1609 
1610 	return;
1611 }
1612 
1613 /*
1614  * Detach an attached packet mbuf -
1615  *  - restore original mbuf address and length values.
1616  *  - reset pktmbuf data and data_len to their default values.
1617  *  All other fields of the given packet mbuf will be left intact.
1618  *
1619  * @param m
1620  *   The attached packet mbuf.
1621  */
1622 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1623 {
1624 	const struct rte_mempool *mp = m->pool;
1625 	void *buf = rte_mbuf_to_baddr(m);
1626 	uint32_t buf_ofs;
1627 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1628 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1629 
1630 	m->buf_addr = buf;
1631 	m->buf_len = (uint16_t)buf_len;
1632 
1633 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1634 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1635 	m->data_off = buf_ofs;
1636 
1637 	m->data_len = 0;
1638 }
1639 
1640 /*
1641  * This function is called after packets have been transmitted. It fetches mbufs
1642  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1643  * the used index and kicks the guest if necessary.
1644  */
1645 static inline uint32_t __attribute__((always_inline))
1646 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1647 {
1648 	struct rte_mbuf *mbuf;
1649 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1650 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1651 	uint32_t index = 0;
1652 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1653 
1654 	RTE_LOG(DEBUG, VHOST_DATA,
1655 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1656 		"clean is: %d\n",
1657 		dev->device_fh, mbuf_count);
1658 	RTE_LOG(DEBUG, VHOST_DATA,
1659 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1660 		"clean  is : %d\n",
1661 		dev->device_fh, rte_ring_count(vpool->ring));
1662 
1663 	for (index = 0; index < mbuf_count; index++) {
1664 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1665 		if (likely(MBUF_EXT_MEM(mbuf)))
1666 			pktmbuf_detach_zcp(mbuf);
1667 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1668 
1669 		/* Update used index buffer information. */
1670 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1671 		vq->used->ring[used_idx].len = 0;
1672 
1673 		used_idx = (used_idx + 1) & (vq->size - 1);
1674 	}
1675 
1676 	RTE_LOG(DEBUG, VHOST_DATA,
1677 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1678 		"clean is: %d\n",
1679 		dev->device_fh, rte_mempool_count(vpool->pool));
1680 	RTE_LOG(DEBUG, VHOST_DATA,
1681 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1682 		"clean  is : %d\n",
1683 		dev->device_fh, rte_ring_count(vpool->ring));
1684 	RTE_LOG(DEBUG, VHOST_DATA,
1685 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1686 		"vq->last_used_idx:%d\n",
1687 		dev->device_fh, vq->last_used_idx);
1688 
1689 	vq->last_used_idx += mbuf_count;
1690 
1691 	RTE_LOG(DEBUG, VHOST_DATA,
1692 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1693 		"vq->last_used_idx:%d\n",
1694 		dev->device_fh, vq->last_used_idx);
1695 
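	/*
	 * Make sure the used ring entries above are written out before the
	 * used index update below becomes visible to the guest.
	 */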
1696 	rte_compiler_barrier();
1697 
1698 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1699 
1700 	/* Kick guest if required. */
1701 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1702 		eventfd_write(vq->callfd, (eventfd_t)1);
1703 
1704 	return 0;
1705 }
1706 
1707 /*
1708  * This function is called when a virtio device is destroyed. It fetches each
1709  * mbuf from vpool->pool, detaches it and puts it into vpool->ring.
1710  */
1711 static void mbuf_destroy_zcp(struct vpool *vpool)
1712 {
1713 	struct rte_mbuf *mbuf = NULL;
1714 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1715 
1716 	RTE_LOG(DEBUG, VHOST_CONFIG,
1717 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1718 		"mbuf_destroy_zcp is: %d\n",
1719 		mbuf_count);
1720 	RTE_LOG(DEBUG, VHOST_CONFIG,
1721 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1722 		"mbuf_destroy_zcp  is : %d\n",
1723 		rte_ring_count(vpool->ring));
1724 
1725 	for (index = 0; index < mbuf_count; index++) {
1726 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1727 		if (likely(mbuf != NULL)) {
1728 			if (likely(MBUF_EXT_MEM(mbuf)))
1729 				pktmbuf_detach_zcp(mbuf);
1730 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1731 		}
1732 	}
1733 
1734 	RTE_LOG(DEBUG, VHOST_CONFIG,
1735 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1736 		"mbuf_destroy_zcp is: %d\n",
1737 		rte_mempool_count(vpool->pool));
1738 	RTE_LOG(DEBUG, VHOST_CONFIG,
1739 		"in mbuf_destroy_zcp: mbuf count in ring after "
1740 		"mbuf_destroy_zcp is : %d\n",
1741 		rte_ring_count(vpool->ring));
1742 }
1743 
1744 /*
1745  * This function updates the used ring for the received packets (zero-copy RX path).
1746  */
1747 static inline uint32_t __attribute__((always_inline))
1748 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1749 	uint32_t count)
1750 {
1751 	struct vhost_virtqueue *vq;
1752 	struct vring_desc *desc;
1753 	struct rte_mbuf *buff;
1754 	/* The virtio_hdr is initialised to 0. */
1755 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1756 		= {{0, 0, 0, 0, 0, 0}, 0};
1757 	uint64_t buff_hdr_addr = 0;
1758 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1759 	uint32_t head_idx, packet_success = 0;
1760 	uint16_t res_cur_idx;
1761 
1762 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_rx_zcp()\n",
1763 		dev->device_fh);
1764 
1765 	if (count == 0)
1766 		return 0;
1767 
1768 	vq = dev->virtqueue[VIRTIO_RXQ];
1769 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1770 
1771 	res_cur_idx = vq->last_used_idx;
1772 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Current Index %d| End Index %d\n",
1773 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1774 
1775 	/* Retrieve all of the head indexes first to avoid caching issues. */
1776 	for (head_idx = 0; head_idx < count; head_idx++)
1777 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1778 
1779 	/* Prefetch descriptor index. */
1780 	rte_prefetch0(&vq->desc[head[packet_success]]);
1781 
1782 	while (packet_success != count) {
1783 		/* Get descriptor from available ring */
1784 		desc = &vq->desc[head[packet_success]];
1785 
1786 		buff = pkts[packet_success];
1787 		RTE_LOG(DEBUG, VHOST_DATA,
1788 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1789 			"pkt[%d] descriptor idx: %d\n",
1790 			dev->device_fh, packet_success,
1791 			MBUF_HEADROOM_UINT32(buff));
1792 
1793 		PRINT_PACKET(dev,
1794 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1795 			+ RTE_PKTMBUF_HEADROOM),
1796 			rte_pktmbuf_data_len(buff), 0);
1797 
1798 		/* Buffer address translation for virtio header. */
1799 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1800 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1801 
1802 		/*
1803 		 * If the descriptors are chained the header and data are
1804 		 * placed in separate buffers.
1805 		 */
1806 		if (desc->flags & VRING_DESC_F_NEXT) {
1807 			desc->len = vq->vhost_hlen;
1808 			desc = &vq->desc[desc->next];
1809 			desc->len = rte_pktmbuf_data_len(buff);
1810 		} else {
1811 			desc->len = packet_len;
1812 		}
1813 
1814 		/* Update used ring with desc information */
1815 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1816 			= head[packet_success];
1817 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1818 			= packet_len;
1819 		res_cur_idx++;
1820 		packet_success++;
1821 
1822 		/* A header is required per buffer. */
1823 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1824 			(const void *)&virtio_hdr, vq->vhost_hlen);
1825 
1826 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1827 
1828 		if (likely(packet_success < count)) {
1829 			/* Prefetch descriptor index. */
1830 			rte_prefetch0(&vq->desc[head[packet_success]]);
1831 		}
1832 	}
1833 
1834 	rte_compiler_barrier();
1835 
1836 	RTE_LOG(DEBUG, VHOST_DATA,
1837 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1838 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1839 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1840 
1841 	*(volatile uint16_t *)&vq->used->idx += count;
1842 	vq->last_used_idx += count;
1843 
1844 	RTE_LOG(DEBUG, VHOST_DATA,
1845 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1846 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1847 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1848 
1849 	/* Kick the guest if necessary. */
1850 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1851 		eventfd_write(vq->callfd, (eventfd_t)1);
1852 
1853 	return count;
1854 }
1855 
1856 /*
1857  * This function routes the TX packet to the correct interface.
1858  * This may be a local device or the physical port.
1859  */
1860 static inline void __attribute__((always_inline))
1861 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1862 	uint32_t desc_idx, uint8_t need_copy)
1863 {
1864 	struct mbuf_table *tx_q;
1865 	struct rte_mbuf **m_table;
1866 	void *obj = NULL;
1867 	struct rte_mbuf *mbuf;
1868 	unsigned len, ret, offset = 0;
1869 	struct vpool *vpool;
1870 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1871 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1872 
1873 	/* Add packet to the port TX queue. */
1874 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1875 	len = tx_q->len;
1876 
1877 	/* Allocate an mbuf and populate the structure. */
1878 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1879 	rte_ring_sc_dequeue(vpool->ring, &obj);
1880 	mbuf = obj;
1881 	if (unlikely(mbuf == NULL)) {
1882 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1883 		RTE_LOG(ERR, VHOST_DATA,
1884 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1885 			dev->device_fh);
1886 		put_desc_to_used_list_zcp(vq, desc_idx);
1887 		return;
1888 	}
1889 
1890 	if (vm2vm_mode == VM2VM_HARDWARE) {
1891 		/* Avoid using a VLAN tag from any VM for an external packet,
1892 		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1893 		 * with pool selection: the MAC address marks the packet as
1894 		 * external (to go to the network) while the VLAN tag marks it
1895 		 * as a VM-to-VM packet to forward to another VM. The hardware
1896 		 * cannot resolve this ambiguity, so the packet would be lost.
1897 		 */
1898 		vlan_tag = external_pkt_default_vlan_tag;
1899 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1900 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1901 			__rte_mbuf_raw_free(mbuf);
1902 			return;
1903 		}
1904 	}
1905 
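	/*
	 * Populate the new mbuf from the dummy mbuf built in
	 * virtio_dev_tx_zcp(). In the normal case it simply points at the
	 * guest buffer (zero copy); if need_copy is set the payload is
	 * copied into the mbuf instead.
	 */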
1906 	mbuf->nb_segs = m->nb_segs;
1907 	mbuf->next = m->next;
1908 	mbuf->data_len = m->data_len + offset;
1909 	mbuf->pkt_len = mbuf->data_len;
1910 	if (unlikely(need_copy)) {
1911 		/* Copy the packet contents to the mbuf. */
1912 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1913 			rte_pktmbuf_mtod(m, void *),
1914 			m->data_len);
1915 	} else {
1916 		mbuf->data_off = m->data_off;
1917 		mbuf->buf_physaddr = m->buf_physaddr;
1918 		mbuf->buf_addr = m->buf_addr;
1919 	}
1920 	mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1921 	mbuf->vlan_tci = vlan_tag;
1922 	mbuf->l2_len = sizeof(struct ether_hdr);
1923 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1924 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1925 
1926 	tx_q->m_table[len] = mbuf;
1927 	len++;
1928 
1929 	RTE_LOG(DEBUG, VHOST_DATA,
1930 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1931 		dev->device_fh,
1932 		mbuf->nb_segs,
1933 		(mbuf->next == NULL) ? "null" : "non-null");
1934 
1935 	if (enable_stats) {
1936 		dev_statistics[dev->device_fh].tx_total++;
1937 		dev_statistics[dev->device_fh].tx++;
1938 	}
1939 
1940 	if (unlikely(len == MAX_PKT_BURST)) {
1941 		m_table = (struct rte_mbuf **)tx_q->m_table;
1942 		ret = rte_eth_tx_burst(ports[0],
1943 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1944 
1945 		/*
1946 		 * Free any buffers not handled by TX and update
1947 		 * the port stats.
1948 		 */
1949 		if (unlikely(ret < len)) {
1950 			do {
1951 				rte_pktmbuf_free(m_table[ret]);
1952 			} while (++ret < len);
1953 		}
1954 
1955 		len = 0;
1956 		txmbuf_clean_zcp(dev, vpool);
1957 	}
1958 
1959 	tx_q->len = len;
1960 
1961 	return;
1962 }
1963 
1964 /*
1965  * This function transmits all available packets in the virtio TX queue for
1966  * one virtio-net device. On the first packet it learns the MAC address and
1967  * sets up the VMDQ queue.
1968  */
1969 static inline void __attribute__((always_inline))
1970 virtio_dev_tx_zcp(struct virtio_net *dev)
1971 {
1972 	struct rte_mbuf m;
1973 	struct vhost_virtqueue *vq;
1974 	struct vring_desc *desc;
1975 	uint64_t buff_addr = 0, phys_addr;
1976 	uint32_t head[MAX_PKT_BURST];
1977 	uint32_t i;
1978 	uint16_t free_entries, packet_success = 0;
1979 	uint16_t avail_idx;
1980 	uint8_t need_copy = 0;
1981 	hpa_type addr_type;
1982 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1983 
1984 	vq = dev->virtqueue[VIRTIO_TXQ];
1985 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1986 
1987 	/* If there are no available buffers then return. */
1988 	if (vq->last_used_idx_res == avail_idx)
1989 		return;
1990 
1991 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_tx_zcp()\n",
1992 		dev->device_fh);
1993 
1994 	/* Prefetch available ring to retrieve head indexes. */
1995 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1996 
1997 	/* Get the number of free entries in the ring */
1998 	free_entries = (avail_idx - vq->last_used_idx_res);
1999 
2000 	/* Limit to MAX_PKT_BURST. */
2001 	free_entries
2002 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2003 
2004 	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Buffers available %d\n",
2005 		dev->device_fh, free_entries);
2006 
2007 	/* Retrieve all of the head indexes first to avoid caching issues. */
2008 	for (i = 0; i < free_entries; i++)
2009 		head[i]
2010 			= vq->avail->ring[(vq->last_used_idx_res + i)
2011 			& (vq->size - 1)];
2012 
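	/*
	 * Reserve these avail entries now; the used ring itself is filled in
	 * later, either by txmbuf_clean_zcp() once the packets have been
	 * transmitted or by put_desc_to_used_list_zcp() on error.
	 */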
2013 	vq->last_used_idx_res += free_entries;
2014 
2015 	/* Prefetch descriptor index. */
2016 	rte_prefetch0(&vq->desc[head[packet_success]]);
2017 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2018 
2019 	while (packet_success < free_entries) {
2020 		desc = &vq->desc[head[packet_success]];
2021 
2022 		/* Discard first buffer as it is the virtio header */
2023 		desc = &vq->desc[desc->next];
2024 
2025 		/* Buffer address translation. */
2026 		buff_addr = gpa_to_vva(dev, desc->addr);
2027 		/* Need to check extra VLAN_HLEN size for inserting the VLAN tag. */
2028 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2029 			&addr_type);
2030 
2031 		if (likely(packet_success < (free_entries - 1)))
2032 			/* Prefetch descriptor index. */
2033 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2034 
2035 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2036 			RTE_LOG(ERR, VHOST_DATA,
2037 				"(%"PRIu64") Invalid frame buffer address found "
2038 				"when transmitting packets!\n",
2039 				dev->device_fh);
2040 			packet_success++;
2041 			continue;
2042 		}
2043 
2044 		/* Prefetch buffer address. */
2045 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2046 
2047 		/*
2048 		 * Setup dummy mbuf. This is copied to a real mbuf if
2049 		 * transmitted out the physical port.
2050 		 */
2051 		m.data_len = desc->len;
2052 		m.nb_segs = 1;
2053 		m.next = NULL;
2054 		m.data_off = 0;
2055 		m.buf_addr = (void *)(uintptr_t)buff_addr;
2056 		m.buf_physaddr = phys_addr;
2057 
2058 		/*
2059 		 * Check whether the guest frame buffer address crosses a
2060 		 * physical sub-region boundary.
2061 		 */
2062 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2063 			RTE_LOG(ERR, VHOST_DATA,
2064 				"(%"PRIu64") Frame buffer address crosses a "
2065 				"sub-region boundary when attaching the TX "
2066 				"frame buffer!\n",
2067 				dev->device_fh);
2068 			need_copy = 1;
2069 		} else
2070 			need_copy = 0;
2071 
2072 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2073 
2074 		/*
2075 		 * If this is the first received packet we need to learn
2076 		 * the MAC and setup VMDQ
2077 		 */
2078 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2079 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2080 				/*
2081 				 * Discard frame if device is scheduled for
2082 				 * removal or a duplicate MAC address is found.
2083 				 */
2084 				packet_success += free_entries;
2085 				vq->last_used_idx += packet_success;
2086 				break;
2087 			}
2088 		}
2089 
2090 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2091 		packet_success++;
2092 	}
2093 }
2094 
2095 /*
2096  * This function is called by each data core. It handles all RX/TX registered
2097  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2098  * addresses are compared with all devices in the main linked list.
2099  */
2100 static int
2101 switch_worker_zcp(__attribute__((unused)) void *arg)
2102 {
2103 	struct virtio_net *dev = NULL;
2104 	struct vhost_dev  *vdev = NULL;
2105 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2106 	struct virtio_net_data_ll *dev_ll;
2107 	struct mbuf_table *tx_q;
2108 	volatile struct lcore_ll_info *lcore_ll;
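	/* TSC ticks in roughly BURST_TX_DRAIN_US microseconds, rounded up. */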
2109 	const uint64_t drain_tsc
2110 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2111 		* BURST_TX_DRAIN_US;
2112 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2113 	unsigned ret;
2114 	const uint16_t lcore_id = rte_lcore_id();
2115 	uint16_t count_in_ring, rx_count = 0;
2116 
2117 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2118 
2119 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2120 	prev_tsc = 0;
2121 
2122 	while (1) {
2123 		cur_tsc = rte_rdtsc();
2124 
2125 		/* TX burst queue drain */
2126 		diff_tsc = cur_tsc - prev_tsc;
2127 		if (unlikely(diff_tsc > drain_tsc)) {
2128 			/*
2129 			 * Get mbufs from vpool.pool, detach them and put
2130 			 * them back into vpool.ring.
2131 			 */
2132 			dev_ll = lcore_ll->ll_root_used;
2133 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2134 				/* Get virtio device ID */
2135 				vdev = dev_ll->vdev;
2136 				dev = vdev->dev;
2137 
2138 				if (likely(!vdev->remove)) {
2139 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2140 					if (tx_q->len) {
2141 						RTE_LOG(DEBUG, VHOST_DATA,
2142 						"TX queue drained after timeout"
2143 						" with burst size %u\n",
2144 						tx_q->len);
2145 
2146 						/*
2147 						 * Tx any packets in the queue
2148 						 */
2149 						ret = rte_eth_tx_burst(
2150 							ports[0],
2151 							(uint16_t)tx_q->txq_id,
2152 							(struct rte_mbuf **)
2153 							tx_q->m_table,
2154 							(uint16_t)tx_q->len);
2155 						if (unlikely(ret < tx_q->len)) {
2156 							do {
2157 								rte_pktmbuf_free(
2158 									tx_q->m_table[ret]);
2159 							} while (++ret < tx_q->len);
2160 						}
2161 						tx_q->len = 0;
2162 
2163 						txmbuf_clean_zcp(dev,
2164 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2165 					}
2166 				}
2167 				dev_ll = dev_ll->next;
2168 			}
2169 			prev_tsc = cur_tsc;
2170 		}
2171 
2172 		rte_prefetch0(lcore_ll->ll_root_used);
2173 
2174 		/*
2175 		 * Inform the configuration core that we have exited the linked
2176 		 * list and that no devices are in use if requested.
2177 		 */
2178 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2179 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2180 
2181 		/* Process devices */
2182 		dev_ll = lcore_ll->ll_root_used;
2183 
2184 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2185 			vdev = dev_ll->vdev;
2186 			dev  = vdev->dev;
2187 			if (unlikely(vdev->remove)) {
2188 				dev_ll = dev_ll->next;
2189 				unlink_vmdq(vdev);
2190 				vdev->ready = DEVICE_SAFE_REMOVE;
2191 				continue;
2192 			}
2193 
2194 			if (likely(vdev->ready == DEVICE_RX)) {
2195 				uint32_t index = vdev->vmdq_rx_q;
2196 				uint16_t i;
2197 				count_in_ring
2198 				= rte_ring_count(vpool_array[index].ring);
2199 				uint16_t free_entries
2200 				= (uint16_t)get_available_ring_num_zcp(dev);
2201 
2202 				/*
2203 				 * Attach all mbufs in vpool.ring and put back
2204 				 * into vpool.pool.
2205 				 */
2206 				for (i = 0;
2207 				i < RTE_MIN(free_entries,
2208 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2209 				i++)
2210 					attach_rxmbuf_zcp(dev);
2211 
2212 				/* Handle guest RX */
2213 				rx_count = rte_eth_rx_burst(ports[0],
2214 					vdev->vmdq_rx_q, pkts_burst,
2215 					MAX_PKT_BURST);
2216 
2217 				if (rx_count) {
2218 					ret_count = virtio_dev_rx_zcp(dev,
2219 							pkts_burst, rx_count);
2220 					if (enable_stats) {
2221 						dev_statistics[dev->device_fh].rx_total
2222 							+= rx_count;
2223 						dev_statistics[dev->device_fh].rx
2224 							+= ret_count;
2225 					}
2226 					while (likely(rx_count)) {
2227 						rx_count--;
2228 						pktmbuf_detach_zcp(
2229 							pkts_burst[rx_count]);
2230 						rte_ring_sp_enqueue(
2231 							vpool_array[index].ring,
2232 							(void *)pkts_burst[rx_count]);
2233 					}
2234 				}
2235 			}
2236 
2237 			if (likely(!vdev->remove))
2238 				/* Handle guest TX */
2239 				virtio_dev_tx_zcp(dev);
2240 
2241 			/* Move to the next device in the list */
2242 			dev_ll = dev_ll->next;
2243 		}
2244 	}
2245 
2246 	return 0;
2247 }
2248 
2249 
2250 /*
2251  * Add an entry to a used linked list. A free entry must first be found
2252  * in the free linked list using get_data_ll_free_entry();
2253  */
2254 static void
2255 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2256 	struct virtio_net_data_ll *ll_dev)
2257 {
2258 	struct virtio_net_data_ll *ll = *ll_root_addr;
2259 
2260 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2261 	ll_dev->next = NULL;
2262 	rte_compiler_barrier();
2263 
2264 	/* If ll == NULL then this is the first device. */
2265 	if (ll) {
2266 		/* Increment to the tail of the linked list. */
2267 		while ((ll->next != NULL) )
2268 			ll = ll->next;
2269 
2270 		ll->next = ll_dev;
2271 	} else {
2272 		*ll_root_addr = ll_dev;
2273 	}
2274 }
2275 
2276 /*
2277  * Remove an entry from a used linked list. The entry must then be added to
2278  * the free linked list using put_data_ll_free_entry().
2279  */
2280 static void
2281 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2282 	struct virtio_net_data_ll *ll_dev,
2283 	struct virtio_net_data_ll *ll_dev_last)
2284 {
2285 	struct virtio_net_data_ll *ll = *ll_root_addr;
2286 
2287 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2288 		return;
2289 
2290 	if (ll_dev == ll)
2291 		*ll_root_addr = ll_dev->next;
2292 	else
2293 		if (likely(ll_dev_last != NULL))
2294 			ll_dev_last->next = ll_dev->next;
2295 		else
2296 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2297 }
2298 
2299 /*
2300  * Find and return an entry from the free linked list.
2301  */
2302 static struct virtio_net_data_ll *
2303 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2304 {
2305 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2306 	struct virtio_net_data_ll *ll_dev;
2307 
2308 	if (ll_free == NULL)
2309 		return NULL;
2310 
2311 	ll_dev = ll_free;
2312 	*ll_root_addr = ll_free->next;
2313 
2314 	return ll_dev;
2315 }
2316 
2317 /*
2318  * Place an entry back on to the free linked list.
2319  */
2320 static void
2321 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2322 	struct virtio_net_data_ll *ll_dev)
2323 {
2324 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2325 
2326 	if (ll_dev == NULL)
2327 		return;
2328 
2329 	ll_dev->next = ll_free;
2330 	*ll_root_addr = ll_dev;
2331 }
2332 
2333 /*
2334  * Creates a linked list of a given size.
2335  */
2336 static struct virtio_net_data_ll *
2337 alloc_data_ll(uint32_t size)
2338 {
2339 	struct virtio_net_data_ll *ll_new;
2340 	uint32_t i;
2341 
2342 	/* Malloc and then chain the linked list. */
2343 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2344 	if (ll_new == NULL) {
2345 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2346 		return NULL;
2347 	}
2348 
2349 	for (i = 0; i < size - 1; i++) {
2350 		ll_new[i].vdev = NULL;
2351 		ll_new[i].next = &ll_new[i+1];
2352 	}
2353 	ll_new[i].next = NULL;
2354 
2355 	return ll_new;
2356 }
2357 
2358 /*
2359  * Create the main linked list along with each individual core's linked list. A used and a free list
2360  * are created to manage entries.
2361  */
2362 static int
2363 init_data_ll (void)
2364 {
2365 	int lcore;
2366 
2367 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2368 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2369 		if (lcore_info[lcore].lcore_ll == NULL) {
2370 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2371 			return -1;
2372 		}
2373 
2374 		lcore_info[lcore].lcore_ll->device_num = 0;
2375 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2376 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2377 		if (num_devices % num_switching_cores)
2378 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2379 		else
2380 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2381 	}
2382 
2383 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2384 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2385 
2386 	return 0;
2387 }
2388 
2389 /*
2390  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2391  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2392  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2393  */
2394 static void
2395 destroy_device (volatile struct virtio_net *dev)
2396 {
2397 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2398 	struct virtio_net_data_ll *ll_main_dev_cur;
2399 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2400 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2401 	struct vhost_dev *vdev;
2402 	int lcore;
2403 
2404 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2405 
2406 	vdev = (struct vhost_dev *)dev->priv;
2407 	/* Set the remove flag. */
2408 	vdev->remove = 1;
2409 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2410 		rte_pause();
2411 	}
2412 
2413 	/* Search for entry to be removed from lcore ll */
2414 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2415 	while (ll_lcore_dev_cur != NULL) {
2416 		if (ll_lcore_dev_cur->vdev == vdev) {
2417 			break;
2418 		} else {
2419 			ll_lcore_dev_last = ll_lcore_dev_cur;
2420 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2421 		}
2422 	}
2423 
2424 	if (ll_lcore_dev_cur == NULL) {
2425 		RTE_LOG(ERR, VHOST_CONFIG,
2426 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2427 			dev->device_fh);
2428 		return;
2429 	}
2430 
2431 	/* Search for entry to be removed from main ll */
2432 	ll_main_dev_cur = ll_root_used;
2433 	ll_main_dev_last = NULL;
2434 	while (ll_main_dev_cur != NULL) {
2435 		if (ll_main_dev_cur->vdev == vdev) {
2436 			break;
2437 		} else {
2438 			ll_main_dev_last = ll_main_dev_cur;
2439 			ll_main_dev_cur = ll_main_dev_cur->next;
2440 		}
2441 	}
2442 
2443 	/* Remove entries from the lcore and main ll. */
2444 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2445 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2446 
2447 	/* Set the dev_removal_flag on each lcore. */
2448 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2449 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2450 	}
2451 
2452 	/*
2453 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2454 	 * they can no longer access the device removed from the linked lists and that the devices
2455 	 * are no longer in use.
2456 	 */
2457 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2458 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2459 			rte_pause();
2460 		}
2461 	}
2462 
2463 	/* Add the entries back to the lcore and main free ll.*/
2464 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2465 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2466 
2467 	/* Decrement number of device on the lcore. */
2468 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2469 
2470 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2471 
2472 	if (zero_copy) {
2473 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2474 
2475 		/* Stop the RX queue. */
2476 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2477 			RTE_LOG(DEBUG, VHOST_CONFIG,
2478 				"(%"PRIu64") In destroy_device: Failed to stop "
2479 				"rx queue:%d\n",
2480 				dev->device_fh,
2481 				vdev->vmdq_rx_q);
2482 		}
2483 
2484 		RTE_LOG(DEBUG, VHOST_CONFIG,
2485 			"(%"PRIu64") in destroy_device: Start putting mbufs in "
2486 			"mempool back into ring for RX queue: %d\n",
2487 			dev->device_fh, vdev->vmdq_rx_q);
2488 
2489 		mbuf_destroy_zcp(vpool);
2490 
2491 		/* Stop the TX queue. */
2492 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2493 			RTE_LOG(DEBUG, VHOST_CONFIG,
2494 				"(%"PRIu64") In destroy_device: Failed to "
2495 				"stop tx queue:%d\n",
2496 				dev->device_fh, vdev->vmdq_rx_q);
2497 		}
2498 
2499 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2500 
2501 		RTE_LOG(DEBUG, VHOST_CONFIG,
2502 			"(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2503 			"back into ring for TX queue: %d, dev:(%"PRIu64")\n",
2504 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2505 			dev->device_fh);
2506 
2507 		mbuf_destroy_zcp(vpool);
2508 		rte_free(vdev->regions_hpa);
2509 	}
2510 	rte_free(vdev);
2511 
2512 }
2513 
2514 /*
2515  * Calculate how many additional physically contiguous sub-regions are needed
2516  * for one region whose vhost virtual address range is contiguous. The region
2517  * starts at vva_start and spans 'size' bytes.
2518  */
2519 static uint32_t
2520 check_hpa_regions(uint64_t vva_start, uint64_t size)
2521 {
2522 	uint32_t i, nregions = 0, page_size = getpagesize();
2523 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2524 	if (vva_start % page_size) {
2525 		RTE_LOG(DEBUG, VHOST_CONFIG,
2526 			"in check_continuous: vva start(%p) mod page_size(%d) "
2527 			"has remainder\n",
2528 			(void *)(uintptr_t)vva_start, page_size);
2529 		return 0;
2530 	}
2531 	if (size % page_size) {
2532 		RTE_LOG(DEBUG, VHOST_CONFIG,
2533 			"in check_continuous: "
2534 			"size(%"PRIu64") mod page_size(%d) has remainder\n",
2535 			size, page_size);
2536 		return 0;
2537 	}
2538 	for (i = 0; i < size - page_size; i = i + page_size) {
2539 		cur_phys_addr
2540 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2541 		next_phys_addr = rte_mem_virt2phy(
2542 			(void *)(uintptr_t)(vva_start + i + page_size));
2543 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2544 			++nregions;
2545 			RTE_LOG(DEBUG, VHOST_CONFIG,
2546 				"in check_continuous: hva addr:(%p) is not "
2547 				"continuous with hva addr:(%p), diff:%d\n",
2548 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2549 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2550 				+ page_size), page_size);
2551 			RTE_LOG(DEBUG, VHOST_CONFIG,
2552 				"in check_continuous: hpa addr:(%p) is not "
2553 				"continuous with hpa addr:(%p), "
2554 				"diff:(%"PRIu64")\n",
2555 				(void *)(uintptr_t)cur_phys_addr,
2556 				(void *)(uintptr_t)next_phys_addr,
2557 				(next_phys_addr-cur_phys_addr));
2558 		}
2559 	}
2560 	return nregions;
2561 }
2562 
2563 /*
2564  * Divide each region whose vhost virtual address range is contiguous into
2565  * sub-regions such that the physical addresses within each sub-region are
2566  * contiguous, and fill the offset (to GPA), size and other information of
2567  * each sub-region into regions_hpa.
2568  */
2569 static uint32_t
2570 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2571 {
2572 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2573 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2574 
2575 	if (mem_region_hpa == NULL)
2576 		return 0;
2577 
2578 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2579 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2580 			virtio_memory->regions[regionidx].address_offset;
2581 		mem_region_hpa[regionidx_hpa].guest_phys_address
2582 			= virtio_memory->regions[regionidx].guest_phys_address;
2583 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2584 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2585 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2586 		RTE_LOG(DEBUG, VHOST_CONFIG,
2587 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2588 			regionidx_hpa,
2589 			(void *)(uintptr_t)
2590 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2591 		RTE_LOG(DEBUG, VHOST_CONFIG,
2592 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2593 			regionidx_hpa,
2594 			(void *)(uintptr_t)
2595 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2596 		for (i = 0, k = 0;
2597 			i < virtio_memory->regions[regionidx].memory_size -
2598 				page_size;
2599 			i += page_size) {
2600 			cur_phys_addr = rte_mem_virt2phy(
2601 					(void *)(uintptr_t)(vva_start + i));
2602 			next_phys_addr = rte_mem_virt2phy(
2603 					(void *)(uintptr_t)(vva_start +
2604 					i + page_size));
2605 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2606 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2607 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2608 					k + page_size;
2609 				mem_region_hpa[regionidx_hpa].memory_size
2610 					= k + page_size;
2611 				RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest "
2612 					"phys addr end  [%d]:(%p)\n",
2613 					regionidx_hpa,
2614 					(void *)(uintptr_t)
2615 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2616 				RTE_LOG(DEBUG, VHOST_CONFIG,
2617 					"in fill_hpa_regions: guest phys addr "
2618 					"size [%d]:(%p)\n",
2619 					regionidx_hpa,
2620 					(void *)(uintptr_t)
2621 					(mem_region_hpa[regionidx_hpa].memory_size));
2622 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2623 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2624 				++regionidx_hpa;
2625 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2626 					next_phys_addr -
2627 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2628 				RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest"
2629 					" phys addr start[%d]:(%p)\n",
2630 					regionidx_hpa,
2631 					(void *)(uintptr_t)
2632 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2633 				RTE_LOG(DEBUG, VHOST_CONFIG,
2634 					"in fill_hpa_regions: host  phys addr "
2635 					"start[%d]:(%p)\n",
2636 					regionidx_hpa,
2637 					(void *)(uintptr_t)
2638 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2639 				k = 0;
2640 			} else {
2641 				k += page_size;
2642 			}
2643 		}
2644 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2645 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2646 			+ k + page_size;
2647 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2648 		RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2649 			"[%d]:(%p)\n", regionidx_hpa,
2650 			(void *)(uintptr_t)
2651 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2652 		RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2653 			"[%d]:(%p)\n", regionidx_hpa,
2654 			(void *)(uintptr_t)
2655 			(mem_region_hpa[regionidx_hpa].memory_size));
2656 		++regionidx_hpa;
2657 	}
2658 	return regionidx_hpa;
2659 }
2660 
2661 /*
2662  * A new device is added to a data core. First the device is added to the main linked list
2663  * and then allocated to a specific data core.
2664  */
2665 static int
2666 new_device (struct virtio_net *dev)
2667 {
2668 	struct virtio_net_data_ll *ll_dev;
2669 	int lcore, core_add = 0;
2670 	uint32_t device_num_min = num_devices;
2671 	struct vhost_dev *vdev;
2672 	uint32_t regionidx;
2673 
2674 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2675 	if (vdev == NULL) {
2676 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2677 			dev->device_fh);
2678 		return -1;
2679 	}
2680 	vdev->dev = dev;
2681 	dev->priv = vdev;
2682 
2683 	if (zero_copy) {
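		/*
		 * A guest memory region may have to be split wherever its
		 * host physical addresses are not contiguous; count the extra
		 * sub-regions so regions_hpa can be sized before it is filled.
		 */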
2684 		vdev->nregions_hpa = dev->mem->nregions;
2685 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2686 			vdev->nregions_hpa
2687 				+= check_hpa_regions(
2688 					dev->mem->regions[regionidx].guest_phys_address
2689 					+ dev->mem->regions[regionidx].address_offset,
2690 					dev->mem->regions[regionidx].memory_size);
2691 
2692 		}
2693 
2694 		vdev->regions_hpa = rte_calloc("vhost hpa region",
2695 					       vdev->nregions_hpa,
2696 					       sizeof(struct virtio_memory_regions_hpa),
2697 					       RTE_CACHE_LINE_SIZE);
2698 		if (vdev->regions_hpa == NULL) {
2699 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2700 			rte_free(vdev);
2701 			return -1;
2702 		}
2703 
2704 
2705 		if (fill_hpa_memory_regions(
2706 			vdev->regions_hpa, dev->mem
2707 			) != vdev->nregions_hpa) {
2708 
2709 			RTE_LOG(ERR, VHOST_CONFIG,
2710 				"hpa memory regions number mismatch: "
2711 				"[%d]\n", vdev->nregions_hpa);
2712 			rte_free(vdev->regions_hpa);
2713 			rte_free(vdev);
2714 			return -1;
2715 		}
2716 	}
2717 
2718 
2719 	/* Add device to main ll */
2720 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2721 	if (ll_dev == NULL) {
2722 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2723 			"of %d devices per core has been reached\n",
2724 			dev->device_fh, num_devices);
2725 		if (vdev->regions_hpa)
2726 			rte_free(vdev->regions_hpa);
2727 		rte_free(vdev);
2728 		return -1;
2729 	}
2730 	ll_dev->vdev = vdev;
2731 	add_data_ll_entry(&ll_root_used, ll_dev);
2732 	vdev->vmdq_rx_q
2733 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2734 
2735 	if (zero_copy) {
2736 		uint32_t index = vdev->vmdq_rx_q;
2737 		uint32_t count_in_ring, i;
2738 		struct mbuf_table *tx_q;
2739 
2740 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2741 
2742 		RTE_LOG(DEBUG, VHOST_CONFIG,
2743 			"(%"PRIu64") in new_device: mbuf count in mempool "
2744 			"before attach is: %d\n",
2745 			dev->device_fh,
2746 			rte_mempool_count(vpool_array[index].pool));
2747 		RTE_LOG(DEBUG, VHOST_CONFIG,
2748 			"(%"PRIu64") in new_device: mbuf count in  ring "
2749 			"before attach  is : %d\n",
2750 			dev->device_fh, count_in_ring);
2751 
2752 		/*
2753 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2754 		 */
2755 		for (i = 0; i < count_in_ring; i++)
2756 			attach_rxmbuf_zcp(dev);
2757 
2758 		RTE_LOG(DEBUG, VHOST_CONFIG, "(%" PRIu64 ") in new_device: "
2759 			"mbuf count in mempool after attach is: %d\n",
2760 			dev->device_fh,
2761 			rte_mempool_count(vpool_array[index].pool));
2762 		RTE_LOG(DEBUG, VHOST_CONFIG, "(%" PRIu64 ") in new_device: "
2763 			"mbuf count in ring after attach  is : %d\n",
2764 			dev->device_fh,
2765 			rte_ring_count(vpool_array[index].ring));
2766 
2767 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2768 		tx_q->txq_id = vdev->vmdq_rx_q;
2769 
2770 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2771 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2772 
2773 			RTE_LOG(DEBUG, VHOST_CONFIG,
2774 				"(%"PRIu64") In new_device: Failed to start "
2775 				"tx queue:%d\n",
2776 				dev->device_fh, vdev->vmdq_rx_q);
2777 
2778 			mbuf_destroy_zcp(vpool);
2779 			rte_free(vdev->regions_hpa);
2780 			rte_free(vdev);
2781 			return -1;
2782 		}
2783 
2784 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2785 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2786 
2787 			RTE_LOG(DEBUG, VHOST_CONFIG,
2788 				"(%"PRIu64") In new_device: Failed to start "
2789 				"rx queue:%d\n",
2790 				dev->device_fh, vdev->vmdq_rx_q);
2791 
2792 			/* Stop the TX queue. */
2793 			if (rte_eth_dev_tx_queue_stop(ports[0],
2794 				vdev->vmdq_rx_q) != 0) {
2795 				RTE_LOG(DEBUG, VHOST_CONFIG,
2796 					"(%"PRIu64") In new_device: Failed to "
2797 					"stop tx queue:%d\n",
2798 					dev->device_fh, vdev->vmdq_rx_q);
2799 			}
2800 
2801 			mbuf_destroy_zcp(vpool);
2802 			rte_free(vdev->regions_hpa);
2803 			rte_free(vdev);
2804 			return -1;
2805 		}
2806 
2807 	}
2808 
2809 	/*reset ready flag*/
2810 	vdev->ready = DEVICE_MAC_LEARNING;
2811 	vdev->remove = 0;
2812 
2813 	/* Find a suitable lcore to add the device. */
2814 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2815 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2816 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2817 			core_add = lcore;
2818 		}
2819 	}
2820 	/* Add device to lcore ll */
2821 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2822 	if (ll_dev == NULL) {
2823 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2824 		vdev->ready = DEVICE_SAFE_REMOVE;
2825 		destroy_device(dev);
2826 		rte_free(vdev->regions_hpa);
2827 		rte_free(vdev);
2828 		return -1;
2829 	}
2830 	ll_dev->vdev = vdev;
2831 	vdev->coreid = core_add;
2832 
2833 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2834 
2835 	/* Initialize device stats */
2836 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2837 
2838 	/* Disable notifications. */
2839 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2840 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2841 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2842 	dev->flags |= VIRTIO_DEV_RUNNING;
2843 
2844 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2845 
2846 	return 0;
2847 }
2848 
2849 /*
2850  * These callbacks allow devices to be added to the data core when
2851  * configuration has fully completed.
2852  */
2853 static const struct virtio_net_device_ops virtio_net_device_ops =
2854 {
2855 	.new_device =  new_device,
2856 	.destroy_device = destroy_device,
2857 };
2858 
2859 /*
2860  * This thread wakes up periodically to print statistics if the user has
2861  * enabled them.
2862  */
2863 static void
2864 print_stats(void)
2865 {
2866 	struct virtio_net_data_ll *dev_ll;
2867 	uint64_t tx_dropped, rx_dropped;
2868 	uint64_t tx, tx_total, rx, rx_total;
2869 	uint32_t device_fh;
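	/* VT100 escape sequences: clear the screen and move to the top-left. */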
2870 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2871 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2872 
2873 	while(1) {
2874 		sleep(enable_stats);
2875 
2876 		/* Clear screen and move to top left */
2877 		printf("%s%s", clr, top_left);
2878 
2879 		printf("\nDevice statistics ====================================");
2880 
2881 		dev_ll = ll_root_used;
2882 		while (dev_ll != NULL) {
2883 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2884 			tx_total = dev_statistics[device_fh].tx_total;
2885 			tx = dev_statistics[device_fh].tx;
2886 			tx_dropped = tx_total - tx;
2887 			if (zero_copy == 0) {
2888 				rx_total = rte_atomic64_read(
2889 					&dev_statistics[device_fh].rx_total_atomic);
2890 				rx = rte_atomic64_read(
2891 					&dev_statistics[device_fh].rx_atomic);
2892 			} else {
2893 				rx_total = dev_statistics[device_fh].rx_total;
2894 				rx = dev_statistics[device_fh].rx;
2895 			}
2896 			rx_dropped = rx_total - rx;
2897 
2898 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2899 					"\nTX total: 		%"PRIu64""
2900 					"\nTX dropped: 		%"PRIu64""
2901 					"\nTX successful: 		%"PRIu64""
2902 					"\nRX total: 		%"PRIu64""
2903 					"\nRX dropped: 		%"PRIu64""
2904 					"\nRX successful: 		%"PRIu64"",
2905 					device_fh,
2906 					tx_total,
2907 					tx_dropped,
2908 					tx,
2909 					rx_total,
2910 					rx_dropped,
2911 					rx);
2912 
2913 			dev_ll = dev_ll->next;
2914 		}
2915 		printf("\n======================================================\n");
2916 	}
2917 }
2918 
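/*
 * Create the zero-copy mempool and its companion ring for one queue. The
 * ring holds detached mbufs waiting to be attached to guest buffers, while
 * the pool is what the driver allocates from once buffers are attached.
 */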
2919 static void
2920 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2921 	char *ring_name, uint32_t nb_mbuf)
2922 {
2923 	vpool_array[index].pool	= rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2924 		MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2925 	if (vpool_array[index].pool != NULL) {
2926 		vpool_array[index].ring
2927 			= rte_ring_create(ring_name,
2928 				rte_align32pow2(nb_mbuf + 1),
2929 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2930 		if (likely(vpool_array[index].ring != NULL)) {
2931 			RTE_LOG(DEBUG, VHOST_CONFIG,
2932 				"in setup_mempool_tbl: mbuf count in "
2933 				"mempool is: %d\n",
2934 				rte_mempool_count(vpool_array[index].pool));
2935 			RTE_LOG(DEBUG, VHOST_CONFIG,
2936 				"in setup_mempool_tbl: mbuf count in "
2937 				"ring   is: %d\n",
2938 				rte_ring_count(vpool_array[index].ring));
2939 		} else {
2940 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2941 				ring_name);
2942 		}
2943 
2944 		/* Need to take the headroom into account. */
2945 		vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2946 	} else {
2947 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2948 	}
2949 }
2950 
2951 /* When we receive a SIGINT signal, unregister the vhost driver. */
2952 static void
2953 sigint_handler(__rte_unused int signum)
2954 {
2955 	/* Unregister vhost driver. */
2956 	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2957 	if (ret != 0)
2958 		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2959 	exit(0);
2960 }
2961 
2962 /*
2963  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2964  * device is also registered here to handle the IOCTLs.
2965  */
2966 int
2967 main(int argc, char *argv[])
2968 {
2969 	struct rte_mempool *mbuf_pool = NULL;
2970 	unsigned lcore_id, core_id = 0;
2971 	unsigned nb_ports, valid_num_ports;
2972 	int ret;
2973 	uint8_t portid;
2974 	uint16_t queue_id;
2975 	static pthread_t tid;
2976 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
2977 
2978 	signal(SIGINT, sigint_handler);
2979 
2980 	/* init EAL */
2981 	ret = rte_eal_init(argc, argv);
2982 	if (ret < 0)
2983 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2984 	argc -= ret;
2985 	argv += ret;
2986 
2987 	/* parse app arguments */
2988 	ret = us_vhost_parse_args(argc, argv);
2989 	if (ret < 0)
2990 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2991 
2992 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2993 		if (rte_lcore_is_enabled(lcore_id))
2994 			lcore_ids[core_id ++] = lcore_id;
2995 
2996 	if (rte_lcore_count() > RTE_MAX_LCORE)
2997 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2998 	/* Set the number of switching cores available. */
2999 	/*set the number of swithcing cores available*/
3000 	num_switching_cores = rte_lcore_count()-1;
3001 
3002 	/* Get the number of physical ports. */
3003 	nb_ports = rte_eth_dev_count();
3004 	if (nb_ports > RTE_MAX_ETHPORTS)
3005 		nb_ports = RTE_MAX_ETHPORTS;
3006 
3007 	/*
3008 	 * Update the global var NUM_PORTS and global array PORTS, and get the
3009 	 * value of VALID_NUM_PORTS according to the number of ports in the system.
3010 	 */
3011 	valid_num_ports = check_ports_num(nb_ports);
3012 
3013 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3014 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
3015 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
3016 		return -1;
3017 	}
3018 
3019 	if (zero_copy == 0) {
3020 		/* Create the mbuf pool. */
3021 		mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3022 			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3023 			0, MBUF_DATA_SIZE, rte_socket_id());
3024 		if (mbuf_pool == NULL)
3025 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3026 
3027 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3028 			vpool_array[queue_id].pool = mbuf_pool;
3029 
3030 		if (vm2vm_mode == VM2VM_HARDWARE) {
3031 			/* Enable VT loop back to let L2 switch to do it. */
3032 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3033 			RTE_LOG(DEBUG, VHOST_CONFIG,
3034 				"Enable loop back for L2 switch in vmdq.\n");
3035 		}
3036 	} else {
3037 		uint32_t nb_mbuf;
3038 		char pool_name[RTE_MEMPOOL_NAMESIZE];
3039 		char ring_name[RTE_MEMPOOL_NAMESIZE];
3040 
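		/*
		 * One mbuf per RX descriptor plus per-core cache and burst
		 * slack for each switching core.
		 */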
3041 		nb_mbuf = num_rx_descriptor
3042 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3043 			+ num_switching_cores * MAX_PKT_BURST;
3044 
3045 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3046 			snprintf(pool_name, sizeof(pool_name),
3047 				"rxmbuf_pool_%u", queue_id);
3048 			snprintf(ring_name, sizeof(ring_name),
3049 				"rxmbuf_ring_%u", queue_id);
3050 			setup_mempool_tbl(rte_socket_id(), queue_id,
3051 				pool_name, ring_name, nb_mbuf);
3052 		}
3053 
3054 		nb_mbuf = num_tx_descriptor
3055 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
3056 				+ num_switching_cores * MAX_PKT_BURST;
3057 
3058 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3059 			snprintf(pool_name, sizeof(pool_name),
3060 				"txmbuf_pool_%u", queue_id);
3061 			snprintf(ring_name, sizeof(ring_name),
3062 				"txmbuf_ring_%u", queue_id);
3063 			setup_mempool_tbl(rte_socket_id(),
3064 				(queue_id + MAX_QUEUES),
3065 				pool_name, ring_name, nb_mbuf);
3066 		}
3067 
3068 		if (vm2vm_mode == VM2VM_HARDWARE) {
3069 			/* Enable VT loop back to let L2 switch to do it. */
3070 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3071 			RTE_LOG(DEBUG, VHOST_CONFIG,
3072 				"Enable loop back for L2 switch in vmdq.\n");
3073 		}
3074 	}
3075 
3076 	/* initialize all ports */
3077 	for (portid = 0; portid < nb_ports; portid++) {
3078 		/* skip ports that are not enabled */
3079 		if ((enabled_port_mask & (1 << portid)) == 0) {
3080 			RTE_LOG(INFO, VHOST_PORT,
3081 				"Skipping disabled port %d\n", portid);
3082 			continue;
3083 		}
3084 		if (port_init(portid) != 0)
3085 			rte_exit(EXIT_FAILURE,
3086 				"Cannot initialize network ports\n");
3087 	}
3088 
3089 	/* Initialise all linked lists. */
3090 	if (init_data_ll() == -1)
3091 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3092 
3093 	/* Initialize device stats */
3094 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3095 
3096 	/* Enable stats if the user option is set. */
3097 	if (enable_stats) {
3098 		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3099 		if (ret != 0)
3100 			rte_exit(EXIT_FAILURE,
3101 				"Cannot create print-stats thread\n");
3102 
3103 		/* Set thread_name for aid in debugging.  */
3104 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3105 		ret = rte_thread_setname(tid, thread_name);
3106 		if (ret != 0)
3107 			RTE_LOG(ERR, VHOST_CONFIG,
3108 				"Cannot set print-stats name\n");
3109 	}
3110 
3111 	/* Launch all data cores. */
3112 	if (zero_copy == 0) {
3113 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3114 			rte_eal_remote_launch(switch_worker,
3115 				mbuf_pool, lcore_id);
3116 		}
3117 	} else {
3118 		uint32_t count_in_mempool, index, i;
3119 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3120 			/* For all RX and TX queues. */
3121 			count_in_mempool
3122 				= rte_mempool_count(vpool_array[index].pool);
3123 
3124 			/*
3125 			 * Transfer all unattached mbufs from vpool.pool
3126 			 * to vpool.ring.
3127 			 */
3128 			for (i = 0; i < count_in_mempool; i++) {
3129 				struct rte_mbuf *mbuf
3130 					= __rte_mbuf_raw_alloc(
3131 						vpool_array[index].pool);
3132 				rte_ring_sp_enqueue(vpool_array[index].ring,
3133 						(void *)mbuf);
3134 			}
3135 
3136 			RTE_LOG(DEBUG, VHOST_CONFIG,
3137 				"in main: mbuf count in mempool at initial "
3138 				"is: %d\n", count_in_mempool);
3139 			RTE_LOG(DEBUG, VHOST_CONFIG,
3140 				"in main: mbuf count in  ring at initial  is :"
3141 				" %d\n",
3142 				rte_ring_count(vpool_array[index].ring));
3143 		}
3144 
3145 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3146 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3147 				lcore_id);
3148 	}
3149 
3150 	if (mergeable == 0)
3151 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3152 
3153 	/* Register vhost(cuse or user) driver to handle vhost messages. */
3154 	ret = rte_vhost_driver_register((char *)&dev_basename);
3155 	if (ret != 0)
3156 		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3157 
3158 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3159 
3160 	/* Start CUSE session. */
3161 	rte_vhost_driver_session_start();
3162 	return 0;
3163 
3164 }
3165