xref: /dpdk/examples/vhost/main.c (revision bd89cca3ca34d255e48fa4246998c89bb38301d4)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
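/*
 * For illustration only (num_switching_cores = 2 is a hypothetical value, not
 * a default): 512*1024 + 2*32 + 2*512 + 2*128 = 525,632 mbufs.
 */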
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers need to be allocated on the host for the zero copy
74  * implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
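/* 0x2600 = 9728 bytes. */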
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
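/*
 * The configuration core sets dev_removal_flag to REQUEST_DEV_REMOVAL; each
 * data core acknowledges with ACK_DEV_REMOVAL once it is no longer walking
 * the device list (see switch_worker()).
 */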
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These two macros need refinement for the legacy and DPDK based front ends:
105  * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
106  * and then adjust to a power of 2.
107  */
108 /*
109  * For the legacy front end, 128 descriptors:
110  * half for the virtio header, the other half for the mbuf.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
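/* The zero copy path stashes the guest descriptor index here (see attach_rxmbuf_zcp()). */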
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
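/* Note: POWEROF2(0) also evaluates to true. */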
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
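/* e.g. with a 64-byte cache line and the 16-byte struct vring_desc this is 4. */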
141 
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144 
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147 
148 /*Number of switching cores enabled*/
149 static uint32_t num_switching_cores = 0;
150 
151 /* number of devices/queues to support*/
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154 
155 /*
156  * Enable zero copy: guest packet buffers are used directly for HW DMA
157  * (programmed into the HW descriptors). Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161 
162 /* Number of RX/TX descriptors to use */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165 
166 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
167 #define MAX_RING_DESC 4096
168 
169 struct vpool {
170 	struct rte_mempool *pool;
171 	struct rte_ring *ring;
172 	uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174 
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177 	VM2VM_DISABLED = 0,
178 	VM2VM_SOFTWARE = 1,
179 	VM2VM_HARDWARE = 2,
180 	VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
183 
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186 	PHYS_ADDR_CONTINUOUS = 0,
187 	PHYS_ADDR_CROSS_SUBREG = 1,
188 	PHYS_ADDR_INVALID = 2,
189 	PHYS_ADDR_LAST
190 } hpa_type;
191 
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify timeout (in useconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200 
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203 
204 /* Empty vmdq configuration structure. Filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206 	.rxmode = {
207 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208 		.split_hdr_size = 0,
209 		.header_split   = 0, /**< Header Split disabled */
210 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
211 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
212 		/*
213 		 * This is necessary for 1G NICs such as the I350; it fixes a bug
214 		 * where IPv4 forwarding in the guest cannot forward packets from
215 		 * one virtio dev to another virtio dev.
216 		 */
217 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219 		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
220 	},
221 
222 	.txmode = {
223 		.mq_mode = ETH_MQ_TX_NONE,
224 	},
225 	.rx_adv_conf = {
226 		/*
227 		 * should be overridden separately in code with
228 		 * appropriate values
229 		 */
230 		.vmdq_rx_conf = {
231 			.nb_queue_pools = ETH_8_POOLS,
232 			.enable_default_pool = 0,
233 			.default_pool = 0,
234 			.nb_pool_maps = 0,
235 			.pool_map = {{0, 0},},
236 		},
237 	},
238 };
239 
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246 
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
251 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
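/* Each virtio device is registered with vlan_tags[device_fh] (see link_vmdq()). */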
258 
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261 
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265 
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268 
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271 	unsigned len;
272 	unsigned txq_id;
273 	struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275 
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278 
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281 
282 /* Vlan header struct used to insert vlan tags on TX. */
283 struct vlan_ethhdr {
284 	unsigned char   h_dest[ETH_ALEN];
285 	unsigned char   h_source[ETH_ALEN];
286 	__be16          h_vlan_proto;
287 	__be16          h_vlan_TCI;
288 	__be16          h_vlan_encapsulated_proto;
289 };
290 
291 /* IPv4 Header */
292 struct ipv4_hdr {
293 	uint8_t  version_ihl;		/**< version and header length */
294 	uint8_t  type_of_service;	/**< type of service */
295 	uint16_t total_length;		/**< length of packet */
296 	uint16_t packet_id;		/**< packet ID */
297 	uint16_t fragment_offset;	/**< fragmentation offset */
298 	uint8_t  time_to_live;		/**< time to live */
299 	uint8_t  next_proto_id;		/**< protocol ID */
300 	uint16_t hdr_checksum;		/**< header checksum */
301 	uint32_t src_addr;		/**< source address */
302 	uint32_t dst_addr;		/**< destination address */
303 } __attribute__((__packed__));
304 
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
308 
309 /* Per-device statistics struct */
310 struct device_statistics {
311 	uint64_t tx_total;
312 	rte_atomic64_t rx_total_atomic;
313 	uint64_t rx_total;
314 	uint64_t tx;
315 	rte_atomic64_t rx_atomic;
316 	uint64_t rx;
317 } __rte_cache_aligned;
318 struct device_statistics dev_statistics[MAX_DEVICES];
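/*
 * The RX counters are atomic because any data core may deliver VM2VM traffic
 * to a device it does not own (see virtio_tx_local()).
 */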
319 
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327 	struct rte_eth_vmdq_rx_conf conf;
328 	struct rte_eth_vmdq_rx_conf *def_conf =
329 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330 	unsigned i;
331 
332 	memset(&conf, 0, sizeof(conf));
333 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334 	conf.nb_pool_maps = num_devices;
335 	conf.enable_loop_back = def_conf->enable_loop_back;
336 	conf.rx_mode = def_conf->rx_mode;
337 
338 	for (i = 0; i < conf.nb_pool_maps; i++) {
339 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
340 		conf.pool_map[i].pools = (1UL << i);
341 	}
342 
343 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346 	return 0;
347 }
348 
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, print an error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357 	if (num_devices > max_nb_devices) {
358 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359 		return -1;
360 	}
361 	return 0;
362 }
363 
364 /*
365  * Initialises a given port using global settings and with the rx buffers
366  * coming from the mbuf_pool passed as parameter
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371 	struct rte_eth_dev_info dev_info;
372 	struct rte_eth_conf port_conf;
373 	struct rte_eth_rxconf *rxconf;
374 	struct rte_eth_txconf *txconf;
375 	int16_t rx_rings, tx_rings;
376 	uint16_t rx_ring_size, tx_ring_size;
377 	int retval;
378 	uint16_t q;
379 
380 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381 	rte_eth_dev_info_get (port, &dev_info);
382 
383 	if (dev_info.max_rx_queues > MAX_QUEUES) {
384 		rte_exit(EXIT_FAILURE,
385 			"please define MAX_QUEUES no less than %u in %s\n",
386 			dev_info.max_rx_queues, __FILE__);
387 	}
388 
389 	rxconf = &dev_info.default_rxconf;
390 	txconf = &dev_info.default_txconf;
391 	rxconf->rx_drop_en = 1;
392 
393 	/* Enable vlan offload */
394 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
395 
396 	/*
397 	 * Zero copy defers queue RX/TX start to the time when guest
398 	 * finishes its startup and packet buffers from that guest are
399 	 * available.
400 	 */
401 	if (zero_copy) {
402 		rxconf->rx_deferred_start = 1;
403 		rxconf->rx_drop_en = 0;
404 		txconf->tx_deferred_start = 1;
405 	}
406 
407 	/* Configure the number of supported virtio devices based on VMDQ limits. */
408 	num_devices = dev_info.max_vmdq_pools;
409 
410 	if (zero_copy) {
411 		rx_ring_size = num_rx_descriptor;
412 		tx_ring_size = num_tx_descriptor;
413 		tx_rings = dev_info.max_tx_queues;
414 	} else {
415 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
416 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
417 		tx_rings = (uint16_t)rte_lcore_count();
418 	}
419 
420 	retval = validate_num_devices(MAX_DEVICES);
421 	if (retval < 0)
422 		return retval;
423 
424 	/* Get port configuration. */
425 	retval = get_eth_conf(&port_conf, num_devices);
426 	if (retval < 0)
427 		return retval;
428 	/* NIC queues are divided into pf queues and vmdq queues.  */
429 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
430 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
431 	num_vmdq_queues = num_devices * queues_per_pool;
432 	num_queues = num_pf_queues + num_vmdq_queues;
433 	vmdq_queue_base = dev_info.vmdq_queue_base;
434 	vmdq_pool_base  = dev_info.vmdq_pool_base;
435 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
436 		num_pf_queues, num_devices, queues_per_pool);
437 
438 	if (port >= rte_eth_dev_count()) return -1;
439 
440 	rx_rings = (uint16_t)dev_info.max_rx_queues;
441 	/* Configure ethernet device. */
442 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
443 	if (retval != 0)
444 		return retval;
445 
446 	/* Setup the queues. */
447 	for (q = 0; q < rx_rings; q ++) {
448 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
449 						rte_eth_dev_socket_id(port),
450 						rxconf,
451 						vpool_array[q].pool);
452 		if (retval < 0)
453 			return retval;
454 	}
455 	for (q = 0; q < tx_rings; q ++) {
456 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
457 						rte_eth_dev_socket_id(port),
458 						txconf);
459 		if (retval < 0)
460 			return retval;
461 	}
462 
463 	/* Start the device. */
464 	retval  = rte_eth_dev_start(port);
465 	if (retval < 0) {
466 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
467 		return retval;
468 	}
469 
470 	if (promiscuous)
471 		rte_eth_promiscuous_enable(port);
472 
473 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
474 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
475 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
476 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
477 			(unsigned)port,
478 			vmdq_ports_eth_addr[port].addr_bytes[0],
479 			vmdq_ports_eth_addr[port].addr_bytes[1],
480 			vmdq_ports_eth_addr[port].addr_bytes[2],
481 			vmdq_ports_eth_addr[port].addr_bytes[3],
482 			vmdq_ports_eth_addr[port].addr_bytes[4],
483 			vmdq_ports_eth_addr[port].addr_bytes[5]);
484 
485 	return 0;
486 }
487 
488 /*
489  * Set character device basename.
490  */
491 static int
492 us_vhost_parse_basename(const char *q_arg)
493 {
494 	/* Reject basenames that do not fit into dev_basename. */
495 
496 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
497 		return -1;
498 	else
499 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
500 
501 	return 0;
502 }
503 
504 /*
505  * Parse the portmask provided at run time.
506  */
507 static int
508 parse_portmask(const char *portmask)
509 {
510 	char *end = NULL;
511 	unsigned long pm;
512 
513 	errno = 0;
514 
515 	/* parse hexadecimal string */
516 	pm = strtoul(portmask, &end, 16);
517 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
518 		return -1;
519 
520 	if (pm == 0)
521 		return -1;
522 
523 	return pm;
524 
525 }
526 
527 /*
528  * Parse num options at run time.
529  */
530 static int
531 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
532 {
533 	char *end = NULL;
534 	unsigned long num;
535 
536 	errno = 0;
537 
538 	/* parse unsigned int string */
539 	num = strtoul(q_arg, &end, 10);
540 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
541 		return -1;
542 
543 	if (num > max_valid_value)
544 		return -1;
545 
546 	return num;
547 
548 }
549 
550 /*
551  * Display usage
552  */
553 static void
554 us_vhost_usage(const char *prgname)
555 {
556 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
557 	"		--vm2vm [0|1|2]\n"
558 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
559 	"		--dev-basename <name>\n"
560 	"		--nb-devices ND\n"
561 	"		-p PORTMASK: Set mask for ports to be used by application\n"
562 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
563 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
564 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
565 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
566 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
567 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
568 	"		--dev-basename: The basename to be used for the character device.\n"
569 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
570 			"zero copy\n"
571 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
572 			"used only when zero copy is enabled.\n"
573 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
574 			"used only when zero copy is enabled.\n",
575 	       prgname);
576 }
577 
578 /*
579  * Parse the arguments given in the command line of the application.
580  */
581 static int
582 us_vhost_parse_args(int argc, char **argv)
583 {
584 	int opt, ret;
585 	int option_index;
586 	unsigned i;
587 	const char *prgname = argv[0];
588 	static struct option long_option[] = {
589 		{"vm2vm", required_argument, NULL, 0},
590 		{"rx-retry", required_argument, NULL, 0},
591 		{"rx-retry-delay", required_argument, NULL, 0},
592 		{"rx-retry-num", required_argument, NULL, 0},
593 		{"mergeable", required_argument, NULL, 0},
594 		{"stats", required_argument, NULL, 0},
595 		{"dev-basename", required_argument, NULL, 0},
596 		{"zero-copy", required_argument, NULL, 0},
597 		{"rx-desc-num", required_argument, NULL, 0},
598 		{"tx-desc-num", required_argument, NULL, 0},
599 		{NULL, 0, 0, 0},
600 	};
601 
602 	/* Parse command line */
603 	while ((opt = getopt_long(argc, argv, "p:P",
604 			long_option, &option_index)) != EOF) {
605 		switch (opt) {
606 		/* Portmask */
607 		case 'p':
608 			enabled_port_mask = parse_portmask(optarg);
609 			if (enabled_port_mask == 0) {
610 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611 				us_vhost_usage(prgname);
612 				return -1;
613 			}
614 			break;
615 
616 		case 'P':
617 			promiscuous = 1;
618 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
619 				ETH_VMDQ_ACCEPT_BROADCAST |
620 				ETH_VMDQ_ACCEPT_MULTICAST;
621 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
622 
623 			break;
624 
625 		case 0:
626 			/* Enable/disable vm2vm comms. */
627 			if (!strncmp(long_option[option_index].name, "vm2vm",
628 				MAX_LONG_OPT_SZ)) {
629 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
630 				if (ret == -1) {
631 					RTE_LOG(INFO, VHOST_CONFIG,
632 						"Invalid argument for "
633 						"vm2vm [0|1|2]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					vm2vm_mode = (vm2vm_type)ret;
638 				}
639 			}
640 
641 			/* Enable/disable retries on RX. */
642 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
643 				ret = parse_num_opt(optarg, 1);
644 				if (ret == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
646 					us_vhost_usage(prgname);
647 					return -1;
648 				} else {
649 					enable_retry = ret;
650 				}
651 			}
652 
653 			/* Specify the retry delay time (in microseconds) on RX. */
654 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
655 				ret = parse_num_opt(optarg, INT32_MAX);
656 				if (ret == -1) {
657 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
658 					us_vhost_usage(prgname);
659 					return -1;
660 				} else {
661 					burst_rx_delay_time = ret;
662 				}
663 			}
664 
665 			/* Specify the number of retries on RX. */
666 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
667 				ret = parse_num_opt(optarg, INT32_MAX);
668 				if (ret == -1) {
669 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
670 					us_vhost_usage(prgname);
671 					return -1;
672 				} else {
673 					burst_rx_retry_num = ret;
674 				}
675 			}
676 
677 			/* Enable/disable RX mergeable buffers. */
678 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
679 				ret = parse_num_opt(optarg, 1);
680 				if (ret == -1) {
681 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
682 					us_vhost_usage(prgname);
683 					return -1;
684 				} else {
685 					mergeable = !!ret;
686 					if (ret) {
687 						vmdq_conf_default.rxmode.jumbo_frame = 1;
688 						vmdq_conf_default.rxmode.max_rx_pkt_len
689 							= JUMBO_FRAME_MAX_SIZE;
690 					}
691 				}
692 			}
693 
694 			/* Enable/disable stats. */
695 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
696 				ret = parse_num_opt(optarg, INT32_MAX);
697 				if (ret == -1) {
698 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
699 					us_vhost_usage(prgname);
700 					return -1;
701 				} else {
702 					enable_stats = ret;
703 				}
704 			}
705 
706 			/* Set character device basename. */
707 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
708 				if (us_vhost_parse_basename(optarg) == -1) {
709 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
710 					us_vhost_usage(prgname);
711 					return -1;
712 				}
713 			}
714 
715 			/* Enable/disable rx/tx zero copy. */
716 			if (!strncmp(long_option[option_index].name,
717 				"zero-copy", MAX_LONG_OPT_SZ)) {
718 				ret = parse_num_opt(optarg, 1);
719 				if (ret == -1) {
720 					RTE_LOG(INFO, VHOST_CONFIG,
721 						"Invalid argument"
722 						" for zero-copy [0|1]\n");
723 					us_vhost_usage(prgname);
724 					return -1;
725 				} else
726 					zero_copy = ret;
727 
728 				if (zero_copy) {
729 #ifdef RTE_MBUF_REFCNT
730 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
731 					"zero copy vhost APP, please "
732 					"disable RTE_MBUF_REFCNT\n"
733 					"in config file and then rebuild DPDK "
734 					"core lib!\n"
735 					"Otherwise please disable zero copy "
736 					"flag in command line!\n");
737 					return -1;
738 #endif
739 				}
740 			}
741 
742 			/* Specify the number of RX descriptors. */
743 			if (!strncmp(long_option[option_index].name,
744 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
745 				ret = parse_num_opt(optarg, MAX_RING_DESC);
746 				if ((ret == -1) || (!POWEROF2(ret))) {
747 					RTE_LOG(INFO, VHOST_CONFIG,
748 					"Invalid argument for rx-desc-num [0-N], "
749 					"power of 2 required.\n");
750 					us_vhost_usage(prgname);
751 					return -1;
752 				} else {
753 					num_rx_descriptor = ret;
754 				}
755 			}
756 
757 			/* Specify the number of TX descriptors. */
758 			if (!strncmp(long_option[option_index].name,
759 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
760 				ret = parse_num_opt(optarg, MAX_RING_DESC);
761 				if ((ret == -1) || (!POWEROF2(ret))) {
762 					RTE_LOG(INFO, VHOST_CONFIG,
763 					"Invalid argument for tx-desc-num [0-N], "
764 					"power of 2 required.\n");
765 					us_vhost_usage(prgname);
766 					return -1;
767 				} else {
768 					num_tx_descriptor = ret;
769 				}
770 			}
771 
772 			break;
773 
774 			/* Invalid option - print options. */
775 		default:
776 			us_vhost_usage(prgname);
777 			return -1;
778 		}
779 	}
780 
781 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
782 		if (enabled_port_mask & (1 << i))
783 			ports[num_ports++] = (uint8_t)i;
784 	}
785 
786 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
787 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
788 			"but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
789 		return -1;
790 	}
791 
792 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
793 		RTE_LOG(INFO, VHOST_PORT,
794 			"Vhost zero copy doesn't support software vm2vm, "
795 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
796 		return -1;
797 	}
798 
799 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
800 		RTE_LOG(INFO, VHOST_PORT,
801 			"Vhost zero copy doesn't support jumbo frames, "
802 			"please specify '--mergeable 0' to disable the "
803 			"mergeable feature.\n");
804 		return -1;
805 	}
806 
807 	return 0;
808 }
809 
810 /*
811  * Update the global variable num_ports and the ports[] array according to
812  * the number of ports in the system, and return the number of valid ports.
813  */
814 static unsigned check_ports_num(unsigned nb_ports)
815 {
816 	unsigned valid_num_ports = num_ports;
817 	unsigned portid;
818 
819 	if (num_ports > nb_ports) {
820 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
821 			num_ports, nb_ports);
822 		num_ports = nb_ports;
823 	}
824 
825 	for (portid = 0; portid < num_ports; portid ++) {
826 		if (ports[portid] >= nb_ports) {
827 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
828 				ports[portid], (nb_ports - 1));
829 			ports[portid] = INVALID_PORT_ID;
830 			valid_num_ports--;
831 		}
832 	}
833 	return valid_num_ports;
834 }
835 
836 /*
837  * Macro to print out packet contents. Wrapped in debug define so that the
838  * data path is not affected when debug is disabled.
839  */
840 #ifdef DEBUG
841 #define PRINT_PACKET(device, addr, size, header) do {																\
842 	char *pkt_addr = (char*)(addr);																					\
843 	unsigned int index;																								\
844 	char packet[MAX_PRINT_BUFF];																					\
845 																													\
846 	if ((header))																									\
847 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
848 	else																											\
849 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
850 	for (index = 0; index < (size); index++) {																		\
851 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
852 			"%02hhx ", pkt_addr[index]);																			\
853 	}																												\
854 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
855 																													\
856 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
857 } while(0)
858 #else
859 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
860 #endif
861 
862 /*
863  * Function to convert guest physical addresses to vhost physical addresses.
864  * This is used to convert virtio buffer addresses.
865  */
866 static inline uint64_t __attribute__((always_inline))
867 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
868 	uint32_t buf_len, hpa_type *addr_type)
869 {
870 	struct virtio_memory_regions_hpa *region;
871 	uint32_t regionidx;
872 	uint64_t vhost_pa = 0;
873 
874 	*addr_type = PHYS_ADDR_INVALID;
875 
876 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
877 		region = &vdev->regions_hpa[regionidx];
878 		if ((guest_pa >= region->guest_phys_address) &&
879 			(guest_pa <= region->guest_phys_address_end)) {
880 			vhost_pa = region->host_phys_addr_offset + guest_pa;
881 			if (likely((guest_pa + buf_len - 1)
882 				<= region->guest_phys_address_end))
883 				*addr_type = PHYS_ADDR_CONTINUOUS;
884 			else
885 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
886 			break;
887 		}
888 	}
889 
890 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
891 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
892 		(void *)(uintptr_t)vhost_pa);
893 
894 	return vhost_pa;
895 }
896 
897 /*
898  * Compares a packet destination MAC address to a device MAC address.
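 * Only the lower 48 bits (the 6-byte MAC) of the two 64-bit loads are
 * compared; MAC_ADDR_CMP masks off the remaining two bytes.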
899  */
900 static inline int __attribute__((always_inline))
901 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
902 {
903 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
904 }
905 
906 /*
907  * This function learns the MAC address of the device and registers this along with a
908  * vlan tag to a VMDQ.
909  */
910 static int
911 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
912 {
913 	struct ether_hdr *pkt_hdr;
914 	struct virtio_net_data_ll *dev_ll;
915 	struct virtio_net *dev = vdev->dev;
916 	int i, ret;
917 
918 	/* Learn MAC address of guest device from packet */
919 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
920 
921 	dev_ll = ll_root_used;
922 
923 	while (dev_ll != NULL) {
924 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
925 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
926 			return -1;
927 		}
928 		dev_ll = dev_ll->next;
929 	}
930 
931 	for (i = 0; i < ETHER_ADDR_LEN; i++)
932 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
933 
934 	/* vlan_tag currently uses the device_id. */
935 	vdev->vlan_tag = vlan_tags[dev->device_fh];
936 
937 	/* Print out VMDQ registration info. */
938 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
939 		dev->device_fh,
940 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
941 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
942 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
943 		vdev->vlan_tag);
944 
945 	/* Register the MAC address. */
946 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
947 				(uint32_t)dev->device_fh + vmdq_pool_base);
948 	if (ret)
949 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
950 					dev->device_fh);
951 
952 	/* Enable stripping of the vlan tag as we handle routing. */
953 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
954 
955 	/* Set device as ready for RX. */
956 	vdev->ready = DEVICE_RX;
957 
958 	return 0;
959 }
960 
961 /*
962  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
963  * queue before disabling RX on the device.
964  */
965 static inline void
966 unlink_vmdq(struct vhost_dev *vdev)
967 {
968 	unsigned i = 0;
969 	unsigned rx_count;
970 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
971 
972 	if (vdev->ready == DEVICE_RX) {
973 		/*clear MAC and VLAN settings*/
974 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
975 		for (i = 0; i < 6; i++)
976 			vdev->mac_address.addr_bytes[i] = 0;
977 
978 		vdev->vlan_tag = 0;
979 
980 		/*Clear out the receive buffers*/
981 		rx_count = rte_eth_rx_burst(ports[0],
982 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
983 
984 		while (rx_count) {
985 			for (i = 0; i < rx_count; i++)
986 				rte_pktmbuf_free(pkts_burst[i]);
987 
988 			rx_count = rte_eth_rx_burst(ports[0],
989 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
990 		}
991 
992 		vdev->ready = DEVICE_MAC_LEARNING;
993 	}
994 }
995 
996 /*
997  * Check if the packet destination MAC address is for a local device. If so then put
998  * the packet on that device's RX queue. If not then return.
999  */
1000 static inline int __attribute__((always_inline))
1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1002 {
1003 	struct virtio_net_data_ll *dev_ll;
1004 	struct ether_hdr *pkt_hdr;
1005 	uint64_t ret = 0;
1006 	struct virtio_net *dev = vdev->dev;
1007 	struct virtio_net *tdev; /* destination virtio device */
1008 
1009 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1010 
1011 	/*get the used devices list*/
1012 	dev_ll = ll_root_used;
1013 
1014 	while (dev_ll != NULL) {
1015 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1016 				          &dev_ll->vdev->mac_address)) {
1017 
1018 			/* Drop the packet if the TX packet is destined for the TX device. */
1019 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1020 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1021 							dev->device_fh);
1022 				return 0;
1023 			}
1024 			tdev = dev_ll->vdev->dev;
1025 
1026 
1027 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1028 
1029 			if (unlikely(dev_ll->vdev->remove)) {
1030 				/*drop the packet if the device is marked for removal*/
1031 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1032 			} else {
1033 				/*send the packet to the local virtio device*/
1034 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1035 				if (enable_stats) {
1036 					rte_atomic64_add(
1037 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1038 					1);
1039 					rte_atomic64_add(
1040 					&dev_statistics[tdev->device_fh].rx_atomic,
1041 					ret);
1042 					dev_statistics[tdev->device_fh].tx_total++;
1043 					dev_statistics[tdev->device_fh].tx += ret;
1044 				}
1045 			}
1046 
1047 			return 0;
1048 		}
1049 		dev_ll = dev_ll->next;
1050 	}
1051 
1052 	return -1;
1053 }
1054 
1055 /*
1056  * Check if the destination MAC of a packet belongs to a local VM, and if
1057  * so get its vlan tag and the length offset to apply.
1058  */
1059 static inline int __attribute__((always_inline))
1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1061 	uint32_t *offset, uint16_t *vlan_tag)
1062 {
1063 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1064 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1065 
1066 	while (dev_ll != NULL) {
1067 		if ((dev_ll->vdev->ready == DEVICE_RX)
1068 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1069 		&dev_ll->vdev->mac_address)) {
1070 			/*
1071 			 * Drop the packet if the TX packet is
1072 			 * destined for the TX device.
1073 			 */
1074 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1075 				LOG_DEBUG(VHOST_DATA,
1076 				"(%"PRIu64") TX: Source and destination"
1077 				" MAC addresses are the same. Dropping "
1078 				"packet.\n",
1079 				dev_ll->vdev->dev->device_fh);
1080 				return -1;
1081 			}
1082 
1083 			/*
1084 			 * HW vlan strip reduces the packet length by the
1085 			 * length of the vlan tag, so the packet length must
1086 			 * be restored by adding it back.
1087 			 */
1088 			*offset = VLAN_HLEN;
1089 			*vlan_tag =
1090 			(uint16_t)
1091 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1092 
1093 			LOG_DEBUG(VHOST_DATA,
1094 			"(%"PRIu64") TX: pkt to local VM device id:"
1095 			"(%"PRIu64") vlan tag: %d.\n",
1096 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1097 			*vlan_tag);
1098 
1099 			break;
1100 		}
1101 		dev_ll = dev_ll->next;
1102 	}
1103 	return 0;
1104 }
1105 
1106 /*
1107  * This function routes the TX packet to the correct interface. This may be a local device
1108  * or the physical port.
1109  */
1110 static inline void __attribute__((always_inline))
1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1112 {
1113 	struct mbuf_table *tx_q;
1114 	struct rte_mbuf **m_table;
1115 	unsigned len, ret, offset = 0;
1116 	const uint16_t lcore_id = rte_lcore_id();
1117 	struct virtio_net *dev = vdev->dev;
1118 
1119 	/*check if destination is local VM*/
1120 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1121 		rte_pktmbuf_free(m);
1122 		return;
1123 	}
1124 
1125 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1126 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1127 			rte_pktmbuf_free(m);
1128 			return;
1129 		}
1130 	}
1131 
1132 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1133 
1134 	/*Add packet to the port tx queue*/
1135 	tx_q = &lcore_tx_queue[lcore_id];
1136 	len = tx_q->len;
1137 
1138 	m->ol_flags = PKT_TX_VLAN_PKT;
1139 
1140 	/*
1141 	 * Find the right seg to adjust the data len when offset is
1142 	 * bigger than tail room size.
1143 	 */
1144 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1145 		if (likely(offset <= rte_pktmbuf_tailroom(m)))
1146 			m->data_len += offset;
1147 		else {
1148 			struct rte_mbuf *seg = m;
1149 
1150 			while ((seg->next != NULL) &&
1151 				(offset > rte_pktmbuf_tailroom(seg)))
1152 				seg = seg->next;
1153 
1154 			seg->data_len += offset;
1155 		}
1156 		m->pkt_len += offset;
1157 	}
1158 
1159 	m->vlan_tci = vlan_tag;
1160 
1161 	tx_q->m_table[len] = m;
1162 	len++;
1163 	if (enable_stats) {
1164 		dev_statistics[dev->device_fh].tx_total++;
1165 		dev_statistics[dev->device_fh].tx++;
1166 	}
1167 
1168 	if (unlikely(len == MAX_PKT_BURST)) {
1169 		m_table = (struct rte_mbuf **)tx_q->m_table;
1170 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1171 		/* Free any buffers not handled by TX and update the port stats. */
1172 		if (unlikely(ret < len)) {
1173 			do {
1174 				rte_pktmbuf_free(m_table[ret]);
1175 			} while (++ret < len);
1176 		}
1177 
1178 		len = 0;
1179 	}
1180 
1181 	tx_q->len = len;
1182 	return;
1183 }
1184 /*
1185  * This function is called by each data core. It handles all RX/TX registered with the
1186  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1187  * with all devices in the main linked list.
1188  */
1189 static int
1190 switch_worker(void *arg)
1191 {
1192 	struct rte_mempool *mbuf_pool = arg;
1193 	struct virtio_net *dev = NULL;
1194 	struct vhost_dev *vdev = NULL;
1195 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1196 	struct virtio_net_data_ll *dev_ll;
1197 	struct mbuf_table *tx_q;
1198 	volatile struct lcore_ll_info *lcore_ll;
1199 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1200 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1201 	unsigned ret, i;
1202 	const uint16_t lcore_id = rte_lcore_id();
1203 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1204 	uint16_t rx_count = 0;
1205 	uint16_t tx_count;
1206 	uint32_t retry = 0;
1207 
1208 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1209 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1210 	prev_tsc = 0;
1211 
1212 	tx_q = &lcore_tx_queue[lcore_id];
1213 	for (i = 0; i < num_cores; i ++) {
1214 		if (lcore_ids[i] == lcore_id) {
1215 			tx_q->txq_id = i;
1216 			break;
1217 		}
1218 	}
1219 
1220 	while(1) {
1221 		cur_tsc = rte_rdtsc();
1222 		/*
1223 		 * TX burst queue drain
1224 		 */
1225 		diff_tsc = cur_tsc - prev_tsc;
1226 		if (unlikely(diff_tsc > drain_tsc)) {
1227 
1228 			if (tx_q->len) {
1229 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1230 
1231 				/*Tx any packets in the queue*/
1232 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1233 									   (struct rte_mbuf **)tx_q->m_table,
1234 									   (uint16_t)tx_q->len);
1235 				if (unlikely(ret < tx_q->len)) {
1236 					do {
1237 						rte_pktmbuf_free(tx_q->m_table[ret]);
1238 					} while (++ret < tx_q->len);
1239 				}
1240 
1241 				tx_q->len = 0;
1242 			}
1243 
1244 			prev_tsc = cur_tsc;
1245 
1246 		}
1247 
1248 		rte_prefetch0(lcore_ll->ll_root_used);
1249 		/*
1250 		 * Inform the configuration core that we have exited the linked list and that no devices are
1251 		 * in use if requested.
1252 		 */
1253 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1254 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1255 
1256 		/*
1257 		 * Process devices
1258 		 */
1259 		dev_ll = lcore_ll->ll_root_used;
1260 
1261 		while (dev_ll != NULL) {
1262 			/*get virtio device ID*/
1263 			vdev = dev_ll->vdev;
1264 			dev = vdev->dev;
1265 
1266 			if (unlikely(vdev->remove)) {
1267 				dev_ll = dev_ll->next;
1268 				unlink_vmdq(vdev);
1269 				vdev->ready = DEVICE_SAFE_REMOVE;
1270 				continue;
1271 			}
1272 			if (likely(vdev->ready == DEVICE_RX)) {
1273 				/*Handle guest RX*/
1274 				rx_count = rte_eth_rx_burst(ports[0],
1275 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1276 
1277 				if (rx_count) {
1278 					/*
1279 					* If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1280 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1281 					*/
1282 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1283 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1284 							rte_delay_us(burst_rx_delay_time);
1285 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1286 								break;
1287 						}
1288 					}
1289 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1290 					if (enable_stats) {
1291 						rte_atomic64_add(
1292 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1293 						rx_count);
1294 						rte_atomic64_add(
1295 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1296 					}
1297 					while (likely(rx_count)) {
1298 						rx_count--;
1299 						rte_pktmbuf_free(pkts_burst[rx_count]);
1300 					}
1301 
1302 				}
1303 			}
1304 
1305 			if (likely(!vdev->remove)) {
1306 				/* Handle guest TX*/
1307 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1308 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1309 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1310 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1311 						while (tx_count--)
1312 							rte_pktmbuf_free(pkts_burst[tx_count]);
1313 					}
1314 				}
1315 				while (tx_count)
1316 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1317 			}
1318 
1319 			/*move to the next device in the list*/
1320 			dev_ll = dev_ll->next;
1321 		}
1322 	}
1323 
1324 	return 0;
1325 }
1326 
1327 /*
1328  * This function gets the number of available ring entries for zero copy rx.
1329  * Only one thread will call this function for a particular virtio device,
1330  * so it is designed as a non-thread-safe function.
1331  */
1332 static inline uint32_t __attribute__((always_inline))
1333 get_available_ring_num_zcp(struct virtio_net *dev)
1334 {
1335 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1336 	uint16_t avail_idx;
1337 
1338 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1339 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1340 }
1341 
1342 /*
1343  * This function gets the available ring index for zero copy rx; it retries
1344  * 'burst_rx_retry_num' times until it gets enough ring entries.
1345  * Only one thread will call this function for a particular virtio device,
1346  * so it is designed as a non-thread-safe function.
1347  */
1348 static inline uint32_t __attribute__((always_inline))
1349 get_available_ring_index_zcp(struct virtio_net *dev,
1350 	uint16_t *res_base_idx, uint32_t count)
1351 {
1352 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1353 	uint16_t avail_idx;
1354 	uint32_t retry = 0;
1355 	uint16_t free_entries;
1356 
1357 	*res_base_idx = vq->last_used_idx_res;
1358 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1359 	free_entries = (avail_idx - *res_base_idx);
1360 
1361 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1362 			"avail idx: %d, "
1363 			"res base idx:%d, free entries:%d\n",
1364 			dev->device_fh, avail_idx, *res_base_idx,
1365 			free_entries);
1366 
1367 	/*
1368 	 * If retry is enabled and the queue is full then we wait
1369 	 * and retry to avoid packet loss.
1370 	 */
1371 	if (enable_retry && unlikely(count > free_entries)) {
1372 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1373 			rte_delay_us(burst_rx_delay_time);
1374 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1375 			free_entries = (avail_idx - *res_base_idx);
1376 			if (count <= free_entries)
1377 				break;
1378 		}
1379 	}
1380 
1381 	/*check that we have enough buffers*/
1382 	if (unlikely(count > free_entries))
1383 		count = free_entries;
1384 
1385 	if (unlikely(count == 0)) {
1386 		LOG_DEBUG(VHOST_DATA,
1387 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1388 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1389 			dev->device_fh, avail_idx,
1390 			*res_base_idx, free_entries);
1391 		return 0;
1392 	}
1393 
1394 	vq->last_used_idx_res = *res_base_idx + count;
1395 
1396 	return count;
1397 }
1398 
1399 /*
1400  * This function puts a descriptor back on the used list.
1401  */
1402 static inline void __attribute__((always_inline))
1403 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1404 {
1405 	uint16_t res_cur_idx = vq->last_used_idx;
1406 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1407 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1408 	rte_compiler_barrier();
1409 	*(volatile uint16_t *)&vq->used->idx += 1;
1410 	vq->last_used_idx += 1;
1411 
1412 	/* Kick the guest if necessary. */
1413 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1414 		eventfd_write((int)vq->kickfd, 1);
1415 }
1416 
1417 /*
1418  * This function gets an available descriptor from the virtio vring and an
1419  * unattached mbuf from vpool->ring, then attaches them together. The offsets
1420  * for buff_addr and phys_addr need adjusting according to the PMD
1421  * implementation, otherwise the frame data may be placed at the wrong location in the mbuf.
1422  */
1423 static inline void __attribute__((always_inline))
1424 attach_rxmbuf_zcp(struct virtio_net *dev)
1425 {
1426 	uint16_t res_base_idx, desc_idx;
1427 	uint64_t buff_addr, phys_addr;
1428 	struct vhost_virtqueue *vq;
1429 	struct vring_desc *desc;
1430 	struct rte_mbuf *mbuf = NULL;
1431 	struct vpool *vpool;
1432 	hpa_type addr_type;
1433 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1434 
1435 	vpool = &vpool_array[vdev->vmdq_rx_q];
1436 	vq = dev->virtqueue[VIRTIO_RXQ];
1437 
1438 	do {
1439 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1440 				1) != 1))
1441 			return;
1442 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1443 
1444 		desc = &vq->desc[desc_idx];
1445 		if (desc->flags & VRING_DESC_F_NEXT) {
1446 			desc = &vq->desc[desc->next];
1447 			buff_addr = gpa_to_vva(dev, desc->addr);
1448 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1449 					&addr_type);
1450 		} else {
1451 			buff_addr = gpa_to_vva(dev,
1452 					desc->addr + vq->vhost_hlen);
1453 			phys_addr = gpa_to_hpa(vdev,
1454 					desc->addr + vq->vhost_hlen,
1455 					desc->len, &addr_type);
1456 		}
1457 
1458 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1459 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1460 				" address found when attaching RX frame buffer"
1461 				" address!\n", dev->device_fh);
1462 			put_desc_to_used_list_zcp(vq, desc_idx);
1463 			continue;
1464 		}
1465 
1466 		/*
1467 		 * Check if the frame buffer address from guest crosses
1468 		 * sub-region or not.
1469 		 */
1470 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1471 			RTE_LOG(ERR, VHOST_DATA,
1472 				"(%"PRIu64") Frame buffer address cross "
1473 				"sub-region found when attaching RX frame "
1474 				"buffer address!\n",
1475 				dev->device_fh);
1476 			put_desc_to_used_list_zcp(vq, desc_idx);
1477 			continue;
1478 		}
1479 	} while (unlikely(phys_addr == 0));
1480 
1481 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1482 	if (unlikely(mbuf == NULL)) {
1483 		LOG_DEBUG(VHOST_DATA,
1484 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1485 			"ring_sc_dequeue fail.\n",
1486 			dev->device_fh);
1487 		put_desc_to_used_list_zcp(vq, desc_idx);
1488 		return;
1489 	}
1490 
1491 	if (unlikely(vpool->buf_size > desc->len)) {
1492 		LOG_DEBUG(VHOST_DATA,
1493 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1494 			"length(%d) of descriptor idx: %d less than room "
1495 			"size required: %d\n",
1496 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1497 		put_desc_to_used_list_zcp(vq, desc_idx);
1498 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1499 		return;
1500 	}
1501 
1502 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1503 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1504 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1505 	mbuf->data_len = desc->len;
1506 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1507 
1508 	LOG_DEBUG(VHOST_DATA,
1509 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1510 		"descriptor idx:%d\n",
1511 		dev->device_fh, res_base_idx, desc_idx);
1512 
1513 	__rte_mbuf_raw_free(mbuf);
1514 
1515 	return;
1516 }
1517 
1518 /*
1519  * Detach an attached packet mbuf -
1520  *  - restore original mbuf address and length values.
1521  *  - reset pktmbuf data and data_len to their default values.
1522  *  All other fields of the given packet mbuf will be left intact.
1523  *
1524  * @param m
1525  *   The attached packet mbuf.
1526  */
1527 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1528 {
1529 	const struct rte_mempool *mp = m->pool;
1530 	void *buf = RTE_MBUF_TO_BADDR(m);
1531 	uint32_t buf_ofs;
1532 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1533 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1534 
1535 	m->buf_addr = buf;
1536 	m->buf_len = (uint16_t)buf_len;
1537 
1538 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1539 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1540 	m->data_off = buf_ofs;
1541 
1542 	m->data_len = 0;
1543 }
1544 
1545 /*
1546  * This function is called after packets have been transmitted. It fetches
1547  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1548  * also updates the used index and kicks the guest if necessary.
1549  */
1550 static inline uint32_t __attribute__((always_inline))
1551 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1552 {
1553 	struct rte_mbuf *mbuf;
1554 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1555 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1556 	uint32_t index = 0;
1557 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1558 
1559 	LOG_DEBUG(VHOST_DATA,
1560 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1561 		"clean is: %d\n",
1562 		dev->device_fh, mbuf_count);
1563 	LOG_DEBUG(VHOST_DATA,
1564 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1565 		"clean  is : %d\n",
1566 		dev->device_fh, rte_ring_count(vpool->ring));
1567 
1568 	for (index = 0; index < mbuf_count; index++) {
1569 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1570 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1571 			pktmbuf_detach_zcp(mbuf);
1572 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1573 
1574 		/* Update used index buffer information. */
1575 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1576 		vq->used->ring[used_idx].len = 0;
1577 
1578 		used_idx = (used_idx + 1) & (vq->size - 1);
1579 	}
1580 
1581 	LOG_DEBUG(VHOST_DATA,
1582 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1583 		"clean is: %d\n",
1584 		dev->device_fh, rte_mempool_count(vpool->pool));
1585 	LOG_DEBUG(VHOST_DATA,
1586 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1587 		"clean  is : %d\n",
1588 		dev->device_fh, rte_ring_count(vpool->ring));
1589 	LOG_DEBUG(VHOST_DATA,
1590 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1591 		"vq->last_used_idx:%d\n",
1592 		dev->device_fh, vq->last_used_idx);
1593 
1594 	vq->last_used_idx += mbuf_count;
1595 
1596 	LOG_DEBUG(VHOST_DATA,
1597 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1598 		"vq->last_used_idx:%d\n",
1599 		dev->device_fh, vq->last_used_idx);
1600 
1601 	rte_compiler_barrier();
1602 
1603 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1604 
1605 	/* Kick guest if required. */
1606 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1607 		eventfd_write((int)vq->kickfd, 1);
1608 
1609 	return 0;
1610 }
1611 
1612 /*
1613  * This function is called when a virtio device is destroyed.
1614  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1615  */
1616 static void mbuf_destroy_zcp(struct vpool *vpool)
1617 {
1618 	struct rte_mbuf *mbuf = NULL;
1619 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1620 
1621 	LOG_DEBUG(VHOST_CONFIG,
1622 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1623 		"mbuf_destroy_zcp is: %d\n",
1624 		mbuf_count);
1625 	LOG_DEBUG(VHOST_CONFIG,
1626 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1627 		"mbuf_destroy_zcp  is : %d\n",
1628 		rte_ring_count(vpool->ring));
1629 
1630 	for (index = 0; index < mbuf_count; index++) {
1631 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1632 		if (likely(mbuf != NULL)) {
1633 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1634 				pktmbuf_detach_zcp(mbuf);
1635 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1636 		}
1637 	}
1638 
1639 	LOG_DEBUG(VHOST_CONFIG,
1640 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1641 		"mbuf_destroy_zcp is: %d\n",
1642 		rte_mempool_count(vpool->pool));
1643 	LOG_DEBUG(VHOST_CONFIG,
1644 		"in mbuf_destroy_zcp: mbuf count in ring after "
1645 		"mbuf_destroy_zcp is : %d\n",
1646 		rte_ring_count(vpool->ring));
1647 }
1648 
1649 /*
1650  * This function enqueues received packets to the guest and updates the used ring (zero copy RX).
1651  */
1652 static inline uint32_t __attribute__((always_inline))
1653 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1654 	uint32_t count)
1655 {
1656 	struct vhost_virtqueue *vq;
1657 	struct vring_desc *desc;
1658 	struct rte_mbuf *buff;
1659 	/* The virtio_hdr is initialised to 0. */
1660 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1661 		= {{0, 0, 0, 0, 0, 0}, 0};
1662 	uint64_t buff_hdr_addr = 0;
1663 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1664 	uint32_t head_idx, packet_success = 0;
1665 	uint16_t res_cur_idx;
1666 
1667 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1668 
1669 	if (count == 0)
1670 		return 0;
1671 
1672 	vq = dev->virtqueue[VIRTIO_RXQ];
1673 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1674 
1675 	res_cur_idx = vq->last_used_idx;
1676 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1677 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1678 
1679 	/* Retrieve all of the head indexes first to avoid caching issues. */
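	/*
	 * In this zero-copy path the descriptor head index is not read from the
	 * avail ring here; it is assumed to have been stashed in the mbuf
	 * headroom (MBUF_HEADROOM_UINT32) when the guest buffer was attached by
	 * attach_rxmbuf_zcp().
	 */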
1680 	for (head_idx = 0; head_idx < count; head_idx++)
1681 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1682 
1683 	/* Prefetch descriptor index. */
1684 	rte_prefetch0(&vq->desc[head[packet_success]]);
1685 
1686 	while (packet_success != count) {
1687 		/* Get descriptor from available ring */
1688 		desc = &vq->desc[head[packet_success]];
1689 
1690 		buff = pkts[packet_success];
1691 		LOG_DEBUG(VHOST_DATA,
1692 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1693 			"pkt[%d] descriptor idx: %d\n",
1694 			dev->device_fh, packet_success,
1695 			MBUF_HEADROOM_UINT32(buff));
1696 
1697 		PRINT_PACKET(dev,
1698 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1699 			+ RTE_PKTMBUF_HEADROOM),
1700 			rte_pktmbuf_data_len(buff), 0);
1701 
1702 		/* Buffer address translation for virtio header. */
1703 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1704 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1705 
1706 		/*
1707 		 * If the descriptors are chained the header and data are
1708 		 * placed in separate buffers.
1709 		 */
1710 		if (desc->flags & VRING_DESC_F_NEXT) {
1711 			desc->len = vq->vhost_hlen;
1712 			desc = &vq->desc[desc->next];
1713 			desc->len = rte_pktmbuf_data_len(buff);
1714 		} else {
1715 			desc->len = packet_len;
1716 		}
1717 
1718 		/* Update used ring with desc information */
1719 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1720 			= head[packet_success];
1721 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1722 			= packet_len;
1723 		res_cur_idx++;
1724 		packet_success++;
1725 
1726 		/* A header is required per buffer. */
1727 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1728 			(const void *)&virtio_hdr, vq->vhost_hlen);
1729 
1730 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1731 
1732 		if (likely(packet_success < count)) {
1733 			/* Prefetch descriptor index. */
1734 			rte_prefetch0(&vq->desc[head[packet_success]]);
1735 		}
1736 	}
1737 
1738 	rte_compiler_barrier();
1739 
1740 	LOG_DEBUG(VHOST_DATA,
1741 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1742 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1743 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1744 
1745 	*(volatile uint16_t *)&vq->used->idx += count;
1746 	vq->last_used_idx += count;
1747 
1748 	LOG_DEBUG(VHOST_DATA,
1749 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1750 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1751 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1752 
1753 	/* Kick the guest if necessary. */
1754 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1755 		eventfd_write((int)vq->kickfd, 1);
1756 
1757 	return count;
1758 }
1759 
1760 /*
1761  * This function routes the TX packet to the correct interface.
1762  * This may be a local device or the physical port.
1763  */
1764 static inline void __attribute__((always_inline))
1765 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1766 	uint32_t desc_idx, uint8_t need_copy)
1767 {
1768 	struct mbuf_table *tx_q;
1769 	struct rte_mbuf **m_table;
1770 	struct rte_mbuf *mbuf = NULL;
1771 	unsigned len, ret, offset = 0;
1772 	struct vpool *vpool;
1773 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1774 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1775 
1776 	/* Add packet to the port TX queue. */
1777 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1778 	len = tx_q->len;
1779 
1780 	/* Allocate an mbuf and populate the structure. */
1781 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1782 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1783 	if (unlikely(mbuf == NULL)) {
1784 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1785 		RTE_LOG(ERR, VHOST_DATA,
1786 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1787 			dev->device_fh);
1788 		put_desc_to_used_list_zcp(vq, desc_idx);
1789 		return;
1790 	}
1791 
1792 	if (vm2vm_mode == VM2VM_HARDWARE) {
1793 		/* Do not reuse a VM's VLAN tag (e.g. vlan_tags[dev->device_fh])
1794 		 * for an external packet, otherwise pool selection conflicts:
1795 		 * the MAC address identifies it as an external packet that
1796 		 * should go to the network, while the VLAN tag identifies it as
1797 		 * a VM-to-VM packet to be forwarded to another VM. The hardware
1798 		 * cannot resolve this ambiguity and the packet would be lost.
1799 		 */
1800 		vlan_tag = external_pkt_default_vlan_tag;
1801 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1802 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1803 			__rte_mbuf_raw_free(mbuf);
1804 			return;
1805 		}
1806 	}
1807 
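	/*
	 * Zero copy: the new mbuf normally just aliases the guest buffer
	 * (buf_addr/buf_physaddr are reused below); a real copy is only made
	 * when need_copy is set, i.e. when the guest buffer crosses a
	 * physically non-contiguous sub-region.
	 */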
1808 	mbuf->nb_segs = m->nb_segs;
1809 	mbuf->next = m->next;
1810 	mbuf->data_len = m->data_len + offset;
1811 	mbuf->pkt_len = mbuf->data_len;
1812 	if (unlikely(need_copy)) {
1813 		/* Copy the packet contents to the mbuf. */
1814 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1815 			rte_pktmbuf_mtod(m, void *),
1816 			m->data_len);
1817 	} else {
1818 		mbuf->data_off = m->data_off;
1819 		mbuf->buf_physaddr = m->buf_physaddr;
1820 		mbuf->buf_addr = m->buf_addr;
1821 	}
1822 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1823 	mbuf->vlan_tci = vlan_tag;
1824 	mbuf->l2_len = sizeof(struct ether_hdr);
1825 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1826 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1827 
1828 	tx_q->m_table[len] = mbuf;
1829 	len++;
1830 
1831 	LOG_DEBUG(VHOST_DATA,
1832 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1833 		dev->device_fh,
1834 		mbuf->nb_segs,
1835 		(mbuf->next == NULL) ? "null" : "non-null");
1836 
1837 	if (enable_stats) {
1838 		dev_statistics[dev->device_fh].tx_total++;
1839 		dev_statistics[dev->device_fh].tx++;
1840 	}
1841 
1842 	if (unlikely(len == MAX_PKT_BURST)) {
1843 		m_table = (struct rte_mbuf **)tx_q->m_table;
1844 		ret = rte_eth_tx_burst(ports[0],
1845 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1846 
1847 		/*
1848 		 * Free any buffers not handled by TX and update
1849 		 * the port stats.
1850 		 */
1851 		if (unlikely(ret < len)) {
1852 			do {
1853 				rte_pktmbuf_free(m_table[ret]);
1854 			} while (++ret < len);
1855 		}
1856 
1857 		len = 0;
1858 		txmbuf_clean_zcp(dev, vpool);
1859 	}
1860 
1861 	tx_q->len = len;
1862 
1863 	return;
1864 }
1865 
1866 /*
1867  * This function transmits all available packets in the virtio TX queue for
1868  * one virtio-net device. If it is the first packet, it learns the MAC
1869  * address and sets up VMDQ.
1870  */
1871 static inline void __attribute__((always_inline))
1872 virtio_dev_tx_zcp(struct virtio_net *dev)
1873 {
1874 	struct rte_mbuf m;
1875 	struct vhost_virtqueue *vq;
1876 	struct vring_desc *desc;
1877 	uint64_t buff_addr = 0, phys_addr;
1878 	uint32_t head[MAX_PKT_BURST];
1879 	uint32_t i;
1880 	uint16_t free_entries, packet_success = 0;
1881 	uint16_t avail_idx;
1882 	uint8_t need_copy = 0;
1883 	hpa_type addr_type;
1884 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1885 
1886 	vq = dev->virtqueue[VIRTIO_TXQ];
1887 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1888 
1889 	/* If there are no available buffers then return. */
1890 	if (vq->last_used_idx_res == avail_idx)
1891 		return;
1892 
1893 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1894 
1895 	/* Prefetch available ring to retrieve head indexes. */
1896 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1897 
1898 	/* Get the number of free entries in the ring */
1899 	free_entries = (avail_idx - vq->last_used_idx_res);
1900 
1901 	/* Limit to MAX_PKT_BURST. */
1902 	free_entries
1903 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1904 
1905 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1906 		dev->device_fh, free_entries);
1907 
1908 	/* Retrieve all of the head indexes first to avoid caching issues. */
1909 	for (i = 0; i < free_entries; i++)
1910 		head[i]
1911 			= vq->avail->ring[(vq->last_used_idx_res + i)
1912 			& (vq->size - 1)];
1913 
1914 	vq->last_used_idx_res += free_entries;
1915 
1916 	/* Prefetch descriptor index. */
1917 	rte_prefetch0(&vq->desc[head[packet_success]]);
1918 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1919 
1920 	while (packet_success < free_entries) {
1921 		desc = &vq->desc[head[packet_success]];
1922 
1923 		/* Discard first buffer as it is the virtio header */
1924 		desc = &vq->desc[desc->next];
1925 
1926 		/* Buffer address translation. */
1927 		buff_addr = gpa_to_vva(dev, desc->addr);
1928 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1929 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1930 			&addr_type);
1931 
1932 		if (likely(packet_success < (free_entries - 1)))
1933 			/* Prefetch descriptor index. */
1934 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1935 
1936 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1937 			RTE_LOG(ERR, VHOST_DATA,
1938 				"(%"PRIu64") Invalid frame buffer address found "
1939 				"when transmitting packets!\n",
1940 				dev->device_fh);
1941 			packet_success++;
1942 			continue;
1943 		}
1944 
1945 		/* Prefetch buffer address. */
1946 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1947 
1948 		/*
1949 		 * Setup dummy mbuf. This is copied to a real mbuf if
1950 		 * transmitted out the physical port.
1951 		 */
1952 		m.data_len = desc->len;
1953 		m.nb_segs = 1;
1954 		m.next = NULL;
1955 		m.data_off = 0;
1956 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1957 		m.buf_physaddr = phys_addr;
1958 
1959 		/*
1960 		 * Check if the frame buffer address from guest crosses
1961 		 * sub-region or not.
1962 		 */
1963 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1964 			RTE_LOG(ERR, VHOST_DATA,
1965 				"(%"PRIu64") Frame buffer address crossing a "
1966 				"sub-region found when attaching TX frame "
1967 				"buffer address!\n",
1968 				dev->device_fh);
1969 			need_copy = 1;
1970 		} else
1971 			need_copy = 0;
1972 
1973 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1974 
1975 		/*
1976 		 * If this is the first received packet we need to learn
1977 		 * the MAC and setup VMDQ
1978 		 */
1979 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1980 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1981 				/*
1982 				 * Discard frame if device is scheduled for
1983 				 * removal or a duplicate MAC address is found.
1984 				 */
1985 				packet_success += free_entries;
1986 				vq->last_used_idx += packet_success;
1987 				break;
1988 			}
1989 		}
1990 
1991 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1992 		packet_success++;
1993 	}
1994 }
1995 
1996 /*
1997  * This function is called by each data core. It handles all RX/TX registered
1998  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1999  * addresses are compared with all devices in the main linked list.
2000  */
2001 static int
2002 switch_worker_zcp(__attribute__((unused)) void *arg)
2003 {
2004 	struct virtio_net *dev = NULL;
2005 	struct vhost_dev  *vdev = NULL;
2006 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2007 	struct virtio_net_data_ll *dev_ll;
2008 	struct mbuf_table *tx_q;
2009 	volatile struct lcore_ll_info *lcore_ll;
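	/*
	 * Convert BURST_TX_DRAIN_US into TSC cycles: cycles per microsecond
	 * (rounded up) multiplied by the drain interval. The TX queue is
	 * flushed whenever this many cycles elapse without a full burst.
	 */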
2010 	const uint64_t drain_tsc
2011 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2012 		* BURST_TX_DRAIN_US;
2013 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2014 	unsigned ret;
2015 	const uint16_t lcore_id = rte_lcore_id();
2016 	uint16_t count_in_ring, rx_count = 0;
2017 
2018 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2019 
2020 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2021 	prev_tsc = 0;
2022 
2023 	while (1) {
2024 		cur_tsc = rte_rdtsc();
2025 
2026 		/* TX burst queue drain */
2027 		diff_tsc = cur_tsc - prev_tsc;
2028 		if (unlikely(diff_tsc > drain_tsc)) {
2029 			/*
2030 			 * Get mbufs from vpool.pool, detach them and
2031 			 * put them back into vpool.ring.
2032 			 */
2033 			dev_ll = lcore_ll->ll_root_used;
2034 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2035 				/* Get virtio device ID */
2036 				vdev = dev_ll->vdev;
2037 				dev = vdev->dev;
2038 
2039 				if (likely(!vdev->remove)) {
2040 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2041 					if (tx_q->len) {
2042 						LOG_DEBUG(VHOST_DATA,
2043 						"TX queue drained after timeout"
2044 						" with burst size %u\n",
2045 						tx_q->len);
2046 
2047 						/*
2048 						 * Tx any packets in the queue
2049 						 */
2050 						ret = rte_eth_tx_burst(
2051 							ports[0],
2052 							(uint16_t)tx_q->txq_id,
2053 							(struct rte_mbuf **)
2054 							tx_q->m_table,
2055 							(uint16_t)tx_q->len);
2056 						if (unlikely(ret < tx_q->len)) {
2057 							do {
2058 								rte_pktmbuf_free(
2059 									tx_q->m_table[ret]);
2060 							} while (++ret < tx_q->len);
2061 						}
2062 						tx_q->len = 0;
2063 
2064 						txmbuf_clean_zcp(dev,
2065 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2066 					}
2067 				}
2068 				dev_ll = dev_ll->next;
2069 			}
2070 			prev_tsc = cur_tsc;
2071 		}
2072 
2073 		rte_prefetch0(lcore_ll->ll_root_used);
2074 
2075 		/*
2076 		 * Inform the configuration core that we have exited the linked
2077 		 * list and that no devices are in use if requested.
2078 		 */
2079 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2080 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2081 
2082 		/* Process devices */
2083 		dev_ll = lcore_ll->ll_root_used;
2084 
2085 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2086 			vdev = dev_ll->vdev;
2087 			dev  = vdev->dev;
2088 			if (unlikely(vdev->remove)) {
2089 				dev_ll = dev_ll->next;
2090 				unlink_vmdq(vdev);
2091 				vdev->ready = DEVICE_SAFE_REMOVE;
2092 				continue;
2093 			}
2094 
2095 			if (likely(vdev->ready == DEVICE_RX)) {
2096 				uint32_t index = vdev->vmdq_rx_q;
2097 				uint16_t i;
2098 				count_in_ring
2099 				= rte_ring_count(vpool_array[index].ring);
2100 				uint16_t free_entries
2101 				= (uint16_t)get_available_ring_num_zcp(dev);
2102 
2103 				/*
2104 				 * Attach all mbufs in vpool.ring and put them
2105 				 * back into vpool.pool.
2106 				 */
2107 				for (i = 0;
2108 				i < RTE_MIN(free_entries,
2109 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2110 				i++)
2111 					attach_rxmbuf_zcp(dev);
2112 
2113 				/* Handle guest RX */
2114 				rx_count = rte_eth_rx_burst(ports[0],
2115 					vdev->vmdq_rx_q, pkts_burst,
2116 					MAX_PKT_BURST);
2117 
2118 				if (rx_count) {
2119 					ret_count = virtio_dev_rx_zcp(dev,
2120 							pkts_burst, rx_count);
2121 					if (enable_stats) {
2122 						dev_statistics[dev->device_fh].rx_total
2123 							+= rx_count;
2124 						dev_statistics[dev->device_fh].rx
2125 							+= ret_count;
2126 					}
2127 					while (likely(rx_count)) {
2128 						rx_count--;
2129 						pktmbuf_detach_zcp(
2130 							pkts_burst[rx_count]);
2131 						rte_ring_sp_enqueue(
2132 							vpool_array[index].ring,
2133 							(void *)pkts_burst[rx_count]);
2134 					}
2135 				}
2136 			}
2137 
2138 			if (likely(!vdev->remove))
2139 				/* Handle guest TX */
2140 				virtio_dev_tx_zcp(dev);
2141 
2142 			/* Move to the next device in the list */
2143 			dev_ll = dev_ll->next;
2144 		}
2145 	}
2146 
2147 	return 0;
2148 }
2149 
2150 
2151 /*
2152  * Add an entry to a used linked list. A free entry must first be found
2153  * in the free linked list using get_data_ll_free_entry();
2154  */
2155 static void
2156 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2157 	struct virtio_net_data_ll *ll_dev)
2158 {
2159 	struct virtio_net_data_ll *ll = *ll_root_addr;
2160 
2161 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2162 	ll_dev->next = NULL;
2163 	rte_compiler_barrier();
2164 
2165 	/* If ll == NULL then this is the first device. */
2166 	if (ll) {
2167 		/* Increment to the tail of the linked list. */
2168 		while ((ll->next != NULL) )
2169 			ll = ll->next;
2170 
2171 		ll->next = ll_dev;
2172 	} else {
2173 		*ll_root_addr = ll_dev;
2174 	}
2175 }
2176 
2177 /*
2178  * Remove an entry from a used linked list. The entry must then be added to
2179  * the free linked list using put_data_ll_free_entry().
2180  */
2181 static void
2182 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2183 	struct virtio_net_data_ll *ll_dev,
2184 	struct virtio_net_data_ll *ll_dev_last)
2185 {
2186 	struct virtio_net_data_ll *ll = *ll_root_addr;
2187 
2188 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2189 		return;
2190 
2191 	if (ll_dev == ll)
2192 		*ll_root_addr = ll_dev->next;
2193 	else
2194 		if (likely(ll_dev_last != NULL))
2195 			ll_dev_last->next = ll_dev->next;
2196 		else
2197 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2198 }
2199 
2200 /*
2201  * Find and return an entry from the free linked list.
2202  */
2203 static struct virtio_net_data_ll *
2204 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2205 {
2206 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2207 	struct virtio_net_data_ll *ll_dev;
2208 
2209 	if (ll_free == NULL)
2210 		return NULL;
2211 
2212 	ll_dev = ll_free;
2213 	*ll_root_addr = ll_free->next;
2214 
2215 	return ll_dev;
2216 }
2217 
2218 /*
2219  * Place an entry back on to the free linked list.
2220  */
2221 static void
2222 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2223 	struct virtio_net_data_ll *ll_dev)
2224 {
2225 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2226 
2227 	if (ll_dev == NULL)
2228 		return;
2229 
2230 	ll_dev->next = ll_free;
2231 	*ll_root_addr = ll_dev;
2232 }
2233 
2234 /*
2235  * Creates a linked list of a given size.
2236  */
2237 static struct virtio_net_data_ll *
2238 alloc_data_ll(uint32_t size)
2239 {
2240 	struct virtio_net_data_ll *ll_new;
2241 	uint32_t i;
2242 
2243 	/* Malloc and then chain the linked list. */
2244 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2245 	if (ll_new == NULL) {
2246 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2247 		return NULL;
2248 	}
2249 
2250 	for (i = 0; i < size - 1; i++) {
2251 		ll_new[i].vdev = NULL;
2252 		ll_new[i].next = &ll_new[i+1];
2253 	}
2254 	ll_new[i].next = NULL;
2255 
2256 	return (ll_new);
2257 }
2258 
2259 /*
2260  * Create the main linked list along with each individual core's linked list. A used and a free list
2261  * are created to manage entries.
2262  */
2263 static int
2264 init_data_ll (void)
2265 {
2266 	int lcore;
2267 
2268 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2269 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2270 		if (lcore_info[lcore].lcore_ll == NULL) {
2271 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2272 			return -1;
2273 		}
2274 
2275 		lcore_info[lcore].lcore_ll->device_num = 0;
2276 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2277 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2278 		if (num_devices % num_switching_cores)
2279 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2280 		else
2281 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2282 	}
2283 
2284 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2285 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2286 
2287 	return 0;
2288 }
2289 
2290 /*
2291  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2292  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2293  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2294  */
2295 static void
2296 destroy_device (volatile struct virtio_net *dev)
2297 {
2298 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2299 	struct virtio_net_data_ll *ll_main_dev_cur;
2300 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2301 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2302 	struct vhost_dev *vdev;
2303 	int lcore;
2304 
2305 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2306 
2307 	vdev = (struct vhost_dev *)dev->priv;
2308 	/* Set the remove flag. */
2309 	vdev->remove = 1;
2310 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2311 		rte_pause();
2312 	}
2313 
2314 	/* Search for entry to be removed from lcore ll */
2315 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2316 	while (ll_lcore_dev_cur != NULL) {
2317 		if (ll_lcore_dev_cur->vdev == vdev) {
2318 			break;
2319 		} else {
2320 			ll_lcore_dev_last = ll_lcore_dev_cur;
2321 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2322 		}
2323 	}
2324 
2325 	if (ll_lcore_dev_cur == NULL) {
2326 		RTE_LOG(ERR, VHOST_CONFIG,
2327 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2328 			dev->device_fh);
2329 		return;
2330 	}
2331 
2332 	/* Search for entry to be removed from main ll */
2333 	ll_main_dev_cur = ll_root_used;
2334 	ll_main_dev_last = NULL;
2335 	while (ll_main_dev_cur != NULL) {
2336 		if (ll_main_dev_cur->vdev == vdev) {
2337 			break;
2338 		} else {
2339 			ll_main_dev_last = ll_main_dev_cur;
2340 			ll_main_dev_cur = ll_main_dev_cur->next;
2341 		}
2342 	}
2343 
2344 	/* Remove entries from the lcore and main ll. */
2345 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2346 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2347 
2348 	/* Set the dev_removal_flag on each lcore. */
2349 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2350 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2351 	}
2352 
2353 	/*
2354 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2355 	 * they can no longer access the device removed from the linked lists and that the devices
2356 	 * are no longer in use.
2357 	 */
2358 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2359 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2360 			rte_pause();
2361 		}
2362 	}
2363 
2364 	/* Add the entries back to the lcore and main free ll.*/
2365 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2366 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2367 
2368 	/* Decrement number of device on the lcore. */
2369 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2370 
2371 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2372 
2373 	if (zero_copy) {
2374 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2375 
2376 		/* Stop the RX queue. */
2377 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2378 			LOG_DEBUG(VHOST_CONFIG,
2379 				"(%"PRIu64") In destroy_device: Failed to stop "
2380 				"rx queue:%d\n",
2381 				dev->device_fh,
2382 				vdev->vmdq_rx_q);
2383 		}
2384 
2385 		LOG_DEBUG(VHOST_CONFIG,
2386 			"(%"PRIu64") in destroy_device: Start putting mbufs from "
2387 			"mempool back to ring for RX queue: %d\n",
2388 			dev->device_fh, vdev->vmdq_rx_q);
2389 
2390 		mbuf_destroy_zcp(vpool);
2391 
2392 		/* Stop the TX queue. */
2393 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2394 			LOG_DEBUG(VHOST_CONFIG,
2395 				"(%"PRIu64") In destroy_device: Failed to "
2396 				"stop tx queue:%d\n",
2397 				dev->device_fh, vdev->vmdq_rx_q);
2398 		}
2399 
2400 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2401 
2402 		LOG_DEBUG(VHOST_CONFIG,
2403 			"(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2404 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2405 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2406 			dev->device_fh);
2407 
2408 		mbuf_destroy_zcp(vpool);
2409 		rte_free(vdev->regions_hpa);
2410 	}
2411 	rte_free(vdev);
2412 
2413 }
2414 
2415 /*
2416  * Calculate the number of physically contiguous sub-regions within one
2417  * particular region whose vhost virtual address range is contiguous. The
2418  * region starts at vva_start and spans 'size' bytes.
2419  */
2420 static uint32_t
2421 check_hpa_regions(uint64_t vva_start, uint64_t size)
2422 {
2423 	uint32_t i, nregions = 0, page_size = getpagesize();
2424 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2425 	if (vva_start % page_size) {
2426 		LOG_DEBUG(VHOST_CONFIG,
2427 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2428 			"has remainder\n",
2429 			(void *)(uintptr_t)vva_start, page_size);
2430 		return 0;
2431 	}
2432 	if (size % page_size) {
2433 		LOG_DEBUG(VHOST_CONFIG,
2434 			"in check_hpa_regions: "
2435 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2436 			size, page_size);
2437 		return 0;
2438 	}
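	/*
	 * Walk the region one page at a time; every place where the host
	 * physical address of the next page is not adjacent to the current one
	 * starts an additional sub-region.
	 */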
2439 	for (i = 0; i < size - page_size; i = i + page_size) {
2440 		cur_phys_addr
2441 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2442 		next_phys_addr = rte_mem_virt2phy(
2443 			(void *)(uintptr_t)(vva_start + i + page_size));
2444 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2445 			++nregions;
2446 			LOG_DEBUG(VHOST_CONFIG,
2447 				"in check_hpa_regions: hva addr:(%p) is not "
2448 				"continuous with hva addr:(%p), diff:%d\n",
2449 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2450 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2451 				+ page_size), page_size);
2452 			LOG_DEBUG(VHOST_CONFIG,
2453 				"in check_hpa_regions: hpa addr:(%p) is not "
2454 				"continuous with hpa addr:(%p), "
2455 				"diff:(%"PRIu64")\n",
2456 				(void *)(uintptr_t)cur_phys_addr,
2457 				(void *)(uintptr_t)next_phys_addr,
2458 				(next_phys_addr-cur_phys_addr));
2459 		}
2460 	}
2461 	return nregions;
2462 }
2463 
2464 /*
2465  * Divide each region whose vhost virtual address range is contiguous into
2466  * sub-regions, making sure the physical addresses within each sub-region
2467  * are contiguous. Fill the offset (to GPA), size and other information of
2468  * each sub-region into regions_hpa.
2469  */
2470 static uint32_t
2471 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2472 {
2473 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2474 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2475 
2476 	if (mem_region_hpa == NULL)
2477 		return 0;
2478 
2479 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2480 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2481 			virtio_memory->regions[regionidx].address_offset;
2482 		mem_region_hpa[regionidx_hpa].guest_phys_address
2483 			= virtio_memory->regions[regionidx].guest_phys_address;
2484 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2485 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2486 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2487 		LOG_DEBUG(VHOST_CONFIG,
2488 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2489 			regionidx_hpa,
2490 			(void *)(uintptr_t)
2491 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2492 		LOG_DEBUG(VHOST_CONFIG,
2493 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2494 			regionidx_hpa,
2495 			(void *)(uintptr_t)
2496 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2497 		for (i = 0, k = 0;
2498 			i < virtio_memory->regions[regionidx].memory_size -
2499 				page_size;
2500 			i += page_size) {
2501 			cur_phys_addr = rte_mem_virt2phy(
2502 					(void *)(uintptr_t)(vva_start + i));
2503 			next_phys_addr = rte_mem_virt2phy(
2504 					(void *)(uintptr_t)(vva_start +
2505 					i + page_size));
2506 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2507 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2508 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2509 					k + page_size;
2510 				mem_region_hpa[regionidx_hpa].memory_size
2511 					= k + page_size;
2512 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2513 					"phys addr end  [%d]:(%p)\n",
2514 					regionidx_hpa,
2515 					(void *)(uintptr_t)
2516 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2517 				LOG_DEBUG(VHOST_CONFIG,
2518 					"in fill_hpa_regions: guest phys addr "
2519 					"size [%d]:(%p)\n",
2520 					regionidx_hpa,
2521 					(void *)(uintptr_t)
2522 					(mem_region_hpa[regionidx_hpa].memory_size));
2523 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2524 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2525 				++regionidx_hpa;
2526 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2527 					next_phys_addr -
2528 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2529 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2530 					" phys addr start[%d]:(%p)\n",
2531 					regionidx_hpa,
2532 					(void *)(uintptr_t)
2533 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2534 				LOG_DEBUG(VHOST_CONFIG,
2535 					"in fill_hpa_regions: host  phys addr "
2536 					"start[%d]:(%p)\n",
2537 					regionidx_hpa,
2538 					(void *)(uintptr_t)
2539 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2540 				k = 0;
2541 			} else {
2542 				k += page_size;
2543 			}
2544 		}
2545 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2546 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2547 			+ k + page_size;
2548 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2549 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2550 			"[%d]:(%p)\n", regionidx_hpa,
2551 			(void *)(uintptr_t)
2552 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2553 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2554 			"[%d]:(%p)\n", regionidx_hpa,
2555 			(void *)(uintptr_t)
2556 			(mem_region_hpa[regionidx_hpa].memory_size));
2557 		++regionidx_hpa;
2558 	}
2559 	return regionidx_hpa;
2560 }
2561 
2562 /*
2563  * A new device is added to a data core. First the device is added to the main linked list
2564  * and then allocated to a specific data core.
2565  */
2566 static int
2567 new_device (struct virtio_net *dev)
2568 {
2569 	struct virtio_net_data_ll *ll_dev;
2570 	int lcore, core_add = 0;
2571 	uint32_t device_num_min = num_devices;
2572 	struct vhost_dev *vdev;
2573 	uint32_t regionidx;
2574 
2575 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2576 	if (vdev == NULL) {
2577 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2578 			dev->device_fh);
2579 		return -1;
2580 	}
2581 	vdev->dev = dev;
2582 	dev->priv = vdev;
2583 
2584 	if (zero_copy) {
2585 		vdev->nregions_hpa = dev->mem->nregions;
2586 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2587 			vdev->nregions_hpa
2588 				+= check_hpa_regions(
2589 					dev->mem->regions[regionidx].guest_phys_address
2590 					+ dev->mem->regions[regionidx].address_offset,
2591 					dev->mem->regions[regionidx].memory_size);
2592 
2593 		}
2594 
2595 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2596 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2597 			RTE_CACHE_LINE_SIZE);
2598 		if (vdev->regions_hpa == NULL) {
2599 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2600 			rte_free(vdev);
2601 			return -1;
2602 		}
2603 
2604 
2605 		if (fill_hpa_memory_regions(
2606 			vdev->regions_hpa, dev->mem
2607 			) != vdev->nregions_hpa) {
2608 
2609 			RTE_LOG(ERR, VHOST_CONFIG,
2610 				"hpa memory regions number mismatch: "
2611 				"[%d]\n", vdev->nregions_hpa);
2612 			rte_free(vdev->regions_hpa);
2613 			rte_free(vdev);
2614 			return -1;
2615 		}
2616 	}
2617 
2618 
2619 	/* Add device to main ll */
2620 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2621 	if (ll_dev == NULL) {
2622 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2623 			"of %d devices per core has been reached\n",
2624 			dev->device_fh, num_devices);
2625 		if (vdev->regions_hpa)
2626 			rte_free(vdev->regions_hpa);
2627 		rte_free(vdev);
2628 		return -1;
2629 	}
2630 	ll_dev->vdev = vdev;
2631 	add_data_ll_entry(&ll_root_used, ll_dev);
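	/*
	 * Each vhost device gets its own VMDq RX queue: the queue number is
	 * derived from the device fh, scaled by the number of queues per VMDq
	 * pool and offset by the base queue of the VMDq region.
	 */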
2632 	vdev->vmdq_rx_q
2633 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2634 
2635 	if (zero_copy) {
2636 		uint32_t index = vdev->vmdq_rx_q;
2637 		uint32_t count_in_ring, i;
2638 		struct mbuf_table *tx_q;
2639 
2640 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2641 
2642 		LOG_DEBUG(VHOST_CONFIG,
2643 			"(%"PRIu64") in new_device: mbuf count in mempool "
2644 			"before attach is: %d\n",
2645 			dev->device_fh,
2646 			rte_mempool_count(vpool_array[index].pool));
2647 		LOG_DEBUG(VHOST_CONFIG,
2648 			"(%"PRIu64") in new_device: mbuf count in  ring "
2649 			"before attach  is : %d\n",
2650 			dev->device_fh, count_in_ring);
2651 
2652 		/*
2653 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2654 		 */
2655 		for (i = 0; i < count_in_ring; i++)
2656 			attach_rxmbuf_zcp(dev);
2657 
2658 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2659 			"mempool after attach is: %d\n",
2660 			dev->device_fh,
2661 			rte_mempool_count(vpool_array[index].pool));
2662 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2663 			"ring after attach  is : %d\n",
2664 			dev->device_fh,
2665 			rte_ring_count(vpool_array[index].ring));
2666 
2667 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2668 		tx_q->txq_id = vdev->vmdq_rx_q;
2669 
2670 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2671 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2672 
2673 			LOG_DEBUG(VHOST_CONFIG,
2674 				"(%"PRIu64") In new_device: Failed to start "
2675 				"tx queue:%d\n",
2676 				dev->device_fh, vdev->vmdq_rx_q);
2677 
2678 			mbuf_destroy_zcp(vpool);
2679 			rte_free(vdev->regions_hpa);
2680 			rte_free(vdev);
2681 			return -1;
2682 		}
2683 
2684 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2685 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2686 
2687 			LOG_DEBUG(VHOST_CONFIG,
2688 				"(%"PRIu64") In new_device: Failed to start "
2689 				"rx queue:%d\n",
2690 				dev->device_fh, vdev->vmdq_rx_q);
2691 
2692 			/* Stop the TX queue. */
2693 			if (rte_eth_dev_tx_queue_stop(ports[0],
2694 				vdev->vmdq_rx_q) != 0) {
2695 				LOG_DEBUG(VHOST_CONFIG,
2696 					"(%"PRIu64") In new_device: Failed to "
2697 					"stop tx queue:%d\n",
2698 					dev->device_fh, vdev->vmdq_rx_q);
2699 			}
2700 
2701 			mbuf_destroy_zcp(vpool);
2702 			rte_free(vdev->regions_hpa);
2703 			rte_free(vdev);
2704 			return -1;
2705 		}
2706 
2707 	}
2708 
2709 	/* Reset the ready flag. */
2710 	vdev->ready = DEVICE_MAC_LEARNING;
2711 	vdev->remove = 0;
2712 
2713 	/* Find a suitable lcore to add the device. */
2714 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2715 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2716 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2717 			core_add = lcore;
2718 		}
2719 	}
2720 	/* Add device to lcore ll */
2721 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2722 	if (ll_dev == NULL) {
2723 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2724 		vdev->ready = DEVICE_SAFE_REMOVE;
2725 		destroy_device(dev);
2726 		if (vdev->regions_hpa)
2727 			rte_free(vdev->regions_hpa);
2728 		rte_free(vdev);
2729 		return -1;
2730 	}
2731 	ll_dev->vdev = vdev;
2732 	vdev->coreid = core_add;
2733 
2734 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2735 
2736 	/* Initialize device stats */
2737 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2738 
2739 	/* Disable notifications. */
2740 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2741 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2742 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2743 	dev->flags |= VIRTIO_DEV_RUNNING;
2744 
2745 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2746 
2747 	return 0;
2748 }
2749 
2750 /*
2751  * These callbacks allow devices to be added to the data core when
2752  * configuration has been fully completed.
2753  */
2754 static const struct virtio_net_device_ops virtio_net_device_ops =
2755 {
2756 	.new_device =  new_device,
2757 	.destroy_device = destroy_device,
2758 };
2759 
2760 /*
2761  * This thread wakes up periodically to print stats if the user has
2762  * enabled them.
2763  */
2764 static void
2765 print_stats(void)
2766 {
2767 	struct virtio_net_data_ll *dev_ll;
2768 	uint64_t tx_dropped, rx_dropped;
2769 	uint64_t tx, tx_total, rx, rx_total;
2770 	uint32_t device_fh;
2771 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2772 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
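	/* ANSI escape sequences: clear the screen and move the cursor to the
	 * top-left corner before each refresh. */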
2773 
2774 	while(1) {
2775 		sleep(enable_stats);
2776 
2777 		/* Clear screen and move to top left */
2778 		printf("%s%s", clr, top_left);
2779 
2780 		printf("\nDevice statistics ====================================");
2781 
2782 		dev_ll = ll_root_used;
2783 		while (dev_ll != NULL) {
2784 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2785 			tx_total = dev_statistics[device_fh].tx_total;
2786 			tx = dev_statistics[device_fh].tx;
2787 			tx_dropped = tx_total - tx;
2788 			if (zero_copy == 0) {
2789 				rx_total = rte_atomic64_read(
2790 					&dev_statistics[device_fh].rx_total_atomic);
2791 				rx = rte_atomic64_read(
2792 					&dev_statistics[device_fh].rx_atomic);
2793 			} else {
2794 				rx_total = dev_statistics[device_fh].rx_total;
2795 				rx = dev_statistics[device_fh].rx;
2796 			}
2797 			rx_dropped = rx_total - rx;
2798 
2799 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2800 					"\nTX total: 		%"PRIu64""
2801 					"\nTX dropped: 		%"PRIu64""
2802 					"\nTX successful: 		%"PRIu64""
2803 					"\nRX total: 		%"PRIu64""
2804 					"\nRX dropped: 		%"PRIu64""
2805 					"\nRX successful: 		%"PRIu64"",
2806 					device_fh,
2807 					tx_total,
2808 					tx_dropped,
2809 					tx,
2810 					rx_total,
2811 					rx_dropped,
2812 					rx);
2813 
2814 			dev_ll = dev_ll->next;
2815 		}
2816 		printf("\n======================================================\n");
2817 	}
2818 }
2819 
2820 static void
2821 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2822 	char *ring_name, uint32_t nb_mbuf)
2823 {
2824 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
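	/*
	 * The data room passed to rte_pktmbuf_pool_init() matches the guest
	 * descriptor length plus headroom, so that a guest buffer can be
	 * attached to an mbuf directly in the zero-copy path.
	 */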
2825 	vpool_array[index].pool
2826 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2827 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2828 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2829 		rte_pktmbuf_init, NULL, socket, 0);
2830 	if (vpool_array[index].pool != NULL) {
2831 		vpool_array[index].ring
2832 			= rte_ring_create(ring_name,
2833 				rte_align32pow2(nb_mbuf + 1),
2834 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2835 		if (likely(vpool_array[index].ring != NULL)) {
2836 			LOG_DEBUG(VHOST_CONFIG,
2837 				"in setup_mempool_tbl: mbuf count in "
2838 				"mempool is: %d\n",
2839 				rte_mempool_count(vpool_array[index].pool));
2840 			LOG_DEBUG(VHOST_CONFIG,
2841 				"in setup_mempool_tbl: mbuf count in "
2842 				"ring   is: %d\n",
2843 				rte_ring_count(vpool_array[index].ring));
2844 		} else {
2845 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2846 				ring_name);
2847 		}
2848 
2849 		/* Need to account for the headroom. */
2850 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2851 	} else {
2852 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2853 	}
2854 }
2855 
2856 
2857 /*
2858  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2859  * device is also registered here to handle the IOCTLs.
2860  */
2861 int
2862 main(int argc, char *argv[])
2863 {
2864 	struct rte_mempool *mbuf_pool = NULL;
2865 	unsigned lcore_id, core_id = 0;
2866 	unsigned nb_ports, valid_num_ports;
2867 	int ret;
2868 	uint8_t portid;
2869 	uint16_t queue_id;
2870 	static pthread_t tid;
2871 
2872 	/* init EAL */
2873 	ret = rte_eal_init(argc, argv);
2874 	if (ret < 0)
2875 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2876 	argc -= ret;
2877 	argv += ret;
2878 
2879 	/* parse app arguments */
2880 	ret = us_vhost_parse_args(argc, argv);
2881 	if (ret < 0)
2882 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2883 
2884 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2885 		if (rte_lcore_is_enabled(lcore_id))
2886 			lcore_ids[core_id ++] = lcore_id;
2887 
2888 	if (rte_lcore_count() > RTE_MAX_LCORE)
2889 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2890 
2891 	/* Set the number of switching cores available. */
2892 	num_switching_cores = rte_lcore_count()-1;
2893 
2894 	/* Get the number of physical ports. */
2895 	nb_ports = rte_eth_dev_count();
2896 	if (nb_ports > RTE_MAX_ETHPORTS)
2897 		nb_ports = RTE_MAX_ETHPORTS;
2898 
2899 	/*
2900 	 * Update the global var NUM_PORTS and global array PORTS
2901 	 * and determine VALID_NUM_PORTS from the number of ports in the system.
2902 	 */
2903 	valid_num_ports = check_ports_num(nb_ports);
2904 
2905 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2906 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2907 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2908 		return -1;
2909 	}
2910 
2911 	if (zero_copy == 0) {
2912 		/* Create the mbuf pool. */
2913 		mbuf_pool = rte_mempool_create(
2914 				"MBUF_POOL",
2915 				NUM_MBUFS_PER_PORT
2916 				* valid_num_ports,
2917 				MBUF_SIZE, MBUF_CACHE_SIZE,
2918 				sizeof(struct rte_pktmbuf_pool_private),
2919 				rte_pktmbuf_pool_init, NULL,
2920 				rte_pktmbuf_init, NULL,
2921 				rte_socket_id(), 0);
2922 		if (mbuf_pool == NULL)
2923 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2924 
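		/*
		 * In the non zero-copy case every entry in vpool_array shares
		 * the single mbuf pool created above; the per-queue rings are
		 * only created and used by the zero-copy path.
		 */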
2925 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2926 			vpool_array[queue_id].pool = mbuf_pool;
2927 
2928 		if (vm2vm_mode == VM2VM_HARDWARE) {
2929 			/* Enable VT loop back to let L2 switch to do it. */
2930 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2931 			LOG_DEBUG(VHOST_CONFIG,
2932 				"Enable loop back for L2 switch in vmdq.\n");
2933 		}
2934 	} else {
2935 		uint32_t nb_mbuf;
2936 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2937 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2938 
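		/*
		 * Size each per-queue RX pool for the RX descriptor ring plus
		 * per-core cache and burst slack, so the ring can stay full
		 * while cores hold mbufs in flight.
		 */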
2939 		nb_mbuf = num_rx_descriptor
2940 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2941 			+ num_switching_cores * MAX_PKT_BURST;
2942 
2943 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2944 			snprintf(pool_name, sizeof(pool_name),
2945 				"rxmbuf_pool_%u", queue_id);
2946 			snprintf(ring_name, sizeof(ring_name),
2947 				"rxmbuf_ring_%u", queue_id);
2948 			setup_mempool_tbl(rte_socket_id(), queue_id,
2949 				pool_name, ring_name, nb_mbuf);
2950 		}
2951 
2952 		nb_mbuf = num_tx_descriptor
2953 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2954 				+ num_switching_cores * MAX_PKT_BURST;
2955 
2956 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2957 			snprintf(pool_name, sizeof(pool_name),
2958 				"txmbuf_pool_%u", queue_id);
2959 			snprintf(ring_name, sizeof(ring_name),
2960 				"txmbuf_ring_%u", queue_id);
2961 			setup_mempool_tbl(rte_socket_id(),
2962 				(queue_id + MAX_QUEUES),
2963 				pool_name, ring_name, nb_mbuf);
2964 		}
2965 
2966 		if (vm2vm_mode == VM2VM_HARDWARE) {
2967 			/* Enable VT loop back to let L2 switch to do it. */
2968 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2969 			LOG_DEBUG(VHOST_CONFIG,
2970 				"Enable loop back for L2 switch in vmdq.\n");
2971 		}
2972 	}
2973 	/* Set log level. */
2974 	rte_set_log_level(LOG_LEVEL);
2975 
2976 	/* initialize all ports */
2977 	for (portid = 0; portid < nb_ports; portid++) {
2978 		/* skip ports that are not enabled */
2979 		if ((enabled_port_mask & (1 << portid)) == 0) {
2980 			RTE_LOG(INFO, VHOST_PORT,
2981 				"Skipping disabled port %d\n", portid);
2982 			continue;
2983 		}
2984 		if (port_init(portid) != 0)
2985 			rte_exit(EXIT_FAILURE,
2986 				"Cannot initialize network ports\n");
2987 	}
2988 
2989 	/* Initialise all linked lists. */
2990 	if (init_data_ll() == -1)
2991 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2992 
2993 	/* Initialize device stats */
2994 	memset(&dev_statistics, 0, sizeof(dev_statistics));
2995 
2996 	/* Enable stats if the user option is set. */
2997 	if (enable_stats)
2998 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
2999 
3000 	/* Launch all data cores. */
3001 	if (zero_copy == 0) {
3002 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3003 			rte_eal_remote_launch(switch_worker,
3004 				mbuf_pool, lcore_id);
3005 		}
3006 	} else {
3007 		uint32_t count_in_mempool, index, i;
3008 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3009 			/* For all RX and TX queues. */
3010 			count_in_mempool
3011 				= rte_mempool_count(vpool_array[index].pool);
3012 
3013 			/*
3014 			 * Transfer all unattached mbufs from vpool.pool
3015 			 * to vpool.ring.
3016 			 */
3017 			for (i = 0; i < count_in_mempool; i++) {
3018 				struct rte_mbuf *mbuf
3019 					= __rte_mbuf_raw_alloc(
3020 						vpool_array[index].pool);
3021 				rte_ring_sp_enqueue(vpool_array[index].ring,
3022 						(void *)mbuf);
3023 			}
3024 
3025 			LOG_DEBUG(VHOST_CONFIG,
3026 				"in main: mbuf count in mempool at initial "
3027 				"is: %d\n", count_in_mempool);
3028 			LOG_DEBUG(VHOST_CONFIG,
3029 				"in main: mbuf count in  ring at initial  is :"
3030 				" %d\n",
3031 				rte_ring_count(vpool_array[index].ring));
3032 		}
3033 
3034 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3035 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3036 				lcore_id);
3037 	}
3038 
3039 	if (mergeable == 0)
3040 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3041 
3042 	/* Register CUSE device to handle IOCTLs. */
3043 	ret = rte_vhost_driver_register((char *)&dev_basename);
3044 	if (ret != 0)
3045 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3046 
3047 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3048 
3049 	/* Start CUSE session. */
3050 	rte_vhost_driver_session_start();
3051 	return 0;
3052 
3053 }
3054 
3055