xref: /dpdk/examples/vhost/main.c (revision e3d61d1609cb9b3ea851c7776bfbb60dcfe8844a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * TODO: refine these 2 macros for the legacy and DPDK-based front ends:
105  * take the max vring avail descriptors/entries from the guest minus
106  * MAX_PKT_BURST, then round up to a power of 2.
107  */
108 /*
109  * For legacy front end, 128 descriptors,
110  * half for virtio header, another half for mbuf.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Access the first 4 bytes of the mbuf headroom; used to stash the vring descriptor index for zero copy. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
141 
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144 
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147 
148 /* Number of switching cores enabled */
149 static uint32_t num_switching_cores = 0;
150 
151 /* number of devices/queues to support*/
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154 
155 /*
156  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
157  * descriptors; disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161 
162 /* Do VLAN strip on the host, enabled by default */
163 static uint32_t vlan_strip = 1;
164 
165 /* Number of descriptors to apply */
166 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
167 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
168 
169 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
170 #define MAX_RING_DESC 4096
171 
172 struct vpool {
173 	struct rte_mempool *pool;
174 	struct rte_ring *ring;
175 	uint32_t buf_size;
176 } vpool_array[MAX_QUEUES+MAX_QUEUES];
177 
178 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
179 typedef enum {
180 	VM2VM_DISABLED = 0,
181 	VM2VM_SOFTWARE = 1,
182 	VM2VM_HARDWARE = 2,
183 	VM2VM_LAST
184 } vm2vm_type;
185 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
186 
187 /* The type of host physical address translated from guest physical address. */
188 typedef enum {
189 	PHYS_ADDR_CONTINUOUS = 0,
190 	PHYS_ADDR_CROSS_SUBREG = 1,
191 	PHYS_ADDR_INVALID = 2,
192 	PHYS_ADDR_LAST
193 } hpa_type;
194 
195 /* Enable stats. */
196 static uint32_t enable_stats = 0;
197 /* Enable retries on RX. */
198 static uint32_t enable_retry = 1;
199 /* Specify timeout (in useconds) between retries on RX. */
200 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
201 /* Specify the number of retries on RX. */
202 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
203 
204 /* Character device basename. Can be set by user. */
205 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
206 
207 /* Empty VMDQ configuration structure. Filled in programmatically. */
208 static struct rte_eth_conf vmdq_conf_default = {
209 	.rxmode = {
210 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
211 		.split_hdr_size = 0,
212 		.header_split   = 0, /**< Header Split disabled */
213 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
214 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
215 		/*
216 		 * VLAN strip is necessary for 1G NICs such as the I350;
217 		 * it fixes a bug where IPv4 forwarding in the guest cannot
218 		 * forward packets from one virtio dev to another virtio dev.
219 		 */
220 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
221 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
222 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
223 	},
224 
225 	.txmode = {
226 		.mq_mode = ETH_MQ_TX_NONE,
227 	},
228 	.rx_adv_conf = {
229 		/*
230 		 * should be overridden separately in code with
231 		 * appropriate values
232 		 */
233 		.vmdq_rx_conf = {
234 			.nb_queue_pools = ETH_8_POOLS,
235 			.enable_default_pool = 0,
236 			.default_pool = 0,
237 			.nb_pool_maps = 0,
238 			.pool_map = {{0, 0},},
239 		},
240 	},
241 };
242 
243 static unsigned lcore_ids[RTE_MAX_LCORE];
244 static uint8_t ports[RTE_MAX_ETHPORTS];
245 static unsigned num_ports = 0; /**< The number of ports specified in command line */
246 static uint16_t num_pf_queues, num_vmdq_queues;
247 static uint16_t vmdq_pool_base, vmdq_queue_base;
248 static uint16_t queues_per_pool;
249 
250 static const uint16_t external_pkt_default_vlan_tag = 2000;
251 const uint16_t vlan_tags[] = {
252 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
253 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
254 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
255 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
256 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
257 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
258 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
259 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
260 };
261 
262 /* ethernet addresses of ports */
263 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
264 
265 /* heads for the main used and free linked lists for the data path. */
266 static struct virtio_net_data_ll *ll_root_used = NULL;
267 static struct virtio_net_data_ll *ll_root_free = NULL;
268 
269 /* Array of data core structures containing information on individual core linked lists. */
270 static struct lcore_info lcore_info[RTE_MAX_LCORE];
271 
272 /* Used for queueing bursts of TX packets. */
273 struct mbuf_table {
274 	unsigned len;
275 	unsigned txq_id;
276 	struct rte_mbuf *m_table[MAX_PKT_BURST];
277 };
278 
279 /* TX queue for each data core. */
280 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
281 
282 /* TX queue for each virtio device for zero copy. */
283 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
284 
285 /* Vlan header struct used to insert vlan tags on TX. */
286 struct vlan_ethhdr {
287 	unsigned char   h_dest[ETH_ALEN];
288 	unsigned char   h_source[ETH_ALEN];
289 	__be16          h_vlan_proto;
290 	__be16          h_vlan_TCI;
291 	__be16          h_vlan_encapsulated_proto;
292 };
293 
294 /* IPv4 Header */
295 struct ipv4_hdr {
296 	uint8_t  version_ihl;		/**< version and header length */
297 	uint8_t  type_of_service;	/**< type of service */
298 	uint16_t total_length;		/**< length of packet */
299 	uint16_t packet_id;		/**< packet ID */
300 	uint16_t fragment_offset;	/**< fragmentation offset */
301 	uint8_t  time_to_live;		/**< time to live */
302 	uint8_t  next_proto_id;		/**< protocol ID */
303 	uint16_t hdr_checksum;		/**< header checksum */
304 	uint32_t src_addr;		/**< source address */
305 	uint32_t dst_addr;		/**< destination address */
306 } __attribute__((__packed__));
307 
308 /* Header lengths. */
309 #define VLAN_HLEN       4
310 #define VLAN_ETH_HLEN   18
311 
312 /* Per-device statistics struct */
313 struct device_statistics {
314 	uint64_t tx_total;
315 	rte_atomic64_t rx_total_atomic;
316 	uint64_t rx_total;
317 	uint64_t tx;
318 	rte_atomic64_t rx_atomic;
319 	uint64_t rx;
320 } __rte_cache_aligned;
321 struct device_statistics dev_statistics[MAX_DEVICES];
322 
323 /*
324  * Builds up the correct configuration for VMDQ VLAN pool map
325  * according to the pool & queue limits.
326  */
327 static inline int
328 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
329 {
330 	struct rte_eth_vmdq_rx_conf conf;
331 	struct rte_eth_vmdq_rx_conf *def_conf =
332 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
333 	unsigned i;
334 
335 	memset(&conf, 0, sizeof(conf));
336 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
337 	conf.nb_pool_maps = num_devices;
338 	conf.enable_loop_back = def_conf->enable_loop_back;
339 	conf.rx_mode = def_conf->rx_mode;
340 
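	/* Map each VLAN tag to its own VMDQ pool so every virtio device gets a dedicated pool. */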
341 	for (i = 0; i < conf.nb_pool_maps; i++) {
342 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
343 		conf.pool_map[i].pools = (1UL << i);
344 	}
345 
346 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
347 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
348 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
349 	return 0;
350 }
351 
352 /*
353  * Validate the device number against the max pool number obtained from
354  * dev_info. If the device number is invalid, print an error message and
355  * return -1. Each device must have its own pool.
356  */
357 static inline int
358 validate_num_devices(uint32_t max_nb_devices)
359 {
360 	if (num_devices > max_nb_devices) {
361 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
362 		return -1;
363 	}
364 	return 0;
365 }
366 
367 /*
368  * Initialises a given port using global settings, with the RX buffers
369  * coming from the vpool_array mempools.
370  */
371 static inline int
372 port_init(uint8_t port)
373 {
374 	struct rte_eth_dev_info dev_info;
375 	struct rte_eth_conf port_conf;
376 	struct rte_eth_rxconf *rxconf;
377 	struct rte_eth_txconf *txconf;
378 	int16_t rx_rings, tx_rings;
379 	uint16_t rx_ring_size, tx_ring_size;
380 	int retval;
381 	uint16_t q;
382 
383 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
384 	rte_eth_dev_info_get (port, &dev_info);
385 
386 	if (dev_info.max_rx_queues > MAX_QUEUES) {
387 		rte_exit(EXIT_FAILURE,
388 			"please define MAX_QUEUES no less than %u in %s\n",
389 			dev_info.max_rx_queues, __FILE__);
390 	}
391 
392 	rxconf = &dev_info.default_rxconf;
393 	txconf = &dev_info.default_txconf;
394 	rxconf->rx_drop_en = 1;
395 
396 	/* Enable vlan offload */
397 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
398 
399 	/*
400 	 * Zero copy defers queue RX/TX start to the time when guest
401 	 * finishes its startup and packet buffers from that guest are
402 	 * available.
403 	 */
404 	if (zero_copy) {
405 		rxconf->rx_deferred_start = 1;
406 		rxconf->rx_drop_en = 0;
407 		txconf->tx_deferred_start = 1;
408 	}
409 
410 	/*configure the number of supported virtio devices based on VMDQ limits */
411 	num_devices = dev_info.max_vmdq_pools;
412 
413 	if (zero_copy) {
414 		rx_ring_size = num_rx_descriptor;
415 		tx_ring_size = num_tx_descriptor;
416 		tx_rings = dev_info.max_tx_queues;
417 	} else {
418 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
419 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
420 		tx_rings = (uint16_t)rte_lcore_count();
421 	}
422 
423 	retval = validate_num_devices(MAX_DEVICES);
424 	if (retval < 0)
425 		return retval;
426 
427 	/* Get port configuration. */
428 	retval = get_eth_conf(&port_conf, num_devices);
429 	if (retval < 0)
430 		return retval;
431 	/* NIC queues are divided into pf queues and vmdq queues.  */
432 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
433 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
434 	num_vmdq_queues = num_devices * queues_per_pool;
435 	num_queues = num_pf_queues + num_vmdq_queues;
436 	vmdq_queue_base = dev_info.vmdq_queue_base;
437 	vmdq_pool_base  = dev_info.vmdq_pool_base;
438 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
439 		num_pf_queues, num_devices, queues_per_pool);
440 
441 	if (port >= rte_eth_dev_count()) return -1;
442 
443 	rx_rings = (uint16_t)dev_info.max_rx_queues;
444 	/* Configure ethernet device. */
445 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446 	if (retval != 0)
447 		return retval;
448 
449 	/* Setup the queues. */
450 	for (q = 0; q < rx_rings; q ++) {
451 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 						rte_eth_dev_socket_id(port),
453 						rxconf,
454 						vpool_array[q].pool);
455 		if (retval < 0)
456 			return retval;
457 	}
458 	for (q = 0; q < tx_rings; q ++) {
459 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
460 						rte_eth_dev_socket_id(port),
461 						txconf);
462 		if (retval < 0)
463 			return retval;
464 	}
465 
466 	/* Start the device. */
467 	retval  = rte_eth_dev_start(port);
468 	if (retval < 0) {
469 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
470 		return retval;
471 	}
472 
473 	if (promiscuous)
474 		rte_eth_promiscuous_enable(port);
475 
476 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
477 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
478 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
479 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
480 			(unsigned)port,
481 			vmdq_ports_eth_addr[port].addr_bytes[0],
482 			vmdq_ports_eth_addr[port].addr_bytes[1],
483 			vmdq_ports_eth_addr[port].addr_bytes[2],
484 			vmdq_ports_eth_addr[port].addr_bytes[3],
485 			vmdq_ports_eth_addr[port].addr_bytes[4],
486 			vmdq_ports_eth_addr[port].addr_bytes[5]);
487 
488 	return 0;
489 }
490 
491 /*
492  * Set character device basename.
493  */
494 static int
495 us_vhost_parse_basename(const char *q_arg)
496 {
497 	/* parse number string */
498 
499 	if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
500 		return -1;
501 	else
502 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
503 
504 	return 0;
505 }
506 
507 /*
508  * Parse the portmask provided at run time.
509  */
510 static int
511 parse_portmask(const char *portmask)
512 {
513 	char *end = NULL;
514 	unsigned long pm;
515 
516 	errno = 0;
517 
518 	/* parse hexadecimal string */
519 	pm = strtoul(portmask, &end, 16);
520 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
521 		return -1;
522 
523 	if (pm == 0)
524 		return -1;
525 
526 	return pm;
527 
528 }
529 
530 /*
531  * Parse num options at run time.
532  */
533 static int
534 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
535 {
536 	char *end = NULL;
537 	unsigned long num;
538 
539 	errno = 0;
540 
541 	/* parse unsigned int string */
542 	num = strtoul(q_arg, &end, 10);
543 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
544 		return -1;
545 
546 	if (num > max_valid_value)
547 		return -1;
548 
549 	return num;
550 
551 }
552 
553 /*
554  * Display usage
555  */
556 static void
557 us_vhost_usage(const char *prgname)
558 {
559 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
560 	"		--vm2vm [0|1|2]\n"
561 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
562 	"		--dev-basename <name>\n"
563 	"		--nb-devices ND\n"
564 	"		-p PORTMASK: Set mask for ports to be used by application\n"
565 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
566 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
567 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
568 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
569 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
570 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
571 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572 	"		--dev-basename: The basename to be used for the character device.\n"
573 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
574 			"zero copy\n"
575 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
576 			"used only when zero copy is enabled.\n"
577 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
578 			"used only when zero copy is enabled.\n",
579 	       prgname);
580 }
581 
582 /*
583  * Parse the arguments given in the command line of the application.
584  */
585 static int
586 us_vhost_parse_args(int argc, char **argv)
587 {
588 	int opt, ret;
589 	int option_index;
590 	unsigned i;
591 	const char *prgname = argv[0];
592 	static struct option long_option[] = {
593 		{"vm2vm", required_argument, NULL, 0},
594 		{"rx-retry", required_argument, NULL, 0},
595 		{"rx-retry-delay", required_argument, NULL, 0},
596 		{"rx-retry-num", required_argument, NULL, 0},
597 		{"mergeable", required_argument, NULL, 0},
598 		{"vlan-strip", required_argument, NULL, 0},
599 		{"stats", required_argument, NULL, 0},
600 		{"dev-basename", required_argument, NULL, 0},
601 		{"zero-copy", required_argument, NULL, 0},
602 		{"rx-desc-num", required_argument, NULL, 0},
603 		{"tx-desc-num", required_argument, NULL, 0},
604 		{NULL, 0, 0, 0},
605 	};
606 
607 	/* Parse command line */
608 	while ((opt = getopt_long(argc, argv, "p:P",
609 			long_option, &option_index)) != EOF) {
610 		switch (opt) {
611 		/* Portmask */
612 		case 'p':
613 			enabled_port_mask = parse_portmask(optarg);
614 			if (enabled_port_mask == 0) {
615 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616 				us_vhost_usage(prgname);
617 				return -1;
618 			}
619 			break;
620 
621 		case 'P':
622 			promiscuous = 1;
623 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
624 				ETH_VMDQ_ACCEPT_BROADCAST |
625 				ETH_VMDQ_ACCEPT_MULTICAST;
626 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
627 
628 			break;
629 
630 		case 0:
631 			/* Enable/disable vm2vm comms. */
632 			if (!strncmp(long_option[option_index].name, "vm2vm",
633 				MAX_LONG_OPT_SZ)) {
634 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
635 				if (ret == -1) {
636 					RTE_LOG(INFO, VHOST_CONFIG,
637 						"Invalid argument for "
638 						"vm2vm [0|1|2]\n");
639 					us_vhost_usage(prgname);
640 					return -1;
641 				} else {
642 					vm2vm_mode = (vm2vm_type)ret;
643 				}
644 			}
645 
646 			/* Enable/disable retries on RX. */
647 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
648 				ret = parse_num_opt(optarg, 1);
649 				if (ret == -1) {
650 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
651 					us_vhost_usage(prgname);
652 					return -1;
653 				} else {
654 					enable_retry = ret;
655 				}
656 			}
657 
658 			/* Specify the retries delay time (in useconds) on RX. */
659 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
660 				ret = parse_num_opt(optarg, INT32_MAX);
661 				if (ret == -1) {
662 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
663 					us_vhost_usage(prgname);
664 					return -1;
665 				} else {
666 					burst_rx_delay_time = ret;
667 				}
668 			}
669 
670 			/* Specify the retries number on RX. */
671 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
672 				ret = parse_num_opt(optarg, INT32_MAX);
673 				if (ret == -1) {
674 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
675 					us_vhost_usage(prgname);
676 					return -1;
677 				} else {
678 					burst_rx_retry_num = ret;
679 				}
680 			}
681 
682 			/* Enable/disable RX mergeable buffers. */
683 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
684 				ret = parse_num_opt(optarg, 1);
685 				if (ret == -1) {
686 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
687 					us_vhost_usage(prgname);
688 					return -1;
689 				} else {
690 					mergeable = !!ret;
691 					if (ret) {
692 						vmdq_conf_default.rxmode.jumbo_frame = 1;
693 						vmdq_conf_default.rxmode.max_rx_pkt_len
694 							= JUMBO_FRAME_MAX_SIZE;
695 					}
696 				}
697 			}
698 
699 			/* Enable/disable RX VLAN strip on host. */
700 			if (!strncmp(long_option[option_index].name,
701 				"vlan-strip", MAX_LONG_OPT_SZ)) {
702 				ret = parse_num_opt(optarg, 1);
703 				if (ret == -1) {
704 					RTE_LOG(INFO, VHOST_CONFIG,
705 						"Invalid argument for VLAN strip [0|1]\n");
706 					us_vhost_usage(prgname);
707 					return -1;
708 				} else {
709 					vlan_strip = !!ret;
710 					vmdq_conf_default.rxmode.hw_vlan_strip =
711 						vlan_strip;
712 				}
713 			}
714 
715 			/* Enable/disable stats. */
716 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
717 				ret = parse_num_opt(optarg, INT32_MAX);
718 				if (ret == -1) {
719 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
720 					us_vhost_usage(prgname);
721 					return -1;
722 				} else {
723 					enable_stats = ret;
724 				}
725 			}
726 
727 			/* Set character device basename. */
728 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
729 				if (us_vhost_parse_basename(optarg) == -1) {
730 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
731 					us_vhost_usage(prgname);
732 					return -1;
733 				}
734 			}
735 
736 			/* Enable/disable rx/tx zero copy. */
737 			if (!strncmp(long_option[option_index].name,
738 				"zero-copy", MAX_LONG_OPT_SZ)) {
739 				ret = parse_num_opt(optarg, 1);
740 				if (ret == -1) {
741 					RTE_LOG(INFO, VHOST_CONFIG,
742 						"Invalid argument"
743 						" for zero-copy [0|1]\n");
744 					us_vhost_usage(prgname);
745 					return -1;
746 				} else
747 					zero_copy = ret;
748 
749 				if (zero_copy) {
750 #ifdef RTE_MBUF_REFCNT
751 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
752 					"zero copy vhost APP, please "
753 					"disable RTE_MBUF_REFCNT\n"
754 					"in config file and then rebuild DPDK "
755 					"core lib!\n"
756 					"Otherwise please disable zero copy "
757 					"flag in command line!\n");
758 					return -1;
759 #endif
760 				}
761 			}
762 
763 			/* Specify the descriptor number on RX. */
764 			if (!strncmp(long_option[option_index].name,
765 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
766 				ret = parse_num_opt(optarg, MAX_RING_DESC);
767 				if ((ret == -1) || (!POWEROF2(ret))) {
768 					RTE_LOG(INFO, VHOST_CONFIG,
769 					"Invalid argument for rx-desc-num[0-N],"
770 					"power of 2 required.\n");
771 					us_vhost_usage(prgname);
772 					return -1;
773 				} else {
774 					num_rx_descriptor = ret;
775 				}
776 			}
777 
778 			/* Specify the descriptor number on TX. */
779 			if (!strncmp(long_option[option_index].name,
780 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
781 				ret = parse_num_opt(optarg, MAX_RING_DESC);
782 				if ((ret == -1) || (!POWEROF2(ret))) {
783 					RTE_LOG(INFO, VHOST_CONFIG,
784 					"Invalid argument for tx-desc-num [0-N],"
785 					"power of 2 required.\n");
786 					us_vhost_usage(prgname);
787 					return -1;
788 				} else {
789 					num_tx_descriptor = ret;
790 				}
791 			}
792 
793 			break;
794 
795 			/* Invalid option - print options. */
796 		default:
797 			us_vhost_usage(prgname);
798 			return -1;
799 		}
800 	}
801 
802 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
803 		if (enabled_port_mask & (1 << i))
804 			ports[num_ports++] = (uint8_t)i;
805 	}
806 
807 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
808 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
809 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
810 		return -1;
811 	}
812 
813 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
814 		RTE_LOG(INFO, VHOST_PORT,
815 			"Vhost zero copy doesn't support software vm2vm,"
816 			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
817 		return -1;
818 	}
819 
820 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
821 		RTE_LOG(INFO, VHOST_PORT,
822 			"Vhost zero copy doesn't support jumbo frame,"
823 			"please specify '--mergeable 0' to disable the "
824 			"mergeable feature.\n");
825 		return -1;
826 	}
827 
828 	return 0;
829 }
830 
831 /*
832  * Update the global variable num_ports and the ports array according to the
833  * number of ports on the system, and return the number of valid ports.
834  */
835 static unsigned check_ports_num(unsigned nb_ports)
836 {
837 	unsigned valid_num_ports = num_ports;
838 	unsigned portid;
839 
840 	if (num_ports > nb_ports) {
841 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
842 			num_ports, nb_ports);
843 		num_ports = nb_ports;
844 	}
845 
846 	for (portid = 0; portid < num_ports; portid ++) {
847 		if (ports[portid] >= nb_ports) {
848 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
849 				ports[portid], (nb_ports - 1));
850 			ports[portid] = INVALID_PORT_ID;
851 			valid_num_ports--;
852 		}
853 	}
854 	return valid_num_ports;
855 }
856 
857 /*
858  * Macro to print out packet contents. Wrapped in debug define so that the
859  * data path is not affected when debug is disabled.
860  */
861 #ifdef DEBUG
862 #define PRINT_PACKET(device, addr, size, header) do {																\
863 	char *pkt_addr = (char*)(addr);																					\
864 	unsigned int index;																								\
865 	char packet[MAX_PRINT_BUFF];																					\
866 																													\
867 	if ((header))																									\
868 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
869 	else																											\
870 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
871 	for (index = 0; index < (size); index++) {																		\
872 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
873 			"%02hhx ", pkt_addr[index]);																			\
874 	}																												\
875 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
876 																													\
877 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
878 } while(0)
879 #else
880 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
881 #endif
882 
883 /*
884  * Function to convert guest physical addresses to vhost physical addresses.
885  * This is used to convert virtio buffer addresses.
886  */
887 static inline uint64_t __attribute__((always_inline))
888 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
889 	uint32_t buf_len, hpa_type *addr_type)
890 {
891 	struct virtio_memory_regions_hpa *region;
892 	uint32_t regionidx;
893 	uint64_t vhost_pa = 0;
894 
895 	*addr_type = PHYS_ADDR_INVALID;
896 
897 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
898 		region = &vdev->regions_hpa[regionidx];
899 		if ((guest_pa >= region->guest_phys_address) &&
900 			(guest_pa <= region->guest_phys_address_end)) {
901 			vhost_pa = region->host_phys_addr_offset + guest_pa;
902 			if (likely((guest_pa + buf_len - 1)
903 				<= region->guest_phys_address_end))
904 				*addr_type = PHYS_ADDR_CONTINUOUS;
905 			else
906 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
907 			break;
908 		}
909 	}
910 
911 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
912 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
913 		(void *)(uintptr_t)vhost_pa);
914 
915 	return vhost_pa;
916 }
917 
918 /*
919  * Compares a packet destination MAC address to a device MAC address.
920  */
921 static inline int __attribute__((always_inline))
922 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
923 {
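	/* XOR the two 64-bit loads and mask with MAC_ADDR_CMP so only the 6 MAC address bytes are compared. */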
924 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
925 }
926 
927 /*
928  * This function learns the MAC address of the device and registers this along with a
929  * vlan tag to a VMDQ.
930  */
931 static int
932 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
933 {
934 	struct ether_hdr *pkt_hdr;
935 	struct virtio_net_data_ll *dev_ll;
936 	struct virtio_net *dev = vdev->dev;
937 	int i, ret;
938 
939 	/* Learn MAC address of guest device from packet */
940 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
941 
942 	dev_ll = ll_root_used;
943 
944 	while (dev_ll != NULL) {
945 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
946 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
947 			return -1;
948 		}
949 		dev_ll = dev_ll->next;
950 	}
951 
952 	for (i = 0; i < ETHER_ADDR_LEN; i++)
953 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
954 
955 	/* vlan_tag currently uses the device_id. */
956 	vdev->vlan_tag = vlan_tags[dev->device_fh];
957 
958 	/* Print out VMDQ registration info. */
959 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
960 		dev->device_fh,
961 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
962 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
963 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
964 		vdev->vlan_tag);
965 
966 	/* Register the MAC address. */
967 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
968 				(uint32_t)dev->device_fh + vmdq_pool_base);
969 	if (ret)
970 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
971 					dev->device_fh);
972 
973 	/* Enable stripping of the vlan tag as we handle routing. */
974 	if (vlan_strip)
975 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
976 			(uint16_t)vdev->vmdq_rx_q, 1);
977 
978 	/* Set device as ready for RX. */
979 	vdev->ready = DEVICE_RX;
980 
981 	return 0;
982 }
983 
984 /*
985  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
986  * queue before disabling RX on the device.
987  */
988 static inline void
989 unlink_vmdq(struct vhost_dev *vdev)
990 {
991 	unsigned i = 0;
992 	unsigned rx_count;
993 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
994 
995 	if (vdev->ready == DEVICE_RX) {
996 		/*clear MAC and VLAN settings*/
997 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
998 		for (i = 0; i < 6; i++)
999 			vdev->mac_address.addr_bytes[i] = 0;
1000 
1001 		vdev->vlan_tag = 0;
1002 
1003 		/*Clear out the receive buffers*/
1004 		rx_count = rte_eth_rx_burst(ports[0],
1005 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1006 
1007 		while (rx_count) {
1008 			for (i = 0; i < rx_count; i++)
1009 				rte_pktmbuf_free(pkts_burst[i]);
1010 
1011 			rx_count = rte_eth_rx_burst(ports[0],
1012 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1013 		}
1014 
1015 		vdev->ready = DEVICE_MAC_LEARNING;
1016 	}
1017 }
1018 
1019 /*
1020  * Check if the packet destination MAC address is for a local device. If so then put
1021  * the packet on that device's RX queue. If not then return.
1022  */
1023 static inline int __attribute__((always_inline))
1024 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1025 {
1026 	struct virtio_net_data_ll *dev_ll;
1027 	struct ether_hdr *pkt_hdr;
1028 	uint64_t ret = 0;
1029 	struct virtio_net *dev = vdev->dev;
1030 	struct virtio_net *tdev; /* destination virtio device */
1031 
1032 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1033 
1034 	/*get the used devices list*/
1035 	dev_ll = ll_root_used;
1036 
1037 	while (dev_ll != NULL) {
1038 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1039 				          &dev_ll->vdev->mac_address)) {
1040 
1041 			/* Drop the packet if the TX packet is destined for the TX device. */
1042 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1043 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1044 							dev->device_fh);
1045 				return 0;
1046 			}
1047 			tdev = dev_ll->vdev->dev;
1048 
1049 
1050 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1051 
1052 			if (unlikely(dev_ll->vdev->remove)) {
1053 				/*drop the packet if the device is marked for removal*/
1054 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1055 			} else {
1056 				/*send the packet to the local virtio device*/
1057 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1058 				if (enable_stats) {
1059 					rte_atomic64_add(
1060 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1061 					1);
1062 					rte_atomic64_add(
1063 					&dev_statistics[tdev->device_fh].rx_atomic,
1064 					ret);
1065 					dev_statistics[tdev->device_fh].tx_total++;
1066 					dev_statistics[tdev->device_fh].tx += ret;
1067 				}
1068 			}
1069 
1070 			return 0;
1071 		}
1072 		dev_ll = dev_ll->next;
1073 	}
1074 
1075 	return -1;
1076 }
1077 
1078 /*
1079  * Check if the destination MAC of a packet belongs to a local VM,
1080  * and if so get its VLAN tag and offset.
1081  */
1082 static inline int __attribute__((always_inline))
1083 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1084 	uint32_t *offset, uint16_t *vlan_tag)
1085 {
1086 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1087 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1088 
1089 	while (dev_ll != NULL) {
1090 		if ((dev_ll->vdev->ready == DEVICE_RX)
1091 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1092 		&dev_ll->vdev->mac_address)) {
1093 			/*
1094 			 * Drop the packet if the TX packet is
1095 			 * destined for the TX device.
1096 			 */
1097 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1098 				LOG_DEBUG(VHOST_DATA,
1099 				"(%"PRIu64") TX: Source and destination"
1100 				" MAC addresses are the same. Dropping "
1101 				"packet.\n",
1102 				dev_ll->vdev->dev->device_fh);
1103 				return -1;
1104 			}
1105 
1106 			/*
1107 			 * HW VLAN strip will reduce the packet length
1108 			 * by the length of the VLAN tag, so we need to
1109 			 * restore the packet length by adding it back.
1110 			 */
1111 			*offset = VLAN_HLEN;
1112 			*vlan_tag =
1113 			(uint16_t)
1114 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1115 
1116 			LOG_DEBUG(VHOST_DATA,
1117 			"(%"PRIu64") TX: pkt to local VM device id:"
1118 			"(%"PRIu64") vlan tag: %d.\n",
1119 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1120 			vlan_tag);
1121 
1122 			break;
1123 		}
1124 		dev_ll = dev_ll->next;
1125 	}
1126 	return 0;
1127 }
1128 
1129 /*
1130  * This function routes the TX packet to the correct interface. This may be a local device
1131  * or the physical port.
1132  */
1133 static inline void __attribute__((always_inline))
1134 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1135 {
1136 	struct mbuf_table *tx_q;
1137 	struct rte_mbuf **m_table;
1138 	unsigned len, ret, offset = 0;
1139 	const uint16_t lcore_id = rte_lcore_id();
1140 	struct virtio_net *dev = vdev->dev;
1141 	struct ether_hdr *nh;
1142 
1143 	/*check if destination is local VM*/
1144 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1145 		rte_pktmbuf_free(m);
1146 		return;
1147 	}
1148 
1149 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1150 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1151 			rte_pktmbuf_free(m);
1152 			return;
1153 		}
1154 	}
1155 
1156 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1157 
1158 	/*Add packet to the port tx queue*/
1159 	tx_q = &lcore_tx_queue[lcore_id];
1160 	len = tx_q->len;
1161 
1162 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1164 		/* Guest has inserted the vlan tag. */
1165 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1166 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1167 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1168 			(vh->vlan_tci != vlan_tag_be))
1169 			vh->vlan_tci = vlan_tag_be;
1170 	} else {
1171 		m->ol_flags = PKT_TX_VLAN_PKT;
1172 
1173 		/*
1174 		 * Find the right seg to adjust the data len when offset is
1175 		 * bigger than tail room size.
1176 		 */
1177 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1178 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1179 				m->data_len += offset;
1180 			else {
1181 				struct rte_mbuf *seg = m;
1182 
1183 				while ((seg->next != NULL) &&
1184 					(offset > rte_pktmbuf_tailroom(seg)))
1185 					seg = seg->next;
1186 
1187 				seg->data_len += offset;
1188 			}
1189 			m->pkt_len += offset;
1190 		}
1191 
1192 		m->vlan_tci = vlan_tag;
1193 	}
1194 
1195 	tx_q->m_table[len] = m;
1196 	len++;
1197 	if (enable_stats) {
1198 		dev_statistics[dev->device_fh].tx_total++;
1199 		dev_statistics[dev->device_fh].tx++;
1200 	}
1201 
1202 	if (unlikely(len == MAX_PKT_BURST)) {
1203 		m_table = (struct rte_mbuf **)tx_q->m_table;
1204 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1205 		/* Free any buffers not handled by TX and update the port stats. */
1206 		if (unlikely(ret < len)) {
1207 			do {
1208 				rte_pktmbuf_free(m_table[ret]);
1209 			} while (++ret < len);
1210 		}
1211 
1212 		len = 0;
1213 	}
1214 
1215 	tx_q->len = len;
1216 	return;
1217 }
1218 /*
1219  * This function is called by each data core. It handles all RX/TX registered with the
1220  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1221  * with all devices in the main linked list.
1222  */
1223 static int
1224 switch_worker(__attribute__((unused)) void *arg)
1225 {
1226 	struct rte_mempool *mbuf_pool = arg;
1227 	struct virtio_net *dev = NULL;
1228 	struct vhost_dev *vdev = NULL;
1229 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1230 	struct virtio_net_data_ll *dev_ll;
1231 	struct mbuf_table *tx_q;
1232 	volatile struct lcore_ll_info *lcore_ll;
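	/* drain_tsc: BURST_TX_DRAIN_US converted from microseconds into TSC cycles. */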
1233 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1234 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1235 	unsigned ret, i;
1236 	const uint16_t lcore_id = rte_lcore_id();
1237 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1238 	uint16_t rx_count = 0;
1239 	uint16_t tx_count;
1240 	uint32_t retry = 0;
1241 
1242 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1243 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1244 	prev_tsc = 0;
1245 
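	/* Use this lcore's position in lcore_ids[] as its dedicated TX queue index. */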
1246 	tx_q = &lcore_tx_queue[lcore_id];
1247 	for (i = 0; i < num_cores; i ++) {
1248 		if (lcore_ids[i] == lcore_id) {
1249 			tx_q->txq_id = i;
1250 			break;
1251 		}
1252 	}
1253 
1254 	while(1) {
1255 		cur_tsc = rte_rdtsc();
1256 		/*
1257 		 * TX burst queue drain
1258 		 */
1259 		diff_tsc = cur_tsc - prev_tsc;
1260 		if (unlikely(diff_tsc > drain_tsc)) {
1261 
1262 			if (tx_q->len) {
1263 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1264 
1265 				/*Tx any packets in the queue*/
1266 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1267 									   (struct rte_mbuf **)tx_q->m_table,
1268 									   (uint16_t)tx_q->len);
1269 				if (unlikely(ret < tx_q->len)) {
1270 					do {
1271 						rte_pktmbuf_free(tx_q->m_table[ret]);
1272 					} while (++ret < tx_q->len);
1273 				}
1274 
1275 				tx_q->len = 0;
1276 			}
1277 
1278 			prev_tsc = cur_tsc;
1279 
1280 		}
1281 
1282 		rte_prefetch0(lcore_ll->ll_root_used);
1283 		/*
1284 		 * Inform the configuration core that we have exited the linked list and that no devices are
1285 		 * in use if requested.
1286 		 */
1287 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1288 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1289 
1290 		/*
1291 		 * Process devices
1292 		 */
1293 		dev_ll = lcore_ll->ll_root_used;
1294 
1295 		while (dev_ll != NULL) {
1296 			/*get virtio device ID*/
1297 			vdev = dev_ll->vdev;
1298 			dev = vdev->dev;
1299 
1300 			if (unlikely(vdev->remove)) {
1301 				dev_ll = dev_ll->next;
1302 				unlink_vmdq(vdev);
1303 				vdev->ready = DEVICE_SAFE_REMOVE;
1304 				continue;
1305 			}
1306 			if (likely(vdev->ready == DEVICE_RX)) {
1307 				/*Handle guest RX*/
1308 				rx_count = rte_eth_rx_burst(ports[0],
1309 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1310 
1311 				if (rx_count) {
1312 					/*
1313 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1314 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1315 					*/
1316 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1317 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1318 							rte_delay_us(burst_rx_delay_time);
1319 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1320 								break;
1321 						}
1322 					}
1323 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1324 					if (enable_stats) {
1325 						rte_atomic64_add(
1326 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1327 						rx_count);
1328 						rte_atomic64_add(
1329 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1330 					}
1331 					while (likely(rx_count)) {
1332 						rx_count--;
1333 						rte_pktmbuf_free(pkts_burst[rx_count]);
1334 					}
1335 
1336 				}
1337 			}
1338 
1339 			if (likely(!vdev->remove)) {
1340 				/* Handle guest TX*/
1341 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1342 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1343 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1344 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1345 						while (tx_count)
1346 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1347 					}
1348 				}
1349 				while (tx_count)
1350 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1351 			}
1352 
1353 			/*move to the next device in the list*/
1354 			dev_ll = dev_ll->next;
1355 		}
1356 	}
1357 
1358 	return 0;
1359 }
1360 
1361 /*
1362  * This function gets the number of available ring entries for zero copy RX.
1363  * Only one thread will call this function for a particular virtio device,
1364  * so it is designed as a non-thread-safe function.
1365  */
1366 static inline uint32_t __attribute__((always_inline))
1367 get_available_ring_num_zcp(struct virtio_net *dev)
1368 {
1369 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1370 	uint16_t avail_idx;
1371 
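	/* Volatile read: the guest updates avail->idx asynchronously. */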
1372 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1373 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1374 }
1375 
1376 /*
1377  * This function gets the available ring index for zero copy RX;
1378  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1379  * Only one thread will call this function for a particular virtio device,
1380  * so it is designed as a non-thread-safe function.
1381  */
1382 static inline uint32_t __attribute__((always_inline))
1383 get_available_ring_index_zcp(struct virtio_net *dev,
1384 	uint16_t *res_base_idx, uint32_t count)
1385 {
1386 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1387 	uint16_t avail_idx;
1388 	uint32_t retry = 0;
1389 	uint16_t free_entries;
1390 
1391 	*res_base_idx = vq->last_used_idx_res;
1392 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1393 	free_entries = (avail_idx - *res_base_idx);
1394 
1395 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1396 			"avail idx: %d, "
1397 			"res base idx:%d, free entries:%d\n",
1398 			dev->device_fh, avail_idx, *res_base_idx,
1399 			free_entries);
1400 
1401 	/*
1402 	 * If retry is enabled and the queue is full then we wait
1403 	 * and retry to avoid packet loss.
1404 	 */
1405 	if (enable_retry && unlikely(count > free_entries)) {
1406 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1407 			rte_delay_us(burst_rx_delay_time);
1408 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1409 			free_entries = (avail_idx - *res_base_idx);
1410 			if (count <= free_entries)
1411 				break;
1412 		}
1413 	}
1414 
1415 	/*check that we have enough buffers*/
1416 	if (unlikely(count > free_entries))
1417 		count = free_entries;
1418 
1419 	if (unlikely(count == 0)) {
1420 		LOG_DEBUG(VHOST_DATA,
1421 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1422 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1423 			dev->device_fh, avail_idx,
1424 			*res_base_idx, free_entries);
1425 		return 0;
1426 	}
1427 
1428 	vq->last_used_idx_res = *res_base_idx + count;
1429 
1430 	return count;
1431 }
1432 
1433 /*
1434  * This function puts a descriptor back on the used ring.
1435  */
1436 static inline void __attribute__((always_inline))
1437 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1438 {
1439 	uint16_t res_cur_idx = vq->last_used_idx;
1440 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1441 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
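	/* Ensure the used ring entry is fully written before the index is published to the guest. */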
1442 	rte_compiler_barrier();
1443 	*(volatile uint16_t *)&vq->used->idx += 1;
1444 	vq->last_used_idx += 1;
1445 
1446 	/* Kick the guest if necessary. */
1447 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1448 		eventfd_write((int)vq->kickfd, 1);
1449 }
1450 
1451 /*
1452  * This function gets an available descriptor from the virtio vring and an
1453  * unattached mbuf from vpool->ring, and attaches them together. It needs to
1454  * adjust the offset for buff_addr and phys_addr according to the PMD
1455  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1456  */
1457 static inline void __attribute__((always_inline))
1458 attach_rxmbuf_zcp(struct virtio_net *dev)
1459 {
1460 	uint16_t res_base_idx, desc_idx;
1461 	uint64_t buff_addr, phys_addr;
1462 	struct vhost_virtqueue *vq;
1463 	struct vring_desc *desc;
1464 	struct rte_mbuf *mbuf = NULL;
1465 	struct vpool *vpool;
1466 	hpa_type addr_type;
1467 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1468 
1469 	vpool = &vpool_array[vdev->vmdq_rx_q];
1470 	vq = dev->virtqueue[VIRTIO_RXQ];
1471 
1472 	do {
1473 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1474 				1) != 1))
1475 			return;
1476 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1477 
1478 		desc = &vq->desc[desc_idx];
1479 		if (desc->flags & VRING_DESC_F_NEXT) {
1480 			desc = &vq->desc[desc->next];
1481 			buff_addr = gpa_to_vva(dev, desc->addr);
1482 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1483 					&addr_type);
1484 		} else {
1485 			buff_addr = gpa_to_vva(dev,
1486 					desc->addr + vq->vhost_hlen);
1487 			phys_addr = gpa_to_hpa(vdev,
1488 					desc->addr + vq->vhost_hlen,
1489 					desc->len, &addr_type);
1490 		}
1491 
1492 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1493 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1494 				" address found when attaching RX frame buffer"
1495 				" address!\n", dev->device_fh);
1496 			put_desc_to_used_list_zcp(vq, desc_idx);
1497 			continue;
1498 		}
1499 
1500 		/*
1501 		 * Check if the frame buffer address from guest crosses
1502 		 * sub-region or not.
1503 		 */
1504 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1505 			RTE_LOG(ERR, VHOST_DATA,
1506 				"(%"PRIu64") Frame buffer address crossing "
1507 				"sub-region found when attaching RX frame "
1508 				"buffer address!\n",
1509 				dev->device_fh);
1510 			put_desc_to_used_list_zcp(vq, desc_idx);
1511 			continue;
1512 		}
1513 	} while (unlikely(phys_addr == 0));
1514 
1515 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1516 	if (unlikely(mbuf == NULL)) {
1517 		LOG_DEBUG(VHOST_DATA,
1518 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1519 			"ring_sc_dequeue fail.\n",
1520 			dev->device_fh);
1521 		put_desc_to_used_list_zcp(vq, desc_idx);
1522 		return;
1523 	}
1524 
1525 	if (unlikely(vpool->buf_size > desc->len)) {
1526 		LOG_DEBUG(VHOST_DATA,
1527 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1528 			"length(%d) of descriptor idx: %d less than room "
1529 			"size required: %d\n",
1530 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1531 		put_desc_to_used_list_zcp(vq, desc_idx);
1532 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1533 		return;
1534 	}
1535 
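	/* Attach the guest buffer to the mbuf: rewind buf_addr/buf_physaddr by the headroom so the frame data starts at data_off. */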
1536 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1537 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1538 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1539 	mbuf->data_len = desc->len;
1540 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1541 
1542 	LOG_DEBUG(VHOST_DATA,
1543 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1544 		"descriptor idx:%d\n",
1545 		dev->device_fh, res_base_idx, desc_idx);
1546 
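	/* Return the now-attached mbuf to the mempool so the PMD RX path can allocate it and DMA straight into the guest buffer. */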
1547 	__rte_mbuf_raw_free(mbuf);
1548 
1549 	return;
1550 }
1551 
1552 /*
1553  * Detach an attached packet mbuf -
1554  *  - restore original mbuf address and length values.
1555  *  - reset pktmbuf data and data_len to their default values.
1556  *  All other fields of the given packet mbuf will be left intact.
1557  *
1558  * @param m
1559  *   The attached packet mbuf.
1560  */
1561 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1562 {
1563 	const struct rte_mempool *mp = m->pool;
1564 	void *buf = RTE_MBUF_TO_BADDR(m);
1565 	uint32_t buf_ofs;
1566 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1567 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1568 
1569 	m->buf_addr = buf;
1570 	m->buf_len = (uint16_t)buf_len;
1571 
1572 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1573 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1574 	m->data_off = buf_ofs;
1575 
1576 	m->data_len = 0;
1577 }
1578 
1579 /*
1580  * This function is called after packets have been transmitted. It fetches
1581  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1582  * also updates the used index and kicks the guest if necessary.
1583  */
1584 static inline uint32_t __attribute__((always_inline))
1585 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1586 {
1587 	struct rte_mbuf *mbuf;
1588 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1589 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1590 	uint32_t index = 0;
1591 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1592 
1593 	LOG_DEBUG(VHOST_DATA,
1594 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1595 		"clean is: %d\n",
1596 		dev->device_fh, mbuf_count);
1597 	LOG_DEBUG(VHOST_DATA,
1598 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1599 		"clean  is : %d\n",
1600 		dev->device_fh, rte_ring_count(vpool->ring));
1601 
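	/* Reclaim each transmitted mbuf: detach it from the guest buffer, recycle it via vpool->ring and record its stashed descriptor index in the used ring. */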
1602 	for (index = 0; index < mbuf_count; index++) {
1603 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1604 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1605 			pktmbuf_detach_zcp(mbuf);
1606 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1607 
1608 		/* Update used index buffer information. */
1609 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1610 		vq->used->ring[used_idx].len = 0;
1611 
1612 		used_idx = (used_idx + 1) & (vq->size - 1);
1613 	}
1614 
1615 	LOG_DEBUG(VHOST_DATA,
1616 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1617 		"clean is: %d\n",
1618 		dev->device_fh, rte_mempool_count(vpool->pool));
1619 	LOG_DEBUG(VHOST_DATA,
1620 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1621 		"clean  is : %d\n",
1622 		dev->device_fh, rte_ring_count(vpool->ring));
1623 	LOG_DEBUG(VHOST_DATA,
1624 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1625 		"vq->last_used_idx:%d\n",
1626 		dev->device_fh, vq->last_used_idx);
1627 
1628 	vq->last_used_idx += mbuf_count;
1629 
1630 	LOG_DEBUG(VHOST_DATA,
1631 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1632 		"vq->last_used_idx:%d\n",
1633 		dev->device_fh, vq->last_used_idx);
1634 
1635 	rte_compiler_barrier();
1636 
1637 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1638 
1639 	/* Kick guest if required. */
1640 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1641 		eventfd_write((int)vq->kickfd, 1);
1642 
1643 	return 0;
1644 }
1645 
1646 /*
1647  * This function is called when a virtio device is destroyed. It fetches
1648  * mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1649  */
1650 static void mbuf_destroy_zcp(struct vpool *vpool)
1651 {
1652 	struct rte_mbuf *mbuf = NULL;
1653 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1654 
1655 	LOG_DEBUG(VHOST_CONFIG,
1656 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1657 		"mbuf_destroy_zcp is: %d\n",
1658 		mbuf_count);
1659 	LOG_DEBUG(VHOST_CONFIG,
1660 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1661 		"mbuf_destroy_zcp  is : %d\n",
1662 		rte_ring_count(vpool->ring));
1663 
1664 	for (index = 0; index < mbuf_count; index++) {
1665 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1666 		if (likely(mbuf != NULL)) {
1667 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1668 				pktmbuf_detach_zcp(mbuf);
1669 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1670 		}
1671 	}
1672 
1673 	LOG_DEBUG(VHOST_CONFIG,
1674 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1675 		"mbuf_destroy_zcp is: %d\n",
1676 		rte_mempool_count(vpool->pool));
1677 	LOG_DEBUG(VHOST_CONFIG,
1678 		"in mbuf_destroy_zcp: mbuf count in ring after "
1679 		"mbuf_destroy_zcp is : %d\n",
1680 		rte_ring_count(vpool->ring));
1681 }
1682 
1683 /*
1684  * This function updates the used ring entries and used index for packets received into the guest RX virtqueue (zero copy).
1685  */
1686 static inline uint32_t __attribute__((always_inline))
1687 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1688 	uint32_t count)
1689 {
1690 	struct vhost_virtqueue *vq;
1691 	struct vring_desc *desc;
1692 	struct rte_mbuf *buff;
1693 	/* The virtio_hdr is initialised to 0. */
1694 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1695 		= {{0, 0, 0, 0, 0, 0}, 0};
1696 	uint64_t buff_hdr_addr = 0;
1697 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1698 	uint32_t head_idx, packet_success = 0;
1699 	uint16_t res_cur_idx;
1700 
1701 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1702 
1703 	if (count == 0)
1704 		return 0;
1705 
1706 	vq = dev->virtqueue[VIRTIO_RXQ];
1707 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1708 
1709 	res_cur_idx = vq->last_used_idx;
1710 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1711 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1712 
1713 	/* Retrieve all of the head indexes first to avoid caching issues. */
1714 	for (head_idx = 0; head_idx < count; head_idx++)
1715 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
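	/*
	 * Each value in head[] is the index of the guest RX descriptor the
	 * mbuf was attached to; it was stored in the mbuf headroom when the
	 * buffer was attached.
	 */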
1716 
1717 	/* Prefetch descriptor index. */
1718 	rte_prefetch0(&vq->desc[head[packet_success]]);
1719 
1720 	while (packet_success != count) {
1721 		/* Get descriptor from available ring */
1722 		desc = &vq->desc[head[packet_success]];
1723 
1724 		buff = pkts[packet_success];
1725 		LOG_DEBUG(VHOST_DATA,
1726 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1727 			"pkt[%d] descriptor idx: %d\n",
1728 			dev->device_fh, packet_success,
1729 			MBUF_HEADROOM_UINT32(buff));
1730 
1731 		PRINT_PACKET(dev,
1732 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1733 			+ RTE_PKTMBUF_HEADROOM),
1734 			rte_pktmbuf_data_len(buff), 0);
1735 
1736 		/* Buffer address translation for virtio header. */
1737 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1738 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1739 
1740 		/*
1741 		 * If the descriptors are chained the header and data are
1742 		 * placed in separate buffers.
1743 		 */
1744 		if (desc->flags & VRING_DESC_F_NEXT) {
1745 			desc->len = vq->vhost_hlen;
1746 			desc = &vq->desc[desc->next];
1747 			desc->len = rte_pktmbuf_data_len(buff);
1748 		} else {
1749 			desc->len = packet_len;
1750 		}
1751 
1752 		/* Update used ring with desc information */
1753 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1754 			= head[packet_success];
1755 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1756 			= packet_len;
1757 		res_cur_idx++;
1758 		packet_success++;
1759 
1760 		/* A header is required per buffer. */
1761 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1762 			(const void *)&virtio_hdr, vq->vhost_hlen);
1763 
1764 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1765 
1766 		if (likely(packet_success < count)) {
1767 			/* Prefetch descriptor index. */
1768 			rte_prefetch0(&vq->desc[head[packet_success]]);
1769 		}
1770 	}
1771 
1772 	rte_compiler_barrier();
1773 
1774 	LOG_DEBUG(VHOST_DATA,
1775 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1776 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1777 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1778 
1779 	*(volatile uint16_t *)&vq->used->idx += count;
1780 	vq->last_used_idx += count;
1781 
1782 	LOG_DEBUG(VHOST_DATA,
1783 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1784 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1785 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1786 
1787 	/* Kick the guest if necessary. */
1788 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1789 		eventfd_write((int)vq->kickfd, 1);
1790 
1791 	return count;
1792 }
1793 
1794 /*
1795  * This function routes the TX packet to the correct interface.
1796  * This may be a local device or the physical port.
1797  */
1798 static inline void __attribute__((always_inline))
1799 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1800 	uint32_t desc_idx, uint8_t need_copy)
1801 {
1802 	struct mbuf_table *tx_q;
1803 	struct rte_mbuf **m_table;
1804 	struct rte_mbuf *mbuf = NULL;
1805 	unsigned len, ret, offset = 0;
1806 	struct vpool *vpool;
1807 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1808 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1809 
1810 	/* Add the packet to the port TX queue. */
1811 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1812 	len = tx_q->len;
1813 
1814 	/* Allocate an mbuf and populate the structure. */
1815 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1816 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1817 	if (unlikely(mbuf == NULL)) {
1818 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1819 		RTE_LOG(ERR, VHOST_DATA,
1820 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1821 			dev->device_fh);
1822 		put_desc_to_used_list_zcp(vq, desc_idx);
1823 		return;
1824 	}
1825 
1826 	if (vm2vm_mode == VM2VM_HARDWARE) {
1827 		/* Avoid using a VLAN tag from any VM for an external packet, such
1828 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1829 		 * selection: the MAC address marks it as an external packet that
1830 		 * should go out to the network, while the VLAN tag marks it as a
1831 		 * VM2VM packet that should be forwarded to another VM. The hardware
1832 		 * cannot resolve this ambiguity, so the packet would be lost.
1833 		 */
1834 		vlan_tag = external_pkt_default_vlan_tag;
1835 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1836 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1837 			__rte_mbuf_raw_free(mbuf);
1838 			return;
1839 		}
1840 	}
1841 
1842 	mbuf->nb_segs = m->nb_segs;
1843 	mbuf->next = m->next;
1844 	mbuf->data_len = m->data_len + offset;
1845 	mbuf->pkt_len = mbuf->data_len;
1846 	if (unlikely(need_copy)) {
1847 		/* Copy the packet contents to the mbuf. */
1848 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1849 			rte_pktmbuf_mtod(m, void *),
1850 			m->data_len);
1851 	} else {
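		/* Zero-copy path: reference the guest buffer directly, no payload copy. */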
1852 		mbuf->data_off = m->data_off;
1853 		mbuf->buf_physaddr = m->buf_physaddr;
1854 		mbuf->buf_addr = m->buf_addr;
1855 	}
1856 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1857 	mbuf->vlan_tci = vlan_tag;
1858 	mbuf->l2_len = sizeof(struct ether_hdr);
1859 	mbuf->l3_len = sizeof(struct ipv4_hdr);
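	/*
	 * Stash the guest descriptor index in the mbuf headroom so that
	 * txmbuf_clean_zcp() can return the descriptor to the used ring once
	 * the packet has been transmitted.
	 */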
1860 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1861 
1862 	tx_q->m_table[len] = mbuf;
1863 	len++;
1864 
1865 	LOG_DEBUG(VHOST_DATA,
1866 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1867 		dev->device_fh,
1868 		mbuf->nb_segs,
1869 		(mbuf->next == NULL) ? "null" : "non-null");
1870 
1871 	if (enable_stats) {
1872 		dev_statistics[dev->device_fh].tx_total++;
1873 		dev_statistics[dev->device_fh].tx++;
1874 	}
1875 
1876 	if (unlikely(len == MAX_PKT_BURST)) {
1877 		m_table = (struct rte_mbuf **)tx_q->m_table;
1878 		ret = rte_eth_tx_burst(ports[0],
1879 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1880 
1881 		/*
1882 		 * Free any buffers not handled by TX and update
1883 		 * the port stats.
1884 		 */
1885 		if (unlikely(ret < len)) {
1886 			do {
1887 				rte_pktmbuf_free(m_table[ret]);
1888 			} while (++ret < len);
1889 		}
1890 
1891 		len = 0;
1892 		txmbuf_clean_zcp(dev, vpool);
1893 	}
1894 
1895 	tx_q->len = len;
1896 
1897 	return;
1898 }
1899 
1900 /*
1901  * This function transmits all available packets in the virtio TX queue for
1902  * one virtio-net device. On the first packet it learns the MAC address and
1903  * sets up VMDQ.
1904  */
1905 static inline void __attribute__((always_inline))
1906 virtio_dev_tx_zcp(struct virtio_net *dev)
1907 {
1908 	struct rte_mbuf m;
1909 	struct vhost_virtqueue *vq;
1910 	struct vring_desc *desc;
1911 	uint64_t buff_addr = 0, phys_addr;
1912 	uint32_t head[MAX_PKT_BURST];
1913 	uint32_t i;
1914 	uint16_t free_entries, packet_success = 0;
1915 	uint16_t avail_idx;
1916 	uint8_t need_copy = 0;
1917 	hpa_type addr_type;
1918 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1919 
1920 	vq = dev->virtqueue[VIRTIO_TXQ];
1921 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1922 
1923 	/* If there are no available buffers then return. */
1924 	if (vq->last_used_idx_res == avail_idx)
1925 		return;
1926 
1927 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1928 
1929 	/* Prefetch available ring to retrieve head indexes. */
1930 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1931 
1932 	/* Get the number of free entries in the ring */
1933 	free_entries = (avail_idx - vq->last_used_idx_res);
1934 
1935 	/* Limit to MAX_PKT_BURST. */
1936 	free_entries
1937 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1938 
1939 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1940 		dev->device_fh, free_entries);
1941 
1942 	/* Retrieve all of the head indexes first to avoid caching issues. */
1943 	for (i = 0; i < free_entries; i++)
1944 		head[i]
1945 			= vq->avail->ring[(vq->last_used_idx_res + i)
1946 			& (vq->size - 1)];
1947 
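	/*
	 * The avail entries are consumed here; the used ring and used index
	 * are updated later, when the buffers are recycled in
	 * txmbuf_clean_zcp().
	 */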
1948 	vq->last_used_idx_res += free_entries;
1949 
1950 	/* Prefetch descriptor index. */
1951 	rte_prefetch0(&vq->desc[head[packet_success]]);
1952 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1953 
1954 	while (packet_success < free_entries) {
1955 		desc = &vq->desc[head[packet_success]];
1956 
1957 		/* Discard first buffer as it is the virtio header */
1958 		desc = &vq->desc[desc->next];
1959 
1960 		/* Buffer address translation. */
1961 		buff_addr = gpa_to_vva(dev, desc->addr);
1962 		/* Need to allow for an extra VLAN_HLEN bytes for inserting the VLAN tag. */
1963 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1964 			&addr_type);
1965 
1966 		if (likely(packet_success < (free_entries - 1)))
1967 			/* Prefetch descriptor index. */
1968 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1969 
1970 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1971 			RTE_LOG(ERR, VHOST_DATA,
1972 				"(%"PRIu64") Invalid frame buffer address found "
1973 				"when transmitting packets!\n",
1974 				dev->device_fh);
1975 			packet_success++;
1976 			continue;
1977 		}
1978 
1979 		/* Prefetch buffer address. */
1980 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1981 
1982 		/*
1983 		 * Setup dummy mbuf. This is copied to a real mbuf if
1984 		 * transmitted out the physical port.
1985 		 */
1986 		m.data_len = desc->len;
1987 		m.nb_segs = 1;
1988 		m.next = NULL;
1989 		m.data_off = 0;
1990 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1991 		m.buf_physaddr = phys_addr;
1992 
1993 		/*
1994 		 * Check if the frame buffer address from guest crosses
1995 		 * sub-region or not.
1996 		 */
1997 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1998 			RTE_LOG(ERR, VHOST_DATA,
1999 				"(%"PRIu64") Frame buffer address crosses a "
2000 				"sub-region boundary when attaching the TX "
2001 				"frame buffer address!\n",
2002 				dev->device_fh);
2003 			need_copy = 1;
2004 		} else
2005 			need_copy = 0;
2006 
2007 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2008 
2009 		/*
2010 		 * If this is the first received packet we need to learn
2011 		 * the MAC and setup VMDQ
2012 		 */
2013 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2014 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2015 				/*
2016 				 * Discard frame if device is scheduled for
2017 				 * removal or a duplicate MAC address is found.
2018 				 */
2019 				packet_success += free_entries;
2020 				vq->last_used_idx += packet_success;
2021 				break;
2022 			}
2023 		}
2024 
2025 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2026 		packet_success++;
2027 	}
2028 }
2029 
2030 /*
2031  * This function is called by each data core. It handles all RX/TX registered
2032  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2033  * addresses are compared with all devices in the main linked list.
2034  */
2035 static int
2036 switch_worker_zcp(__attribute__((unused)) void *arg)
2037 {
2038 	struct virtio_net *dev = NULL;
2039 	struct vhost_dev  *vdev = NULL;
2040 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2041 	struct virtio_net_data_ll *dev_ll;
2042 	struct mbuf_table *tx_q;
2043 	volatile struct lcore_ll_info *lcore_ll;
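	/* TX drain period (BURST_TX_DRAIN_US) expressed in TSC cycles. */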
2044 	const uint64_t drain_tsc
2045 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2046 		* BURST_TX_DRAIN_US;
2047 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2048 	unsigned ret;
2049 	const uint16_t lcore_id = rte_lcore_id();
2050 	uint16_t count_in_ring, rx_count = 0;
2051 
2052 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2053 
2054 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2055 	prev_tsc = 0;
2056 
2057 	while (1) {
2058 		cur_tsc = rte_rdtsc();
2059 
2060 		/* TX burst queue drain */
2061 		diff_tsc = cur_tsc - prev_tsc;
2062 		if (unlikely(diff_tsc > drain_tsc)) {
2063 			/*
2064 			 * Get mbuf from vpool.pool and detach mbuf and
2065 			 * put back into vpool.ring.
2066 			 */
2067 			dev_ll = lcore_ll->ll_root_used;
2068 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2069 				/* Get virtio device ID */
2070 				vdev = dev_ll->vdev;
2071 				dev = vdev->dev;
2072 
2073 				if (likely(!vdev->remove)) {
2074 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2075 					if (tx_q->len) {
2076 						LOG_DEBUG(VHOST_DATA,
2077 						"TX queue drained after timeout"
2078 						" with burst size %u\n",
2079 						tx_q->len);
2080 
2081 						/*
2082 						 * Tx any packets in the queue
2083 						 */
2084 						ret = rte_eth_tx_burst(
2085 							ports[0],
2086 							(uint16_t)tx_q->txq_id,
2087 							(struct rte_mbuf **)
2088 							tx_q->m_table,
2089 							(uint16_t)tx_q->len);
2090 						if (unlikely(ret < tx_q->len)) {
2091 							do {
2092 								rte_pktmbuf_free(
2093 									tx_q->m_table[ret]);
2094 							} while (++ret < tx_q->len);
2095 						}
2096 						tx_q->len = 0;
2097 
2098 						txmbuf_clean_zcp(dev,
2099 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2100 					}
2101 				}
2102 				dev_ll = dev_ll->next;
2103 			}
2104 			prev_tsc = cur_tsc;
2105 		}
2106 
2107 		rte_prefetch0(lcore_ll->ll_root_used);
2108 
2109 		/*
2110 		 * Inform the configuration core that we have exited the linked
2111 		 * list and that no devices are in use if requested.
2112 		 */
2113 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2114 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2115 
2116 		/* Process devices */
2117 		dev_ll = lcore_ll->ll_root_used;
2118 
2119 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2120 			vdev = dev_ll->vdev;
2121 			dev  = vdev->dev;
2122 			if (unlikely(vdev->remove)) {
2123 				dev_ll = dev_ll->next;
2124 				unlink_vmdq(vdev);
2125 				vdev->ready = DEVICE_SAFE_REMOVE;
2126 				continue;
2127 			}
2128 
2129 			if (likely(vdev->ready == DEVICE_RX)) {
2130 				uint32_t index = vdev->vmdq_rx_q;
2131 				uint16_t i;
2132 				count_in_ring
2133 				= rte_ring_count(vpool_array[index].ring);
2134 				uint16_t free_entries
2135 				= (uint16_t)get_available_ring_num_zcp(dev);
2136 
2137 				/*
2138 				 * Attach all mbufs in vpool.ring and put back
2139 				 * into vpool.pool.
2140 				 */
2141 				for (i = 0;
2142 				i < RTE_MIN(free_entries,
2143 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2144 				i++)
2145 					attach_rxmbuf_zcp(dev);
2146 
2147 				/* Handle guest RX */
2148 				rx_count = rte_eth_rx_burst(ports[0],
2149 					vdev->vmdq_rx_q, pkts_burst,
2150 					MAX_PKT_BURST);
2151 
2152 				if (rx_count) {
2153 					ret_count = virtio_dev_rx_zcp(dev,
2154 							pkts_burst, rx_count);
2155 					if (enable_stats) {
2156 						dev_statistics[dev->device_fh].rx_total
2157 							+= rx_count;
2158 						dev_statistics[dev->device_fh].rx
2159 							+= ret_count;
2160 					}
2161 					while (likely(rx_count)) {
2162 						rx_count--;
2163 						pktmbuf_detach_zcp(
2164 							pkts_burst[rx_count]);
2165 						rte_ring_sp_enqueue(
2166 							vpool_array[index].ring,
2167 							(void *)pkts_burst[rx_count]);
2168 					}
2169 				}
2170 			}
2171 
2172 			if (likely(!vdev->remove))
2173 				/* Handle guest TX */
2174 				virtio_dev_tx_zcp(dev);
2175 
2176 			/* Move to the next device in the list */
2177 			dev_ll = dev_ll->next;
2178 		}
2179 	}
2180 
2181 	return 0;
2182 }
2183 
2184 
2185 /*
2186  * Add an entry to a used linked list. A free entry must first be found
2187  * in the free linked list using get_data_ll_free_entry();
2188  */
2189 static void
2190 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2191 	struct virtio_net_data_ll *ll_dev)
2192 {
2193 	struct virtio_net_data_ll *ll = *ll_root_addr;
2194 
2195 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2196 	ll_dev->next = NULL;
2197 	rte_compiler_barrier();
2198 
2199 	/* If ll == NULL then this is the first device. */
2200 	if (ll) {
2201 		/* Increment to the tail of the linked list. */
2202 		while ((ll->next != NULL) )
2203 			ll = ll->next;
2204 
2205 		ll->next = ll_dev;
2206 	} else {
2207 		*ll_root_addr = ll_dev;
2208 	}
2209 }
2210 
2211 /*
2212  * Remove an entry from a used linked list. The entry must then be added to
2213  * the free linked list using put_data_ll_free_entry().
2214  */
2215 static void
2216 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2217 	struct virtio_net_data_ll *ll_dev,
2218 	struct virtio_net_data_ll *ll_dev_last)
2219 {
2220 	struct virtio_net_data_ll *ll = *ll_root_addr;
2221 
2222 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2223 		return;
2224 
2225 	if (ll_dev == ll)
2226 		*ll_root_addr = ll_dev->next;
2227 	else
2228 		if (likely(ll_dev_last != NULL))
2229 			ll_dev_last->next = ll_dev->next;
2230 		else
2231 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2232 }
2233 
2234 /*
2235  * Find and return an entry from the free linked list.
2236  */
2237 static struct virtio_net_data_ll *
2238 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2239 {
2240 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2241 	struct virtio_net_data_ll *ll_dev;
2242 
2243 	if (ll_free == NULL)
2244 		return NULL;
2245 
2246 	ll_dev = ll_free;
2247 	*ll_root_addr = ll_free->next;
2248 
2249 	return ll_dev;
2250 }
2251 
2252 /*
2253  * Place an entry back on to the free linked list.
2254  */
2255 static void
2256 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2257 	struct virtio_net_data_ll *ll_dev)
2258 {
2259 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2260 
2261 	if (ll_dev == NULL)
2262 		return;
2263 
2264 	ll_dev->next = ll_free;
2265 	*ll_root_addr = ll_dev;
2266 }
2267 
2268 /*
2269  * Creates a linked list of a given size.
2270  */
2271 static struct virtio_net_data_ll *
2272 alloc_data_ll(uint32_t size)
2273 {
2274 	struct virtio_net_data_ll *ll_new;
2275 	uint32_t i;
2276 
2277 	/* Malloc and then chain the linked list. */
2278 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2279 	if (ll_new == NULL) {
2280 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2281 		return NULL;
2282 	}
2283 
2284 	for (i = 0; i < size - 1; i++) {
2285 		ll_new[i].vdev = NULL;
2286 		ll_new[i].next = &ll_new[i+1];
2287 	}
2288 	ll_new[i].next = NULL;
2289 
2290 	return (ll_new);
2291 }
2292 
2293 /*
2294  * Create the main linked list along with each individual core's linked list. A used and a free list
2295  * are created to manage entries.
2296  */
2297 static int
2298 init_data_ll (void)
2299 {
2300 	int lcore;
2301 
2302 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2303 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2304 		if (lcore_info[lcore].lcore_ll == NULL) {
2305 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2306 			return -1;
2307 		}
2308 
2309 		lcore_info[lcore].lcore_ll->device_num = 0;
2310 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2311 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2312 		if (num_devices % num_switching_cores)
2313 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2314 		else
2315 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2316 	}
2317 
2318 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2319 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2320 
2321 	return 0;
2322 }
2323 
2324 /*
2325  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2326  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2327  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2328  */
2329 static void
2330 destroy_device (volatile struct virtio_net *dev)
2331 {
2332 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2333 	struct virtio_net_data_ll *ll_main_dev_cur;
2334 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2335 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2336 	struct vhost_dev *vdev;
2337 	int lcore;
2338 
2339 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2340 
2341 	vdev = (struct vhost_dev *)dev->priv;
2342 	/* Set the remove flag. */
2343 	vdev->remove = 1;
2344 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2345 		rte_pause();
2346 	}
2347 
2348 	/* Search for entry to be removed from lcore ll */
2349 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2350 	while (ll_lcore_dev_cur != NULL) {
2351 		if (ll_lcore_dev_cur->vdev == vdev) {
2352 			break;
2353 		} else {
2354 			ll_lcore_dev_last = ll_lcore_dev_cur;
2355 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2356 		}
2357 	}
2358 
2359 	if (ll_lcore_dev_cur == NULL) {
2360 		RTE_LOG(ERR, VHOST_CONFIG,
2361 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2362 			dev->device_fh);
2363 		return;
2364 	}
2365 
2366 	/* Search for entry to be removed from main ll */
2367 	ll_main_dev_cur = ll_root_used;
2368 	ll_main_dev_last = NULL;
2369 	while (ll_main_dev_cur != NULL) {
2370 		if (ll_main_dev_cur->vdev == vdev) {
2371 			break;
2372 		} else {
2373 			ll_main_dev_last = ll_main_dev_cur;
2374 			ll_main_dev_cur = ll_main_dev_cur->next;
2375 		}
2376 	}
2377 
2378 	/* Remove entries from the lcore and main ll. */
2379 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2380 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2381 
2382 	/* Set the dev_removal_flag on each lcore. */
2383 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2384 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2385 	}
2386 
2387 	/*
2388 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2389 	 * they can no longer access the device removed from the linked lists and that the devices
2390 	 * are no longer in use.
2391 	 */
2392 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2393 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2394 			rte_pause();
2395 		}
2396 	}
2397 
2398 	/* Add the entries back to the lcore and main free ll.*/
2399 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2400 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2401 
2402 	/* Decrement number of device on the lcore. */
2403 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2404 
2405 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2406 
2407 	if (zero_copy) {
2408 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2409 
2410 		/* Stop the RX queue. */
2411 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2412 			LOG_DEBUG(VHOST_CONFIG,
2413 				"(%"PRIu64") In destroy_device: Failed to stop "
2414 				"rx queue:%d\n",
2415 				dev->device_fh,
2416 				vdev->vmdq_rx_q);
2417 		}
2418 
2419 		LOG_DEBUG(VHOST_CONFIG,
2420 			"(%"PRIu64") in destroy_device: Start put mbuf in "
2421 			"mempool back to ring for RX queue: %d\n",
2422 			dev->device_fh, vdev->vmdq_rx_q);
2423 
2424 		mbuf_destroy_zcp(vpool);
2425 
2426 		/* Stop the TX queue. */
2427 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2428 			LOG_DEBUG(VHOST_CONFIG,
2429 				"(%"PRIu64") In destroy_device: Failed to "
2430 				"stop tx queue:%d\n",
2431 				dev->device_fh, vdev->vmdq_rx_q);
2432 		}
2433 
2434 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2435 
2436 		LOG_DEBUG(VHOST_CONFIG,
2437 			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
2438 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2439 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2440 			dev->device_fh);
2441 
2442 		mbuf_destroy_zcp(vpool);
2443 		rte_free(vdev->regions_hpa);
2444 	}
2445 	rte_free(vdev);
2446 
2447 }
2448 
2449 /*
2450  * Calculate the number of physically contiguous regions within one particular
2451  * region whose vhost virtual addresses are contiguous. The region starts at
2452  * vva_start and is 'size' bytes long.
2453  */
2454 static uint32_t
2455 check_hpa_regions(uint64_t vva_start, uint64_t size)
2456 {
2457 	uint32_t i, nregions = 0, page_size = getpagesize();
2458 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2459 	if (vva_start % page_size) {
2460 		LOG_DEBUG(VHOST_CONFIG,
2461 			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
2462 			"has remainder\n",
2463 			(void *)(uintptr_t)vva_start, page_size);
2464 		return 0;
2465 	}
2466 	if (size % page_size) {
2467 		LOG_DEBUG(VHOST_CONFIG,
2468 			"in check_hpa_regions: "
2469 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2470 			size, page_size);
2471 		return 0;
2472 	}
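	/*
	 * Walk the region page by page and count every point where the host
	 * physical mapping becomes discontiguous.
	 */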
2473 	for (i = 0; i < size - page_size; i = i + page_size) {
2474 		cur_phys_addr
2475 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2476 		next_phys_addr = rte_mem_virt2phy(
2477 			(void *)(uintptr_t)(vva_start + i + page_size));
2478 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2479 			++nregions;
2480 			LOG_DEBUG(VHOST_CONFIG,
2481 				"in check_hpa_regions: hva addr:(%p) is not "
2482 				"continuous with hva addr:(%p), diff:%d\n",
2483 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2484 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2485 				+ page_size), page_size);
2486 			LOG_DEBUG(VHOST_CONFIG,
2487 				"in check_hpa_regions: hpa addr:(%p) is not "
2488 				"continuous with hpa addr:(%p), "
2489 				"diff:(%"PRIu64")\n",
2490 				(void *)(uintptr_t)cur_phys_addr,
2491 				(void *)(uintptr_t)next_phys_addr,
2492 				(next_phys_addr-cur_phys_addr));
2493 		}
2494 	}
2495 	return nregions;
2496 }
2497 
2498 /*
2499  * Divide each region whose vhost virtual addresses are contiguous into
2500  * sub-regions, making sure the physical addresses within each sub-region are
2501  * contiguous, and fill the offset (to GPA), size and other information of
2502  * each sub-region into regions_hpa.
2503  */
2504 static uint32_t
2505 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2506 {
2507 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2508 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2509 
2510 	if (mem_region_hpa == NULL)
2511 		return 0;
2512 
2513 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2514 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2515 			virtio_memory->regions[regionidx].address_offset;
2516 		mem_region_hpa[regionidx_hpa].guest_phys_address
2517 			= virtio_memory->regions[regionidx].guest_phys_address;
2518 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2519 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2520 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2521 		LOG_DEBUG(VHOST_CONFIG,
2522 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2523 			regionidx_hpa,
2524 			(void *)(uintptr_t)
2525 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2526 		LOG_DEBUG(VHOST_CONFIG,
2527 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2528 			regionidx_hpa,
2529 			(void *)(uintptr_t)
2530 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2531 		for (i = 0, k = 0;
2532 			i < virtio_memory->regions[regionidx].memory_size -
2533 				page_size;
2534 			i += page_size) {
2535 			cur_phys_addr = rte_mem_virt2phy(
2536 					(void *)(uintptr_t)(vva_start + i));
2537 			next_phys_addr = rte_mem_virt2phy(
2538 					(void *)(uintptr_t)(vva_start +
2539 					i + page_size));
2540 			if ((cur_phys_addr + page_size) != next_phys_addr) {
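				/*
				 * Host physical discontinuity: close the current
				 * sub-region and start a new one at the next page.
				 */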
2541 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2542 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2543 					k + page_size;
2544 				mem_region_hpa[regionidx_hpa].memory_size
2545 					= k + page_size;
2546 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2547 					"phys addr end  [%d]:(%p)\n",
2548 					regionidx_hpa,
2549 					(void *)(uintptr_t)
2550 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2551 				LOG_DEBUG(VHOST_CONFIG,
2552 					"in fill_hpa_regions: guest phys addr "
2553 					"size [%d]:(%p)\n",
2554 					regionidx_hpa,
2555 					(void *)(uintptr_t)
2556 					(mem_region_hpa[regionidx_hpa].memory_size));
2557 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2558 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2559 				++regionidx_hpa;
2560 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2561 					next_phys_addr -
2562 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2563 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2564 					" phys addr start[%d]:(%p)\n",
2565 					regionidx_hpa,
2566 					(void *)(uintptr_t)
2567 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2568 				LOG_DEBUG(VHOST_CONFIG,
2569 					"in fill_hpa_regions: host  phys addr "
2570 					"start[%d]:(%p)\n",
2571 					regionidx_hpa,
2572 					(void *)(uintptr_t)
2573 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2574 				k = 0;
2575 			} else {
2576 				k += page_size;
2577 			}
2578 		}
2579 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2580 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2581 			+ k + page_size;
2582 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2583 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2584 			"[%d]:(%p)\n", regionidx_hpa,
2585 			(void *)(uintptr_t)
2586 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2587 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2588 			"[%d]:(%p)\n", regionidx_hpa,
2589 			(void *)(uintptr_t)
2590 			(mem_region_hpa[regionidx_hpa].memory_size));
2591 		++regionidx_hpa;
2592 	}
2593 	return regionidx_hpa;
2594 }
2595 
2596 /*
2597  * A new device is added to a data core. First the device is added to the main linked list
2598  * and then allocated to a specific data core.
2599  */
2600 static int
2601 new_device (struct virtio_net *dev)
2602 {
2603 	struct virtio_net_data_ll *ll_dev;
2604 	int lcore, core_add = 0;
2605 	uint32_t device_num_min = num_devices;
2606 	struct vhost_dev *vdev;
2607 	uint32_t regionidx;
2608 
2609 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2610 	if (vdev == NULL) {
2611 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2612 			dev->device_fh);
2613 		return -1;
2614 	}
2615 	vdev->dev = dev;
2616 	dev->priv = vdev;
2617 
2618 	if (zero_copy) {
2619 		vdev->nregions_hpa = dev->mem->nregions;
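		/*
		 * Start with one HPA sub-region per guest memory region, then
		 * add one for every physical discontinuity found inside a
		 * region.
		 */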
2620 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2621 			vdev->nregions_hpa
2622 				+= check_hpa_regions(
2623 					dev->mem->regions[regionidx].guest_phys_address
2624 					+ dev->mem->regions[regionidx].address_offset,
2625 					dev->mem->regions[regionidx].memory_size);
2626 
2627 		}
2628 
2629 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2630 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2631 			RTE_CACHE_LINE_SIZE);
2632 		if (vdev->regions_hpa == NULL) {
2633 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2634 			rte_free(vdev);
2635 			return -1;
2636 		}
2637 
2638 
2639 		if (fill_hpa_memory_regions(
2640 			vdev->regions_hpa, dev->mem
2641 			) != vdev->nregions_hpa) {
2642 
2643 			RTE_LOG(ERR, VHOST_CONFIG,
2644 				"hpa memory regions number mismatch: "
2645 				"[%d]\n", vdev->nregions_hpa);
2646 			rte_free(vdev->regions_hpa);
2647 			rte_free(vdev);
2648 			return -1;
2649 		}
2650 	}
2651 
2652 
2653 	/* Add device to main ll */
2654 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2655 	if (ll_dev == NULL) {
2656 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2657 			"of %d devices per core has been reached\n",
2658 			dev->device_fh, num_devices);
2659 		if (vdev->regions_hpa)
2660 			rte_free(vdev->regions_hpa);
2661 		rte_free(vdev);
2662 		return -1;
2663 	}
2664 	ll_dev->vdev = vdev;
2665 	add_data_ll_entry(&ll_root_used, ll_dev);
2666 	vdev->vmdq_rx_q
2667 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
2668 
2669 	if (zero_copy) {
2670 		uint32_t index = vdev->vmdq_rx_q;
2671 		uint32_t count_in_ring, i;
2672 		struct mbuf_table *tx_q;
2673 
2674 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2675 
2676 		LOG_DEBUG(VHOST_CONFIG,
2677 			"(%"PRIu64") in new_device: mbuf count in mempool "
2678 			"before attach is: %d\n",
2679 			dev->device_fh,
2680 			rte_mempool_count(vpool_array[index].pool));
2681 		LOG_DEBUG(VHOST_CONFIG,
2682 			"(%"PRIu64") in new_device: mbuf count in  ring "
2683 			"before attach  is : %d\n",
2684 			dev->device_fh, count_in_ring);
2685 
2686 		/*
2687 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2688 		 */
2689 		for (i = 0; i < count_in_ring; i++)
2690 			attach_rxmbuf_zcp(dev);
2691 
2692 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2693 			"mempool after attach is: %d\n",
2694 			dev->device_fh,
2695 			rte_mempool_count(vpool_array[index].pool));
2696 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2697 			"ring after attach  is : %d\n",
2698 			dev->device_fh,
2699 			rte_ring_count(vpool_array[index].ring));
2700 
2701 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2702 		tx_q->txq_id = vdev->vmdq_rx_q;
2703 
2704 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2705 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2706 
2707 			LOG_DEBUG(VHOST_CONFIG,
2708 				"(%"PRIu64") In new_device: Failed to start "
2709 				"tx queue:%d\n",
2710 				dev->device_fh, vdev->vmdq_rx_q);
2711 
2712 			mbuf_destroy_zcp(vpool);
2713 			rte_free(vdev->regions_hpa);
2714 			rte_free(vdev);
2715 			return -1;
2716 		}
2717 
2718 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2719 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2720 
2721 			LOG_DEBUG(VHOST_CONFIG,
2722 				"(%"PRIu64") In new_device: Failed to start "
2723 				"rx queue:%d\n",
2724 				dev->device_fh, vdev->vmdq_rx_q);
2725 
2726 			/* Stop the TX queue. */
2727 			if (rte_eth_dev_tx_queue_stop(ports[0],
2728 				vdev->vmdq_rx_q) != 0) {
2729 				LOG_DEBUG(VHOST_CONFIG,
2730 					"(%"PRIu64") In new_device: Failed to "
2731 					"stop tx queue:%d\n",
2732 					dev->device_fh, vdev->vmdq_rx_q);
2733 			}
2734 
2735 			mbuf_destroy_zcp(vpool);
2736 			rte_free(vdev->regions_hpa);
2737 			rte_free(vdev);
2738 			return -1;
2739 		}
2740 
2741 	}
2742 
2743 	/* Reset the ready flag. */
2744 	vdev->ready = DEVICE_MAC_LEARNING;
2745 	vdev->remove = 0;
2746 
2747 	/* Find a suitable lcore to add the device. */
2748 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2749 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2750 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2751 			core_add = lcore;
2752 		}
2753 	}
2754 	/* Add device to lcore ll */
2755 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2756 	if (ll_dev == NULL) {
2757 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2758 		vdev->ready = DEVICE_SAFE_REMOVE;
2759 		destroy_device(dev);
2760 		if (vdev->regions_hpa)
2761 			rte_free(vdev->regions_hpa);
2762 		rte_free(vdev);
2763 		return -1;
2764 	}
2765 	ll_dev->vdev = vdev;
2766 	vdev->coreid = core_add;
2767 
2768 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2769 
2770 	/* Initialize device stats */
2771 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2772 
2773 	/* Disable notifications. */
2774 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2775 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2776 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2777 	dev->flags |= VIRTIO_DEV_RUNNING;
2778 
2779 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2780 
2781 	return 0;
2782 }
2783 
2784 /*
2785  * These callbacks allow devices to be added to the data core when configuration
2786  * has fully completed.
2787  */
2788 static const struct virtio_net_device_ops virtio_net_device_ops =
2789 {
2790 	.new_device =  new_device,
2791 	.destroy_device = destroy_device,
2792 };
2793 
2794 /*
2795  * This thread wakes up periodically to print statistics if the user has
2796  * enabled them.
2797  */
2798 static void
2799 print_stats(void)
2800 {
2801 	struct virtio_net_data_ll *dev_ll;
2802 	uint64_t tx_dropped, rx_dropped;
2803 	uint64_t tx, tx_total, rx, rx_total;
2804 	uint32_t device_fh;
2805 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2806 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2807 
2808 	while(1) {
2809 		sleep(enable_stats);
2810 
2811 		/* Clear screen and move to top left */
2812 		printf("%s%s", clr, top_left);
2813 
2814 		printf("\nDevice statistics ====================================");
2815 
2816 		dev_ll = ll_root_used;
2817 		while (dev_ll != NULL) {
2818 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2819 			tx_total = dev_statistics[device_fh].tx_total;
2820 			tx = dev_statistics[device_fh].tx;
2821 			tx_dropped = tx_total - tx;
2822 			if (zero_copy == 0) {
2823 				rx_total = rte_atomic64_read(
2824 					&dev_statistics[device_fh].rx_total_atomic);
2825 				rx = rte_atomic64_read(
2826 					&dev_statistics[device_fh].rx_atomic);
2827 			} else {
2828 				rx_total = dev_statistics[device_fh].rx_total;
2829 				rx = dev_statistics[device_fh].rx;
2830 			}
2831 			rx_dropped = rx_total - rx;
2832 
2833 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2834 					"\nTX total: 		%"PRIu64""
2835 					"\nTX dropped: 		%"PRIu64""
2836 					"\nTX successful: 		%"PRIu64""
2837 					"\nRX total: 		%"PRIu64""
2838 					"\nRX dropped: 		%"PRIu64""
2839 					"\nRX successful: 		%"PRIu64"",
2840 					device_fh,
2841 					tx_total,
2842 					tx_dropped,
2843 					tx,
2844 					rx_total,
2845 					rx_dropped,
2846 					rx);
2847 
2848 			dev_ll = dev_ll->next;
2849 		}
2850 		printf("\n======================================================\n");
2851 	}
2852 }
2853 
2854 static void
2855 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2856 	char *ring_name, uint32_t nb_mbuf)
2857 {
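	/*
	 * roomsize is handed to rte_pktmbuf_pool_init() as its opaque
	 * argument, which uses it as the mbuf data room size: one zero-copy
	 * descriptor length plus headroom.
	 */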
2858 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2859 	vpool_array[index].pool
2860 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2861 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2862 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2863 		rte_pktmbuf_init, NULL, socket, 0);
2864 	if (vpool_array[index].pool != NULL) {
2865 		vpool_array[index].ring
2866 			= rte_ring_create(ring_name,
2867 				rte_align32pow2(nb_mbuf + 1),
2868 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2869 		if (likely(vpool_array[index].ring != NULL)) {
2870 			LOG_DEBUG(VHOST_CONFIG,
2871 				"in setup_mempool_tbl: mbuf count in "
2872 				"mempool is: %d\n",
2873 				rte_mempool_count(vpool_array[index].pool));
2874 			LOG_DEBUG(VHOST_CONFIG,
2875 				"in setup_mempool_tbl: mbuf count in "
2876 				"ring   is: %d\n",
2877 				rte_ring_count(vpool_array[index].ring));
2878 		} else {
2879 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2880 				ring_name);
2881 		}
2882 
2883 		/* Need to take the headroom into account. */
2884 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2885 	} else {
2886 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2887 	}
2888 }
2889 
2890 
2891 /*
2892  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2893  * device is also registered here to handle the IOCTLs.
2894  */
2895 int
2896 main(int argc, char *argv[])
2897 {
2898 	struct rte_mempool *mbuf_pool = NULL;
2899 	unsigned lcore_id, core_id = 0;
2900 	unsigned nb_ports, valid_num_ports;
2901 	int ret;
2902 	uint8_t portid;
2903 	uint16_t queue_id;
2904 	static pthread_t tid;
2905 
2906 	/* init EAL */
2907 	ret = rte_eal_init(argc, argv);
2908 	if (ret < 0)
2909 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2910 	argc -= ret;
2911 	argv += ret;
2912 
2913 	/* parse app arguments */
2914 	ret = us_vhost_parse_args(argc, argv);
2915 	if (ret < 0)
2916 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2917 
2918 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2919 		if (rte_lcore_is_enabled(lcore_id))
2920 			lcore_ids[core_id ++] = lcore_id;
2921 
2922 	if (rte_lcore_count() > RTE_MAX_LCORE)
2923 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2924 
2925 	/* Set the number of switching cores available. */
2926 	num_switching_cores = rte_lcore_count()-1;
2927 
2928 	/* Get the number of physical ports. */
2929 	nb_ports = rte_eth_dev_count();
2930 	if (nb_ports > RTE_MAX_ETHPORTS)
2931 		nb_ports = RTE_MAX_ETHPORTS;
2932 
2933 	/*
2934 	 * Update the global variable num_ports and the global array ports,
2935 	 * and derive valid_num_ports from the number of system ports.
2936 	 */
2937 	valid_num_ports = check_ports_num(nb_ports);
2938 
2939 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2940 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2941 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2942 		return -1;
2943 	}
2944 
2945 	if (zero_copy == 0) {
2946 		/* Create the mbuf pool. */
2947 		mbuf_pool = rte_mempool_create(
2948 				"MBUF_POOL",
2949 				NUM_MBUFS_PER_PORT
2950 				* valid_num_ports,
2951 				MBUF_SIZE, MBUF_CACHE_SIZE,
2952 				sizeof(struct rte_pktmbuf_pool_private),
2953 				rte_pktmbuf_pool_init, NULL,
2954 				rte_pktmbuf_init, NULL,
2955 				rte_socket_id(), 0);
2956 		if (mbuf_pool == NULL)
2957 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2958 
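		/* Without zero copy, every queue shares the single mbuf pool. */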
2959 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2960 			vpool_array[queue_id].pool = mbuf_pool;
2961 
2962 		if (vm2vm_mode == VM2VM_HARDWARE) {
2963 			/* Enable VT loop back to let L2 switch to do it. */
2964 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2965 			LOG_DEBUG(VHOST_CONFIG,
2966 				"Enable loop back for L2 switch in vmdq.\n");
2967 		}
2968 	} else {
2969 		uint32_t nb_mbuf;
2970 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2971 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2972 
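		/*
		 * Size each zero-copy RX pool for the RX descriptor ring plus
		 * per-core cache and burst headroom.
		 */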
2973 		nb_mbuf = num_rx_descriptor
2974 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2975 			+ num_switching_cores * MAX_PKT_BURST;
2976 
2977 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2978 			snprintf(pool_name, sizeof(pool_name),
2979 				"rxmbuf_pool_%u", queue_id);
2980 			snprintf(ring_name, sizeof(ring_name),
2981 				"rxmbuf_ring_%u", queue_id);
2982 			setup_mempool_tbl(rte_socket_id(), queue_id,
2983 				pool_name, ring_name, nb_mbuf);
2984 		}
2985 
2986 		nb_mbuf = num_tx_descriptor
2987 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2988 				+ num_switching_cores * MAX_PKT_BURST;
2989 
2990 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2991 			snprintf(pool_name, sizeof(pool_name),
2992 				"txmbuf_pool_%u", queue_id);
2993 			snprintf(ring_name, sizeof(ring_name),
2994 				"txmbuf_ring_%u", queue_id);
2995 			setup_mempool_tbl(rte_socket_id(),
2996 				(queue_id + MAX_QUEUES),
2997 				pool_name, ring_name, nb_mbuf);
2998 		}
2999 
3000 		if (vm2vm_mode == VM2VM_HARDWARE) {
3001 			/* Enable VT loop back to let L2 switch to do it. */
3002 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3003 			LOG_DEBUG(VHOST_CONFIG,
3004 				"Enable loop back for L2 switch in vmdq.\n");
3005 		}
3006 	}
3007 	/* Set log level. */
3008 	rte_set_log_level(LOG_LEVEL);
3009 
3010 	/* initialize all ports */
3011 	for (portid = 0; portid < nb_ports; portid++) {
3012 		/* skip ports that are not enabled */
3013 		if ((enabled_port_mask & (1 << portid)) == 0) {
3014 			RTE_LOG(INFO, VHOST_PORT,
3015 				"Skipping disabled port %d\n", portid);
3016 			continue;
3017 		}
3018 		if (port_init(portid) != 0)
3019 			rte_exit(EXIT_FAILURE,
3020 				"Cannot initialize network ports\n");
3021 	}
3022 
3023 	/* Initialise all linked lists. */
3024 	if (init_data_ll() == -1)
3025 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3026 
3027 	/* Initialize device stats */
3028 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3029 
3030 	/* Enable stats if the user option is set. */
3031 	if (enable_stats)
3032 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3033 
3034 	/* Launch all data cores. */
3035 	if (zero_copy == 0) {
3036 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3037 			rte_eal_remote_launch(switch_worker,
3038 				mbuf_pool, lcore_id);
3039 		}
3040 	} else {
3041 		uint32_t count_in_mempool, index, i;
3042 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3043 			/* For all RX and TX queues. */
3044 			count_in_mempool
3045 				= rte_mempool_count(vpool_array[index].pool);
3046 
3047 			/*
3048 			 * Transfer all unattached mbufs from vpool.pool
3049 			 * to vpool.ring.
3050 			 */
3051 			for (i = 0; i < count_in_mempool; i++) {
3052 				struct rte_mbuf *mbuf
3053 					= __rte_mbuf_raw_alloc(
3054 						vpool_array[index].pool);
3055 				rte_ring_sp_enqueue(vpool_array[index].ring,
3056 						(void *)mbuf);
3057 			}
3058 
3059 			LOG_DEBUG(VHOST_CONFIG,
3060 				"in main: mbuf count in mempool at initial "
3061 				"is: %d\n", count_in_mempool);
3062 			LOG_DEBUG(VHOST_CONFIG,
3063 				"in main: mbuf count in  ring at initial  is :"
3064 				" %d\n",
3065 				rte_ring_count(vpool_array[index].ring));
3066 		}
3067 
3068 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3069 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3070 				lcore_id);
3071 	}
3072 
3073 	if (mergeable == 0)
3074 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3075 
3076 	/* Register CUSE device to handle IOCTLs. */
3077 	ret = rte_vhost_driver_register((char *)&dev_basename);
3078 	if (ret != 0)
3079 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3080 
3081 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3082 
3083 	/* Start CUSE session. */
3084 	rte_vhost_driver_session_start();
3085 	return 0;
3086 
3087 }
3088 
3089