xref: /dpdk/examples/vhost/main.c (revision e8b9ef877e4fd4bf723bb4d987e4bece5d276a88)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 
54 #include "main.h"
55 
56 #define MAX_QUEUES 512
57 
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60 
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  		\
65 							(num_switching_cores*MAX_PKT_BURST) +  			\
66 							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 							(num_switching_cores*MBUF_CACHE_SIZE))
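/*
 * Note on the sizing above: one RX ring's worth of mbufs for every queue,
 * plus, per switching core, one packet burst, one TX ring's worth and one
 * mempool cache's worth, so the pool cannot run dry while every core holds
 * its worst-case number of buffers.
 */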
68 
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71 
72 /*
73  * For the zero copy implementation no frame data buffers are allocated on
74  * the host; the guest allocates the frame data buffers and vhost uses them
75  * directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 	+ RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
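/*
 * The zero copy pools use no per-lcore mempool cache, so rte_mempool_count()
 * reflects all free mbufs and they can be drained and recycled
 * deterministically (see txmbuf_clean_zcp() and mbuf_destroy_zcp() below).
 */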
81 
82 #define MAX_PKT_BURST 32 		/* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 	/* TX drain every ~100us */
84 
85 #define BURST_RX_WAIT_US 15 	/* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
87 
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89 
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX			1
93 #define DEVICE_SAFE_REMOVE	2
94 
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98 
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102 
103 /*
104  * These two macros need refining for the legacy and DPDK based front ends:
105  * take the max vring avail descriptors/entries from the guest, subtract
106  * MAX_PKT_BURST, and then round to a power of 2.
107  */
108 /*
109  * For the legacy front end there are 128 descriptors:
110  * half for the virtio headers and the other half for the mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114 
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 		+ sizeof(struct rte_mbuf)))
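/*
 * The first 4 bytes of headroom (immediately after the struct rte_mbuf) are
 * used by the zero copy path to stash the guest descriptor index the mbuf's
 * buffer was attached to, so it can be returned to the used ring later.
 */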
118 
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
121 
122 #define INVALID_PORT_ID 0xFF
123 
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126 
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129 
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132 
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135 
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
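/*
 * Mask of the low 48 bits: ether_addr_cmp() below loads each 6-byte MAC as a
 * 64-bit word and XORs them, so the two bytes beyond the address must be
 * masked off before comparing.
 */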
138 
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
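/*
 * With a 16-byte struct vring_desc and a typical 64-byte cache line this is 4;
 * processing descriptors in multiples of it keeps accesses cache-line aligned.
 */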
141 
142 #define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
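/*
 * True when the mbuf's buf_addr does not point at its own embedded buffer,
 * i.e. the mbuf is attached to external (here: guest) memory.
 */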
143 
144 /* mask of enabled ports */
145 static uint32_t enabled_port_mask = 0;
146 
147 /* Promiscuous mode */
148 static uint32_t promiscuous;
149 
150 /*Number of switching cores enabled*/
151 static uint32_t num_switching_cores = 0;
152 
153 /* number of devices/queues to support*/
154 static uint32_t num_queues = 0;
155 static uint32_t num_devices;
156 
157 /*
158  * Enable zero copy: packet buffers are DMAed directly to/from the HW
159  * descriptors. Disabled by default.
160  */
161 static uint32_t zero_copy;
162 static int mergeable;
163 
164 /* Do VLAN strip on the host, enabled by default */
165 static uint32_t vlan_strip = 1;
166 
167 /* number of descriptors to apply*/
168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
170 
171 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
172 #define MAX_RING_DESC 4096
173 
174 struct vpool {
175 	struct rte_mempool *pool;
176 	struct rte_ring *ring;
177 	uint32_t buf_size;
178 } vpool_array[MAX_QUEUES+MAX_QUEUES];
179 
180 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
181 typedef enum {
182 	VM2VM_DISABLED = 0,
183 	VM2VM_SOFTWARE = 1,
184 	VM2VM_HARDWARE = 2,
185 	VM2VM_LAST
186 } vm2vm_type;
187 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
188 
189 /* The type of host physical address translated from guest physical address. */
190 typedef enum {
191 	PHYS_ADDR_CONTINUOUS = 0,
192 	PHYS_ADDR_CROSS_SUBREG = 1,
193 	PHYS_ADDR_INVALID = 2,
194 	PHYS_ADDR_LAST
195 } hpa_type;
196 
197 /* Enable stats. */
198 static uint32_t enable_stats = 0;
199 /* Enable retries on RX. */
200 static uint32_t enable_retry = 1;
201 /* Specify timeout (in useconds) between retries on RX. */
202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
203 /* Specify the number of retries on RX. */
204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
205 
206 /* Character device basename. Can be set by user. */
207 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
208 
209 /* Empty VMDQ configuration structure. Filled in programmatically. */
210 static struct rte_eth_conf vmdq_conf_default = {
211 	.rxmode = {
212 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
213 		.split_hdr_size = 0,
214 		.header_split   = 0, /**< Header Split disabled */
215 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
216 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
217 		/*
218 		 * VLAN strip is necessary for 1G NICs such as the I350;
219 		 * it fixes a bug where IPv4 forwarding in the guest could not
220 		 * forward packets from one virtio dev to another virtio dev.
221 		 */
222 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
223 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
224 		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
225 	},
226 
227 	.txmode = {
228 		.mq_mode = ETH_MQ_TX_NONE,
229 	},
230 	.rx_adv_conf = {
231 		/*
232 		 * should be overridden separately in code with
233 		 * appropriate values
234 		 */
235 		.vmdq_rx_conf = {
236 			.nb_queue_pools = ETH_8_POOLS,
237 			.enable_default_pool = 0,
238 			.default_pool = 0,
239 			.nb_pool_maps = 0,
240 			.pool_map = {{0, 0},},
241 		},
242 	},
243 };
244 
245 static unsigned lcore_ids[RTE_MAX_LCORE];
246 static uint8_t ports[RTE_MAX_ETHPORTS];
247 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
248 static uint16_t num_pf_queues, num_vmdq_queues;
249 static uint16_t vmdq_pool_base, vmdq_queue_base;
250 static uint16_t queues_per_pool;
251 
252 static const uint16_t external_pkt_default_vlan_tag = 2000;
253 const uint16_t vlan_tags[] = {
254 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
255 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
256 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
257 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
258 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
259 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
260 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
261 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
262 };
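/*
 * A virtio device with device_fh N is given vlan_tags[N], so each VMDQ pool is
 * keyed by its own VLAN tag (see link_vmdq() and get_eth_conf()).
 */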
263 
264 /* ethernet addresses of ports */
265 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
266 
267 /* heads for the main used and free linked lists for the data path. */
268 static struct virtio_net_data_ll *ll_root_used = NULL;
269 static struct virtio_net_data_ll *ll_root_free = NULL;
270 
271 /* Array of data core structures containing information on individual core linked lists. */
272 static struct lcore_info lcore_info[RTE_MAX_LCORE];
273 
274 /* Used for queueing bursts of TX packets. */
275 struct mbuf_table {
276 	unsigned len;
277 	unsigned txq_id;
278 	struct rte_mbuf *m_table[MAX_PKT_BURST];
279 };
280 
281 /* TX queue for each data core. */
282 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
283 
284 /* TX queue for each virtio device for zero copy. */
285 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
286 
287 /* Vlan header struct used to insert vlan tags on TX. */
288 struct vlan_ethhdr {
289 	unsigned char   h_dest[ETH_ALEN];
290 	unsigned char   h_source[ETH_ALEN];
291 	__be16          h_vlan_proto;
292 	__be16          h_vlan_TCI;
293 	__be16          h_vlan_encapsulated_proto;
294 };
295 
296 /* IPv4 Header */
297 struct ipv4_hdr {
298 	uint8_t  version_ihl;		/**< version and header length */
299 	uint8_t  type_of_service;	/**< type of service */
300 	uint16_t total_length;		/**< length of packet */
301 	uint16_t packet_id;		/**< packet ID */
302 	uint16_t fragment_offset;	/**< fragmentation offset */
303 	uint8_t  time_to_live;		/**< time to live */
304 	uint8_t  next_proto_id;		/**< protocol ID */
305 	uint16_t hdr_checksum;		/**< header checksum */
306 	uint32_t src_addr;		/**< source address */
307 	uint32_t dst_addr;		/**< destination address */
308 } __attribute__((__packed__));
309 
310 /* Header lengths. */
311 #define VLAN_HLEN       4
312 #define VLAN_ETH_HLEN   18
313 
314 /* Per-device statistics struct */
315 struct device_statistics {
316 	uint64_t tx_total;
317 	rte_atomic64_t rx_total_atomic;
318 	uint64_t rx_total;
319 	uint64_t tx;
320 	rte_atomic64_t rx_atomic;
321 	uint64_t rx;
322 } __rte_cache_aligned;
323 struct device_statistics dev_statistics[MAX_DEVICES];
324 
325 /*
326  * Builds up the correct configuration for VMDQ VLAN pool map
327  * according to the pool & queue limits.
328  */
329 static inline int
330 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
331 {
332 	struct rte_eth_vmdq_rx_conf conf;
333 	struct rte_eth_vmdq_rx_conf *def_conf =
334 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
335 	unsigned i;
336 
337 	memset(&conf, 0, sizeof(conf));
338 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
339 	conf.nb_pool_maps = num_devices;
340 	conf.enable_loop_back = def_conf->enable_loop_back;
341 	conf.rx_mode = def_conf->rx_mode;
342 
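	/*
	 * Map one VLAN tag to exactly one pool: packets tagged with vlan_tags[i]
	 * are steered to VMDQ pool i, i.e. to virtio device i.
	 */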
343 	for (i = 0; i < conf.nb_pool_maps; i++) {
344 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
345 		conf.pool_map[i].pools = (1UL << i);
346 	}
347 
348 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
349 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
350 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
351 	return 0;
352 }
353 
354 /*
355  * Validate the device number against the max pool number obtained from
356  * dev_info. If the device number is invalid, print an error message and
357  * return -1. Each device must have its own pool.
358  */
359 static inline int
360 validate_num_devices(uint32_t max_nb_devices)
361 {
362 	if (num_devices > max_nb_devices) {
363 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
364 		return -1;
365 	}
366 	return 0;
367 }
368 
369 /*
370  * Initialises a given port using global settings and with the rx buffers
371  * coming from the mbuf_pool passed as parameter
372  */
373 static inline int
374 port_init(uint8_t port)
375 {
376 	struct rte_eth_dev_info dev_info;
377 	struct rte_eth_conf port_conf;
378 	struct rte_eth_rxconf *rxconf;
379 	struct rte_eth_txconf *txconf;
380 	int16_t rx_rings, tx_rings;
381 	uint16_t rx_ring_size, tx_ring_size;
382 	int retval;
383 	uint16_t q;
384 
385 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
386 	rte_eth_dev_info_get (port, &dev_info);
387 
388 	if (dev_info.max_rx_queues > MAX_QUEUES) {
389 		rte_exit(EXIT_FAILURE,
390 			"please define MAX_QUEUES no less than %u in %s\n",
391 			dev_info.max_rx_queues, __FILE__);
392 	}
393 
394 	rxconf = &dev_info.default_rxconf;
395 	txconf = &dev_info.default_txconf;
396 	rxconf->rx_drop_en = 1;
397 
398 	/* Enable vlan offload */
399 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
400 
401 	/*
402 	 * Zero copy defers queue RX/TX start to the time when guest
403 	 * finishes its startup and packet buffers from that guest are
404 	 * available.
405 	 */
406 	if (zero_copy) {
407 		rxconf->rx_deferred_start = 1;
408 		rxconf->rx_drop_en = 0;
409 		txconf->tx_deferred_start = 1;
410 	}
411 
412 	/*configure the number of supported virtio devices based on VMDQ limits */
413 	num_devices = dev_info.max_vmdq_pools;
414 
415 	if (zero_copy) {
416 		rx_ring_size = num_rx_descriptor;
417 		tx_ring_size = num_tx_descriptor;
418 		tx_rings = dev_info.max_tx_queues;
419 	} else {
420 		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
421 		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
422 		tx_rings = (uint16_t)rte_lcore_count();
423 	}
424 
425 	retval = validate_num_devices(MAX_DEVICES);
426 	if (retval < 0)
427 		return retval;
428 
429 	/* Get port configuration. */
430 	retval = get_eth_conf(&port_conf, num_devices);
431 	if (retval < 0)
432 		return retval;
433 	/* NIC queues are divided into pf queues and vmdq queues.  */
434 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
435 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
436 	num_vmdq_queues = num_devices * queues_per_pool;
437 	num_queues = num_pf_queues + num_vmdq_queues;
438 	vmdq_queue_base = dev_info.vmdq_queue_base;
439 	vmdq_pool_base  = dev_info.vmdq_pool_base;
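	/*
	 * The PMD reports where the VMDQ region starts: the pool and queue
	 * indices used later are offsets on top of vmdq_pool_base and
	 * vmdq_queue_base, with any PF queues occupying the range before them.
	 */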
440 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
441 		num_pf_queues, num_devices, queues_per_pool);
442 
443 	if (port >= rte_eth_dev_count()) return -1;
444 
445 	rx_rings = (uint16_t)dev_info.max_rx_queues;
446 	/* Configure ethernet device. */
447 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
448 	if (retval != 0)
449 		return retval;
450 
451 	/* Setup the queues. */
452 	for (q = 0; q < rx_rings; q ++) {
453 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
454 						rte_eth_dev_socket_id(port),
455 						rxconf,
456 						vpool_array[q].pool);
457 		if (retval < 0)
458 			return retval;
459 	}
460 	for (q = 0; q < tx_rings; q ++) {
461 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 						rte_eth_dev_socket_id(port),
463 						txconf);
464 		if (retval < 0)
465 			return retval;
466 	}
467 
468 	/* Start the device. */
469 	retval  = rte_eth_dev_start(port);
470 	if (retval < 0) {
471 		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
472 		return retval;
473 	}
474 
475 	if (promiscuous)
476 		rte_eth_promiscuous_enable(port);
477 
478 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
479 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
480 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
481 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
482 			(unsigned)port,
483 			vmdq_ports_eth_addr[port].addr_bytes[0],
484 			vmdq_ports_eth_addr[port].addr_bytes[1],
485 			vmdq_ports_eth_addr[port].addr_bytes[2],
486 			vmdq_ports_eth_addr[port].addr_bytes[3],
487 			vmdq_ports_eth_addr[port].addr_bytes[4],
488 			vmdq_ports_eth_addr[port].addr_bytes[5]);
489 
490 	return 0;
491 }
492 
493 /*
494  * Set character device basename.
495  */
496 static int
497 us_vhost_parse_basename(const char *q_arg)
498 {
499 	/* copy the basename string */
500 
501 	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
502 		return -1;
503 	else
504 		snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
505 
506 	return 0;
507 }
508 
509 /*
510  * Parse the portmask provided at run time.
511  */
512 static int
513 parse_portmask(const char *portmask)
514 {
515 	char *end = NULL;
516 	unsigned long pm;
517 
518 	errno = 0;
519 
520 	/* parse hexadecimal string */
521 	pm = strtoul(portmask, &end, 16);
522 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
523 		return -1;
524 
525 	if (pm == 0)
526 		return -1;
527 
528 	return pm;
529 
530 }
531 
532 /*
533  * Parse num options at run time.
534  */
535 static int
536 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
537 {
538 	char *end = NULL;
539 	unsigned long num;
540 
541 	errno = 0;
542 
543 	/* parse unsigned int string */
544 	num = strtoul(q_arg, &end, 10);
545 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
546 		return -1;
547 
548 	if (num > max_valid_value)
549 		return -1;
550 
551 	return num;
552 
553 }
554 
555 /*
556  * Display usage
557  */
558 static void
559 us_vhost_usage(const char *prgname)
560 {
561 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
562 	"		--vm2vm [0|1|2]\n"
563 	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
564 	"		--dev-basename <name>\n"
565 	"		--nb-devices ND\n"
566 	"		-p PORTMASK: Set mask for ports to be used by application\n"
567 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
568 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
569 	"		--rx-retry-delay [0-N]: timeout (in useconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
570 	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
571 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
572 	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
573 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574 	"		--dev-basename: The basename to be used for the character device.\n"
575 	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
576 			"zero copy\n"
577 	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
578 			"used only when zero copy is enabled.\n"
579 	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
580 			"used only when zero copy is enabled.\n",
581 	       prgname);
582 }
583 
584 /*
585  * Parse the arguments given in the command line of the application.
586  */
587 static int
588 us_vhost_parse_args(int argc, char **argv)
589 {
590 	int opt, ret;
591 	int option_index;
592 	unsigned i;
593 	const char *prgname = argv[0];
594 	static struct option long_option[] = {
595 		{"vm2vm", required_argument, NULL, 0},
596 		{"rx-retry", required_argument, NULL, 0},
597 		{"rx-retry-delay", required_argument, NULL, 0},
598 		{"rx-retry-num", required_argument, NULL, 0},
599 		{"mergeable", required_argument, NULL, 0},
600 		{"vlan-strip", required_argument, NULL, 0},
601 		{"stats", required_argument, NULL, 0},
602 		{"dev-basename", required_argument, NULL, 0},
603 		{"zero-copy", required_argument, NULL, 0},
604 		{"rx-desc-num", required_argument, NULL, 0},
605 		{"tx-desc-num", required_argument, NULL, 0},
606 		{NULL, 0, 0, 0},
607 	};
608 
609 	/* Parse command line */
610 	while ((opt = getopt_long(argc, argv, "p:P",
611 			long_option, &option_index)) != EOF) {
612 		switch (opt) {
613 		/* Portmask */
614 		case 'p':
615 			enabled_port_mask = parse_portmask(optarg);
616 			if (enabled_port_mask == 0) {
617 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
618 				us_vhost_usage(prgname);
619 				return -1;
620 			}
621 			break;
622 
623 		case 'P':
624 			promiscuous = 1;
625 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
626 				ETH_VMDQ_ACCEPT_BROADCAST |
627 				ETH_VMDQ_ACCEPT_MULTICAST;
628 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
629 
630 			break;
631 
632 		case 0:
633 			/* Enable/disable vm2vm comms. */
634 			if (!strncmp(long_option[option_index].name, "vm2vm",
635 				MAX_LONG_OPT_SZ)) {
636 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
637 				if (ret == -1) {
638 					RTE_LOG(INFO, VHOST_CONFIG,
639 						"Invalid argument for "
640 						"vm2vm [0|1|2]\n");
641 					us_vhost_usage(prgname);
642 					return -1;
643 				} else {
644 					vm2vm_mode = (vm2vm_type)ret;
645 				}
646 			}
647 
648 			/* Enable/disable retries on RX. */
649 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
650 				ret = parse_num_opt(optarg, 1);
651 				if (ret == -1) {
652 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
653 					us_vhost_usage(prgname);
654 					return -1;
655 				} else {
656 					enable_retry = ret;
657 				}
658 			}
659 
660 			/* Specify the retry delay time (in useconds) on RX. */
661 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
662 				ret = parse_num_opt(optarg, INT32_MAX);
663 				if (ret == -1) {
664 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
665 					us_vhost_usage(prgname);
666 					return -1;
667 				} else {
668 					burst_rx_delay_time = ret;
669 				}
670 			}
671 
672 			/* Specify the retries number on RX. */
673 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
674 				ret = parse_num_opt(optarg, INT32_MAX);
675 				if (ret == -1) {
676 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
677 					us_vhost_usage(prgname);
678 					return -1;
679 				} else {
680 					burst_rx_retry_num = ret;
681 				}
682 			}
683 
684 			/* Enable/disable RX mergeable buffers. */
685 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
686 				ret = parse_num_opt(optarg, 1);
687 				if (ret == -1) {
688 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
689 					us_vhost_usage(prgname);
690 					return -1;
691 				} else {
692 					mergeable = !!ret;
693 					if (ret) {
694 						vmdq_conf_default.rxmode.jumbo_frame = 1;
695 						vmdq_conf_default.rxmode.max_rx_pkt_len
696 							= JUMBO_FRAME_MAX_SIZE;
697 					}
698 				}
699 			}
700 
701 			/* Enable/disable RX VLAN strip on host. */
702 			if (!strncmp(long_option[option_index].name,
703 				"vlan-strip", MAX_LONG_OPT_SZ)) {
704 				ret = parse_num_opt(optarg, 1);
705 				if (ret == -1) {
706 					RTE_LOG(INFO, VHOST_CONFIG,
707 						"Invalid argument for VLAN strip [0|1]\n");
708 					us_vhost_usage(prgname);
709 					return -1;
710 				} else {
711 					vlan_strip = !!ret;
712 					vmdq_conf_default.rxmode.hw_vlan_strip =
713 						vlan_strip;
714 				}
715 			}
716 
717 			/* Enable/disable stats. */
718 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
719 				ret = parse_num_opt(optarg, INT32_MAX);
720 				if (ret == -1) {
721 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
722 					us_vhost_usage(prgname);
723 					return -1;
724 				} else {
725 					enable_stats = ret;
726 				}
727 			}
728 
729 			/* Set character device basename. */
730 			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
731 				if (us_vhost_parse_basename(optarg) == -1) {
732 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
733 					us_vhost_usage(prgname);
734 					return -1;
735 				}
736 			}
737 
738 			/* Enable/disable rx/tx zero copy. */
739 			if (!strncmp(long_option[option_index].name,
740 				"zero-copy", MAX_LONG_OPT_SZ)) {
741 				ret = parse_num_opt(optarg, 1);
742 				if (ret == -1) {
743 					RTE_LOG(INFO, VHOST_CONFIG,
744 						"Invalid argument"
745 						" for zero-copy [0|1]\n");
746 					us_vhost_usage(prgname);
747 					return -1;
748 				} else
749 					zero_copy = ret;
750 
751 				if (zero_copy) {
752 #ifdef RTE_MBUF_REFCNT
753 					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
754 					"zero copy vhost APP, please "
755 					"disable RTE_MBUF_REFCNT\n"
756 					"in config file and then rebuild DPDK "
757 					"core lib!\n"
758 					"Otherwise please disable zero copy "
759 					"flag in command line!\n");
760 					return -1;
761 #endif
762 				}
763 			}
764 
765 			/* Specify the descriptor number on RX. */
766 			if (!strncmp(long_option[option_index].name,
767 				"rx-desc-num", MAX_LONG_OPT_SZ)) {
768 				ret = parse_num_opt(optarg, MAX_RING_DESC);
769 				if ((ret == -1) || (!POWEROF2(ret))) {
770 					RTE_LOG(INFO, VHOST_CONFIG,
771 					"Invalid argument for rx-desc-num[0-N],"
772 					"power of 2 required.\n");
773 					us_vhost_usage(prgname);
774 					return -1;
775 				} else {
776 					num_rx_descriptor = ret;
777 				}
778 			}
779 
780 			/* Specify the descriptor number on TX. */
781 			if (!strncmp(long_option[option_index].name,
782 				"tx-desc-num", MAX_LONG_OPT_SZ)) {
783 				ret = parse_num_opt(optarg, MAX_RING_DESC);
784 				if ((ret == -1) || (!POWEROF2(ret))) {
785 					RTE_LOG(INFO, VHOST_CONFIG,
786 					"Invalid argument for tx-desc-num [0-N],"
787 					"power of 2 required.\n");
788 					us_vhost_usage(prgname);
789 					return -1;
790 				} else {
791 					num_tx_descriptor = ret;
792 				}
793 			}
794 
795 			break;
796 
797 			/* Invalid option - print options. */
798 		default:
799 			us_vhost_usage(prgname);
800 			return -1;
801 		}
802 	}
803 
804 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
805 		if (enabled_port_mask & (1 << i))
806 			ports[num_ports++] = (uint8_t)i;
807 	}
808 
809 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
810 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
811 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
812 		return -1;
813 	}
814 
815 	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
816 		RTE_LOG(INFO, VHOST_PORT,
817 			"Vhost zero copy doesn't support software vm2vm,"
818 			" please specify 'vm2vm 2' to use hardware vm2vm.\n");
819 		return -1;
820 	}
821 
822 	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
823 		RTE_LOG(INFO, VHOST_PORT,
824 			"Vhost zero copy doesn't support jumbo frame,"
825 			" please specify '--mergeable 0' to disable the "
826 			"mergeable feature.\n");
827 		return -1;
828 	}
829 
830 	return 0;
831 }
832 
833 /*
834  * Update the global variable num_ports and the ports[] array according to
835  * the number of ports in the system, and return the number of valid ports.
836  */
837 static unsigned check_ports_num(unsigned nb_ports)
838 {
839 	unsigned valid_num_ports = num_ports;
840 	unsigned portid;
841 
842 	if (num_ports > nb_ports) {
843 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
844 			num_ports, nb_ports);
845 		num_ports = nb_ports;
846 	}
847 
848 	for (portid = 0; portid < num_ports; portid ++) {
849 		if (ports[portid] >= nb_ports) {
850 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
851 				ports[portid], (nb_ports - 1));
852 			ports[portid] = INVALID_PORT_ID;
853 			valid_num_ports--;
854 		}
855 	}
856 	return valid_num_ports;
857 }
858 
859 /*
860  * Macro to print out packet contents. Wrapped in debug define so that the
861  * data path is not affected when debug is disabled.
862  */
863 #ifdef DEBUG
864 #define PRINT_PACKET(device, addr, size, header) do {																\
865 	char *pkt_addr = (char*)(addr);																					\
866 	unsigned int index;																								\
867 	char packet[MAX_PRINT_BUFF];																					\
868 																													\
869 	if ((header))																									\
870 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));				\
871 	else																											\
872 		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));				\
873 	for (index = 0; index < (size); index++) {																		\
874 		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),	\
875 			"%02hhx ", pkt_addr[index]);																			\
876 	}																												\
877 	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");	\
878 																													\
879 	LOG_DEBUG(VHOST_DATA, "%s", packet);																					\
880 } while(0)
881 #else
882 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
883 #endif
884 
885 /*
886  * Function to convert guest physical addresses to vhost physical addresses.
887  * This is used to convert virtio buffer addresses.
888  */
889 static inline uint64_t __attribute__((always_inline))
890 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
891 	uint32_t buf_len, hpa_type *addr_type)
892 {
893 	struct virtio_memory_regions_hpa *region;
894 	uint32_t regionidx;
895 	uint64_t vhost_pa = 0;
896 
897 	*addr_type = PHYS_ADDR_INVALID;
898 
899 	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
900 		region = &vdev->regions_hpa[regionidx];
901 		if ((guest_pa >= region->guest_phys_address) &&
902 			(guest_pa <= region->guest_phys_address_end)) {
903 			vhost_pa = region->host_phys_addr_offset + guest_pa;
904 			if (likely((guest_pa + buf_len - 1)
905 				<= region->guest_phys_address_end))
906 				*addr_type = PHYS_ADDR_CONTINUOUS;
907 			else
908 				*addr_type = PHYS_ADDR_CROSS_SUBREG;
909 			break;
910 		}
911 	}
912 
913 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
914 		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
915 		(void *)(uintptr_t)vhost_pa);
916 
917 	return vhost_pa;
918 }
919 
920 /*
921  * Compares a packet destination MAC address to a device MAC address.
922  */
923 static inline int __attribute__((always_inline))
924 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
925 {
926 	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
927 }
928 
929 /*
930  * This function learns the MAC address of the device and registers this along with a
931  * vlan tag to a VMDQ.
932  */
933 static int
934 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
935 {
936 	struct ether_hdr *pkt_hdr;
937 	struct virtio_net_data_ll *dev_ll;
938 	struct virtio_net *dev = vdev->dev;
939 	int i, ret;
940 
941 	/* Learn MAC address of guest device from packet */
942 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
943 
944 	dev_ll = ll_root_used;
945 
946 	while (dev_ll != NULL) {
947 		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
948 			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
949 			return -1;
950 		}
951 		dev_ll = dev_ll->next;
952 	}
953 
954 	for (i = 0; i < ETHER_ADDR_LEN; i++)
955 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
956 
957 	/* vlan_tag currently uses the device_id. */
958 	vdev->vlan_tag = vlan_tags[dev->device_fh];
959 
960 	/* Print out VMDQ registration info. */
961 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
962 		dev->device_fh,
963 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
964 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
965 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
966 		vdev->vlan_tag);
967 
968 	/* Register the MAC address. */
969 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
970 				(uint32_t)dev->device_fh + vmdq_pool_base);
971 	if (ret)
972 		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
973 					dev->device_fh);
974 
975 	/* Enable stripping of the vlan tag as we handle routing. */
976 	if (vlan_strip)
977 		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
978 			(uint16_t)vdev->vmdq_rx_q, 1);
979 
980 	/* Set device as ready for RX. */
981 	vdev->ready = DEVICE_RX;
982 
983 	return 0;
984 }
985 
986 /*
987  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
988  * queue before disabling RX on the device.
989  */
990 static inline void
991 unlink_vmdq(struct vhost_dev *vdev)
992 {
993 	unsigned i = 0;
994 	unsigned rx_count;
995 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
996 
997 	if (vdev->ready == DEVICE_RX) {
998 		/*clear MAC and VLAN settings*/
999 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1000 		for (i = 0; i < 6; i++)
1001 			vdev->mac_address.addr_bytes[i] = 0;
1002 
1003 		vdev->vlan_tag = 0;
1004 
1005 		/*Clear out the receive buffers*/
1006 		rx_count = rte_eth_rx_burst(ports[0],
1007 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1008 
1009 		while (rx_count) {
1010 			for (i = 0; i < rx_count; i++)
1011 				rte_pktmbuf_free(pkts_burst[i]);
1012 
1013 			rx_count = rte_eth_rx_burst(ports[0],
1014 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1015 		}
1016 
1017 		vdev->ready = DEVICE_MAC_LEARNING;
1018 	}
1019 }
1020 
1021 /*
1022  * Check if the packet destination MAC address is for a local device. If so then put
1023  * the packet on that device's RX queue. If not then return.
1024  */
1025 static inline int __attribute__((always_inline))
1026 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1027 {
1028 	struct virtio_net_data_ll *dev_ll;
1029 	struct ether_hdr *pkt_hdr;
1030 	uint64_t ret = 0;
1031 	struct virtio_net *dev = vdev->dev;
1032 	struct virtio_net *tdev; /* destination virtio device */
1033 
1034 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1035 
1036 	/*get the used devices list*/
1037 	dev_ll = ll_root_used;
1038 
1039 	while (dev_ll != NULL) {
1040 		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1041 				          &dev_ll->vdev->mac_address)) {
1042 
1043 			/* Drop the packet if the TX packet is destined for the TX device. */
1044 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1045 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1046 							dev->device_fh);
1047 				return 0;
1048 			}
1049 			tdev = dev_ll->vdev->dev;
1050 
1051 
1052 			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1053 
1054 			if (unlikely(dev_ll->vdev->remove)) {
1055 				/*drop the packet if the device is marked for removal*/
1056 				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1057 			} else {
1058 				/*send the packet to the local virtio device*/
1059 				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1060 				if (enable_stats) {
1061 					rte_atomic64_add(
1062 					&dev_statistics[tdev->device_fh].rx_total_atomic,
1063 					1);
1064 					rte_atomic64_add(
1065 					&dev_statistics[tdev->device_fh].rx_atomic,
1066 					ret);
1067 					dev_statistics[tdev->device_fh].tx_total++;
1068 					dev_statistics[tdev->device_fh].tx += ret;
1069 				}
1070 			}
1071 
1072 			return 0;
1073 		}
1074 		dev_ll = dev_ll->next;
1075 	}
1076 
1077 	return -1;
1078 }
1079 
1080 /*
1081  * Check if the destination MAC of a packet is one local VM,
1082  * and get its vlan tag, and offset if it is.
1083  */
1084 static inline int __attribute__((always_inline))
1085 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1086 	uint32_t *offset, uint16_t *vlan_tag)
1087 {
1088 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1089 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1090 
1091 	while (dev_ll != NULL) {
1092 		if ((dev_ll->vdev->ready == DEVICE_RX)
1093 			&& ether_addr_cmp(&(pkt_hdr->d_addr),
1094 		&dev_ll->vdev->mac_address)) {
1095 			/*
1096 			 * Drop the packet if the TX packet is
1097 			 * destined for the TX device.
1098 			 */
1099 			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1100 				LOG_DEBUG(VHOST_DATA,
1101 				"(%"PRIu64") TX: Source and destination"
1102 				" MAC addresses are the same. Dropping "
1103 				"packet.\n",
1104 				dev_ll->vdev->dev->device_fh);
1105 				return -1;
1106 			}
1107 
1108 			/*
1109 			 * HW VLAN strip will reduce the packet length
1110 			 * by the length of the VLAN tag, so restore
1111 			 * the packet length by adding it back.
1112 			 */
1113 			*offset = VLAN_HLEN;
1114 			*vlan_tag =
1115 			(uint16_t)
1116 			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1117 
1118 			LOG_DEBUG(VHOST_DATA,
1119 			"(%"PRIu64") TX: pkt to local VM device id:"
1120 			"(%"PRIu64") vlan tag: %d.\n",
1121 			dev->device_fh, dev_ll->vdev->dev->device_fh,
1122 			vlan_tag);
1123 
1124 			break;
1125 		}
1126 		dev_ll = dev_ll->next;
1127 	}
1128 	return 0;
1129 }
1130 
1131 /*
1132  * This function routes the TX packet to the correct interface. This may be a local device
1133  * or the physical port.
1134  */
1135 static inline void __attribute__((always_inline))
1136 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1137 {
1138 	struct mbuf_table *tx_q;
1139 	struct rte_mbuf **m_table;
1140 	unsigned len, ret, offset = 0;
1141 	const uint16_t lcore_id = rte_lcore_id();
1142 	struct virtio_net *dev = vdev->dev;
1143 	struct ether_hdr *nh;
1144 
1145 	/*check if destination is local VM*/
1146 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1147 		rte_pktmbuf_free(m);
1148 		return;
1149 	}
1150 
1151 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1152 		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1153 			rte_pktmbuf_free(m);
1154 			return;
1155 		}
1156 	}
1157 
1158 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1159 
1160 	/*Add packet to the port tx queue*/
1161 	tx_q = &lcore_tx_queue[lcore_id];
1162 	len = tx_q->len;
1163 
1164 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1165 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1166 		/* Guest has inserted the vlan tag. */
1167 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1168 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1169 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1170 			(vh->vlan_tci != vlan_tag_be))
1171 			vh->vlan_tci = vlan_tag_be;
1172 	} else {
1173 		m->ol_flags = PKT_TX_VLAN_PKT;
1174 
1175 		/*
1176 		 * Find the right seg to adjust the data len when offset is
1177 		 * bigger than tail room size.
1178 		 */
1179 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1180 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1181 				m->data_len += offset;
1182 			else {
1183 				struct rte_mbuf *seg = m;
1184 
1185 				while ((seg->next != NULL) &&
1186 					(offset > rte_pktmbuf_tailroom(seg)))
1187 					seg = seg->next;
1188 
1189 				seg->data_len += offset;
1190 			}
1191 			m->pkt_len += offset;
1192 		}
1193 
1194 		m->vlan_tci = vlan_tag;
1195 	}
1196 
1197 	tx_q->m_table[len] = m;
1198 	len++;
1199 	if (enable_stats) {
1200 		dev_statistics[dev->device_fh].tx_total++;
1201 		dev_statistics[dev->device_fh].tx++;
1202 	}
1203 
1204 	if (unlikely(len == MAX_PKT_BURST)) {
1205 		m_table = (struct rte_mbuf **)tx_q->m_table;
1206 		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1207 		/* Free any buffers not handled by TX and update the port stats. */
1208 		if (unlikely(ret < len)) {
1209 			do {
1210 				rte_pktmbuf_free(m_table[ret]);
1211 			} while (++ret < len);
1212 		}
1213 
1214 		len = 0;
1215 	}
1216 
1217 	tx_q->len = len;
1218 	return;
1219 }
1220 /*
1221  * This function is called by each data core. It handles all RX/TX registered with the
1222  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1223  * with all devices in the main linked list.
1224  */
1225 static int
1226 switch_worker(void *arg)
1227 {
1228 	struct rte_mempool *mbuf_pool = arg;
1229 	struct virtio_net *dev = NULL;
1230 	struct vhost_dev *vdev = NULL;
1231 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1232 	struct virtio_net_data_ll *dev_ll;
1233 	struct mbuf_table *tx_q;
1234 	volatile struct lcore_ll_info *lcore_ll;
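	/* TX drain period converted from microseconds to TSC cycles. */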
1235 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1236 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1237 	unsigned ret, i;
1238 	const uint16_t lcore_id = rte_lcore_id();
1239 	const uint16_t num_cores = (uint16_t)rte_lcore_count();
1240 	uint16_t rx_count = 0;
1241 	uint16_t tx_count;
1242 	uint32_t retry = 0;
1243 
1244 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1245 	lcore_ll = lcore_info[lcore_id].lcore_ll;
1246 	prev_tsc = 0;
1247 
1248 	tx_q = &lcore_tx_queue[lcore_id];
1249 	for (i = 0; i < num_cores; i ++) {
1250 		if (lcore_ids[i] == lcore_id) {
1251 			tx_q->txq_id = i;
1252 			break;
1253 		}
1254 	}
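	/*
	 * Each data core transmits on its own NIC TX queue (indexed by its
	 * position in lcore_ids), so no locking is needed on the TX path.
	 */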
1255 
1256 	while(1) {
1257 		cur_tsc = rte_rdtsc();
1258 		/*
1259 		 * TX burst queue drain
1260 		 */
1261 		diff_tsc = cur_tsc - prev_tsc;
1262 		if (unlikely(diff_tsc > drain_tsc)) {
1263 
1264 			if (tx_q->len) {
1265 				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1266 
1267 				/*Tx any packets in the queue*/
1268 				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1269 									   (struct rte_mbuf **)tx_q->m_table,
1270 									   (uint16_t)tx_q->len);
1271 				if (unlikely(ret < tx_q->len)) {
1272 					do {
1273 						rte_pktmbuf_free(tx_q->m_table[ret]);
1274 					} while (++ret < tx_q->len);
1275 				}
1276 
1277 				tx_q->len = 0;
1278 			}
1279 
1280 			prev_tsc = cur_tsc;
1281 
1282 		}
1283 
1284 		rte_prefetch0(lcore_ll->ll_root_used);
1285 		/*
1286 		 * Inform the configuration core that we have exited the linked list and that no devices are
1287 		 * in use if requested.
1288 		 */
1289 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1290 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1291 
1292 		/*
1293 		 * Process devices
1294 		 */
1295 		dev_ll = lcore_ll->ll_root_used;
1296 
1297 		while (dev_ll != NULL) {
1298 			/*get virtio device ID*/
1299 			vdev = dev_ll->vdev;
1300 			dev = vdev->dev;
1301 
1302 			if (unlikely(vdev->remove)) {
1303 				dev_ll = dev_ll->next;
1304 				unlink_vmdq(vdev);
1305 				vdev->ready = DEVICE_SAFE_REMOVE;
1306 				continue;
1307 			}
1308 			if (likely(vdev->ready == DEVICE_RX)) {
1309 				/*Handle guest RX*/
1310 				rx_count = rte_eth_rx_burst(ports[0],
1311 					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1312 
1313 				if (rx_count) {
1314 					/*
1315 					* If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1316 					* Here MAX_PKT_BURST must be less than the virtio queue size.
1317 					*/
1318 					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1319 						for (retry = 0; retry < burst_rx_retry_num; retry++) {
1320 							rte_delay_us(burst_rx_delay_time);
1321 							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1322 								break;
1323 						}
1324 					}
1325 					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1326 					if (enable_stats) {
1327 						rte_atomic64_add(
1328 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1329 						rx_count);
1330 						rte_atomic64_add(
1331 						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1332 					}
1333 					while (likely(rx_count)) {
1334 						rx_count--;
1335 						rte_pktmbuf_free(pkts_burst[rx_count]);
1336 					}
1337 
1338 				}
1339 			}
1340 
1341 			if (likely(!vdev->remove)) {
1342 				/* Handle guest TX*/
1343 				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1344 				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
1345 				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1346 					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1347 						while (tx_count)
1348 							rte_pktmbuf_free(pkts_burst[--tx_count]);
1349 					}
1350 				}
1351 				while (tx_count)
1352 					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1353 			}
1354 
1355 			/*move to the next device in the list*/
1356 			dev_ll = dev_ll->next;
1357 		}
1358 	}
1359 
1360 	return 0;
1361 }
1362 
1363 /*
1364  * This function gets the number of available ring entries for zero copy RX.
1365  * Only one thread will call this function for a particular virtio device,
1366  * so it is designed as a non-thread-safe function.
1367  */
1368 static inline uint32_t __attribute__((always_inline))
1369 get_available_ring_num_zcp(struct virtio_net *dev)
1370 {
1371 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1372 	uint16_t avail_idx;
1373 
1374 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1375 	return (uint32_t)(avail_idx - vq->last_used_idx_res);
1376 }
1377 
1378 /*
1379  * This function gets the available ring index for zero copy RX;
1380  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1381  * Only one thread will call this function for a particular virtio device,
1382  * so it is designed as a non-thread-safe function.
1383  */
1384 static inline uint32_t __attribute__((always_inline))
1385 get_available_ring_index_zcp(struct virtio_net *dev,
1386 	uint16_t *res_base_idx, uint32_t count)
1387 {
1388 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1389 	uint16_t avail_idx;
1390 	uint32_t retry = 0;
1391 	uint16_t free_entries;
1392 
1393 	*res_base_idx = vq->last_used_idx_res;
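	/*
	 * Reserve entries starting at the last reserved index; the uint16_t
	 * subtraction below handles avail index wrap-around naturally.
	 */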
1394 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1395 	free_entries = (avail_idx - *res_base_idx);
1396 
1397 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1398 			"avail idx: %d, "
1399 			"res base idx:%d, free entries:%d\n",
1400 			dev->device_fh, avail_idx, *res_base_idx,
1401 			free_entries);
1402 
1403 	/*
1404 	 * If retry is enabled and the queue is full then we wait
1405 	 * and retry to avoid packet loss.
1406 	 */
1407 	if (enable_retry && unlikely(count > free_entries)) {
1408 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1409 			rte_delay_us(burst_rx_delay_time);
1410 			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1411 			free_entries = (avail_idx - *res_base_idx);
1412 			if (count <= free_entries)
1413 				break;
1414 		}
1415 	}
1416 
1417 	/*check that we have enough buffers*/
1418 	if (unlikely(count > free_entries))
1419 		count = free_entries;
1420 
1421 	if (unlikely(count == 0)) {
1422 		LOG_DEBUG(VHOST_DATA,
1423 			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
1424 			"avail idx: %d, res base idx:%d, free entries:%d\n",
1425 			dev->device_fh, avail_idx,
1426 			*res_base_idx, free_entries);
1427 		return 0;
1428 	}
1429 
1430 	vq->last_used_idx_res = *res_base_idx + count;
1431 
1432 	return count;
1433 }
1434 
1435 /*
1436  * This function puts a descriptor back on the used list.
1437  */
1438 static inline void __attribute__((always_inline))
1439 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1440 {
1441 	uint16_t res_cur_idx = vq->last_used_idx;
1442 	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1443 	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
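	/*
	 * The used ring entry must be visible to the guest before the used index
	 * is advanced; the compiler barrier below enforces that store ordering
	 * (a compiler barrier suffices on x86, where stores are not reordered
	 * with other stores).
	 */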
1444 	rte_compiler_barrier();
1445 	*(volatile uint16_t *)&vq->used->idx += 1;
1446 	vq->last_used_idx += 1;
1447 
1448 	/* Kick the guest if necessary. */
1449 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1450 		eventfd_write((int)vq->kickfd, 1);
1451 }
1452 
1453 /*
1454  * This function gets an available descriptor from the virtio vring and an
1455  * un-attached mbuf from vpool->ring, then attaches them together. It needs to
1456  * adjust the offset of buff_addr and phys_addr according to the PMD
1457  * implementation, otherwise the frame data may be put at the wrong mbuf location.
1458  */
1459 static inline void __attribute__((always_inline))
1460 attach_rxmbuf_zcp(struct virtio_net *dev)
1461 {
1462 	uint16_t res_base_idx, desc_idx;
1463 	uint64_t buff_addr, phys_addr;
1464 	struct vhost_virtqueue *vq;
1465 	struct vring_desc *desc;
1466 	struct rte_mbuf *mbuf = NULL;
1467 	struct vpool *vpool;
1468 	hpa_type addr_type;
1469 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1470 
1471 	vpool = &vpool_array[vdev->vmdq_rx_q];
1472 	vq = dev->virtqueue[VIRTIO_RXQ];
1473 
1474 	do {
1475 		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1476 				1) != 1))
1477 			return;
1478 		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1479 
1480 		desc = &vq->desc[desc_idx];
1481 		if (desc->flags & VRING_DESC_F_NEXT) {
1482 			desc = &vq->desc[desc->next];
1483 			buff_addr = gpa_to_vva(dev, desc->addr);
1484 			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1485 					&addr_type);
1486 		} else {
1487 			buff_addr = gpa_to_vva(dev,
1488 					desc->addr + vq->vhost_hlen);
1489 			phys_addr = gpa_to_hpa(vdev,
1490 					desc->addr + vq->vhost_hlen,
1491 					desc->len, &addr_type);
1492 		}
1493 
1494 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1495 			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1496 				" address found when attaching RX frame buffer"
1497 				" address!\n", dev->device_fh);
1498 			put_desc_to_used_list_zcp(vq, desc_idx);
1499 			continue;
1500 		}
1501 
1502 		/*
1503 		 * Check if the frame buffer address from guest crosses
1504 		 * sub-region or not.
1505 		 */
1506 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1507 			RTE_LOG(ERR, VHOST_DATA,
1508 				"(%"PRIu64") Frame buffer address cross "
1509 				"sub-region found when attaching RX frame "
1510 				"buffer address!\n",
1511 				dev->device_fh);
1512 			put_desc_to_used_list_zcp(vq, desc_idx);
1513 			continue;
1514 		}
1515 	} while (unlikely(phys_addr == 0));
1516 
1517 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1518 	if (unlikely(mbuf == NULL)) {
1519 		LOG_DEBUG(VHOST_DATA,
1520 			"(%"PRIu64") in attach_rxmbuf_zcp: "
1521 			"ring_sc_dequeue fail.\n",
1522 			dev->device_fh);
1523 		put_desc_to_used_list_zcp(vq, desc_idx);
1524 		return;
1525 	}
1526 
1527 	if (unlikely(vpool->buf_size > desc->len)) {
1528 		LOG_DEBUG(VHOST_DATA,
1529 			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1530 			"length(%d) of descriptor idx: %d less than room "
1531 			"size required: %d\n",
1532 			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1533 		put_desc_to_used_list_zcp(vq, desc_idx);
1534 		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1535 		return;
1536 	}
1537 
1538 	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1539 	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1540 	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1541 	mbuf->data_len = desc->len;
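	/* Stash the guest descriptor index in the mbuf headroom so it can be
	 * returned to the used ring once the mbuf comes back (see
	 * txmbuf_clean_zcp() and virtio_dev_rx_zcp()). */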
1542 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1543 
1544 	LOG_DEBUG(VHOST_DATA,
1545 		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1546 		"descriptor idx:%d\n",
1547 		dev->device_fh, res_base_idx, desc_idx);
1548 
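	/*
	 * Return the prepared mbuf to the mempool: the NIC RX queue was set up
	 * with this same pool in port_init(), so the PMD will refill its RX
	 * descriptors with mbufs whose buffers point directly into guest memory.
	 * This relies on RTE_MBUF_REFCNT being disabled for zero copy, which is
	 * enforced at option parsing above.
	 */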
1549 	__rte_mbuf_raw_free(mbuf);
1550 
1551 	return;
1552 }
1553 
1554 /*
1555  * Detach an attached packet mbuf -
1556  *  - restore original mbuf address and length values.
1557  *  - reset pktmbuf data and data_len to their default values.
1558  *  All other fields of the given packet mbuf will be left intact.
1559  *
1560  * @param m
1561  *   The attached packet mbuf.
1562  */
1563 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1564 {
1565 	const struct rte_mempool *mp = m->pool;
1566 	void *buf = RTE_MBUF_TO_BADDR(m);
1567 	uint32_t buf_ofs;
1568 	uint32_t buf_len = mp->elt_size - sizeof(*m);
1569 	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1570 
1571 	m->buf_addr = buf;
1572 	m->buf_len = (uint16_t)buf_len;
1573 
1574 	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1575 			RTE_PKTMBUF_HEADROOM : m->buf_len;
1576 	m->data_off = buf_ofs;
1577 
1578 	m->data_len = 0;
1579 }
1580 
1581 /*
1582  * This function is called after packets have been transmitted. It fetches
1583  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1584  * also updates the used index and kicks the guest if necessary.
1585  */
1586 static inline uint32_t __attribute__((always_inline))
1587 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1588 {
1589 	struct rte_mbuf *mbuf;
1590 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1591 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1592 	uint32_t index = 0;
1593 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1594 
1595 	LOG_DEBUG(VHOST_DATA,
1596 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1597 		"clean is: %d\n",
1598 		dev->device_fh, mbuf_count);
1599 	LOG_DEBUG(VHOST_DATA,
1600 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1601 		"clean  is : %d\n",
1602 		dev->device_fh, rte_ring_count(vpool->ring));
1603 
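	/*
	 * After TX completion the PMD frees the attached mbufs back into
	 * vpool->pool; move them to vpool->ring for reuse and hand their stashed
	 * descriptor indexes back to the guest via the used ring.
	 */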
1604 	for (index = 0; index < mbuf_count; index++) {
1605 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1606 		if (likely(MBUF_EXT_MEM(mbuf)))
1607 			pktmbuf_detach_zcp(mbuf);
1608 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1609 
1610 		/* Update used index buffer information. */
1611 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1612 		vq->used->ring[used_idx].len = 0;
1613 
1614 		used_idx = (used_idx + 1) & (vq->size - 1);
1615 	}
1616 
1617 	LOG_DEBUG(VHOST_DATA,
1618 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1619 		"clean is: %d\n",
1620 		dev->device_fh, rte_mempool_count(vpool->pool));
1621 	LOG_DEBUG(VHOST_DATA,
1622 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1623 		"clean  is : %d\n",
1624 		dev->device_fh, rte_ring_count(vpool->ring));
1625 	LOG_DEBUG(VHOST_DATA,
1626 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1627 		"vq->last_used_idx:%d\n",
1628 		dev->device_fh, vq->last_used_idx);
1629 
1630 	vq->last_used_idx += mbuf_count;
1631 
1632 	LOG_DEBUG(VHOST_DATA,
1633 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1634 		"vq->last_used_idx:%d\n",
1635 		dev->device_fh, vq->last_used_idx);
1636 
1637 	rte_compiler_barrier();
1638 
1639 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1640 
1641 	/* Kick guest if required. */
1642 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1643 		eventfd_write((int)vq->kickfd, 1);
1644 
1645 	return 0;
1646 }
1647 
1648 /*
1649  * This function is called when a virtio device is destroyed.
1650  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1651  */
1652 static void mbuf_destroy_zcp(struct vpool *vpool)
1653 {
1654 	struct rte_mbuf *mbuf = NULL;
1655 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1656 
1657 	LOG_DEBUG(VHOST_CONFIG,
1658 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1659 		"mbuf_destroy_zcp is: %d\n",
1660 		mbuf_count);
1661 	LOG_DEBUG(VHOST_CONFIG,
1662 		"in mbuf_destroy_zcp: mbuf count in  ring before "
1663 		"mbuf_destroy_zcp  is : %d\n",
1664 		rte_ring_count(vpool->ring));
1665 
1666 	for (index = 0; index < mbuf_count; index++) {
1667 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1668 		if (likely(mbuf != NULL)) {
1669 			if (likely(MBUF_EXT_MEM(mbuf)))
1670 				pktmbuf_detach_zcp(mbuf);
1671 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1672 		}
1673 	}
1674 
1675 	LOG_DEBUG(VHOST_CONFIG,
1676 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1677 		"mbuf_destroy_zcp is: %d\n",
1678 		rte_mempool_count(vpool->pool));
1679 	LOG_DEBUG(VHOST_CONFIG,
1680 		"in mbuf_destroy_zcp: mbuf count in ring after "
1681 		"mbuf_destroy_zcp is : %d\n",
1682 		rte_ring_count(vpool->ring));
1683 }
1684 
1685 /*
1686  * This function adds received packets to the guest RX virtqueue and updates the used ring.
1687  */
1688 static inline uint32_t __attribute__((always_inline))
1689 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1690 	uint32_t count)
1691 {
1692 	struct vhost_virtqueue *vq;
1693 	struct vring_desc *desc;
1694 	struct rte_mbuf *buff;
1695 	/* The virtio_hdr is initialised to 0. */
1696 	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1697 		= {{0, 0, 0, 0, 0, 0}, 0};
1698 	uint64_t buff_hdr_addr = 0;
1699 	uint32_t head[MAX_PKT_BURST], packet_len = 0;
1700 	uint32_t head_idx, packet_success = 0;
1701 	uint16_t res_cur_idx;
1702 
1703 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1704 
1705 	if (count == 0)
1706 		return 0;
1707 
1708 	vq = dev->virtqueue[VIRTIO_RXQ];
1709 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1710 
1711 	res_cur_idx = vq->last_used_idx;
1712 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1713 		dev->device_fh, res_cur_idx, res_cur_idx + count);
1714 
1715 	/* Retrieve all of the head indexes first to avoid caching issues. */
1716 	for (head_idx = 0; head_idx < count; head_idx++)
1717 		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
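	/*
	 * The guest descriptor index was stored in the mbuf headroom when the
	 * mbuf was attached to the guest buffer (see attach_rxmbuf_zcp()), so
	 * it can be reused here as the used-ring id.
	 */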
1718 
1719 	/* Prefetch descriptor index. */
1720 	rte_prefetch0(&vq->desc[head[packet_success]]);
1721 
1722 	while (packet_success != count) {
1723 		/* Get descriptor from available ring */
1724 		desc = &vq->desc[head[packet_success]];
1725 
1726 		buff = pkts[packet_success];
1727 		LOG_DEBUG(VHOST_DATA,
1728 			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
1729 			"pkt[%d] descriptor idx: %d\n",
1730 			dev->device_fh, packet_success,
1731 			MBUF_HEADROOM_UINT32(buff));
1732 
1733 		PRINT_PACKET(dev,
1734 			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1735 			+ RTE_PKTMBUF_HEADROOM),
1736 			rte_pktmbuf_data_len(buff), 0);
1737 
1738 		/* Buffer address translation for virtio header. */
1739 		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1740 		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1741 
1742 		/*
1743 		 * If the descriptors are chained the header and data are
1744 		 * placed in separate buffers.
1745 		 */
1746 		if (desc->flags & VRING_DESC_F_NEXT) {
1747 			desc->len = vq->vhost_hlen;
1748 			desc = &vq->desc[desc->next];
1749 			desc->len = rte_pktmbuf_data_len(buff);
1750 		} else {
1751 			desc->len = packet_len;
1752 		}
1753 
1754 		/* Update used ring with desc information */
1755 		vq->used->ring[res_cur_idx & (vq->size - 1)].id
1756 			= head[packet_success];
1757 		vq->used->ring[res_cur_idx & (vq->size - 1)].len
1758 			= packet_len;
1759 		res_cur_idx++;
1760 		packet_success++;
1761 
1762 		/* A header is required per buffer. */
1763 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1764 			(const void *)&virtio_hdr, vq->vhost_hlen);
1765 
1766 		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1767 
1768 		if (likely(packet_success < count)) {
1769 			/* Prefetch descriptor index. */
1770 			rte_prefetch0(&vq->desc[head[packet_success]]);
1771 		}
1772 	}
1773 
1774 	rte_compiler_barrier();
1775 
1776 	LOG_DEBUG(VHOST_DATA,
1777 		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
1778 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1779 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1780 
1781 	*(volatile uint16_t *)&vq->used->idx += count;
1782 	vq->last_used_idx += count;
1783 
1784 	LOG_DEBUG(VHOST_DATA,
1785 		"(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1786 		"vq.last_used_idx: %d, vq->used->idx: %d\n",
1787 		dev->device_fh, vq->last_used_idx, vq->used->idx);
1788 
1789 	/* Kick the guest if necessary. */
1790 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1791 		eventfd_write((int)vq->kickfd, 1);
1792 
1793 	return count;
1794 }
1795 
1796 /*
1797  * This function routes the TX packet to the correct interface.
1798  * This may be a local device or the physical port.
1799  */
1800 static inline void __attribute__((always_inline))
1801 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1802 	uint32_t desc_idx, uint8_t need_copy)
1803 {
1804 	struct mbuf_table *tx_q;
1805 	struct rte_mbuf **m_table;
1806 	struct rte_mbuf *mbuf = NULL;
1807 	unsigned len, ret, offset = 0;
1808 	struct vpool *vpool;
1809 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1810 	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1811 
1812 	/*Add packet to the port tx queue*/
1813 	tx_q = &tx_queue_zcp[vmdq_rx_q];
1814 	len = tx_q->len;
1815 
1816 	/* Allocate an mbuf and populate the structure. */
1817 	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1818 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1819 	if (unlikely(mbuf == NULL)) {
1820 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1821 		RTE_LOG(ERR, VHOST_DATA,
1822 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
1823 			dev->device_fh);
1824 		put_desc_to_used_list_zcp(vq, desc_idx);
1825 		return;
1826 	}
1827 
1828 	if (vm2vm_mode == VM2VM_HARDWARE) {
1829 		/* Avoid using a vlan tag from any vm for an external pkt, such
1830 		 * as vlan_tags[dev->device_fh]; otherwise it conflicts during
1831 		 * pool selection: the MAC address marks it as an external pkt
1832 		 * that should go out to the network, while the vlan tag marks
1833 		 * it as a vm2vm pkt to be forwarded to another vm. The hardware
1834 		 * cannot resolve such an ambiguity, so the pkt would be lost.
1835 		 */
1836 		vlan_tag = external_pkt_default_vlan_tag;
1837 		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1838 			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1839 			__rte_mbuf_raw_free(mbuf);
1840 			return;
1841 		}
1842 	}
1843 
1844 	mbuf->nb_segs = m->nb_segs;
1845 	mbuf->next = m->next;
1846 	mbuf->data_len = m->data_len + offset;
1847 	mbuf->pkt_len = mbuf->data_len;
1848 	if (unlikely(need_copy)) {
1849 		/* Copy the packet contents to the mbuf. */
1850 		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1851 			rte_pktmbuf_mtod(m, void *),
1852 			m->data_len);
1853 	} else {
1854 		mbuf->data_off = m->data_off;
1855 		mbuf->buf_physaddr = m->buf_physaddr;
1856 		mbuf->buf_addr = m->buf_addr;
1857 	}
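	/*
	 * In the common zero-copy case the new mbuf simply references the guest
	 * frame buffer (data_off, buf_physaddr and buf_addr are taken from the
	 * dummy mbuf built in virtio_dev_tx_zcp()); a copy is only made when
	 * the guest buffer crosses a physical sub-region boundary.
	 */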
1858 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
1859 	mbuf->vlan_tci = vlan_tag;
1860 	mbuf->l2_len = sizeof(struct ether_hdr);
1861 	mbuf->l3_len = sizeof(struct ipv4_hdr);
1862 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1863 
1864 	tx_q->m_table[len] = mbuf;
1865 	len++;
1866 
1867 	LOG_DEBUG(VHOST_DATA,
1868 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1869 		dev->device_fh,
1870 		mbuf->nb_segs,
1871 		(mbuf->next == NULL) ? "null" : "non-null");
1872 
1873 	if (enable_stats) {
1874 		dev_statistics[dev->device_fh].tx_total++;
1875 		dev_statistics[dev->device_fh].tx++;
1876 	}
1877 
1878 	if (unlikely(len == MAX_PKT_BURST)) {
1879 		m_table = (struct rte_mbuf **)tx_q->m_table;
1880 		ret = rte_eth_tx_burst(ports[0],
1881 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1882 
1883 		/*
1884 		 * Free any buffers not handled by TX and update
1885 		 * the port stats.
1886 		 */
1887 		if (unlikely(ret < len)) {
1888 			do {
1889 				rte_pktmbuf_free(m_table[ret]);
1890 			} while (++ret < len);
1891 		}
1892 
1893 		len = 0;
1894 		txmbuf_clean_zcp(dev, vpool);
1895 	}
1896 
1897 	tx_q->len = len;
1898 
1899 	return;
1900 }
1901 
1902 /*
1903  * This function transmits all available packets in the virtio TX queue for
1904  * one virtio-net device. If it is the first packet, it learns the MAC
1905  * address and sets up VMDQ.
1906  */
1907 static inline void __attribute__((always_inline))
1908 virtio_dev_tx_zcp(struct virtio_net *dev)
1909 {
1910 	struct rte_mbuf m;
1911 	struct vhost_virtqueue *vq;
1912 	struct vring_desc *desc;
1913 	uint64_t buff_addr = 0, phys_addr;
1914 	uint32_t head[MAX_PKT_BURST];
1915 	uint32_t i;
1916 	uint16_t free_entries, packet_success = 0;
1917 	uint16_t avail_idx;
1918 	uint8_t need_copy = 0;
1919 	hpa_type addr_type;
1920 	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1921 
1922 	vq = dev->virtqueue[VIRTIO_TXQ];
1923 	avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1924 
1925 	/* If there are no available buffers then return. */
1926 	if (vq->last_used_idx_res == avail_idx)
1927 		return;
1928 
1929 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1930 
1931 	/* Prefetch available ring to retrieve head indexes. */
1932 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1933 
1934 	/* Get the number of free entries in the ring */
1935 	free_entries = (avail_idx - vq->last_used_idx_res);
1936 
1937 	/* Limit to MAX_PKT_BURST. */
1938 	free_entries
1939 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1940 
1941 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1942 		dev->device_fh, free_entries);
1943 
1944 	/* Retrieve all of the head indexes first to avoid caching issues. */
1945 	for (i = 0; i < free_entries; i++)
1946 		head[i]
1947 			= vq->avail->ring[(vq->last_used_idx_res + i)
1948 			& (vq->size - 1)];
1949 
1950 	vq->last_used_idx_res += free_entries;
1951 
1952 	/* Prefetch descriptor index. */
1953 	rte_prefetch0(&vq->desc[head[packet_success]]);
1954 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1955 
1956 	while (packet_success < free_entries) {
1957 		desc = &vq->desc[head[packet_success]];
1958 
1959 		/* Discard first buffer as it is the virtio header */
1960 		desc = &vq->desc[desc->next];
1961 
1962 		/* Buffer address translation. */
1963 		buff_addr = gpa_to_vva(dev, desc->addr);
1964 		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
1965 		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1966 			&addr_type);
1967 
1968 		if (likely(packet_success < (free_entries - 1)))
1969 			/* Prefetch descriptor index. */
1970 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1971 
1972 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1973 			RTE_LOG(ERR, VHOST_DATA,
1974 				"(%"PRIu64") Invalid frame buffer address found "
1975 				"when transmitting packets!\n",
1976 				dev->device_fh);
1977 			packet_success++;
1978 			continue;
1979 		}
1980 
1981 		/* Prefetch buffer address. */
1982 		rte_prefetch0((void *)(uintptr_t)buff_addr);
1983 
1984 		/*
1985 		 * Setup dummy mbuf. This is copied to a real mbuf if
1986 		 * transmitted out the physical port.
1987 		 */
1988 		m.data_len = desc->len;
1989 		m.nb_segs = 1;
1990 		m.next = NULL;
1991 		m.data_off = 0;
1992 		m.buf_addr = (void *)(uintptr_t)buff_addr;
1993 		m.buf_physaddr = phys_addr;
1994 
1995 		/*
1996 		 * Check if the frame buffer address from guest crosses
1997 		 * sub-region or not.
1998 		 */
1999 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2000 			RTE_LOG(ERR, VHOST_DATA,
2001 				"(%"PRIu64") Frame buffer address crosses a "
2002 				"sub-region boundary when attaching TX frame "
2003 				"buffer address!\n",
2004 				dev->device_fh);
2005 			need_copy = 1;
2006 		} else
2007 			need_copy = 0;
2008 
2009 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2010 
2011 		/*
2012 		 * If this is the first received packet we need to learn
2013 		 * the MAC and setup VMDQ
2014 		 */
2015 		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2016 			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2017 				/*
2018 				 * Discard frame if device is scheduled for
2019 				 * removal or a duplicate MAC address is found.
2020 				 */
2021 				packet_success += free_entries;
2022 				vq->last_used_idx += packet_success;
2023 				break;
2024 			}
2025 		}
2026 
2027 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2028 		packet_success++;
2029 	}
2030 }
2031 
2032 /*
2033  * This function is called by each data core. It handles all RX/TX registered
2034  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2035  * addresses are compared with all devices in the main linked list.
2036  */
2037 static int
2038 switch_worker_zcp(__attribute__((unused)) void *arg)
2039 {
2040 	struct virtio_net *dev = NULL;
2041 	struct vhost_dev  *vdev = NULL;
2042 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2043 	struct virtio_net_data_ll *dev_ll;
2044 	struct mbuf_table *tx_q;
2045 	volatile struct lcore_ll_info *lcore_ll;
2046 	const uint64_t drain_tsc
2047 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2048 		* BURST_TX_DRAIN_US;
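	/*
	 * drain_tsc converts the TX drain period from microseconds into TSC
	 * cycles; for example, with a 2.4 GHz TSC one microsecond is roughly
	 * 2400 cycles, so the timeout check below fires about every
	 * BURST_TX_DRAIN_US * 2400 cycles.
	 */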
2049 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2050 	unsigned ret;
2051 	const uint16_t lcore_id = rte_lcore_id();
2052 	uint16_t count_in_ring, rx_count = 0;
2053 
2054 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2055 
2056 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2057 	prev_tsc = 0;
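	/*
	 * Main loop: (1) periodically drain any TX queue that has not been
	 * flushed for longer than drain_tsc, (2) attach free mbufs to guest RX
	 * buffers and pass packets received on the physical port to the guest,
	 * (3) transmit packets pending in the guest TX queue.
	 */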
2058 
2059 	while (1) {
2060 		cur_tsc = rte_rdtsc();
2061 
2062 		/* TX burst queue drain */
2063 		diff_tsc = cur_tsc - prev_tsc;
2064 		if (unlikely(diff_tsc > drain_tsc)) {
2065 			/*
2066 			 * Get mbufs from vpool.pool, detach them and
2067 			 * put them back into vpool.ring.
2068 			 */
2069 			dev_ll = lcore_ll->ll_root_used;
2070 			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2071 				/* Get virtio device ID */
2072 				vdev = dev_ll->vdev;
2073 				dev = vdev->dev;
2074 
2075 				if (likely(!vdev->remove)) {
2076 					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2077 					if (tx_q->len) {
2078 						LOG_DEBUG(VHOST_DATA,
2079 						"TX queue drained after timeout"
2080 						" with burst size %u\n",
2081 						tx_q->len);
2082 
2083 						/*
2084 						 * Tx any packets in the queue
2085 						 */
2086 						ret = rte_eth_tx_burst(
2087 							ports[0],
2088 							(uint16_t)tx_q->txq_id,
2089 							(struct rte_mbuf **)
2090 							tx_q->m_table,
2091 							(uint16_t)tx_q->len);
2092 						if (unlikely(ret < tx_q->len)) {
2093 							do {
2094 								rte_pktmbuf_free(
2095 									tx_q->m_table[ret]);
2096 							} while (++ret < tx_q->len);
2097 						}
2098 						tx_q->len = 0;
2099 
2100 						txmbuf_clean_zcp(dev,
2101 							&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2102 					}
2103 				}
2104 				dev_ll = dev_ll->next;
2105 			}
2106 			prev_tsc = cur_tsc;
2107 		}
2108 
2109 		rte_prefetch0(lcore_ll->ll_root_used);
2110 
2111 		/*
2112 		 * Inform the configuration core that we have exited the linked
2113 		 * list and that no devices are in use if requested.
2114 		 */
2115 		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2116 			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2117 
2118 		/* Process devices */
2119 		dev_ll = lcore_ll->ll_root_used;
2120 
2121 		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2122 			vdev = dev_ll->vdev;
2123 			dev  = vdev->dev;
2124 			if (unlikely(vdev->remove)) {
2125 				dev_ll = dev_ll->next;
2126 				unlink_vmdq(vdev);
2127 				vdev->ready = DEVICE_SAFE_REMOVE;
2128 				continue;
2129 			}
2130 
2131 			if (likely(vdev->ready == DEVICE_RX)) {
2132 				uint32_t index = vdev->vmdq_rx_q;
2133 				uint16_t i;
2134 				count_in_ring
2135 				= rte_ring_count(vpool_array[index].ring);
2136 				uint16_t free_entries
2137 				= (uint16_t)get_available_ring_num_zcp(dev);
2138 
2139 				/*
2140 				 * Attach all mbufs in vpool.ring and put back
2141 				 * into vpool.pool.
2142 				 */
2143 				for (i = 0;
2144 				i < RTE_MIN(free_entries,
2145 				RTE_MIN(count_in_ring, MAX_PKT_BURST));
2146 				i++)
2147 					attach_rxmbuf_zcp(dev);
2148 
2149 				/* Handle guest RX */
2150 				rx_count = rte_eth_rx_burst(ports[0],
2151 					vdev->vmdq_rx_q, pkts_burst,
2152 					MAX_PKT_BURST);
2153 
2154 				if (rx_count) {
2155 					ret_count = virtio_dev_rx_zcp(dev,
2156 							pkts_burst, rx_count);
2157 					if (enable_stats) {
2158 						dev_statistics[dev->device_fh].rx_total
2159 							+= rx_count;
2160 						dev_statistics[dev->device_fh].rx
2161 							+= ret_count;
2162 					}
2163 					while (likely(rx_count)) {
2164 						rx_count--;
2165 						pktmbuf_detach_zcp(
2166 							pkts_burst[rx_count]);
2167 						rte_ring_sp_enqueue(
2168 							vpool_array[index].ring,
2169 							(void *)pkts_burst[rx_count]);
2170 					}
2171 				}
2172 			}
2173 
2174 			if (likely(!vdev->remove))
2175 				/* Handle guest TX */
2176 				virtio_dev_tx_zcp(dev);
2177 
2178 			/* Move to the next device in the list */
2179 			dev_ll = dev_ll->next;
2180 		}
2181 	}
2182 
2183 	return 0;
2184 }
2185 
2186 
2187 /*
2188  * Add an entry to a used linked list. A free entry must first be found
2189  * in the free linked list using get_data_ll_free_entry();
2190  */
2191 static void
2192 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2193 	struct virtio_net_data_ll *ll_dev)
2194 {
2195 	struct virtio_net_data_ll *ll = *ll_root_addr;
2196 
2197 	/* Set next as NULL and use a compiler barrier to avoid reordering. */
2198 	ll_dev->next = NULL;
2199 	rte_compiler_barrier();
2200 
2201 	/* If ll == NULL then this is the first device. */
2202 	if (ll) {
2203 		/* Increment to the tail of the linked list. */
2204 		while (ll->next != NULL)
2205 			ll = ll->next;
2206 
2207 		ll->next = ll_dev;
2208 	} else {
2209 		*ll_root_addr = ll_dev;
2210 	}
2211 }
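
/*
 * Typical usage, mirroring new_device() below: take an entry from the free
 * list, fill it in and append it to the used list.
 *
 *	ll_dev = get_data_ll_free_entry(&ll_root_free);
 *	if (ll_dev != NULL) {
 *		ll_dev->vdev = vdev;
 *		add_data_ll_entry(&ll_root_used, ll_dev);
 *	}
 */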
2212 
2213 /*
2214  * Remove an entry from a used linked list. The entry must then be added to
2215  * the free linked list using put_data_ll_free_entry().
2216  */
2217 static void
2218 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2219 	struct virtio_net_data_ll *ll_dev,
2220 	struct virtio_net_data_ll *ll_dev_last)
2221 {
2222 	struct virtio_net_data_ll *ll = *ll_root_addr;
2223 
2224 	if (unlikely((ll == NULL) || (ll_dev == NULL)))
2225 		return;
2226 
2227 	if (ll_dev == ll)
2228 		*ll_root_addr = ll_dev->next;
2229 	else
2230 		if (likely(ll_dev_last != NULL))
2231 			ll_dev_last->next = ll_dev->next;
2232 		else
2233 			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2234 }
2235 
2236 /*
2237  * Find and return an entry from the free linked list.
2238  */
2239 static struct virtio_net_data_ll *
2240 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2241 {
2242 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2243 	struct virtio_net_data_ll *ll_dev;
2244 
2245 	if (ll_free == NULL)
2246 		return NULL;
2247 
2248 	ll_dev = ll_free;
2249 	*ll_root_addr = ll_free->next;
2250 
2251 	return ll_dev;
2252 }
2253 
2254 /*
2255  * Place an entry back on to the free linked list.
2256  */
2257 static void
2258 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2259 	struct virtio_net_data_ll *ll_dev)
2260 {
2261 	struct virtio_net_data_ll *ll_free = *ll_root_addr;
2262 
2263 	if (ll_dev == NULL)
2264 		return;
2265 
2266 	ll_dev->next = ll_free;
2267 	*ll_root_addr = ll_dev;
2268 }
2269 
2270 /*
2271  * Creates a linked list of a given size.
2272  */
2273 static struct virtio_net_data_ll *
2274 alloc_data_ll(uint32_t size)
2275 {
2276 	struct virtio_net_data_ll *ll_new;
2277 	uint32_t i;
2278 
2279 	/* Malloc and then chain the linked list. */
2280 	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2281 	if (ll_new == NULL) {
2282 		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2283 		return NULL;
2284 	}
2285 
2286 	for (i = 0; i < size - 1; i++) {
2287 		ll_new[i].vdev = NULL;
2288 		ll_new[i].next = &ll_new[i+1];
2289 	}
2290 	ll_new[i].next = NULL;
2291 
2292 	return (ll_new);
2293 }
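
/*
 * The entries returned by alloc_data_ll() are backed by a single malloc'd
 * array and chained into a free list; they are never freed individually but
 * are recycled through get_data_ll_free_entry()/put_data_ll_free_entry().
 */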
2294 
2295 /*
2296  * Create the main linked list along with each individual core's linked list. A used and a free list
2297  * are created to manage entries.
2298  */
2299 static int
2300 init_data_ll (void)
2301 {
2302 	int lcore;
2303 
2304 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2305 		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2306 		if (lcore_info[lcore].lcore_ll == NULL) {
2307 			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2308 			return -1;
2309 		}
2310 
2311 		lcore_info[lcore].lcore_ll->device_num = 0;
2312 		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2313 		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2314 		if (num_devices % num_switching_cores)
2315 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2316 		else
2317 			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2318 	}
2319 
2320 	/* Allocate devices up to a maximum of MAX_DEVICES. */
2321 	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2322 
2323 	return 0;
2324 }
2325 
2326 /*
2327  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2328  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2329  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2330  */
2331 static void
2332 destroy_device (volatile struct virtio_net *dev)
2333 {
2334 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2335 	struct virtio_net_data_ll *ll_main_dev_cur;
2336 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2337 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2338 	struct vhost_dev *vdev;
2339 	int lcore;
2340 
2341 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2342 
2343 	vdev = (struct vhost_dev *)dev->priv;
2344 	/*set the remove flag. */
2345 	vdev->remove = 1;
2346 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
2347 		rte_pause();
2348 	}
2349 
2350 	/* Search for entry to be removed from lcore ll */
2351 	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2352 	while (ll_lcore_dev_cur != NULL) {
2353 		if (ll_lcore_dev_cur->vdev == vdev) {
2354 			break;
2355 		} else {
2356 			ll_lcore_dev_last = ll_lcore_dev_cur;
2357 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2358 		}
2359 	}
2360 
2361 	if (ll_lcore_dev_cur == NULL) {
2362 		RTE_LOG(ERR, VHOST_CONFIG,
2363 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2364 			dev->device_fh);
2365 		return;
2366 	}
2367 
2368 	/* Search for entry to be removed from main ll */
2369 	ll_main_dev_cur = ll_root_used;
2370 	ll_main_dev_last = NULL;
2371 	while (ll_main_dev_cur != NULL) {
2372 		if (ll_main_dev_cur->vdev == vdev) {
2373 			break;
2374 		} else {
2375 			ll_main_dev_last = ll_main_dev_cur;
2376 			ll_main_dev_cur = ll_main_dev_cur->next;
2377 		}
2378 	}
2379 
2380 	/* Remove entries from the lcore and main ll. */
2381 	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2382 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2383 
2384 	/* Set the dev_removal_flag on each lcore. */
2385 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2386 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2387 	}
2388 
2389 	/*
2390 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2391 	 * they can no longer access the device removed from the linked lists and that the devices
2392 	 * are no longer in use.
2393 	 */
2394 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2395 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2396 			rte_pause();
2397 		}
2398 	}
2399 
2400 	/* Add the entries back to the lcore and main free ll.*/
2401 	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2402 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2403 
2404 	/* Decrement number of device on the lcore. */
2405 	lcore_info[vdev->coreid].lcore_ll->device_num--;
2406 
2407 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2408 
2409 	if (zero_copy) {
2410 		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2411 
2412 		/* Stop the RX queue. */
2413 		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2414 			LOG_DEBUG(VHOST_CONFIG,
2415 				"(%"PRIu64") In destroy_device: Failed to stop "
2416 				"rx queue:%d\n",
2417 				dev->device_fh,
2418 				vdev->vmdq_rx_q);
2419 		}
2420 
2421 		LOG_DEBUG(VHOST_CONFIG,
2422 			"(%"PRIu64") in destroy_device: Start putting mbufs from "
2423 			"mempool back into ring for RX queue: %d\n",
2424 			dev->device_fh, vdev->vmdq_rx_q);
2425 
2426 		mbuf_destroy_zcp(vpool);
2427 
2428 		/* Stop the TX queue. */
2429 		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2430 			LOG_DEBUG(VHOST_CONFIG,
2431 				"(%"PRIu64") In destroy_device: Failed to "
2432 				"stop tx queue:%d\n",
2433 				dev->device_fh, vdev->vmdq_rx_q);
2434 		}
2435 
2436 		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2437 
2438 		LOG_DEBUG(VHOST_CONFIG,
2439 			"(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2440 			"back into ring for TX queue: %d, dev:(%"PRIu64")\n",
2441 			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2442 			dev->device_fh);
2443 
2444 		mbuf_destroy_zcp(vpool);
2445 		rte_free(vdev->regions_hpa);
2446 	}
2447 	rte_free(vdev);
2448 
2449 }
2450 
2451 /*
2452  * Calculate the number of extra physically contiguous sub-regions needed for
2453  * one particular region whose vhost virtual address range is contiguous. The
2454  * region starts at vva_start, with the size given by the 'size' argument.
2455  */
2456 static uint32_t
2457 check_hpa_regions(uint64_t vva_start, uint64_t size)
2458 {
2459 	uint32_t i, nregions = 0, page_size = getpagesize();
2460 	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2461 	if (vva_start % page_size) {
2462 		LOG_DEBUG(VHOST_CONFIG,
2463 			"in check_continuous: vva start(%p) mod page_size(%d) "
2464 			"has remainder\n",
2465 			(void *)(uintptr_t)vva_start, page_size);
2466 		return 0;
2467 	}
2468 	if (size % page_size) {
2469 		LOG_DEBUG(VHOST_CONFIG,
2470 			"in check_continuous: "
2471 			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
2472 			size, page_size);
2473 		return 0;
2474 	}
2475 	for (i = 0; i < size - page_size; i = i + page_size) {
2476 		cur_phys_addr
2477 			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2478 		next_phys_addr = rte_mem_virt2phy(
2479 			(void *)(uintptr_t)(vva_start + i + page_size));
2480 		if ((cur_phys_addr + page_size) != next_phys_addr) {
2481 			++nregions;
2482 			LOG_DEBUG(VHOST_CONFIG,
2483 				"in check_continuous: hva addr:(%p) is not "
2484 				"continuous with hva addr:(%p), diff:%d\n",
2485 				(void *)(uintptr_t)(vva_start + (uint64_t)i),
2486 				(void *)(uintptr_t)(vva_start + (uint64_t)i
2487 				+ page_size), page_size);
2488 			LOG_DEBUG(VHOST_CONFIG,
2489 				"in check_continuous: hpa addr:(%p) is not "
2490 				"continuous with hpa addr:(%p), "
2491 				"diff:(%"PRIu64")\n",
2492 				(void *)(uintptr_t)cur_phys_addr,
2493 				(void *)(uintptr_t)next_phys_addr,
2494 				(next_phys_addr-cur_phys_addr));
2495 		}
2496 	}
2497 	return nregions;
2498 }
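
/*
 * Example: a 2 MB virtually contiguous region that maps to two physically
 * disjoint 1 MB ranges has one discontinuity, so check_hpa_regions() returns
 * 1 and new_device() reserves one extra sub-region slot for it.
 */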
2499 
2500 /*
2501  * Divide each region whose vhost virtual address range is contiguous into a
2502  * few sub-regions, making sure the physical addresses within each sub-region
2503  * are contiguous, and fill the offset (to GPA), size and other information of
2504  * each sub-region into regions_hpa.
2505  */
2506 static uint32_t
2507 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2508 {
2509 	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2510 	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2511 
2512 	if (mem_region_hpa == NULL)
2513 		return 0;
2514 
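	/*
	 * Walk every guest region page by page: whenever the host physical
	 * address stops being contiguous, close the current sub-region
	 * (recording its end address and size) and start a new one at the next
	 * page.
	 */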
2515 	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2516 		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2517 			virtio_memory->regions[regionidx].address_offset;
2518 		mem_region_hpa[regionidx_hpa].guest_phys_address
2519 			= virtio_memory->regions[regionidx].guest_phys_address;
2520 		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2521 			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2522 			mem_region_hpa[regionidx_hpa].guest_phys_address;
2523 		LOG_DEBUG(VHOST_CONFIG,
2524 			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2525 			regionidx_hpa,
2526 			(void *)(uintptr_t)
2527 			(mem_region_hpa[regionidx_hpa].guest_phys_address));
2528 		LOG_DEBUG(VHOST_CONFIG,
2529 			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2530 			regionidx_hpa,
2531 			(void *)(uintptr_t)
2532 			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2533 		for (i = 0, k = 0;
2534 			i < virtio_memory->regions[regionidx].memory_size -
2535 				page_size;
2536 			i += page_size) {
2537 			cur_phys_addr = rte_mem_virt2phy(
2538 					(void *)(uintptr_t)(vva_start + i));
2539 			next_phys_addr = rte_mem_virt2phy(
2540 					(void *)(uintptr_t)(vva_start +
2541 					i + page_size));
2542 			if ((cur_phys_addr + page_size) != next_phys_addr) {
2543 				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2544 					mem_region_hpa[regionidx_hpa].guest_phys_address +
2545 					k + page_size;
2546 				mem_region_hpa[regionidx_hpa].memory_size
2547 					= k + page_size;
2548 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2549 					"phys addr end  [%d]:(%p)\n",
2550 					regionidx_hpa,
2551 					(void *)(uintptr_t)
2552 					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2553 				LOG_DEBUG(VHOST_CONFIG,
2554 					"in fill_hpa_regions: guest phys addr "
2555 					"size [%d]:(%p)\n",
2556 					regionidx_hpa,
2557 					(void *)(uintptr_t)
2558 					(mem_region_hpa[regionidx_hpa].memory_size));
2559 				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2560 					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2561 				++regionidx_hpa;
2562 				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2563 					next_phys_addr -
2564 					mem_region_hpa[regionidx_hpa].guest_phys_address;
2565 				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2566 					" phys addr start[%d]:(%p)\n",
2567 					regionidx_hpa,
2568 					(void *)(uintptr_t)
2569 					(mem_region_hpa[regionidx_hpa].guest_phys_address));
2570 				LOG_DEBUG(VHOST_CONFIG,
2571 					"in fill_hpa_regions: host  phys addr "
2572 					"start[%d]:(%p)\n",
2573 					regionidx_hpa,
2574 					(void *)(uintptr_t)
2575 					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2576 				k = 0;
2577 			} else {
2578 				k += page_size;
2579 			}
2580 		}
2581 		mem_region_hpa[regionidx_hpa].guest_phys_address_end
2582 			= mem_region_hpa[regionidx_hpa].guest_phys_address
2583 			+ k + page_size;
2584 		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2585 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2586 			"[%d]:(%p)\n", regionidx_hpa,
2587 			(void *)(uintptr_t)
2588 			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2589 		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2590 			"[%d]:(%p)\n", regionidx_hpa,
2591 			(void *)(uintptr_t)
2592 			(mem_region_hpa[regionidx_hpa].memory_size));
2593 		++regionidx_hpa;
2594 	}
2595 	return regionidx_hpa;
2596 }
2597 
2598 /*
2599  * A new device is added to a data core. First the device is added to the main linked list
2600  * and then allocated to a specific data core.
2601  */
2602 static int
2603 new_device (struct virtio_net *dev)
2604 {
2605 	struct virtio_net_data_ll *ll_dev;
2606 	int lcore, core_add = 0;
2607 	uint32_t device_num_min = num_devices;
2608 	struct vhost_dev *vdev;
2609 	uint32_t regionidx;
2610 
2611 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2612 	if (vdev == NULL) {
2613 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2614 			dev->device_fh);
2615 		return -1;
2616 	}
2617 	vdev->dev = dev;
2618 	dev->priv = vdev;
2619 
2620 	if (zero_copy) {
2621 		vdev->nregions_hpa = dev->mem->nregions;
2622 		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2623 			vdev->nregions_hpa
2624 				+= check_hpa_regions(
2625 					dev->mem->regions[regionidx].guest_phys_address
2626 					+ dev->mem->regions[regionidx].address_offset,
2627 					dev->mem->regions[regionidx].memory_size);
2628 
2629 		}
2630 
2631 		vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2632 			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2633 			RTE_CACHE_LINE_SIZE);
2634 		if (vdev->regions_hpa == NULL) {
2635 			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2636 			rte_free(vdev);
2637 			return -1;
2638 		}
2639 
2640 
2641 		if (fill_hpa_memory_regions(
2642 			vdev->regions_hpa, dev->mem
2643 			) != vdev->nregions_hpa) {
2644 
2645 			RTE_LOG(ERR, VHOST_CONFIG,
2646 				"hpa memory regions number mismatch: "
2647 				"[%d]\n", vdev->nregions_hpa);
2648 			rte_free(vdev->regions_hpa);
2649 			rte_free(vdev);
2650 			return -1;
2651 		}
2652 	}
2653 
2654 
2655 	/* Add device to main ll */
2656 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2657 	if (ll_dev == NULL) {
2658 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2659 			"of %d devices per core has been reached\n",
2660 			dev->device_fh, num_devices);
2661 		if (vdev->regions_hpa)
2662 			rte_free(vdev->regions_hpa);
2663 		rte_free(vdev);
2664 		return -1;
2665 	}
2666 	ll_dev->vdev = vdev;
2667 	add_data_ll_entry(&ll_root_used, ll_dev);
2668 	vdev->vmdq_rx_q
2669 		= dev->device_fh * queues_per_pool + vmdq_queue_base;
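	/*
	 * Each device is given the first RX queue of its own VMDq pool:
	 * device_fh selects the pool and vmdq_queue_base (taken from the
	 * port's VMDq configuration) is the index of the first VMDq queue.
	 */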
2670 
2671 	if (zero_copy) {
2672 		uint32_t index = vdev->vmdq_rx_q;
2673 		uint32_t count_in_ring, i;
2674 		struct mbuf_table *tx_q;
2675 
2676 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2677 
2678 		LOG_DEBUG(VHOST_CONFIG,
2679 			"(%"PRIu64") in new_device: mbuf count in mempool "
2680 			"before attach is: %d\n",
2681 			dev->device_fh,
2682 			rte_mempool_count(vpool_array[index].pool));
2683 		LOG_DEBUG(VHOST_CONFIG,
2684 			"(%"PRIu64") in new_device: mbuf count in  ring "
2685 			"before attach  is : %d\n",
2686 			dev->device_fh, count_in_ring);
2687 
2688 		/*
2689 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2690 		 */
2691 		for (i = 0; i < count_in_ring; i++)
2692 			attach_rxmbuf_zcp(dev);
2693 
2694 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2695 			"mempool after attach is: %d\n",
2696 			dev->device_fh,
2697 			rte_mempool_count(vpool_array[index].pool));
2698 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2699 			"ring after attach  is : %d\n",
2700 			dev->device_fh,
2701 			rte_ring_count(vpool_array[index].ring));
2702 
2703 		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2704 		tx_q->txq_id = vdev->vmdq_rx_q;
2705 
2706 		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2707 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2708 
2709 			LOG_DEBUG(VHOST_CONFIG,
2710 				"(%"PRIu64") In new_device: Failed to start "
2711 				"tx queue:%d\n",
2712 				dev->device_fh, vdev->vmdq_rx_q);
2713 
2714 			mbuf_destroy_zcp(vpool);
2715 			rte_free(vdev->regions_hpa);
2716 			rte_free(vdev);
2717 			return -1;
2718 		}
2719 
2720 		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2721 			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2722 
2723 			LOG_DEBUG(VHOST_CONFIG,
2724 				"(%"PRIu64") In new_device: Failed to start "
2725 				"rx queue:%d\n",
2726 				dev->device_fh, vdev->vmdq_rx_q);
2727 
2728 			/* Stop the TX queue. */
2729 			if (rte_eth_dev_tx_queue_stop(ports[0],
2730 				vdev->vmdq_rx_q) != 0) {
2731 				LOG_DEBUG(VHOST_CONFIG,
2732 					"(%"PRIu64") In new_device: Failed to "
2733 					"stop tx queue:%d\n",
2734 					dev->device_fh, vdev->vmdq_rx_q);
2735 			}
2736 
2737 			mbuf_destroy_zcp(vpool);
2738 			rte_free(vdev->regions_hpa);
2739 			rte_free(vdev);
2740 			return -1;
2741 		}
2742 
2743 	}
2744 
2745 	/*reset ready flag*/
2746 	vdev->ready = DEVICE_MAC_LEARNING;
2747 	vdev->remove = 0;
2748 
2749 	/* Find a suitable lcore to add the device. */
2750 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2751 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2752 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2753 			core_add = lcore;
2754 		}
2755 	}
2756 	/* Add device to lcore ll */
2757 	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2758 	if (ll_dev == NULL) {
2759 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2760 		vdev->ready = DEVICE_SAFE_REMOVE;
2761 		destroy_device(dev);
2762 		if (vdev->regions_hpa)
2763 			rte_free(vdev->regions_hpa);
2764 		rte_free(vdev);
2765 		return -1;
2766 	}
2767 	ll_dev->vdev = vdev;
2768 	vdev->coreid = core_add;
2769 
2770 	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2771 
2772 	/* Initialize device stats */
2773 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2774 
2775 	/* Disable notifications. */
2776 	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2777 	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2778 	lcore_info[vdev->coreid].lcore_ll->device_num++;
2779 	dev->flags |= VIRTIO_DEV_RUNNING;
2780 
2781 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2782 
2783 	return 0;
2784 }
2785 
2786 /*
2787  * These callbacks allow devices to be added to the data core when their
2788  * configuration has been fully completed.
2789  */
2790 static const struct virtio_net_device_ops virtio_net_device_ops =
2791 {
2792 	.new_device =  new_device,
2793 	.destroy_device = destroy_device,
2794 };
2795 
2796 /*
2797  * This is a thread that wakes up periodically to print stats if the user has
2798  * enabled them.
2799  */
2800 static void
2801 print_stats(void)
2802 {
2803 	struct virtio_net_data_ll *dev_ll;
2804 	uint64_t tx_dropped, rx_dropped;
2805 	uint64_t tx, tx_total, rx, rx_total;
2806 	uint32_t device_fh;
2807 	const char clr[] = { 27, '[', '2', 'J', '\0' };
2808 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2809 
2810 	while(1) {
2811 		sleep(enable_stats);
2812 
2813 		/* Clear screen and move to top left */
2814 		printf("%s%s", clr, top_left);
2815 
2816 		printf("\nDevice statistics ====================================");
2817 
2818 		dev_ll = ll_root_used;
2819 		while (dev_ll != NULL) {
2820 			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2821 			tx_total = dev_statistics[device_fh].tx_total;
2822 			tx = dev_statistics[device_fh].tx;
2823 			tx_dropped = tx_total - tx;
2824 			if (zero_copy == 0) {
2825 				rx_total = rte_atomic64_read(
2826 					&dev_statistics[device_fh].rx_total_atomic);
2827 				rx = rte_atomic64_read(
2828 					&dev_statistics[device_fh].rx_atomic);
2829 			} else {
2830 				rx_total = dev_statistics[device_fh].rx_total;
2831 				rx = dev_statistics[device_fh].rx;
2832 			}
2833 			rx_dropped = rx_total - rx;
2834 
2835 			printf("\nStatistics for device %"PRIu32" ------------------------------"
2836 					"\nTX total: 		%"PRIu64""
2837 					"\nTX dropped: 		%"PRIu64""
2838 					"\nTX successful: 		%"PRIu64""
2839 					"\nRX total: 		%"PRIu64""
2840 					"\nRX dropped: 		%"PRIu64""
2841 					"\nRX successful: 		%"PRIu64"",
2842 					device_fh,
2843 					tx_total,
2844 					tx_dropped,
2845 					tx,
2846 					rx_total,
2847 					rx_dropped,
2848 					rx);
2849 
2850 			dev_ll = dev_ll->next;
2851 		}
2852 		printf("\n======================================================\n");
2853 	}
2854 }
2855 
2856 static void
2857 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2858 	char *ring_name, uint32_t nb_mbuf)
2859 {
2860 	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2861 	vpool_array[index].pool
2862 		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2863 		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2864 		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2865 		rte_pktmbuf_init, NULL, socket, 0);
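	/*
	 * The companion ring must have a power-of-two size and an rte_ring can
	 * hold at most (size - 1) entries, hence rte_align32pow2(nb_mbuf + 1)
	 * below so that all nb_mbuf mbufs fit in the ring.
	 */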
2866 	if (vpool_array[index].pool != NULL) {
2867 		vpool_array[index].ring
2868 			= rte_ring_create(ring_name,
2869 				rte_align32pow2(nb_mbuf + 1),
2870 				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2871 		if (likely(vpool_array[index].ring != NULL)) {
2872 			LOG_DEBUG(VHOST_CONFIG,
2873 				"in setup_mempool_tbl: mbuf count in "
2874 				"mempool is: %d\n",
2875 				rte_mempool_count(vpool_array[index].pool));
2876 			LOG_DEBUG(VHOST_CONFIG,
2877 				"in setup_mempool_tbl: mbuf count in "
2878 				"ring   is: %d\n",
2879 				rte_ring_count(vpool_array[index].ring));
2880 		} else {
2881 			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2882 				ring_name);
2883 		}
2884 
2885 		/* Need to account for the headroom. */
2886 		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2887 	} else {
2888 		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2889 	}
2890 }
2891 
2892 
2893 /*
2894  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2895  * device is also registered here to handle the IOCTLs.
2896  */
2897 int
2898 main(int argc, char *argv[])
2899 {
2900 	struct rte_mempool *mbuf_pool = NULL;
2901 	unsigned lcore_id, core_id = 0;
2902 	unsigned nb_ports, valid_num_ports;
2903 	int ret;
2904 	uint8_t portid;
2905 	uint16_t queue_id;
2906 	static pthread_t tid;
2907 
2908 	/* init EAL */
2909 	ret = rte_eal_init(argc, argv);
2910 	if (ret < 0)
2911 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2912 	argc -= ret;
2913 	argv += ret;
2914 
2915 	/* parse app arguments */
2916 	ret = us_vhost_parse_args(argc, argv);
2917 	if (ret < 0)
2918 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
2919 
2920 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2921 		if (rte_lcore_is_enabled(lcore_id))
2922 			lcore_ids[core_id ++] = lcore_id;
2923 
2924 	if (rte_lcore_count() > RTE_MAX_LCORE)
2925 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
2926 
2927 	/* Set the number of switching cores available. */
2928 	num_switching_cores = rte_lcore_count()-1;
2929 
2930 	/* Get the number of physical ports. */
2931 	nb_ports = rte_eth_dev_count();
2932 	if (nb_ports > RTE_MAX_ETHPORTS)
2933 		nb_ports = RTE_MAX_ETHPORTS;
2934 
2935 	/*
2936 	 * Update the global var NUM_PORTS and global array PORTS
2937 	 * and get the value of VALID_NUM_PORTS according to the system port count
2938 	 */
2939 	valid_num_ports = check_ports_num(nb_ports);
2940 
2941 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2942 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2943 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2944 		return -1;
2945 	}
2946 
2947 	if (zero_copy == 0) {
2948 		/* Create the mbuf pool. */
2949 		mbuf_pool = rte_mempool_create(
2950 				"MBUF_POOL",
2951 				NUM_MBUFS_PER_PORT
2952 				* valid_num_ports,
2953 				MBUF_SIZE, MBUF_CACHE_SIZE,
2954 				sizeof(struct rte_pktmbuf_pool_private),
2955 				rte_pktmbuf_pool_init, NULL,
2956 				rte_pktmbuf_init, NULL,
2957 				rte_socket_id(), 0);
2958 		if (mbuf_pool == NULL)
2959 			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2960 
2961 		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2962 			vpool_array[queue_id].pool = mbuf_pool;
2963 
2964 		if (vm2vm_mode == VM2VM_HARDWARE) {
2965 			/* Enable VT loop back to let L2 switch to do it. */
2966 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2967 			LOG_DEBUG(VHOST_CONFIG,
2968 				"Enable loop back for L2 switch in vmdq.\n");
2969 		}
2970 	} else {
2971 		uint32_t nb_mbuf;
2972 		char pool_name[RTE_MEMPOOL_NAMESIZE];
2973 		char ring_name[RTE_MEMPOOL_NAMESIZE];
2974 
2975 		nb_mbuf = num_rx_descriptor
2976 			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2977 			+ num_switching_cores * MAX_PKT_BURST;
2978 
2979 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2980 			snprintf(pool_name, sizeof(pool_name),
2981 				"rxmbuf_pool_%u", queue_id);
2982 			snprintf(ring_name, sizeof(ring_name),
2983 				"rxmbuf_ring_%u", queue_id);
2984 			setup_mempool_tbl(rte_socket_id(), queue_id,
2985 				pool_name, ring_name, nb_mbuf);
2986 		}
2987 
2988 		nb_mbuf = num_tx_descriptor
2989 				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
2990 				+ num_switching_cores * MAX_PKT_BURST;
2991 
2992 		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2993 			snprintf(pool_name, sizeof(pool_name),
2994 				"txmbuf_pool_%u", queue_id);
2995 			snprintf(ring_name, sizeof(ring_name),
2996 				"txmbuf_ring_%u", queue_id);
2997 			setup_mempool_tbl(rte_socket_id(),
2998 				(queue_id + MAX_QUEUES),
2999 				pool_name, ring_name, nb_mbuf);
3000 		}
3001 
3002 		if (vm2vm_mode == VM2VM_HARDWARE) {
3003 			/* Enable VT loop back to let L2 switch to do it. */
3004 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3005 			LOG_DEBUG(VHOST_CONFIG,
3006 				"Enable loop back for L2 switch in vmdq.\n");
3007 		}
3008 	}
3009 	/* Set log level. */
3010 	rte_set_log_level(LOG_LEVEL);
3011 
3012 	/* initialize all ports */
3013 	for (portid = 0; portid < nb_ports; portid++) {
3014 		/* skip ports that are not enabled */
3015 		if ((enabled_port_mask & (1 << portid)) == 0) {
3016 			RTE_LOG(INFO, VHOST_PORT,
3017 				"Skipping disabled port %d\n", portid);
3018 			continue;
3019 		}
3020 		if (port_init(portid) != 0)
3021 			rte_exit(EXIT_FAILURE,
3022 				"Cannot initialize network ports\n");
3023 	}
3024 
3025 	/* Initialise all linked lists. */
3026 	if (init_data_ll() == -1)
3027 		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3028 
3029 	/* Initialize device stats */
3030 	memset(&dev_statistics, 0, sizeof(dev_statistics));
3031 
3032 	/* Enable stats if the user option is set. */
3033 	if (enable_stats)
3034 		pthread_create(&tid, NULL, (void*)print_stats, NULL );
3035 
3036 	/* Launch all data cores. */
3037 	if (zero_copy == 0) {
3038 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3039 			rte_eal_remote_launch(switch_worker,
3040 				mbuf_pool, lcore_id);
3041 		}
3042 	} else {
3043 		uint32_t count_in_mempool, index, i;
3044 		for (index = 0; index < 2*MAX_QUEUES; index++) {
3045 			/* For all RX and TX queues. */
3046 			count_in_mempool
3047 				= rte_mempool_count(vpool_array[index].pool);
3048 
3049 			/*
3050 			 * Transfer all un-attached mbufs from vpool.pool
3051 			 * to vpool.ring.
3052 			 */
3053 			for (i = 0; i < count_in_mempool; i++) {
3054 				struct rte_mbuf *mbuf
3055 					= __rte_mbuf_raw_alloc(
3056 						vpool_array[index].pool);
3057 				rte_ring_sp_enqueue(vpool_array[index].ring,
3058 						(void *)mbuf);
3059 			}
3060 
3061 			LOG_DEBUG(VHOST_CONFIG,
3062 				"in main: mbuf count in mempool at initial "
3063 				"is: %d\n", count_in_mempool);
3064 			LOG_DEBUG(VHOST_CONFIG,
3065 				"in main: mbuf count in  ring at initial  is :"
3066 				" %d\n",
3067 				rte_ring_count(vpool_array[index].ring));
3068 		}
3069 
3070 		RTE_LCORE_FOREACH_SLAVE(lcore_id)
3071 			rte_eal_remote_launch(switch_worker_zcp, NULL,
3072 				lcore_id);
3073 	}
3074 
3075 	if (mergeable == 0)
3076 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3077 
3078 	/* Register CUSE device to handle IOCTLs. */
3079 	ret = rte_vhost_driver_register((char *)&dev_basename);
3080 	if (ret != 0)
3081 		rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3082 
3083 	rte_vhost_driver_callback_register(&virtio_net_device_ops);
3084 
3085 	/* Start CUSE session. */
3086 	rte_vhost_driver_session_start();
3087 	return 0;
3088 
3089 }
3090 
3091