xref: /dpdk/examples/vhost/main.c (revision 72206323a5dd3182b13f61b25a64abdddfee595c)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29 
30 #include "main.h"
31 
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35 
36 #define NUM_MBUFS_DEFAULT 0x24000
37 
38 /* the maximum number of external ports supported */
39 #define MAX_SUP_PORTS 1
40 
41 #define MBUF_CACHE_SIZE	128
42 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
43 
44 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
45 
46 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
47 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
48 
49 #define JUMBO_FRAME_MAX_SIZE    0x2600
50 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
51 
52 /* State of virtio device. */
53 #define DEVICE_MAC_LEARNING 0
54 #define DEVICE_RX			1
55 #define DEVICE_SAFE_REMOVE	2
56 
57 /* Configurable number of RX/TX ring descriptors */
58 #define RTE_TEST_RX_DESC_DEFAULT 1024
59 #define RTE_TEST_TX_DESC_DEFAULT 512
60 
61 #define INVALID_PORT_ID 0xFF
62 #define INVALID_DMA_ID -1
63 
64 #define DMA_RING_SIZE 4096
65 
66 #define ASYNC_ENQUEUE_VHOST 1
67 #define ASYNC_DEQUEUE_VHOST 2
68 
69 /* number of mbufs in all pools - if specified on command-line. */
70 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
71 
72 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
73 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
74 static int dma_count;
75 
76 /* mask of enabled ports */
77 static uint32_t enabled_port_mask = 0;
78 
79 /* Promiscuous mode */
80 static uint32_t promiscuous;
81 
82 /* number of devices/queues to support*/
83 static uint32_t num_queues = 0;
84 static uint32_t num_devices;
85 
86 static struct rte_mempool *mbuf_pool;
87 static int mergeable;
88 
89 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
90 typedef enum {
91 	VM2VM_DISABLED = 0,
92 	VM2VM_SOFTWARE = 1,
93 	VM2VM_HARDWARE = 2,
94 	VM2VM_LAST
95 } vm2vm_type;
96 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
97 
98 /* Enable stats. */
99 static uint32_t enable_stats = 0;
100 /* Enable retries on RX. */
101 static uint32_t enable_retry = 1;
102 
103 /* Disable TX checksum offload */
104 static uint32_t enable_tx_csum;
105 
106 /* Disable TSO offload */
107 static uint32_t enable_tso;
108 
109 static int client_mode;
110 
111 static int builtin_net_driver;
112 
113 /* Specify timeout (in useconds) between retries on RX. */
114 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
115 /* Specify the number of retries on RX. */
116 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
117 
118 /* Socket file paths. Can be set by user */
119 static char *socket_files;
120 static int nb_sockets;
121 
122 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
123 
124 /* empty VMDq configuration structure. Filled in programmatically */
125 static struct rte_eth_conf vmdq_conf_default = {
126 	.rxmode = {
127 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
128 		.split_hdr_size = 0,
129 		/*
130 		 * VLAN strip is necessary for 1G NIC such as I350,
131 		 * this fixes bug of ipv4 forwarding in guest can't
132 		 * forward packets from one virtio dev to another virtio dev.
133 		 */
134 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
135 	},
136 
137 	.txmode = {
138 		.mq_mode = RTE_ETH_MQ_TX_NONE,
139 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
140 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
141 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
142 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
143 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
144 	},
145 	.rx_adv_conf = {
146 		/*
147 		 * should be overridden separately in code with
148 		 * appropriate values
149 		 */
150 		.vmdq_rx_conf = {
151 			.nb_queue_pools = RTE_ETH_8_POOLS,
152 			.enable_default_pool = 0,
153 			.default_pool = 0,
154 			.nb_pool_maps = 0,
155 			.pool_map = {{0, 0},},
156 		},
157 	},
158 };
159 
160 
161 static unsigned lcore_ids[RTE_MAX_LCORE];
162 static uint16_t ports[RTE_MAX_ETHPORTS];
163 static unsigned num_ports = 0; /**< The number of ports specified in command line */
164 static uint16_t num_pf_queues, num_vmdq_queues;
165 static uint16_t vmdq_pool_base, vmdq_queue_base;
166 static uint16_t queues_per_pool;
167 
168 const uint16_t vlan_tags[] = {
169 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
170 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
171 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
172 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
173 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
174 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
175 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
176 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
177 };
178 
179 /* ethernet addresses of ports */
180 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
181 
182 static struct vhost_dev_tailq_list vhost_dev_list =
183 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
184 
185 static struct lcore_info lcore_info[RTE_MAX_LCORE];
186 
187 /* Used for queueing bursts of TX packets. */
188 struct mbuf_table {
189 	unsigned len;
190 	unsigned txq_id;
191 	struct rte_mbuf *m_table[MAX_PKT_BURST];
192 };
193 
194 struct vhost_bufftable {
195 	uint32_t len;
196 	uint64_t pre_tsc;
197 	struct rte_mbuf *m_table[MAX_PKT_BURST];
198 };
199 
200 /* TX queue for each data core. */
201 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
202 
203 /*
204  * Vhost TX buffer for each data core.
205  * Every data core maintains a TX buffer for every vhost device,
206  * which is used for batch pkts enqueue for higher performance.
207  */
208 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
209 
210 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
211 				 / US_PER_S * BURST_TX_DRAIN_US)
212 
213 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
214 
215 static inline uint32_t
216 get_async_flag_by_socketid(int socketid)
217 {
218 	return dma_bind[socketid].async_flag;
219 }
220 
221 static inline void
222 init_vid2socketid_array(int vid, int socketid)
223 {
224 	vid2socketid[vid] = socketid;
225 }
226 
227 static inline bool
228 is_dma_configured(int16_t dev_id)
229 {
230 	int i;
231 
232 	for (i = 0; i < dma_count; i++)
233 		if (dmas_id[i] == dev_id)
234 			return true;
235 	return false;
236 }
237 
238 static inline int
239 open_dma(const char *value)
240 {
241 	struct dma_for_vhost *dma_info = dma_bind;
242 	char *input = strndup(value, strlen(value) + 1);
243 	char *addrs = input;
244 	char *ptrs[2];
245 	char *start, *end, *substr;
246 	int64_t socketid, vring_id;
247 
248 	struct rte_dma_info info;
249 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
250 	struct rte_dma_vchan_conf qconf = {
251 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
252 		.nb_desc = DMA_RING_SIZE
253 	};
254 
255 	int dev_id;
256 	int ret = 0;
257 	uint16_t i = 0;
258 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
259 	int args_nr;
260 
261 	while (isblank(*addrs))
262 		addrs++;
263 	if (*addrs == '\0') {
264 		ret = -1;
265 		goto out;
266 	}
267 
268 	/* process DMA devices within bracket. */
269 	addrs++;
270 	substr = strtok(addrs, ";]");
271 	if (!substr) {
272 		ret = -1;
273 		goto out;
274 	}
275 
276 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
277 	if (args_nr <= 0) {
278 		ret = -1;
279 		goto out;
280 	}
281 
282 	while (i < args_nr) {
283 		char *arg_temp = dma_arg[i];
284 		char *txd, *rxd;
285 		uint8_t sub_nr;
286 		int async_flag;
287 
288 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
289 		if (sub_nr != 2) {
290 			ret = -1;
291 			goto out;
292 		}
293 
294 		txd = strstr(ptrs[0], "txd");
295 		rxd = strstr(ptrs[0], "rxd");
296 		if (txd) {
297 			start = txd;
298 			vring_id = VIRTIO_RXQ;
299 			async_flag = ASYNC_ENQUEUE_VHOST;
300 		} else if (rxd) {
301 			start = rxd;
302 			vring_id = VIRTIO_TXQ;
303 			async_flag = ASYNC_DEQUEUE_VHOST;
304 		} else {
305 			ret = -1;
306 			goto out;
307 		}
308 
309 		start += 3;
310 		socketid = strtol(start, &end, 0);
311 		if (end == start) {
312 			ret = -1;
313 			goto out;
314 		}
315 
316 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
317 		if (dev_id < 0) {
318 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
319 			ret = -1;
320 			goto out;
321 		}
322 
323 		/* DMA device is already configured, so skip */
324 		if (is_dma_configured(dev_id))
325 			goto done;
326 
327 		if (rte_dma_info_get(dev_id, &info) != 0) {
328 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
329 			ret = -1;
330 			goto out;
331 		}
332 
333 		if (info.max_vchans < 1) {
334 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
335 			ret = -1;
336 			goto out;
337 		}
338 
339 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
340 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
341 			ret = -1;
342 			goto out;
343 		}
344 
345 		/* Check the max desc supported by DMA device */
346 		rte_dma_info_get(dev_id, &info);
347 		if (info.nb_vchans != 1) {
348 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
349 					dev_id);
350 			ret = -1;
351 			goto out;
352 		}
353 
354 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
355 
356 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
357 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
358 			ret = -1;
359 			goto out;
360 		}
361 
362 		if (rte_dma_start(dev_id) != 0) {
363 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
364 			ret = -1;
365 			goto out;
366 		}
367 
368 		dmas_id[dma_count++] = dev_id;
369 
370 done:
371 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
372 		(dma_info + socketid)->async_flag |= async_flag;
373 		i++;
374 	}
375 out:
376 	free(input);
377 	return ret;
378 }
379 
380 /*
381  * Builds up the correct configuration for VMDQ VLAN pool map
382  * according to the pool & queue limits.
383  */
384 static inline int
385 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
386 {
387 	struct rte_eth_vmdq_rx_conf conf;
388 	struct rte_eth_vmdq_rx_conf *def_conf =
389 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
390 	unsigned i;
391 
392 	memset(&conf, 0, sizeof(conf));
393 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
394 	conf.nb_pool_maps = num_devices;
395 	conf.enable_loop_back = def_conf->enable_loop_back;
396 	conf.rx_mode = def_conf->rx_mode;
397 
398 	for (i = 0; i < conf.nb_pool_maps; i++) {
399 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
400 		conf.pool_map[i].pools = (1UL << i);
401 	}
402 
403 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
404 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
405 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
406 	return 0;
407 }
408 
409 /*
410  * Initialises a given port using global settings and with the rx buffers
411  * coming from the mbuf_pool passed as parameter
412  */
413 static inline int
414 port_init(uint16_t port)
415 {
416 	struct rte_eth_dev_info dev_info;
417 	struct rte_eth_conf port_conf;
418 	struct rte_eth_rxconf *rxconf;
419 	struct rte_eth_txconf *txconf;
420 	int16_t rx_rings, tx_rings;
421 	uint16_t rx_ring_size, tx_ring_size;
422 	int retval;
423 	uint16_t q;
424 
425 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
426 	retval = rte_eth_dev_info_get(port, &dev_info);
427 	if (retval != 0) {
428 		RTE_LOG(ERR, VHOST_PORT,
429 			"Error during getting device (port %u) info: %s\n",
430 			port, strerror(-retval));
431 
432 		return retval;
433 	}
434 	if (dev_info.max_vmdq_pools == 0) {
435 		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
436 		return -1;
437 	}
438 
439 	rxconf = &dev_info.default_rxconf;
440 	txconf = &dev_info.default_txconf;
441 	rxconf->rx_drop_en = 1;
442 
443 	/*configure the number of supported virtio devices based on VMDQ limits */
444 	num_devices = dev_info.max_vmdq_pools;
445 
446 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
447 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
448 
449 	tx_rings = (uint16_t)rte_lcore_count();
450 
451 	if (mergeable) {
452 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
453 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
454 		else
455 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
456 	}
457 
458 	/* Get port configuration. */
459 	retval = get_eth_conf(&port_conf, num_devices);
460 	if (retval < 0)
461 		return retval;
462 	/* NIC queues are divided into pf queues and vmdq queues.  */
463 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
464 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
465 	num_vmdq_queues = num_devices * queues_per_pool;
466 	num_queues = num_pf_queues + num_vmdq_queues;
467 	vmdq_queue_base = dev_info.vmdq_queue_base;
468 	vmdq_pool_base  = dev_info.vmdq_pool_base;
469 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
470 		num_pf_queues, num_devices, queues_per_pool);
471 
472 	if (!rte_eth_dev_is_valid_port(port))
473 		return -1;
474 
475 	rx_rings = (uint16_t)dev_info.max_rx_queues;
476 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
477 		port_conf.txmode.offloads |=
478 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
479 	/* Configure ethernet device. */
480 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
481 	if (retval != 0) {
482 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
483 			port, strerror(-retval));
484 		return retval;
485 	}
486 
487 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
488 		&tx_ring_size);
489 	if (retval != 0) {
490 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
491 			"for port %u: %s.\n", port, strerror(-retval));
492 		return retval;
493 	}
494 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
495 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
496 			"for Rx queues on port %u.\n", port);
497 		return -1;
498 	}
499 
500 	/* Setup the queues. */
501 	rxconf->offloads = port_conf.rxmode.offloads;
502 	for (q = 0; q < rx_rings; q ++) {
503 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
504 						rte_eth_dev_socket_id(port),
505 						rxconf,
506 						mbuf_pool);
507 		if (retval < 0) {
508 			RTE_LOG(ERR, VHOST_PORT,
509 				"Failed to setup rx queue %u of port %u: %s.\n",
510 				q, port, strerror(-retval));
511 			return retval;
512 		}
513 	}
514 	txconf->offloads = port_conf.txmode.offloads;
515 	for (q = 0; q < tx_rings; q ++) {
516 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
517 						rte_eth_dev_socket_id(port),
518 						txconf);
519 		if (retval < 0) {
520 			RTE_LOG(ERR, VHOST_PORT,
521 				"Failed to setup tx queue %u of port %u: %s.\n",
522 				q, port, strerror(-retval));
523 			return retval;
524 		}
525 	}
526 
527 	/* Start the device. */
528 	retval  = rte_eth_dev_start(port);
529 	if (retval < 0) {
530 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
531 			port, strerror(-retval));
532 		return retval;
533 	}
534 
535 	if (promiscuous) {
536 		retval = rte_eth_promiscuous_enable(port);
537 		if (retval != 0) {
538 			RTE_LOG(ERR, VHOST_PORT,
539 				"Failed to enable promiscuous mode on port %u: %s\n",
540 				port, rte_strerror(-retval));
541 			return retval;
542 		}
543 	}
544 
545 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
546 	if (retval < 0) {
547 		RTE_LOG(ERR, VHOST_PORT,
548 			"Failed to get MAC address on port %u: %s\n",
549 			port, rte_strerror(-retval));
550 		return retval;
551 	}
552 
553 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
554 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
555 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
556 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
557 
558 	return 0;
559 }
560 
561 /*
562  * Set socket file path.
563  */
564 static int
565 us_vhost_parse_socket_path(const char *q_arg)
566 {
567 	char *old;
568 
569 	/* parse number string */
570 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
571 		return -1;
572 
573 	old = socket_files;
574 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
575 	if (socket_files == NULL) {
576 		free(old);
577 		return -1;
578 	}
579 
580 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
581 	nb_sockets++;
582 
583 	return 0;
584 }
585 
586 /*
587  * Parse the portmask provided at run time.
588  */
589 static int
590 parse_portmask(const char *portmask)
591 {
592 	char *end = NULL;
593 	unsigned long pm;
594 
595 	errno = 0;
596 
597 	/* parse hexadecimal string */
598 	pm = strtoul(portmask, &end, 16);
599 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
600 		return 0;
601 
602 	return pm;
603 
604 }
605 
606 /*
607  * Parse num options at run time.
608  */
609 static int
610 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
611 {
612 	char *end = NULL;
613 	unsigned long num;
614 
615 	errno = 0;
616 
617 	/* parse unsigned int string */
618 	num = strtoul(q_arg, &end, 10);
619 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
620 		return -1;
621 
622 	if (num > max_valid_value)
623 		return -1;
624 
625 	return num;
626 
627 }
628 
629 /*
630  * Display usage
631  */
632 static void
633 us_vhost_usage(const char *prgname)
634 {
635 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
636 	"		--vm2vm [0|1|2]\n"
637 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
638 	"		--socket-file <path>\n"
639 	"		-p PORTMASK: Set mask for ports to be used by application\n"
640 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
641 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
642 	"		--rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
643 	"		--rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
644 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
645 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
646 	"		--socket-file: The path of the socket file.\n"
647 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
648 	"		--tso [0|1]: disable/enable TCP segment offload.\n"
649 	"		--client: register a vhost-user socket as client mode.\n"
650 	"		--dmas: register dma channel for specific vhost device.\n"
651 	"		--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
652 	"		--builtin-net-driver: enable simple vhost-user net driver\n",
653 	       prgname);
654 }
655 
656 enum {
657 #define OPT_VM2VM               "vm2vm"
658 	OPT_VM2VM_NUM = 256,
659 #define OPT_RX_RETRY            "rx-retry"
660 	OPT_RX_RETRY_NUM,
661 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
662 	OPT_RX_RETRY_DELAY_NUM,
663 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
664 	OPT_RX_RETRY_NUMB_NUM,
665 #define OPT_MERGEABLE           "mergeable"
666 	OPT_MERGEABLE_NUM,
667 #define OPT_STATS               "stats"
668 	OPT_STATS_NUM,
669 #define OPT_SOCKET_FILE         "socket-file"
670 	OPT_SOCKET_FILE_NUM,
671 #define OPT_TX_CSUM             "tx-csum"
672 	OPT_TX_CSUM_NUM,
673 #define OPT_TSO                 "tso"
674 	OPT_TSO_NUM,
675 #define OPT_CLIENT              "client"
676 	OPT_CLIENT_NUM,
677 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
678 	OPT_BUILTIN_NET_DRIVER_NUM,
679 #define OPT_DMAS                "dmas"
680 	OPT_DMAS_NUM,
681 #define OPT_NUM_MBUFS           "total-num-mbufs"
682 	OPT_NUM_MBUFS_NUM,
683 };
684 
685 /*
686  * Parse the arguments given in the command line of the application.
687  */
688 static int
689 us_vhost_parse_args(int argc, char **argv)
690 {
691 	int opt, ret;
692 	int option_index;
693 	unsigned i;
694 	const char *prgname = argv[0];
695 	static struct option long_option[] = {
696 		{OPT_VM2VM, required_argument,
697 				NULL, OPT_VM2VM_NUM},
698 		{OPT_RX_RETRY, required_argument,
699 				NULL, OPT_RX_RETRY_NUM},
700 		{OPT_RX_RETRY_DELAY, required_argument,
701 				NULL, OPT_RX_RETRY_DELAY_NUM},
702 		{OPT_RX_RETRY_NUMB, required_argument,
703 				NULL, OPT_RX_RETRY_NUMB_NUM},
704 		{OPT_MERGEABLE, required_argument,
705 				NULL, OPT_MERGEABLE_NUM},
706 		{OPT_STATS, required_argument,
707 				NULL, OPT_STATS_NUM},
708 		{OPT_SOCKET_FILE, required_argument,
709 				NULL, OPT_SOCKET_FILE_NUM},
710 		{OPT_TX_CSUM, required_argument,
711 				NULL, OPT_TX_CSUM_NUM},
712 		{OPT_TSO, required_argument,
713 				NULL, OPT_TSO_NUM},
714 		{OPT_CLIENT, no_argument,
715 				NULL, OPT_CLIENT_NUM},
716 		{OPT_BUILTIN_NET_DRIVER, no_argument,
717 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
718 		{OPT_DMAS, required_argument,
719 				NULL, OPT_DMAS_NUM},
720 		{OPT_NUM_MBUFS, required_argument,
721 				NULL, OPT_NUM_MBUFS_NUM},
722 		{NULL, 0, 0, 0},
723 	};
724 
725 	/* Parse command line */
726 	while ((opt = getopt_long(argc, argv, "p:P",
727 			long_option, &option_index)) != EOF) {
728 		switch (opt) {
729 		/* Portmask */
730 		case 'p':
731 			enabled_port_mask = parse_portmask(optarg);
732 			if (enabled_port_mask == 0) {
733 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
734 				us_vhost_usage(prgname);
735 				return -1;
736 			}
737 			break;
738 
739 		case 'P':
740 			promiscuous = 1;
741 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
742 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
743 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
744 			break;
745 
746 		case OPT_VM2VM_NUM:
747 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
748 			if (ret == -1) {
749 				RTE_LOG(INFO, VHOST_CONFIG,
750 					"Invalid argument for "
751 					"vm2vm [0|1|2]\n");
752 				us_vhost_usage(prgname);
753 				return -1;
754 			}
755 			vm2vm_mode = (vm2vm_type)ret;
756 			break;
757 
758 		case OPT_RX_RETRY_NUM:
759 			ret = parse_num_opt(optarg, 1);
760 			if (ret == -1) {
761 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
762 				us_vhost_usage(prgname);
763 				return -1;
764 			}
765 			enable_retry = ret;
766 			break;
767 
768 		case OPT_TX_CSUM_NUM:
769 			ret = parse_num_opt(optarg, 1);
770 			if (ret == -1) {
771 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
772 				us_vhost_usage(prgname);
773 				return -1;
774 			}
775 			enable_tx_csum = ret;
776 			break;
777 
778 		case OPT_TSO_NUM:
779 			ret = parse_num_opt(optarg, 1);
780 			if (ret == -1) {
781 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
782 				us_vhost_usage(prgname);
783 				return -1;
784 			}
785 			enable_tso = ret;
786 			break;
787 
788 		case OPT_RX_RETRY_DELAY_NUM:
789 			ret = parse_num_opt(optarg, INT32_MAX);
790 			if (ret == -1) {
791 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
792 				us_vhost_usage(prgname);
793 				return -1;
794 			}
795 			burst_rx_delay_time = ret;
796 			break;
797 
798 		case OPT_RX_RETRY_NUMB_NUM:
799 			ret = parse_num_opt(optarg, INT32_MAX);
800 			if (ret == -1) {
801 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
802 				us_vhost_usage(prgname);
803 				return -1;
804 			}
805 			burst_rx_retry_num = ret;
806 			break;
807 
808 		case OPT_MERGEABLE_NUM:
809 			ret = parse_num_opt(optarg, 1);
810 			if (ret == -1) {
811 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
812 				us_vhost_usage(prgname);
813 				return -1;
814 			}
815 			mergeable = !!ret;
816 			break;
817 
818 		case OPT_STATS_NUM:
819 			ret = parse_num_opt(optarg, INT32_MAX);
820 			if (ret == -1) {
821 				RTE_LOG(INFO, VHOST_CONFIG,
822 					"Invalid argument for stats [0..N]\n");
823 				us_vhost_usage(prgname);
824 				return -1;
825 			}
826 			enable_stats = ret;
827 			break;
828 
829 		/* Set socket file path. */
830 		case OPT_SOCKET_FILE_NUM:
831 			if (us_vhost_parse_socket_path(optarg) == -1) {
832 				RTE_LOG(INFO, VHOST_CONFIG,
833 				"Invalid argument for socket name (Max %d characters)\n",
834 				PATH_MAX);
835 				us_vhost_usage(prgname);
836 				return -1;
837 			}
838 			break;
839 
840 		case OPT_DMAS_NUM:
841 			if (open_dma(optarg) == -1) {
842 				RTE_LOG(INFO, VHOST_CONFIG,
843 					"Wrong DMA args\n");
844 				us_vhost_usage(prgname);
845 				return -1;
846 			}
847 			break;
848 
849 		case OPT_NUM_MBUFS_NUM:
850 			ret = parse_num_opt(optarg, INT32_MAX);
851 			if (ret == -1) {
852 				RTE_LOG(INFO, VHOST_CONFIG,
853 					"Invalid argument for total-num-mbufs [0..N]\n");
854 				us_vhost_usage(prgname);
855 				return -1;
856 			}
857 
858 			if (total_num_mbufs < ret)
859 				total_num_mbufs = ret;
860 			break;
861 
862 		case OPT_CLIENT_NUM:
863 			client_mode = 1;
864 			break;
865 
866 		case OPT_BUILTIN_NET_DRIVER_NUM:
867 			builtin_net_driver = 1;
868 			break;
869 
870 		/* Invalid option - print options. */
871 		default:
872 			us_vhost_usage(prgname);
873 			return -1;
874 		}
875 	}
876 
877 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
878 		if (enabled_port_mask & (1 << i))
879 			ports[num_ports++] = i;
880 	}
881 
882 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
883 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
884 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
885 		return -1;
886 	}
887 
888 	return 0;
889 }
890 
891 /*
892  * Update the global var NUM_PORTS and array PORTS according to system ports number
893  * and return valid ports number
894  */
895 static unsigned check_ports_num(unsigned nb_ports)
896 {
897 	unsigned valid_num_ports = num_ports;
898 	unsigned portid;
899 
900 	if (num_ports > nb_ports) {
901 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
902 			num_ports, nb_ports);
903 		num_ports = nb_ports;
904 	}
905 
906 	for (portid = 0; portid < num_ports; portid ++) {
907 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
908 			RTE_LOG(INFO, VHOST_PORT,
909 				"\nSpecified port ID(%u) is not valid\n",
910 				ports[portid]);
911 			ports[portid] = INVALID_PORT_ID;
912 			valid_num_ports--;
913 		}
914 	}
915 	return valid_num_ports;
916 }
917 
918 static __rte_always_inline struct vhost_dev *
919 find_vhost_dev(struct rte_ether_addr *mac)
920 {
921 	struct vhost_dev *vdev;
922 
923 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
924 		if (vdev->ready == DEVICE_RX &&
925 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
926 			return vdev;
927 	}
928 
929 	return NULL;
930 }
931 
932 /*
933  * This function learns the MAC address of the device and registers this along with a
934  * vlan tag to a VMDQ.
935  */
936 static int
937 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
938 {
939 	struct rte_ether_hdr *pkt_hdr;
940 	int i, ret;
941 
942 	/* Learn MAC address of guest device from packet */
943 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
944 
945 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
946 		RTE_LOG(ERR, VHOST_DATA,
947 			"(%d) device is using a registered MAC!\n",
948 			vdev->vid);
949 		return -1;
950 	}
951 
952 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
953 		vdev->mac_address.addr_bytes[i] =
954 			pkt_hdr->src_addr.addr_bytes[i];
955 
956 	/* vlan_tag currently uses the device_id. */
957 	vdev->vlan_tag = vlan_tags[vdev->vid];
958 
959 	/* Print out VMDQ registration info. */
960 	RTE_LOG(INFO, VHOST_DATA,
961 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
962 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
963 		vdev->vlan_tag);
964 
965 	/* Register the MAC address. */
966 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
967 				(uint32_t)vdev->vid + vmdq_pool_base);
968 	if (ret)
969 		RTE_LOG(ERR, VHOST_DATA,
970 			"(%d) failed to add device MAC address to VMDQ\n",
971 			vdev->vid);
972 
973 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
974 
975 	/* Set device as ready for RX. */
976 	vdev->ready = DEVICE_RX;
977 
978 	return 0;
979 }
980 
981 /*
982  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
983  * queue before disabling RX on the device.
984  */
985 static inline void
986 unlink_vmdq(struct vhost_dev *vdev)
987 {
988 	unsigned i = 0;
989 	unsigned rx_count;
990 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
991 
992 	if (vdev->ready == DEVICE_RX) {
993 		/*clear MAC and VLAN settings*/
994 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
995 		for (i = 0; i < 6; i++)
996 			vdev->mac_address.addr_bytes[i] = 0;
997 
998 		vdev->vlan_tag = 0;
999 
1000 		/*Clear out the receive buffers*/
1001 		rx_count = rte_eth_rx_burst(ports[0],
1002 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1003 
1004 		while (rx_count) {
1005 			for (i = 0; i < rx_count; i++)
1006 				rte_pktmbuf_free(pkts_burst[i]);
1007 
1008 			rx_count = rte_eth_rx_burst(ports[0],
1009 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1010 		}
1011 
1012 		vdev->ready = DEVICE_MAC_LEARNING;
1013 	}
1014 }
1015 
1016 static inline void
1017 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1018 {
1019 	while (n--)
1020 		rte_pktmbuf_free(pkts[n]);
1021 }
1022 
1023 static __rte_always_inline void
1024 complete_async_pkts(struct vhost_dev *vdev)
1025 {
1026 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1027 	uint16_t complete_count;
1028 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1029 
1030 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1031 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1032 	if (complete_count)
1033 		free_pkts(p_cpl, complete_count);
1034 
1035 }
1036 
1037 static __rte_always_inline void
1038 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1039 	    struct rte_mbuf *m)
1040 {
1041 	uint16_t ret;
1042 
1043 	if (builtin_net_driver) {
1044 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1045 	} else {
1046 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1047 	}
1048 
1049 	if (enable_stats) {
1050 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1051 				__ATOMIC_SEQ_CST);
1052 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1053 				__ATOMIC_SEQ_CST);
1054 		src_vdev->stats.tx_total++;
1055 		src_vdev->stats.tx += ret;
1056 	}
1057 }
1058 
1059 static __rte_always_inline void
1060 drain_vhost(struct vhost_dev *vdev)
1061 {
1062 	uint16_t ret;
1063 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1064 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1065 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1066 
1067 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1068 
1069 	if (enable_stats) {
1070 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1071 				__ATOMIC_SEQ_CST);
1072 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1073 				__ATOMIC_SEQ_CST);
1074 	}
1075 
1076 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1077 		free_pkts(m, nr_xmit);
1078 }
1079 
1080 static __rte_always_inline void
1081 drain_vhost_table(void)
1082 {
1083 	uint16_t lcore_id = rte_lcore_id();
1084 	struct vhost_bufftable *vhost_txq;
1085 	struct vhost_dev *vdev;
1086 	uint64_t cur_tsc;
1087 
1088 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1089 		if (unlikely(vdev->remove == 1))
1090 			continue;
1091 
1092 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1093 
1094 		cur_tsc = rte_rdtsc();
1095 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1096 				> MBUF_TABLE_DRAIN_TSC)) {
1097 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1098 				"Vhost TX queue drained after timeout with burst size %u\n",
1099 				vhost_txq->len);
1100 			drain_vhost(vdev);
1101 			vhost_txq->len = 0;
1102 			vhost_txq->pre_tsc = cur_tsc;
1103 		}
1104 	}
1105 }
1106 
1107 /*
1108  * Check if the packet destination MAC address is for a local device. If so then put
1109  * the packet on that devices RX queue. If not then return.
1110  */
1111 static __rte_always_inline int
1112 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1113 {
1114 	struct rte_ether_hdr *pkt_hdr;
1115 	struct vhost_dev *dst_vdev;
1116 	struct vhost_bufftable *vhost_txq;
1117 	uint16_t lcore_id = rte_lcore_id();
1118 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1119 
1120 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1121 	if (!dst_vdev)
1122 		return -1;
1123 
1124 	if (vdev->vid == dst_vdev->vid) {
1125 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1126 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1127 			vdev->vid);
1128 		return 0;
1129 	}
1130 
1131 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1132 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1133 
1134 	if (unlikely(dst_vdev->remove)) {
1135 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1136 			"(%d) device is marked for removal\n", dst_vdev->vid);
1137 		return 0;
1138 	}
1139 
1140 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1141 	vhost_txq->m_table[vhost_txq->len++] = m;
1142 
1143 	if (enable_stats) {
1144 		vdev->stats.tx_total++;
1145 		vdev->stats.tx++;
1146 	}
1147 
1148 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1149 		drain_vhost(dst_vdev);
1150 		vhost_txq->len = 0;
1151 		vhost_txq->pre_tsc = rte_rdtsc();
1152 	}
1153 	return 0;
1154 }
1155 
1156 /*
1157  * Check if the destination MAC of a packet is one local VM,
1158  * and get its vlan tag, and offset if it is.
1159  */
1160 static __rte_always_inline int
1161 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1162 	uint32_t *offset, uint16_t *vlan_tag)
1163 {
1164 	struct vhost_dev *dst_vdev;
1165 	struct rte_ether_hdr *pkt_hdr =
1166 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1167 
1168 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1169 	if (!dst_vdev)
1170 		return 0;
1171 
1172 	if (vdev->vid == dst_vdev->vid) {
1173 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1174 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1175 			vdev->vid);
1176 		return -1;
1177 	}
1178 
1179 	/*
1180 	 * HW vlan strip will reduce the packet length
1181 	 * by minus length of vlan tag, so need restore
1182 	 * the packet length by plus it.
1183 	 */
1184 	*offset  = RTE_VLAN_HLEN;
1185 	*vlan_tag = vlan_tags[vdev->vid];
1186 
1187 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1188 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1189 		vdev->vid, dst_vdev->vid, *vlan_tag);
1190 
1191 	return 0;
1192 }
1193 
1194 static void virtio_tx_offload(struct rte_mbuf *m)
1195 {
1196 	struct rte_net_hdr_lens hdr_lens;
1197 	struct rte_ipv4_hdr *ipv4_hdr;
1198 	struct rte_tcp_hdr *tcp_hdr;
1199 	uint32_t ptype;
1200 	void *l3_hdr;
1201 
1202 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1203 	m->l2_len = hdr_lens.l2_len;
1204 	m->l3_len = hdr_lens.l3_len;
1205 	m->l4_len = hdr_lens.l4_len;
1206 
1207 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1208 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1209 		m->l2_len + m->l3_len);
1210 
1211 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1212 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1213 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1214 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1215 		ipv4_hdr = l3_hdr;
1216 		ipv4_hdr->hdr_checksum = 0;
1217 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1218 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1219 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1220 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1221 	}
1222 }
1223 
1224 static __rte_always_inline void
1225 do_drain_mbuf_table(struct mbuf_table *tx_q)
1226 {
1227 	uint16_t count;
1228 
1229 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1230 				 tx_q->m_table, tx_q->len);
1231 	if (unlikely(count < tx_q->len))
1232 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1233 
1234 	tx_q->len = 0;
1235 }
1236 
1237 /*
1238  * This function routes the TX packet to the correct interface. This
1239  * may be a local device or the physical port.
1240  */
1241 static __rte_always_inline void
1242 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1243 {
1244 	struct mbuf_table *tx_q;
1245 	unsigned offset = 0;
1246 	const uint16_t lcore_id = rte_lcore_id();
1247 	struct rte_ether_hdr *nh;
1248 
1249 
1250 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1251 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1252 		struct vhost_dev *vdev2;
1253 
1254 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1255 			if (vdev2 != vdev)
1256 				sync_virtio_xmit(vdev2, vdev, m);
1257 		}
1258 		goto queue2nic;
1259 	}
1260 
1261 	/*check if destination is local VM*/
1262 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1263 		return;
1264 
1265 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1266 		if (unlikely(find_local_dest(vdev, m, &offset,
1267 					     &vlan_tag) != 0)) {
1268 			rte_pktmbuf_free(m);
1269 			return;
1270 		}
1271 	}
1272 
1273 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1274 		"(%d) TX: MAC address is external\n", vdev->vid);
1275 
1276 queue2nic:
1277 
1278 	/*Add packet to the port tx queue*/
1279 	tx_q = &lcore_tx_queue[lcore_id];
1280 
1281 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1282 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1283 		/* Guest has inserted the vlan tag. */
1284 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1285 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1286 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1287 			(vh->vlan_tci != vlan_tag_be))
1288 			vh->vlan_tci = vlan_tag_be;
1289 	} else {
1290 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1291 
1292 		/*
1293 		 * Find the right seg to adjust the data len when offset is
1294 		 * bigger than tail room size.
1295 		 */
1296 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1297 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1298 				m->data_len += offset;
1299 			else {
1300 				struct rte_mbuf *seg = m;
1301 
1302 				while ((seg->next != NULL) &&
1303 					(offset > rte_pktmbuf_tailroom(seg)))
1304 					seg = seg->next;
1305 
1306 				seg->data_len += offset;
1307 			}
1308 			m->pkt_len += offset;
1309 		}
1310 
1311 		m->vlan_tci = vlan_tag;
1312 	}
1313 
1314 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1315 		virtio_tx_offload(m);
1316 
1317 	tx_q->m_table[tx_q->len++] = m;
1318 	if (enable_stats) {
1319 		vdev->stats.tx_total++;
1320 		vdev->stats.tx++;
1321 	}
1322 
1323 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1324 		do_drain_mbuf_table(tx_q);
1325 }
1326 
1327 
1328 static __rte_always_inline void
1329 drain_mbuf_table(struct mbuf_table *tx_q)
1330 {
1331 	static uint64_t prev_tsc;
1332 	uint64_t cur_tsc;
1333 
1334 	if (tx_q->len == 0)
1335 		return;
1336 
1337 	cur_tsc = rte_rdtsc();
1338 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1339 		prev_tsc = cur_tsc;
1340 
1341 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1342 			"TX queue drained after timeout with burst size %u\n",
1343 			tx_q->len);
1344 		do_drain_mbuf_table(tx_q);
1345 	}
1346 }
1347 
1348 uint16_t
1349 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1350 		struct rte_mbuf **pkts, uint32_t rx_count)
1351 {
1352 	uint16_t enqueue_count;
1353 	uint16_t enqueue_fail = 0;
1354 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1355 
1356 	complete_async_pkts(dev);
1357 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1358 					pkts, rx_count, dma_id, 0);
1359 
1360 	enqueue_fail = rx_count - enqueue_count;
1361 	if (enqueue_fail)
1362 		free_pkts(&pkts[enqueue_count], enqueue_fail);
1363 
1364 	return enqueue_count;
1365 }
1366 
1367 uint16_t
1368 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1369 		struct rte_mbuf **pkts, uint32_t rx_count)
1370 {
1371 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1372 }
1373 
1374 static __rte_always_inline void
1375 drain_eth_rx(struct vhost_dev *vdev)
1376 {
1377 	uint16_t rx_count, enqueue_count;
1378 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1379 
1380 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1381 				    pkts, MAX_PKT_BURST);
1382 
1383 	if (!rx_count)
1384 		return;
1385 
1386 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1387 						VIRTIO_RXQ, pkts, rx_count);
1388 
1389 	/* Retry if necessary */
1390 	if (enable_retry && unlikely(enqueue_count < rx_count)) {
1391 		uint32_t retry = 0;
1392 
1393 		while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
1394 			rte_delay_us(burst_rx_delay_time);
1395 			enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1396 							VIRTIO_RXQ, &pkts[enqueue_count],
1397 							rx_count - enqueue_count);
1398 		}
1399 	}
1400 
1401 	if (enable_stats) {
1402 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1403 				__ATOMIC_SEQ_CST);
1404 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1405 				__ATOMIC_SEQ_CST);
1406 	}
1407 
1408 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1409 		free_pkts(pkts, rx_count);
1410 }
1411 
1412 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1413 			    struct rte_mempool *mbuf_pool,
1414 			    struct rte_mbuf **pkts, uint16_t count)
1415 {
1416 	int nr_inflight;
1417 	uint16_t dequeue_count;
1418 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1419 
1420 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1421 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1422 
1423 	return dequeue_count;
1424 }
1425 
1426 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1427 			   struct rte_mempool *mbuf_pool,
1428 			   struct rte_mbuf **pkts, uint16_t count)
1429 {
1430 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1431 }
1432 
1433 static __rte_always_inline void
1434 drain_virtio_tx(struct vhost_dev *vdev)
1435 {
1436 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1437 	uint16_t count;
1438 	uint16_t i;
1439 
1440 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1441 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1442 
1443 	/* setup VMDq for the first packet */
1444 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1445 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1446 			free_pkts(pkts, count);
1447 	}
1448 
1449 	for (i = 0; i < count; ++i)
1450 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1451 }
1452 
1453 /*
1454  * Main function of vhost-switch. It basically does:
1455  *
1456  * for each vhost device {
1457  *    - drain_eth_rx()
1458  *
1459  *      Which drains the host eth Rx queue linked to the vhost device,
1460  *      and deliver all of them to guest virito Rx ring associated with
1461  *      this vhost device.
1462  *
1463  *    - drain_virtio_tx()
1464  *
1465  *      Which drains the guest virtio Tx queue and deliver all of them
1466  *      to the target, which could be another vhost device, or the
1467  *      physical eth dev. The route is done in function "virtio_tx_route".
1468  * }
1469  */
1470 static int
1471 switch_worker(void *arg __rte_unused)
1472 {
1473 	unsigned i;
1474 	unsigned lcore_id = rte_lcore_id();
1475 	struct vhost_dev *vdev;
1476 	struct mbuf_table *tx_q;
1477 
1478 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1479 
1480 	tx_q = &lcore_tx_queue[lcore_id];
1481 	for (i = 0; i < rte_lcore_count(); i++) {
1482 		if (lcore_ids[i] == lcore_id) {
1483 			tx_q->txq_id = i;
1484 			break;
1485 		}
1486 	}
1487 
1488 	while(1) {
1489 		drain_mbuf_table(tx_q);
1490 		drain_vhost_table();
1491 		/*
1492 		 * Inform the configuration core that we have exited the
1493 		 * linked list and that no devices are in use if requested.
1494 		 */
1495 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1496 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1497 
1498 		/*
1499 		 * Process vhost devices
1500 		 */
1501 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1502 			      lcore_vdev_entry) {
1503 			if (unlikely(vdev->remove)) {
1504 				unlink_vmdq(vdev);
1505 				vdev->ready = DEVICE_SAFE_REMOVE;
1506 				continue;
1507 			}
1508 
1509 			if (likely(vdev->ready == DEVICE_RX))
1510 				drain_eth_rx(vdev);
1511 
1512 			if (likely(!vdev->remove))
1513 				drain_virtio_tx(vdev);
1514 		}
1515 	}
1516 
1517 	return 0;
1518 }
1519 
1520 static void
1521 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1522 {
1523 	uint16_t n_pkt = 0;
1524 	int pkts_inflight;
1525 
1526 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1527 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1528 
1529 	struct rte_mbuf *m_cpl[pkts_inflight];
1530 
1531 	while (pkts_inflight) {
1532 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1533 							pkts_inflight, dma_id, 0);
1534 		free_pkts(m_cpl, n_pkt);
1535 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1536 									queue_id);
1537 	}
1538 }
1539 
1540 static void
1541 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
1542 {
1543 	uint16_t n_pkt = 0;
1544 	int pkts_inflight;
1545 
1546 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1547 	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1548 
1549 	struct rte_mbuf *m_cpl[pkts_inflight];
1550 
1551 	while (pkts_inflight) {
1552 		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
1553 						pkts_inflight, dma_id, 0);
1554 		free_pkts(m_cpl, n_pkt);
1555 		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1556 	}
1557 }
1558 
1559 /*
1560  * Remove a device from the specific data core linked list and from the
1561  * main linked list. Synchronization  occurs through the use of the
1562  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1563  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1564  */
1565 static void
1566 destroy_device(int vid)
1567 {
1568 	struct vhost_dev *vdev = NULL;
1569 	int lcore;
1570 	uint16_t i;
1571 
1572 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1573 		if (vdev->vid == vid)
1574 			break;
1575 	}
1576 	if (!vdev)
1577 		return;
1578 	/*set the remove flag. */
1579 	vdev->remove = 1;
1580 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1581 		rte_pause();
1582 	}
1583 
1584 	for (i = 0; i < RTE_MAX_LCORE; i++)
1585 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1586 
1587 	if (builtin_net_driver)
1588 		vs_vhost_net_remove(vdev);
1589 
1590 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1591 		     lcore_vdev_entry);
1592 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1593 
1594 
1595 	/* Set the dev_removal_flag on each lcore. */
1596 	RTE_LCORE_FOREACH_WORKER(lcore)
1597 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1598 
1599 	/*
1600 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1601 	 * we can be sure that they can no longer access the device removed
1602 	 * from the linked lists and that the devices are no longer in use.
1603 	 */
1604 	RTE_LCORE_FOREACH_WORKER(lcore) {
1605 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1606 			rte_pause();
1607 	}
1608 
1609 	lcore_info[vdev->coreid].device_num--;
1610 
1611 	RTE_LOG(INFO, VHOST_DATA,
1612 		"(%d) device has been removed from data core\n",
1613 		vdev->vid);
1614 
1615 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1616 		vhost_clear_queue(vdev, VIRTIO_RXQ);
1617 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1618 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1619 	}
1620 
1621 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1622 		vhost_clear_queue(vdev, VIRTIO_TXQ);
1623 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1624 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1625 	}
1626 
1627 	rte_free(vdev);
1628 }
1629 
1630 static inline int
1631 get_socketid_by_vid(int vid)
1632 {
1633 	int i;
1634 	char ifname[PATH_MAX];
1635 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1636 
1637 	for (i = 0; i < nb_sockets; i++) {
1638 		char *file = socket_files + i * PATH_MAX;
1639 		if (strcmp(file, ifname) == 0)
1640 			return i;
1641 	}
1642 
1643 	return -1;
1644 }
1645 
1646 static int
1647 init_vhost_queue_ops(int vid)
1648 {
1649 	if (builtin_net_driver) {
1650 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1651 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1652 	} else {
1653 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1654 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1655 		else
1656 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1657 
1658 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1659 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1660 		else
1661 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1662 	}
1663 
1664 	return 0;
1665 }
1666 
1667 static inline int
1668 vhost_async_channel_register(int vid)
1669 {
1670 	int rx_ret = 0, tx_ret = 0;
1671 
1672 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1673 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1674 		if (rx_ret == 0)
1675 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1676 	}
1677 
1678 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1679 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1680 		if (tx_ret == 0)
1681 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1682 	}
1683 
1684 	return rx_ret | tx_ret;
1685 }
1686 
1687 
1688 
1689 /*
1690  * A new device is added to a data core. First the device is added to the main linked list
1691  * and then allocated to a specific data core.
1692  */
1693 static int
1694 new_device(int vid)
1695 {
1696 	int lcore, core_add = 0;
1697 	uint16_t i;
1698 	uint32_t device_num_min = num_devices;
1699 	struct vhost_dev *vdev;
1700 	int ret;
1701 
1702 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1703 	if (vdev == NULL) {
1704 		RTE_LOG(INFO, VHOST_DATA,
1705 			"(%d) couldn't allocate memory for vhost dev\n",
1706 			vid);
1707 		return -1;
1708 	}
1709 	vdev->vid = vid;
1710 
1711 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1712 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1713 			= rte_zmalloc("vhost bufftable",
1714 				sizeof(struct vhost_bufftable),
1715 				RTE_CACHE_LINE_SIZE);
1716 
1717 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1718 			RTE_LOG(INFO, VHOST_DATA,
1719 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1720 			return -1;
1721 		}
1722 	}
1723 
1724 	int socketid = get_socketid_by_vid(vid);
1725 	if (socketid == -1)
1726 		return -1;
1727 
1728 	init_vid2socketid_array(vid, socketid);
1729 
1730 	ret =  vhost_async_channel_register(vid);
1731 
1732 	if (init_vhost_queue_ops(vid) != 0)
1733 		return -1;
1734 
1735 	if (builtin_net_driver)
1736 		vs_vhost_net_setup(vdev);
1737 
1738 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1739 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1740 
1741 	/*reset ready flag*/
1742 	vdev->ready = DEVICE_MAC_LEARNING;
1743 	vdev->remove = 0;
1744 
1745 	/* Find a suitable lcore to add the device. */
1746 	RTE_LCORE_FOREACH_WORKER(lcore) {
1747 		if (lcore_info[lcore].device_num < device_num_min) {
1748 			device_num_min = lcore_info[lcore].device_num;
1749 			core_add = lcore;
1750 		}
1751 	}
1752 	vdev->coreid = core_add;
1753 
1754 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1755 			  lcore_vdev_entry);
1756 	lcore_info[vdev->coreid].device_num++;
1757 
1758 	/* Disable notifications. */
1759 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1760 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1761 
1762 	RTE_LOG(INFO, VHOST_DATA,
1763 		"(%d) device has been added to data core %d\n",
1764 		vid, vdev->coreid);
1765 
1766 	return ret;
1767 }
1768 
1769 static int
1770 vring_state_changed(int vid, uint16_t queue_id, int enable)
1771 {
1772 	struct vhost_dev *vdev = NULL;
1773 
1774 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1775 		if (vdev->vid == vid)
1776 			break;
1777 	}
1778 	if (!vdev)
1779 		return -1;
1780 
1781 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1782 		if (!enable)
1783 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1784 	}
1785 
1786 	return 0;
1787 }
1788 
1789 /*
1790  * These callback allow devices to be added to the data core when configuration
1791  * has been fully complete.
1792  */
1793 static const struct rte_vhost_device_ops virtio_net_device_ops =
1794 {
1795 	.new_device =  new_device,
1796 	.destroy_device = destroy_device,
1797 	.vring_state_changed = vring_state_changed,
1798 };
1799 
1800 /*
1801  * This is a thread will wake up after a period to print stats if the user has
1802  * enabled them.
1803  */
1804 static void *
1805 print_stats(__rte_unused void *arg)
1806 {
1807 	struct vhost_dev *vdev;
1808 	uint64_t tx_dropped, rx_dropped;
1809 	uint64_t tx, tx_total, rx, rx_total;
1810 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1811 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1812 
1813 	while(1) {
1814 		sleep(enable_stats);
1815 
1816 		/* Clear screen and move to top left */
1817 		printf("%s%s\n", clr, top_left);
1818 		printf("Device statistics =================================\n");
1819 
1820 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1821 			tx_total   = vdev->stats.tx_total;
1822 			tx         = vdev->stats.tx;
1823 			tx_dropped = tx_total - tx;
1824 
1825 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1826 				__ATOMIC_SEQ_CST);
1827 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1828 				__ATOMIC_SEQ_CST);
1829 			rx_dropped = rx_total - rx;
1830 
1831 			printf("Statistics for device %d\n"
1832 				"-----------------------\n"
1833 				"TX total:              %" PRIu64 "\n"
1834 				"TX dropped:            %" PRIu64 "\n"
1835 				"TX successful:         %" PRIu64 "\n"
1836 				"RX total:              %" PRIu64 "\n"
1837 				"RX dropped:            %" PRIu64 "\n"
1838 				"RX successful:         %" PRIu64 "\n",
1839 				vdev->vid,
1840 				tx_total, tx_dropped, tx,
1841 				rx_total, rx_dropped, rx);
1842 		}
1843 
1844 		printf("===================================================\n");
1845 
1846 		fflush(stdout);
1847 	}
1848 
1849 	return NULL;
1850 }
1851 
1852 static void
1853 unregister_drivers(int socket_num)
1854 {
1855 	int i, ret;
1856 
1857 	for (i = 0; i < socket_num; i++) {
1858 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1859 		if (ret != 0)
1860 			RTE_LOG(ERR, VHOST_CONFIG,
1861 				"Fail to unregister vhost driver for %s.\n",
1862 				socket_files + i * PATH_MAX);
1863 	}
1864 }
1865 
1866 /* When we receive a INT signal, unregister vhost driver */
1867 static void
1868 sigint_handler(__rte_unused int signum)
1869 {
1870 	/* Unregister vhost driver. */
1871 	unregister_drivers(nb_sockets);
1872 
1873 	exit(0);
1874 }
1875 
1876 static void
1877 reset_dma(void)
1878 {
1879 	int i;
1880 
1881 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1882 		int j;
1883 
1884 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1885 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1886 			dma_bind[i].dmas[j].async_enabled = false;
1887 		}
1888 	}
1889 
1890 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1891 		dmas_id[i] = INVALID_DMA_ID;
1892 }
1893 
1894 /*
1895  * Main function, does initialisation and calls the per-lcore functions.
1896  */
1897 int
1898 main(int argc, char *argv[])
1899 {
1900 	unsigned lcore_id, core_id = 0;
1901 	unsigned nb_ports, valid_num_ports;
1902 	int ret, i;
1903 	uint16_t portid;
1904 	static pthread_t tid;
1905 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1906 
1907 	signal(SIGINT, sigint_handler);
1908 
1909 	/* init EAL */
1910 	ret = rte_eal_init(argc, argv);
1911 	if (ret < 0)
1912 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1913 	argc -= ret;
1914 	argv += ret;
1915 
1916 	/* initialize dma structures */
1917 	reset_dma();
1918 
1919 	/* parse app arguments */
1920 	ret = us_vhost_parse_args(argc, argv);
1921 	if (ret < 0)
1922 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1923 
1924 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1925 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1926 
1927 		if (rte_lcore_is_enabled(lcore_id))
1928 			lcore_ids[core_id++] = lcore_id;
1929 	}
1930 
1931 	if (rte_lcore_count() > RTE_MAX_LCORE)
1932 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1933 
1934 	/* Get the number of physical ports. */
1935 	nb_ports = rte_eth_dev_count_avail();
1936 
1937 	/*
1938 	 * Update the global var NUM_PORTS and global array PORTS
1939 	 * and get value of var VALID_NUM_PORTS according to system ports number
1940 	 */
1941 	valid_num_ports = check_ports_num(nb_ports);
1942 
1943 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1944 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1945 			"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
1946 		return -1;
1947 	}
1948 
1949 	/*
1950 	 * FIXME: here we are trying to allocate mbufs big enough for
1951 	 * @MAX_QUEUES, but the truth is we're never going to use that
1952 	 * many queues here. We probably should only do allocation for
1953 	 * those queues we are going to use.
1954 	 */
1955 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1956 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1957 					    rte_socket_id());
1958 	if (mbuf_pool == NULL)
1959 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1960 
1961 	if (vm2vm_mode == VM2VM_HARDWARE) {
1962 		/* Enable VT loop back to let L2 switch to do it. */
1963 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1964 		RTE_LOG(DEBUG, VHOST_CONFIG,
1965 			"Enable loop back for L2 switch in vmdq.\n");
1966 	}
1967 
1968 	/* initialize all ports */
1969 	RTE_ETH_FOREACH_DEV(portid) {
1970 		/* skip ports that are not enabled */
1971 		if ((enabled_port_mask & (1 << portid)) == 0) {
1972 			RTE_LOG(INFO, VHOST_PORT,
1973 				"Skipping disabled port %d\n", portid);
1974 			continue;
1975 		}
1976 		if (port_init(portid) != 0)
1977 			rte_exit(EXIT_FAILURE,
1978 				"Cannot initialize network ports\n");
1979 	}
1980 
1981 	/* Enable stats if the user option is set. */
1982 	if (enable_stats) {
1983 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1984 					print_stats, NULL);
1985 		if (ret < 0)
1986 			rte_exit(EXIT_FAILURE,
1987 				"Cannot create print-stats thread\n");
1988 	}
1989 
1990 	/* Launch all data cores. */
1991 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1992 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1993 
1994 	if (client_mode)
1995 		flags |= RTE_VHOST_USER_CLIENT;
1996 
1997 	for (i = 0; i < dma_count; i++) {
1998 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1999 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
2000 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2001 		}
2002 	}
2003 
2004 	/* Register vhost user driver to handle vhost messages. */
2005 	for (i = 0; i < nb_sockets; i++) {
2006 		char *file = socket_files + i * PATH_MAX;
2007 
2008 		if (dma_count && get_async_flag_by_socketid(i) != 0)
2009 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
2010 
2011 		ret = rte_vhost_driver_register(file, flags);
2012 		if (ret != 0) {
2013 			unregister_drivers(i);
2014 			rte_exit(EXIT_FAILURE,
2015 				"vhost driver register failure.\n");
2016 		}
2017 
2018 		if (builtin_net_driver)
2019 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2020 
2021 		if (mergeable == 0) {
2022 			rte_vhost_driver_disable_features(file,
2023 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2024 		}
2025 
2026 		if (enable_tx_csum == 0) {
2027 			rte_vhost_driver_disable_features(file,
2028 				1ULL << VIRTIO_NET_F_CSUM);
2029 		}
2030 
2031 		if (enable_tso == 0) {
2032 			rte_vhost_driver_disable_features(file,
2033 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2034 			rte_vhost_driver_disable_features(file,
2035 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2036 			rte_vhost_driver_disable_features(file,
2037 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2038 			rte_vhost_driver_disable_features(file,
2039 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2040 		}
2041 
2042 		if (promiscuous) {
2043 			rte_vhost_driver_enable_features(file,
2044 				1ULL << VIRTIO_NET_F_CTRL_RX);
2045 		}
2046 
2047 		ret = rte_vhost_driver_callback_register(file,
2048 			&virtio_net_device_ops);
2049 		if (ret != 0) {
2050 			rte_exit(EXIT_FAILURE,
2051 				"failed to register vhost driver callbacks.\n");
2052 		}
2053 
2054 		if (rte_vhost_driver_start(file) < 0) {
2055 			rte_exit(EXIT_FAILURE,
2056 				"failed to start vhost driver.\n");
2057 		}
2058 	}
2059 
2060 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2061 		rte_eal_wait_lcore(lcore_id);
2062 
2063 	/* clean up the EAL */
2064 	rte_eal_cleanup();
2065 
2066 	return 0;
2067 }
2068