xref: /dpdk/examples/vhost/main.c (revision f9dfb59edbccae50e7c5508348aa2b4b84413048)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <ctype.h>
6 #include <arpa/inet.h>
7 #include <getopt.h>
8 #include <linux/if_ether.h>
9 #include <linux/if_vlan.h>
10 #include <linux/virtio_net.h>
11 #include <linux/virtio_ring.h>
12 #include <signal.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <sys/eventfd.h>
16 #include <sys/param.h>
17 #include <unistd.h>
18 
19 #include <rte_cycles.h>
20 #include <rte_ethdev.h>
21 #include <rte_log.h>
22 #include <rte_string_fns.h>
23 #include <rte_malloc.h>
24 #include <rte_net.h>
25 #include <rte_vhost.h>
26 #include <rte_ip.h>
27 #include <rte_tcp.h>
28 #include <rte_pause.h>
29 #include <rte_dmadev.h>
30 #include <rte_vhost_async.h>
31 
32 #include "main.h"
33 
34 #ifndef MAX_QUEUES
35 #define MAX_QUEUES 128
36 #endif
37 
38 #define NUM_MBUFS_DEFAULT 0x24000
39 
40 /* the maximum number of external ports supported */
41 #define MAX_SUP_PORTS 1
42 
43 #define MBUF_CACHE_SIZE	128
44 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
45 
46 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
47 
48 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
49 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
50 
51 #define JUMBO_FRAME_MAX_SIZE    0x2600
52 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
53 
54 /* State of virtio device. */
55 #define DEVICE_MAC_LEARNING 0
56 #define DEVICE_RX			1
57 #define DEVICE_SAFE_REMOVE	2
58 
59 /* Configurable number of RX/TX ring descriptors */
60 #define RX_DESC_DEFAULT 1024
61 #define TX_DESC_DEFAULT 512
62 
63 #define INVALID_PORT_ID 0xFF
64 #define INVALID_DMA_ID -1
65 
66 #define DMA_RING_SIZE 4096
67 
68 #define ASYNC_ENQUEUE_VHOST 1
69 #define ASYNC_DEQUEUE_VHOST 2
70 
71 /* number of mbufs in all pools; can be overridden on the command line. */
72 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
73 
74 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
75 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
76 static int dma_count;
77 
78 /* mask of enabled ports */
79 static uint32_t enabled_port_mask = 0;
80 
81 /* Promiscuous mode */
82 static uint32_t promiscuous;
83 
84 /* number of devices/queues to support */
85 static uint32_t num_queues = 0;
86 static uint32_t num_devices;
87 
88 static struct rte_mempool *mbuf_pool;
89 static int mergeable;
90 
91 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
92 typedef enum {
93 	VM2VM_DISABLED = 0,
94 	VM2VM_SOFTWARE = 1,
95 	VM2VM_HARDWARE = 2,
96 	VM2VM_LAST
97 } vm2vm_type;
98 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
99 
100 /* Enable stats. */
101 static uint32_t enable_stats = 0;
102 /* Enable retries on RX. */
103 static uint32_t enable_retry = 1;
104 
105 /* Disable TX checksum offload */
106 static uint32_t enable_tx_csum;
107 
108 /* Disable TSO offload */
109 static uint32_t enable_tso;
110 
111 static int client_mode;
112 
113 static int builtin_net_driver;
114 
115 /* Specify timeout (in microseconds) between retries on RX. */
116 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
117 /* Specify the number of retries on RX. */
118 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
119 
120 /* Socket file paths. Can be set by user */
121 static char *socket_files;
122 static int nb_sockets;
123 
124 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
125 
126 /* empty VMDq configuration structure. Filled in programmatically */
127 static struct rte_eth_conf vmdq_conf_default = {
128 	.rxmode = {
129 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
130 		/*
131 		 * VLAN strip is necessary for 1G NICs such as I350;
132 		 * it fixes a bug where IPv4 forwarding in the guest cannot
133 		 * forward packets from one virtio dev to another.
134 		 */
135 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
136 	},
137 
138 	.txmode = {
139 		.mq_mode = RTE_ETH_MQ_TX_NONE,
140 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
141 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
142 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
143 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
144 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
145 	},
146 	.rx_adv_conf = {
147 		/*
148 		 * should be overridden separately in code with
149 		 * appropriate values
150 		 */
151 		.vmdq_rx_conf = {
152 			.nb_queue_pools = RTE_ETH_8_POOLS,
153 			.enable_default_pool = 0,
154 			.default_pool = 0,
155 			.nb_pool_maps = 0,
156 			.pool_map = {{0, 0},},
157 		},
158 	},
159 };
160 
161 
162 static unsigned lcore_ids[RTE_MAX_LCORE];
163 static uint16_t ports[RTE_MAX_ETHPORTS];
164 static unsigned num_ports = 0; /**< The number of ports specified in command line */
165 static uint16_t num_pf_queues, num_vmdq_queues;
166 static uint16_t vmdq_pool_base, vmdq_queue_base;
167 static uint16_t queues_per_pool;
168 
169 const uint16_t vlan_tags[] = {
170 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
171 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
172 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
173 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
174 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
175 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
176 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
177 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
178 };
179 
180 /* ethernet addresses of ports */
181 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
182 
183 static struct vhost_dev_tailq_list vhost_dev_list =
184 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
185 
186 static struct lcore_info lcore_info[RTE_MAX_LCORE];
187 
188 /* Used for queueing bursts of TX packets. */
189 struct mbuf_table {
190 	unsigned len;
191 	unsigned txq_id;
192 	struct rte_mbuf *m_table[MAX_PKT_BURST];
193 };
194 
195 struct vhost_bufftable {
196 	uint32_t len;
197 	uint64_t pre_tsc;
198 	struct rte_mbuf *m_table[MAX_PKT_BURST];
199 };
200 
201 /* TX queue for each data core. */
202 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
203 
204 /*
205  * Vhost TX buffer for each data core.
206  * Every data core maintains a TX buffer for every vhost device,
207  * which is used for batch pkts enqueue for higher performance.
208  */
209 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
210 
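/*
 * TSC ticks corresponding to BURST_TX_DRAIN_US: the per-microsecond tick
 * count is rounded up before being scaled by the drain interval.
 */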
211 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
212 				 / US_PER_S * BURST_TX_DRAIN_US)
213 
214 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
215 
216 static inline uint32_t
217 get_async_flag_by_socketid(int socketid)
218 {
219 	return dma_bind[socketid].async_flag;
220 }
221 
222 static inline void
223 init_vid2socketid_array(int vid, int socketid)
224 {
225 	vid2socketid[vid] = socketid;
226 }
227 
228 static inline bool
229 is_dma_configured(int16_t dev_id)
230 {
231 	int i;
232 
233 	for (i = 0; i < dma_count; i++)
234 		if (dmas_id[i] == dev_id)
235 			return true;
236 	return false;
237 }
238 
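/*
 * Parse one "--dmas" argument and set up the DMA devices it names.
 * As parsed below, the expected format is
 *   [txd<N>@<dma-dev>,rxd<N>@<dma-dev>,...]
 * where txd<N> binds a DMA channel to the enqueue (virtio Rx) path of
 * vhost socket <N> and rxd<N> binds one to the dequeue (virtio Tx) path.
 */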
239 static inline int
240 open_dma(const char *value)
241 {
242 	struct dma_for_vhost *dma_info = dma_bind;
243 	char *input = strndup(value, strlen(value) + 1);
244 	char *addrs = input;
245 	char *ptrs[2];
246 	char *start, *end, *substr;
247 	int64_t socketid, vring_id;
248 
249 	struct rte_dma_info info;
250 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
251 	struct rte_dma_vchan_conf qconf = {
252 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
253 		.nb_desc = DMA_RING_SIZE
254 	};
255 
256 	int dev_id;
257 	int ret = 0;
258 	uint16_t i = 0;
259 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
260 	int args_nr;
261 
262 	while (isblank(*addrs))
263 		addrs++;
264 	if (*addrs == '\0') {
265 		ret = -1;
266 		goto out;
267 	}
268 
269 	/* process DMA devices within bracket. */
270 	addrs++;
271 	substr = strtok(addrs, ";]");
272 	if (!substr) {
273 		ret = -1;
274 		goto out;
275 	}
276 
277 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
278 	if (args_nr <= 0) {
279 		ret = -1;
280 		goto out;
281 	}
282 
283 	while (i < args_nr) {
284 		char *arg_temp = dma_arg[i];
285 		char *txd, *rxd;
286 		uint8_t sub_nr;
287 		int async_flag;
288 
289 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
290 		if (sub_nr != 2) {
291 			ret = -1;
292 			goto out;
293 		}
294 
295 		txd = strstr(ptrs[0], "txd");
296 		rxd = strstr(ptrs[0], "rxd");
297 		if (txd) {
298 			start = txd;
299 			vring_id = VIRTIO_RXQ;
300 			async_flag = ASYNC_ENQUEUE_VHOST;
301 		} else if (rxd) {
302 			start = rxd;
303 			vring_id = VIRTIO_TXQ;
304 			async_flag = ASYNC_DEQUEUE_VHOST;
305 		} else {
306 			ret = -1;
307 			goto out;
308 		}
309 
310 		start += 3;
311 		socketid = strtol(start, &end, 0);
312 		if (end == start) {
313 			ret = -1;
314 			goto out;
315 		}
316 
317 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
318 		if (dev_id < 0) {
319 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
320 			ret = -1;
321 			goto out;
322 		}
323 
324 		/* DMA device is already configured, so skip */
325 		if (is_dma_configured(dev_id))
326 			goto done;
327 
328 		if (rte_dma_info_get(dev_id, &info) != 0) {
329 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
330 			ret = -1;
331 			goto out;
332 		}
333 
334 		if (info.max_vchans < 1) {
335 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
336 			ret = -1;
337 			goto out;
338 		}
339 
340 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
341 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
342 			ret = -1;
343 			goto out;
344 		}
345 
346 		/* Check the max desc supported by DMA device */
347 		rte_dma_info_get(dev_id, &info);
348 		if (info.nb_vchans != 1) {
349 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
350 					dev_id);
351 			ret = -1;
352 			goto out;
353 		}
354 
355 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
356 
357 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
358 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
359 			ret = -1;
360 			goto out;
361 		}
362 
363 		if (rte_dma_start(dev_id) != 0) {
364 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
365 			ret = -1;
366 			goto out;
367 		}
368 
369 		dmas_id[dma_count++] = dev_id;
370 
371 done:
372 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
373 		(dma_info + socketid)->async_flag |= async_flag;
374 		i++;
375 	}
376 out:
377 	free(input);
378 	return ret;
379 }
380 
381 /*
382  * Builds up the correct configuration for VMDQ VLAN pool map
383  * according to the pool & queue limits.
384  */
385 static inline int
386 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
387 {
388 	struct rte_eth_vmdq_rx_conf conf;
389 	struct rte_eth_vmdq_rx_conf *def_conf =
390 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
391 	unsigned i;
392 
393 	memset(&conf, 0, sizeof(conf));
394 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
395 	conf.nb_pool_maps = num_devices;
396 	conf.enable_loop_back = def_conf->enable_loop_back;
397 	conf.rx_mode = def_conf->rx_mode;
398 
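	/* Map vlan_tags[i] to VMDq pool i: one pool (and VLAN) per virtio device. */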
399 	for (i = 0; i < conf.nb_pool_maps; i++) {
400 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
401 		conf.pool_map[i].pools = (1UL << i);
402 	}
403 
404 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
405 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
406 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
407 	return 0;
408 }
409 
410 /*
411  * Initialises a given port using global settings, with the Rx buffers
412  * coming from the global mbuf_pool
413  */
414 static inline int
415 port_init(uint16_t port)
416 {
417 	struct rte_eth_dev_info dev_info;
418 	struct rte_eth_conf port_conf;
419 	struct rte_eth_rxconf *rxconf;
420 	struct rte_eth_txconf *txconf;
421 	int16_t rx_rings, tx_rings;
422 	uint16_t rx_ring_size, tx_ring_size;
423 	int retval;
424 	uint16_t q;
425 
426 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
427 	retval = rte_eth_dev_info_get(port, &dev_info);
428 	if (retval != 0) {
429 		RTE_LOG(ERR, VHOST_PORT,
430 			"Error during getting device (port %u) info: %s\n",
431 			port, strerror(-retval));
432 
433 		return retval;
434 	}
435 	if (dev_info.max_vmdq_pools == 0) {
436 		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
437 		return -1;
438 	}
439 
440 	rxconf = &dev_info.default_rxconf;
441 	txconf = &dev_info.default_txconf;
442 	rxconf->rx_drop_en = 1;
443 
444 	/* configure the number of supported virtio devices based on VMDq limits */
445 	num_devices = dev_info.max_vmdq_pools;
446 
447 	rx_ring_size = RX_DESC_DEFAULT;
448 	tx_ring_size = TX_DESC_DEFAULT;
449 
450 	tx_rings = (uint16_t)rte_lcore_count();
451 
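	/*
	 * With mergeable Rx buffers enabled, raise the MTU so jumbo frames
	 * can be received, capped by the device's reported maximum MTU.
	 */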
452 	if (mergeable) {
453 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
454 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
455 		else
456 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
457 	}
458 
459 	/* Get port configuration. */
460 	retval = get_eth_conf(&port_conf, num_devices);
461 	if (retval < 0)
462 		return retval;
463 	/* NIC queues are divided into pf queues and vmdq queues.  */
464 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
465 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
466 	num_vmdq_queues = num_devices * queues_per_pool;
467 	num_queues = num_pf_queues + num_vmdq_queues;
468 	vmdq_queue_base = dev_info.vmdq_queue_base;
469 	vmdq_pool_base  = dev_info.vmdq_pool_base;
470 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
471 		num_pf_queues, num_devices, queues_per_pool);
472 
473 	if (!rte_eth_dev_is_valid_port(port))
474 		return -1;
475 
476 	rx_rings = (uint16_t)dev_info.max_rx_queues;
477 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
478 		port_conf.txmode.offloads |=
479 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
480 	/* Configure ethernet device. */
481 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
482 	if (retval != 0) {
483 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
484 			port, strerror(-retval));
485 		return retval;
486 	}
487 
488 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
489 		&tx_ring_size);
490 	if (retval != 0) {
491 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
492 			"for port %u: %s.\n", port, strerror(-retval));
493 		return retval;
494 	}
495 	if (rx_ring_size > RX_DESC_DEFAULT) {
496 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
497 			"for Rx queues on port %u.\n", port);
498 		return -1;
499 	}
500 
501 	/* Setup the queues. */
502 	rxconf->offloads = port_conf.rxmode.offloads;
503 	for (q = 0; q < rx_rings; q ++) {
504 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
505 						rte_eth_dev_socket_id(port),
506 						rxconf,
507 						mbuf_pool);
508 		if (retval < 0) {
509 			RTE_LOG(ERR, VHOST_PORT,
510 				"Failed to setup rx queue %u of port %u: %s.\n",
511 				q, port, strerror(-retval));
512 			return retval;
513 		}
514 	}
515 	txconf->offloads = port_conf.txmode.offloads;
516 	for (q = 0; q < tx_rings; q ++) {
517 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
518 						rte_eth_dev_socket_id(port),
519 						txconf);
520 		if (retval < 0) {
521 			RTE_LOG(ERR, VHOST_PORT,
522 				"Failed to setup tx queue %u of port %u: %s.\n",
523 				q, port, strerror(-retval));
524 			return retval;
525 		}
526 	}
527 
528 	/* Start the device. */
529 	retval  = rte_eth_dev_start(port);
530 	if (retval < 0) {
531 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
532 			port, strerror(-retval));
533 		return retval;
534 	}
535 
536 	if (promiscuous) {
537 		retval = rte_eth_promiscuous_enable(port);
538 		if (retval != 0) {
539 			RTE_LOG(ERR, VHOST_PORT,
540 				"Failed to enable promiscuous mode on port %u: %s\n",
541 				port, rte_strerror(-retval));
542 			return retval;
543 		}
544 	}
545 
546 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
547 	if (retval < 0) {
548 		RTE_LOG(ERR, VHOST_PORT,
549 			"Failed to get MAC address on port %u: %s\n",
550 			port, rte_strerror(-retval));
551 		return retval;
552 	}
553 
554 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
555 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
556 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
557 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
558 
559 	return 0;
560 }
561 
562 /*
563  * Set socket file path.
564  */
565 static int
566 us_vhost_parse_socket_path(const char *q_arg)
567 {
568 	char *old;
569 
570 	/* reject socket paths that do not fit in PATH_MAX */
571 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
572 		return -1;
573 
574 	old = socket_files;
575 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
576 	if (socket_files == NULL) {
577 		free(old);
578 		return -1;
579 	}
580 
581 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
582 	nb_sockets++;
583 
584 	return 0;
585 }
586 
587 /*
588  * Parse the portmask provided at run time.
589  */
590 static int
591 parse_portmask(const char *portmask)
592 {
593 	char *end = NULL;
594 	unsigned long pm;
595 
596 	errno = 0;
597 
598 	/* parse hexadecimal string */
599 	pm = strtoul(portmask, &end, 16);
600 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
601 		return 0;
602 
603 	return pm;
604 
605 }
606 
607 /*
608  * Parse num options at run time.
609  */
610 static int
611 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
612 {
613 	char *end = NULL;
614 	unsigned long num;
615 
616 	errno = 0;
617 
618 	/* parse unsigned int string */
619 	num = strtoul(q_arg, &end, 10);
620 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
621 		return -1;
622 
623 	if (num > max_valid_value)
624 		return -1;
625 
626 	return num;
627 
628 }
629 
630 /*
631  * Display usage
632  */
633 static void
634 us_vhost_usage(const char *prgname)
635 {
636 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
637 	"		--vm2vm [0|1|2]\n"
638 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
639 	"		--socket-file <path>\n"
640 	"		-p PORTMASK: Set mask for ports to be used by application\n"
641 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
642 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
643 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. This takes effect only if Rx retries are enabled\n"
644 	"		--rx-retry-num [0-N]: the number of retries on Rx. This takes effect only if Rx retries are enabled\n"
645 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
646 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
647 	"		--socket-file: The path of the socket file.\n"
648 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
649 	"		--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n"
650 	"		--client: register a vhost-user socket as client mode.\n"
651 	"		--dmas: register dma channel for specific vhost device.\n"
652 	"		--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
653 	"		--builtin-net-driver: enable simple vhost-user net driver\n",
654 	       prgname);
655 }
656 
657 enum {
658 #define OPT_VM2VM               "vm2vm"
659 	OPT_VM2VM_NUM = 256,
660 #define OPT_RX_RETRY            "rx-retry"
661 	OPT_RX_RETRY_NUM,
662 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
663 	OPT_RX_RETRY_DELAY_NUM,
664 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
665 	OPT_RX_RETRY_NUMB_NUM,
666 #define OPT_MERGEABLE           "mergeable"
667 	OPT_MERGEABLE_NUM,
668 #define OPT_STATS               "stats"
669 	OPT_STATS_NUM,
670 #define OPT_SOCKET_FILE         "socket-file"
671 	OPT_SOCKET_FILE_NUM,
672 #define OPT_TX_CSUM             "tx-csum"
673 	OPT_TX_CSUM_NUM,
674 #define OPT_TSO                 "tso"
675 	OPT_TSO_NUM,
676 #define OPT_CLIENT              "client"
677 	OPT_CLIENT_NUM,
678 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
679 	OPT_BUILTIN_NET_DRIVER_NUM,
680 #define OPT_DMAS                "dmas"
681 	OPT_DMAS_NUM,
682 #define OPT_NUM_MBUFS           "total-num-mbufs"
683 	OPT_NUM_MBUFS_NUM,
684 };
685 
686 /*
687  * Parse the arguments given in the command line of the application.
688  */
689 static int
690 us_vhost_parse_args(int argc, char **argv)
691 {
692 	int opt, ret;
693 	int option_index;
694 	unsigned i;
695 	const char *prgname = argv[0];
696 	static struct option long_option[] = {
697 		{OPT_VM2VM, required_argument,
698 				NULL, OPT_VM2VM_NUM},
699 		{OPT_RX_RETRY, required_argument,
700 				NULL, OPT_RX_RETRY_NUM},
701 		{OPT_RX_RETRY_DELAY, required_argument,
702 				NULL, OPT_RX_RETRY_DELAY_NUM},
703 		{OPT_RX_RETRY_NUMB, required_argument,
704 				NULL, OPT_RX_RETRY_NUMB_NUM},
705 		{OPT_MERGEABLE, required_argument,
706 				NULL, OPT_MERGEABLE_NUM},
707 		{OPT_STATS, required_argument,
708 				NULL, OPT_STATS_NUM},
709 		{OPT_SOCKET_FILE, required_argument,
710 				NULL, OPT_SOCKET_FILE_NUM},
711 		{OPT_TX_CSUM, required_argument,
712 				NULL, OPT_TX_CSUM_NUM},
713 		{OPT_TSO, required_argument,
714 				NULL, OPT_TSO_NUM},
715 		{OPT_CLIENT, no_argument,
716 				NULL, OPT_CLIENT_NUM},
717 		{OPT_BUILTIN_NET_DRIVER, no_argument,
718 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
719 		{OPT_DMAS, required_argument,
720 				NULL, OPT_DMAS_NUM},
721 		{OPT_NUM_MBUFS, required_argument,
722 				NULL, OPT_NUM_MBUFS_NUM},
723 		{NULL, 0, 0, 0},
724 	};
725 
726 	/* Parse command line */
727 	while ((opt = getopt_long(argc, argv, "p:P",
728 			long_option, &option_index)) != EOF) {
729 		switch (opt) {
730 		/* Portmask */
731 		case 'p':
732 			enabled_port_mask = parse_portmask(optarg);
733 			if (enabled_port_mask == 0) {
734 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
735 				us_vhost_usage(prgname);
736 				return -1;
737 			}
738 			break;
739 
740 		case 'P':
741 			promiscuous = 1;
742 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
743 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
744 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
745 			break;
746 
747 		case OPT_VM2VM_NUM:
748 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
749 			if (ret == -1) {
750 				RTE_LOG(INFO, VHOST_CONFIG,
751 					"Invalid argument for "
752 					"vm2vm [0|1|2]\n");
753 				us_vhost_usage(prgname);
754 				return -1;
755 			}
756 			vm2vm_mode = (vm2vm_type)ret;
757 			break;
758 
759 		case OPT_RX_RETRY_NUM:
760 			ret = parse_num_opt(optarg, 1);
761 			if (ret == -1) {
762 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
763 				us_vhost_usage(prgname);
764 				return -1;
765 			}
766 			enable_retry = ret;
767 			break;
768 
769 		case OPT_TX_CSUM_NUM:
770 			ret = parse_num_opt(optarg, 1);
771 			if (ret == -1) {
772 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
773 				us_vhost_usage(prgname);
774 				return -1;
775 			}
776 			enable_tx_csum = ret;
777 			break;
778 
779 		case OPT_TSO_NUM:
780 			ret = parse_num_opt(optarg, 1);
781 			if (ret == -1) {
782 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
783 				us_vhost_usage(prgname);
784 				return -1;
785 			}
786 			enable_tso = ret;
787 			break;
788 
789 		case OPT_RX_RETRY_DELAY_NUM:
790 			ret = parse_num_opt(optarg, INT32_MAX);
791 			if (ret == -1) {
792 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
793 				us_vhost_usage(prgname);
794 				return -1;
795 			}
796 			burst_rx_delay_time = ret;
797 			break;
798 
799 		case OPT_RX_RETRY_NUMB_NUM:
800 			ret = parse_num_opt(optarg, INT32_MAX);
801 			if (ret == -1) {
802 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
803 				us_vhost_usage(prgname);
804 				return -1;
805 			}
806 			burst_rx_retry_num = ret;
807 			break;
808 
809 		case OPT_MERGEABLE_NUM:
810 			ret = parse_num_opt(optarg, 1);
811 			if (ret == -1) {
812 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
813 				us_vhost_usage(prgname);
814 				return -1;
815 			}
816 			mergeable = !!ret;
817 			break;
818 
819 		case OPT_STATS_NUM:
820 			ret = parse_num_opt(optarg, INT32_MAX);
821 			if (ret == -1) {
822 				RTE_LOG(INFO, VHOST_CONFIG,
823 					"Invalid argument for stats [0..N]\n");
824 				us_vhost_usage(prgname);
825 				return -1;
826 			}
827 			enable_stats = ret;
828 			break;
829 
830 		/* Set socket file path. */
831 		case OPT_SOCKET_FILE_NUM:
832 			if (us_vhost_parse_socket_path(optarg) == -1) {
833 				RTE_LOG(INFO, VHOST_CONFIG,
834 				"Invalid argument for socket name (Max %d characters)\n",
835 				PATH_MAX);
836 				us_vhost_usage(prgname);
837 				return -1;
838 			}
839 			break;
840 
841 		case OPT_DMAS_NUM:
842 			if (open_dma(optarg) == -1) {
843 				RTE_LOG(INFO, VHOST_CONFIG,
844 					"Wrong DMA args\n");
845 				us_vhost_usage(prgname);
846 				return -1;
847 			}
848 			break;
849 
850 		case OPT_NUM_MBUFS_NUM:
851 			ret = parse_num_opt(optarg, INT32_MAX);
852 			if (ret == -1) {
853 				RTE_LOG(INFO, VHOST_CONFIG,
854 					"Invalid argument for total-num-mbufs [0..N]\n");
855 				us_vhost_usage(prgname);
856 				return -1;
857 			}
858 
859 			if (total_num_mbufs < ret)
860 				total_num_mbufs = ret;
861 			break;
862 
863 		case OPT_CLIENT_NUM:
864 			client_mode = 1;
865 			break;
866 
867 		case OPT_BUILTIN_NET_DRIVER_NUM:
868 			builtin_net_driver = 1;
869 			break;
870 
871 		/* Invalid option - print options. */
872 		default:
873 			us_vhost_usage(prgname);
874 			return -1;
875 		}
876 	}
877 
878 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
879 		if (enabled_port_mask & (1 << i))
880 			ports[num_ports++] = i;
881 	}
882 
883 	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
884 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
885 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
886 		return -1;
887 	}
888 
889 	return 0;
890 }
891 
892 /*
893  * Update the global variable num_ports and the ports[] array according to the
894  * number of ports in the system, and return the number of valid ports
895  */
896 static unsigned check_ports_num(unsigned nb_ports)
897 {
898 	unsigned valid_num_ports = num_ports;
899 	unsigned portid;
900 
901 	if (num_ports > nb_ports) {
902 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
903 			num_ports, nb_ports);
904 		num_ports = nb_ports;
905 	}
906 
907 	for (portid = 0; portid < num_ports; portid ++) {
908 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
909 			RTE_LOG(INFO, VHOST_PORT,
910 				"\nSpecified port ID(%u) is not valid\n",
911 				ports[portid]);
912 			ports[portid] = INVALID_PORT_ID;
913 			valid_num_ports--;
914 		}
915 	}
916 	return valid_num_ports;
917 }
918 
919 static __rte_always_inline struct vhost_dev *
920 find_vhost_dev(struct rte_ether_addr *mac)
921 {
922 	struct vhost_dev *vdev;
923 
924 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
925 		if (vdev->ready == DEVICE_RX &&
926 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
927 			return vdev;
928 	}
929 
930 	return NULL;
931 }
932 
933 /*
934  * This function learns the MAC address of the device and registers it, along with a
935  * VLAN tag, with a VMDq pool.
936  */
937 static int
938 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
939 {
940 	struct rte_ether_hdr *pkt_hdr;
941 	int i, ret;
942 
943 	/* Learn MAC address of guest device from packet */
944 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
945 
946 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
947 		RTE_LOG(ERR, VHOST_DATA,
948 			"(%d) device is using a registered MAC!\n",
949 			vdev->vid);
950 		return -1;
951 	}
952 
953 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
954 		vdev->mac_address.addr_bytes[i] =
955 			pkt_hdr->src_addr.addr_bytes[i];
956 
957 	/* vlan_tag currently uses the device_id. */
958 	vdev->vlan_tag = vlan_tags[vdev->vid];
959 
960 	/* Print out VMDQ registration info. */
961 	RTE_LOG(INFO, VHOST_DATA,
962 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
963 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
964 		vdev->vlan_tag);
965 
966 	/* Register the MAC address. */
967 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
968 				(uint32_t)vdev->vid + vmdq_pool_base);
969 	if (ret)
970 		RTE_LOG(ERR, VHOST_DATA,
971 			"(%d) failed to add device MAC address to VMDQ\n",
972 			vdev->vid);
973 
974 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
975 
976 	/* Set device as ready for RX. */
977 	vdev->ready = DEVICE_RX;
978 
979 	return 0;
980 }
981 
982 /*
983  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
984  * queue before disabling RX on the device.
985  */
986 static inline void
987 unlink_vmdq(struct vhost_dev *vdev)
988 {
989 	unsigned i = 0;
990 	unsigned rx_count;
991 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
992 
993 	if (vdev->ready == DEVICE_RX) {
994 		/* clear MAC and VLAN settings */
995 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
996 		for (i = 0; i < 6; i++)
997 			vdev->mac_address.addr_bytes[i] = 0;
998 
999 		vdev->vlan_tag = 0;
1000 
1001 		/* Clear out the receive buffers */
1002 		rx_count = rte_eth_rx_burst(ports[0],
1003 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1004 
1005 		while (rx_count) {
1006 			for (i = 0; i < rx_count; i++)
1007 				rte_pktmbuf_free(pkts_burst[i]);
1008 
1009 			rx_count = rte_eth_rx_burst(ports[0],
1010 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1011 		}
1012 
1013 		vdev->ready = DEVICE_MAC_LEARNING;
1014 	}
1015 }
1016 
1017 static inline void
1018 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1019 {
1020 	while (n--)
1021 		rte_pktmbuf_free(pkts[n]);
1022 }
1023 
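/*
 * Poll the DMA channel bound to this device's enqueue (Rx) path and free
 * the mbufs whose async enqueue has completed.
 */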
1024 static __rte_always_inline void
1025 complete_async_pkts(struct vhost_dev *vdev)
1026 {
1027 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1028 	uint16_t complete_count;
1029 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1030 
1031 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1032 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1033 	if (complete_count)
1034 		free_pkts(p_cpl, complete_count);
1035 
1036 }
1037 
1038 static __rte_always_inline void
1039 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1040 	    struct rte_mbuf *m)
1041 {
1042 	uint16_t ret;
1043 
1044 	if (builtin_net_driver) {
1045 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1046 	} else {
1047 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1048 	}
1049 
1050 	if (enable_stats) {
1051 		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1052 				__ATOMIC_SEQ_CST);
1053 		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1054 				__ATOMIC_SEQ_CST);
1055 		src_vdev->stats.tx_total++;
1056 		src_vdev->stats.tx += ret;
1057 	}
1058 }
1059 
1060 static __rte_always_inline void
1061 drain_vhost(struct vhost_dev *vdev)
1062 {
1063 	uint16_t ret;
1064 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1065 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1066 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1067 
1068 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1069 
1070 	if (enable_stats) {
1071 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1072 				__ATOMIC_SEQ_CST);
1073 		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1074 				__ATOMIC_SEQ_CST);
1075 	}
1076 
1077 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1078 		free_pkts(m, nr_xmit);
1079 	} else {
1080 		uint16_t enqueue_fail = nr_xmit - ret;
1081 		if (enqueue_fail > 0)
1082 			free_pkts(&m[ret], enqueue_fail);
1083 	}
1084 }
1085 
1086 static __rte_always_inline void
1087 drain_vhost_table(void)
1088 {
1089 	uint16_t lcore_id = rte_lcore_id();
1090 	struct vhost_bufftable *vhost_txq;
1091 	struct vhost_dev *vdev;
1092 	uint64_t cur_tsc;
1093 
1094 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1095 		if (unlikely(vdev->remove == 1))
1096 			continue;
1097 
1098 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1099 
1100 		cur_tsc = rte_rdtsc();
1101 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1102 				> MBUF_TABLE_DRAIN_TSC)) {
1103 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1104 				"Vhost TX queue drained after timeout with burst size %u\n",
1105 				vhost_txq->len);
1106 			drain_vhost(vdev);
1107 			vhost_txq->len = 0;
1108 			vhost_txq->pre_tsc = cur_tsc;
1109 		}
1110 	}
1111 }
1112 
1113 /*
1114  * Check if the packet destination MAC address is for a local device. If so, put
1115  * the packet on that device's Rx queue. If not, return.
1116  */
1117 static __rte_always_inline int
1118 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1119 {
1120 	struct rte_ether_hdr *pkt_hdr;
1121 	struct vhost_dev *dst_vdev;
1122 	struct vhost_bufftable *vhost_txq;
1123 	uint16_t lcore_id = rte_lcore_id();
1124 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1125 
1126 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1127 	if (!dst_vdev)
1128 		return -1;
1129 
1130 	if (vdev->vid == dst_vdev->vid) {
1131 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1132 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1133 			vdev->vid);
1134 		return 0;
1135 	}
1136 
1137 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1138 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1139 
1140 	if (unlikely(dst_vdev->remove)) {
1141 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1142 			"(%d) device is marked for removal\n", dst_vdev->vid);
1143 		return 0;
1144 	}
1145 
1146 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1147 	vhost_txq->m_table[vhost_txq->len++] = m;
1148 
1149 	if (enable_stats) {
1150 		vdev->stats.tx_total++;
1151 		vdev->stats.tx++;
1152 	}
1153 
1154 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1155 		drain_vhost(dst_vdev);
1156 		vhost_txq->len = 0;
1157 		vhost_txq->pre_tsc = rte_rdtsc();
1158 	}
1159 	return 0;
1160 }
1161 
1162 /*
1163  * Check if the destination MAC of a packet belongs to a local VM;
1164  * if it does, get its VLAN tag and the length offset to restore.
1165  */
1166 static __rte_always_inline int
1167 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1168 	uint32_t *offset, uint16_t *vlan_tag)
1169 {
1170 	struct vhost_dev *dst_vdev;
1171 	struct rte_ether_hdr *pkt_hdr =
1172 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1173 
1174 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1175 	if (!dst_vdev)
1176 		return 0;
1177 
1178 	if (vdev->vid == dst_vdev->vid) {
1179 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1180 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1181 			vdev->vid);
1182 		return -1;
1183 	}
1184 
1185 	/*
1186 	 * HW VLAN stripping reduces the packet length by the
1187 	 * length of the VLAN tag, so restore the packet length
1188 	 * by adding it back.
1189 	 */
1190 	*offset  = RTE_VLAN_HLEN;
1191 	*vlan_tag = vlan_tags[vdev->vid];
1192 
1193 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1194 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1195 		vdev->vid, dst_vdev->vid, *vlan_tag);
1196 
1197 	return 0;
1198 }
1199 
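/*
 * Prepare Tx offload flags for a packet received via LRO: request TSO and,
 * for IPv4, IP checksum offload, and seed the TCP pseudo-header checksum.
 */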
1200 static void virtio_tx_offload(struct rte_mbuf *m)
1201 {
1202 	struct rte_net_hdr_lens hdr_lens;
1203 	struct rte_ipv4_hdr *ipv4_hdr;
1204 	struct rte_tcp_hdr *tcp_hdr;
1205 	uint32_t ptype;
1206 	void *l3_hdr;
1207 
1208 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1209 	m->l2_len = hdr_lens.l2_len;
1210 	m->l3_len = hdr_lens.l3_len;
1211 	m->l4_len = hdr_lens.l4_len;
1212 
1213 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1214 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1215 		m->l2_len + m->l3_len);
1216 
1217 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1218 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1219 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1220 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1221 		ipv4_hdr = l3_hdr;
1222 		ipv4_hdr->hdr_checksum = 0;
1223 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1224 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1225 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1226 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1227 	}
1228 }
1229 
1230 static __rte_always_inline void
1231 do_drain_mbuf_table(struct mbuf_table *tx_q)
1232 {
1233 	uint16_t count;
1234 
1235 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1236 				 tx_q->m_table, tx_q->len);
1237 	if (unlikely(count < tx_q->len))
1238 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1239 
1240 	tx_q->len = 0;
1241 }
1242 
1243 /*
1244  * This function routes the TX packet to the correct interface. This
1245  * may be a local device or the physical port.
1246  */
1247 static __rte_always_inline void
1248 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1249 {
1250 	struct mbuf_table *tx_q;
1251 	unsigned offset = 0;
1252 	const uint16_t lcore_id = rte_lcore_id();
1253 	struct rte_ether_hdr *nh;
1254 
1255 
1256 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1257 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1258 		struct vhost_dev *vdev2;
1259 
1260 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1261 			if (vdev2 != vdev)
1262 				sync_virtio_xmit(vdev2, vdev, m);
1263 		}
1264 		goto queue2nic;
1265 	}
1266 
1267 	/* check if destination is a local VM */
1268 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1269 		return;
1270 
1271 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1272 		if (unlikely(find_local_dest(vdev, m, &offset,
1273 					     &vlan_tag) != 0)) {
1274 			rte_pktmbuf_free(m);
1275 			return;
1276 		}
1277 	}
1278 
1279 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1280 		"(%d) TX: MAC address is external\n", vdev->vid);
1281 
1282 queue2nic:
1283 
1284 	/* Add packet to the port Tx queue */
1285 	tx_q = &lcore_tx_queue[lcore_id];
1286 
1287 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1288 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1289 		/* Guest has inserted the vlan tag. */
1290 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1291 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1292 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1293 			(vh->vlan_tci != vlan_tag_be))
1294 			vh->vlan_tci = vlan_tag_be;
1295 	} else {
1296 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1297 
1298 		/*
1299 		 * Find the right seg to adjust the data len when offset is
1300 		 * bigger than tail room size.
1301 		 */
1302 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1303 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1304 				m->data_len += offset;
1305 			else {
1306 				struct rte_mbuf *seg = m;
1307 
1308 				while ((seg->next != NULL) &&
1309 					(offset > rte_pktmbuf_tailroom(seg)))
1310 					seg = seg->next;
1311 
1312 				seg->data_len += offset;
1313 			}
1314 			m->pkt_len += offset;
1315 		}
1316 
1317 		m->vlan_tci = vlan_tag;
1318 	}
1319 
1320 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1321 		virtio_tx_offload(m);
1322 
1323 	tx_q->m_table[tx_q->len++] = m;
1324 	if (enable_stats) {
1325 		vdev->stats.tx_total++;
1326 		vdev->stats.tx++;
1327 	}
1328 
1329 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1330 		do_drain_mbuf_table(tx_q);
1331 }
1332 
1333 
1334 static __rte_always_inline void
1335 drain_mbuf_table(struct mbuf_table *tx_q)
1336 {
1337 	static uint64_t prev_tsc;
1338 	uint64_t cur_tsc;
1339 
1340 	if (tx_q->len == 0)
1341 		return;
1342 
1343 	cur_tsc = rte_rdtsc();
1344 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1345 		prev_tsc = cur_tsc;
1346 
1347 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1348 			"TX queue drained after timeout with burst size %u\n",
1349 			tx_q->len);
1350 		do_drain_mbuf_table(tx_q);
1351 	}
1352 }
1353 
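/*
 * Enqueue path used when a DMA channel is bound to the socket's Rx queue:
 * reap completed async transfers first, then submit the new burst.
 */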
1354 uint16_t
1355 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1356 		struct rte_mbuf **pkts, uint32_t rx_count)
1357 {
1358 	uint16_t enqueue_count;
1359 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1360 
1361 	complete_async_pkts(dev);
1362 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1363 					pkts, rx_count, dma_id, 0);
1364 
1365 	return enqueue_count;
1366 }
1367 
1368 uint16_t
1369 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1370 		struct rte_mbuf **pkts, uint32_t rx_count)
1371 {
1372 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1373 }
1374 
1375 static __rte_always_inline void
1376 drain_eth_rx(struct vhost_dev *vdev)
1377 {
1378 	uint16_t rx_count, enqueue_count;
1379 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1380 
1381 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1382 				    pkts, MAX_PKT_BURST);
1383 
1384 	if (!rx_count)
1385 		return;
1386 
1387 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1388 						VIRTIO_RXQ, pkts, rx_count);
1389 
1390 	/* Retry if necessary */
1391 	if (enable_retry && unlikely(enqueue_count < rx_count)) {
1392 		uint32_t retry = 0;
1393 
1394 		while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
1395 			rte_delay_us(burst_rx_delay_time);
1396 			enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1397 							VIRTIO_RXQ, &pkts[enqueue_count],
1398 							rx_count - enqueue_count);
1399 		}
1400 	}
1401 
1402 	if (enable_stats) {
1403 		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1404 				__ATOMIC_SEQ_CST);
1405 		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1406 				__ATOMIC_SEQ_CST);
1407 	}
1408 
1409 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1410 		free_pkts(pkts, rx_count);
1411 	} else {
1412 		uint16_t enqueue_fail = rx_count - enqueue_count;
1413 		if (enqueue_fail > 0)
1414 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1415 	}
1416 }
1417 
1418 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1419 			    struct rte_mempool *mbuf_pool,
1420 			    struct rte_mbuf **pkts, uint16_t count)
1421 {
1422 	int nr_inflight;
1423 	uint16_t dequeue_count;
1424 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1425 
1426 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1427 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1428 
1429 	return dequeue_count;
1430 }
1431 
1432 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1433 			   struct rte_mempool *mbuf_pool,
1434 			   struct rte_mbuf **pkts, uint16_t count)
1435 {
1436 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1437 }
1438 
1439 static __rte_always_inline void
1440 drain_virtio_tx(struct vhost_dev *vdev)
1441 {
1442 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1443 	uint16_t count;
1444 	uint16_t i;
1445 
1446 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1447 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1448 
1449 	/* setup VMDq for the first packet */
1450 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1451 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1452 			free_pkts(pkts, count);
1453 	}
1454 
1455 	for (i = 0; i < count; ++i)
1456 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1457 }
1458 
1459 /*
1460  * Main function of vhost-switch. It basically does:
1461  *
1462  * for each vhost device {
1463  *    - drain_eth_rx()
1464  *
1465  *      Which drains the host eth Rx queue linked to the vhost device
1466  *      and delivers all packets to the guest virtio Rx ring associated
1467  *      with this vhost device.
1468  *
1469  *    - drain_virtio_tx()
1470  *
1471  *      Which drains the guest virtio Tx queue and delivers all packets
1472  *      to their target, which could be another vhost device or the
1473  *      physical eth dev. The routing is done in function "virtio_tx_route".
1474  * }
1475  */
1476 static int
1477 switch_worker(void *arg __rte_unused)
1478 {
1479 	unsigned i;
1480 	unsigned lcore_id = rte_lcore_id();
1481 	struct vhost_dev *vdev;
1482 	struct mbuf_table *tx_q;
1483 
1484 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1485 
1486 	tx_q = &lcore_tx_queue[lcore_id];
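	/* Use this worker's index in lcore_ids[] as its NIC Tx queue id. */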
1487 	for (i = 0; i < rte_lcore_count(); i++) {
1488 		if (lcore_ids[i] == lcore_id) {
1489 			tx_q->txq_id = i;
1490 			break;
1491 		}
1492 	}
1493 
1494 	while(1) {
1495 		drain_mbuf_table(tx_q);
1496 		drain_vhost_table();
1497 		/*
1498 		 * If removal was requested, inform the configuration core that
1499 		 * we have left the linked list and no devices are in use.
1500 		 */
1501 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1502 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1503 
1504 		/*
1505 		 * Process vhost devices
1506 		 */
1507 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1508 			      lcore_vdev_entry) {
1509 			if (unlikely(vdev->remove)) {
1510 				unlink_vmdq(vdev);
1511 				vdev->ready = DEVICE_SAFE_REMOVE;
1512 				continue;
1513 			}
1514 
1515 			if (likely(vdev->ready == DEVICE_RX))
1516 				drain_eth_rx(vdev);
1517 
1518 			if (likely(!vdev->remove))
1519 				drain_virtio_tx(vdev);
1520 		}
1521 	}
1522 
1523 	return 0;
1524 }
1525 
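/*
 * Drain all in-flight async packets of the given virtqueue. This variant
 * uses the thread-unsafe vhost APIs, so the caller must ensure the queue
 * is not being processed concurrently.
 */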
1526 static void
1527 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1528 {
1529 	uint16_t n_pkt = 0;
1530 	int pkts_inflight;
1531 
1532 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1533 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1534 
1535 	struct rte_mbuf *m_cpl[pkts_inflight];
1536 
1537 	while (pkts_inflight) {
1538 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1539 							pkts_inflight, dma_id, 0);
1540 		free_pkts(m_cpl, n_pkt);
1541 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1542 									queue_id);
1543 	}
1544 }
1545 
1546 static void
1547 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
1548 {
1549 	uint16_t n_pkt = 0;
1550 	int pkts_inflight;
1551 
1552 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1553 	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1554 
1555 	struct rte_mbuf *m_cpl[pkts_inflight];
1556 
1557 	while (pkts_inflight) {
1558 		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
1559 						pkts_inflight, dma_id, 0);
1560 		free_pkts(m_cpl, n_pkt);
1561 		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1562 	}
1563 }
1564 
1565 /*
1566  * Remove a device from the specific data core linked list and from the
1567  * main linked list. Synchronization occurs through the use of the
1568  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1569  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1570  */
1571 static void
1572 destroy_device(int vid)
1573 {
1574 	struct vhost_dev *vdev = NULL;
1575 	int lcore;
1576 	uint16_t i;
1577 
1578 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1579 		if (vdev->vid == vid)
1580 			break;
1581 	}
1582 	if (!vdev)
1583 		return;
1584 	/* set the remove flag */
1585 	vdev->remove = 1;
1586 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1587 		rte_pause();
1588 	}
1589 
1590 	for (i = 0; i < RTE_MAX_LCORE; i++)
1591 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1592 
1593 	if (builtin_net_driver)
1594 		vs_vhost_net_remove(vdev);
1595 
1596 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1597 		     lcore_vdev_entry);
1598 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1599 
1600 
1601 	/* Set the dev_removal_flag on each lcore. */
1602 	RTE_LCORE_FOREACH_WORKER(lcore)
1603 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1604 
1605 	/*
1606 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1607 	 * we can be sure that they can no longer access the device removed
1608 	 * from the linked lists and that the devices are no longer in use.
1609 	 */
1610 	RTE_LCORE_FOREACH_WORKER(lcore) {
1611 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1612 			rte_pause();
1613 	}
1614 
1615 	lcore_info[vdev->coreid].device_num--;
1616 
1617 	RTE_LOG(INFO, VHOST_DATA,
1618 		"(%d) device has been removed from data core\n",
1619 		vdev->vid);
1620 
1621 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1622 		vhost_clear_queue(vdev, VIRTIO_RXQ);
1623 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1624 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1625 	}
1626 
1627 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1628 		vhost_clear_queue(vdev, VIRTIO_TXQ);
1629 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1630 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1631 	}
1632 
1633 	rte_free(vdev);
1634 }
1635 
1636 static inline int
1637 get_socketid_by_vid(int vid)
1638 {
1639 	int i;
1640 	char ifname[PATH_MAX];
1641 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1642 
1643 	for (i = 0; i < nb_sockets; i++) {
1644 		char *file = socket_files + i * PATH_MAX;
1645 		if (strcmp(file, ifname) == 0)
1646 			return i;
1647 	}
1648 
1649 	return -1;
1650 }
1651 
1652 static int
1653 init_vhost_queue_ops(int vid)
1654 {
1655 	if (builtin_net_driver) {
1656 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1657 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1658 	} else {
1659 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1660 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1661 		else
1662 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1663 
1664 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1665 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1666 		else
1667 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1668 	}
1669 
1670 	return 0;
1671 }
1672 
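/*
 * Register async channels for the Rx/Tx virtqueues that have a DMA device
 * bound via "--dmas"; mark each successfully registered queue as
 * async-enabled.
 */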
1673 static inline int
1674 vhost_async_channel_register(int vid)
1675 {
1676 	int rx_ret = 0, tx_ret = 0;
1677 
1678 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1679 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1680 		if (rx_ret == 0)
1681 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1682 	}
1683 
1684 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1685 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1686 		if (tx_ret == 0)
1687 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1688 	}
1689 
1690 	return rx_ret | tx_ret;
1691 }
1692 
1693 
1694 
1695 /*
1696  * A new device is added to a data core. First the device is added to the main linked list
1697  * and then allocated to a specific data core.
1698  */
1699 static int
1700 new_device(int vid)
1701 {
1702 	int lcore, core_add = 0;
1703 	uint16_t i;
1704 	uint32_t device_num_min = num_devices;
1705 	struct vhost_dev *vdev;
1706 	int ret;
1707 
1708 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1709 	if (vdev == NULL) {
1710 		RTE_LOG(INFO, VHOST_DATA,
1711 			"(%d) couldn't allocate memory for vhost dev\n",
1712 			vid);
1713 		return -1;
1714 	}
1715 	vdev->vid = vid;
1716 
1717 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1718 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1719 			= rte_zmalloc("vhost bufftable",
1720 				sizeof(struct vhost_bufftable),
1721 				RTE_CACHE_LINE_SIZE);
1722 
1723 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1724 			RTE_LOG(INFO, VHOST_DATA,
1725 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1726 			return -1;
1727 		}
1728 	}
1729 
1730 	int socketid = get_socketid_by_vid(vid);
1731 	if (socketid == -1)
1732 		return -1;
1733 
1734 	init_vid2socketid_array(vid, socketid);
1735 
1736 	ret =  vhost_async_channel_register(vid);
1737 
1738 	if (init_vhost_queue_ops(vid) != 0)
1739 		return -1;
1740 
1741 	if (builtin_net_driver)
1742 		vs_vhost_net_setup(vdev);
1743 
1744 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1745 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1746 
1747 	/* reset ready flag */
1748 	vdev->ready = DEVICE_MAC_LEARNING;
1749 	vdev->remove = 0;
1750 
1751 	/* Find a suitable lcore to add the device. */
1752 	RTE_LCORE_FOREACH_WORKER(lcore) {
1753 		if (lcore_info[lcore].device_num < device_num_min) {
1754 			device_num_min = lcore_info[lcore].device_num;
1755 			core_add = lcore;
1756 		}
1757 	}
1758 	vdev->coreid = core_add;
1759 
1760 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1761 			  lcore_vdev_entry);
1762 	lcore_info[vdev->coreid].device_num++;
1763 
1764 	/* Disable notifications. */
1765 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1766 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1767 
1768 	RTE_LOG(INFO, VHOST_DATA,
1769 		"(%d) device has been added to data core %d\n",
1770 		vid, vdev->coreid);
1771 
1772 	return ret;
1773 }
1774 
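/*
 * When an async-enabled virtqueue is disabled, drain any in-flight DMA
 * transfers for it.
 */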
1775 static int
1776 vring_state_changed(int vid, uint16_t queue_id, int enable)
1777 {
1778 	struct vhost_dev *vdev = NULL;
1779 
1780 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1781 		if (vdev->vid == vid)
1782 			break;
1783 	}
1784 	if (!vdev)
1785 		return -1;
1786 
1787 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1788 		if (!enable)
1789 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1790 	}
1791 
1792 	return 0;
1793 }
1794 
1795 /*
1796  * These callbacks allow devices to be added to the data core when configuration
1797  * has fully completed.
1798  */
1799 static const struct rte_vhost_device_ops virtio_net_device_ops =
1800 {
1801 	.new_device =  new_device,
1802 	.destroy_device = destroy_device,
1803 	.vring_state_changed = vring_state_changed,
1804 };
1805 
1806 /*
1807  * This thread wakes up periodically to print stats if the user has
1808  * enabled them.
1809  */
1810 static void *
1811 print_stats(__rte_unused void *arg)
1812 {
1813 	struct vhost_dev *vdev;
1814 	uint64_t tx_dropped, rx_dropped;
1815 	uint64_t tx, tx_total, rx, rx_total;
1816 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1817 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1818 
1819 	while(1) {
1820 		sleep(enable_stats);
1821 
1822 		/* Clear screen and move to top left */
1823 		printf("%s%s\n", clr, top_left);
1824 		printf("Device statistics =================================\n");
1825 
1826 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1827 			tx_total   = vdev->stats.tx_total;
1828 			tx         = vdev->stats.tx;
1829 			tx_dropped = tx_total - tx;
1830 
1831 			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1832 				__ATOMIC_SEQ_CST);
1833 			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1834 				__ATOMIC_SEQ_CST);
1835 			rx_dropped = rx_total - rx;
1836 
1837 			printf("Statistics for device %d\n"
1838 				"-----------------------\n"
1839 				"TX total:              %" PRIu64 "\n"
1840 				"TX dropped:            %" PRIu64 "\n"
1841 				"TX successful:         %" PRIu64 "\n"
1842 				"RX total:              %" PRIu64 "\n"
1843 				"RX dropped:            %" PRIu64 "\n"
1844 				"RX successful:         %" PRIu64 "\n",
1845 				vdev->vid,
1846 				tx_total, tx_dropped, tx,
1847 				rx_total, rx_dropped, rx);
1848 		}
1849 
1850 		printf("===================================================\n");
1851 
1852 		fflush(stdout);
1853 	}
1854 
1855 	return NULL;
1856 }
1857 
1858 static void
1859 unregister_drivers(int socket_num)
1860 {
1861 	int i, ret;
1862 
1863 	for (i = 0; i < socket_num; i++) {
1864 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1865 		if (ret != 0)
1866 			RTE_LOG(ERR, VHOST_CONFIG,
1867 				"Fail to unregister vhost driver for %s.\n",
1868 				socket_files + i * PATH_MAX);
1869 	}
1870 }
1871 
1872 /* When we receive a SIGINT, unregister the vhost driver */
1873 static void
1874 sigint_handler(__rte_unused int signum)
1875 {
1876 	/* Unregister vhost driver. */
1877 	unregister_drivers(nb_sockets);
1878 
1879 	exit(0);
1880 }
1881 
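/* Mark every vhost queue/DMA slot as unbound before "--dmas" is parsed. */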
1882 static void
1883 reset_dma(void)
1884 {
1885 	int i;
1886 
1887 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1888 		int j;
1889 
1890 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1891 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1892 			dma_bind[i].dmas[j].async_enabled = false;
1893 		}
1894 	}
1895 
1896 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1897 		dmas_id[i] = INVALID_DMA_ID;
1898 }
1899 
1900 /*
1901  * Main function, does initialisation and calls the per-lcore functions.
1902  */
1903 int
1904 main(int argc, char *argv[])
1905 {
1906 	unsigned lcore_id, core_id = 0;
1907 	unsigned nb_ports, valid_num_ports;
1908 	int ret, i;
1909 	uint16_t portid;
1910 	static pthread_t tid;
1911 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1912 
1913 	signal(SIGINT, sigint_handler);
1914 
1915 	/* init EAL */
1916 	ret = rte_eal_init(argc, argv);
1917 	if (ret < 0)
1918 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1919 	argc -= ret;
1920 	argv += ret;
1921 
1922 	/* initialize dma structures */
1923 	reset_dma();
1924 
1925 	/* parse app arguments */
1926 	ret = us_vhost_parse_args(argc, argv);
1927 	if (ret < 0)
1928 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1929 
1930 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1931 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1932 
1933 		if (rte_lcore_is_enabled(lcore_id))
1934 			lcore_ids[core_id++] = lcore_id;
1935 	}
1936 
1937 	if (rte_lcore_count() > RTE_MAX_LCORE)
1938 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1939 
1940 	/* Get the number of physical ports. */
1941 	nb_ports = rte_eth_dev_count_avail();
1942 
1943 	/*
1944 	 * Update the global variable num_ports and the global ports[] array,
1945 	 * and get the number of valid ports according to the system port count
1946 	 */
1947 	valid_num_ports = check_ports_num(nb_ports);
1948 
1949 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1950 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1951 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1952 		return -1;
1953 	}
1954 
1955 	/*
1956 	 * FIXME: here we are trying to allocate mbufs big enough for
1957 	 * @MAX_QUEUES, but the truth is we're never going to use that
1958 	 * many queues here. We probably should only do allocation for
1959 	 * those queues we are going to use.
1960 	 */
1961 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1962 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1963 					    rte_socket_id());
1964 	if (mbuf_pool == NULL)
1965 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1966 
1967 	if (vm2vm_mode == VM2VM_HARDWARE) {
1968 		/* Enable VT loop back to let L2 switch to do it. */
1969 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1970 		RTE_LOG(DEBUG, VHOST_CONFIG,
1971 			"Enable loop back for L2 switch in vmdq.\n");
1972 	}
1973 
1974 	/* initialize all ports */
1975 	RTE_ETH_FOREACH_DEV(portid) {
1976 		/* skip ports that are not enabled */
1977 		if ((enabled_port_mask & (1 << portid)) == 0) {
1978 			RTE_LOG(INFO, VHOST_PORT,
1979 				"Skipping disabled port %d\n", portid);
1980 			continue;
1981 		}
1982 		if (port_init(portid) != 0)
1983 			rte_exit(EXIT_FAILURE,
1984 				"Cannot initialize network ports\n");
1985 	}
1986 
1987 	/* Enable stats if the user option is set. */
1988 	if (enable_stats) {
1989 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1990 					print_stats, NULL);
1991 		if (ret < 0)
1992 			rte_exit(EXIT_FAILURE,
1993 				"Cannot create print-stats thread\n");
1994 	}
1995 
1996 	/* Launch all data cores. */
1997 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1998 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1999 
2000 	if (client_mode)
2001 		flags |= RTE_VHOST_USER_CLIENT;
2002 
2003 	for (i = 0; i < dma_count; i++) {
2004 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
2005 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
2006 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2007 		}
2008 	}
2009 
2010 	/* Register vhost user driver to handle vhost messages. */
2011 	for (i = 0; i < nb_sockets; i++) {
2012 		char *file = socket_files + i * PATH_MAX;
2013 
2014 		if (dma_count && get_async_flag_by_socketid(i) != 0)
2015 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
2016 
2017 		ret = rte_vhost_driver_register(file, flags);
2018 		if (ret != 0) {
2019 			unregister_drivers(i);
2020 			rte_exit(EXIT_FAILURE,
2021 				"vhost driver register failure.\n");
2022 		}
2023 
2024 		if (builtin_net_driver)
2025 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2026 
2027 		if (mergeable == 0) {
2028 			rte_vhost_driver_disable_features(file,
2029 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2030 		}
2031 
2032 		if (enable_tx_csum == 0) {
2033 			rte_vhost_driver_disable_features(file,
2034 				1ULL << VIRTIO_NET_F_CSUM);
2035 		}
2036 
2037 		if (enable_tso == 0) {
2038 			rte_vhost_driver_disable_features(file,
2039 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2040 			rte_vhost_driver_disable_features(file,
2041 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2042 			rte_vhost_driver_disable_features(file,
2043 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2044 			rte_vhost_driver_disable_features(file,
2045 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2046 		}
2047 
2048 		if (promiscuous) {
2049 			rte_vhost_driver_enable_features(file,
2050 				1ULL << VIRTIO_NET_F_CTRL_RX);
2051 		}
2052 
2053 		ret = rte_vhost_driver_callback_register(file,
2054 			&virtio_net_device_ops);
2055 		if (ret != 0) {
2056 			rte_exit(EXIT_FAILURE,
2057 				"failed to register vhost driver callbacks.\n");
2058 		}
2059 
2060 		if (rte_vhost_driver_start(file) < 0) {
2061 			rte_exit(EXIT_FAILURE,
2062 				"failed to start vhost driver.\n");
2063 		}
2064 	}
2065 
2066 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2067 		rte_eal_wait_lcore(lcore_id);
2068 
2069 	for (i = 0; i < dma_count; i++) {
2070 		if (rte_vhost_async_dma_unconfigure(dmas_id[i], 0) < 0) {
2071 			RTE_LOG(ERR, VHOST_PORT,
2072 				"Failed to unconfigure DMA %d in vhost.\n", dmas_id[i]);
2073 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2074 		}
2075 	}
2076 
2077 	/* clean up the EAL */
2078 	rte_eal_cleanup();
2079 
2080 	return 0;
2081 }
2082