1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <ctype.h>
6 #include <arpa/inet.h>
7 #include <getopt.h>
8 #include <linux/if_ether.h>
9 #include <linux/if_vlan.h>
10 #include <linux/virtio_net.h>
11 #include <linux/virtio_ring.h>
12 #include <signal.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <sys/eventfd.h>
16 #include <sys/param.h>
17 #include <unistd.h>
18 
19 #include <rte_cycles.h>
20 #include <rte_ethdev.h>
21 #include <rte_log.h>
22 #include <rte_string_fns.h>
23 #include <rte_malloc.h>
24 #include <rte_net.h>
25 #include <rte_vhost.h>
26 #include <rte_ip.h>
27 #include <rte_tcp.h>
28 #include <rte_pause.h>
29 #include <rte_dmadev.h>
30 #include <rte_vhost_async.h>
31 #include <rte_thread.h>
32 
33 #include "main.h"
34 
35 #ifndef MAX_QUEUES
36 #define MAX_QUEUES 128
37 #endif
38 
39 #define NUM_MBUFS_DEFAULT 0x24000
40 
41 /* the maximum number of external ports supported */
42 #define MAX_SUP_PORTS 1
43 
44 #define MBUF_CACHE_SIZE	128
45 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
46 
47 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
48 
49 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
50 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
51 
52 #define JUMBO_FRAME_MAX_SIZE    0x2600
53 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
54 
55 /* State of virtio device. */
56 #define DEVICE_MAC_LEARNING 0
57 #define DEVICE_RX			1
58 #define DEVICE_SAFE_REMOVE	2
59 
60 /* Configurable number of RX/TX ring descriptors */
61 #define RX_DESC_DEFAULT 1024
62 #define TX_DESC_DEFAULT 512
63 
64 #define INVALID_PORT_ID 0xFF
65 #define INVALID_DMA_ID -1
66 
67 #define DMA_RING_SIZE 4096
68 
69 #define ASYNC_ENQUEUE_VHOST 1
70 #define ASYNC_DEQUEUE_VHOST 2
71 
72 /* number of mbufs in all pools - if specified on command-line. */
73 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
74 
75 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
76 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
77 static int dma_count;
78 
79 /* mask of enabled ports */
80 static uint32_t enabled_port_mask = 0;
81 
82 /* Promiscuous mode */
83 static uint32_t promiscuous;
84 
85 /* number of devices/queues to support*/
86 static uint32_t num_queues = 0;
87 static uint32_t num_devices;
88 
89 static struct rte_mempool *mbuf_pool;
90 static int mergeable;
91 
92 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
93 typedef enum {
94 	VM2VM_DISABLED = 0,
95 	VM2VM_SOFTWARE = 1,
96 	VM2VM_HARDWARE = 2,
97 	VM2VM_LAST
98 } vm2vm_type;
99 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
100 
101 /* Enable stats. */
102 static uint32_t enable_stats = 0;
103 /* Enable retries on RX. */
104 static uint32_t enable_retry = 1;
105 
106 /* Disable TX checksum offload */
107 static uint32_t enable_tx_csum;
108 
109 /* Disable TSO offload */
110 static uint32_t enable_tso;
111 
112 static int client_mode;
113 
114 static int builtin_net_driver;
115 
116 /* Specify timeout (in microseconds) between retries on RX. */
117 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
118 /* Specify the number of retries on RX. */
119 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
120 
121 /* Socket file paths. Can be set by user */
122 static char *socket_files;
123 static int nb_sockets;
124 
125 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
126 
127 /* empty VMDq configuration structure. Filled in programmatically */
128 static struct rte_eth_conf vmdq_conf_default = {
129 	.rxmode = {
130 		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
131 		/*
132 		 * VLAN strip is necessary for 1G NICs such as the I350;
133 		 * it fixes a bug where IPv4 forwarding in the guest could not
134 		 * forward packets from one virtio device to another.
135 		 */
136 		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
137 	},
138 
139 	.txmode = {
140 		.mq_mode = RTE_ETH_MQ_TX_NONE,
141 		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
142 			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
143 			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
144 			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
145 			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
146 	},
147 	.rx_adv_conf = {
148 		/*
149 		 * should be overridden separately in code with
150 		 * appropriate values
151 		 */
152 		.vmdq_rx_conf = {
153 			.nb_queue_pools = RTE_ETH_8_POOLS,
154 			.enable_default_pool = 0,
155 			.default_pool = 0,
156 			.nb_pool_maps = 0,
157 			.pool_map = {{0, 0},},
158 		},
159 	},
160 };
161 
162 
163 static unsigned lcore_ids[RTE_MAX_LCORE];
164 static uint16_t ports[RTE_MAX_ETHPORTS];
165 static unsigned num_ports = 0; /**< The number of ports specified in command line */
166 static uint16_t num_pf_queues, num_vmdq_queues;
167 static uint16_t vmdq_pool_base, vmdq_queue_base;
168 static uint16_t queues_per_pool;
169 
170 const uint16_t vlan_tags[] = {
171 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
172 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
173 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
174 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
175 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
176 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
177 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
178 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
179 };
180 
181 /* ethernet addresses of ports */
182 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
183 
184 static struct vhost_dev_tailq_list vhost_dev_list =
185 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
186 
187 static struct lcore_info lcore_info[RTE_MAX_LCORE];
188 
189 /* Used for queueing bursts of TX packets. */
190 struct mbuf_table {
191 	unsigned len;
192 	unsigned txq_id;
193 	struct rte_mbuf *m_table[MAX_PKT_BURST];
194 };
195 
196 struct vhost_bufftable {
197 	uint32_t len;
198 	uint64_t pre_tsc;
199 	struct rte_mbuf *m_table[MAX_PKT_BURST];
200 };
201 
202 /* TX queue for each data core. */
203 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
204 
205 /*
206  * Vhost TX buffer for each data core.
207  * Every data core maintains a TX buffer for every vhost device,
208  * which is used to batch packet enqueues for higher performance.
209  */
210 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
211 
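/*
 * Number of TSC cycles in BURST_TX_DRAIN_US: the per-microsecond cycle
 * count is rounded up before being scaled by the drain interval.
 */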
212 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
213 				 / US_PER_S * BURST_TX_DRAIN_US)
214 
215 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
216 
217 static inline uint32_t
218 get_async_flag_by_socketid(int socketid)
219 {
220 	return dma_bind[socketid].async_flag;
221 }
222 
223 static inline void
224 init_vid2socketid_array(int vid, int socketid)
225 {
226 	vid2socketid[vid] = socketid;
227 }
228 
229 static inline bool
230 is_dma_configured(int16_t dev_id)
231 {
232 	int i;
233 
234 	for (i = 0; i < dma_count; i++)
235 		if (dmas_id[i] == dev_id)
236 			return true;
237 	return false;
238 }
239 
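/*
 * Parse one --dmas argument and bind DMA devices to vhost queues.
 * The expected form (inferred from the parsing below) is a bracketed,
 * comma-separated list such as [txd0@dma_name,rxd0@dma_name], where the
 * number selects the vhost socket and txd/rxd selects the enqueue or
 * dequeue direction. Each new DMA device is configured with a single
 * vchan and started.
 */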
240 static inline int
241 open_dma(const char *value)
242 {
243 	struct dma_for_vhost *dma_info = dma_bind;
244 	char *input = strndup(value, strlen(value) + 1);
245 	char *addrs = input;
246 	char *ptrs[2];
247 	char *start, *end, *substr;
248 	int64_t socketid, vring_id;
249 
250 	struct rte_dma_info info;
251 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
252 	struct rte_dma_vchan_conf qconf = {
253 		.direction = RTE_DMA_DIR_MEM_TO_MEM,
254 		.nb_desc = DMA_RING_SIZE
255 	};
256 
257 	int dev_id;
258 	int ret = 0;
259 	uint16_t i = 0;
260 	char *dma_arg[RTE_MAX_VHOST_DEVICE];
261 	int args_nr;
262 
263 	if (input == NULL)
264 		return -1;
265 
266 	while (isblank(*addrs))
267 		addrs++;
268 	if (*addrs == '\0') {
269 		ret = -1;
270 		goto out;
271 	}
272 
273 	/* process DMA devices within bracket. */
274 	addrs++;
275 	substr = strtok(addrs, ";]");
276 	if (!substr) {
277 		ret = -1;
278 		goto out;
279 	}
280 
281 	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
282 	if (args_nr <= 0) {
283 		ret = -1;
284 		goto out;
285 	}
286 
287 	while (i < args_nr) {
288 		char *arg_temp = dma_arg[i];
289 		char *txd, *rxd;
290 		uint8_t sub_nr;
291 		int async_flag;
292 
293 		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
294 		if (sub_nr != 2) {
295 			ret = -1;
296 			goto out;
297 		}
298 
299 		txd = strstr(ptrs[0], "txd");
300 		rxd = strstr(ptrs[0], "rxd");
301 		if (txd) {
302 			start = txd;
303 			vring_id = VIRTIO_RXQ;
304 			async_flag = ASYNC_ENQUEUE_VHOST;
305 		} else if (rxd) {
306 			start = rxd;
307 			vring_id = VIRTIO_TXQ;
308 			async_flag = ASYNC_DEQUEUE_VHOST;
309 		} else {
310 			ret = -1;
311 			goto out;
312 		}
313 
314 		start += 3;
315 		socketid = strtol(start, &end, 0);
316 		if (end == start) {
317 			ret = -1;
318 			goto out;
319 		}
320 
321 		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
322 		if (dev_id < 0) {
323 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
324 			ret = -1;
325 			goto out;
326 		}
327 
328 		/* DMA device is already configured, so skip */
329 		if (is_dma_configured(dev_id))
330 			goto done;
331 
332 		if (rte_dma_info_get(dev_id, &info) != 0) {
333 			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
334 			ret = -1;
335 			goto out;
336 		}
337 
338 		if (info.max_vchans < 1) {
339 			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
340 			ret = -1;
341 			goto out;
342 		}
343 
344 		if (rte_dma_configure(dev_id, &dev_config) != 0) {
345 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
346 			ret = -1;
347 			goto out;
348 		}
349 
350 		/* Check the max desc supported by DMA device */
351 		rte_dma_info_get(dev_id, &info);
352 		if (info.nb_vchans != 1) {
353 			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
354 					dev_id);
355 			ret = -1;
356 			goto out;
357 		}
358 
359 		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
360 
361 		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
362 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
363 			ret = -1;
364 			goto out;
365 		}
366 
367 		if (rte_dma_start(dev_id) != 0) {
368 			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
369 			ret = -1;
370 			goto out;
371 		}
372 
373 		dmas_id[dma_count++] = dev_id;
374 
375 done:
376 		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
377 		(dma_info + socketid)->async_flag |= async_flag;
378 		i++;
379 	}
380 out:
381 	free(input);
382 	return ret;
383 }
384 
385 /*
386  * Builds up the correct configuration for VMDQ VLAN pool map
387  * according to the pool & queue limits.
388  */
389 static inline int
390 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
391 {
392 	struct rte_eth_vmdq_rx_conf conf;
393 	struct rte_eth_vmdq_rx_conf *def_conf =
394 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
395 	unsigned i;
396 
397 	memset(&conf, 0, sizeof(conf));
398 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
399 	conf.nb_pool_maps = num_devices;
400 	conf.enable_loop_back = def_conf->enable_loop_back;
401 	conf.rx_mode = def_conf->rx_mode;
402 
403 	for (i = 0; i < conf.nb_pool_maps; i++) {
404 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
405 		conf.pool_map[i].pools = (1UL << i);
406 	}
407 
408 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
409 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
410 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
411 	return 0;
412 }
413 
414 /*
415  * Initialises a given port using global settings and with the rx buffers
416  * coming from the mbuf_pool passed as parameter
417  */
418 static inline int
419 port_init(uint16_t port)
420 {
421 	struct rte_eth_dev_info dev_info;
422 	struct rte_eth_conf port_conf;
423 	struct rte_eth_rxconf *rxconf;
424 	struct rte_eth_txconf *txconf;
425 	int16_t rx_rings, tx_rings;
426 	uint16_t rx_ring_size, tx_ring_size;
427 	int retval;
428 	uint16_t q;
429 
430 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
431 	retval = rte_eth_dev_info_get(port, &dev_info);
432 	if (retval != 0) {
433 		RTE_LOG(ERR, VHOST_PORT,
434 			"Error during getting device (port %u) info: %s\n",
435 			port, strerror(-retval));
436 
437 		return retval;
438 	}
439 	if (dev_info.max_vmdq_pools == 0) {
440 		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
441 		return -1;
442 	}
443 
444 	rxconf = &dev_info.default_rxconf;
445 	txconf = &dev_info.default_txconf;
446 	rxconf->rx_drop_en = 1;
447 
448 	/*configure the number of supported virtio devices based on VMDQ limits */
449 	num_devices = dev_info.max_vmdq_pools;
450 
451 	rx_ring_size = RX_DESC_DEFAULT;
452 	tx_ring_size = TX_DESC_DEFAULT;
453 
454 	tx_rings = (uint16_t)rte_lcore_count();
455 
456 	if (mergeable) {
457 		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
458 			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
459 		else
460 			vmdq_conf_default.rxmode.mtu = MAX_MTU;
461 	}
462 
463 	/* Get port configuration. */
464 	retval = get_eth_conf(&port_conf, num_devices);
465 	if (retval < 0)
466 		return retval;
467 	/* NIC queues are divided into pf queues and vmdq queues.  */
468 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
469 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
470 	num_vmdq_queues = num_devices * queues_per_pool;
471 	num_queues = num_pf_queues + num_vmdq_queues;
472 	vmdq_queue_base = dev_info.vmdq_queue_base;
473 	vmdq_pool_base  = dev_info.vmdq_pool_base;
474 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
475 		num_pf_queues, num_devices, queues_per_pool);
476 
477 	if (!rte_eth_dev_is_valid_port(port))
478 		return -1;
479 
480 	rx_rings = (uint16_t)dev_info.max_rx_queues;
481 	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
482 		port_conf.txmode.offloads |=
483 			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
484 	/* Configure ethernet device. */
485 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
486 	if (retval != 0) {
487 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
488 			port, strerror(-retval));
489 		return retval;
490 	}
491 
492 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
493 		&tx_ring_size);
494 	if (retval != 0) {
495 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
496 			"for port %u: %s.\n", port, strerror(-retval));
497 		return retval;
498 	}
499 	if (rx_ring_size > RX_DESC_DEFAULT) {
500 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
501 			"for Rx queues on port %u.\n", port);
502 		return -1;
503 	}
504 
505 	/* Setup the queues. */
506 	rxconf->offloads = port_conf.rxmode.offloads;
507 	for (q = 0; q < rx_rings; q ++) {
508 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
509 						rte_eth_dev_socket_id(port),
510 						rxconf,
511 						mbuf_pool);
512 		if (retval < 0) {
513 			RTE_LOG(ERR, VHOST_PORT,
514 				"Failed to setup rx queue %u of port %u: %s.\n",
515 				q, port, strerror(-retval));
516 			return retval;
517 		}
518 	}
519 	txconf->offloads = port_conf.txmode.offloads;
520 	for (q = 0; q < tx_rings; q ++) {
521 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
522 						rte_eth_dev_socket_id(port),
523 						txconf);
524 		if (retval < 0) {
525 			RTE_LOG(ERR, VHOST_PORT,
526 				"Failed to setup tx queue %u of port %u: %s.\n",
527 				q, port, strerror(-retval));
528 			return retval;
529 		}
530 	}
531 
532 	/* Start the device. */
533 	retval  = rte_eth_dev_start(port);
534 	if (retval < 0) {
535 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
536 			port, strerror(-retval));
537 		return retval;
538 	}
539 
540 	if (promiscuous) {
541 		retval = rte_eth_promiscuous_enable(port);
542 		if (retval != 0) {
543 			RTE_LOG(ERR, VHOST_PORT,
544 				"Failed to enable promiscuous mode on port %u: %s\n",
545 				port, rte_strerror(-retval));
546 			return retval;
547 		}
548 	}
549 
550 	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
551 	if (retval < 0) {
552 		RTE_LOG(ERR, VHOST_PORT,
553 			"Failed to get MAC address on port %u: %s\n",
554 			port, rte_strerror(-retval));
555 		return retval;
556 	}
557 
558 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
559 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
560 		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
561 		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
562 
563 	return 0;
564 }
565 
566 /*
567  * Set socket file path.
568  */
569 static int
570 us_vhost_parse_socket_path(const char *q_arg)
571 {
572 	char *old;
573 
574 	/* parse number string */
575 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
576 		return -1;
577 
578 	old = socket_files;
579 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
580 	if (socket_files == NULL) {
581 		free(old);
582 		return -1;
583 	}
584 
585 	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
586 	nb_sockets++;
587 
588 	return 0;
589 }
590 
591 /*
592  * Parse the portmask provided at run time.
593  */
594 static int
595 parse_portmask(const char *portmask)
596 {
597 	char *end = NULL;
598 	unsigned long pm;
599 
600 	errno = 0;
601 
602 	/* parse hexadecimal string */
603 	pm = strtoul(portmask, &end, 16);
604 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
605 		return 0;
606 
607 	return pm;
608 
609 }
610 
611 /*
612  * Parse num options at run time.
613  */
614 static int
615 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
616 {
617 	char *end = NULL;
618 	unsigned long num;
619 
620 	errno = 0;
621 
622 	/* parse unsigned int string */
623 	num = strtoul(q_arg, &end, 10);
624 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
625 		return -1;
626 
627 	if (num > max_valid_value)
628 		return -1;
629 
630 	return num;
631 
632 }
633 
634 /*
635  * Display usage
636  */
637 static void
638 us_vhost_usage(const char *prgname)
639 {
640 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
641 	"		--vm2vm [0|1|2]\n"
642 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
643 	"		--socket-file <path>\n"
644 	"		-p PORTMASK: Set mask for ports to be used by application\n"
645 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
646 	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
647 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on RX are enabled\n"
648 	"		--rx-retry-num [0-N]: the number of retries on RX. This takes effect only if retries on RX are enabled\n"
649 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
650 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
651 	"		--socket-file: The path of the socket file.\n"
652 	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
653 	"		--tso [0|1]: disable/enable TCP segment offload.\n"
654 	"		--client: register a vhost-user socket as client mode.\n"
655 	"		--dmas: register dma channel for specific vhost device.\n"
656 	"		--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
657 	"		--builtin-net-driver: enable simple vhost-user net driver\n",
658 	       prgname);
659 }
660 
661 enum {
662 #define OPT_VM2VM               "vm2vm"
663 	OPT_VM2VM_NUM = 256,
664 #define OPT_RX_RETRY            "rx-retry"
665 	OPT_RX_RETRY_NUM,
666 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
667 	OPT_RX_RETRY_DELAY_NUM,
668 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
669 	OPT_RX_RETRY_NUMB_NUM,
670 #define OPT_MERGEABLE           "mergeable"
671 	OPT_MERGEABLE_NUM,
672 #define OPT_STATS               "stats"
673 	OPT_STATS_NUM,
674 #define OPT_SOCKET_FILE         "socket-file"
675 	OPT_SOCKET_FILE_NUM,
676 #define OPT_TX_CSUM             "tx-csum"
677 	OPT_TX_CSUM_NUM,
678 #define OPT_TSO                 "tso"
679 	OPT_TSO_NUM,
680 #define OPT_CLIENT              "client"
681 	OPT_CLIENT_NUM,
682 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
683 	OPT_BUILTIN_NET_DRIVER_NUM,
684 #define OPT_DMAS                "dmas"
685 	OPT_DMAS_NUM,
686 #define OPT_NUM_MBUFS           "total-num-mbufs"
687 	OPT_NUM_MBUFS_NUM,
688 };
689 
690 /*
691  * Parse the arguments given in the command line of the application.
692  */
693 static int
694 us_vhost_parse_args(int argc, char **argv)
695 {
696 	int opt, ret;
697 	int option_index;
698 	unsigned i;
699 	const char *prgname = argv[0];
700 	static struct option long_option[] = {
701 		{OPT_VM2VM, required_argument,
702 				NULL, OPT_VM2VM_NUM},
703 		{OPT_RX_RETRY, required_argument,
704 				NULL, OPT_RX_RETRY_NUM},
705 		{OPT_RX_RETRY_DELAY, required_argument,
706 				NULL, OPT_RX_RETRY_DELAY_NUM},
707 		{OPT_RX_RETRY_NUMB, required_argument,
708 				NULL, OPT_RX_RETRY_NUMB_NUM},
709 		{OPT_MERGEABLE, required_argument,
710 				NULL, OPT_MERGEABLE_NUM},
711 		{OPT_STATS, required_argument,
712 				NULL, OPT_STATS_NUM},
713 		{OPT_SOCKET_FILE, required_argument,
714 				NULL, OPT_SOCKET_FILE_NUM},
715 		{OPT_TX_CSUM, required_argument,
716 				NULL, OPT_TX_CSUM_NUM},
717 		{OPT_TSO, required_argument,
718 				NULL, OPT_TSO_NUM},
719 		{OPT_CLIENT, no_argument,
720 				NULL, OPT_CLIENT_NUM},
721 		{OPT_BUILTIN_NET_DRIVER, no_argument,
722 				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
723 		{OPT_DMAS, required_argument,
724 				NULL, OPT_DMAS_NUM},
725 		{OPT_NUM_MBUFS, required_argument,
726 				NULL, OPT_NUM_MBUFS_NUM},
727 		{NULL, 0, 0, 0},
728 	};
729 
730 	/* Parse command line */
731 	while ((opt = getopt_long(argc, argv, "p:P",
732 			long_option, &option_index)) != EOF) {
733 		switch (opt) {
734 		/* Portmask */
735 		case 'p':
736 			enabled_port_mask = parse_portmask(optarg);
737 			if (enabled_port_mask == 0) {
738 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
739 				us_vhost_usage(prgname);
740 				return -1;
741 			}
742 			break;
743 
744 		case 'P':
745 			promiscuous = 1;
746 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
747 				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
748 				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
749 			break;
750 
751 		case OPT_VM2VM_NUM:
752 			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
753 			if (ret == -1) {
754 				RTE_LOG(INFO, VHOST_CONFIG,
755 					"Invalid argument for "
756 					"vm2vm [0|1|2]\n");
757 				us_vhost_usage(prgname);
758 				return -1;
759 			}
760 			vm2vm_mode = (vm2vm_type)ret;
761 			break;
762 
763 		case OPT_RX_RETRY_NUM:
764 			ret = parse_num_opt(optarg, 1);
765 			if (ret == -1) {
766 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
767 				us_vhost_usage(prgname);
768 				return -1;
769 			}
770 			enable_retry = ret;
771 			break;
772 
773 		case OPT_TX_CSUM_NUM:
774 			ret = parse_num_opt(optarg, 1);
775 			if (ret == -1) {
776 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
777 				us_vhost_usage(prgname);
778 				return -1;
779 			}
780 			enable_tx_csum = ret;
781 			break;
782 
783 		case OPT_TSO_NUM:
784 			ret = parse_num_opt(optarg, 1);
785 			if (ret == -1) {
786 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
787 				us_vhost_usage(prgname);
788 				return -1;
789 			}
790 			enable_tso = ret;
791 			break;
792 
793 		case OPT_RX_RETRY_DELAY_NUM:
794 			ret = parse_num_opt(optarg, INT32_MAX);
795 			if (ret == -1) {
796 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
797 				us_vhost_usage(prgname);
798 				return -1;
799 			}
800 			burst_rx_delay_time = ret;
801 			break;
802 
803 		case OPT_RX_RETRY_NUMB_NUM:
804 			ret = parse_num_opt(optarg, INT32_MAX);
805 			if (ret == -1) {
806 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
807 				us_vhost_usage(prgname);
808 				return -1;
809 			}
810 			burst_rx_retry_num = ret;
811 			break;
812 
813 		case OPT_MERGEABLE_NUM:
814 			ret = parse_num_opt(optarg, 1);
815 			if (ret == -1) {
816 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
817 				us_vhost_usage(prgname);
818 				return -1;
819 			}
820 			mergeable = !!ret;
821 			break;
822 
823 		case OPT_STATS_NUM:
824 			ret = parse_num_opt(optarg, INT32_MAX);
825 			if (ret == -1) {
826 				RTE_LOG(INFO, VHOST_CONFIG,
827 					"Invalid argument for stats [0..N]\n");
828 				us_vhost_usage(prgname);
829 				return -1;
830 			}
831 			enable_stats = ret;
832 			break;
833 
834 		/* Set socket file path. */
835 		case OPT_SOCKET_FILE_NUM:
836 			if (us_vhost_parse_socket_path(optarg) == -1) {
837 				RTE_LOG(INFO, VHOST_CONFIG,
838 				"Invalid argument for socket name (Max %d characters)\n",
839 				PATH_MAX);
840 				us_vhost_usage(prgname);
841 				return -1;
842 			}
843 			break;
844 
845 		case OPT_DMAS_NUM:
846 			if (open_dma(optarg) == -1) {
847 				RTE_LOG(INFO, VHOST_CONFIG,
848 					"Wrong DMA args\n");
849 				us_vhost_usage(prgname);
850 				return -1;
851 			}
852 			break;
853 
854 		case OPT_NUM_MBUFS_NUM:
855 			ret = parse_num_opt(optarg, INT32_MAX);
856 			if (ret == -1) {
857 				RTE_LOG(INFO, VHOST_CONFIG,
858 					"Invalid argument for total-num-mbufs [0..N]\n");
859 				us_vhost_usage(prgname);
860 				return -1;
861 			}
862 
863 			if (total_num_mbufs < ret)
864 				total_num_mbufs = ret;
865 			break;
866 
867 		case OPT_CLIENT_NUM:
868 			client_mode = 1;
869 			break;
870 
871 		case OPT_BUILTIN_NET_DRIVER_NUM:
872 			builtin_net_driver = 1;
873 			break;
874 
875 		/* Invalid option - print options. */
876 		default:
877 			us_vhost_usage(prgname);
878 			return -1;
879 		}
880 	}
881 
882 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
883 		if (enabled_port_mask & (1 << i))
884 			ports[num_ports++] = i;
885 	}
886 
887 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
888 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
889 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
890 		return -1;
891 	}
892 
893 	return 0;
894 }
895 
896 /*
897  * Update the global variable num_ports and the ports[] array according to the
898  * number of ports in the system, and return the number of valid ports.
899  */
900 static unsigned check_ports_num(unsigned nb_ports)
901 {
902 	unsigned valid_num_ports = num_ports;
903 	unsigned portid;
904 
905 	if (num_ports > nb_ports) {
906 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
907 			num_ports, nb_ports);
908 		num_ports = nb_ports;
909 	}
910 
911 	for (portid = 0; portid < num_ports; portid ++) {
912 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
913 			RTE_LOG(INFO, VHOST_PORT,
914 				"\nSpecified port ID(%u) is not valid\n",
915 				ports[portid]);
916 			ports[portid] = INVALID_PORT_ID;
917 			valid_num_ports--;
918 		}
919 	}
920 	return valid_num_ports;
921 }
922 
923 static __rte_always_inline struct vhost_dev *
924 find_vhost_dev(struct rte_ether_addr *mac)
925 {
926 	struct vhost_dev *vdev;
927 
928 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
929 		if (vdev->ready == DEVICE_RX &&
930 		    rte_is_same_ether_addr(mac, &vdev->mac_address))
931 			return vdev;
932 	}
933 
934 	return NULL;
935 }
936 
937 /*
938  * This function learns the MAC address of the device and registers this along with a
939  * vlan tag to a VMDQ.
940  */
941 static int
942 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
943 {
944 	struct rte_ether_hdr *pkt_hdr;
945 	int i, ret;
946 
947 	/* Learn MAC address of guest device from packet */
948 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
949 
950 	if (find_vhost_dev(&pkt_hdr->src_addr)) {
951 		RTE_LOG(ERR, VHOST_DATA,
952 			"(%d) device is using a registered MAC!\n",
953 			vdev->vid);
954 		return -1;
955 	}
956 
957 	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
958 		vdev->mac_address.addr_bytes[i] =
959 			pkt_hdr->src_addr.addr_bytes[i];
960 
961 	/* vlan_tag currently uses the device_id. */
962 	vdev->vlan_tag = vlan_tags[vdev->vid];
963 
964 	/* Print out VMDQ registration info. */
965 	RTE_LOG(INFO, VHOST_DATA,
966 		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
967 		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
968 		vdev->vlan_tag);
969 
970 	/* Register the MAC address. */
971 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
972 				(uint32_t)vdev->vid + vmdq_pool_base);
973 	if (ret)
974 		RTE_LOG(ERR, VHOST_DATA,
975 			"(%d) failed to add device MAC address to VMDQ\n",
976 			vdev->vid);
977 
978 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
979 
980 	/* Set device as ready for RX. */
981 	vdev->ready = DEVICE_RX;
982 
983 	return 0;
984 }
985 
986 /*
987  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
988  * queue before disabling RX on the device.
989  */
990 static inline void
991 unlink_vmdq(struct vhost_dev *vdev)
992 {
993 	unsigned i = 0;
994 	unsigned rx_count;
995 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
996 
997 	if (vdev->ready == DEVICE_RX) {
998 		/*clear MAC and VLAN settings*/
999 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1000 		for (i = 0; i < 6; i++)
1001 			vdev->mac_address.addr_bytes[i] = 0;
1002 
1003 		vdev->vlan_tag = 0;
1004 
1005 		/*Clear out the receive buffers*/
1006 		rx_count = rte_eth_rx_burst(ports[0],
1007 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1008 
1009 		while (rx_count) {
1010 			for (i = 0; i < rx_count; i++)
1011 				rte_pktmbuf_free(pkts_burst[i]);
1012 
1013 			rx_count = rte_eth_rx_burst(ports[0],
1014 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1015 		}
1016 
1017 		vdev->ready = DEVICE_MAC_LEARNING;
1018 	}
1019 }
1020 
1021 static inline void
1022 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1023 {
1024 	while (n--)
1025 		rte_pktmbuf_free(pkts[n]);
1026 }
1027 
1028 static __rte_always_inline void
1029 complete_async_pkts(struct vhost_dev *vdev)
1030 {
1031 	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1032 	uint16_t complete_count;
1033 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1034 
1035 	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1036 					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1037 	if (complete_count)
1038 		free_pkts(p_cpl, complete_count);
1039 
1040 }
1041 
1042 static __rte_always_inline void
1043 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1044 	    struct rte_mbuf *m)
1045 {
1046 	uint16_t ret;
1047 
1048 	if (builtin_net_driver) {
1049 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1050 	} else {
1051 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1052 	}
1053 
1054 	if (enable_stats) {
1055 		rte_atomic_fetch_add_explicit(&dst_vdev->stats.rx_total_atomic, 1,
1056 				rte_memory_order_seq_cst);
1057 		rte_atomic_fetch_add_explicit(&dst_vdev->stats.rx_atomic, ret,
1058 				rte_memory_order_seq_cst);
1059 		src_vdev->stats.tx_total++;
1060 		src_vdev->stats.tx += ret;
1061 	}
1062 }
1063 
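/*
 * Flush this lcore's buffered packets for the given vhost device into its
 * Rx virtqueue. In the sync path all mbufs are freed here; in the async
 * (DMA) path only the packets that failed to enqueue are freed, completed
 * copies are reclaimed later via complete_async_pkts().
 */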
1064 static __rte_always_inline void
1065 drain_vhost(struct vhost_dev *vdev)
1066 {
1067 	uint16_t ret;
1068 	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1069 	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1070 	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1071 
1072 	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1073 
1074 	if (enable_stats) {
1075 		rte_atomic_fetch_add_explicit(&vdev->stats.rx_total_atomic, nr_xmit,
1076 				rte_memory_order_seq_cst);
1077 		rte_atomic_fetch_add_explicit(&vdev->stats.rx_atomic, ret,
1078 				rte_memory_order_seq_cst);
1079 	}
1080 
1081 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1082 		free_pkts(m, nr_xmit);
1083 	} else {
1084 		uint16_t enqueue_fail = nr_xmit - ret;
1085 		if (enqueue_fail > 0)
1086 			free_pkts(&m[ret], enqueue_fail);
1087 	}
1088 }
1089 
1090 static __rte_always_inline void
1091 drain_vhost_table(void)
1092 {
1093 	uint16_t lcore_id = rte_lcore_id();
1094 	struct vhost_bufftable *vhost_txq;
1095 	struct vhost_dev *vdev;
1096 	uint64_t cur_tsc;
1097 
1098 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1099 		if (unlikely(vdev->remove == 1))
1100 			continue;
1101 
1102 		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1103 
1104 		cur_tsc = rte_rdtsc();
1105 		if (unlikely(cur_tsc - vhost_txq->pre_tsc
1106 				> MBUF_TABLE_DRAIN_TSC)) {
1107 			RTE_LOG_DP(DEBUG, VHOST_DATA,
1108 				"Vhost TX queue drained after timeout with burst size %u\n",
1109 				vhost_txq->len);
1110 			drain_vhost(vdev);
1111 			vhost_txq->len = 0;
1112 			vhost_txq->pre_tsc = cur_tsc;
1113 		}
1114 	}
1115 }
1116 
1117 /*
1118  * Check if the packet destination MAC address is for a local device. If so then put
1119  * the packet on that device's RX queue. If not, return.
1120  */
1121 static __rte_always_inline int
1122 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1123 {
1124 	struct rte_ether_hdr *pkt_hdr;
1125 	struct vhost_dev *dst_vdev;
1126 	struct vhost_bufftable *vhost_txq;
1127 	uint16_t lcore_id = rte_lcore_id();
1128 	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1129 
1130 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1131 	if (!dst_vdev)
1132 		return -1;
1133 
1134 	if (vdev->vid == dst_vdev->vid) {
1135 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1136 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1137 			vdev->vid);
1138 		return 0;
1139 	}
1140 
1141 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1142 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
1143 
1144 	if (unlikely(dst_vdev->remove)) {
1145 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1146 			"(%d) device is marked for removal\n", dst_vdev->vid);
1147 		return 0;
1148 	}
1149 
1150 	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1151 	vhost_txq->m_table[vhost_txq->len++] = m;
1152 
1153 	if (enable_stats) {
1154 		vdev->stats.tx_total++;
1155 		vdev->stats.tx++;
1156 	}
1157 
1158 	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1159 		drain_vhost(dst_vdev);
1160 		vhost_txq->len = 0;
1161 		vhost_txq->pre_tsc = rte_rdtsc();
1162 	}
1163 	return 0;
1164 }
1165 
1166 /*
1167  * Check if the destination MAC of a packet is one local VM,
1168  * and get its vlan tag, and offset if it is.
1169  */
1170 static __rte_always_inline int
1171 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1172 	uint32_t *offset, uint16_t *vlan_tag)
1173 {
1174 	struct vhost_dev *dst_vdev;
1175 	struct rte_ether_hdr *pkt_hdr =
1176 		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1177 
1178 	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1179 	if (!dst_vdev)
1180 		return 0;
1181 
1182 	if (vdev->vid == dst_vdev->vid) {
1183 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1184 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
1185 			vdev->vid);
1186 		return -1;
1187 	}
1188 
1189 	/*
1190 	 * HW VLAN strip reduces the packet length by the length of
1191 	 * the VLAN tag, so we need to restore the packet length by
1192 	 * adding it back.
1193 	 */
1194 	*offset  = RTE_VLAN_HLEN;
1195 	*vlan_tag = vlan_tags[vdev->vid];
1196 
1197 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1198 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1199 		vdev->vid, dst_vdev->vid, *vlan_tag);
1200 
1201 	return 0;
1202 }
1203 
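/*
 * Prepare TSO transmit offload for a packet received with LRO: fill in the
 * l2/l3/l4 header lengths, request IP/TCP checksum offload as appropriate
 * and seed the TCP checksum with the pseudo-header checksum.
 */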
1204 static void virtio_tx_offload(struct rte_mbuf *m)
1205 {
1206 	struct rte_net_hdr_lens hdr_lens;
1207 	struct rte_ipv4_hdr *ipv4_hdr;
1208 	struct rte_tcp_hdr *tcp_hdr;
1209 	uint32_t ptype;
1210 	void *l3_hdr;
1211 
1212 	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1213 	m->l2_len = hdr_lens.l2_len;
1214 	m->l3_len = hdr_lens.l3_len;
1215 	m->l4_len = hdr_lens.l4_len;
1216 
1217 	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1218 	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1219 		m->l2_len + m->l3_len);
1220 
1221 	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1222 	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1223 		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1224 		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1225 		ipv4_hdr = l3_hdr;
1226 		ipv4_hdr->hdr_checksum = 0;
1227 		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1228 	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1229 		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1230 		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1231 	}
1232 }
1233 
1234 static __rte_always_inline void
1235 do_drain_mbuf_table(struct mbuf_table *tx_q)
1236 {
1237 	uint16_t count;
1238 
1239 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1240 				 tx_q->m_table, tx_q->len);
1241 	if (unlikely(count < tx_q->len))
1242 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
1243 
1244 	tx_q->len = 0;
1245 }
1246 
1247 /*
1248  * This function routes the TX packet to the correct interface. This
1249  * may be a local device or the physical port.
1250  */
1251 static __rte_always_inline void
1252 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1253 {
1254 	struct mbuf_table *tx_q;
1255 	unsigned offset = 0;
1256 	const uint16_t lcore_id = rte_lcore_id();
1257 	struct rte_ether_hdr *nh;
1258 
1259 
1260 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1261 	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1262 		struct vhost_dev *vdev2;
1263 
1264 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1265 			if (vdev2 != vdev)
1266 				sync_virtio_xmit(vdev2, vdev, m);
1267 		}
1268 		goto queue2nic;
1269 	}
1270 
1271 	/*check if destination is local VM*/
1272 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1273 		return;
1274 
1275 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1276 		if (unlikely(find_local_dest(vdev, m, &offset,
1277 					     &vlan_tag) != 0)) {
1278 			rte_pktmbuf_free(m);
1279 			return;
1280 		}
1281 	}
1282 
1283 	RTE_LOG_DP(DEBUG, VHOST_DATA,
1284 		"(%d) TX: MAC address is external\n", vdev->vid);
1285 
1286 queue2nic:
1287 
1288 	/*Add packet to the port tx queue*/
1289 	tx_q = &lcore_tx_queue[lcore_id];
1290 
1291 	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1292 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1293 		/* Guest has inserted the vlan tag. */
1294 		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1295 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1296 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
1297 			(vh->vlan_tci != vlan_tag_be))
1298 			vh->vlan_tci = vlan_tag_be;
1299 	} else {
1300 		m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1301 
1302 		/*
1303 		 * Find the right seg to adjust the data len when offset is
1304 		 * bigger than tail room size.
1305 		 */
1306 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1307 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1308 				m->data_len += offset;
1309 			else {
1310 				struct rte_mbuf *seg = m;
1311 
1312 				while ((seg->next != NULL) &&
1313 					(offset > rte_pktmbuf_tailroom(seg)))
1314 					seg = seg->next;
1315 
1316 				seg->data_len += offset;
1317 			}
1318 			m->pkt_len += offset;
1319 		}
1320 
1321 		m->vlan_tci = vlan_tag;
1322 	}
1323 
1324 	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1325 		virtio_tx_offload(m);
1326 
1327 	tx_q->m_table[tx_q->len++] = m;
1328 	if (enable_stats) {
1329 		vdev->stats.tx_total++;
1330 		vdev->stats.tx++;
1331 	}
1332 
1333 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1334 		do_drain_mbuf_table(tx_q);
1335 }
1336 
1337 
1338 static __rte_always_inline void
1339 drain_mbuf_table(struct mbuf_table *tx_q)
1340 {
1341 	static uint64_t prev_tsc;
1342 	uint64_t cur_tsc;
1343 
1344 	if (tx_q->len == 0)
1345 		return;
1346 
1347 	cur_tsc = rte_rdtsc();
1348 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1349 		prev_tsc = cur_tsc;
1350 
1351 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1352 			"TX queue drained after timeout with burst size %u\n",
1353 			tx_q->len);
1354 		do_drain_mbuf_table(tx_q);
1355 	}
1356 }
1357 
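/*
 * Enqueue a burst into the guest Rx virtqueue through the asynchronous
 * (DMA-assisted) vhost path; previously submitted copies are completed and
 * their mbufs freed first via complete_async_pkts().
 */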
1358 uint16_t
1359 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1360 		struct rte_mbuf **pkts, uint32_t rx_count)
1361 {
1362 	uint16_t enqueue_count;
1363 	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1364 
1365 	complete_async_pkts(dev);
1366 	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1367 					pkts, rx_count, dma_id, 0);
1368 
1369 	return enqueue_count;
1370 }
1371 
1372 uint16_t
1373 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1374 		struct rte_mbuf **pkts, uint32_t rx_count)
1375 {
1376 	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1377 }
1378 
1379 static __rte_always_inline void
1380 drain_eth_rx(struct vhost_dev *vdev)
1381 {
1382 	uint16_t rx_count, enqueue_count;
1383 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1384 
1385 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1386 				    pkts, MAX_PKT_BURST);
1387 
1388 	if (!rx_count)
1389 		return;
1390 
1391 	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1392 						VIRTIO_RXQ, pkts, rx_count);
1393 
1394 	/* Retry if necessary */
1395 	if (enable_retry && unlikely(enqueue_count < rx_count)) {
1396 		uint32_t retry = 0;
1397 
1398 		while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
1399 			rte_delay_us(burst_rx_delay_time);
1400 			enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1401 							VIRTIO_RXQ, &pkts[enqueue_count],
1402 							rx_count - enqueue_count);
1403 		}
1404 	}
1405 
1406 	if (enable_stats) {
1407 		rte_atomic_fetch_add_explicit(&vdev->stats.rx_total_atomic, rx_count,
1408 				rte_memory_order_seq_cst);
1409 		rte_atomic_fetch_add_explicit(&vdev->stats.rx_atomic, enqueue_count,
1410 				rte_memory_order_seq_cst);
1411 	}
1412 
1413 	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) {
1414 		free_pkts(pkts, rx_count);
1415 	} else {
1416 		uint16_t enqueue_fail = rx_count - enqueue_count;
1417 		if (enqueue_fail > 0)
1418 			free_pkts(&pkts[enqueue_count], enqueue_fail);
1419 	}
1420 }
1421 
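/*
 * Dequeue a burst from the guest Tx virtqueue through the asynchronous
 * (DMA-assisted) vhost path.
 */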
1422 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1423 			    struct rte_mempool *mbuf_pool,
1424 			    struct rte_mbuf **pkts, uint16_t count)
1425 {
1426 	int nr_inflight;
1427 	uint16_t dequeue_count;
1428 	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1429 
1430 	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1431 			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1432 
1433 	return dequeue_count;
1434 }
1435 
1436 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1437 			   struct rte_mempool *mbuf_pool,
1438 			   struct rte_mbuf **pkts, uint16_t count)
1439 {
1440 	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1441 }
1442 
1443 static __rte_always_inline void
1444 drain_virtio_tx(struct vhost_dev *vdev)
1445 {
1446 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1447 	uint16_t count;
1448 	uint16_t i;
1449 
1450 	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1451 				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1452 
1453 	/* setup VMDq for the first packet */
1454 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1455 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1456 			free_pkts(pkts, count);
1457 	}
1458 
1459 	for (i = 0; i < count; ++i)
1460 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1461 }
1462 
1463 /*
1464  * Main function of vhost-switch. It basically does:
1465  *
1466  * for each vhost device {
1467  *    - drain_eth_rx()
1468  *
1469  *      Which drains the host eth Rx queue linked to the vhost device,
1470  *      and deliver all of them to guest virito Rx ring associated with
1471  *      this vhost device.
1472  *
1473  *    - drain_virtio_tx()
1474  *
1475  *      Which drains the guest virtio Tx queue and delivers all of them
1476  *      to the target, which could be another vhost device, or the
1477  *      physical eth dev. The route is done in function "virtio_tx_route".
1478  * }
1479  */
1480 static int
1481 switch_worker(void *arg __rte_unused)
1482 {
1483 	unsigned i;
1484 	unsigned lcore_id = rte_lcore_id();
1485 	struct vhost_dev *vdev;
1486 	struct mbuf_table *tx_q;
1487 
1488 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1489 
1490 	tx_q = &lcore_tx_queue[lcore_id];
1491 	for (i = 0; i < rte_lcore_count(); i++) {
1492 		if (lcore_ids[i] == lcore_id) {
1493 			tx_q->txq_id = i;
1494 			break;
1495 		}
1496 	}
1497 
1498 	while(1) {
1499 		drain_mbuf_table(tx_q);
1500 		drain_vhost_table();
1501 		/*
1502 		 * Inform the configuration core that we have exited the
1503 		 * linked list and that no devices are in use if requested.
1504 		 */
1505 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1506 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1507 
1508 		/*
1509 		 * Process vhost devices
1510 		 */
1511 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1512 			      lcore_vdev_entry) {
1513 			if (unlikely(vdev->remove)) {
1514 				unlink_vmdq(vdev);
1515 				vdev->ready = DEVICE_SAFE_REMOVE;
1516 				continue;
1517 			}
1518 
1519 			if (likely(vdev->ready == DEVICE_RX))
1520 				drain_eth_rx(vdev);
1521 
1522 			if (likely(!vdev->remove))
1523 				drain_virtio_tx(vdev);
1524 		}
1525 	}
1526 
1527 	return 0;
1528 }
1529 
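/*
 * Drain all in-flight async copies on the given virtqueue and free the
 * completed mbufs. Thread-unsafe variant: the caller must ensure the
 * virtqueue is not being processed concurrently.
 */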
1530 static void
1531 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1532 {
1533 	uint16_t n_pkt = 0;
1534 	int pkts_inflight;
1535 
1536 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1537 	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1538 
1539 	struct rte_mbuf *m_cpl[pkts_inflight];
1540 
1541 	while (pkts_inflight) {
1542 		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1543 							pkts_inflight, dma_id, 0);
1544 		free_pkts(m_cpl, n_pkt);
1545 		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1546 									queue_id);
1547 	}
1548 }
1549 
1550 static void
1551 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
1552 {
1553 	uint16_t n_pkt = 0;
1554 	int pkts_inflight;
1555 
1556 	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1557 	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1558 
1559 	struct rte_mbuf *m_cpl[pkts_inflight];
1560 
1561 	while (pkts_inflight) {
1562 		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
1563 						pkts_inflight, dma_id, 0);
1564 		free_pkts(m_cpl, n_pkt);
1565 		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
1566 	}
1567 }
1568 
1569 /*
1570  * Remove a device from the specific data core linked list and from the
1571  * main linked list. Synchronization occurs through the use of the
1572  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1573  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1574  */
1575 static void
1576 destroy_device(int vid)
1577 {
1578 	struct vhost_dev *vdev = NULL;
1579 	int lcore;
1580 	uint16_t i;
1581 
1582 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1583 		if (vdev->vid == vid)
1584 			break;
1585 	}
1586 	if (!vdev)
1587 		return;
1588 	/*set the remove flag. */
1589 	vdev->remove = 1;
1590 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1591 		rte_pause();
1592 	}
1593 
1594 	for (i = 0; i < RTE_MAX_LCORE; i++)
1595 		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1596 
1597 	if (builtin_net_driver)
1598 		vs_vhost_net_remove(vdev);
1599 
1600 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1601 		     lcore_vdev_entry);
1602 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1603 
1604 
1605 	/* Set the dev_removal_flag on each lcore. */
1606 	RTE_LCORE_FOREACH_WORKER(lcore)
1607 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1608 
1609 	/*
1610 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1611 	 * we can be sure that they can no longer access the device removed
1612 	 * from the linked lists and that the devices are no longer in use.
1613 	 */
1614 	RTE_LCORE_FOREACH_WORKER(lcore) {
1615 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1616 			rte_pause();
1617 	}
1618 
1619 	lcore_info[vdev->coreid].device_num--;
1620 
1621 	RTE_LOG(INFO, VHOST_DATA,
1622 		"(%d) device has been removed from data core\n",
1623 		vdev->vid);
1624 
1625 	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1626 		vhost_clear_queue(vdev, VIRTIO_RXQ);
1627 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1628 		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1629 	}
1630 
1631 	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1632 		vhost_clear_queue(vdev, VIRTIO_TXQ);
1633 		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1634 		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1635 	}
1636 
1637 	rte_free(vdev);
1638 }
1639 
1640 static inline int
1641 get_socketid_by_vid(int vid)
1642 {
1643 	int i;
1644 	char ifname[PATH_MAX];
1645 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1646 
1647 	for (i = 0; i < nb_sockets; i++) {
1648 		char *file = socket_files + i * PATH_MAX;
1649 		if (strcmp(file, ifname) == 0)
1650 			return i;
1651 	}
1652 
1653 	return -1;
1654 }
1655 
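/*
 * Select the per-device enqueue/dequeue callbacks: the builtin net driver,
 * or the async (DMA) vs. sync vhost library paths depending on whether a
 * DMA channel was bound to the corresponding virtqueue.
 */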
1656 static int
1657 init_vhost_queue_ops(int vid)
1658 {
1659 	if (builtin_net_driver) {
1660 		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1661 		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1662 	} else {
1663 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1664 			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1665 		else
1666 			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1667 
1668 		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1669 			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1670 		else
1671 			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1672 	}
1673 
1674 	return 0;
1675 }
1676 
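/*
 * Register vhost async channels for any virtqueue that has a DMA device
 * bound via --dmas. Returns 0 on success, or a non-zero value if either
 * registration failed.
 */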
1677 static inline int
1678 vhost_async_channel_register(int vid)
1679 {
1680 	int rx_ret = 0, tx_ret = 0;
1681 
1682 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1683 		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1684 		if (rx_ret == 0)
1685 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1686 	}
1687 
1688 	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1689 		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1690 		if (tx_ret == 0)
1691 			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1692 	}
1693 
1694 	return rx_ret | tx_ret;
1695 }
1696 
1697 
1698 
1699 /*
1700  * A new device is added to a data core. First the device is added to the main linked list
1701  * and then allocated to a specific data core.
1702  */
1703 static int
1704 new_device(int vid)
1705 {
1706 	int lcore, core_add = 0;
1707 	uint16_t i;
1708 	uint32_t device_num_min = num_devices;
1709 	struct vhost_dev *vdev;
1710 	int ret;
1711 
1712 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1713 	if (vdev == NULL) {
1714 		RTE_LOG(INFO, VHOST_DATA,
1715 			"(%d) couldn't allocate memory for vhost dev\n",
1716 			vid);
1717 		return -1;
1718 	}
1719 	vdev->vid = vid;
1720 
1721 	for (i = 0; i < RTE_MAX_LCORE; i++) {
1722 		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1723 			= rte_zmalloc("vhost bufftable",
1724 				sizeof(struct vhost_bufftable),
1725 				RTE_CACHE_LINE_SIZE);
1726 
1727 		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1728 			RTE_LOG(INFO, VHOST_DATA,
1729 			  "(%d) couldn't allocate memory for vhost TX\n", vid);
1730 			return -1;
1731 		}
1732 	}
1733 
1734 	int socketid = get_socketid_by_vid(vid);
1735 	if (socketid == -1)
1736 		return -1;
1737 
1738 	init_vid2socketid_array(vid, socketid);
1739 
1740 	ret =  vhost_async_channel_register(vid);
1741 
1742 	if (init_vhost_queue_ops(vid) != 0)
1743 		return -1;
1744 
1745 	if (builtin_net_driver)
1746 		vs_vhost_net_setup(vdev);
1747 
1748 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1749 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1750 
1751 	/*reset ready flag*/
1752 	vdev->ready = DEVICE_MAC_LEARNING;
1753 	vdev->remove = 0;
1754 
1755 	/* Find a suitable lcore to add the device. */
1756 	RTE_LCORE_FOREACH_WORKER(lcore) {
1757 		if (lcore_info[lcore].device_num < device_num_min) {
1758 			device_num_min = lcore_info[lcore].device_num;
1759 			core_add = lcore;
1760 		}
1761 	}
1762 	vdev->coreid = core_add;
1763 
1764 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1765 			  lcore_vdev_entry);
1766 	lcore_info[vdev->coreid].device_num++;
1767 
1768 	/* Disable notifications. */
1769 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1770 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1771 
1772 	RTE_LOG(INFO, VHOST_DATA,
1773 		"(%d) device has been added to data core %d\n",
1774 		vid, vdev->coreid);
1775 
1776 	return ret;
1777 }
1778 
1779 static int
1780 vring_state_changed(int vid, uint16_t queue_id, int enable)
1781 {
1782 	struct vhost_dev *vdev = NULL;
1783 
1784 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1785 		if (vdev->vid == vid)
1786 			break;
1787 	}
1788 	if (!vdev)
1789 		return -1;
1790 
1791 	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1792 		if (!enable)
1793 			vhost_clear_queue_thread_unsafe(vdev, queue_id);
1794 	}
1795 
1796 	return 0;
1797 }
1798 
1799 /*
1800  * These callbacks allow devices to be added to the data core when configuration
1801  * has been fully completed.
1802  */
1803 static const struct rte_vhost_device_ops virtio_net_device_ops =
1804 {
1805 	.new_device =  new_device,
1806 	.destroy_device = destroy_device,
1807 	.vring_state_changed = vring_state_changed,
1808 };
1809 
1810 /*
1811  * This is a thread that wakes up periodically to print stats if the user has
1812  * enabled them.
1813  */
1814 static uint32_t
1815 print_stats(__rte_unused void *arg)
1816 {
1817 	struct vhost_dev *vdev;
1818 	uint64_t tx_dropped, rx_dropped;
1819 	uint64_t tx, tx_total, rx, rx_total;
1820 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1821 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1822 
1823 	while (1) {
1824 		sleep(enable_stats);
1825 
1826 		/* Clear screen and move to top left */
1827 		printf("%s%s\n", clr, top_left);
1828 		printf("Device statistics =================================\n");
1829 
1830 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1831 			tx_total   = vdev->stats.tx_total;
1832 			tx         = vdev->stats.tx;
1833 			tx_dropped = tx_total - tx;
1834 
1835 			rx_total = rte_atomic_load_explicit(&vdev->stats.rx_total_atomic,
1836 				rte_memory_order_seq_cst);
1837 			rx         = rte_atomic_load_explicit(&vdev->stats.rx_atomic,
1838 				rte_memory_order_seq_cst);
1839 			rx_dropped = rx_total - rx;
1840 
1841 			printf("Statistics for device %d\n"
1842 				"-----------------------\n"
1843 				"TX total:              %" PRIu64 "\n"
1844 				"TX dropped:            %" PRIu64 "\n"
1845 				"TX successful:         %" PRIu64 "\n"
1846 				"RX total:              %" PRIu64 "\n"
1847 				"RX dropped:            %" PRIu64 "\n"
1848 				"RX successful:         %" PRIu64 "\n",
1849 				vdev->vid,
1850 				tx_total, tx_dropped, tx,
1851 				rx_total, rx_dropped, rx);
1852 		}
1853 
1854 		printf("===================================================\n");
1855 
1856 		fflush(stdout);
1857 	}
1858 
1859 	return 0;
1860 }
1861 
1862 static void
1863 unregister_drivers(int socket_num)
1864 {
1865 	int i, ret;
1866 
1867 	for (i = 0; i < socket_num; i++) {
1868 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1869 		if (ret != 0)
1870 			RTE_LOG(ERR, VHOST_CONFIG,
1871 				"Failed to unregister vhost driver for %s.\n",
1872 				socket_files + i * PATH_MAX);
1873 	}
1874 }
1875 
1876 /* When we receive an INT signal, unregister the vhost driver */
1877 static void
1878 sigint_handler(__rte_unused int signum)
1879 {
1880 	/* Unregister vhost driver. */
1881 	unregister_drivers(nb_sockets);
1882 
1883 	exit(0);
1884 }
1885 
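/*
 * Start from a clean slate: every per-queue DMA binding and every DMA id is
 * marked invalid, so a queue only becomes async once a DMA device is
 * explicitly bound to it.
 */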
1886 static void
1887 reset_dma(void)
1888 {
1889 	int i;
1890 
1891 	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1892 		int j;
1893 
1894 		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1895 			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1896 			dma_bind[i].dmas[j].async_enabled = false;
1897 		}
1898 	}
1899 
1900 	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1901 		dmas_id[i] = INVALID_DMA_ID;
1902 }
1903 
1904 /*
1905  * Main function, does initialisation and calls the per-lcore functions.
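 *
 * Illustrative invocation only; the accepted options are defined by
 * us_vhost_parse_args(), and the core list, portmask and socket path shown
 * here are placeholders, not defaults:
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/vhost-net.sock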
1906  */
1907 int
1908 main(int argc, char *argv[])
1909 {
1910 	unsigned lcore_id, core_id = 0;
1911 	unsigned nb_ports, valid_num_ports;
1912 	int ret, i;
1913 	uint16_t portid;
1914 	rte_thread_t tid;
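	/* Ask the vhost library for virtio-net compliant handling of offload
	 * flags on the sockets registered below; the client mode and async
	 * copy flags may be OR-ed in later.
	 */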
1915 	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1916 
1917 	signal(SIGINT, sigint_handler);
1918 
1919 	/* init EAL */
1920 	ret = rte_eal_init(argc, argv);
1921 	if (ret < 0)
1922 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1923 	argc -= ret;
1924 	argv += ret;
1925 
1926 	/* initialize dma structures */
1927 	reset_dma();
1928 
1929 	/* parse app arguments */
1930 	ret = us_vhost_parse_args(argc, argv);
1931 	if (ret < 0)
1932 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1933 
1934 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1935 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1936 
1937 		if (rte_lcore_is_enabled(lcore_id))
1938 			lcore_ids[core_id++] = lcore_id;
1939 	}
1940 
1941 	if (rte_lcore_count() > RTE_MAX_LCORE)
1942 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1943 
1944 	/* Get the number of physical ports. */
1945 	nb_ports = rte_eth_dev_count_avail();
1946 
1947 	/*
1948 	 * Update the global num_ports and the ports array, and derive
1949 	 * valid_num_ports from the number of ports available in the system.
1950 	 */
1951 	valid_num_ports = check_ports_num(nb_ports);
1952 
1953 	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1954 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1955 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1956 		return -1;
1957 	}
1958 
1959 	/*
1960 	 * FIXME: here we are trying to allocate mbufs big enough for
1961 	 * @MAX_QUEUES, but the truth is we're never going to use that
1962 	 * many queues here. We probably should only do allocation for
1963 	 * those queues we are going to use.
1964 	 */
1965 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1966 					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1967 					    rte_socket_id());
1968 	if (mbuf_pool == NULL)
1969 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1970 
1971 	if (vm2vm_mode == VM2VM_HARDWARE) {
1972 		/* Enable VT loopback so the NIC's L2 switch can forward VM2VM traffic. */
1973 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1974 		RTE_LOG(DEBUG, VHOST_CONFIG,
1975 			"Enable loop back for L2 switch in vmdq.\n");
1976 	}
1977 
1978 	/* initialize all ports */
1979 	RTE_ETH_FOREACH_DEV(portid) {
1980 		/* skip ports that are not enabled */
1981 		if ((enabled_port_mask & (1 << portid)) == 0) {
1982 			RTE_LOG(INFO, VHOST_PORT,
1983 				"Skipping disabled port %d\n", portid);
1984 			continue;
1985 		}
1986 		if (port_init(portid) != 0)
1987 			rte_exit(EXIT_FAILURE,
1988 				"Cannot initialize network ports\n");
1989 	}
1990 
1991 	/* Enable stats if the user option is set. */
1992 	if (enable_stats) {
1993 		ret = rte_thread_create_control(&tid, "dpdk-vhost-stat",
1994 					print_stats, NULL);
1995 		if (ret < 0)
1996 			rte_exit(EXIT_FAILURE,
1997 				"Cannot create dpdk-vhost-stat thread\n");
1998 	}
1999 
2000 	/* Launch all data cores. */
2001 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2002 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
2003 
2004 	if (client_mode)
2005 		flags |= RTE_VHOST_USER_CLIENT;
2006 
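	/*
	 * Make every DMA device recorded in dmas_id[] known to the vhost
	 * library (virtual channel 0) before the async data path can use it.
	 */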
2007 	for (i = 0; i < dma_count; i++) {
2008 		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
2009 			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA %d in vhost.\n", dmas_id[i]);
2010 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2011 		}
2012 	}
2013 
2014 	/* Register vhost user driver to handle vhost messages. */
2015 	for (i = 0; i < nb_sockets; i++) {
2016 		char *file = socket_files + i * PATH_MAX;
2017 
2018 		if (dma_count && get_async_flag_by_socketid(i) != 0)
2019 			flags |= RTE_VHOST_USER_ASYNC_COPY;
2020 
2021 		ret = rte_vhost_driver_register(file, flags);
2022 		if (ret != 0) {
2023 			unregister_drivers(i);
2024 			rte_exit(EXIT_FAILURE,
2025 				"vhost driver register failure.\n");
2026 		}
2027 
2028 		if (builtin_net_driver)
2029 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2030 
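		/*
		 * Mask out optional virtio-net features the user did not ask
		 * for, so the frontend cannot negotiate them: mergeable RX
		 * buffers, checksum offload and TSO are each gated on their
		 * own command line switch below.
		 */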
2031 		if (mergeable == 0) {
2032 			rte_vhost_driver_disable_features(file,
2033 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
2034 		}
2035 
2036 		if (enable_tx_csum == 0) {
2037 			rte_vhost_driver_disable_features(file,
2038 				1ULL << VIRTIO_NET_F_CSUM);
2039 		}
2040 
2041 		if (enable_tso == 0) {
2042 			rte_vhost_driver_disable_features(file,
2043 				1ULL << VIRTIO_NET_F_HOST_TSO4);
2044 			rte_vhost_driver_disable_features(file,
2045 				1ULL << VIRTIO_NET_F_HOST_TSO6);
2046 			rte_vhost_driver_disable_features(file,
2047 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
2048 			rte_vhost_driver_disable_features(file,
2049 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
2050 		}
2051 
2052 		if (promiscuous) {
2053 			rte_vhost_driver_enable_features(file,
2054 				1ULL << VIRTIO_NET_F_CTRL_RX);
2055 		}
2056 
2057 		ret = rte_vhost_driver_callback_register(file,
2058 			&virtio_net_device_ops);
2059 		if (ret != 0) {
2060 			rte_exit(EXIT_FAILURE,
2061 				"failed to register vhost driver callbacks.\n");
2062 		}
2063 
2064 		if (rte_vhost_driver_start(file) < 0) {
2065 			rte_exit(EXIT_FAILURE,
2066 				"failed to start vhost driver.\n");
2067 		}
2068 	}
2069 
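	/*
	 * Block until the worker cores return; in practice the process
	 * normally exits from sigint_handler() instead of reaching the
	 * cleanup path below.
	 */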
2070 	RTE_LCORE_FOREACH_WORKER(lcore_id)
2071 		rte_eal_wait_lcore(lcore_id);
2072 
2073 	for (i = 0; i < dma_count; i++) {
2074 		if (rte_vhost_async_dma_unconfigure(dmas_id[i], 0) < 0) {
2075 			RTE_LOG(ERR, VHOST_PORT,
2076 				"Failed to unconfigure DMA %d in vhost.\n", dmas_id[i]);
2077 			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
2078 		}
2079 	}
2080 
2081 	/* clean up the EAL */
2082 	rte_eal_cleanup();
2083 
2084 	return 0;
2085 }
2086