/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <ctype.h>
#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

#define NUM_MBUFS_DEFAULT 0x24000

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX	    1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

#define ASYNC_ENQUEUE_VHOST 1
#define ASYNC_DEQUEUE_VHOST 2

/* number of mbufs in all pools - if specified on command-line. */
static int total_num_mbufs = NUM_MBUFS_DEFAULT;

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
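
/*
 * The vdev_queue_ops[] table above holds the per-device enqueue/dequeue
 * function pointers; init_vhost_queue_ops() fills it in when a device is
 * added, selecting the builtin net driver, the DMA-accelerated async paths
 * or the plain synchronous vhost API.
 */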

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350.
		 * Without it, IPv4 forwarding in the guest cannot forward
		 * packets from one virtio dev to another virtio dev.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
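
/*
 * The per-core vhost TX buffers below are kept in a flat array indexed as
 * lcore_id * RTE_MAX_VHOST_DEVICE + vid (see drain_vhost()). For example,
 * with RTE_MAX_VHOST_DEVICE of, say, 1024, core 2 and vid 5 use slot 2053.
 */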

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

static int vid2socketid[RTE_MAX_VHOST_DEVICE];

static inline uint32_t
get_async_flag_by_socketid(int socketid)
{
	return dma_bind[socketid].async_flag;
}

static inline void
init_vid2socketid_array(int vid, int socketid)
{
	vid2socketid[vid] = socketid;
}

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}

static inline int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t socketid, vring_id;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* process DMA devices within bracket. */
	addrs++;
	substr = strtok(addrs, ";]");
	if (!substr) {
		ret = -1;
		goto out;
	}

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
	if (args_nr <= 0) {
		ret = -1;
		goto out;
	}

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		char *txd, *rxd;
		uint8_t sub_nr;
		int async_flag;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
		if (sub_nr != 2) {
			ret = -1;
			goto out;
		}

		txd = strstr(ptrs[0], "txd");
		rxd = strstr(ptrs[0], "rxd");
		if (txd) {
			start = txd;
			vring_id = VIRTIO_RXQ;
			async_flag = ASYNC_ENQUEUE_VHOST;
		} else if (rxd) {
			start = rxd;
			vring_id = VIRTIO_TXQ;
			async_flag = ASYNC_DEQUEUE_VHOST;
		} else {
			ret = -1;
			goto out;
		}

		start += 3;
		socketid = strtol(start, &end, 0);
		if (end == start) {
			ret = -1;
			goto out;
		}

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by DMA device */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG,
					"Fail to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
		(dma_info + socketid)->async_flag |= async_flag;
		i++;
	}
out:
	free(input);
	return ret;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}
	if (dev_info.max_vmdq_pools == 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
		return -1;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}
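
/*
 * Worked example of the VMDq math above (the numbers are illustrative only):
 * a NIC reporting max_rx_queues = 192, vmdq_queue_num = 128 and
 * max_vmdq_pools = 32 yields num_pf_queues = 64, queues_per_pool = 4 and,
 * with all 32 pools used as virtio devices, num_vmdq_queues = 128.
 */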

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* parse number string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Takes effect only if Rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on Rx. Takes effect only if Rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"		--tso [0|1]: disable/enable TCP segmentation offload.\n"
	"		--client: register a vhost-user socket as client mode.\n"
	"		--dmas: register dma channel for specific vhost device.\n"
	"		--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n"
	"		--builtin-net-driver: enable simple vhost-user net driver\n",
	       prgname);
}

enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
#define OPT_NUM_MBUFS           "total-num-mbufs"
	OPT_NUM_MBUFS_NUM,
};

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{OPT_NUM_MBUFS, required_argument,
				NULL, OPT_NUM_MBUFS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_NUM_MBUFS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for total-num-mbufs [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}

			if (total_num_mbufs < ret)
				total_num_mbufs = ret;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
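
/*
 * For illustration: with the vlan_tags[] table above, the first vhost device
 * (vid 0) is learned into VLAN 1000 and VMDq pool vmdq_pool_base + 0, the
 * second (vid 1) into VLAN 1001, and so on.
 */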

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}
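
/*
 * Illustrative numbers for the drain timeout used above: with a TSC running
 * at, say, 2.0 GHz, MBUF_TABLE_DRAIN_TSC works out to 2000 cycles per
 * microsecond times BURST_TX_DRAIN_US (100), i.e. roughly 200000 cycles
 * between forced drains of a partially filled TX buffer.
 */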

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the size of the vlan
	 * tag, so we need to restore the packet length by adding it back.
	 */
	*offset = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

uint16_t
async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	uint16_t enqueue_count;
	uint16_t enqueue_fail = 0;
	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_async_pkts(dev);
	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
				pkts, rx_count, dma_id, 0);

	enqueue_fail = rx_count - enqueue_count;
	if (enqueue_fail)
		free_pkts(&pkts[enqueue_count], enqueue_fail);

	return enqueue_count;
}

uint16_t
sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
						VIRTIO_RXQ, pkts, rx_count);

	/* Retry if necessary */
	if (enable_retry && unlikely(enqueue_count < rx_count)) {
		uint32_t retry = 0;

		while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) {
			rte_delay_us(burst_rx_delay_time);
			enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
							VIRTIO_RXQ, &pkts[enqueue_count],
							rx_count - enqueue_count);
		}
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}

uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			    struct rte_mempool *mbuf_pool,
			    struct rte_mbuf **pkts, uint16_t count)
{
	int nr_inflight;
	uint16_t dequeue_count;
	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;

	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
			mbuf_pool, pkts, count, &nr_inflight,
			dma_id, 0);

	return dequeue_count;
}

uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			   struct rte_mempool *mbuf_pool,
			   struct rte_mbuf **pkts, uint16_t count)
{
	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

static void
vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
							pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
									queue_id);
	}
}

static void
vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
					pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
	}
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		vhost_clear_queue(vdev, VIRTIO_RXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
		vhost_clear_queue(vdev, VIRTIO_TXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
	}

	rte_free(vdev);
}

static inline int
get_socketid_by_vid(int vid)
{
	int i;
	char ifname[PATH_MAX];
	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));

	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		if (strcmp(file, ifname) == 0)
			return i;
	}

	return -1;
}

static int
init_vhost_queue_ops(int vid)
{
	if (builtin_net_driver) {
		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
	} else {
		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
		else
			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;

		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
		else
			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
	}

	return 0;
}

static inline int
vhost_async_channel_register(int vid)
{
	int rx_ret = 0, tx_ret = 0;

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (rx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
	}

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
		if (tx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
	}

	return rx_ret | tx_ret;
}
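
/*
 * Note on the data structures used below: a new vhost device is mapped back
 * to its socket index via vid2socketid[], and that socket index selects the
 * dma_bind[] entry whose per-virtqueue DMA ids and async flags were filled
 * in by open_dma() at argument-parsing time.
 */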

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	int ret;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	int socketid = get_socketid_by_vid(vid);
	if (socketid == -1)
		return -1;

	init_vid2socketid_array(vid, socketid);

	ret = vhost_async_channel_register(vid);

	if (init_vhost_queue_ops(vid) != 0)
		return -1;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return ret;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
		if (!enable)
			vhost_clear_queue_thread_unsafe(vdev, queue_id);
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};
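
/*
 * The RX counters printed by the stats thread below are updated from the
 * data cores, which is why they are accessed with __atomic builtins; the TX
 * counters are only written by the core owning the device and are read
 * without atomics by the stats thread.
 */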

/*
 * This is a thread that will wake up after a period to print stats if the
 * user has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:       %" PRIu64 "\n"
				"TX dropped:     %" PRIu64 "\n"
				"TX successful:  %" PRIu64 "\n"
				"RX total:       %" PRIu64 "\n"
				"RX dropped:     %" PRIu64 "\n"
				"RX successful:  %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (dma_count && get_async_flag_by_socketid(i) != 0)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}
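
/*
 * For reference, an illustrative invocation (EAL options, socket path and
 * DMA device names below are placeholders, not requirements):
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --mergeable 1 --stats 1 \
 *       --socket-file /tmp/vhost-net0 --client \
 *       --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
 *
 * txd0/rxd0 bind the enqueue/dequeue paths of the vhost device created for
 * the first --socket-file to the named DMA channels, matching the format
 * parsed by open_dma() above.
 */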