1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2017 Intel Corporation 3 */ 4 5 #include <arpa/inet.h> 6 #include <getopt.h> 7 #include <linux/if_ether.h> 8 #include <linux/if_vlan.h> 9 #include <linux/virtio_net.h> 10 #include <linux/virtio_ring.h> 11 #include <signal.h> 12 #include <stdint.h> 13 #include <sys/eventfd.h> 14 #include <sys/param.h> 15 #include <unistd.h> 16 17 #include <rte_cycles.h> 18 #include <rte_ethdev.h> 19 #include <rte_log.h> 20 #include <rte_string_fns.h> 21 #include <rte_malloc.h> 22 #include <rte_net.h> 23 #include <rte_vhost.h> 24 #include <rte_ip.h> 25 #include <rte_tcp.h> 26 #include <rte_pause.h> 27 #include <rte_dmadev.h> 28 #include <rte_vhost_async.h> 29 30 #include "main.h" 31 32 #ifndef MAX_QUEUES 33 #define MAX_QUEUES 128 34 #endif 35 36 #define NUM_MBUFS_DEFAULT 0x24000 37 38 /* the maximum number of external ports supported */ 39 #define MAX_SUP_PORTS 1 40 41 #define MBUF_CACHE_SIZE 128 42 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 43 44 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 45 46 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 47 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 48 49 #define JUMBO_FRAME_MAX_SIZE 0x2600 50 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)) 51 52 /* State of virtio device. */ 53 #define DEVICE_MAC_LEARNING 0 54 #define DEVICE_RX 1 55 #define DEVICE_SAFE_REMOVE 2 56 57 /* Configurable number of RX/TX ring descriptors */ 58 #define RTE_TEST_RX_DESC_DEFAULT 1024 59 #define RTE_TEST_TX_DESC_DEFAULT 512 60 61 #define INVALID_PORT_ID 0xFF 62 #define INVALID_DMA_ID -1 63 64 #define DMA_RING_SIZE 4096 65 66 #define ASYNC_ENQUEUE_VHOST 1 67 #define ASYNC_DEQUEUE_VHOST 2 68 69 /* number of mbufs in all pools - if specified on command-line. */ 70 static int total_num_mbufs = NUM_MBUFS_DEFAULT; 71 72 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE]; 73 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX]; 74 static int dma_count; 75 76 /* mask of enabled ports */ 77 static uint32_t enabled_port_mask = 0; 78 79 /* Promiscuous mode */ 80 static uint32_t promiscuous; 81 82 /* number of devices/queues to support*/ 83 static uint32_t num_queues = 0; 84 static uint32_t num_devices; 85 86 static struct rte_mempool *mbuf_pool; 87 static int mergeable; 88 89 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 90 typedef enum { 91 VM2VM_DISABLED = 0, 92 VM2VM_SOFTWARE = 1, 93 VM2VM_HARDWARE = 2, 94 VM2VM_LAST 95 } vm2vm_type; 96 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 97 98 /* Enable stats. */ 99 static uint32_t enable_stats = 0; 100 /* Enable retries on RX. */ 101 static uint32_t enable_retry = 1; 102 103 /* Disable TX checksum offload */ 104 static uint32_t enable_tx_csum; 105 106 /* Disable TSO offload */ 107 static uint32_t enable_tso; 108 109 static int client_mode; 110 111 static int builtin_net_driver; 112 113 /* Specify timeout (in useconds) between retries on RX. */ 114 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 115 /* Specify the number of retries on RX. */ 116 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 117 118 /* Socket file paths. Can be set by user */ 119 static char *socket_files; 120 static int nb_sockets; 121 122 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE]; 123 124 /* empty VMDq configuration structure. 
Filled in programmatically */ 125 static struct rte_eth_conf vmdq_conf_default = { 126 .rxmode = { 127 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY, 128 .split_hdr_size = 0, 129 /* 130 * VLAN strip is necessary for 1G NIC such as I350, 131 * this fixes bug of ipv4 forwarding in guest can't 132 * forward packets from one virtio dev to another virtio dev. 133 */ 134 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP, 135 }, 136 137 .txmode = { 138 .mq_mode = RTE_ETH_MQ_TX_NONE, 139 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | 140 RTE_ETH_TX_OFFLOAD_TCP_CKSUM | 141 RTE_ETH_TX_OFFLOAD_VLAN_INSERT | 142 RTE_ETH_TX_OFFLOAD_MULTI_SEGS | 143 RTE_ETH_TX_OFFLOAD_TCP_TSO), 144 }, 145 .rx_adv_conf = { 146 /* 147 * should be overridden separately in code with 148 * appropriate values 149 */ 150 .vmdq_rx_conf = { 151 .nb_queue_pools = RTE_ETH_8_POOLS, 152 .enable_default_pool = 0, 153 .default_pool = 0, 154 .nb_pool_maps = 0, 155 .pool_map = {{0, 0},}, 156 }, 157 }, 158 }; 159 160 161 static unsigned lcore_ids[RTE_MAX_LCORE]; 162 static uint16_t ports[RTE_MAX_ETHPORTS]; 163 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 164 static uint16_t num_pf_queues, num_vmdq_queues; 165 static uint16_t vmdq_pool_base, vmdq_queue_base; 166 static uint16_t queues_per_pool; 167 168 const uint16_t vlan_tags[] = { 169 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 170 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 171 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 172 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 173 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 174 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 175 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 176 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 177 }; 178 179 /* ethernet addresses of ports */ 180 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 181 182 static struct vhost_dev_tailq_list vhost_dev_list = 183 TAILQ_HEAD_INITIALIZER(vhost_dev_list); 184 185 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 186 187 /* Used for queueing bursts of TX packets. */ 188 struct mbuf_table { 189 unsigned len; 190 unsigned txq_id; 191 struct rte_mbuf *m_table[MAX_PKT_BURST]; 192 }; 193 194 struct vhost_bufftable { 195 uint32_t len; 196 uint64_t pre_tsc; 197 struct rte_mbuf *m_table[MAX_PKT_BURST]; 198 }; 199 200 /* TX queue for each data core. */ 201 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 202 203 /* 204 * Vhost TX buffer for each data core. 205 * Every data core maintains a TX buffer for every vhost device, 206 * which is used for batch pkts enqueue for higher performance. 
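 *
 * Buffers for all (lcore, device) pairs live in one flat array, indexed as
 * lcore_id * RTE_MAX_VHOST_DEVICE + vid. Illustrative lookup only (vdev and
 * m are hypothetical locals); the buffer is flushed once it reaches
 * MAX_PKT_BURST packets or after the drain timeout:
 *
 *   struct vhost_bufftable *txq =
 *           vhost_txbuff[rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid];
 *   txq->m_table[txq->len++] = m;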
207 */ 208 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE]; 209 210 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \ 211 / US_PER_S * BURST_TX_DRAIN_US) 212 213 static int vid2socketid[RTE_MAX_VHOST_DEVICE]; 214 215 static inline uint32_t 216 get_async_flag_by_socketid(int socketid) 217 { 218 return dma_bind[socketid].async_flag; 219 } 220 221 static inline void 222 init_vid2socketid_array(int vid, int socketid) 223 { 224 vid2socketid[vid] = socketid; 225 } 226 227 static inline bool 228 is_dma_configured(int16_t dev_id) 229 { 230 int i; 231 232 for (i = 0; i < dma_count; i++) 233 if (dmas_id[i] == dev_id) 234 return true; 235 return false; 236 } 237 238 static inline int 239 open_dma(const char *value) 240 { 241 struct dma_for_vhost *dma_info = dma_bind; 242 char *input = strndup(value, strlen(value) + 1); 243 char *addrs = input; 244 char *ptrs[2]; 245 char *start, *end, *substr; 246 int64_t socketid, vring_id; 247 248 struct rte_dma_info info; 249 struct rte_dma_conf dev_config = { .nb_vchans = 1 }; 250 struct rte_dma_vchan_conf qconf = { 251 .direction = RTE_DMA_DIR_MEM_TO_MEM, 252 .nb_desc = DMA_RING_SIZE 253 }; 254 255 int dev_id; 256 int ret = 0; 257 uint16_t i = 0; 258 char *dma_arg[RTE_MAX_VHOST_DEVICE]; 259 int args_nr; 260 261 while (isblank(*addrs)) 262 addrs++; 263 if (*addrs == '\0') { 264 ret = -1; 265 goto out; 266 } 267 268 /* process DMA devices within bracket. */ 269 addrs++; 270 substr = strtok(addrs, ";]"); 271 if (!substr) { 272 ret = -1; 273 goto out; 274 } 275 276 args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ','); 277 if (args_nr <= 0) { 278 ret = -1; 279 goto out; 280 } 281 282 while (i < args_nr) { 283 char *arg_temp = dma_arg[i]; 284 char *txd, *rxd; 285 uint8_t sub_nr; 286 int async_flag; 287 288 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@'); 289 if (sub_nr != 2) { 290 ret = -1; 291 goto out; 292 } 293 294 txd = strstr(ptrs[0], "txd"); 295 rxd = strstr(ptrs[0], "rxd"); 296 if (txd) { 297 start = txd; 298 vring_id = VIRTIO_RXQ; 299 async_flag = ASYNC_ENQUEUE_VHOST; 300 } else if (rxd) { 301 start = rxd; 302 vring_id = VIRTIO_TXQ; 303 async_flag = ASYNC_DEQUEUE_VHOST; 304 } else { 305 ret = -1; 306 goto out; 307 } 308 309 start += 3; 310 socketid = strtol(start, &end, 0); 311 if (end == start) { 312 ret = -1; 313 goto out; 314 } 315 316 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]); 317 if (dev_id < 0) { 318 RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]); 319 ret = -1; 320 goto out; 321 } 322 323 /* DMA device is already configured, so skip */ 324 if (is_dma_configured(dev_id)) 325 goto done; 326 327 if (rte_dma_info_get(dev_id, &info) != 0) { 328 RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n"); 329 ret = -1; 330 goto out; 331 } 332 333 if (info.max_vchans < 1) { 334 RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id); 335 ret = -1; 336 goto out; 337 } 338 339 if (rte_dma_configure(dev_id, &dev_config) != 0) { 340 RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id); 341 ret = -1; 342 goto out; 343 } 344 345 /* Check the max desc supported by DMA device */ 346 rte_dma_info_get(dev_id, &info); 347 if (info.nb_vchans != 1) { 348 RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n", 349 dev_id); 350 ret = -1; 351 goto out; 352 } 353 354 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc); 355 356 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) { 357 RTE_LOG(ERR, VHOST_CONFIG, "Fail to set 
up DMA %d.\n", dev_id); 358 ret = -1; 359 goto out; 360 } 361 362 if (rte_dma_start(dev_id) != 0) { 363 RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id); 364 ret = -1; 365 goto out; 366 } 367 368 dmas_id[dma_count++] = dev_id; 369 370 done: 371 (dma_info + socketid)->dmas[vring_id].dev_id = dev_id; 372 (dma_info + socketid)->async_flag |= async_flag; 373 i++; 374 } 375 out: 376 free(input); 377 return ret; 378 } 379 380 /* 381 * Builds up the correct configuration for VMDQ VLAN pool map 382 * according to the pool & queue limits. 383 */ 384 static inline int 385 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 386 { 387 struct rte_eth_vmdq_rx_conf conf; 388 struct rte_eth_vmdq_rx_conf *def_conf = 389 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 390 unsigned i; 391 392 memset(&conf, 0, sizeof(conf)); 393 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 394 conf.nb_pool_maps = num_devices; 395 conf.enable_loop_back = def_conf->enable_loop_back; 396 conf.rx_mode = def_conf->rx_mode; 397 398 for (i = 0; i < conf.nb_pool_maps; i++) { 399 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 400 conf.pool_map[i].pools = (1UL << i); 401 } 402 403 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 404 (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf, 405 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 406 return 0; 407 } 408 409 /* 410 * Initialises a given port using global settings and with the rx buffers 411 * coming from the mbuf_pool passed as parameter 412 */ 413 static inline int 414 port_init(uint16_t port) 415 { 416 struct rte_eth_dev_info dev_info; 417 struct rte_eth_conf port_conf; 418 struct rte_eth_rxconf *rxconf; 419 struct rte_eth_txconf *txconf; 420 int16_t rx_rings, tx_rings; 421 uint16_t rx_ring_size, tx_ring_size; 422 int retval; 423 uint16_t q; 424 425 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 426 retval = rte_eth_dev_info_get(port, &dev_info); 427 if (retval != 0) { 428 RTE_LOG(ERR, VHOST_PORT, 429 "Error during getting device (port %u) info: %s\n", 430 port, strerror(-retval)); 431 432 return retval; 433 } 434 if (dev_info.max_vmdq_pools == 0) { 435 RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n"); 436 return -1; 437 } 438 439 rxconf = &dev_info.default_rxconf; 440 txconf = &dev_info.default_txconf; 441 rxconf->rx_drop_en = 1; 442 443 /*configure the number of supported virtio devices based on VMDQ limits */ 444 num_devices = dev_info.max_vmdq_pools; 445 446 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 447 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 448 449 tx_rings = (uint16_t)rte_lcore_count(); 450 451 if (mergeable) { 452 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu) 453 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu; 454 else 455 vmdq_conf_default.rxmode.mtu = MAX_MTU; 456 } 457 458 /* Get port configuration. */ 459 retval = get_eth_conf(&port_conf, num_devices); 460 if (retval < 0) 461 return retval; 462 /* NIC queues are divided into pf queues and vmdq queues. 
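 *
 * For illustration only, with hypothetical device capabilities
 * (max_rx_queues = 192, vmdq_queue_num = 128, max_vmdq_pools = 64):
 *   num_pf_queues   = 192 - 128 = 64
 *   queues_per_pool = 128 / 64  = 2
 *   num_vmdq_queues = num_devices * 2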
*/ 463 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 464 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 465 num_vmdq_queues = num_devices * queues_per_pool; 466 num_queues = num_pf_queues + num_vmdq_queues; 467 vmdq_queue_base = dev_info.vmdq_queue_base; 468 vmdq_pool_base = dev_info.vmdq_pool_base; 469 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 470 num_pf_queues, num_devices, queues_per_pool); 471 472 if (!rte_eth_dev_is_valid_port(port)) 473 return -1; 474 475 rx_rings = (uint16_t)dev_info.max_rx_queues; 476 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 477 port_conf.txmode.offloads |= 478 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 479 /* Configure ethernet device. */ 480 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 481 if (retval != 0) { 482 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n", 483 port, strerror(-retval)); 484 return retval; 485 } 486 487 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size, 488 &tx_ring_size); 489 if (retval != 0) { 490 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors " 491 "for port %u: %s.\n", port, strerror(-retval)); 492 return retval; 493 } 494 if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) { 495 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size " 496 "for Rx queues on port %u.\n", port); 497 return -1; 498 } 499 500 /* Setup the queues. */ 501 rxconf->offloads = port_conf.rxmode.offloads; 502 for (q = 0; q < rx_rings; q ++) { 503 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 504 rte_eth_dev_socket_id(port), 505 rxconf, 506 mbuf_pool); 507 if (retval < 0) { 508 RTE_LOG(ERR, VHOST_PORT, 509 "Failed to setup rx queue %u of port %u: %s.\n", 510 q, port, strerror(-retval)); 511 return retval; 512 } 513 } 514 txconf->offloads = port_conf.txmode.offloads; 515 for (q = 0; q < tx_rings; q ++) { 516 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 517 rte_eth_dev_socket_id(port), 518 txconf); 519 if (retval < 0) { 520 RTE_LOG(ERR, VHOST_PORT, 521 "Failed to setup tx queue %u of port %u: %s.\n", 522 q, port, strerror(-retval)); 523 return retval; 524 } 525 } 526 527 /* Start the device. */ 528 retval = rte_eth_dev_start(port); 529 if (retval < 0) { 530 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n", 531 port, strerror(-retval)); 532 return retval; 533 } 534 535 if (promiscuous) { 536 retval = rte_eth_promiscuous_enable(port); 537 if (retval != 0) { 538 RTE_LOG(ERR, VHOST_PORT, 539 "Failed to enable promiscuous mode on port %u: %s\n", 540 port, rte_strerror(-retval)); 541 return retval; 542 } 543 } 544 545 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 546 if (retval < 0) { 547 RTE_LOG(ERR, VHOST_PORT, 548 "Failed to get MAC address on port %u: %s\n", 549 port, rte_strerror(-retval)); 550 return retval; 551 } 552 553 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 554 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 555 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 556 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port])); 557 558 return 0; 559 } 560 561 /* 562 * Set socket file path. 
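 *
 * Paths are stored back to back in a single buffer, PATH_MAX bytes per
 * entry, and later addressed as socket_files + i * PATH_MAX.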
563 */ 564 static int 565 us_vhost_parse_socket_path(const char *q_arg) 566 { 567 char *old; 568 569 /* parse number string */ 570 if (strnlen(q_arg, PATH_MAX) == PATH_MAX) 571 return -1; 572 573 old = socket_files; 574 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1)); 575 if (socket_files == NULL) { 576 free(old); 577 return -1; 578 } 579 580 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX); 581 nb_sockets++; 582 583 return 0; 584 } 585 586 /* 587 * Parse the portmask provided at run time. 588 */ 589 static int 590 parse_portmask(const char *portmask) 591 { 592 char *end = NULL; 593 unsigned long pm; 594 595 errno = 0; 596 597 /* parse hexadecimal string */ 598 pm = strtoul(portmask, &end, 16); 599 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 600 return 0; 601 602 return pm; 603 604 } 605 606 /* 607 * Parse num options at run time. 608 */ 609 static int 610 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 611 { 612 char *end = NULL; 613 unsigned long num; 614 615 errno = 0; 616 617 /* parse unsigned int string */ 618 num = strtoul(q_arg, &end, 10); 619 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 620 return -1; 621 622 if (num > max_valid_value) 623 return -1; 624 625 return num; 626 627 } 628 629 /* 630 * Display usage 631 */ 632 static void 633 us_vhost_usage(const char *prgname) 634 { 635 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 636 " --vm2vm [0|1|2]\n" 637 " --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n" 638 " --socket-file <path>\n" 639 " -p PORTMASK: Set mask for ports to be used by application\n" 640 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 641 " --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n" 642 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 643 " --rx-retry-num [0-N]: the number of retries on rx. 
This makes effect only if retries on rx enabled\n" 644 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 645 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 646 " --socket-file: The path of the socket file.\n" 647 " --tx-csum [0|1]: disable/enable TX checksum offload.\n" 648 " --tso [0|1]: disable/enable TCP segment offload.\n" 649 " --client: register a vhost-user socket as client mode.\n" 650 " --dmas: register dma channel for specific vhost device.\n" 651 " --total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n" 652 " --builtin-net-driver: enable simple vhost-user net driver\n", 653 prgname); 654 } 655 656 enum { 657 #define OPT_VM2VM "vm2vm" 658 OPT_VM2VM_NUM = 256, 659 #define OPT_RX_RETRY "rx-retry" 660 OPT_RX_RETRY_NUM, 661 #define OPT_RX_RETRY_DELAY "rx-retry-delay" 662 OPT_RX_RETRY_DELAY_NUM, 663 #define OPT_RX_RETRY_NUMB "rx-retry-num" 664 OPT_RX_RETRY_NUMB_NUM, 665 #define OPT_MERGEABLE "mergeable" 666 OPT_MERGEABLE_NUM, 667 #define OPT_STATS "stats" 668 OPT_STATS_NUM, 669 #define OPT_SOCKET_FILE "socket-file" 670 OPT_SOCKET_FILE_NUM, 671 #define OPT_TX_CSUM "tx-csum" 672 OPT_TX_CSUM_NUM, 673 #define OPT_TSO "tso" 674 OPT_TSO_NUM, 675 #define OPT_CLIENT "client" 676 OPT_CLIENT_NUM, 677 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver" 678 OPT_BUILTIN_NET_DRIVER_NUM, 679 #define OPT_DMAS "dmas" 680 OPT_DMAS_NUM, 681 #define OPT_NUM_MBUFS "total-num-mbufs" 682 OPT_NUM_MBUFS_NUM, 683 }; 684 685 /* 686 * Parse the arguments given in the command line of the application. 687 */ 688 static int 689 us_vhost_parse_args(int argc, char **argv) 690 { 691 int opt, ret; 692 int option_index; 693 unsigned i; 694 const char *prgname = argv[0]; 695 static struct option long_option[] = { 696 {OPT_VM2VM, required_argument, 697 NULL, OPT_VM2VM_NUM}, 698 {OPT_RX_RETRY, required_argument, 699 NULL, OPT_RX_RETRY_NUM}, 700 {OPT_RX_RETRY_DELAY, required_argument, 701 NULL, OPT_RX_RETRY_DELAY_NUM}, 702 {OPT_RX_RETRY_NUMB, required_argument, 703 NULL, OPT_RX_RETRY_NUMB_NUM}, 704 {OPT_MERGEABLE, required_argument, 705 NULL, OPT_MERGEABLE_NUM}, 706 {OPT_STATS, required_argument, 707 NULL, OPT_STATS_NUM}, 708 {OPT_SOCKET_FILE, required_argument, 709 NULL, OPT_SOCKET_FILE_NUM}, 710 {OPT_TX_CSUM, required_argument, 711 NULL, OPT_TX_CSUM_NUM}, 712 {OPT_TSO, required_argument, 713 NULL, OPT_TSO_NUM}, 714 {OPT_CLIENT, no_argument, 715 NULL, OPT_CLIENT_NUM}, 716 {OPT_BUILTIN_NET_DRIVER, no_argument, 717 NULL, OPT_BUILTIN_NET_DRIVER_NUM}, 718 {OPT_DMAS, required_argument, 719 NULL, OPT_DMAS_NUM}, 720 {OPT_NUM_MBUFS, required_argument, 721 NULL, OPT_NUM_MBUFS_NUM}, 722 {NULL, 0, 0, 0}, 723 }; 724 725 /* Parse command line */ 726 while ((opt = getopt_long(argc, argv, "p:P", 727 long_option, &option_index)) != EOF) { 728 switch (opt) { 729 /* Portmask */ 730 case 'p': 731 enabled_port_mask = parse_portmask(optarg); 732 if (enabled_port_mask == 0) { 733 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 734 us_vhost_usage(prgname); 735 return -1; 736 } 737 break; 738 739 case 'P': 740 promiscuous = 1; 741 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 742 RTE_ETH_VMDQ_ACCEPT_BROADCAST | 743 RTE_ETH_VMDQ_ACCEPT_MULTICAST; 744 break; 745 746 case OPT_VM2VM_NUM: 747 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 748 if (ret == -1) { 749 RTE_LOG(INFO, VHOST_CONFIG, 750 "Invalid argument for " 751 "vm2vm [0|1|2]\n"); 752 us_vhost_usage(prgname); 753 return -1; 754 } 755 vm2vm_mode = (vm2vm_type)ret; 756 break; 757 758 
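/*
 * Illustrative invocation exercising the options parsed here (sketch only;
 * the EAL arguments, socket path and DMA device address are examples):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/sock0 --client \
 *       --vm2vm 1 --mergeable 1 --stats 1 \
 *       --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.0]
 */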
case OPT_RX_RETRY_NUM: 759 ret = parse_num_opt(optarg, 1); 760 if (ret == -1) { 761 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 762 us_vhost_usage(prgname); 763 return -1; 764 } 765 enable_retry = ret; 766 break; 767 768 case OPT_TX_CSUM_NUM: 769 ret = parse_num_opt(optarg, 1); 770 if (ret == -1) { 771 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 772 us_vhost_usage(prgname); 773 return -1; 774 } 775 enable_tx_csum = ret; 776 break; 777 778 case OPT_TSO_NUM: 779 ret = parse_num_opt(optarg, 1); 780 if (ret == -1) { 781 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 782 us_vhost_usage(prgname); 783 return -1; 784 } 785 enable_tso = ret; 786 break; 787 788 case OPT_RX_RETRY_DELAY_NUM: 789 ret = parse_num_opt(optarg, INT32_MAX); 790 if (ret == -1) { 791 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 792 us_vhost_usage(prgname); 793 return -1; 794 } 795 burst_rx_delay_time = ret; 796 break; 797 798 case OPT_RX_RETRY_NUMB_NUM: 799 ret = parse_num_opt(optarg, INT32_MAX); 800 if (ret == -1) { 801 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 802 us_vhost_usage(prgname); 803 return -1; 804 } 805 burst_rx_retry_num = ret; 806 break; 807 808 case OPT_MERGEABLE_NUM: 809 ret = parse_num_opt(optarg, 1); 810 if (ret == -1) { 811 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 812 us_vhost_usage(prgname); 813 return -1; 814 } 815 mergeable = !!ret; 816 break; 817 818 case OPT_STATS_NUM: 819 ret = parse_num_opt(optarg, INT32_MAX); 820 if (ret == -1) { 821 RTE_LOG(INFO, VHOST_CONFIG, 822 "Invalid argument for stats [0..N]\n"); 823 us_vhost_usage(prgname); 824 return -1; 825 } 826 enable_stats = ret; 827 break; 828 829 /* Set socket file path. */ 830 case OPT_SOCKET_FILE_NUM: 831 if (us_vhost_parse_socket_path(optarg) == -1) { 832 RTE_LOG(INFO, VHOST_CONFIG, 833 "Invalid argument for socket name (Max %d characters)\n", 834 PATH_MAX); 835 us_vhost_usage(prgname); 836 return -1; 837 } 838 break; 839 840 case OPT_DMAS_NUM: 841 if (open_dma(optarg) == -1) { 842 RTE_LOG(INFO, VHOST_CONFIG, 843 "Wrong DMA args\n"); 844 us_vhost_usage(prgname); 845 return -1; 846 } 847 break; 848 849 case OPT_NUM_MBUFS_NUM: 850 ret = parse_num_opt(optarg, INT32_MAX); 851 if (ret == -1) { 852 RTE_LOG(INFO, VHOST_CONFIG, 853 "Invalid argument for total-num-mbufs [0..N]\n"); 854 us_vhost_usage(prgname); 855 return -1; 856 } 857 858 if (total_num_mbufs < ret) 859 total_num_mbufs = ret; 860 break; 861 862 case OPT_CLIENT_NUM: 863 client_mode = 1; 864 break; 865 866 case OPT_BUILTIN_NET_DRIVER_NUM: 867 builtin_net_driver = 1; 868 break; 869 870 /* Invalid option - print options. 
*/ 871 default: 872 us_vhost_usage(prgname); 873 return -1; 874 } 875 } 876 877 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 878 if (enabled_port_mask & (1 << i)) 879 ports[num_ports++] = i; 880 } 881 882 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 883 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 884 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 885 return -1; 886 } 887 888 return 0; 889 } 890 891 /* 892 * Update the global var NUM_PORTS and array PORTS according to system ports number 893 * and return valid ports number 894 */ 895 static unsigned check_ports_num(unsigned nb_ports) 896 { 897 unsigned valid_num_ports = num_ports; 898 unsigned portid; 899 900 if (num_ports > nb_ports) { 901 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 902 num_ports, nb_ports); 903 num_ports = nb_ports; 904 } 905 906 for (portid = 0; portid < num_ports; portid ++) { 907 if (!rte_eth_dev_is_valid_port(ports[portid])) { 908 RTE_LOG(INFO, VHOST_PORT, 909 "\nSpecified port ID(%u) is not valid\n", 910 ports[portid]); 911 ports[portid] = INVALID_PORT_ID; 912 valid_num_ports--; 913 } 914 } 915 return valid_num_ports; 916 } 917 918 static __rte_always_inline struct vhost_dev * 919 find_vhost_dev(struct rte_ether_addr *mac) 920 { 921 struct vhost_dev *vdev; 922 923 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 924 if (vdev->ready == DEVICE_RX && 925 rte_is_same_ether_addr(mac, &vdev->mac_address)) 926 return vdev; 927 } 928 929 return NULL; 930 } 931 932 /* 933 * This function learns the MAC address of the device and registers this along with a 934 * vlan tag to a VMDQ. 935 */ 936 static int 937 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 938 { 939 struct rte_ether_hdr *pkt_hdr; 940 int i, ret; 941 942 /* Learn MAC address of guest device from packet */ 943 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 944 945 if (find_vhost_dev(&pkt_hdr->src_addr)) { 946 RTE_LOG(ERR, VHOST_DATA, 947 "(%d) device is using a registered MAC!\n", 948 vdev->vid); 949 return -1; 950 } 951 952 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++) 953 vdev->mac_address.addr_bytes[i] = 954 pkt_hdr->src_addr.addr_bytes[i]; 955 956 /* vlan_tag currently uses the device_id. */ 957 vdev->vlan_tag = vlan_tags[vdev->vid]; 958 959 /* Print out VMDQ registration info. */ 960 RTE_LOG(INFO, VHOST_DATA, 961 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n", 962 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address), 963 vdev->vlan_tag); 964 965 /* Register the MAC address. */ 966 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 967 (uint32_t)vdev->vid + vmdq_pool_base); 968 if (ret) 969 RTE_LOG(ERR, VHOST_DATA, 970 "(%d) failed to add device MAC address to VMDQ\n", 971 vdev->vid); 972 973 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1); 974 975 /* Set device as ready for RX. */ 976 vdev->ready = DEVICE_RX; 977 978 return 0; 979 } 980 981 /* 982 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 983 * queue before disabling RX on the device. 
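 *
 * The rte_eth_rx_burst() loop below simply frees any packets the NIC has
 * already steered to this VMDq queue, so nothing is left pending when the
 * device drops back to the MAC-learning state.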
984 */ 985 static inline void 986 unlink_vmdq(struct vhost_dev *vdev) 987 { 988 unsigned i = 0; 989 unsigned rx_count; 990 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 991 992 if (vdev->ready == DEVICE_RX) { 993 /*clear MAC and VLAN settings*/ 994 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 995 for (i = 0; i < 6; i++) 996 vdev->mac_address.addr_bytes[i] = 0; 997 998 vdev->vlan_tag = 0; 999 1000 /*Clear out the receive buffers*/ 1001 rx_count = rte_eth_rx_burst(ports[0], 1002 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1003 1004 while (rx_count) { 1005 for (i = 0; i < rx_count; i++) 1006 rte_pktmbuf_free(pkts_burst[i]); 1007 1008 rx_count = rte_eth_rx_burst(ports[0], 1009 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1010 } 1011 1012 vdev->ready = DEVICE_MAC_LEARNING; 1013 } 1014 } 1015 1016 static inline void 1017 free_pkts(struct rte_mbuf **pkts, uint16_t n) 1018 { 1019 while (n--) 1020 rte_pktmbuf_free(pkts[n]); 1021 } 1022 1023 static __rte_always_inline void 1024 complete_async_pkts(struct vhost_dev *vdev) 1025 { 1026 struct rte_mbuf *p_cpl[MAX_PKT_BURST]; 1027 uint16_t complete_count; 1028 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id; 1029 1030 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid, 1031 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0); 1032 if (complete_count) 1033 free_pkts(p_cpl, complete_count); 1034 1035 } 1036 1037 static __rte_always_inline void 1038 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev, 1039 struct rte_mbuf *m) 1040 { 1041 uint16_t ret; 1042 1043 if (builtin_net_driver) { 1044 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1); 1045 } else { 1046 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1); 1047 } 1048 1049 if (enable_stats) { 1050 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1, 1051 __ATOMIC_SEQ_CST); 1052 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret, 1053 __ATOMIC_SEQ_CST); 1054 src_vdev->stats.tx_total++; 1055 src_vdev->stats.tx += ret; 1056 } 1057 } 1058 1059 static __rte_always_inline void 1060 drain_vhost(struct vhost_dev *vdev) 1061 { 1062 uint16_t ret; 1063 uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid; 1064 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len; 1065 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table; 1066 1067 ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit); 1068 1069 if (enable_stats) { 1070 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit, 1071 __ATOMIC_SEQ_CST); 1072 __atomic_add_fetch(&vdev->stats.rx_atomic, ret, 1073 __ATOMIC_SEQ_CST); 1074 } 1075 1076 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) 1077 free_pkts(m, nr_xmit); 1078 } 1079 1080 static __rte_always_inline void 1081 drain_vhost_table(void) 1082 { 1083 uint16_t lcore_id = rte_lcore_id(); 1084 struct vhost_bufftable *vhost_txq; 1085 struct vhost_dev *vdev; 1086 uint64_t cur_tsc; 1087 1088 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1089 if (unlikely(vdev->remove == 1)) 1090 continue; 1091 1092 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid]; 1093 1094 cur_tsc = rte_rdtsc(); 1095 if (unlikely(cur_tsc - vhost_txq->pre_tsc 1096 > MBUF_TABLE_DRAIN_TSC)) { 1097 RTE_LOG_DP(DEBUG, VHOST_DATA, 1098 "Vhost TX queue drained after timeout with burst size %u\n", 1099 vhost_txq->len); 1100 drain_vhost(vdev); 1101 vhost_txq->len = 0; 1102 vhost_txq->pre_tsc = cur_tsc; 1103 } 1104 } 1105 } 1106 1107 /* 1108 * Check if the packet 
destination MAC address is for a local device. If so then put 1109 * the packet on that devices RX queue. If not then return. 1110 */ 1111 static __rte_always_inline int 1112 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1113 { 1114 struct rte_ether_hdr *pkt_hdr; 1115 struct vhost_dev *dst_vdev; 1116 struct vhost_bufftable *vhost_txq; 1117 uint16_t lcore_id = rte_lcore_id(); 1118 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1119 1120 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr); 1121 if (!dst_vdev) 1122 return -1; 1123 1124 if (vdev->vid == dst_vdev->vid) { 1125 RTE_LOG_DP(DEBUG, VHOST_DATA, 1126 "(%d) TX: src and dst MAC is same. Dropping packet.\n", 1127 vdev->vid); 1128 return 0; 1129 } 1130 1131 RTE_LOG_DP(DEBUG, VHOST_DATA, 1132 "(%d) TX: MAC address is local\n", dst_vdev->vid); 1133 1134 if (unlikely(dst_vdev->remove)) { 1135 RTE_LOG_DP(DEBUG, VHOST_DATA, 1136 "(%d) device is marked for removal\n", dst_vdev->vid); 1137 return 0; 1138 } 1139 1140 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid]; 1141 vhost_txq->m_table[vhost_txq->len++] = m; 1142 1143 if (enable_stats) { 1144 vdev->stats.tx_total++; 1145 vdev->stats.tx++; 1146 } 1147 1148 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) { 1149 drain_vhost(dst_vdev); 1150 vhost_txq->len = 0; 1151 vhost_txq->pre_tsc = rte_rdtsc(); 1152 } 1153 return 0; 1154 } 1155 1156 /* 1157 * Check if the destination MAC of a packet is one local VM, 1158 * and get its vlan tag, and offset if it is. 1159 */ 1160 static __rte_always_inline int 1161 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m, 1162 uint32_t *offset, uint16_t *vlan_tag) 1163 { 1164 struct vhost_dev *dst_vdev; 1165 struct rte_ether_hdr *pkt_hdr = 1166 rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1167 1168 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr); 1169 if (!dst_vdev) 1170 return 0; 1171 1172 if (vdev->vid == dst_vdev->vid) { 1173 RTE_LOG_DP(DEBUG, VHOST_DATA, 1174 "(%d) TX: src and dst MAC is same. Dropping packet.\n", 1175 vdev->vid); 1176 return -1; 1177 } 1178 1179 /* 1180 * HW vlan strip will reduce the packet length 1181 * by minus length of vlan tag, so need restore 1182 * the packet length by plus it. 
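 *
 * In VM2VM hardware mode the NIC re-inserts the tag on transmit, so the
 * offset reported here is RTE_VLAN_HLEN (4 bytes) and virtio_tx_route()
 * grows data_len/pkt_len by that amount before queueing the packet to the
 * physical port.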
1183 */ 1184 *offset = RTE_VLAN_HLEN; 1185 *vlan_tag = vlan_tags[vdev->vid]; 1186 1187 RTE_LOG_DP(DEBUG, VHOST_DATA, 1188 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n", 1189 vdev->vid, dst_vdev->vid, *vlan_tag); 1190 1191 return 0; 1192 } 1193 1194 static void virtio_tx_offload(struct rte_mbuf *m) 1195 { 1196 struct rte_net_hdr_lens hdr_lens; 1197 struct rte_ipv4_hdr *ipv4_hdr; 1198 struct rte_tcp_hdr *tcp_hdr; 1199 uint32_t ptype; 1200 void *l3_hdr; 1201 1202 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); 1203 m->l2_len = hdr_lens.l2_len; 1204 m->l3_len = hdr_lens.l3_len; 1205 m->l4_len = hdr_lens.l4_len; 1206 1207 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len); 1208 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *, 1209 m->l2_len + m->l3_len); 1210 1211 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG; 1212 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) { 1213 m->ol_flags |= RTE_MBUF_F_TX_IPV4; 1214 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; 1215 ipv4_hdr = l3_hdr; 1216 ipv4_hdr->hdr_checksum = 0; 1217 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags); 1218 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */ 1219 m->ol_flags |= RTE_MBUF_F_TX_IPV6; 1220 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags); 1221 } 1222 } 1223 1224 static __rte_always_inline void 1225 do_drain_mbuf_table(struct mbuf_table *tx_q) 1226 { 1227 uint16_t count; 1228 1229 count = rte_eth_tx_burst(ports[0], tx_q->txq_id, 1230 tx_q->m_table, tx_q->len); 1231 if (unlikely(count < tx_q->len)) 1232 free_pkts(&tx_q->m_table[count], tx_q->len - count); 1233 1234 tx_q->len = 0; 1235 } 1236 1237 /* 1238 * This function routes the TX packet to the correct interface. This 1239 * may be a local device or the physical port. 1240 */ 1241 static __rte_always_inline void 1242 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1243 { 1244 struct mbuf_table *tx_q; 1245 unsigned offset = 0; 1246 const uint16_t lcore_id = rte_lcore_id(); 1247 struct rte_ether_hdr *nh; 1248 1249 1250 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1251 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) { 1252 struct vhost_dev *vdev2; 1253 1254 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) { 1255 if (vdev2 != vdev) 1256 sync_virtio_xmit(vdev2, vdev, m); 1257 } 1258 goto queue2nic; 1259 } 1260 1261 /*check if destination is local VM*/ 1262 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) 1263 return; 1264 1265 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1266 if (unlikely(find_local_dest(vdev, m, &offset, 1267 &vlan_tag) != 0)) { 1268 rte_pktmbuf_free(m); 1269 return; 1270 } 1271 } 1272 1273 RTE_LOG_DP(DEBUG, VHOST_DATA, 1274 "(%d) TX: MAC address is external\n", vdev->vid); 1275 1276 queue2nic: 1277 1278 /*Add packet to the port tx queue*/ 1279 tx_q = &lcore_tx_queue[lcore_id]; 1280 1281 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1282 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) { 1283 /* Guest has inserted the vlan tag. */ 1284 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1); 1285 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1286 if ((vm2vm_mode == VM2VM_HARDWARE) && 1287 (vh->vlan_tci != vlan_tag_be)) 1288 vh->vlan_tci = vlan_tag_be; 1289 } else { 1290 m->ol_flags |= RTE_MBUF_F_TX_VLAN; 1291 1292 /* 1293 * Find the right seg to adjust the data len when offset is 1294 * bigger than tail room size. 
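 *
 * That is: walk the mbuf chain and grow data_len on the first segment that
 * still has tailroom for the extra bytes (falling back to the last
 * segment), while pkt_len is always accounted on the head mbuf.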
1295 */ 1296 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1297 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1298 m->data_len += offset; 1299 else { 1300 struct rte_mbuf *seg = m; 1301 1302 while ((seg->next != NULL) && 1303 (offset > rte_pktmbuf_tailroom(seg))) 1304 seg = seg->next; 1305 1306 seg->data_len += offset; 1307 } 1308 m->pkt_len += offset; 1309 } 1310 1311 m->vlan_tci = vlan_tag; 1312 } 1313 1314 if (m->ol_flags & RTE_MBUF_F_RX_LRO) 1315 virtio_tx_offload(m); 1316 1317 tx_q->m_table[tx_q->len++] = m; 1318 if (enable_stats) { 1319 vdev->stats.tx_total++; 1320 vdev->stats.tx++; 1321 } 1322 1323 if (unlikely(tx_q->len == MAX_PKT_BURST)) 1324 do_drain_mbuf_table(tx_q); 1325 } 1326 1327 1328 static __rte_always_inline void 1329 drain_mbuf_table(struct mbuf_table *tx_q) 1330 { 1331 static uint64_t prev_tsc; 1332 uint64_t cur_tsc; 1333 1334 if (tx_q->len == 0) 1335 return; 1336 1337 cur_tsc = rte_rdtsc(); 1338 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) { 1339 prev_tsc = cur_tsc; 1340 1341 RTE_LOG_DP(DEBUG, VHOST_DATA, 1342 "TX queue drained after timeout with burst size %u\n", 1343 tx_q->len); 1344 do_drain_mbuf_table(tx_q); 1345 } 1346 } 1347 1348 uint16_t 1349 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1350 struct rte_mbuf **pkts, uint32_t rx_count) 1351 { 1352 uint16_t enqueue_count; 1353 uint16_t enqueue_fail = 0; 1354 uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id; 1355 1356 complete_async_pkts(dev); 1357 enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id, 1358 pkts, rx_count, dma_id, 0); 1359 1360 enqueue_fail = rx_count - enqueue_count; 1361 if (enqueue_fail) 1362 free_pkts(&pkts[enqueue_count], enqueue_fail); 1363 1364 return enqueue_count; 1365 } 1366 1367 uint16_t 1368 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1369 struct rte_mbuf **pkts, uint32_t rx_count) 1370 { 1371 return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count); 1372 } 1373 1374 static __rte_always_inline void 1375 drain_eth_rx(struct vhost_dev *vdev) 1376 { 1377 uint16_t rx_count, enqueue_count; 1378 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1379 1380 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q, 1381 pkts, MAX_PKT_BURST); 1382 1383 if (!rx_count) 1384 return; 1385 1386 enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, 1387 VIRTIO_RXQ, pkts, rx_count); 1388 1389 /* Retry if necessary */ 1390 if (enable_retry && unlikely(enqueue_count < rx_count)) { 1391 uint32_t retry = 0; 1392 1393 while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) { 1394 rte_delay_us(burst_rx_delay_time); 1395 enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, 1396 VIRTIO_RXQ, &pkts[enqueue_count], 1397 rx_count - enqueue_count); 1398 } 1399 } 1400 1401 if (enable_stats) { 1402 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count, 1403 __ATOMIC_SEQ_CST); 1404 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count, 1405 __ATOMIC_SEQ_CST); 1406 } 1407 1408 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) 1409 free_pkts(pkts, rx_count); 1410 } 1411 1412 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1413 struct rte_mempool *mbuf_pool, 1414 struct rte_mbuf **pkts, uint16_t count) 1415 { 1416 int nr_inflight; 1417 uint16_t dequeue_count; 1418 int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id; 1419 1420 dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id, 1421 mbuf_pool, pkts, count, &nr_inflight, 
dma_id, 0); 1422 1423 return dequeue_count; 1424 } 1425 1426 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1427 struct rte_mempool *mbuf_pool, 1428 struct rte_mbuf **pkts, uint16_t count) 1429 { 1430 return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count); 1431 } 1432 1433 static __rte_always_inline void 1434 drain_virtio_tx(struct vhost_dev *vdev) 1435 { 1436 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1437 uint16_t count; 1438 uint16_t i; 1439 1440 count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev, 1441 VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST); 1442 1443 /* setup VMDq for the first packet */ 1444 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) { 1445 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) 1446 free_pkts(pkts, count); 1447 } 1448 1449 for (i = 0; i < count; ++i) 1450 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]); 1451 } 1452 1453 /* 1454 * Main function of vhost-switch. It basically does: 1455 * 1456 * for each vhost device { 1457 * - drain_eth_rx() 1458 * 1459 * Which drains the host eth Rx queue linked to the vhost device, 1460 * and delivers the packets to the guest virtio Rx ring associated with 1461 * this vhost device. 1462 * 1463 * - drain_virtio_tx() 1464 * 1465 * Which drains the guest virtio Tx queue and delivers the packets 1466 * to the target, which could be another vhost device or the 1467 * physical eth dev. The routing is done in virtio_tx_route(). 1468 * } 1469 */ 1470 static int 1471 switch_worker(void *arg __rte_unused) 1472 { 1473 unsigned i; 1474 unsigned lcore_id = rte_lcore_id(); 1475 struct vhost_dev *vdev; 1476 struct mbuf_table *tx_q; 1477 1478 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id); 1479 1480 tx_q = &lcore_tx_queue[lcore_id]; 1481 for (i = 0; i < rte_lcore_count(); i++) { 1482 if (lcore_ids[i] == lcore_id) { 1483 tx_q->txq_id = i; 1484 break; 1485 } 1486 } 1487 1488 while(1) { 1489 drain_mbuf_table(tx_q); 1490 drain_vhost_table(); 1491 /* 1492 * Inform the configuration core, when requested, that this worker has 1493 * finished walking the device list and no longer uses any device. 
1494 */ 1495 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL) 1496 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL; 1497 1498 /* 1499 * Process vhost devices 1500 */ 1501 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, 1502 lcore_vdev_entry) { 1503 if (unlikely(vdev->remove)) { 1504 unlink_vmdq(vdev); 1505 vdev->ready = DEVICE_SAFE_REMOVE; 1506 continue; 1507 } 1508 1509 if (likely(vdev->ready == DEVICE_RX)) 1510 drain_eth_rx(vdev); 1511 1512 if (likely(!vdev->remove)) 1513 drain_virtio_tx(vdev); 1514 } 1515 } 1516 1517 return 0; 1518 } 1519 1520 static void 1521 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id) 1522 { 1523 uint16_t n_pkt = 0; 1524 int pkts_inflight; 1525 1526 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id; 1527 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id); 1528 1529 struct rte_mbuf *m_cpl[pkts_inflight]; 1530 1531 while (pkts_inflight) { 1532 n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl, 1533 pkts_inflight, dma_id, 0); 1534 free_pkts(m_cpl, n_pkt); 1535 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, 1536 queue_id); 1537 } 1538 } 1539 1540 static void 1541 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id) 1542 { 1543 uint16_t n_pkt = 0; 1544 int pkts_inflight; 1545 1546 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id; 1547 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id); 1548 1549 struct rte_mbuf *m_cpl[pkts_inflight]; 1550 1551 while (pkts_inflight) { 1552 n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl, 1553 pkts_inflight, dma_id, 0); 1554 free_pkts(m_cpl, n_pkt); 1555 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id); 1556 } 1557 } 1558 1559 /* 1560 * Remove a device from the specific data core linked list and from the 1561 * main linked list. Synchronization occurs through the use of the 1562 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 1563 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 1564 */ 1565 static void 1566 destroy_device(int vid) 1567 { 1568 struct vhost_dev *vdev = NULL; 1569 int lcore; 1570 uint16_t i; 1571 1572 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1573 if (vdev->vid == vid) 1574 break; 1575 } 1576 if (!vdev) 1577 return; 1578 /*set the remove flag. */ 1579 vdev->remove = 1; 1580 while(vdev->ready != DEVICE_SAFE_REMOVE) { 1581 rte_pause(); 1582 } 1583 1584 for (i = 0; i < RTE_MAX_LCORE; i++) 1585 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]); 1586 1587 if (builtin_net_driver) 1588 vs_vhost_net_remove(vdev); 1589 1590 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, 1591 lcore_vdev_entry); 1592 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry); 1593 1594 1595 /* Set the dev_removal_flag on each lcore. */ 1596 RTE_LCORE_FOREACH_WORKER(lcore) 1597 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL; 1598 1599 /* 1600 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL 1601 * we can be sure that they can no longer access the device removed 1602 * from the linked lists and that the devices are no longer in use. 
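 *
 * Removal handshake, for reference:
 *   1. this thread sets dev_removal_flag = REQUEST_DEV_REMOVAL per worker;
 *   2. each worker answers with ACK_DEV_REMOVAL at the top of its
 *      switch_worker() loop, outside any walk of the device list;
 *   3. the loop below spins until every worker has acknowledged, after
 *      which the vdev memory can be freed safely.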
1603 */ 1604 RTE_LCORE_FOREACH_WORKER(lcore) { 1605 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL) 1606 rte_pause(); 1607 } 1608 1609 lcore_info[vdev->coreid].device_num--; 1610 1611 RTE_LOG(INFO, VHOST_DATA, 1612 "(%d) device has been removed from data core\n", 1613 vdev->vid); 1614 1615 if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) { 1616 vhost_clear_queue(vdev, VIRTIO_RXQ); 1617 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ); 1618 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false; 1619 } 1620 1621 if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) { 1622 vhost_clear_queue(vdev, VIRTIO_TXQ); 1623 rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ); 1624 dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false; 1625 } 1626 1627 rte_free(vdev); 1628 } 1629 1630 static inline int 1631 get_socketid_by_vid(int vid) 1632 { 1633 int i; 1634 char ifname[PATH_MAX]; 1635 rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); 1636 1637 for (i = 0; i < nb_sockets; i++) { 1638 char *file = socket_files + i * PATH_MAX; 1639 if (strcmp(file, ifname) == 0) 1640 return i; 1641 } 1642 1643 return -1; 1644 } 1645 1646 static int 1647 init_vhost_queue_ops(int vid) 1648 { 1649 if (builtin_net_driver) { 1650 vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts; 1651 vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts; 1652 } else { 1653 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled) 1654 vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts; 1655 else 1656 vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts; 1657 1658 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled) 1659 vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts; 1660 else 1661 vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts; 1662 } 1663 1664 return 0; 1665 } 1666 1667 static inline int 1668 vhost_async_channel_register(int vid) 1669 { 1670 int rx_ret = 0, tx_ret = 0; 1671 1672 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) { 1673 rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ); 1674 if (rx_ret == 0) 1675 dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true; 1676 } 1677 1678 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) { 1679 tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ); 1680 if (tx_ret == 0) 1681 dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true; 1682 } 1683 1684 return rx_ret | tx_ret; 1685 } 1686 1687 1688 1689 /* 1690 * A new device is added to a data core. First the device is added to the main linked list 1691 * and then allocated to a specific data core. 
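 *
 * The device is also bound to NIC queue
 *   vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base
 * and handed to the worker lcore currently hosting the fewest devices.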
1692 */ 1693 static int 1694 new_device(int vid) 1695 { 1696 int lcore, core_add = 0; 1697 uint16_t i; 1698 uint32_t device_num_min = num_devices; 1699 struct vhost_dev *vdev; 1700 int ret; 1701 1702 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 1703 if (vdev == NULL) { 1704 RTE_LOG(INFO, VHOST_DATA, 1705 "(%d) couldn't allocate memory for vhost dev\n", 1706 vid); 1707 return -1; 1708 } 1709 vdev->vid = vid; 1710 1711 for (i = 0; i < RTE_MAX_LCORE; i++) { 1712 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] 1713 = rte_zmalloc("vhost bufftable", 1714 sizeof(struct vhost_bufftable), 1715 RTE_CACHE_LINE_SIZE); 1716 1717 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) { 1718 RTE_LOG(INFO, VHOST_DATA, 1719 "(%d) couldn't allocate memory for vhost TX\n", vid); 1720 return -1; 1721 } 1722 } 1723 1724 int socketid = get_socketid_by_vid(vid); 1725 if (socketid == -1) 1726 return -1; 1727 1728 init_vid2socketid_array(vid, socketid); 1729 1730 ret = vhost_async_channel_register(vid); 1731 1732 if (init_vhost_queue_ops(vid) != 0) 1733 return -1; 1734 1735 if (builtin_net_driver) 1736 vs_vhost_net_setup(vdev); 1737 1738 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry); 1739 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base; 1740 1741 /*reset ready flag*/ 1742 vdev->ready = DEVICE_MAC_LEARNING; 1743 vdev->remove = 0; 1744 1745 /* Find a suitable lcore to add the device. */ 1746 RTE_LCORE_FOREACH_WORKER(lcore) { 1747 if (lcore_info[lcore].device_num < device_num_min) { 1748 device_num_min = lcore_info[lcore].device_num; 1749 core_add = lcore; 1750 } 1751 } 1752 vdev->coreid = core_add; 1753 1754 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, 1755 lcore_vdev_entry); 1756 lcore_info[vdev->coreid].device_num++; 1757 1758 /* Disable notifications. */ 1759 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); 1760 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); 1761 1762 RTE_LOG(INFO, VHOST_DATA, 1763 "(%d) device has been added to data core %d\n", 1764 vid, vdev->coreid); 1765 1766 return ret; 1767 } 1768 1769 static int 1770 vring_state_changed(int vid, uint16_t queue_id, int enable) 1771 { 1772 struct vhost_dev *vdev = NULL; 1773 1774 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1775 if (vdev->vid == vid) 1776 break; 1777 } 1778 if (!vdev) 1779 return -1; 1780 1781 if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) { 1782 if (!enable) 1783 vhost_clear_queue_thread_unsafe(vdev, queue_id); 1784 } 1785 1786 return 0; 1787 } 1788 1789 /* 1790 * These callback allow devices to be added to the data core when configuration 1791 * has been fully complete. 1792 */ 1793 static const struct rte_vhost_device_ops virtio_net_device_ops = 1794 { 1795 .new_device = new_device, 1796 .destroy_device = destroy_device, 1797 .vring_state_changed = vring_state_changed, 1798 }; 1799 1800 /* 1801 * This is a thread will wake up after a period to print stats if the user has 1802 * enabled them. 
1803 */ 1804 static void * 1805 print_stats(__rte_unused void *arg) 1806 { 1807 struct vhost_dev *vdev; 1808 uint64_t tx_dropped, rx_dropped; 1809 uint64_t tx, tx_total, rx, rx_total; 1810 const char clr[] = { 27, '[', '2', 'J', '\0' }; 1811 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 1812 1813 while(1) { 1814 sleep(enable_stats); 1815 1816 /* Clear screen and move to top left */ 1817 printf("%s%s\n", clr, top_left); 1818 printf("Device statistics =================================\n"); 1819 1820 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1821 tx_total = vdev->stats.tx_total; 1822 tx = vdev->stats.tx; 1823 tx_dropped = tx_total - tx; 1824 1825 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic, 1826 __ATOMIC_SEQ_CST); 1827 rx = __atomic_load_n(&vdev->stats.rx_atomic, 1828 __ATOMIC_SEQ_CST); 1829 rx_dropped = rx_total - rx; 1830 1831 printf("Statistics for device %d\n" 1832 "-----------------------\n" 1833 "TX total: %" PRIu64 "\n" 1834 "TX dropped: %" PRIu64 "\n" 1835 "TX successful: %" PRIu64 "\n" 1836 "RX total: %" PRIu64 "\n" 1837 "RX dropped: %" PRIu64 "\n" 1838 "RX successful: %" PRIu64 "\n", 1839 vdev->vid, 1840 tx_total, tx_dropped, tx, 1841 rx_total, rx_dropped, rx); 1842 } 1843 1844 printf("===================================================\n"); 1845 1846 fflush(stdout); 1847 } 1848 1849 return NULL; 1850 } 1851 1852 static void 1853 unregister_drivers(int socket_num) 1854 { 1855 int i, ret; 1856 1857 for (i = 0; i < socket_num; i++) { 1858 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX); 1859 if (ret != 0) 1860 RTE_LOG(ERR, VHOST_CONFIG, 1861 "Fail to unregister vhost driver for %s.\n", 1862 socket_files + i * PATH_MAX); 1863 } 1864 } 1865 1866 /* When we receive a INT signal, unregister vhost driver */ 1867 static void 1868 sigint_handler(__rte_unused int signum) 1869 { 1870 /* Unregister vhost driver. */ 1871 unregister_drivers(nb_sockets); 1872 1873 exit(0); 1874 } 1875 1876 static void 1877 reset_dma(void) 1878 { 1879 int i; 1880 1881 for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) { 1882 int j; 1883 1884 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) { 1885 dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID; 1886 dma_bind[i].dmas[j].async_enabled = false; 1887 } 1888 } 1889 1890 for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) 1891 dmas_id[i] = INVALID_DMA_ID; 1892 } 1893 1894 /* 1895 * Main function, does initialisation and calls the per-lcore functions. 1896 */ 1897 int 1898 main(int argc, char *argv[]) 1899 { 1900 unsigned lcore_id, core_id = 0; 1901 unsigned nb_ports, valid_num_ports; 1902 int ret, i; 1903 uint16_t portid; 1904 static pthread_t tid; 1905 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS; 1906 1907 signal(SIGINT, sigint_handler); 1908 1909 /* init EAL */ 1910 ret = rte_eal_init(argc, argv); 1911 if (ret < 0) 1912 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 1913 argc -= ret; 1914 argv += ret; 1915 1916 /* initialize dma structures */ 1917 reset_dma(); 1918 1919 /* parse app arguments */ 1920 ret = us_vhost_parse_args(argc, argv); 1921 if (ret < 0) 1922 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 1923 1924 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 1925 TAILQ_INIT(&lcore_info[lcore_id].vdev_list); 1926 1927 if (rte_lcore_is_enabled(lcore_id)) 1928 lcore_ids[core_id++] = lcore_id; 1929 } 1930 1931 if (rte_lcore_count() > RTE_MAX_LCORE) 1932 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 1933 1934 /* Get the number of physical ports. 
*/ 1935 nb_ports = rte_eth_dev_count_avail(); 1936 1937 /* 1938 * Update the global var NUM_PORTS and global array PORTS 1939 * and get value of var VALID_NUM_PORTS according to system ports number 1940 */ 1941 valid_num_ports = check_ports_num(nb_ports); 1942 1943 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 1944 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 1945 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 1946 return -1; 1947 } 1948 1949 /* 1950 * FIXME: here we are trying to allocate mbufs big enough for 1951 * @MAX_QUEUES, but the truth is we're never going to use that 1952 * many queues here. We probably should only do allocation for 1953 * those queues we are going to use. 1954 */ 1955 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs, 1956 MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE, 1957 rte_socket_id()); 1958 if (mbuf_pool == NULL) 1959 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 1960 1961 if (vm2vm_mode == VM2VM_HARDWARE) { 1962 /* Enable VT loop back to let L2 switch to do it. */ 1963 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 1964 RTE_LOG(DEBUG, VHOST_CONFIG, 1965 "Enable loop back for L2 switch in vmdq.\n"); 1966 } 1967 1968 /* initialize all ports */ 1969 RTE_ETH_FOREACH_DEV(portid) { 1970 /* skip ports that are not enabled */ 1971 if ((enabled_port_mask & (1 << portid)) == 0) { 1972 RTE_LOG(INFO, VHOST_PORT, 1973 "Skipping disabled port %d\n", portid); 1974 continue; 1975 } 1976 if (port_init(portid) != 0) 1977 rte_exit(EXIT_FAILURE, 1978 "Cannot initialize network ports\n"); 1979 } 1980 1981 /* Enable stats if the user option is set. */ 1982 if (enable_stats) { 1983 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL, 1984 print_stats, NULL); 1985 if (ret < 0) 1986 rte_exit(EXIT_FAILURE, 1987 "Cannot create print-stats thread\n"); 1988 } 1989 1990 /* Launch all data cores. */ 1991 RTE_LCORE_FOREACH_WORKER(lcore_id) 1992 rte_eal_remote_launch(switch_worker, NULL, lcore_id); 1993 1994 if (client_mode) 1995 flags |= RTE_VHOST_USER_CLIENT; 1996 1997 for (i = 0; i < dma_count; i++) { 1998 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) { 1999 RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n"); 2000 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n"); 2001 } 2002 } 2003 2004 /* Register vhost user driver to handle vhost messages. 
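 *
 * One vhost-user socket is registered per --socket-file argument; the
 * RTE_VHOST_USER_ASYNC_COPY flag is OR-ed into the registration flags for
 * sockets that were given a DMA channel via --dmas.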
*/ 2005 for (i = 0; i < nb_sockets; i++) { 2006 char *file = socket_files + i * PATH_MAX; 2007 2008 if (dma_count && get_async_flag_by_socketid(i) != 0) 2009 flags = flags | RTE_VHOST_USER_ASYNC_COPY; 2010 2011 ret = rte_vhost_driver_register(file, flags); 2012 if (ret != 0) { 2013 unregister_drivers(i); 2014 rte_exit(EXIT_FAILURE, 2015 "vhost driver register failure.\n"); 2016 } 2017 2018 if (builtin_net_driver) 2019 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES); 2020 2021 if (mergeable == 0) { 2022 rte_vhost_driver_disable_features(file, 2023 1ULL << VIRTIO_NET_F_MRG_RXBUF); 2024 } 2025 2026 if (enable_tx_csum == 0) { 2027 rte_vhost_driver_disable_features(file, 2028 1ULL << VIRTIO_NET_F_CSUM); 2029 } 2030 2031 if (enable_tso == 0) { 2032 rte_vhost_driver_disable_features(file, 2033 1ULL << VIRTIO_NET_F_HOST_TSO4); 2034 rte_vhost_driver_disable_features(file, 2035 1ULL << VIRTIO_NET_F_HOST_TSO6); 2036 rte_vhost_driver_disable_features(file, 2037 1ULL << VIRTIO_NET_F_GUEST_TSO4); 2038 rte_vhost_driver_disable_features(file, 2039 1ULL << VIRTIO_NET_F_GUEST_TSO6); 2040 } 2041 2042 if (promiscuous) { 2043 rte_vhost_driver_enable_features(file, 2044 1ULL << VIRTIO_NET_F_CTRL_RX); 2045 } 2046 2047 ret = rte_vhost_driver_callback_register(file, 2048 &virtio_net_device_ops); 2049 if (ret != 0) { 2050 rte_exit(EXIT_FAILURE, 2051 "failed to register vhost driver callbacks.\n"); 2052 } 2053 2054 if (rte_vhost_driver_start(file) < 0) { 2055 rte_exit(EXIT_FAILURE, 2056 "failed to start vhost driver.\n"); 2057 } 2058 } 2059 2060 RTE_LCORE_FOREACH_WORKER(lcore_id) 2061 rte_eal_wait_lcore(lcore_id); 2062 2063 /* clean up the EAL */ 2064 rte_eal_cleanup(); 2065 2066 return 0; 2067 } 2068
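/*
 * Minimal sketch (not part of the application flow) of the async data-path
 * setup performed above: configure and start one DMA channel, attach it to
 * the vhost library, then register the async channel for a device's Rx
 * queue as new_device() would. The helper name example_async_setup() is
 * hypothetical and error handling is trimmed; DMA_RING_SIZE comes from
 * this file.
 */
static __rte_unused int
example_async_setup(int16_t dma_id, int vid)
{
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE,
	};

	/* Bring up one memory-to-memory vchan on the DMA device. */
	if (rte_dma_configure(dma_id, &dev_config) != 0 ||
			rte_dma_vchan_setup(dma_id, 0, &qconf) != 0 ||
			rte_dma_start(dma_id) != 0)
		return -1;

	/* Tell the vhost library about the DMA channel (vchan 0)... */
	if (rte_vhost_async_dma_configure(dma_id, 0) < 0)
		return -1;

	/* ...and enable async enqueue on the guest Rx ring of this device. */
	return rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
}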