/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

#define NUM_MBUFS_DEFAULT 0x24000

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

#define ASYNC_ENQUEUE_VHOST 1
#define ASYNC_DEQUEUE_VHOST 2

/* number of mbufs in all pools - if specified on command-line. */
static int total_num_mbufs = NUM_MBUFS_DEFAULT;

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];

/* Empty VMDq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * without it, IPv4 forwarding in the guest cannot
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

static int vid2socketid[RTE_MAX_VHOST_DEVICE];

static inline uint32_t
get_async_flag_by_socketid(int socketid)
{
	return dma_bind[socketid].async_flag;
}

static inline void
init_vid2socketid_array(int vid, int socketid)
{
	vid2socketid[vid] = socketid;
}

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}
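/*
 * Parse the "--dmas" parameter, e.g. "[txd0@0000:00:04.0,rxd0@0000:00:04.1]".
 * Each "txdN"/"rxdN" entry binds the enqueue (guest Rx) or dequeue (guest Tx)
 * path of the N-th vhost socket to the named DMA device, which is configured
 * and started on first use.
 */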
static inline int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t socketid, vring_id;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* process DMA devices within bracket. */
	addrs++;
	substr = strtok(addrs, ";]");
	if (!substr) {
		ret = -1;
		goto out;
	}

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
	if (args_nr <= 0) {
		ret = -1;
		goto out;
	}

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		char *txd, *rxd;
		uint8_t sub_nr;
		int async_flag;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
		if (sub_nr != 2) {
			ret = -1;
			goto out;
		}

		txd = strstr(ptrs[0], "txd");
		rxd = strstr(ptrs[0], "rxd");
		if (txd) {
			start = txd;
			vring_id = VIRTIO_RXQ;
			async_flag = ASYNC_ENQUEUE_VHOST;
		} else if (rxd) {
			start = rxd;
			vring_id = VIRTIO_TXQ;
			async_flag = ASYNC_DEQUEUE_VHOST;
		} else {
			ret = -1;
			goto out;
		}

		start += 3;
		socketid = strtol(start, &end, 0);
		if (end == start) {
			ret = -1;
			goto out;
		}

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by DMA device */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
		(dma_info + socketid)->async_flag |= async_flag;
		i++;
	}
out:
	free(input);
	return ret;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}
	if (dev_info.max_vmdq_pools == 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
		return -1;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* parse number string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --socket-file <path>\n"
	" --nb-devices ND\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Effective only if RX retries are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on RX. Effective only if RX retries are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --socket-file: The path of the socket file.\n"
	" --tx-csum [0|1] disable/enable TX checksum offload.\n"
	" --tso [0|1] disable/enable TCP segmentation offload.\n"
	" --client register a vhost-user socket as client mode.\n"
	" --dmas register dma channel for specific vhost device.\n"
	" --total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
	       prgname);
}
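/*
 * Long-option identifiers. Values start at 256 so they can never collide
 * with the single-character short options handled by getopt_long().
 */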
enum {
#define OPT_VM2VM "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS "dmas"
	OPT_DMAS_NUM,
#define OPT_NUM_MBUFS "total-num-mbufs"
	OPT_NUM_MBUFS_NUM,
};

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{OPT_NUM_MBUFS, required_argument,
				NULL, OPT_NUM_MBUFS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_NUM_MBUFS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for total-num-mbufs [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}

			if (total_num_mbufs < ret)
				total_num_mbufs = ret;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM,
 * and if so, get its vlan tag and offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the length of the
	 * vlan tag, so we need to restore the packet length by adding
	 * it back.
	 */
	*offset = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

uint16_t
async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	uint16_t enqueue_count;
	uint16_t enqueue_fail = 0;
	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_async_pkts(dev);
	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
				pkts, rx_count, dma_id, 0);

	enqueue_fail = rx_count - enqueue_count;
	if (enqueue_fail)
		free_pkts(&pkts[enqueue_count], enqueue_fail);

	return enqueue_count;
}

uint16_t
sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count
	 * packets, to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
					VIRTIO_RXQ, pkts, rx_count);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}

uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			    struct rte_mempool *mbuf_pool,
			    struct rte_mbuf **pkts, uint16_t count)
{
	int nr_inflight;
	uint16_t dequeue_count;
	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;

	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);

	return dequeue_count;
}

uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			   struct rte_mempool *mbuf_pool,
			   struct rte_mbuf **pkts, uint16_t count)
{
	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

static void
vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
							pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
									queue_id);
	}
}

static void
vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl,
					pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id);
	}
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		vhost_clear_queue(vdev, VIRTIO_RXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
		vhost_clear_queue(vdev, VIRTIO_TXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
	}

	rte_free(vdev);
}
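/* Map a vhost device ID to the index of its socket file in socket_files[]. */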
static inline int
get_socketid_by_vid(int vid)
{
	int i;
	char ifname[PATH_MAX];
	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));

	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		if (strcmp(file, ifname) == 0)
			return i;
	}

	return -1;
}

static int
init_vhost_queue_ops(int vid)
{
	if (builtin_net_driver) {
		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
	} else {
		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
		else
			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;

		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
		else
			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
	}

	return 0;
}
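/*
 * Register async channels for the queues that have a DMA device bound,
 * and mark them async-enabled on success.
 */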
static inline int
vhost_async_channel_register(int vid)
{
	int rx_ret = 0, tx_ret = 0;

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (rx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
	}

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
		if (tx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
	}

	return rx_ret | tx_ret;
}


/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	int ret;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	int socketid = get_socketid_by_vid(vid);
	if (socketid == -1)
		return -1;

	init_vid2socketid_array(vid, socketid);

	ret = vhost_async_channel_register(vid);

	if (init_vhost_queue_ops(vid) != 0)
		return -1;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return ret;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
		if (!enable)
			vhost_clear_queue_thread_unsafe(vdev, queue_id);
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This thread wakes up periodically to print stats if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive a SIGINT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global vars NUM_PORTS and PORTS, and compute
	 * VALID_NUM_PORTS, according to the number of system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (dma_count && get_async_flag_by_socketid(i) != 0)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}