1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2017 Intel Corporation 3 */ 4 5 #include <arpa/inet.h> 6 #include <getopt.h> 7 #include <linux/if_ether.h> 8 #include <linux/if_vlan.h> 9 #include <linux/virtio_net.h> 10 #include <linux/virtio_ring.h> 11 #include <signal.h> 12 #include <stdint.h> 13 #include <sys/eventfd.h> 14 #include <sys/param.h> 15 #include <unistd.h> 16 17 #include <rte_cycles.h> 18 #include <rte_ethdev.h> 19 #include <rte_log.h> 20 #include <rte_string_fns.h> 21 #include <rte_malloc.h> 22 #include <rte_net.h> 23 #include <rte_vhost.h> 24 #include <rte_ip.h> 25 #include <rte_tcp.h> 26 #include <rte_pause.h> 27 28 #include "ioat.h" 29 #include "main.h" 30 31 #ifndef MAX_QUEUES 32 #define MAX_QUEUES 128 33 #endif 34 35 /* the maximum number of external ports supported */ 36 #define MAX_SUP_PORTS 1 37 38 #define MBUF_CACHE_SIZE 128 39 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 40 41 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 42 43 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 44 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 45 46 #define JUMBO_FRAME_MAX_SIZE 0x2600 47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)) 48 49 /* State of virtio device. */ 50 #define DEVICE_MAC_LEARNING 0 51 #define DEVICE_RX 1 52 #define DEVICE_SAFE_REMOVE 2 53 54 /* Configurable number of RX/TX ring descriptors */ 55 #define RTE_TEST_RX_DESC_DEFAULT 1024 56 #define RTE_TEST_TX_DESC_DEFAULT 512 57 58 #define INVALID_PORT_ID 0xFF 59 60 /* mask of enabled ports */ 61 static uint32_t enabled_port_mask = 0; 62 63 /* Promiscuous mode */ 64 static uint32_t promiscuous; 65 66 /* number of devices/queues to support*/ 67 static uint32_t num_queues = 0; 68 static uint32_t num_devices; 69 70 static struct rte_mempool *mbuf_pool; 71 static int mergeable; 72 73 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 74 typedef enum { 75 VM2VM_DISABLED = 0, 76 VM2VM_SOFTWARE = 1, 77 VM2VM_HARDWARE = 2, 78 VM2VM_LAST 79 } vm2vm_type; 80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 81 82 /* Enable stats. */ 83 static uint32_t enable_stats = 0; 84 /* Enable retries on RX. */ 85 static uint32_t enable_retry = 1; 86 87 /* Disable TX checksum offload */ 88 static uint32_t enable_tx_csum; 89 90 /* Disable TSO offload */ 91 static uint32_t enable_tso; 92 93 static int client_mode; 94 95 static int builtin_net_driver; 96 97 static int async_vhost_driver; 98 99 static char *dma_type; 100 101 /* Specify timeout (in useconds) between retries on RX. */ 102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 103 /* Specify the number of retries on RX. */ 104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 105 106 /* Socket file paths. Can be set by user */ 107 static char *socket_files; 108 static int nb_sockets; 109 110 /* empty vmdq configuration structure. Filled in programatically */ 111 static struct rte_eth_conf vmdq_conf_default = { 112 .rxmode = { 113 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY, 114 .split_hdr_size = 0, 115 /* 116 * VLAN strip is necessary for 1G NIC such as I350, 117 * this fixes bug of ipv4 forwarding in guest can't 118 * forward pakets from one virtio dev to another virtio dev. 119 */ 120 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP, 121 }, 122 123 .txmode = { 124 .mq_mode = RTE_ETH_MQ_TX_NONE, 125 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | 126 RTE_ETH_TX_OFFLOAD_TCP_CKSUM | 127 RTE_ETH_TX_OFFLOAD_VLAN_INSERT | 128 RTE_ETH_TX_OFFLOAD_MULTI_SEGS | 129 RTE_ETH_TX_OFFLOAD_TCP_TSO), 130 }, 131 .rx_adv_conf = { 132 /* 133 * should be overridden separately in code with 134 * appropriate values 135 */ 136 .vmdq_rx_conf = { 137 .nb_queue_pools = RTE_ETH_8_POOLS, 138 .enable_default_pool = 0, 139 .default_pool = 0, 140 .nb_pool_maps = 0, 141 .pool_map = {{0, 0},}, 142 }, 143 }, 144 }; 145 146 147 static unsigned lcore_ids[RTE_MAX_LCORE]; 148 static uint16_t ports[RTE_MAX_ETHPORTS]; 149 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 150 static uint16_t num_pf_queues, num_vmdq_queues; 151 static uint16_t vmdq_pool_base, vmdq_queue_base; 152 static uint16_t queues_per_pool; 153 154 const uint16_t vlan_tags[] = { 155 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 156 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 157 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 158 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 159 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 160 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 161 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 162 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 163 }; 164 165 /* ethernet addresses of ports */ 166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 167 168 static struct vhost_dev_tailq_list vhost_dev_list = 169 TAILQ_HEAD_INITIALIZER(vhost_dev_list); 170 171 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 172 173 /* Used for queueing bursts of TX packets. */ 174 struct mbuf_table { 175 unsigned len; 176 unsigned txq_id; 177 struct rte_mbuf *m_table[MAX_PKT_BURST]; 178 }; 179 180 struct vhost_bufftable { 181 uint32_t len; 182 uint64_t pre_tsc; 183 struct rte_mbuf *m_table[MAX_PKT_BURST]; 184 }; 185 186 /* TX queue for each data core. */ 187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 188 189 /* 190 * Vhost TX buffer for each data core. 191 * Every data core maintains a TX buffer for every vhost device, 192 * which is used for batch pkts enqueue for higher performance. 193 */ 194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE]; 195 196 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \ 197 / US_PER_S * BURST_TX_DRAIN_US) 198 #define VLAN_HLEN 4 199 200 static inline int 201 open_dma(const char *value) 202 { 203 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) 204 return open_ioat(value); 205 206 return -1; 207 } 208 209 /* 210 * Builds up the correct configuration for VMDQ VLAN pool map 211 * according to the pool & queue limits. 212 */ 213 static inline int 214 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 215 { 216 struct rte_eth_vmdq_rx_conf conf; 217 struct rte_eth_vmdq_rx_conf *def_conf = 218 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 219 unsigned i; 220 221 memset(&conf, 0, sizeof(conf)); 222 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 223 conf.nb_pool_maps = num_devices; 224 conf.enable_loop_back = def_conf->enable_loop_back; 225 conf.rx_mode = def_conf->rx_mode; 226 227 for (i = 0; i < conf.nb_pool_maps; i++) { 228 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 229 conf.pool_map[i].pools = (1UL << i); 230 } 231 232 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 233 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, 234 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 235 return 0; 236 } 237 238 /* 239 * Initialises a given port using global settings and with the rx buffers 240 * coming from the mbuf_pool passed as parameter 241 */ 242 static inline int 243 port_init(uint16_t port) 244 { 245 struct rte_eth_dev_info dev_info; 246 struct rte_eth_conf port_conf; 247 struct rte_eth_rxconf *rxconf; 248 struct rte_eth_txconf *txconf; 249 int16_t rx_rings, tx_rings; 250 uint16_t rx_ring_size, tx_ring_size; 251 int retval; 252 uint16_t q; 253 254 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 255 retval = rte_eth_dev_info_get(port, &dev_info); 256 if (retval != 0) { 257 RTE_LOG(ERR, VHOST_PORT, 258 "Error during getting device (port %u) info: %s\n", 259 port, strerror(-retval)); 260 261 return retval; 262 } 263 264 rxconf = &dev_info.default_rxconf; 265 txconf = &dev_info.default_txconf; 266 rxconf->rx_drop_en = 1; 267 268 /*configure the number of supported virtio devices based on VMDQ limits */ 269 num_devices = dev_info.max_vmdq_pools; 270 271 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 272 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 273 274 tx_rings = (uint16_t)rte_lcore_count(); 275 276 if (mergeable) { 277 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu) 278 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu; 279 else 280 vmdq_conf_default.rxmode.mtu = MAX_MTU; 281 } 282 283 /* Get port configuration. */ 284 retval = get_eth_conf(&port_conf, num_devices); 285 if (retval < 0) 286 return retval; 287 /* NIC queues are divided into pf queues and vmdq queues. */ 288 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 289 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 290 num_vmdq_queues = num_devices * queues_per_pool; 291 num_queues = num_pf_queues + num_vmdq_queues; 292 vmdq_queue_base = dev_info.vmdq_queue_base; 293 vmdq_pool_base = dev_info.vmdq_pool_base; 294 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 295 num_pf_queues, num_devices, queues_per_pool); 296 297 if (!rte_eth_dev_is_valid_port(port)) 298 return -1; 299 300 rx_rings = (uint16_t)dev_info.max_rx_queues; 301 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 302 port_conf.txmode.offloads |= 303 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 304 /* Configure ethernet device. */ 305 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 306 if (retval != 0) { 307 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n", 308 port, strerror(-retval)); 309 return retval; 310 } 311 312 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size, 313 &tx_ring_size); 314 if (retval != 0) { 315 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors " 316 "for port %u: %s.\n", port, strerror(-retval)); 317 return retval; 318 } 319 if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) { 320 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size " 321 "for Rx queues on port %u.\n", port); 322 return -1; 323 } 324 325 /* Setup the queues. */ 326 rxconf->offloads = port_conf.rxmode.offloads; 327 for (q = 0; q < rx_rings; q ++) { 328 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 329 rte_eth_dev_socket_id(port), 330 rxconf, 331 mbuf_pool); 332 if (retval < 0) { 333 RTE_LOG(ERR, VHOST_PORT, 334 "Failed to setup rx queue %u of port %u: %s.\n", 335 q, port, strerror(-retval)); 336 return retval; 337 } 338 } 339 txconf->offloads = port_conf.txmode.offloads; 340 for (q = 0; q < tx_rings; q ++) { 341 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 342 rte_eth_dev_socket_id(port), 343 txconf); 344 if (retval < 0) { 345 RTE_LOG(ERR, VHOST_PORT, 346 "Failed to setup tx queue %u of port %u: %s.\n", 347 q, port, strerror(-retval)); 348 return retval; 349 } 350 } 351 352 /* Start the device. */ 353 retval = rte_eth_dev_start(port); 354 if (retval < 0) { 355 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n", 356 port, strerror(-retval)); 357 return retval; 358 } 359 360 if (promiscuous) { 361 retval = rte_eth_promiscuous_enable(port); 362 if (retval != 0) { 363 RTE_LOG(ERR, VHOST_PORT, 364 "Failed to enable promiscuous mode on port %u: %s\n", 365 port, rte_strerror(-retval)); 366 return retval; 367 } 368 } 369 370 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 371 if (retval < 0) { 372 RTE_LOG(ERR, VHOST_PORT, 373 "Failed to get MAC address on port %u: %s\n", 374 port, rte_strerror(-retval)); 375 return retval; 376 } 377 378 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 379 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 380 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 381 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port])); 382 383 return 0; 384 } 385 386 /* 387 * Set socket file path. 388 */ 389 static int 390 us_vhost_parse_socket_path(const char *q_arg) 391 { 392 char *old; 393 394 /* parse number string */ 395 if (strnlen(q_arg, PATH_MAX) == PATH_MAX) 396 return -1; 397 398 old = socket_files; 399 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1)); 400 if (socket_files == NULL) { 401 free(old); 402 return -1; 403 } 404 405 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX); 406 nb_sockets++; 407 408 return 0; 409 } 410 411 /* 412 * Parse the portmask provided at run time. 413 */ 414 static int 415 parse_portmask(const char *portmask) 416 { 417 char *end = NULL; 418 unsigned long pm; 419 420 errno = 0; 421 422 /* parse hexadecimal string */ 423 pm = strtoul(portmask, &end, 16); 424 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 425 return 0; 426 427 return pm; 428 429 } 430 431 /* 432 * Parse num options at run time. 433 */ 434 static int 435 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 436 { 437 char *end = NULL; 438 unsigned long num; 439 440 errno = 0; 441 442 /* parse unsigned int string */ 443 num = strtoul(q_arg, &end, 10); 444 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 445 return -1; 446 447 if (num > max_valid_value) 448 return -1; 449 450 return num; 451 452 } 453 454 /* 455 * Display usage 456 */ 457 static void 458 us_vhost_usage(const char *prgname) 459 { 460 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 461 " --vm2vm [0|1|2]\n" 462 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 463 " --socket-file <path>\n" 464 " --nb-devices ND\n" 465 " -p PORTMASK: Set mask for ports to be used by application\n" 466 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 467 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 468 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 469 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 470 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 471 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 472 " --socket-file: The path of the socket file.\n" 473 " --tx-csum [0|1] disable/enable TX checksum offload.\n" 474 " --tso [0|1] disable/enable TCP segment offload.\n" 475 " --client register a vhost-user socket as client mode.\n" 476 " --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n" 477 " --dmas register dma channel for specific vhost device.\n", 478 prgname); 479 } 480 481 enum { 482 #define OPT_VM2VM "vm2vm" 483 OPT_VM2VM_NUM = 256, 484 #define OPT_RX_RETRY "rx-retry" 485 OPT_RX_RETRY_NUM, 486 #define OPT_RX_RETRY_DELAY "rx-retry-delay" 487 OPT_RX_RETRY_DELAY_NUM, 488 #define OPT_RX_RETRY_NUMB "rx-retry-num" 489 OPT_RX_RETRY_NUMB_NUM, 490 #define OPT_MERGEABLE "mergeable" 491 OPT_MERGEABLE_NUM, 492 #define OPT_STATS "stats" 493 OPT_STATS_NUM, 494 #define OPT_SOCKET_FILE "socket-file" 495 OPT_SOCKET_FILE_NUM, 496 #define OPT_TX_CSUM "tx-csum" 497 OPT_TX_CSUM_NUM, 498 #define OPT_TSO "tso" 499 OPT_TSO_NUM, 500 #define OPT_CLIENT "client" 501 OPT_CLIENT_NUM, 502 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver" 503 OPT_BUILTIN_NET_DRIVER_NUM, 504 #define OPT_DMA_TYPE "dma-type" 505 OPT_DMA_TYPE_NUM, 506 #define OPT_DMAS "dmas" 507 OPT_DMAS_NUM, 508 }; 509 510 /* 511 * Parse the arguments given in the command line of the application. 512 */ 513 static int 514 us_vhost_parse_args(int argc, char **argv) 515 { 516 int opt, ret; 517 int option_index; 518 unsigned i; 519 const char *prgname = argv[0]; 520 static struct option long_option[] = { 521 {OPT_VM2VM, required_argument, 522 NULL, OPT_VM2VM_NUM}, 523 {OPT_RX_RETRY, required_argument, 524 NULL, OPT_RX_RETRY_NUM}, 525 {OPT_RX_RETRY_DELAY, required_argument, 526 NULL, OPT_RX_RETRY_DELAY_NUM}, 527 {OPT_RX_RETRY_NUMB, required_argument, 528 NULL, OPT_RX_RETRY_NUMB_NUM}, 529 {OPT_MERGEABLE, required_argument, 530 NULL, OPT_MERGEABLE_NUM}, 531 {OPT_STATS, required_argument, 532 NULL, OPT_STATS_NUM}, 533 {OPT_SOCKET_FILE, required_argument, 534 NULL, OPT_SOCKET_FILE_NUM}, 535 {OPT_TX_CSUM, required_argument, 536 NULL, OPT_TX_CSUM_NUM}, 537 {OPT_TSO, required_argument, 538 NULL, OPT_TSO_NUM}, 539 {OPT_CLIENT, no_argument, 540 NULL, OPT_CLIENT_NUM}, 541 {OPT_BUILTIN_NET_DRIVER, no_argument, 542 NULL, OPT_BUILTIN_NET_DRIVER_NUM}, 543 {OPT_DMA_TYPE, required_argument, 544 NULL, OPT_DMA_TYPE_NUM}, 545 {OPT_DMAS, required_argument, 546 NULL, OPT_DMAS_NUM}, 547 {NULL, 0, 0, 0}, 548 }; 549 550 /* Parse command line */ 551 while ((opt = getopt_long(argc, argv, "p:P", 552 long_option, &option_index)) != EOF) { 553 switch (opt) { 554 /* Portmask */ 555 case 'p': 556 enabled_port_mask = parse_portmask(optarg); 557 if (enabled_port_mask == 0) { 558 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 559 us_vhost_usage(prgname); 560 return -1; 561 } 562 break; 563 564 case 'P': 565 promiscuous = 1; 566 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 567 RTE_ETH_VMDQ_ACCEPT_BROADCAST | 568 RTE_ETH_VMDQ_ACCEPT_MULTICAST; 569 break; 570 571 case OPT_VM2VM_NUM: 572 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 573 if (ret == -1) { 574 RTE_LOG(INFO, VHOST_CONFIG, 575 "Invalid argument for " 576 "vm2vm [0|1|2]\n"); 577 us_vhost_usage(prgname); 578 return -1; 579 } 580 vm2vm_mode = (vm2vm_type)ret; 581 break; 582 583 case OPT_RX_RETRY_NUM: 584 ret = parse_num_opt(optarg, 1); 585 if (ret == -1) { 586 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 587 us_vhost_usage(prgname); 588 return -1; 589 } 590 enable_retry = ret; 591 break; 592 593 case OPT_TX_CSUM_NUM: 594 ret = parse_num_opt(optarg, 1); 595 if (ret == -1) { 596 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 597 us_vhost_usage(prgname); 598 return -1; 599 } 600 enable_tx_csum = ret; 601 break; 602 603 case OPT_TSO_NUM: 604 ret = parse_num_opt(optarg, 1); 605 if (ret == -1) { 606 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 607 us_vhost_usage(prgname); 608 return -1; 609 } 610 enable_tso = ret; 611 break; 612 613 case OPT_RX_RETRY_DELAY_NUM: 614 ret = parse_num_opt(optarg, INT32_MAX); 615 if (ret == -1) { 616 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 617 us_vhost_usage(prgname); 618 return -1; 619 } 620 burst_rx_delay_time = ret; 621 break; 622 623 case OPT_RX_RETRY_NUMB_NUM: 624 ret = parse_num_opt(optarg, INT32_MAX); 625 if (ret == -1) { 626 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 627 us_vhost_usage(prgname); 628 return -1; 629 } 630 burst_rx_retry_num = ret; 631 break; 632 633 case OPT_MERGEABLE_NUM: 634 ret = parse_num_opt(optarg, 1); 635 if (ret == -1) { 636 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 637 us_vhost_usage(prgname); 638 return -1; 639 } 640 mergeable = !!ret; 641 break; 642 643 case OPT_STATS_NUM: 644 ret = parse_num_opt(optarg, INT32_MAX); 645 if (ret == -1) { 646 RTE_LOG(INFO, VHOST_CONFIG, 647 "Invalid argument for stats [0..N]\n"); 648 us_vhost_usage(prgname); 649 return -1; 650 } 651 enable_stats = ret; 652 break; 653 654 /* Set socket file path. */ 655 case OPT_SOCKET_FILE_NUM: 656 if (us_vhost_parse_socket_path(optarg) == -1) { 657 RTE_LOG(INFO, VHOST_CONFIG, 658 "Invalid argument for socket name (Max %d characters)\n", 659 PATH_MAX); 660 us_vhost_usage(prgname); 661 return -1; 662 } 663 break; 664 665 case OPT_DMA_TYPE_NUM: 666 dma_type = optarg; 667 break; 668 669 case OPT_DMAS_NUM: 670 if (open_dma(optarg) == -1) { 671 RTE_LOG(INFO, VHOST_CONFIG, 672 "Wrong DMA args\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } 676 async_vhost_driver = 1; 677 break; 678 679 case OPT_CLIENT_NUM: 680 client_mode = 1; 681 break; 682 683 case OPT_BUILTIN_NET_DRIVER_NUM: 684 builtin_net_driver = 1; 685 break; 686 687 /* Invalid option - print options. */ 688 default: 689 us_vhost_usage(prgname); 690 return -1; 691 } 692 } 693 694 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 695 if (enabled_port_mask & (1 << i)) 696 ports[num_ports++] = i; 697 } 698 699 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 700 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 701 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 702 return -1; 703 } 704 705 return 0; 706 } 707 708 /* 709 * Update the global var NUM_PORTS and array PORTS according to system ports number 710 * and return valid ports number 711 */ 712 static unsigned check_ports_num(unsigned nb_ports) 713 { 714 unsigned valid_num_ports = num_ports; 715 unsigned portid; 716 717 if (num_ports > nb_ports) { 718 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 719 num_ports, nb_ports); 720 num_ports = nb_ports; 721 } 722 723 for (portid = 0; portid < num_ports; portid ++) { 724 if (!rte_eth_dev_is_valid_port(ports[portid])) { 725 RTE_LOG(INFO, VHOST_PORT, 726 "\nSpecified port ID(%u) is not valid\n", 727 ports[portid]); 728 ports[portid] = INVALID_PORT_ID; 729 valid_num_ports--; 730 } 731 } 732 return valid_num_ports; 733 } 734 735 static __rte_always_inline struct vhost_dev * 736 find_vhost_dev(struct rte_ether_addr *mac) 737 { 738 struct vhost_dev *vdev; 739 740 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 741 if (vdev->ready == DEVICE_RX && 742 rte_is_same_ether_addr(mac, &vdev->mac_address)) 743 return vdev; 744 } 745 746 return NULL; 747 } 748 749 /* 750 * This function learns the MAC address of the device and registers this along with a 751 * vlan tag to a VMDQ. 752 */ 753 static int 754 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 755 { 756 struct rte_ether_hdr *pkt_hdr; 757 int i, ret; 758 759 /* Learn MAC address of guest device from packet */ 760 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 761 762 if (find_vhost_dev(&pkt_hdr->src_addr)) { 763 RTE_LOG(ERR, VHOST_DATA, 764 "(%d) device is using a registered MAC!\n", 765 vdev->vid); 766 return -1; 767 } 768 769 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++) 770 vdev->mac_address.addr_bytes[i] = 771 pkt_hdr->src_addr.addr_bytes[i]; 772 773 /* vlan_tag currently uses the device_id. */ 774 vdev->vlan_tag = vlan_tags[vdev->vid]; 775 776 /* Print out VMDQ registration info. */ 777 RTE_LOG(INFO, VHOST_DATA, 778 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n", 779 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address), 780 vdev->vlan_tag); 781 782 /* Register the MAC address. */ 783 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 784 (uint32_t)vdev->vid + vmdq_pool_base); 785 if (ret) 786 RTE_LOG(ERR, VHOST_DATA, 787 "(%d) failed to add device MAC address to VMDQ\n", 788 vdev->vid); 789 790 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1); 791 792 /* Set device as ready for RX. */ 793 vdev->ready = DEVICE_RX; 794 795 return 0; 796 } 797 798 /* 799 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 800 * queue before disabling RX on the device. 801 */ 802 static inline void 803 unlink_vmdq(struct vhost_dev *vdev) 804 { 805 unsigned i = 0; 806 unsigned rx_count; 807 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 808 809 if (vdev->ready == DEVICE_RX) { 810 /*clear MAC and VLAN settings*/ 811 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 812 for (i = 0; i < 6; i++) 813 vdev->mac_address.addr_bytes[i] = 0; 814 815 vdev->vlan_tag = 0; 816 817 /*Clear out the receive buffers*/ 818 rx_count = rte_eth_rx_burst(ports[0], 819 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 820 821 while (rx_count) { 822 for (i = 0; i < rx_count; i++) 823 rte_pktmbuf_free(pkts_burst[i]); 824 825 rx_count = rte_eth_rx_burst(ports[0], 826 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 827 } 828 829 vdev->ready = DEVICE_MAC_LEARNING; 830 } 831 } 832 833 static inline void 834 free_pkts(struct rte_mbuf **pkts, uint16_t n) 835 { 836 while (n--) 837 rte_pktmbuf_free(pkts[n]); 838 } 839 840 static __rte_always_inline void 841 complete_async_pkts(struct vhost_dev *vdev) 842 { 843 struct rte_mbuf *p_cpl[MAX_PKT_BURST]; 844 uint16_t complete_count; 845 846 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid, 847 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST); 848 if (complete_count) { 849 free_pkts(p_cpl, complete_count); 850 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST); 851 } 852 853 } 854 855 static __rte_always_inline void 856 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev, 857 struct rte_mbuf *m) 858 { 859 uint16_t ret; 860 861 if (builtin_net_driver) { 862 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1); 863 } else { 864 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1); 865 } 866 867 if (enable_stats) { 868 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1, 869 __ATOMIC_SEQ_CST); 870 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret, 871 __ATOMIC_SEQ_CST); 872 src_vdev->stats.tx_total++; 873 src_vdev->stats.tx += ret; 874 } 875 } 876 877 static __rte_always_inline void 878 drain_vhost(struct vhost_dev *vdev) 879 { 880 uint16_t ret; 881 uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid; 882 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len; 883 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table; 884 885 if (builtin_net_driver) { 886 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit); 887 } else if (async_vhost_driver) { 888 uint16_t enqueue_fail = 0; 889 890 complete_async_pkts(vdev); 891 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit); 892 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST); 893 894 enqueue_fail = nr_xmit - ret; 895 if (enqueue_fail) 896 free_pkts(&m[ret], nr_xmit - ret); 897 } else { 898 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ, 899 m, nr_xmit); 900 } 901 902 if (enable_stats) { 903 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit, 904 __ATOMIC_SEQ_CST); 905 __atomic_add_fetch(&vdev->stats.rx_atomic, ret, 906 __ATOMIC_SEQ_CST); 907 } 908 909 if (!async_vhost_driver) 910 free_pkts(m, nr_xmit); 911 } 912 913 static __rte_always_inline void 914 drain_vhost_table(void) 915 { 916 uint16_t lcore_id = rte_lcore_id(); 917 struct vhost_bufftable *vhost_txq; 918 struct vhost_dev *vdev; 919 uint64_t cur_tsc; 920 921 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 922 if (unlikely(vdev->remove == 1)) 923 continue; 924 925 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE 926 + vdev->vid]; 927 928 cur_tsc = rte_rdtsc(); 929 if (unlikely(cur_tsc - vhost_txq->pre_tsc 930 > MBUF_TABLE_DRAIN_TSC)) { 931 RTE_LOG_DP(DEBUG, VHOST_DATA, 932 "Vhost TX queue drained after timeout with burst size %u\n", 933 vhost_txq->len); 934 drain_vhost(vdev); 935 vhost_txq->len = 0; 936 vhost_txq->pre_tsc = cur_tsc; 937 } 938 } 939 } 940 941 /* 942 * Check if the packet destination MAC address is for a local device. If so then put 943 * the packet on that devices RX queue. If not then return. 944 */ 945 static __rte_always_inline int 946 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 947 { 948 struct rte_ether_hdr *pkt_hdr; 949 struct vhost_dev *dst_vdev; 950 struct vhost_bufftable *vhost_txq; 951 uint16_t lcore_id = rte_lcore_id(); 952 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 953 954 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr); 955 if (!dst_vdev) 956 return -1; 957 958 if (vdev->vid == dst_vdev->vid) { 959 RTE_LOG_DP(DEBUG, VHOST_DATA, 960 "(%d) TX: src and dst MAC is same. Dropping packet.\n", 961 vdev->vid); 962 return 0; 963 } 964 965 RTE_LOG_DP(DEBUG, VHOST_DATA, 966 "(%d) TX: MAC address is local\n", dst_vdev->vid); 967 968 if (unlikely(dst_vdev->remove)) { 969 RTE_LOG_DP(DEBUG, VHOST_DATA, 970 "(%d) device is marked for removal\n", dst_vdev->vid); 971 return 0; 972 } 973 974 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid]; 975 vhost_txq->m_table[vhost_txq->len++] = m; 976 977 if (enable_stats) { 978 vdev->stats.tx_total++; 979 vdev->stats.tx++; 980 } 981 982 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) { 983 drain_vhost(dst_vdev); 984 vhost_txq->len = 0; 985 vhost_txq->pre_tsc = rte_rdtsc(); 986 } 987 return 0; 988 } 989 990 /* 991 * Check if the destination MAC of a packet is one local VM, 992 * and get its vlan tag, and offset if it is. 993 */ 994 static __rte_always_inline int 995 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m, 996 uint32_t *offset, uint16_t *vlan_tag) 997 { 998 struct vhost_dev *dst_vdev; 999 struct rte_ether_hdr *pkt_hdr = 1000 rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1001 1002 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr); 1003 if (!dst_vdev) 1004 return 0; 1005 1006 if (vdev->vid == dst_vdev->vid) { 1007 RTE_LOG_DP(DEBUG, VHOST_DATA, 1008 "(%d) TX: src and dst MAC is same. Dropping packet.\n", 1009 vdev->vid); 1010 return -1; 1011 } 1012 1013 /* 1014 * HW vlan strip will reduce the packet length 1015 * by minus length of vlan tag, so need restore 1016 * the packet length by plus it. 1017 */ 1018 *offset = VLAN_HLEN; 1019 *vlan_tag = vlan_tags[vdev->vid]; 1020 1021 RTE_LOG_DP(DEBUG, VHOST_DATA, 1022 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n", 1023 vdev->vid, dst_vdev->vid, *vlan_tag); 1024 1025 return 0; 1026 } 1027 1028 static void virtio_tx_offload(struct rte_mbuf *m) 1029 { 1030 struct rte_net_hdr_lens hdr_lens; 1031 struct rte_ipv4_hdr *ipv4_hdr; 1032 struct rte_tcp_hdr *tcp_hdr; 1033 uint32_t ptype; 1034 void *l3_hdr; 1035 1036 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); 1037 m->l2_len = hdr_lens.l2_len; 1038 m->l3_len = hdr_lens.l3_len; 1039 m->l4_len = hdr_lens.l4_len; 1040 1041 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len); 1042 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *, 1043 m->l2_len + m->l3_len); 1044 1045 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG; 1046 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) { 1047 m->ol_flags |= RTE_MBUF_F_TX_IPV4; 1048 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; 1049 ipv4_hdr = l3_hdr; 1050 ipv4_hdr->hdr_checksum = 0; 1051 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags); 1052 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */ 1053 m->ol_flags |= RTE_MBUF_F_TX_IPV6; 1054 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags); 1055 } 1056 } 1057 1058 static __rte_always_inline void 1059 do_drain_mbuf_table(struct mbuf_table *tx_q) 1060 { 1061 uint16_t count; 1062 1063 count = rte_eth_tx_burst(ports[0], tx_q->txq_id, 1064 tx_q->m_table, tx_q->len); 1065 if (unlikely(count < tx_q->len)) 1066 free_pkts(&tx_q->m_table[count], tx_q->len - count); 1067 1068 tx_q->len = 0; 1069 } 1070 1071 /* 1072 * This function routes the TX packet to the correct interface. This 1073 * may be a local device or the physical port. 1074 */ 1075 static __rte_always_inline void 1076 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1077 { 1078 struct mbuf_table *tx_q; 1079 unsigned offset = 0; 1080 const uint16_t lcore_id = rte_lcore_id(); 1081 struct rte_ether_hdr *nh; 1082 1083 1084 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1085 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) { 1086 struct vhost_dev *vdev2; 1087 1088 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) { 1089 if (vdev2 != vdev) 1090 sync_virtio_xmit(vdev2, vdev, m); 1091 } 1092 goto queue2nic; 1093 } 1094 1095 /*check if destination is local VM*/ 1096 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) 1097 return; 1098 1099 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1100 if (unlikely(find_local_dest(vdev, m, &offset, 1101 &vlan_tag) != 0)) { 1102 rte_pktmbuf_free(m); 1103 return; 1104 } 1105 } 1106 1107 RTE_LOG_DP(DEBUG, VHOST_DATA, 1108 "(%d) TX: MAC address is external\n", vdev->vid); 1109 1110 queue2nic: 1111 1112 /*Add packet to the port tx queue*/ 1113 tx_q = &lcore_tx_queue[lcore_id]; 1114 1115 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1116 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) { 1117 /* Guest has inserted the vlan tag. */ 1118 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1); 1119 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1120 if ((vm2vm_mode == VM2VM_HARDWARE) && 1121 (vh->vlan_tci != vlan_tag_be)) 1122 vh->vlan_tci = vlan_tag_be; 1123 } else { 1124 m->ol_flags |= RTE_MBUF_F_TX_VLAN; 1125 1126 /* 1127 * Find the right seg to adjust the data len when offset is 1128 * bigger than tail room size. 1129 */ 1130 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1131 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1132 m->data_len += offset; 1133 else { 1134 struct rte_mbuf *seg = m; 1135 1136 while ((seg->next != NULL) && 1137 (offset > rte_pktmbuf_tailroom(seg))) 1138 seg = seg->next; 1139 1140 seg->data_len += offset; 1141 } 1142 m->pkt_len += offset; 1143 } 1144 1145 m->vlan_tci = vlan_tag; 1146 } 1147 1148 if (m->ol_flags & RTE_MBUF_F_RX_LRO) 1149 virtio_tx_offload(m); 1150 1151 tx_q->m_table[tx_q->len++] = m; 1152 if (enable_stats) { 1153 vdev->stats.tx_total++; 1154 vdev->stats.tx++; 1155 } 1156 1157 if (unlikely(tx_q->len == MAX_PKT_BURST)) 1158 do_drain_mbuf_table(tx_q); 1159 } 1160 1161 1162 static __rte_always_inline void 1163 drain_mbuf_table(struct mbuf_table *tx_q) 1164 { 1165 static uint64_t prev_tsc; 1166 uint64_t cur_tsc; 1167 1168 if (tx_q->len == 0) 1169 return; 1170 1171 cur_tsc = rte_rdtsc(); 1172 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) { 1173 prev_tsc = cur_tsc; 1174 1175 RTE_LOG_DP(DEBUG, VHOST_DATA, 1176 "TX queue drained after timeout with burst size %u\n", 1177 tx_q->len); 1178 do_drain_mbuf_table(tx_q); 1179 } 1180 } 1181 1182 static __rte_always_inline void 1183 drain_eth_rx(struct vhost_dev *vdev) 1184 { 1185 uint16_t rx_count, enqueue_count; 1186 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1187 1188 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q, 1189 pkts, MAX_PKT_BURST); 1190 1191 if (!rx_count) 1192 return; 1193 1194 /* 1195 * When "enable_retry" is set, here we wait and retry when there 1196 * is no enough free slots in the queue to hold @rx_count packets, 1197 * to diminish packet loss. 1198 */ 1199 if (enable_retry && 1200 unlikely(rx_count > rte_vhost_avail_entries(vdev->vid, 1201 VIRTIO_RXQ))) { 1202 uint32_t retry; 1203 1204 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1205 rte_delay_us(burst_rx_delay_time); 1206 if (rx_count <= rte_vhost_avail_entries(vdev->vid, 1207 VIRTIO_RXQ)) 1208 break; 1209 } 1210 } 1211 1212 if (builtin_net_driver) { 1213 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ, 1214 pkts, rx_count); 1215 } else if (async_vhost_driver) { 1216 uint16_t enqueue_fail = 0; 1217 1218 complete_async_pkts(vdev); 1219 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid, 1220 VIRTIO_RXQ, pkts, rx_count); 1221 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST); 1222 1223 enqueue_fail = rx_count - enqueue_count; 1224 if (enqueue_fail) 1225 free_pkts(&pkts[enqueue_count], enqueue_fail); 1226 1227 } else { 1228 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ, 1229 pkts, rx_count); 1230 } 1231 1232 if (enable_stats) { 1233 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count, 1234 __ATOMIC_SEQ_CST); 1235 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count, 1236 __ATOMIC_SEQ_CST); 1237 } 1238 1239 if (!async_vhost_driver) 1240 free_pkts(pkts, rx_count); 1241 } 1242 1243 static __rte_always_inline void 1244 drain_virtio_tx(struct vhost_dev *vdev) 1245 { 1246 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1247 uint16_t count; 1248 uint16_t i; 1249 1250 if (builtin_net_driver) { 1251 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool, 1252 pkts, MAX_PKT_BURST); 1253 } else { 1254 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, 1255 mbuf_pool, pkts, MAX_PKT_BURST); 1256 } 1257 1258 /* setup VMDq for the first packet */ 1259 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) { 1260 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) 1261 free_pkts(pkts, count); 1262 } 1263 1264 for (i = 0; i < count; ++i) 1265 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]); 1266 } 1267 1268 /* 1269 * Main function of vhost-switch. It basically does: 1270 * 1271 * for each vhost device { 1272 * - drain_eth_rx() 1273 * 1274 * Which drains the host eth Rx queue linked to the vhost device, 1275 * and deliver all of them to guest virito Rx ring associated with 1276 * this vhost device. 1277 * 1278 * - drain_virtio_tx() 1279 * 1280 * Which drains the guest virtio Tx queue and deliver all of them 1281 * to the target, which could be another vhost device, or the 1282 * physical eth dev. The route is done in function "virtio_tx_route". 1283 * } 1284 */ 1285 static int 1286 switch_worker(void *arg __rte_unused) 1287 { 1288 unsigned i; 1289 unsigned lcore_id = rte_lcore_id(); 1290 struct vhost_dev *vdev; 1291 struct mbuf_table *tx_q; 1292 1293 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1294 1295 tx_q = &lcore_tx_queue[lcore_id]; 1296 for (i = 0; i < rte_lcore_count(); i++) { 1297 if (lcore_ids[i] == lcore_id) { 1298 tx_q->txq_id = i; 1299 break; 1300 } 1301 } 1302 1303 while(1) { 1304 drain_mbuf_table(tx_q); 1305 drain_vhost_table(); 1306 /* 1307 * Inform the configuration core that we have exited the 1308 * linked list and that no devices are in use if requested. 1309 */ 1310 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL) 1311 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL; 1312 1313 /* 1314 * Process vhost devices 1315 */ 1316 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, 1317 lcore_vdev_entry) { 1318 if (unlikely(vdev->remove)) { 1319 unlink_vmdq(vdev); 1320 vdev->ready = DEVICE_SAFE_REMOVE; 1321 continue; 1322 } 1323 1324 if (likely(vdev->ready == DEVICE_RX)) 1325 drain_eth_rx(vdev); 1326 1327 if (likely(!vdev->remove)) 1328 drain_virtio_tx(vdev); 1329 } 1330 } 1331 1332 return 0; 1333 } 1334 1335 /* 1336 * Remove a device from the specific data core linked list and from the 1337 * main linked list. Synchonization occurs through the use of the 1338 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 1339 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 1340 */ 1341 static void 1342 destroy_device(int vid) 1343 { 1344 struct vhost_dev *vdev = NULL; 1345 int lcore; 1346 uint16_t i; 1347 1348 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1349 if (vdev->vid == vid) 1350 break; 1351 } 1352 if (!vdev) 1353 return; 1354 /*set the remove flag. */ 1355 vdev->remove = 1; 1356 while(vdev->ready != DEVICE_SAFE_REMOVE) { 1357 rte_pause(); 1358 } 1359 1360 for (i = 0; i < RTE_MAX_LCORE; i++) 1361 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]); 1362 1363 if (builtin_net_driver) 1364 vs_vhost_net_remove(vdev); 1365 1366 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, 1367 lcore_vdev_entry); 1368 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry); 1369 1370 1371 /* Set the dev_removal_flag on each lcore. */ 1372 RTE_LCORE_FOREACH_WORKER(lcore) 1373 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL; 1374 1375 /* 1376 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL 1377 * we can be sure that they can no longer access the device removed 1378 * from the linked lists and that the devices are no longer in use. 1379 */ 1380 RTE_LCORE_FOREACH_WORKER(lcore) { 1381 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL) 1382 rte_pause(); 1383 } 1384 1385 lcore_info[vdev->coreid].device_num--; 1386 1387 RTE_LOG(INFO, VHOST_DATA, 1388 "(%d) device has been removed from data core\n", 1389 vdev->vid); 1390 1391 if (async_vhost_driver) { 1392 uint16_t n_pkt = 0; 1393 struct rte_mbuf *m_cpl[vdev->pkts_inflight]; 1394 1395 while (vdev->pkts_inflight) { 1396 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ, 1397 m_cpl, vdev->pkts_inflight); 1398 free_pkts(m_cpl, n_pkt); 1399 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST); 1400 } 1401 1402 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ); 1403 } 1404 1405 rte_free(vdev); 1406 } 1407 1408 /* 1409 * A new device is added to a data core. First the device is added to the main linked list 1410 * and then allocated to a specific data core. 1411 */ 1412 static int 1413 new_device(int vid) 1414 { 1415 int lcore, core_add = 0; 1416 uint16_t i; 1417 uint32_t device_num_min = num_devices; 1418 struct vhost_dev *vdev; 1419 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 1420 if (vdev == NULL) { 1421 RTE_LOG(INFO, VHOST_DATA, 1422 "(%d) couldn't allocate memory for vhost dev\n", 1423 vid); 1424 return -1; 1425 } 1426 vdev->vid = vid; 1427 1428 for (i = 0; i < RTE_MAX_LCORE; i++) { 1429 vhost_txbuff[i * MAX_VHOST_DEVICE + vid] 1430 = rte_zmalloc("vhost bufftable", 1431 sizeof(struct vhost_bufftable), 1432 RTE_CACHE_LINE_SIZE); 1433 1434 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) { 1435 RTE_LOG(INFO, VHOST_DATA, 1436 "(%d) couldn't allocate memory for vhost TX\n", vid); 1437 return -1; 1438 } 1439 } 1440 1441 if (builtin_net_driver) 1442 vs_vhost_net_setup(vdev); 1443 1444 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry); 1445 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base; 1446 1447 /*reset ready flag*/ 1448 vdev->ready = DEVICE_MAC_LEARNING; 1449 vdev->remove = 0; 1450 1451 /* Find a suitable lcore to add the device. */ 1452 RTE_LCORE_FOREACH_WORKER(lcore) { 1453 if (lcore_info[lcore].device_num < device_num_min) { 1454 device_num_min = lcore_info[lcore].device_num; 1455 core_add = lcore; 1456 } 1457 } 1458 vdev->coreid = core_add; 1459 1460 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, 1461 lcore_vdev_entry); 1462 lcore_info[vdev->coreid].device_num++; 1463 1464 /* Disable notifications. */ 1465 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); 1466 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); 1467 1468 RTE_LOG(INFO, VHOST_DATA, 1469 "(%d) device has been added to data core %d\n", 1470 vid, vdev->coreid); 1471 1472 if (async_vhost_driver) { 1473 struct rte_vhost_async_config config = {0}; 1474 struct rte_vhost_async_channel_ops channel_ops; 1475 1476 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) { 1477 channel_ops.transfer_data = ioat_transfer_data_cb; 1478 channel_ops.check_completed_copies = 1479 ioat_check_completed_copies_cb; 1480 1481 config.features = RTE_VHOST_ASYNC_INORDER; 1482 1483 return rte_vhost_async_channel_register(vid, VIRTIO_RXQ, 1484 config, &channel_ops); 1485 } 1486 } 1487 1488 return 0; 1489 } 1490 1491 static int 1492 vring_state_changed(int vid, uint16_t queue_id, int enable) 1493 { 1494 struct vhost_dev *vdev = NULL; 1495 1496 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1497 if (vdev->vid == vid) 1498 break; 1499 } 1500 if (!vdev) 1501 return -1; 1502 1503 if (queue_id != VIRTIO_RXQ) 1504 return 0; 1505 1506 if (async_vhost_driver) { 1507 if (!enable) { 1508 uint16_t n_pkt = 0; 1509 struct rte_mbuf *m_cpl[vdev->pkts_inflight]; 1510 1511 while (vdev->pkts_inflight) { 1512 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id, 1513 m_cpl, vdev->pkts_inflight); 1514 free_pkts(m_cpl, n_pkt); 1515 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST); 1516 } 1517 } 1518 } 1519 1520 return 0; 1521 } 1522 1523 /* 1524 * These callback allow devices to be added to the data core when configuration 1525 * has been fully complete. 1526 */ 1527 static const struct rte_vhost_device_ops virtio_net_device_ops = 1528 { 1529 .new_device = new_device, 1530 .destroy_device = destroy_device, 1531 .vring_state_changed = vring_state_changed, 1532 }; 1533 1534 /* 1535 * This is a thread will wake up after a period to print stats if the user has 1536 * enabled them. 1537 */ 1538 static void * 1539 print_stats(__rte_unused void *arg) 1540 { 1541 struct vhost_dev *vdev; 1542 uint64_t tx_dropped, rx_dropped; 1543 uint64_t tx, tx_total, rx, rx_total; 1544 const char clr[] = { 27, '[', '2', 'J', '\0' }; 1545 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 1546 1547 while(1) { 1548 sleep(enable_stats); 1549 1550 /* Clear screen and move to top left */ 1551 printf("%s%s\n", clr, top_left); 1552 printf("Device statistics =================================\n"); 1553 1554 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1555 tx_total = vdev->stats.tx_total; 1556 tx = vdev->stats.tx; 1557 tx_dropped = tx_total - tx; 1558 1559 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic, 1560 __ATOMIC_SEQ_CST); 1561 rx = __atomic_load_n(&vdev->stats.rx_atomic, 1562 __ATOMIC_SEQ_CST); 1563 rx_dropped = rx_total - rx; 1564 1565 printf("Statistics for device %d\n" 1566 "-----------------------\n" 1567 "TX total: %" PRIu64 "\n" 1568 "TX dropped: %" PRIu64 "\n" 1569 "TX successful: %" PRIu64 "\n" 1570 "RX total: %" PRIu64 "\n" 1571 "RX dropped: %" PRIu64 "\n" 1572 "RX successful: %" PRIu64 "\n", 1573 vdev->vid, 1574 tx_total, tx_dropped, tx, 1575 rx_total, rx_dropped, rx); 1576 } 1577 1578 printf("===================================================\n"); 1579 1580 fflush(stdout); 1581 } 1582 1583 return NULL; 1584 } 1585 1586 static void 1587 unregister_drivers(int socket_num) 1588 { 1589 int i, ret; 1590 1591 for (i = 0; i < socket_num; i++) { 1592 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX); 1593 if (ret != 0) 1594 RTE_LOG(ERR, VHOST_CONFIG, 1595 "Fail to unregister vhost driver for %s.\n", 1596 socket_files + i * PATH_MAX); 1597 } 1598 } 1599 1600 /* When we receive a INT signal, unregister vhost driver */ 1601 static void 1602 sigint_handler(__rte_unused int signum) 1603 { 1604 /* Unregister vhost driver. */ 1605 unregister_drivers(nb_sockets); 1606 1607 exit(0); 1608 } 1609 1610 /* 1611 * While creating an mbuf pool, one key thing is to figure out how 1612 * many mbuf entries is enough for our use. FYI, here are some 1613 * guidelines: 1614 * 1615 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage 1616 * 1617 * - For each switch core (A CPU core does the packet switch), we need 1618 * also make some reservation for receiving the packets from virtio 1619 * Tx queue. How many is enough depends on the usage. It's normally 1620 * a simple calculation like following: 1621 * 1622 * MAX_PKT_BURST * max packet size / mbuf size 1623 * 1624 * So, we definitely need allocate more mbufs when TSO is enabled. 1625 * 1626 * - Similarly, for each switching core, we should serve @nr_rx_desc 1627 * mbufs for receiving the packets from physical NIC device. 1628 * 1629 * - We also need make sure, for each switch core, we have allocated 1630 * enough mbufs to fill up the mbuf cache. 1631 */ 1632 static void 1633 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size, 1634 uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache) 1635 { 1636 uint32_t nr_mbufs; 1637 uint32_t nr_mbufs_per_core; 1638 uint32_t mtu = 1500; 1639 1640 if (mergeable) 1641 mtu = 9000; 1642 if (enable_tso) 1643 mtu = 64 * 1024; 1644 1645 nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST / 1646 (mbuf_size - RTE_PKTMBUF_HEADROOM); 1647 nr_mbufs_per_core += nr_rx_desc; 1648 nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache); 1649 1650 nr_mbufs = nr_queues * nr_rx_desc; 1651 nr_mbufs += nr_mbufs_per_core * nr_switch_core; 1652 nr_mbufs *= nr_port; 1653 1654 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs, 1655 nr_mbuf_cache, 0, mbuf_size, 1656 rte_socket_id()); 1657 if (mbuf_pool == NULL) 1658 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 1659 } 1660 1661 /* 1662 * Main function, does initialisation and calls the per-lcore functions. 1663 */ 1664 int 1665 main(int argc, char *argv[]) 1666 { 1667 unsigned lcore_id, core_id = 0; 1668 unsigned nb_ports, valid_num_ports; 1669 int ret, i; 1670 uint16_t portid; 1671 static pthread_t tid; 1672 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS; 1673 1674 signal(SIGINT, sigint_handler); 1675 1676 /* init EAL */ 1677 ret = rte_eal_init(argc, argv); 1678 if (ret < 0) 1679 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 1680 argc -= ret; 1681 argv += ret; 1682 1683 /* parse app arguments */ 1684 ret = us_vhost_parse_args(argc, argv); 1685 if (ret < 0) 1686 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 1687 1688 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 1689 TAILQ_INIT(&lcore_info[lcore_id].vdev_list); 1690 1691 if (rte_lcore_is_enabled(lcore_id)) 1692 lcore_ids[core_id++] = lcore_id; 1693 } 1694 1695 if (rte_lcore_count() > RTE_MAX_LCORE) 1696 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 1697 1698 /* Get the number of physical ports. */ 1699 nb_ports = rte_eth_dev_count_avail(); 1700 1701 /* 1702 * Update the global var NUM_PORTS and global array PORTS 1703 * and get value of var VALID_NUM_PORTS according to system ports number 1704 */ 1705 valid_num_ports = check_ports_num(nb_ports); 1706 1707 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 1708 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 1709 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 1710 return -1; 1711 } 1712 1713 /* 1714 * FIXME: here we are trying to allocate mbufs big enough for 1715 * @MAX_QUEUES, but the truth is we're never going to use that 1716 * many queues here. We probably should only do allocation for 1717 * those queues we are going to use. 1718 */ 1719 create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE, 1720 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE); 1721 1722 if (vm2vm_mode == VM2VM_HARDWARE) { 1723 /* Enable VT loop back to let L2 switch to do it. */ 1724 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 1725 RTE_LOG(DEBUG, VHOST_CONFIG, 1726 "Enable loop back for L2 switch in vmdq.\n"); 1727 } 1728 1729 /* initialize all ports */ 1730 RTE_ETH_FOREACH_DEV(portid) { 1731 /* skip ports that are not enabled */ 1732 if ((enabled_port_mask & (1 << portid)) == 0) { 1733 RTE_LOG(INFO, VHOST_PORT, 1734 "Skipping disabled port %d\n", portid); 1735 continue; 1736 } 1737 if (port_init(portid) != 0) 1738 rte_exit(EXIT_FAILURE, 1739 "Cannot initialize network ports\n"); 1740 } 1741 1742 /* Enable stats if the user option is set. */ 1743 if (enable_stats) { 1744 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL, 1745 print_stats, NULL); 1746 if (ret < 0) 1747 rte_exit(EXIT_FAILURE, 1748 "Cannot create print-stats thread\n"); 1749 } 1750 1751 /* Launch all data cores. */ 1752 RTE_LCORE_FOREACH_WORKER(lcore_id) 1753 rte_eal_remote_launch(switch_worker, NULL, lcore_id); 1754 1755 if (client_mode) 1756 flags |= RTE_VHOST_USER_CLIENT; 1757 1758 /* Register vhost user driver to handle vhost messages. */ 1759 for (i = 0; i < nb_sockets; i++) { 1760 char *file = socket_files + i * PATH_MAX; 1761 1762 if (async_vhost_driver) 1763 flags = flags | RTE_VHOST_USER_ASYNC_COPY; 1764 1765 ret = rte_vhost_driver_register(file, flags); 1766 if (ret != 0) { 1767 unregister_drivers(i); 1768 rte_exit(EXIT_FAILURE, 1769 "vhost driver register failure.\n"); 1770 } 1771 1772 if (builtin_net_driver) 1773 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES); 1774 1775 if (mergeable == 0) { 1776 rte_vhost_driver_disable_features(file, 1777 1ULL << VIRTIO_NET_F_MRG_RXBUF); 1778 } 1779 1780 if (enable_tx_csum == 0) { 1781 rte_vhost_driver_disable_features(file, 1782 1ULL << VIRTIO_NET_F_CSUM); 1783 } 1784 1785 if (enable_tso == 0) { 1786 rte_vhost_driver_disable_features(file, 1787 1ULL << VIRTIO_NET_F_HOST_TSO4); 1788 rte_vhost_driver_disable_features(file, 1789 1ULL << VIRTIO_NET_F_HOST_TSO6); 1790 rte_vhost_driver_disable_features(file, 1791 1ULL << VIRTIO_NET_F_GUEST_TSO4); 1792 rte_vhost_driver_disable_features(file, 1793 1ULL << VIRTIO_NET_F_GUEST_TSO6); 1794 } 1795 1796 if (promiscuous) { 1797 rte_vhost_driver_enable_features(file, 1798 1ULL << VIRTIO_NET_F_CTRL_RX); 1799 } 1800 1801 ret = rte_vhost_driver_callback_register(file, 1802 &virtio_net_device_ops); 1803 if (ret != 0) { 1804 rte_exit(EXIT_FAILURE, 1805 "failed to register vhost driver callbacks.\n"); 1806 } 1807 1808 if (rte_vhost_driver_start(file) < 0) { 1809 rte_exit(EXIT_FAILURE, 1810 "failed to start vhost driver.\n"); 1811 } 1812 } 1813 1814 RTE_LCORE_FOREACH_WORKER(lcore_id) 1815 rte_eal_wait_lcore(lcore_id); 1816 1817 /* clean up the EAL */ 1818 rte_eal_cleanup(); 1819 1820 return 0; 1821 } 1822