/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX	    1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Enable TX checksum offload (disabled by default). */
static uint32_t enable_tx_csum;

/* Enable TSO offload (disabled by default). */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char *dma_type;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDQ configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * it fixes the bug that IPv4 forwarding in the guest
		 * cannot forward packets from one virtio dev to
		 * another virtio dev.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used to batch packets for enqueue, for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN	4
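/*
 * Worked example for MBUF_TABLE_DRAIN_TSC above (the 2.5 GHz TSC rate is
 * only an assumed, illustrative value; the real rate comes from
 * rte_get_tsc_hz() at run time):
 *
 *   cycles_per_us = (2500000000 + 1000000 - 1) / 1000000 = 2500
 *   MBUF_TABLE_DRAIN_TSC = 2500 * BURST_TX_DRAIN_US = 250000 cycles
 *
 * i.e. a partially filled TX buffer is flushed roughly every 100 us.
 */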
static inline int
open_dma(const char *value)
{
	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/*
	 * The max pool number from dev_info will be used to validate the
	 * pool number specified in the command line.
	 */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);
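	/*
	 * Illustrative numbers only (the real values come from dev_info):
	 * if the NIC reports max_rx_queues = 128, vmdq_queue_num = 120 and
	 * max_vmdq_pools = 60, then num_pf_queues = 8, queues_per_pool = 2,
	 * and with num_devices = 60 pools this gives num_vmdq_queues = 120
	 * and num_queues = 128.
	 */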
	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* reject socket paths that do not fit in PATH_MAX */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}

enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE            "dma-type"
	OPT_DMA_TYPE_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
};
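/*
 * Illustrative invocation of this example, combining the options listed in
 * us_vhost_usage() above. The binary name, EAL core list and socket path
 * below are assumptions and depend on the build and the target system:
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/sock0 --mergeable 1 --stats 2
 *
 * This enables port 0, creates one vhost-user socket, turns on mergeable
 * RX buffers and prints statistics every 2 seconds.
 */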
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMA_TYPE, required_argument,
				NULL, OPT_DMA_TYPE_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			if (ret) {
				vmdq_conf_default.rxmode.offloads |=
					DEV_RX_OFFLOAD_JUMBO_FRAME;
				vmdq_conf_default.rxmode.max_rx_pkt_len
					= JUMBO_FRAME_MAX_SIZE;
			}
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
				"Invalid argument for socket name (Max %d characters)\n",
				PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMA_TYPE_NUM:
			dma_type = optarg;
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			async_vhost_driver = 1;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number
 * of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it
 * along with a vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}
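/*
 * The per-lcore, per-device TX buffer used below is indexed as
 * lcore_id * MAX_VHOST_DEVICE + vid. For example, if MAX_VHOST_DEVICE
 * were 1024, lcore 2 and vid 3 would map to slot 2051 of vhost_txbuff[]
 * (an illustrative calculation only; the actual constant is defined in
 * this example's main.h).
 */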
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[nr_xmit];

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);

		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
						+ vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if
 * so get its vlan tag and offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip will reduce the packet length by the length of
	 * the vlan tag, so we need to restore the packet length by
	 * adding it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct rte_ipv4_hdr *ipv4_hdr = NULL;
	struct rte_tcp_hdr *tcp_hdr = NULL;
	struct rte_ether_hdr *eth_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count,
					m_cpu_cpl, &cpu_cpl_nr);
		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (async_vhost_driver)
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
			  "(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	if (async_vhost_driver) {
		struct rte_vhost_async_features f;
		struct rte_vhost_async_channel_ops channel_ops;

		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
			channel_ops.transfer_data = ioat_transfer_data_cb;
			channel_ops.check_completed_copies =
				ioat_check_completed_copies_cb;

			f.async_inorder = 1;
			f.async_threshold = 256;

			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				f.intval, &channel_ops);
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up after a period to print stats if the user
 * has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we also need to make some reservation for receiving the packets
 *   from the virtio Tx queue. How many is enough depends on the usage.
 *   It's normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
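/*
 * Worked example of the guideline above, with assumed values: a burst of
 * MAX_PKT_BURST = 32, a 2176-byte mbuf (2048-byte data room plus a typical
 * 128-byte headroom) and mergeable buffers enabled (mtu = 9000):
 *
 *   nr_mbufs_per_core = (9000 + 2176) * 32 / (2176 - 128) + 1024
 *                     = 174 + 1024 = 1198
 *
 * which create_mbuf_pool() below then scales by the number of queues,
 * switch cores and ports.
 */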
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get the value of var VALID_NUM_PORTS according to the
	 * number of system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (async_vhost_driver)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}