/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char *dma_type;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDQ configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * it fixes a bug where IPv4 forwarding in the guest
		 * cannot forward packets from one virtio dev to another.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN	4

static inline int
open_dma(const char *value)
{
	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
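/*
 * Note: each VMDQ pool i below is matched to vlan_tags[i], so with the
 * default table pool 0 receives VLAN 1000, pool 1 receives VLAN 1001,
 * and so on (one pool, and hence one VLAN, per emulated virtio device).
 */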
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* reject overly long paths */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. This only takes effect if retries on Rx are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on Rx. This only takes effect if retries on Rx are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segment offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}

enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE            "dma-type"
	OPT_DMA_TYPE_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
};
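/*
 * For reference, a typical invocation looks like the following
 * (binary name, core list, portmask and socket path are illustrative only):
 *
 *   dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --mergeable 1 --stats 1
 */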
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMA_TYPE, required_argument,
				NULL, OPT_DMA_TYPE_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			if (ret) {
				vmdq_conf_default.rxmode.offloads |=
					DEV_RX_OFFLOAD_JUMBO_FRAME;
				vmdq_conf_default.rxmode.max_rx_pkt_len
					= JUMBO_FRAME_MAX_SIZE;
			}
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
				"Invalid argument for socket name (Max %d characters)\n",
				PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMA_TYPE_NUM:
			dma_type = optarg;
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			async_vhost_driver = 1;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}
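/*
 * Note on the async (DMA) enqueue path used below: buffers handed to
 * rte_vhost_submit_enqueue_burst() stay in flight until
 * rte_vhost_poll_enqueue_completed() reports them done (see
 * complete_async_pkts() above), which is what the pkts_inflight counter
 * tracks. Packets copied synchronously by the CPU (m_cpu_cpl) are freed
 * right away.
 */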
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[nr_xmit];

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
		__atomic_add_fetch(&vdev->pkts_inflight, ret - cpu_cpl_nr, __ATOMIC_SEQ_CST);

		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
						+ vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();

	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM,
 * and if so get its vlan tag and offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the length of the
	 * vlan tag, so the packet length needs to be restored by adding
	 * it back.
	 */
	*offset  = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= PKT_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= PKT_TX_IPV4;
		m->ol_flags |= PKT_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= PKT_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count,
					m_cpu_cpl, &cpu_cpl_nr);
		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count - cpu_cpl_nr,
					__ATOMIC_SEQ_CST);

		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}
	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (async_vhost_driver) {
		uint16_t n_pkt = 0;
		struct rte_mbuf *m_cpl[vdev->pkts_inflight];

		while (vdev->pkts_inflight) {
			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
						m_cpl, vdev->pkts_inflight);
			free_pkts(m_cpl, n_pkt);
			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
		}

		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
	}

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
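	/*
	 * The data cores busy-poll the virtio rings in switch_worker(),
	 * so guest->host kick notifications are not needed here.
	 */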
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	if (async_vhost_driver) {
		struct rte_vhost_async_config config = {0};
		struct rte_vhost_async_channel_ops channel_ops;

		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
			channel_ops.transfer_data = ioat_transfer_data_cb;
			channel_ops.check_completed_copies =
				ioat_check_completed_copies_cb;

			config.features = RTE_VHOST_ASYNC_INORDER;
			config.async_threshold = 256;

			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				config, &channel_ops);
		}
	}

	return 0;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (async_vhost_driver) {
		if (!enable) {
			uint16_t n_pkt = 0;
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This is a thread that wakes up periodically to print stats if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:       %" PRIu64 "\n"
				"TX dropped:     %" PRIu64 "\n"
				"TX successful:  %" PRIu64 "\n"
				"RX total:       %" PRIu64 "\n"
				"RX dropped:     %" PRIu64 "\n"
				"RX successful:  %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core doing the packet switching), we need
 *   to also make some reservation for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, that we have
 *   allocated enough mbufs to fill up the mbuf cache.
 */
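/*
 * A rough worked example of the sizing below (numbers illustrative):
 * with mergeable buffers and TSO disabled, mtu stays at 1500, so each
 * switch core reserves about
 *     (mtu + mbuf_size) * MAX_PKT_BURST / (mbuf_size - RTE_PKTMBUF_HEADROOM)
 * mbufs for the virtio Tx path plus nr_rx_desc for the NIC; the total is
 * then scaled by the number of queues and ports.
 */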
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (async_vhost_driver)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}