/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
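/*
 * In software mode guest-to-guest traffic is switched locally, based on the
 * MAC addresses learned from the guests. In hardware mode the packets are
 * sent out to the NIC and the VMDq loop back feature (enabled in main())
 * is relied on to deliver them back to the destination VM's pool.
 */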
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char dma_type[MAX_LONG_OPT_SZ];

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * this fixes the bug where IPv4 forwarding in the guest
		 * could not forward packets from one virtio dev to another
		 * virtio dev.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
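
/*
 * For example, with a TSC running at 2.0 GHz (an illustrative value; the
 * real rate is queried at run time), MBUF_TABLE_DRAIN_TSC evaluates to
 * 2000000000 / 1000000 * 100 = 200000 cycles, i.e. roughly 100 us.
 */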
#define VLAN_HLEN 4

static inline int
open_dma(const char *value)
{
	if (strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/*
	 * The max pool number from dev_info will be used to validate the
	 * pool number specified in cmd line.
	 */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);
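
	/*
	 * For example, a NIC reporting max_rx_queues = 128, vmdq_queue_num =
	 * 128 and max_vmdq_pools = 64 (illustrative values only) ends up with
	 * 0 PF queues and 2 queues per pool, i.e. 128 VMDq queues serving 64
	 * virtio devices.
	 */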

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* parse socket path string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --socket-file <path>\n"
	" --nb-devices ND\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This has effect only if retries on rx are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on rx. This has effect only if retries on rx are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --socket-file: The path of the socket file.\n"
	" --tx-csum [0|1] disable/enable TX checksum offload.\n"
	" --tso [0|1] disable/enable TCP segmentation offload.\n"
	" --client register a vhost-user socket as client mode.\n"
	" --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
	" --dmas register dma channel for specific vhost device.\n",
	prgname);
}

enum {
#define OPT_VM2VM "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE "dma-type"
	OPT_DMA_TYPE_NUM,
#define OPT_DMAS "dmas"
	OPT_DMAS_NUM,
};
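
/*
 * Example invocation (illustrative only; the EAL core/memory options and
 * the port mask depend on the target system):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --mergeable 1 --stats 1 --client
 */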

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMA_TYPE, required_argument,
				NULL, OPT_DMA_TYPE_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			if (ret) {
				vmdq_conf_default.rxmode.offloads |=
					DEV_RX_OFFLOAD_JUMBO_FRAME;
				vmdq_conf_default.rxmode.max_rx_pkt_len
					= JUMBO_FRAME_MAX_SIZE;
			}
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMA_TYPE_NUM:
			strlcpy(dma_type, optarg, MAX_LONG_OPT_SZ);
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			async_vhost_driver = 1;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number
 * of system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
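
/*
 * Find the vhost device with the given MAC address among the devices that
 * have already learned their MAC (i.e. are ready for RX).
 */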
static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this
 * along with a vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}
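
/*
 * With the async vhost data path (--dmas), rte_vhost_submit_enqueue_burst()
 * may leave copies in flight on the DMA device. Poll for the enqueues that
 * have completed in the meantime so their mbufs can be freed here; packets
 * that were copied directly by the CPU are returned by the submit call
 * itself and freed by its callers.
 */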
static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[nr_xmit];

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);

		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(m, nr_xmit);
}
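
/*
 * Walk all vhost devices and flush any per-lcore vhost TX buffer whose
 * packets have been waiting longer than MBUF_TABLE_DRAIN_TSC.
 */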
static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
						+ vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet is for a local VM, and if so get
 * its vlan tag and the length offset to restore.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip will reduce the packet length by the size of the vlan
	 * tag, so we need to restore the packet length by adding it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}
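
/*
 * Prepare a TSO packet coming from the guest for transmission on the
 * physical port: request IP checksum offload for IPv4 and fill in the TCP
 * pseudo-header checksum expected by the NIC. This relies on l2_len/l3_len
 * and the offload flags having been set from the virtio-net header when
 * the packet was dequeued.
 */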
static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct rte_ipv4_hdr *ipv4_hdr = NULL;
	struct rte_tcp_hdr *tcp_hdr = NULL;
	struct rte_ether_hdr *eth_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}
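
/*
 * Receive a burst of packets from the VMDq Rx queue bound to this vhost
 * device and enqueue them to the guest Rx virtqueue, optionally retrying
 * for a while when the virtqueue does not have enough free entries.
 */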
static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count,
					m_cpu_cpl, &cpu_cpl_nr);
		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(pkts, rx_count);
}
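
/*
 * Dequeue a burst of packets from the guest Tx virtqueue. The first packet
 * from a device still in MAC learning state is used to set up its VMDq
 * binding; every dequeued packet is then routed by virtio_tx_route().
 */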
static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
			free_pkts(pkts, count);
			return;
		}
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (async_vhost_driver)
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);
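
	/*
	 * When the async data path is enabled, register an async channel
	 * backed by the IOAT callbacks for this device's Rx queue.
	 * async_inorder and async_threshold describe the DMA channel to the
	 * vhost library; see the rte_vhost async API documentation for their
	 * exact semantics.
	 */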
	if (async_vhost_driver) {
		struct rte_vhost_async_features f;
		struct rte_vhost_async_channel_ops channel_ops;

		if (strncmp(dma_type, "ioat", 4) == 0) {
			channel_ops.transfer_data = ioat_transfer_data_cb;
			channel_ops.check_completed_copies =
				ioat_check_completed_copies_cb;

			f.async_inorder = 1;
			f.async_threshold = 256;

			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				f.intval, &channel_ops);
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print statistics if the
 * user has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive a SIGINT, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to make some reservation for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's normally
 *   a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
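 *
 * As a rough, illustrative example with the defaults in this file (1024 Rx
 * descriptors, ~2 KB mbuf data rooms) and mergeable buffers enabled
 * (9000 byte MTU), each switch core works out to roughly
 * (9000 + 2176) * MAX_PKT_BURST / 2048 + 1024 mbufs, i.e. about 1200 mbufs
 * per core when MAX_PKT_BURST is 32, before the cache-size floor is applied.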
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get the value of VALID_NUM_PORTS according to the number
	 * of system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (async_vhost_driver)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	return 0;
}