/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

static int builtin_net_driver;

/* Specify timeout (in useconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* Empty VMDq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio dev to another
		 * virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN	4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
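 *
 * As an illustration: with num_devices = 8, the loop below maps VLAN IDs
 * 1000..1007 (taken from vlan_tags[]) to pools 0..7, one pool per vhost
 * device.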
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/*
	 * When dequeue zero copy is enabled, the guest Tx used vring will be
	 * updated only when the corresponding mbuf is freed. Thus, the
	 * nb_tx_desc (tx_ring_size here) must be small enough so that the
	 * driver hits the free threshold easily and frees mbufs in a timely
	 * manner. Otherwise, the guest Tx vring would be starved.
	 */
	if (dequeue_zero_copy)
		tx_ring_size = 64;

	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues.
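	 *
	 * As a rough illustration (actual values come from
	 * rte_eth_dev_info_get()): a NIC reporting 128 Rx queues, 128 VMDq
	 * queues and 64 VMDq pools yields num_pf_queues = 0 and
	 * queues_per_pool = 2, so num_vmdq_queues = 2 * num_devices.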
	 */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	/* reject paths that do not fit within PATH_MAX */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
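 *
 * The mask is given in hex; bit i selects port i. For example, "-p 0x1"
 * selects port 0 only (this application supports a single external port).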
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. This only takes effect if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"		--tso [0|1]: disable/enable TCP segmentation offload.\n"
	"		--client: register a vhost-user socket as client mode.\n"
	"		--dequeue-zero-copy: enable dequeue zero copy\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
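 *
 * A typical invocation (binary name and EAL arguments are illustrative) is:
 *
 *   ./vhost-switch -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 2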
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retries delay time (in useconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the retries number on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
						"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}

/*
 * Check if the destination MAC of a packet is a local VM,
 * and if so, get its vlan tag and offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the length of the
	 * vlan tag, so the packet length has to be restored by adding
	 * it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
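 *
 * In summary (following the code below): broadcast frames are copied to
 * every other local vhost device and then also queued for the NIC; in
 * VM2VM_SOFTWARE mode a unicast frame whose destination MAC matches a local
 * device is delivered directly to that device; everything else is queued
 * for the physical port, tagged with the source device's VLAN.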
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* Check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}

static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry here when there are
	 * not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
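	 *
	 * With the defaults (burst_rx_retry_num = 4, burst_rx_delay_time =
	 * 15us) this waits at most roughly 4 * 15 = 60 microseconds per
	 * burst before giving up; packets that still cannot be enqueued are
	 * dropped when the burst is freed below.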
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or the
 *      physical eth dev. The routing is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag.
 * The device is made volatile here to avoid re-ordering of dev->remove=1,
 * which could cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main
 * linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* Reset the ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print stats if the user has
 * enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:       %" PRIu64 "\n"
				"TX dropped:     %" PRIu64 "\n"
				"TX successful:  %" PRIu64 "\n"
				"RX total:       %" PRIu64 "\n"
				"RX dropped:     %" PRIu64 "\n"
				"RX successful:  %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to make some reservation for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's normally
 *   a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should also reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, that we have allocated
 *   enough mbufs to fill up the mbuf cache.
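 *
 * As a rough worked example (assuming MAX_PKT_BURST is 32 and the default
 * mbuf data size of 2048 bytes plus 128 bytes of headroom): with TSO
 * enabled, "mtu" below becomes 64KB, so a single switch core needs about
 * (65536 + 2176) * 32 / (2176 - 128) ~= 1058 mbufs for virtio Tx, on top of
 * the @nr_rx_desc mbufs reserved per Rx queue.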
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	if (dequeue_zero_copy)
		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	return 0;
}