/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64
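/*
 * Note on the timing constants above: with the defaults, an RX burst that
 * finds the guest vring short of space waits at most
 * BURST_RX_RETRIES * BURST_RX_WAIT_US = 4 * 15 us = 60 us before the
 * overflow is dropped, and buffered TX packets are flushed to the NIC
 * roughly every BURST_TX_DRAIN_US (100 us). These are rough figures; the
 * exact behaviour is in drain_eth_rx() and drain_mbuf_table() below.
 */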

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is needed for 1G NICs such as the I350; it fixes a bug
		 * where IPv4 forwarding in the guest could not forward packets
		 * from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
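/*
 * Illustration of how the table above is used: get_eth_conf() maps
 * vlan_tags[i] to VMDQ pool i (pool mask 1 << i), and link_vmdq() later
 * tags traffic from vhost device 'vid' with vlan_tags[vid]. So, with the
 * default settings, the first vhost device (vid 0) uses VLAN 1000 and the
 * first VMDQ pool, the second VLAN 1001 and the second pool, and so on.
 */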

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN	4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number gotten from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
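	/*
	 * Worked example (illustrative figures only): a PMD reporting
	 * max_rx_queues = 128, vmdq_queue_num = 128 and max_vmdq_pools = 64
	 * yields num_pf_queues = 0, queues_per_pool = 2 and, with 64 devices,
	 * num_vmdq_queues = 128. The real values depend on the NIC in use.
	 */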
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	if (enable_tx_csum == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

	if (enable_tso == 0) {
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO6);
	}

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		(unsigned)port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	/* parse socket path string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;

	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
	nb_sockets++;

	return 0;
}
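
/*
 * Layout of socket_files: paths are stored in one flat buffer of fixed
 * PATH_MAX-sized slots, so the i-th path lives at
 * socket_files + i * PATH_MAX; that is how the register/unregister calls
 * further down index into it.
 */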

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
	"		--client register a vhost-user socket as client mode.\n",
	       prgname);
}
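
/*
 * A typical invocation, assuming the example is built as "vhost-switch"
 * (EAL options are illustrative and depend on the target machine):
 *
 *   ./vhost-switch -c 0xe -n 4 -- -p 0x1 \
 *       --socket-file /tmp/sock0 --stats 2
 */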

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				     MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the RX retry delay time (in microseconds). */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of RX retries. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
				     "socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for socket name (Max %d characters)\n",
						PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

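		/*
		 * Note: every long option in long_option[] reaches "case 0"
		 * above -- entries with a NULL flag field carry val = 0, and
		 * --client sets client_mode through its flag pointer, after
		 * which getopt_long() also returns 0. Anything unrecognised
		 * comes back as '?' and is handled below.
		 */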
		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * ports present on the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				       (uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
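
/*
 * How the binding above takes effect: rte_eth_dev_mac_addr_add() with pool
 * index (vid + vmdq_pool_base) tells the NIC to steer frames destined to
 * the learned MAC into that device's VMDQ pool, and the per-pool VLAN from
 * vlan_tags[] keeps the pools isolated from each other. In VM2VM_HARDWARE
 * mode this is what lets the NIC switch guest-to-guest traffic by itself.
 */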

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void __attribute__((always_inline))
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}
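
/*
 * Note on the two VM2VM flavours: in VM2VM_SOFTWARE mode the packet is
 * handed straight to the destination guest with rte_vhost_enqueue_burst()
 * (virtio_xmit() above); in VM2VM_HARDWARE mode it is instead sent out on
 * the physical port and relies on the VMDQ loopback enabled in main() to
 * bring it back into the destination pool.
 */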

/*
 * Check if the destination MAC of a packet belongs to a local VM and, if it
 * does, return its vlan tag and the length offset to apply.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
		uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip has reduced the packet length by the size of the
	 * vlan tag, so the length needs to be restored by adding it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
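
/*
 * Background for virtio_tx_offload(): following the usual DPDK TSO
 * convention, a packet flagged with PKT_TX_TCP_SEG should have its IP
 * checksum field cleared and the TCP checksum field pre-loaded with the
 * pseudo-header checksum, which is what the function above prepares
 * before the mbuf is queued for transmission.
 */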

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
		    (vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
				       (offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static inline void __attribute__((always_inline))
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry when there are
	 * not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of them to the guest virtio Rx ring associated
 *      with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of them
 *      to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
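/*
 * Each worker only walks the devices on its own per-lcore list
 * (lcore_info[lcore_id].vdev_list); new_device() below assigns every new
 * vhost device to the least loaded core, so the loop here never touches
 * devices owned by other cores.
 */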
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
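/*
 * The device's VMDQ RX queue is derived from its vid:
 * vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base. For example, with
 * queues_per_pool = 2 and vmdq_queue_base = 0 (illustrative values), the
 * third device (vid 2) reads from hardware queue 4.
 */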
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};

/*
 * This thread wakes up after a period to print stats if the user has
 * enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}
}
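
/*
 * A note on the counters printed above: the RX counters are rte_atomic64
 * because several cores may enqueue into the same guest (broadcast and
 * VM2VM paths), whereas the TX counters are plain fields updated only by
 * the core that owns the device, so they need no atomics.
 */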

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to reserve some mbufs for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}
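
/*
 * Rough example with the defaults (mergeable and TSO off, so mtu = 1500,
 * and the usual 2048-byte data room + 128-byte headroom mbufs):
 * nr_mbufs_per_core ~= (1500 + 2176) * 32 / 2048 + 1024 ~= 1081, and with
 * one port, MAX_QUEUES (128) queues of 1024 descriptors plus a few worker
 * cores the pool ends up in the 130k-140k mbuf range. The figures shift
 * with --mergeable, --tso and the actual mbuf size.
 */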

/*
 * Main function, does initialisation and calls the per-lcore functions. The CUSE
 * device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();

	/*
	 * Update the global vars NUM_PORTS and PORTS according to the number
	 * of ports present on the system, and get the number of valid ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (mergeable == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		ret = rte_vhost_driver_register
			(socket_files + i * PATH_MAX, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}
	}

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	/* Start CUSE session. */
	rte_vhost_driver_session_start();
	return 0;

}