/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100		/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15		/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4		/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600
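/*
 * Note: 0x2600 is 9728 bytes, which leaves room for a 9000-byte jumbo
 * payload plus Ethernet and VLAN headers.
 */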

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do VLAN strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest cannot
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN	4

/* Per-device statistics struct */
struct device_statistics {
	uint64_t	tx_total;
	rte_atomic64_t	rx_total_atomic;
	uint64_t	tx;
	rte_atomic64_t	rx_atomic;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
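/*
 * Note: the RX counters are atomic because a device's RX statistics can be
 * updated by whichever core owns the *sending* device on the VM2VM path
 * (see virtio_xmit()), while the TX counters are only ever touched by the
 * core that owns the transmitting device.
 */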

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
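
/*
 * Example (illustrative): with num_devices == 2, the map built above is
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x1 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }
 * i.e. frames tagged with VLAN 1000 are steered to VMDQ pool 0 and frames
 * tagged with VLAN 1001 to pool 1.
 */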

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified in the command line. */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	if (enable_tx_csum == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

	if (enable_tso == 0) {
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
	}

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}
	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		(unsigned)port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n",
	       prgname);
}
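
/*
 * Illustrative invocation (EAL options first, application options after "--"),
 * assuming the example is built as "vhost-switch":
 *
 *     ./vhost-switch -c 0x3 -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 */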
476 */ 477 static int 478 us_vhost_parse_args(int argc, char **argv) 479 { 480 int opt, ret; 481 int option_index; 482 unsigned i; 483 const char *prgname = argv[0]; 484 static struct option long_option[] = { 485 {"vm2vm", required_argument, NULL, 0}, 486 {"rx-retry", required_argument, NULL, 0}, 487 {"rx-retry-delay", required_argument, NULL, 0}, 488 {"rx-retry-num", required_argument, NULL, 0}, 489 {"mergeable", required_argument, NULL, 0}, 490 {"vlan-strip", required_argument, NULL, 0}, 491 {"stats", required_argument, NULL, 0}, 492 {"dev-basename", required_argument, NULL, 0}, 493 {"tx-csum", required_argument, NULL, 0}, 494 {"tso", required_argument, NULL, 0}, 495 {NULL, 0, 0, 0}, 496 }; 497 498 /* Parse command line */ 499 while ((opt = getopt_long(argc, argv, "p:P", 500 long_option, &option_index)) != EOF) { 501 switch (opt) { 502 /* Portmask */ 503 case 'p': 504 enabled_port_mask = parse_portmask(optarg); 505 if (enabled_port_mask == 0) { 506 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 507 us_vhost_usage(prgname); 508 return -1; 509 } 510 break; 511 512 case 'P': 513 promiscuous = 1; 514 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 515 ETH_VMDQ_ACCEPT_BROADCAST | 516 ETH_VMDQ_ACCEPT_MULTICAST; 517 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 518 519 break; 520 521 case 0: 522 /* Enable/disable vm2vm comms. */ 523 if (!strncmp(long_option[option_index].name, "vm2vm", 524 MAX_LONG_OPT_SZ)) { 525 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 526 if (ret == -1) { 527 RTE_LOG(INFO, VHOST_CONFIG, 528 "Invalid argument for " 529 "vm2vm [0|1|2]\n"); 530 us_vhost_usage(prgname); 531 return -1; 532 } else { 533 vm2vm_mode = (vm2vm_type)ret; 534 } 535 } 536 537 /* Enable/disable retries on RX. */ 538 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 539 ret = parse_num_opt(optarg, 1); 540 if (ret == -1) { 541 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 542 us_vhost_usage(prgname); 543 return -1; 544 } else { 545 enable_retry = ret; 546 } 547 } 548 549 /* Enable/disable TX checksum offload. */ 550 if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) { 551 ret = parse_num_opt(optarg, 1); 552 if (ret == -1) { 553 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 554 us_vhost_usage(prgname); 555 return -1; 556 } else 557 enable_tx_csum = ret; 558 } 559 560 /* Enable/disable TSO offload. */ 561 if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) { 562 ret = parse_num_opt(optarg, 1); 563 if (ret == -1) { 564 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 565 us_vhost_usage(prgname); 566 return -1; 567 } else 568 enable_tso = ret; 569 } 570 571 /* Specify the retries delay time (in useconds) on RX. */ 572 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 573 ret = parse_num_opt(optarg, INT32_MAX); 574 if (ret == -1) { 575 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 576 us_vhost_usage(prgname); 577 return -1; 578 } else { 579 burst_rx_delay_time = ret; 580 } 581 } 582 583 /* Specify the retries number on RX. */ 584 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 585 ret = parse_num_opt(optarg, INT32_MAX); 586 if (ret == -1) { 587 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 588 us_vhost_usage(prgname); 589 return -1; 590 } else { 591 burst_rx_retry_num = ret; 592 } 593 } 594 595 /* Enable/disable RX mergeable buffers. 
			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable RX VLAN strip on host. */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

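/*
 * Look up a vhost device by MAC address. Only devices that have completed
 * MAC learning (ready == DEVICE_RX) are considered; NULL is returned when
 * no match is found.
 */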
static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
		if (vdev->ready == DEVICE_RX &&
		    is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with a VMDQ pool.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"Device (%" PRIu64 ") is using a registered MAC!\n",
			dev->device_fh);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	if (vlan_strip)
		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
			(uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and VLAN tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings. */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

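/*
 * Enqueue a single packet into the destination guest's RX virtqueue and
 * update the per-device statistics of both the source and the destination.
 */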
static inline void __attribute__((always_inline))
virtio_xmit(struct virtio_net *dst_dev, struct virtio_net *src_dev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	ret = rte_vhost_enqueue_burst(dst_dev, VIRTIO_RXQ, &m, 1);
	if (enable_stats) {
		rte_atomic64_inc(&dev_statistics[dst_dev->device_fh].rx_total_atomic);
		rte_atomic64_add(&dev_statistics[dst_dev->device_fh].rx_atomic, ret);
		dev_statistics[src_dev->device_fh].tx_total++;
		dev_statistics[src_dev->device_fh].tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	uint64_t fh;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	fh = dst_vdev->dev->device_fh;
	if (fh == vdev->dev->device_fh) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%" PRIu64 ") TX: src and dst MAC is same. "
			"Dropping packet.\n", fh);
		return 0;
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%" PRIu64 ") TX: MAC address is local\n", fh);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
			"Device is marked for removal\n", fh);
		return 0;
	}

	virtio_xmit(dst_vdev->dev, vdev->dev, m);
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * return its VLAN tag and the length offset to account for.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
		uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (dst_vdev->dev->device_fh == dev->device_fh) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%" PRIu64 ") TX: src and dst MAC is same. "
			" Dropping packet.\n", dst_vdev->dev->device_fh);
		return -1;
	}

	/*
	 * HW VLAN strip will reduce the packet length by the size of the
	 * VLAN tag, so we need to restore the packet length by adding it
	 * back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
		"vlan tag: %u.\n",
		dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

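/*
 * Prepare a TSO packet before handing it to the NIC: request IP checksum
 * offload for IPv4 and seed the TCP checksum field with the pseudo-header
 * checksum, as hardware TSO expects.
 */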
static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;
	struct ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, next) {
			virtio_xmit(vdev2->dev, vdev->dev, m);
		}
		goto queue2nic;
	}

	/* Check if destination is a local VM. */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
		"MAC address is external\n", dev->device_fh);

queue2nic:

	/* Add packet to the port TX queue. */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}

static inline void __attribute__((always_inline))
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

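/*
 * Poll the physical port RX queue bound to this vhost device and enqueue
 * the received packets into the guest's RX virtqueue, optionally waiting
 * and retrying when the virtqueue is short of free entries.
 */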
static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct virtio_net *dev = vdev->dev;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vring_available_entries(dev,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vring_available_entries(dev,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ,
						pkts, rx_count);
	if (enable_stats) {
		uint64_t fh = dev->device_fh;

		rte_atomic64_add(&dev_statistics[fh].rx_total_atomic, rx_count);
		rte_atomic64_add(&dev_statistics[fh].rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = rte_vhost_dequeue_burst(vdev->dev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i) {
		virtio_tx_route(vdev, pkts[i],
			vlan_tags[(uint16_t)vdev->dev->device_fh]);
	}
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth RX queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio RX ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio TX queue and delivers all of the
 *      packets to the target, which could be another vhost device or the
 *      physical eth dev. The routing is done in virtio_tx_route().
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. The device is made volatile here to avoid
 * re-ordering of dev->remove = 1, which can cause an infinite loop in
 * the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct vhost_dev *vdev;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
	TAILQ_REMOVE(&vhost_dev_list, vdev, next);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%" PRIu64 ") Device has been removed from data core\n",
		dev->device_fh);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
			dev->device_fh);
		return -1;
	}
	vdev->dev = dev;
	dev->priv = vdev;

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
	vdev->vmdq_rx_q
		= dev->device_fh * queues_per_pool + vmdq_queue_base;

	/* Reset ready flag. */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
	lcore_info[vdev->coreid].device_num++;

	/* Initialize device stats */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n",
		dev->device_fh, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};
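
/*
 * Note: these callbacks are expected to be invoked from the vhost session
 * thread started by rte_vhost_driver_session_start() in main(), not from
 * the data cores.
 */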

/*
 * This is a thread that wakes up periodically to print statistics if the
 * user has enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
			device_fh = vdev->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			rx_total = rte_atomic64_read(
				&dev_statistics[device_fh].rx_total_atomic);
			rx = rte_atomic64_read(
				&dev_statistics[device_fh].rx_atomic);
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
					"\nTX total:		%"PRIu64""
					"\nTX dropped:		%"PRIu64""
					"\nTX successful:	%"PRIu64""
					"\nRX total:		%"PRIu64""
					"\nRX dropped:		%"PRIu64""
					"\nRX successful:	%"PRIu64"",
					device_fh,
					tx_total,
					tx_dropped,
					tx,
					rx_total,
					rx_dropped,
					rx);
		}
		printf("\n======================================================\n");
	}
}

/* When we receive a SIGINT signal, unregister the vhost driver. */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each RX queue reserves @nr_rx_desc mbufs at queue setup stage.
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we also need to reserve some mbufs for receiving the packets from
 *   the virtio TX queue. How many is enough depends on the usage; it is
 *   normally a simple calculation like:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switch core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure that, for each switch core, we have
 *   allocated enough mbufs to fill up the mbuf cache.
 */
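/*
 * Rough, illustrative arithmetic with the defaults (mtu = 1500, 1024 RX
 * descriptors, MAX_PKT_BURST = 32), assuming RTE_MBUF_DEFAULT_BUF_SIZE is
 * 2048 bytes of data room plus 128 bytes of headroom:
 *
 *     (1500 + 2176) * 32 / 2048 ~= 57 mbufs for the virtio TX drain path,
 *     plus 1024 mbufs for the NIC RX path, i.e. roughly 1100 mbufs per
 *     switch core. create_mbuf_pool() below then adds the per-queue
 *     reservations and scales the total by the number of ports.
 */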
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions. The
 * CUSE device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();
	if (nb_ports > RTE_MAX_ETHPORTS)
		nb_ports = RTE_MAX_ETHPORTS;

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the NIC's internal L2 switch handles VM2VM traffic. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Initialize device stats */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (mergeable == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

	/* Register vhost (cuse or user) driver to handle vhost messages. */
	ret = rte_vhost_driver_register((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	/* Start CUSE session. */
	rte_vhost_driver_session_start();
	return 0;
}