/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX	    1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest cannot
		 * forward packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN	4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/*
	 * When dequeue zero copy is enabled, guest Tx used vring will be
	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
	 * (tx_ring_size here) must be small enough so that the driver will
	 * hit the free threshold easily and free mbufs timely. Otherwise,
	 * guest Tx vring would be starved.
	 */
	if (dequeue_zero_copy)
		tx_ring_size = 64;

	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	/* parse socket path string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --socket-file <path>\n"
	" --nb-devices ND\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --socket-file: The path of the socket file.\n"
	" --tx-csum [0|1] disable/enable TX checksum offload.\n"
	" --tso [0|1] disable/enable TCP segment offload.\n"
	" --client register a vhost-user socket as client mode.\n"
	" --dequeue-zero-copy enables dequeue zero copy\n",
		prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
						"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void __attribute__((always_inline))
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM,
 * and if so get its vlan tag and offset.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip reduces the packet length by the length of the
	 * VLAN tag, so we need to restore the packet length by adding
	 * it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
		    (vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}

static inline void __attribute__((always_inline))
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers the packets
 *      to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main
 * linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};

/*
 * This is a thread that will wake up after a period to print stats if the user
 * has enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive a SIGINT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we also need to reserve some mbufs for receiving the packets from
 *   the virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
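/*
 * A rough worked example of the sizing above (illustrative only; the exact
 * numbers depend on the definitions in main.h and the command line).
 * Assuming MAX_PKT_BURST is 32 and MBUF_DATA_SIZE is 2176 bytes (a 2048-byte
 * data room plus 128 bytes of headroom), enabling --mergeable (mtu = 9000)
 * gives a per-core reservation of roughly
 *
 *     (9000 + 2176) * 32 / (2176 - 128) ~= 174 mbufs,
 *
 * plus @nr_rx_desc (1024 by default) for the NIC RX ring, which dominates.
 * With TSO enabled (mtu = 64K) the first term grows to about
 *
 *     (65536 + 2176) * 32 / 2048 ~= 1058 mbufs per switch core.
 */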
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	if (dequeue_zero_copy)
		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	return 0;

}