/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4 /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest cannot
		 * forward packets from one virtio device to another.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
				/ US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN 4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
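/*
 * For example, with num_devices == 8 the loop below maps VLAN 1000 to
 * pool 0 (bit 0x01), VLAN 1001 to pool 1 (bit 0x02), and so on up to
 * VLAN 1007 and pool 7 (bit 0x80).
 */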
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/*
	 * The max pool number from dev_info will be used to validate the
	 * pool number specified in cmd line
	 */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/*
	 * When dequeue zero copy is enabled, guest Tx used vring will be
	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
	 * (tx_ring_size here) must be small enough so that the driver will
	 * hit the free threshold easily and free mbufs timely. Otherwise,
	 * guest Tx vring would be starved.
	 */
	if (dequeue_zero_copy)
		tx_ring_size = 64;

	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
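	/*
	 * rx_rings covers both the PF queues and the per-pool VMDq queues;
	 * tx_rings is one queue per lcore, so each worker core later gets
	 * its own TX queue on the physical port.
	 */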
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* parse the socket path string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
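/*
 * The mask is given in hexadecimal, one bit per port id: e.g. "-p 0x1"
 * enables only port 0, while "-p 0x3" would enable ports 0 and 1.
 */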
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --socket-file <path>\n"
	" --nb-devices ND\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --socket-file: The path of the socket file.\n"
	" --tx-csum [0|1] disable/enable TX checksum offload.\n"
	" --tso [0|1] disable/enable TCP segmentation offload.\n"
	" --client register a vhost-user socket as client mode.\n"
	" --dequeue-zero-copy enables dequeue zero copy\n",
	prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
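/*
 * Example invocation (binary name, EAL options and socket path are
 * illustrative only):
 *   ./vhost-switch -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/vhost-user.sock
 */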
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.offloads |=
							DEV_RX_OFFLOAD_JUMBO_FRAME;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
				"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for socket name (Max %d characters)\n",
						PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global variable NUM_PORTS and the array PORTS according to the
 * number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it along
 * with a VLAN tag to a VMDQ.
 */
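/*
 * Both the VLAN tag and the VMDq pool are derived from the vhost device id:
 * the tag comes from vlan_tags[vid] and the MAC address is programmed into
 * pool (vid + vmdq_pool_base), so each guest ends up in its own pool.
 */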
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
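/*
 * Returns -1 when the destination MAC is not a local vhost device (the
 * caller then sends the packet out of the physical port), and 0 when the
 * packet has been handled here, either delivered locally or dropped.
 */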
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * Hardware VLAN strip reduces the packet length by the length of
	 * the VLAN tag, so the packet length needs to be restored by
	 * adding it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct rte_ipv4_hdr *ipv4_hdr = NULL;
	struct rte_tcp_hdr *tcp_hdr = NULL;
	struct rte_ether_hdr *eth_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
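/*
 * Routing order: broadcast frames are replicated to every other local vhost
 * device and then also sent to the NIC; with VM2VM_SOFTWARE a packet whose
 * destination MAC matches a local device is delivered directly; everything
 * else is queued on this lcore's TX queue for the physical port, with the
 * VLAN tag of the source device inserted.
 */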
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}

static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry when there are
	 * not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
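	/*
	 * With the defaults this waits at most burst_rx_retry_num *
	 * burst_rx_delay_time = 4 * 15 microseconds before enqueueing anyway.
	 */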
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers the packets
 *      to the target, which could be another vhost device, or the
 *      physical eth dev. The routing is done in the function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag.
 * The device is made volatile here to avoid re-ordering of dev->remove = 1,
 * which could otherwise cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print stats if the user
 * has enabled them.
 */
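/*
 * Runs as a separate control thread; it sleeps enable_stats seconds between
 * refreshes and uses ANSI escape sequences (clr / top_left) to redraw the
 * screen in place.
 */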
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive a SIGINT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to reserve some mbufs for receiving the packets from the
 *   virtio Tx queue. How many are enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
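/*
 * Rough worked example, assuming MAX_PKT_BURST is 32, the default 2176-byte
 * mbuf buffer (128-byte headroom) and a 1500-byte MTU:
 *   nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) + 1024
 *                     ~= 57 + 1024 = 1081 mbufs,
 * which is then raised to at least the mbuf cache size if smaller.
 */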
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	if (dequeue_zero_copy)
		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	return 0;
}