/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores*MAX_PKT_BURST) + \
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
				((num_switching_cores+1)*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

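/*
 * Rough illustration of the NUM_MBUFS_PER_PORT sizing above (illustrative
 * numbers only, assuming 8 switching cores): with the defaults in this file
 * (MAX_QUEUES 128, 1024 RX / 512 TX descriptors, bursts of 32, cache of 128)
 * the pool would hold about
 * 128*1024 + 8*32 + 8*512 + 9*128 = 136576 mbufs per port.
 */
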
/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do vlan strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

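/*
 * Note (summary of what follows): VMDQ gives each virtio device its own pool
 * of NIC RX queues, selected by VLAN tag.  The rxmode defaults below are only
 * a template; get_eth_conf() fills in the actual pool/VLAN mapping per port.
 */
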
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is necessary for 1G NICs such as the I350:
		 * it fixes a bug where ipv4 forwarding in the guest
		 * could not forward packets from one virtio dev to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

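/*
 * How the VMDQ pool map works (summary): get_eth_conf() below maps pool i
 * to VLAN ID vlan_tags[i] (i.e. 1000 + i), so traffic tagged 1000 + i is
 * steered by the NIC into the RX queues owned by virtio device i.
 */
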
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

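	/*
	 * Illustration (hypothetical numbers): if dev_info reports 128 RX
	 * queues, of which 64 are VMDQ queues split across 64 pools, the PF
	 * keeps 128 - 64 = 64 queues and each pool owns 64 / 64 = 1 queue.
	 * The actual figures come from dev_info, filled in by
	 * rte_eth_dev_info_get() above.
	 */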
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	if (enable_tx_csum == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

	if (enable_tso == 0) {
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
	}

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* validate and copy the basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

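/*
 * Example invocation (illustrative only; the binary name and the EAL core
 * and memory options depend on the build and platform):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net --stats 2
 *
 * "-p 0x1" is a hex portmask: bit i selects physical port i, so 0x1 enables
 * port 0 only (this app supports at most MAX_SUP_PORTS external ports).
 */
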
/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"	--vm2vm [0|1|2]\n"
	"	--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"	--dev-basename <name>\n"
	"	--nb-devices ND\n"
	"	-p PORTMASK: Set mask for ports to be used by application\n"
	"	--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"	--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"	--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
	"	--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
	"	--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"	--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"	--dev-basename: The basename to be used for the character device.\n"
	"	--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"	--tso [0|1]: disable/enable TCP segmentation offload.\n",
	prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"vlan-strip", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable RX VLAN strip on host. */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * ports present in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	if (vlan_strip)
		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
			(uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

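/*
 * Note: a guest's MAC is not known until it transmits its first packet;
 * switch_worker() calls link_vmdq() on that packet, and only then does the
 * device leave DEVICE_MAC_LEARNING and start receiving from its VMDQ pool.
 */
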
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
					"Source and destination MAC addresses are the same. "
					"Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
				"MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
					"Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[dev->device_fh].tx_total++;
					dev_statistics[dev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

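/*
 * VM2VM routing summary: in VM2VM_SOFTWARE mode, virtio_tx_local() above
 * delivers guest-to-guest traffic entirely in software.  In VM2VM_HARDWARE
 * mode, find_local_dest() below only looks up the destination's VLAN tag;
 * the NIC's VMDQ loopback (enabled in main()) performs the actual switching.
 */
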
/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				RTE_LOG(DEBUG, VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip will shorten the packet by the
			 * length of the vlan tag, so the packet length
			 * needs to be increased by that amount to
			 * compensate.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			RTE_LOG(DEBUG, VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				(int)*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

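/*
 * Note on virtio_tx_offload() above: for TSO/TX checksum offload the DPDK
 * mbuf API expects the L4 checksum field to be pre-filled with the
 * pseudo-header checksum (rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum());
 * the NIC then completes the TCP checksum for each segment it emits.
 */
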
/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;
	struct ether_hdr *nh;

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
		"MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				RTE_LOG(DEBUG, VHOST_DATA,
					"TX queue drained after timeout with burst size %u\n",
					tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);

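		/*
		 * Device-removal handshake: destroy_device() (on the config core)
		 * sets dev_removal_flag to REQUEST_DEV_REMOVAL on every worker and
		 * waits until each worker acknowledges below, guaranteeing no worker
		 * still dereferences an entry that is about to be unlinked.
		 */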
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				for (i = 0; i < tx_count; ++i) {
					virtio_tx_route(vdev, pkts_burst[i],
						vlan_tags[(uint16_t)dev->device_fh]);
				}
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * Add an entry to a used linked list. A free entry must first be found
 * in the free linked list using get_data_ll_free_entry();
 */
static void
add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	/* Set next as NULL and use a compiler barrier to avoid reordering. */
	ll_dev->next = NULL;
	rte_compiler_barrier();

	/* If ll == NULL then this is the first device. */
	if (ll) {
		/* Increment to the tail of the linked list. */
		while (ll->next != NULL)
			ll = ll->next;

		ll->next = ll_dev;
	} else {
		*ll_root_addr = ll_dev;
	}
}

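/*
 * Concurrency note: only the configuration core modifies these lists, while
 * the data cores traverse them lock-free.  The compiler barrier in
 * add_data_ll_entry() above ensures an entry's 'next' is NULL before the
 * entry becomes reachable from the tail of the list.
 */
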
/*
 * Remove an entry from a used linked list. The entry must then be added to
 * the free linked list using put_data_ll_free_entry().
 */
static void
rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev,
	struct virtio_net_data_ll *ll_dev_last)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	if (unlikely((ll == NULL) || (ll_dev == NULL)))
		return;

	if (ll_dev == ll)
		*ll_root_addr = ll_dev->next;
	else
		if (likely(ll_dev_last != NULL))
			ll_dev_last->next = ll_dev->next;
		else
			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
}

/*
 * Find and return an entry from the free linked list.
 */
static struct virtio_net_data_ll *
get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;
	struct virtio_net_data_ll *ll_dev;

	if (ll_free == NULL)
		return NULL;

	ll_dev = ll_free;
	*ll_root_addr = ll_free->next;

	return ll_dev;
}

/*
 * Place an entry back on to the free linked list.
 */
static void
put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;

	if (ll_dev == NULL)
		return;

	ll_dev->next = ll_free;
	*ll_root_addr = ll_dev;
}

/*
 * Creates a linked list of a given size.
 */
static struct virtio_net_data_ll *
alloc_data_ll(uint32_t size)
{
	struct virtio_net_data_ll *ll_new;
	uint32_t i;

	/* Malloc and then chain the linked list. */
	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
	if (ll_new == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
		return NULL;
	}

	for (i = 0; i < size - 1; i++) {
		ll_new[i].vdev = NULL;
		ll_new[i].next = &ll_new[i+1];
	}
	ll_new[i].next = NULL;

	return ll_new;
}

/*
 * Create the main linked list along with each individual core's linked list. A used and a free list
 * are created to manage entries.
 */
static int
init_data_ll(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
		if (lcore_info[lcore].lcore_ll == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
			return -1;
		}

		lcore_info[lcore].lcore_ll->device_num = 0;
		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
		if (num_devices % num_switching_cores)
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
		else
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
	}

	/* Allocate devices up to a maximum of MAX_DEVICES. */
	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_lcore_dev_cur;
	struct virtio_net_data_ll *ll_main_dev_cur;
	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
	struct virtio_net_data_ll *ll_main_dev_last = NULL;
	struct vhost_dev *vdev;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	/* Search for entry to be removed from lcore ll */
	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
	while (ll_lcore_dev_cur != NULL) {
		if (ll_lcore_dev_cur->vdev == vdev) {
			break;
		} else {
			ll_lcore_dev_last = ll_lcore_dev_cur;
			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
		}
	}

	if (ll_lcore_dev_cur == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
			dev->device_fh);
		return;
	}

	/* Search for entry to be removed from main ll */
	ll_main_dev_cur = ll_root_used;
	ll_main_dev_last = NULL;
	while (ll_main_dev_cur != NULL) {
		if (ll_main_dev_cur->vdev == vdev) {
			break;
		} else {
			ll_main_dev_last = ll_main_dev_cur;
			ll_main_dev_cur = ll_main_dev_cur->next;
		}
	}

	/* Remove entries from the lcore and main ll. */
	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
	 * they can no longer access the device removed from the linked lists and that the devices
	 * are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
			rte_pause();
		}
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_dev;
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
			dev->device_fh);
		return -1;
	}
	vdev->dev = dev;
	dev->priv = vdev;

	/* Add device to main ll */
	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
			"of %d devices per core has been reached\n",
			dev->device_fh, num_devices);
		rte_free(vdev);
		return -1;
	}
	ll_dev->vdev = vdev;
	add_data_ll_entry(&ll_root_used, ll_dev);
	vdev->vmdq_rx_q
		= dev->device_fh * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}
	/* Add device to lcore ll */
	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
		vdev->ready = DEVICE_SAFE_REMOVE;
		destroy_device(dev);
		rte_free(vdev);
		return -1;
	}
	ll_dev->vdev = vdev;
	vdev->coreid = core_add;

	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);

	/* Initialize device stats */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
	lcore_info[vdev->coreid].lcore_ll->device_num++;
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print stats if the user has
 * enabled them.
 */
static void
print_stats(void)
{
	struct virtio_net_data_ll *dev_ll;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		dev_ll = ll_root_used;
		while (dev_ll != NULL) {
			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			rx_total = rte_atomic64_read(
				&dev_statistics[device_fh].rx_total_atomic);
			rx = rte_atomic64_read(
				&dev_statistics[device_fh].rx_atomic);
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
					"\nTX total: %"PRIu64""
					"\nTX dropped: %"PRIu64""
					"\nTX successful: %"PRIu64""
					"\nRX total: %"PRIu64""
					"\nRX dropped: %"PRIu64""
					"\nRX successful: %"PRIu64"",
					device_fh,
					tx_total,
					tx_dropped,
					tx,
					rx_total,
					rx_dropped,
					rx);

			dev_ll = dev_ll->next;
		}
		printf("\n======================================================\n");
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
	exit(0);
}

/*
 * Main function, does initialisation and calls the per-lcore functions. The CUSE
 * device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* set the number of switching cores available */
	num_switching_cores = rte_lcore_count() - 1;

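	/*
	 * Note: the master lcore is not counted as a switching core; it runs
	 * the vhost driver session and the device add/remove callbacks, while
	 * the switch_worker() loops are launched on the slave lcores below.
	 */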
	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();
	if (nb_ports > RTE_MAX_ETHPORTS)
		nb_ports = RTE_MAX_ETHPORTS;

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get the value of VALID_NUM_PORTS according to the system port count.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/* Create the mbuf pool. */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
		0, MBUF_DATA_SIZE, rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the NIC's L2 switch does the forwarding. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Initialise all linked lists. */
	if (init_data_ll() == -1)
		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");

	/* Initialize device stats */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name to aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (mergeable == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

	/* Register vhost (cuse or user) driver to handle vhost messages. */
	ret = rte_vhost_driver_register((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	/* Start CUSE session. */
	rte_vhost_driver_session_start();
	return 0;
}