1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 128 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 /* 83 * RX and TX Prefetch, Host, and Write-back threshold values should be 84 * carefully set for optimal performance. Consult the network 85 * controller's datasheet and supporting DPDK documentation for guidance 86 * on how these parameters should be set. 87 */ 88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */ 89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */ 90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. 
*/ 91 92 /* 93 * These default values are optimized for use with the Intel(R) 82599 10 GbE 94 * Controller and the DPDK ixgbe PMD. Consider using other values for other 95 * network controllers and/or network drivers. 96 */ 97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */ 98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */ 99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */ 100 101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 102 #define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Set to 1 due to performance issue. */ 103 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 104 105 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 106 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 107 108 #define JUMBO_FRAME_MAX_SIZE 0x2600 109 110 /* State of virtio device. */ 111 #define DEVICE_MAC_LEARNING 0 112 #define DEVICE_RX 1 113 #define DEVICE_SAFE_REMOVE 2 114 115 /* Config_core_flag status definitions. */ 116 #define REQUEST_DEV_REMOVAL 1 117 #define ACK_DEV_REMOVAL 0 118 119 /* Configurable number of RX/TX ring descriptors */ 120 #define RTE_TEST_RX_DESC_DEFAULT 1024 121 #define RTE_TEST_TX_DESC_DEFAULT 512 122 123 /* 124 * Need refine these 2 macros for legacy and DPDK based front end: 125 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 126 * And then adjust power 2. 127 */ 128 /* 129 * For legacy front end, 128 descriptors, 130 * half for virtio header, another half for mbuf. 131 */ 132 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 133 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 134 135 /* Get first 4 bytes in mbuf headroom. */ 136 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 137 + sizeof(struct rte_mbuf))) 138 139 /* true if x is a power of 2 */ 140 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 141 142 #define INVALID_PORT_ID 0xFF 143 144 /* Max number of devices. Limited by vmdq. */ 145 #define MAX_DEVICES 64 146 147 /* Size of buffers used for snprintfs. */ 148 #define MAX_PRINT_BUFF 6072 149 150 /* Maximum character device basename size. */ 151 #define MAX_BASENAME_SZ 10 152 153 /* Maximum long option length for option parsing. */ 154 #define MAX_LONG_OPT_SZ 64 155 156 /* Used to compare MAC addresses. */ 157 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 158 159 /* Number of descriptors per cacheline. */ 160 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc)) 161 162 /* mask of enabled ports */ 163 static uint32_t enabled_port_mask = 0; 164 165 /*Number of switching cores enabled*/ 166 static uint32_t num_switching_cores = 0; 167 168 /* number of devices/queues to support*/ 169 static uint32_t num_queues = 0; 170 uint32_t num_devices = 0; 171 172 /* 173 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 174 * disabled on default. 175 */ 176 static uint32_t zero_copy; 177 178 /* number of descriptors to apply*/ 179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 181 182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 183 #define MAX_RING_DESC 4096 184 185 struct vpool { 186 struct rte_mempool *pool; 187 struct rte_ring *ring; 188 uint32_t buf_size; 189 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 190 191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. 
*/ 192 typedef enum { 193 VM2VM_DISABLED = 0, 194 VM2VM_SOFTWARE = 1, 195 VM2VM_HARDWARE = 2, 196 VM2VM_LAST 197 } vm2vm_type; 198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 199 200 /* The type of host physical address translated from guest physical address. */ 201 typedef enum { 202 PHYS_ADDR_CONTINUOUS = 0, 203 PHYS_ADDR_CROSS_SUBREG = 1, 204 PHYS_ADDR_INVALID = 2, 205 PHYS_ADDR_LAST 206 } hpa_type; 207 208 /* Enable stats. */ 209 static uint32_t enable_stats = 0; 210 /* Enable retries on RX. */ 211 static uint32_t enable_retry = 1; 212 /* Specify timeout (in useconds) between retries on RX. */ 213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 214 /* Specify the number of retries on RX. */ 215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 216 217 /* Character device basename. Can be set by user. */ 218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; 219 220 221 /* This can be set by the user so it is made available here. */ 222 extern uint64_t VHOST_FEATURES; 223 224 /* Default configuration for rx and tx thresholds etc. */ 225 static struct rte_eth_rxconf rx_conf_default = { 226 .rx_thresh = { 227 .pthresh = RX_PTHRESH, 228 .hthresh = RX_HTHRESH, 229 .wthresh = RX_WTHRESH, 230 }, 231 .rx_drop_en = 1, 232 }; 233 234 /* 235 * These default values are optimized for use with the Intel(R) 82599 10 GbE 236 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other 237 * network controllers and/or network drivers. 238 */ 239 static struct rte_eth_txconf tx_conf_default = { 240 .tx_thresh = { 241 .pthresh = TX_PTHRESH, 242 .hthresh = TX_HTHRESH, 243 .wthresh = TX_WTHRESH, 244 }, 245 .tx_free_thresh = 0, /* Use PMD default values */ 246 .tx_rs_thresh = 0, /* Use PMD default values */ 247 }; 248 249 /* empty vmdq configuration structure. Filled in programatically */ 250 static struct rte_eth_conf vmdq_conf_default = { 251 .rxmode = { 252 .mq_mode = ETH_MQ_RX_VMDQ_ONLY, 253 .split_hdr_size = 0, 254 .header_split = 0, /**< Header Split disabled */ 255 .hw_ip_checksum = 0, /**< IP checksum offload disabled */ 256 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 257 /* 258 * It is necessary for 1G NIC such as I350, 259 * this fixes bug of ipv4 forwarding in guest can't 260 * forward pakets from one virtio dev to another virtio dev. 261 */ 262 .hw_vlan_strip = 1, /**< VLAN strip enabled. 
*/ 263 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 264 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 265 }, 266 267 .txmode = { 268 .mq_mode = ETH_MQ_TX_NONE, 269 }, 270 .rx_adv_conf = { 271 /* 272 * should be overridden separately in code with 273 * appropriate values 274 */ 275 .vmdq_rx_conf = { 276 .nb_queue_pools = ETH_8_POOLS, 277 .enable_default_pool = 0, 278 .default_pool = 0, 279 .nb_pool_maps = 0, 280 .pool_map = {{0, 0},}, 281 }, 282 }, 283 }; 284 285 static unsigned lcore_ids[RTE_MAX_LCORE]; 286 static uint8_t ports[RTE_MAX_ETHPORTS]; 287 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 288 289 static const uint16_t external_pkt_default_vlan_tag = 2000; 290 const uint16_t vlan_tags[] = { 291 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 292 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 293 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 294 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 295 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 296 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 297 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 298 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 299 }; 300 301 /* ethernet addresses of ports */ 302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 303 304 /* heads for the main used and free linked lists for the data path. */ 305 static struct virtio_net_data_ll *ll_root_used = NULL; 306 static struct virtio_net_data_ll *ll_root_free = NULL; 307 308 /* Array of data core structures containing information on individual core linked lists. */ 309 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 310 311 /* Used for queueing bursts of TX packets. */ 312 struct mbuf_table { 313 unsigned len; 314 unsigned txq_id; 315 struct rte_mbuf *m_table[MAX_PKT_BURST]; 316 }; 317 318 /* TX queue for each data core. */ 319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 320 321 /* TX queue fori each virtio device for zero copy. */ 322 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 323 324 /* Vlan header struct used to insert vlan tags on TX. */ 325 struct vlan_ethhdr { 326 unsigned char h_dest[ETH_ALEN]; 327 unsigned char h_source[ETH_ALEN]; 328 __be16 h_vlan_proto; 329 __be16 h_vlan_TCI; 330 __be16 h_vlan_encapsulated_proto; 331 }; 332 333 /* IPv4 Header */ 334 struct ipv4_hdr { 335 uint8_t version_ihl; /**< version and header length */ 336 uint8_t type_of_service; /**< type of service */ 337 uint16_t total_length; /**< length of packet */ 338 uint16_t packet_id; /**< packet ID */ 339 uint16_t fragment_offset; /**< fragmentation offset */ 340 uint8_t time_to_live; /**< time to live */ 341 uint8_t next_proto_id; /**< protocol ID */ 342 uint16_t hdr_checksum; /**< header checksum */ 343 uint32_t src_addr; /**< source address */ 344 uint32_t dst_addr; /**< destination address */ 345 } __attribute__((__packed__)); 346 347 /* Header lengths. */ 348 #define VLAN_HLEN 4 349 #define VLAN_ETH_HLEN 18 350 351 /* Per-device statistics struct */ 352 struct device_statistics { 353 uint64_t tx_total; 354 rte_atomic64_t rx_total_atomic; 355 uint64_t rx_total; 356 uint64_t tx; 357 rte_atomic64_t rx_atomic; 358 uint64_t rx; 359 } __rte_cache_aligned; 360 struct device_statistics dev_statistics[MAX_DEVICES]; 361 362 /* 363 * Builds up the correct configuration for VMDQ VLAN pool map 364 * according to the pool & queue limits. 
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, log an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}
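	/*
	 * Note on the queue layout set up above: each of the rx_rings RX
	 * queues corresponds to one VMDQ pool (one virtio device) and is
	 * bound to its own mempool, vpool_array[q].pool. TX queues are one
	 * per switching lcore in the default path, or one per hardware TX
	 * queue when zero copy is enabled, as selected when tx_rings was
	 * computed earlier in this function.
	 */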
	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set the character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject basenames that would not fit in the buffer. */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;

	snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* Parse hexadecimal string. */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse numeric options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* Parse unsigned integer string. */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage.
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"	--vm2vm [0|1|2]\n"
	"	--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"	--dev-basename <name>\n"
	"	--nb-devices ND\n"
	"	-p PORTMASK: Set mask for ports to be used by application\n"
	"	--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"	--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"	--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if RX retries are enabled\n"
	"	--rx-retry-num [0-N]: the number of retries on RX. Takes effect only if RX retries are enabled\n"
	"	--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"	--dev-basename: The basename to be used for the character device.\n"
	"	--zero-copy [0|1]: disable(default)/enable RX/TX "
		"zero copy\n"
	"	--rx-desc-num [0-N]: the number of descriptors on RX, "
		"used only when zero copy is enabled.\n"
	"	--tx-desc-num [0-N]: the number of descriptors on TX, "
		"used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given on the command line of the application.
581 */ 582 static int 583 us_vhost_parse_args(int argc, char **argv) 584 { 585 int opt, ret; 586 int option_index; 587 unsigned i; 588 const char *prgname = argv[0]; 589 static struct option long_option[] = { 590 {"vm2vm", required_argument, NULL, 0}, 591 {"rx-retry", required_argument, NULL, 0}, 592 {"rx-retry-delay", required_argument, NULL, 0}, 593 {"rx-retry-num", required_argument, NULL, 0}, 594 {"mergeable", required_argument, NULL, 0}, 595 {"stats", required_argument, NULL, 0}, 596 {"dev-basename", required_argument, NULL, 0}, 597 {"zero-copy", required_argument, NULL, 0}, 598 {"rx-desc-num", required_argument, NULL, 0}, 599 {"tx-desc-num", required_argument, NULL, 0}, 600 {NULL, 0, 0, 0}, 601 }; 602 603 /* Parse command line */ 604 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 605 switch (opt) { 606 /* Portmask */ 607 case 'p': 608 enabled_port_mask = parse_portmask(optarg); 609 if (enabled_port_mask == 0) { 610 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 611 us_vhost_usage(prgname); 612 return -1; 613 } 614 break; 615 616 case 0: 617 /* Enable/disable vm2vm comms. */ 618 if (!strncmp(long_option[option_index].name, "vm2vm", 619 MAX_LONG_OPT_SZ)) { 620 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 621 if (ret == -1) { 622 RTE_LOG(INFO, VHOST_CONFIG, 623 "Invalid argument for " 624 "vm2vm [0|1|2]\n"); 625 us_vhost_usage(prgname); 626 return -1; 627 } else { 628 vm2vm_mode = (vm2vm_type)ret; 629 } 630 } 631 632 /* Enable/disable retries on RX. */ 633 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 634 ret = parse_num_opt(optarg, 1); 635 if (ret == -1) { 636 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 637 us_vhost_usage(prgname); 638 return -1; 639 } else { 640 enable_retry = ret; 641 } 642 } 643 644 /* Specify the retries delay time (in useconds) on RX. */ 645 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 646 ret = parse_num_opt(optarg, INT32_MAX); 647 if (ret == -1) { 648 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 649 us_vhost_usage(prgname); 650 return -1; 651 } else { 652 burst_rx_delay_time = ret; 653 } 654 } 655 656 /* Specify the retries number on RX. */ 657 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 658 ret = parse_num_opt(optarg, INT32_MAX); 659 if (ret == -1) { 660 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 661 us_vhost_usage(prgname); 662 return -1; 663 } else { 664 burst_rx_retry_num = ret; 665 } 666 } 667 668 /* Enable/disable RX mergeable buffers. */ 669 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 670 ret = parse_num_opt(optarg, 1); 671 if (ret == -1) { 672 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } else { 676 if (ret) { 677 vmdq_conf_default.rxmode.jumbo_frame = 1; 678 vmdq_conf_default.rxmode.max_rx_pkt_len 679 = JUMBO_FRAME_MAX_SIZE; 680 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF); 681 } 682 } 683 } 684 685 /* Enable/disable stats. */ 686 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 687 ret = parse_num_opt(optarg, INT32_MAX); 688 if (ret == -1) { 689 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 690 us_vhost_usage(prgname); 691 return -1; 692 } else { 693 enable_stats = ret; 694 } 695 } 696 697 /* Set character device basename. 
*/ 698 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 699 if (us_vhost_parse_basename(optarg) == -1) { 700 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 701 us_vhost_usage(prgname); 702 return -1; 703 } 704 } 705 706 /* Enable/disable rx/tx zero copy. */ 707 if (!strncmp(long_option[option_index].name, 708 "zero-copy", MAX_LONG_OPT_SZ)) { 709 ret = parse_num_opt(optarg, 1); 710 if (ret == -1) { 711 RTE_LOG(INFO, VHOST_CONFIG, 712 "Invalid argument" 713 " for zero-copy [0|1]\n"); 714 us_vhost_usage(prgname); 715 return -1; 716 } else 717 zero_copy = ret; 718 719 if (zero_copy) { 720 #ifdef RTE_MBUF_REFCNT 721 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 722 "zero copy vhost APP, please " 723 "disable RTE_MBUF_REFCNT\n" 724 "in config file and then rebuild DPDK " 725 "core lib!\n" 726 "Otherwise please disable zero copy " 727 "flag in command line!\n"); 728 return -1; 729 #endif 730 } 731 } 732 733 /* Specify the descriptor number on RX. */ 734 if (!strncmp(long_option[option_index].name, 735 "rx-desc-num", MAX_LONG_OPT_SZ)) { 736 ret = parse_num_opt(optarg, MAX_RING_DESC); 737 if ((ret == -1) || (!POWEROF2(ret))) { 738 RTE_LOG(INFO, VHOST_CONFIG, 739 "Invalid argument for rx-desc-num[0-N]," 740 "power of 2 required.\n"); 741 us_vhost_usage(prgname); 742 return -1; 743 } else { 744 num_rx_descriptor = ret; 745 } 746 } 747 748 /* Specify the descriptor number on TX. */ 749 if (!strncmp(long_option[option_index].name, 750 "tx-desc-num", MAX_LONG_OPT_SZ)) { 751 ret = parse_num_opt(optarg, MAX_RING_DESC); 752 if ((ret == -1) || (!POWEROF2(ret))) { 753 RTE_LOG(INFO, VHOST_CONFIG, 754 "Invalid argument for tx-desc-num [0-N]," 755 "power of 2 required.\n"); 756 us_vhost_usage(prgname); 757 return -1; 758 } else { 759 num_tx_descriptor = ret; 760 } 761 } 762 763 break; 764 765 /* Invalid option - print options. 
*/ 766 default: 767 us_vhost_usage(prgname); 768 return -1; 769 } 770 } 771 772 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 773 if (enabled_port_mask & (1 << i)) 774 ports[num_ports++] = (uint8_t)i; 775 } 776 777 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 778 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 779 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 780 return -1; 781 } 782 783 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 784 RTE_LOG(INFO, VHOST_PORT, 785 "Vhost zero copy doesn't support software vm2vm," 786 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 787 return -1; 788 } 789 790 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 791 RTE_LOG(INFO, VHOST_PORT, 792 "Vhost zero copy doesn't support jumbo frame," 793 "please specify '--mergeable 0' to disable the " 794 "mergeable feature.\n"); 795 return -1; 796 } 797 798 return 0; 799 } 800 801 /* 802 * Update the global var NUM_PORTS and array PORTS according to system ports number 803 * and return valid ports number 804 */ 805 static unsigned check_ports_num(unsigned nb_ports) 806 { 807 unsigned valid_num_ports = num_ports; 808 unsigned portid; 809 810 if (num_ports > nb_ports) { 811 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 812 num_ports, nb_ports); 813 num_ports = nb_ports; 814 } 815 816 for (portid = 0; portid < num_ports; portid ++) { 817 if (ports[portid] >= nb_ports) { 818 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 819 ports[portid], (nb_ports - 1)); 820 ports[portid] = INVALID_PORT_ID; 821 valid_num_ports--; 822 } 823 } 824 return valid_num_ports; 825 } 826 827 /* 828 * Macro to print out packet contents. Wrapped in debug define so that the 829 * data path is not effected when debug is disabled. 830 */ 831 #ifdef DEBUG 832 #define PRINT_PACKET(device, addr, size, header) do { \ 833 char *pkt_addr = (char*)(addr); \ 834 unsigned int index; \ 835 char packet[MAX_PRINT_BUFF]; \ 836 \ 837 if ((header)) \ 838 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 839 else \ 840 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 841 for (index = 0; index < (size); index++) { \ 842 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 843 "%02hhx ", pkt_addr[index]); \ 844 } \ 845 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 846 \ 847 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 848 } while(0) 849 #else 850 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 851 #endif 852 853 /* 854 * Function to convert guest physical addresses to vhost physical addresses. 855 * This is used to convert virtio buffer addresses. 
856 */ 857 static inline uint64_t __attribute__((always_inline)) 858 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 859 uint32_t buf_len, hpa_type *addr_type) 860 { 861 struct virtio_memory_regions_hpa *region; 862 uint32_t regionidx; 863 uint64_t vhost_pa = 0; 864 865 *addr_type = PHYS_ADDR_INVALID; 866 867 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 868 region = &vdev->regions_hpa[regionidx]; 869 if ((guest_pa >= region->guest_phys_address) && 870 (guest_pa <= region->guest_phys_address_end)) { 871 vhost_pa = region->host_phys_addr_offset + guest_pa; 872 if (likely((guest_pa + buf_len - 1) 873 <= region->guest_phys_address_end)) 874 *addr_type = PHYS_ADDR_CONTINUOUS; 875 else 876 *addr_type = PHYS_ADDR_CROSS_SUBREG; 877 break; 878 } 879 } 880 881 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 882 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 883 (void *)(uintptr_t)vhost_pa); 884 885 return vhost_pa; 886 } 887 888 /* 889 * Compares a packet destination MAC address to a device MAC address. 890 */ 891 static inline int __attribute__((always_inline)) 892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 893 { 894 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 895 } 896 897 /* 898 * This function learns the MAC address of the device and registers this along with a 899 * vlan tag to a VMDQ. 900 */ 901 static int 902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 903 { 904 struct ether_hdr *pkt_hdr; 905 struct virtio_net_data_ll *dev_ll; 906 struct virtio_net *dev = vdev->dev; 907 int i, ret; 908 909 /* Learn MAC address of guest device from packet */ 910 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 911 912 dev_ll = ll_root_used; 913 914 while (dev_ll != NULL) { 915 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 916 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 917 return -1; 918 } 919 dev_ll = dev_ll->next; 920 } 921 922 for (i = 0; i < ETHER_ADDR_LEN; i++) 923 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 924 925 /* vlan_tag currently uses the device_id. */ 926 vdev->vlan_tag = vlan_tags[dev->device_fh]; 927 928 /* Print out VMDQ registration info. */ 929 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 930 dev->device_fh, 931 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 932 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 933 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 934 vdev->vlan_tag); 935 936 /* Register the MAC address. */ 937 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 938 if (ret) 939 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 940 dev->device_fh); 941 942 /* Enable stripping of the vlan tag as we handle routing. */ 943 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 944 945 /* Set device as ready for RX. */ 946 vdev->ready = DEVICE_RX; 947 948 return 0; 949 } 950 951 /* 952 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 953 * queue before disabling RX on the device. 
954 */ 955 static inline void 956 unlink_vmdq(struct vhost_dev *vdev) 957 { 958 unsigned i = 0; 959 unsigned rx_count; 960 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 961 962 if (vdev->ready == DEVICE_RX) { 963 /*clear MAC and VLAN settings*/ 964 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 965 for (i = 0; i < 6; i++) 966 vdev->mac_address.addr_bytes[i] = 0; 967 968 vdev->vlan_tag = 0; 969 970 /*Clear out the receive buffers*/ 971 rx_count = rte_eth_rx_burst(ports[0], 972 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 973 974 while (rx_count) { 975 for (i = 0; i < rx_count; i++) 976 rte_pktmbuf_free(pkts_burst[i]); 977 978 rx_count = rte_eth_rx_burst(ports[0], 979 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 980 } 981 982 vdev->ready = DEVICE_MAC_LEARNING; 983 } 984 } 985 986 /* 987 * Check if the packet destination MAC address is for a local device. If so then put 988 * the packet on that devices RX queue. If not then return. 989 */ 990 static inline unsigned __attribute__((always_inline)) 991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 992 { 993 struct virtio_net_data_ll *dev_ll; 994 struct ether_hdr *pkt_hdr; 995 uint64_t ret = 0; 996 struct virtio_net *dev = vdev->dev; 997 struct virtio_net *tdev; /* destination virito device */ 998 999 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1000 1001 /*get the used devices list*/ 1002 dev_ll = ll_root_used; 1003 1004 while (dev_ll != NULL) { 1005 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1006 &dev_ll->vdev->mac_address)) { 1007 1008 /* Drop the packet if the TX packet is destined for the TX device. */ 1009 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1010 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1011 dev->device_fh); 1012 return 0; 1013 } 1014 tdev = dev_ll->vdev->dev; 1015 1016 1017 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1018 1019 if (dev_ll->vdev->remove) { 1020 /*drop the packet if the device is marked for removal*/ 1021 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1022 } else { 1023 /*send the packet to the local virtio device*/ 1024 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1025 if (enable_stats) { 1026 rte_atomic64_add( 1027 &dev_statistics[tdev->device_fh].rx_total_atomic, 1028 1); 1029 rte_atomic64_add( 1030 &dev_statistics[tdev->device_fh].rx_atomic, 1031 ret); 1032 dev_statistics[tdev->device_fh].tx_total++; 1033 dev_statistics[tdev->device_fh].tx += ret; 1034 } 1035 } 1036 1037 return 0; 1038 } 1039 dev_ll = dev_ll->next; 1040 } 1041 1042 return -1; 1043 } 1044 1045 /* 1046 * This function routes the TX packet to the correct interface. This may be a local device 1047 * or the physical port. 
1048 */ 1049 static inline void __attribute__((always_inline)) 1050 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag) 1051 { 1052 struct mbuf_table *tx_q; 1053 struct vlan_ethhdr *vlan_hdr; 1054 struct rte_mbuf **m_table; 1055 struct rte_mbuf *mbuf, *prev; 1056 unsigned len, ret, offset = 0; 1057 const uint16_t lcore_id = rte_lcore_id(); 1058 struct virtio_net_data_ll *dev_ll = ll_root_used; 1059 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1060 struct virtio_net *dev = vdev->dev; 1061 1062 /*check if destination is local VM*/ 1063 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) 1064 return; 1065 1066 if (vm2vm_mode == VM2VM_HARDWARE) { 1067 while (dev_ll != NULL) { 1068 if ((dev_ll->vdev->ready == DEVICE_RX) 1069 && ether_addr_cmp(&(pkt_hdr->d_addr), 1070 &dev_ll->vdev->mac_address)) { 1071 /* 1072 * Drop the packet if the TX packet is 1073 * destined for the TX device. 1074 */ 1075 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1076 LOG_DEBUG(VHOST_DATA, 1077 "(%"PRIu64") TX: Source and destination" 1078 " MAC addresses are the same. Dropping " 1079 "packet.\n", 1080 dev_ll->vdev->device_fh); 1081 return; 1082 } 1083 offset = 4; 1084 vlan_tag = 1085 (uint16_t) 1086 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1087 1088 LOG_DEBUG(VHOST_DATA, 1089 "(%"PRIu64") TX: pkt to local VM device id:" 1090 "(%"PRIu64") vlan tag: %d.\n", 1091 dev->device_fh, dev_ll->vdev->dev->device_fh, 1092 vlan_tag); 1093 1094 break; 1095 } 1096 dev_ll = dev_ll->next; 1097 } 1098 } 1099 1100 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1101 1102 /*Add packet to the port tx queue*/ 1103 tx_q = &lcore_tx_queue[lcore_id]; 1104 len = tx_q->len; 1105 1106 /* Allocate an mbuf and populate the structure. */ 1107 mbuf = rte_pktmbuf_alloc(mbuf_pool); 1108 if (unlikely(mbuf == NULL)) { 1109 RTE_LOG(ERR, VHOST_DATA, 1110 "Failed to allocate memory for mbuf.\n"); 1111 return; 1112 } 1113 1114 mbuf->data_len = m->data_len + VLAN_HLEN + offset; 1115 mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset; 1116 mbuf->nb_segs = m->nb_segs; 1117 1118 /* Copy ethernet header to mbuf. */ 1119 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1120 rte_pktmbuf_mtod(m, const void *), 1121 ETH_HLEN); 1122 1123 1124 /* Setup vlan header. Bytes need to be re-ordered for network with htons()*/ 1125 vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *); 1126 vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto; 1127 vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q); 1128 vlan_hdr->h_vlan_TCI = htons(vlan_tag); 1129 1130 /* Copy the remaining packet contents to the mbuf. */ 1131 rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN), 1132 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN), 1133 (m->data_len - ETH_HLEN)); 1134 1135 /* Copy the remaining segments for the whole packet. */ 1136 prev = mbuf; 1137 while (m->next) { 1138 /* Allocate an mbuf and populate the structure. */ 1139 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool); 1140 if (unlikely(next_mbuf == NULL)) { 1141 rte_pktmbuf_free(mbuf); 1142 RTE_LOG(ERR, VHOST_DATA, 1143 "Failed to allocate memory for mbuf.\n"); 1144 return; 1145 } 1146 1147 m = m->next; 1148 prev->next = next_mbuf; 1149 prev = next_mbuf; 1150 next_mbuf->data_len = m->data_len; 1151 1152 /* Copy data to next mbuf. 
 */
		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
			rte_pktmbuf_mtod(m, const void *), m->data_len);
	}

	tx_q->m_table[len] = mbuf;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();

		/* TX burst queue drain */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* TX any packets in the queue. */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						(struct rte_mbuf **)tx_q->m_table,
						(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked
		 * list and that no devices are in use, if requested.
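		 * The configuration core sets REQUEST_DEV_REMOVAL and then
		 * waits for this core to answer with ACK_DEV_REMOVAL before it
		 * unlinks a device, so the list head ll_root_used is re-read on
		 * every iteration of this loop rather than cached.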
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/* Process devices. */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* Get the virtio device. */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (vdev->remove) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX. */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}
				}
			}

			if (!vdev->remove) {
				/* Handle guest TX. */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first packet from the guest we need to learn its MAC and set up VMDQ. */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count--)
							rte_pktmbuf_free(pkts_burst[tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], mbuf_pool, (uint16_t)dev->device_fh);
			}

			/* Move to the next device in the list. */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets available ring indexes for zero copy RX;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
1343 */ 1344 static inline uint32_t __attribute__((always_inline)) 1345 get_available_ring_index_zcp(struct virtio_net *dev, 1346 uint16_t *res_base_idx, uint32_t count) 1347 { 1348 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1349 uint16_t avail_idx; 1350 uint32_t retry = 0; 1351 uint16_t free_entries; 1352 1353 *res_base_idx = vq->last_used_idx_res; 1354 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1355 free_entries = (avail_idx - *res_base_idx); 1356 1357 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1358 "avail idx: %d, " 1359 "res base idx:%d, free entries:%d\n", 1360 dev->device_fh, avail_idx, *res_base_idx, 1361 free_entries); 1362 1363 /* 1364 * If retry is enabled and the queue is full then we wait 1365 * and retry to avoid packet loss. 1366 */ 1367 if (enable_retry && unlikely(count > free_entries)) { 1368 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1369 rte_delay_us(burst_rx_delay_time); 1370 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1371 free_entries = (avail_idx - *res_base_idx); 1372 if (count <= free_entries) 1373 break; 1374 } 1375 } 1376 1377 /*check that we have enough buffers*/ 1378 if (unlikely(count > free_entries)) 1379 count = free_entries; 1380 1381 if (unlikely(count == 0)) { 1382 LOG_DEBUG(VHOST_DATA, 1383 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1384 "avail idx: %d, res base idx:%d, free entries:%d\n", 1385 dev->device_fh, avail_idx, 1386 *res_base_idx, free_entries); 1387 return 0; 1388 } 1389 1390 vq->last_used_idx_res = *res_base_idx + count; 1391 1392 return count; 1393 } 1394 1395 /* 1396 * This function put descriptor back to used list. 1397 */ 1398 static inline void __attribute__((always_inline)) 1399 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1400 { 1401 uint16_t res_cur_idx = vq->last_used_idx; 1402 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1403 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1404 rte_compiler_barrier(); 1405 *(volatile uint16_t *)&vq->used->idx += 1; 1406 vq->last_used_idx += 1; 1407 1408 /* Kick the guest if necessary. */ 1409 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1410 eventfd_write((int)vq->kickfd, 1); 1411 } 1412 1413 /* 1414 * This function get available descriptor from vitio vring and un-attached mbuf 1415 * from vpool->ring, and then attach them together. It needs adjust the offset 1416 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1417 * frame data may be put to wrong location in mbuf. 
1418 */ 1419 static inline void __attribute__((always_inline)) 1420 attach_rxmbuf_zcp(struct virtio_net *dev) 1421 { 1422 uint16_t res_base_idx, desc_idx; 1423 uint64_t buff_addr, phys_addr; 1424 struct vhost_virtqueue *vq; 1425 struct vring_desc *desc; 1426 struct rte_mbuf *mbuf = NULL; 1427 struct vpool *vpool; 1428 hpa_type addr_type; 1429 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1430 1431 vpool = &vpool_array[vdev->vmdq_rx_q]; 1432 vq = dev->virtqueue[VIRTIO_RXQ]; 1433 1434 do { 1435 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1436 1) != 1)) 1437 return; 1438 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1439 1440 desc = &vq->desc[desc_idx]; 1441 if (desc->flags & VRING_DESC_F_NEXT) { 1442 desc = &vq->desc[desc->next]; 1443 buff_addr = gpa_to_vva(dev, desc->addr); 1444 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1445 &addr_type); 1446 } else { 1447 buff_addr = gpa_to_vva(dev, 1448 desc->addr + vq->vhost_hlen); 1449 phys_addr = gpa_to_hpa(vdev, 1450 desc->addr + vq->vhost_hlen, 1451 desc->len, &addr_type); 1452 } 1453 1454 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1455 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1456 " address found when attaching RX frame buffer" 1457 " address!\n", dev->device_fh); 1458 put_desc_to_used_list_zcp(vq, desc_idx); 1459 continue; 1460 } 1461 1462 /* 1463 * Check if the frame buffer address from guest crosses 1464 * sub-region or not. 1465 */ 1466 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1467 RTE_LOG(ERR, VHOST_DATA, 1468 "(%"PRIu64") Frame buffer address cross " 1469 "sub-regioin found when attaching RX frame " 1470 "buffer address!\n", 1471 dev->device_fh); 1472 put_desc_to_used_list_zcp(vq, desc_idx); 1473 continue; 1474 } 1475 } while (unlikely(phys_addr == 0)); 1476 1477 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1478 if (unlikely(mbuf == NULL)) { 1479 LOG_DEBUG(VHOST_DATA, 1480 "(%"PRIu64") in attach_rxmbuf_zcp: " 1481 "ring_sc_dequeue fail.\n", 1482 dev->device_fh); 1483 put_desc_to_used_list_zcp(vq, desc_idx); 1484 return; 1485 } 1486 1487 if (unlikely(vpool->buf_size > desc->len)) { 1488 LOG_DEBUG(VHOST_DATA, 1489 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1490 "length(%d) of descriptor idx: %d less than room " 1491 "size required: %d\n", 1492 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1493 put_desc_to_used_list_zcp(vq, desc_idx); 1494 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1495 return; 1496 } 1497 1498 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1499 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1500 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1501 mbuf->data_len = desc->len; 1502 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1503 1504 LOG_DEBUG(VHOST_DATA, 1505 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1506 "descriptor idx:%d\n", 1507 dev->device_fh, res_base_idx, desc_idx); 1508 1509 __rte_mbuf_raw_free(mbuf); 1510 1511 return; 1512 } 1513 1514 /* 1515 * Detach an attched packet mbuf - 1516 * - restore original mbuf address and length values. 1517 * - reset pktmbuf data and data_len to their default values. 1518 * All other fields of the given packet mbuf will be left intact. 1519 * 1520 * @param m 1521 * The attached packet mbuf. 
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);

	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches each
 * mbuf from vpool->pool, detaches it and puts it back into vpool->ring. It also
 * updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is: %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is: %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before update, "
		"vq->last_used_idx: %d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after update, "
		"vq->last_used_idx: %d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick the guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed.
 * It fetches each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
1611 */ 1612 static void mbuf_destroy_zcp(struct vpool *vpool) 1613 { 1614 struct rte_mbuf *mbuf = NULL; 1615 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1616 1617 LOG_DEBUG(VHOST_CONFIG, 1618 "in mbuf_destroy_zcp: mbuf count in mempool before " 1619 "mbuf_destroy_zcp is: %d\n", 1620 mbuf_count); 1621 LOG_DEBUG(VHOST_CONFIG, 1622 "in mbuf_destroy_zcp: mbuf count in ring before " 1623 "mbuf_destroy_zcp is : %d\n", 1624 rte_ring_count(vpool->ring)); 1625 1626 for (index = 0; index < mbuf_count; index++) { 1627 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1628 if (likely(mbuf != NULL)) { 1629 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1630 pktmbuf_detach_zcp(mbuf); 1631 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1632 } 1633 } 1634 1635 LOG_DEBUG(VHOST_CONFIG, 1636 "in mbuf_destroy_zcp: mbuf count in mempool after " 1637 "mbuf_destroy_zcp is: %d\n", 1638 rte_mempool_count(vpool->pool)); 1639 LOG_DEBUG(VHOST_CONFIG, 1640 "in mbuf_destroy_zcp: mbuf count in ring after " 1641 "mbuf_destroy_zcp is : %d\n", 1642 rte_ring_count(vpool->ring)); 1643 } 1644 1645 /* 1646 * This function update the use flag and counter. 1647 */ 1648 static inline uint32_t __attribute__((always_inline)) 1649 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1650 uint32_t count) 1651 { 1652 struct vhost_virtqueue *vq; 1653 struct vring_desc *desc; 1654 struct rte_mbuf *buff; 1655 /* The virtio_hdr is initialised to 0. */ 1656 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1657 = {{0, 0, 0, 0, 0, 0}, 0}; 1658 uint64_t buff_hdr_addr = 0; 1659 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1660 uint32_t head_idx, packet_success = 0; 1661 uint16_t res_cur_idx; 1662 1663 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1664 1665 if (count == 0) 1666 return 0; 1667 1668 vq = dev->virtqueue[VIRTIO_RXQ]; 1669 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1670 1671 res_cur_idx = vq->last_used_idx; 1672 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1673 dev->device_fh, res_cur_idx, res_cur_idx + count); 1674 1675 /* Retrieve all of the head indexes first to avoid caching issues. */ 1676 for (head_idx = 0; head_idx < count; head_idx++) 1677 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1678 1679 /*Prefetch descriptor index. */ 1680 rte_prefetch0(&vq->desc[head[packet_success]]); 1681 1682 while (packet_success != count) { 1683 /* Get descriptor from available ring */ 1684 desc = &vq->desc[head[packet_success]]; 1685 1686 buff = pkts[packet_success]; 1687 LOG_DEBUG(VHOST_DATA, 1688 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1689 "pkt[%d] descriptor idx: %d\n", 1690 dev->device_fh, packet_success, 1691 MBUF_HEADROOM_UINT32(buff)); 1692 1693 PRINT_PACKET(dev, 1694 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1695 + RTE_PKTMBUF_HEADROOM), 1696 rte_pktmbuf_data_len(buff), 0); 1697 1698 /* Buffer address translation for virtio header. */ 1699 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1700 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1701 1702 /* 1703 * If the descriptors are chained the header and data are 1704 * placed in separate buffers. 
1705 */ 1706 if (desc->flags & VRING_DESC_F_NEXT) { 1707 desc->len = vq->vhost_hlen; 1708 desc = &vq->desc[desc->next]; 1709 desc->len = rte_pktmbuf_data_len(buff); 1710 } else { 1711 desc->len = packet_len; 1712 } 1713 1714 /* Update used ring with desc information */ 1715 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1716 = head[packet_success]; 1717 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1718 = packet_len; 1719 res_cur_idx++; 1720 packet_success++; 1721 1722 /* A header is required per buffer. */ 1723 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1724 (const void *)&virtio_hdr, vq->vhost_hlen); 1725 1726 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1727 1728 if (likely(packet_success < count)) { 1729 /* Prefetch descriptor index. */ 1730 rte_prefetch0(&vq->desc[head[packet_success]]); 1731 } 1732 } 1733 1734 rte_compiler_barrier(); 1735 1736 LOG_DEBUG(VHOST_DATA, 1737 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1738 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1739 dev->device_fh, vq->last_used_idx, vq->used->idx); 1740 1741 *(volatile uint16_t *)&vq->used->idx += count; 1742 vq->last_used_idx += count; 1743 1744 LOG_DEBUG(VHOST_DATA, 1745 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1746 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1747 dev->device_fh, vq->last_used_idx, vq->used->idx); 1748 1749 /* Kick the guest if necessary. */ 1750 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1751 eventfd_write((int)vq->kickfd, 1); 1752 1753 return count; 1754 } 1755 1756 /* 1757 * This function routes the TX packet to the correct interface. 1758 * This may be a local device or the physical port. 1759 */ 1760 static inline void __attribute__((always_inline)) 1761 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1762 uint32_t desc_idx, uint8_t need_copy) 1763 { 1764 struct mbuf_table *tx_q; 1765 struct rte_mbuf **m_table; 1766 struct rte_mbuf *mbuf = NULL; 1767 unsigned len, ret, offset = 0; 1768 struct vpool *vpool; 1769 struct virtio_net_data_ll *dev_ll = ll_root_used; 1770 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1771 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1772 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1773 1774 /*Add packet to the port tx queue*/ 1775 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1776 len = tx_q->len; 1777 1778 /* Allocate an mbuf and populate the structure. */ 1779 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1780 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1781 if (unlikely(mbuf == NULL)) { 1782 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1783 RTE_LOG(ERR, VHOST_DATA, 1784 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1785 dev->device_fh); 1786 put_desc_to_used_list_zcp(vq, desc_idx); 1787 return; 1788 } 1789 1790 if (vm2vm_mode == VM2VM_HARDWARE) { 1791 /* Avoid using a vlan tag from any vm for external pkt, such as 1792 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1793 * selection, MAC address determines it as an external pkt 1794 * which should go to network, while vlan tag determine it as 1795 * a vm2vm pkt should forward to another vm. Hardware confuse 1796 * such a ambiguous situation, so pkt will lost. 
1797 */ 1798 vlan_tag = external_pkt_default_vlan_tag; 1799 while (dev_ll != NULL) { 1800 if (likely(dev_ll->vdev->ready == DEVICE_RX) && 1801 ether_addr_cmp(&(pkt_hdr->d_addr), 1802 &dev_ll->vdev->mac_address)) { 1803 1804 /* 1805 * Drop the packet if the TX packet is destined 1806 * for the TX device. 1807 */ 1808 if (unlikely(dev_ll->vdev->dev->device_fh 1809 == dev->device_fh)) { 1810 LOG_DEBUG(VHOST_DATA, 1811 "(%"PRIu64") TX: Source and destination " 1812 "MAC addresses are the same. Dropping " 1813 "packet.\n", 1814 dev_ll->vdev->dev->device_fh); 1815 MBUF_HEADROOM_UINT32(mbuf) 1816 = (uint32_t)desc_idx; 1817 __rte_mbuf_raw_free(mbuf); 1818 return; 1819 } 1820 1821 /* 1822 * The packet length is offset by 4 bytes to account for the 1823 * HW VLAN strip when the L2 switch loops it back. 1824 */ 1825 offset = 4; 1826 vlan_tag = 1827 (uint16_t) 1828 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1829 1830 LOG_DEBUG(VHOST_DATA, 1831 "(%"PRIu64") TX: pkt to local VM device id:" 1832 "(%"PRIu64") vlan tag: %d.\n", 1833 dev->device_fh, dev_ll->vdev->dev->device_fh, 1834 vlan_tag); 1835 1836 break; 1837 } 1838 dev_ll = dev_ll->next; 1839 } 1840 } 1841 1842 mbuf->nb_segs = m->nb_segs; 1843 mbuf->next = m->next; 1844 mbuf->data_len = m->data_len + offset; 1845 mbuf->pkt_len = mbuf->data_len; 1846 if (unlikely(need_copy)) { 1847 /* Copy the packet contents to the mbuf. */ 1848 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1849 rte_pktmbuf_mtod(m, void *), 1850 m->data_len); 1851 } else { 1852 mbuf->data_off = m->data_off; 1853 mbuf->buf_physaddr = m->buf_physaddr; 1854 mbuf->buf_addr = m->buf_addr; 1855 } 1856 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1857 mbuf->vlan_tci = vlan_tag; 1858 mbuf->l2_len = sizeof(struct ether_hdr); 1859 mbuf->l3_len = sizeof(struct ipv4_hdr); 1860 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1861 1862 tx_q->m_table[len] = mbuf; 1863 len++; 1864 1865 LOG_DEBUG(VHOST_DATA, 1866 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1867 dev->device_fh, 1868 mbuf->nb_segs, 1869 (mbuf->next == NULL) ? "null" : "non-null"); 1870 1871 if (enable_stats) { 1872 dev_statistics[dev->device_fh].tx_total++; 1873 dev_statistics[dev->device_fh].tx++; 1874 } 1875 1876 if (unlikely(len == MAX_PKT_BURST)) { 1877 m_table = (struct rte_mbuf **)tx_q->m_table; 1878 ret = rte_eth_tx_burst(ports[0], 1879 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1880 1881 /* 1882 * Free any buffers not handled by TX and update 1883 * the port stats. 1884 */ 1885 if (unlikely(ret < len)) { 1886 do { 1887 rte_pktmbuf_free(m_table[ret]); 1888 } while (++ret < len); 1889 } 1890 1891 len = 0; 1892 txmbuf_clean_zcp(dev, vpool); 1893 } 1894 1895 tx_q->len = len; 1896 1897 return; 1898 } 1899 1900 /* 1901 * This function transmits all available packets in the virtio TX queue of one 1902 * virtio-net device. For the first packet it learns the MAC address and 1903 * sets up the VMDQ queue. 1904 */ 1905 static inline void __attribute__((always_inline)) 1906 virtio_dev_tx_zcp(struct virtio_net *dev) 1907 { 1908 struct rte_mbuf m; 1909 struct vhost_virtqueue *vq; 1910 struct vring_desc *desc; 1911 uint64_t buff_addr = 0, phys_addr; 1912 uint32_t head[MAX_PKT_BURST]; 1913 uint32_t i; 1914 uint16_t free_entries, packet_success = 0; 1915 uint16_t avail_idx; 1916 uint8_t need_copy = 0; 1917 hpa_type addr_type; 1918 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1919 1920 vq = dev->virtqueue[VIRTIO_TXQ]; 1921 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1922 1923 /* If there are no available buffers then return.
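 *
 * Note that avail->idx and last_used_idx_res are free-running 16-bit
 * counters, so the equality test below and the subtraction that follows
 * are correct across wrap-around. A hypothetical example (not from a
 * real trace):
 *
 *     avail_idx = 3, last_used_idx_res = 65533
 *     free_entries = (uint16_t)(3 - 65533) = 6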
*/ 1924 if (vq->last_used_idx_res == avail_idx) 1925 return; 1926 1927 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh); 1928 1929 /* Prefetch available ring to retrieve head indexes. */ 1930 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1931 1932 /* Get the number of free entries in the ring */ 1933 free_entries = (avail_idx - vq->last_used_idx_res); 1934 1935 /* Limit to MAX_PKT_BURST. */ 1936 free_entries 1937 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1938 1939 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1940 dev->device_fh, free_entries); 1941 1942 /* Retrieve all of the head indexes first to avoid caching issues. */ 1943 for (i = 0; i < free_entries; i++) 1944 head[i] 1945 = vq->avail->ring[(vq->last_used_idx_res + i) 1946 & (vq->size - 1)]; 1947 1948 vq->last_used_idx_res += free_entries; 1949 1950 /* Prefetch descriptor index. */ 1951 rte_prefetch0(&vq->desc[head[packet_success]]); 1952 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1953 1954 while (packet_success < free_entries) { 1955 desc = &vq->desc[head[packet_success]]; 1956 1957 /* Discard first buffer as it is the virtio header */ 1958 desc = &vq->desc[desc->next]; 1959 1960 /* Buffer address translation. */ 1961 buff_addr = gpa_to_vva(dev, desc->addr); 1962 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1963 1964 if (likely(packet_success < (free_entries - 1))) 1965 /* Prefetch descriptor index. */ 1966 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1967 1968 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1969 RTE_LOG(ERR, VHOST_DATA, 1970 "(%"PRIu64") Invalid frame buffer address found " 1971 "when transmitting packets!\n", 1972 dev->device_fh); 1973 packet_success++; 1974 continue; 1975 } 1976 1977 /* Prefetch buffer address. */ 1978 rte_prefetch0((void *)(uintptr_t)buff_addr); 1979 1980 /* 1981 * Setup dummy mbuf. This is copied to a real mbuf if 1982 * transmitted out the physical port. 1983 */ 1984 m.data_len = desc->len; 1985 m.nb_segs = 1; 1986 m.next = NULL; 1987 m.data_off = 0; 1988 m.buf_addr = (void *)(uintptr_t)buff_addr; 1989 m.buf_physaddr = phys_addr; 1990 1991 /* 1992 * Check whether the frame buffer address from the guest crosses 1993 * a sub-region boundary. 1994 */ 1995 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1996 RTE_LOG(ERR, VHOST_DATA, 1997 "(%"PRIu64") Frame buffer address crossing a " 1998 "sub-region found when attaching the TX frame " 1999 "buffer address!\n", 2000 dev->device_fh); 2001 need_copy = 1; 2002 } else 2003 need_copy = 0; 2004 2005 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2006 2007 /* 2008 * If this is the first received packet we need to learn 2009 * the MAC address and set up VMDQ. 2010 */ 2011 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2012 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2013 /* 2014 * Discard frame if device is scheduled for 2015 * removal or a duplicate MAC address is found. 2016 */ 2017 packet_success += free_entries; 2018 vq->last_used_idx += packet_success; 2019 break; 2020 } 2021 } 2022 2023 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2024 packet_success++; 2025 } 2026 } 2027 2028 /* 2029 * This function is called by each data core. It handles all RX/TX registered 2030 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2031 * addresses are compared with all devices in the main linked list.
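 *
 * The TX drain interval below is expressed in TSC cycles. As a purely
 * illustrative example (assuming a 2 GHz TSC, not a measured value):
 *
 *     drain_tsc = (2000000000 + 1000000 - 1) / 1000000 * 100
 *               = 2000 * 100 = 200000 cycles, i.e. roughly 100 us.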
2032 */ 2033 static int 2034 switch_worker_zcp(__attribute__((unused)) void *arg) 2035 { 2036 struct virtio_net *dev = NULL; 2037 struct vhost_dev *vdev = NULL; 2038 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2039 struct virtio_net_data_ll *dev_ll; 2040 struct mbuf_table *tx_q; 2041 volatile struct lcore_ll_info *lcore_ll; 2042 const uint64_t drain_tsc 2043 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2044 * BURST_TX_DRAIN_US; 2045 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2046 unsigned ret; 2047 const uint16_t lcore_id = rte_lcore_id(); 2048 uint16_t count_in_ring, rx_count = 0; 2049 2050 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id); 2051 2052 lcore_ll = lcore_info[lcore_id].lcore_ll; 2053 prev_tsc = 0; 2054 2055 while (1) { 2056 cur_tsc = rte_rdtsc(); 2057 2058 /* TX burst queue drain */ 2059 diff_tsc = cur_tsc - prev_tsc; 2060 if (unlikely(diff_tsc > drain_tsc)) { 2061 /* 2062 * Get mbufs from vpool.pool, detach them and 2063 * put them back into vpool.ring. 2064 */ 2065 dev_ll = lcore_ll->ll_root_used; 2066 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2067 /* Get virtio device ID */ 2068 vdev = dev_ll->vdev; 2069 dev = vdev->dev; 2070 2071 if (likely(!vdev->remove)) { 2072 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2073 if (tx_q->len) { 2074 LOG_DEBUG(VHOST_DATA, 2075 "TX queue drained after timeout" 2076 " with burst size %u\n", 2077 tx_q->len); 2078 2079 /* 2080 * Tx any packets in the queue 2081 */ 2082 ret = rte_eth_tx_burst( 2083 ports[0], 2084 (uint16_t)tx_q->txq_id, 2085 (struct rte_mbuf **) 2086 tx_q->m_table, 2087 (uint16_t)tx_q->len); 2088 if (unlikely(ret < tx_q->len)) { 2089 do { 2090 rte_pktmbuf_free( 2091 tx_q->m_table[ret]); 2092 } while (++ret < tx_q->len); 2093 } 2094 tx_q->len = 0; 2095 2096 txmbuf_clean_zcp(dev, 2097 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2098 } 2099 } 2100 dev_ll = dev_ll->next; 2101 } 2102 prev_tsc = cur_tsc; 2103 } 2104 2105 rte_prefetch0(lcore_ll->ll_root_used); 2106 2107 /* 2108 * Inform the configuration core that we have exited the linked 2109 * list and that no devices are in use if requested. 2110 */ 2111 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2112 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2113 2114 /* Process devices */ 2115 dev_ll = lcore_ll->ll_root_used; 2116 2117 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2118 vdev = dev_ll->vdev; 2119 dev = vdev->dev; 2120 if (unlikely(vdev->remove)) { 2121 dev_ll = dev_ll->next; 2122 unlink_vmdq(vdev); 2123 vdev->ready = DEVICE_SAFE_REMOVE; 2124 continue; 2125 } 2126 2127 if (likely(vdev->ready == DEVICE_RX)) { 2128 uint32_t index = vdev->vmdq_rx_q; 2129 uint16_t i; 2130 count_in_ring 2131 = rte_ring_count(vpool_array[index].ring); 2132 uint16_t free_entries 2133 = (uint16_t)get_available_ring_num_zcp(dev); 2134 2135 /* 2136 * Attach all mbufs in vpool.ring and put back 2137 * into vpool.pool.
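 *
 * The attach count is bounded by the free guest ring entries, the mbufs
 * currently in vpool.ring and MAX_PKT_BURST. For example (hypothetical
 * numbers): RTE_MIN(40, RTE_MIN(25, 32)) = 25 mbufs attached in this
 * iteration.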
2138 */ 2139 for (i = 0; 2140 i < RTE_MIN(free_entries, 2141 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2142 i++) 2143 attach_rxmbuf_zcp(dev); 2144 2145 /* Handle guest RX */ 2146 rx_count = rte_eth_rx_burst(ports[0], 2147 vdev->vmdq_rx_q, pkts_burst, 2148 MAX_PKT_BURST); 2149 2150 if (rx_count) { 2151 ret_count = virtio_dev_rx_zcp(dev, 2152 pkts_burst, rx_count); 2153 if (enable_stats) { 2154 dev_statistics[dev->device_fh].rx_total 2155 += rx_count; 2156 dev_statistics[dev->device_fh].rx 2157 += ret_count; 2158 } 2159 while (likely(rx_count)) { 2160 rx_count--; 2161 pktmbuf_detach_zcp( 2162 pkts_burst[rx_count]); 2163 rte_ring_sp_enqueue( 2164 vpool_array[index].ring, 2165 (void *)pkts_burst[rx_count]); 2166 } 2167 } 2168 } 2169 2170 if (likely(!vdev->remove)) 2171 /* Handle guest TX */ 2172 virtio_dev_tx_zcp(dev); 2173 2174 /* Move to the next device in the list */ 2175 dev_ll = dev_ll->next; 2176 } 2177 } 2178 2179 return 0; 2180 } 2181 2182 2183 /* 2184 * Add an entry to a used linked list. A free entry must first be found 2185 * in the free linked list using get_data_ll_free_entry(). 2186 */ 2187 static void 2188 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2189 struct virtio_net_data_ll *ll_dev) 2190 { 2191 struct virtio_net_data_ll *ll = *ll_root_addr; 2192 2193 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2194 ll_dev->next = NULL; 2195 rte_compiler_barrier(); 2196 2197 /* If ll == NULL then this is the first device. */ 2198 if (ll) { 2199 /* Increment to the tail of the linked list. */ 2200 while (ll->next != NULL) 2201 ll = ll->next; 2202 2203 ll->next = ll_dev; 2204 } else { 2205 *ll_root_addr = ll_dev; 2206 } 2207 } 2208 2209 /* 2210 * Remove an entry from a used linked list. The entry must then be added to 2211 * the free linked list using put_data_ll_free_entry(). 2212 */ 2213 static void 2214 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2215 struct virtio_net_data_ll *ll_dev, 2216 struct virtio_net_data_ll *ll_dev_last) 2217 { 2218 struct virtio_net_data_ll *ll = *ll_root_addr; 2219 2220 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2221 return; 2222 2223 if (ll_dev == ll) 2224 *ll_root_addr = ll_dev->next; 2225 else 2226 if (likely(ll_dev_last != NULL)) 2227 ll_dev_last->next = ll_dev->next; 2228 else 2229 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n"); 2230 } 2231 2232 /* 2233 * Find and return an entry from the free linked list. 2234 */ 2235 static struct virtio_net_data_ll * 2236 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2237 { 2238 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2239 struct virtio_net_data_ll *ll_dev; 2240 2241 if (ll_free == NULL) 2242 return NULL; 2243 2244 ll_dev = ll_free; 2245 *ll_root_addr = ll_free->next; 2246 2247 return ll_dev; 2248 } 2249 2250 /* 2251 * Place an entry back on to the free linked list. 2252 */ 2253 static void 2254 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2255 struct virtio_net_data_ll *ll_dev) 2256 { 2257 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2258 2259 if (ll_dev == NULL) 2260 return; 2261 2262 ll_dev->next = ll_free; 2263 *ll_root_addr = ll_dev; 2264 } 2265 2266 /* 2267 * Creates a linked list of a given size. 2268 */ 2269 static struct virtio_net_data_ll * 2270 alloc_data_ll(uint32_t size) 2271 { 2272 struct virtio_net_data_ll *ll_new; 2273 uint32_t i; 2274 2275 /* Malloc and then chain the linked list.
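 *
 * Sketch of the result for size == 3 (illustration only):
 *
 *     ll_new[0].next -> &ll_new[1]
 *     ll_new[1].next -> &ll_new[2]
 *     ll_new[2].next -> NULL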
*/ 2276 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2277 if (ll_new == NULL) { 2278 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2279 return NULL; 2280 } 2281 2282 for (i = 0; i < size - 1; i++) { 2283 ll_new[i].vdev = NULL; 2284 ll_new[i].next = &ll_new[i+1]; 2285 } 2286 ll_new[i].next = NULL; 2287 2288 return (ll_new); 2289 } 2290 2291 /* 2292 * Create the main linked list along with each individual core's linked list. A used and a free list 2293 * are created to manage entries. 2294 */ 2295 static int 2296 init_data_ll (void) 2297 { 2298 int lcore; 2299 2300 RTE_LCORE_FOREACH_SLAVE(lcore) { 2301 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2302 if (lcore_info[lcore].lcore_ll == NULL) { 2303 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2304 return -1; 2305 } 2306 2307 lcore_info[lcore].lcore_ll->device_num = 0; 2308 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2309 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2310 if (num_devices % num_switching_cores) 2311 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2312 else 2313 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2314 } 2315 2316 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2317 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2318 2319 return 0; 2320 } 2321 2322 /* 2323 * Set virtqueue flags so that we do not receive interrupts. 2324 */ 2325 static void 2326 set_irq_status (struct virtio_net *dev) 2327 { 2328 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2329 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2330 } 2331 2332 /* 2333 * Remove a device from the specific data core linked list and from the main linked list. Synchronization 2334 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2335 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2336 */ 2337 static void 2338 destroy_device (volatile struct virtio_net *dev) 2339 { 2340 struct virtio_net_data_ll *ll_lcore_dev_cur; 2341 struct virtio_net_data_ll *ll_main_dev_cur; 2342 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2343 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2344 struct vhost_dev *vdev; 2345 int lcore; 2346 2347 dev->flags &= ~VIRTIO_DEV_RUNNING; 2348 2349 vdev = (struct vhost_dev *)dev->priv; 2350 /* Set the remove flag.
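 *
 * Sketch of the removal hand-shake implemented below (config core on the
 * left, data core running switch_worker_zcp on the right):
 *
 *     vdev->remove = 1
 *     wait for DEVICE_SAFE_REMOVE        sees remove, unlinks VMDQ,
 *                                        sets ready = DEVICE_SAFE_REMOVE
 *     unlink entries from the used lists
 *     set REQUEST_DEV_REMOVAL            outside its list walk, sets
 *     wait for ACK_DEV_REMOVAL           dev_removal_flag = ACK_DEV_REMOVAL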
*/ 2351 vdev->remove = 1; 2352 while (vdev->ready != DEVICE_SAFE_REMOVE) { 2353 rte_pause(); 2354 } 2355 2356 /* Search for entry to be removed from lcore ll */ 2357 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2358 while (ll_lcore_dev_cur != NULL) { 2359 if (ll_lcore_dev_cur->vdev == vdev) { 2360 break; 2361 } else { 2362 ll_lcore_dev_last = ll_lcore_dev_cur; 2363 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2364 } 2365 } 2366 2367 if (ll_lcore_dev_cur == NULL) { 2368 RTE_LOG(ERR, VHOST_CONFIG, 2369 "(%"PRIu64") Failed to find the device to be destroyed.\n", 2370 dev->device_fh); 2371 return; 2372 } 2373 2374 /* Search for entry to be removed from main ll */ 2375 ll_main_dev_cur = ll_root_used; 2376 ll_main_dev_last = NULL; 2377 while (ll_main_dev_cur != NULL) { 2378 if (ll_main_dev_cur->vdev == vdev) { 2379 break; 2380 } else { 2381 ll_main_dev_last = ll_main_dev_cur; 2382 ll_main_dev_cur = ll_main_dev_cur->next; 2383 } 2384 } 2385 2386 /* Remove entries from the lcore and main ll. */ 2387 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2388 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2389 2390 /* Set the dev_removal_flag on each lcore. */ 2391 RTE_LCORE_FOREACH_SLAVE(lcore) { 2392 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2393 } 2394 2395 /* 2396 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2397 * they can no longer access the device removed from the linked lists and that the devices 2398 * are no longer in use. 2399 */ 2400 RTE_LCORE_FOREACH_SLAVE(lcore) { 2401 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2402 rte_pause(); 2403 } 2404 } 2405 2406 /* Add the entries back to the lcore and main free ll. */ 2407 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2408 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2409 2410 /* Decrement the number of devices on the lcore. */ 2411 lcore_info[vdev->coreid].lcore_ll->device_num--; 2412 2413 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2414 2415 if (zero_copy) { 2416 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2417 2418 /* Stop the RX queue. */ 2419 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2420 LOG_DEBUG(VHOST_CONFIG, 2421 "(%"PRIu64") In destroy_device: Failed to stop " 2422 "rx queue:%d\n", 2423 dev->device_fh, 2424 vdev->vmdq_rx_q); 2425 } 2426 2427 LOG_DEBUG(VHOST_CONFIG, 2428 "(%"PRIu64") in destroy_device: Start putting mbufs in " 2429 "mempool back to ring for RX queue: %d\n", 2430 dev->device_fh, vdev->vmdq_rx_q); 2431 2432 mbuf_destroy_zcp(vpool); 2433 2434 /* Stop the TX queue.
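 *
 * As elsewhere in this example, the RX pool for this device lives at
 * vpool_array[vmdq_rx_q] and the matching TX pool at
 * vpool_array[vmdq_rx_q + MAX_QUEUES]; e.g. vmdq_rx_q = 5 (hypothetical)
 * uses vpool_array[5] for RX and vpool_array[5 + MAX_QUEUES] for TX.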
*/ 2435 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2436 LOG_DEBUG(VHOST_CONFIG, 2437 "(%"PRIu64") In destroy_device: Failed to " 2438 "stop tx queue:%d\n", 2439 dev->device_fh, vdev->vmdq_rx_q); 2440 } 2441 2442 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2443 2444 LOG_DEBUG(VHOST_CONFIG, 2445 "(%"PRIu64") destroy_device: Start putting mbufs in mempool " 2446 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2447 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2448 dev->device_fh); 2449 2450 mbuf_destroy_zcp(vpool); 2451 rte_free(vdev->regions_hpa); 2452 } 2453 rte_free(vdev); 2454 2455 } 2456 2457 /* 2458 * Calculate the number of physically contiguous sub-regions for one particular 2459 * region whose vhost virtual address is contiguous. The region starts at 2460 * vva_start and is 'size' bytes long. 2461 */ 2462 static uint32_t 2463 check_hpa_regions(uint64_t vva_start, uint64_t size) 2464 { 2465 uint32_t i, nregions = 0, page_size = getpagesize(); 2466 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2467 if (vva_start % page_size) { 2468 LOG_DEBUG(VHOST_CONFIG, 2469 "in check_continuous: vva start(%p) mod page_size(%d) " 2470 "has remainder\n", 2471 (void *)(uintptr_t)vva_start, page_size); 2472 return 0; 2473 } 2474 if (size % page_size) { 2475 LOG_DEBUG(VHOST_CONFIG, 2476 "in check_continuous: " 2477 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2478 size, page_size); 2479 return 0; 2480 } 2481 for (i = 0; i < size - page_size; i = i + page_size) { 2482 cur_phys_addr 2483 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2484 next_phys_addr = rte_mem_virt2phy( 2485 (void *)(uintptr_t)(vva_start + i + page_size)); 2486 if ((cur_phys_addr + page_size) != next_phys_addr) { 2487 ++nregions; 2488 LOG_DEBUG(VHOST_CONFIG, 2489 "in check_continuous: hva addr:(%p) is not " 2490 "continuous with hva addr:(%p), diff:%d\n", 2491 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2492 (void *)(uintptr_t)(vva_start + (uint64_t)i 2493 + page_size), page_size); 2494 LOG_DEBUG(VHOST_CONFIG, 2495 "in check_continuous: hpa addr:(%p) is not " 2496 "continuous with hpa addr:(%p), " 2497 "diff:(%"PRIu64")\n", 2498 (void *)(uintptr_t)cur_phys_addr, 2499 (void *)(uintptr_t)next_phys_addr, 2500 (next_phys_addr-cur_phys_addr)); 2501 } 2502 } 2503 return nregions; 2504 } 2505 2506 /* 2507 * Divide each region whose vhost virtual address is contiguous into 2508 * sub-regions such that the physical addresses within each sub-region are 2509 * contiguous, and fill the offset (to the GPA), size and other information of each 2510 * sub-region into regions_hpa.
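 *
 * Worked example with made-up addresses (page_size = 4096): a GPA range
 * 0x0000-0x3fff whose first two pages map to HPA 0x100000-0x101fff and
 * whose last two pages map to HPA 0x300000-0x301fff is split into two
 * sub-regions:
 *
 *     sub-region 0: guest_phys_address = 0x0000, memory_size = 0x2000,
 *                   host_phys_addr_offset = 0x100000 - 0x0000
 *     sub-region 1: guest_phys_address = 0x2000, memory_size = 0x2000,
 *                   host_phys_addr_offset = 0x300000 - 0x2000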
2511 */ 2512 static uint32_t 2513 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2514 { 2515 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2516 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2517 2518 if (mem_region_hpa == NULL) 2519 return 0; 2520 2521 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2522 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2523 virtio_memory->regions[regionidx].address_offset; 2524 mem_region_hpa[regionidx_hpa].guest_phys_address 2525 = virtio_memory->regions[regionidx].guest_phys_address; 2526 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2527 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2528 mem_region_hpa[regionidx_hpa].guest_phys_address; 2529 LOG_DEBUG(VHOST_CONFIG, 2530 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2531 regionidx_hpa, 2532 (void *)(uintptr_t) 2533 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2534 LOG_DEBUG(VHOST_CONFIG, 2535 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2536 regionidx_hpa, 2537 (void *)(uintptr_t) 2538 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2539 for (i = 0, k = 0; 2540 i < virtio_memory->regions[regionidx].memory_size - 2541 page_size; 2542 i += page_size) { 2543 cur_phys_addr = rte_mem_virt2phy( 2544 (void *)(uintptr_t)(vva_start + i)); 2545 next_phys_addr = rte_mem_virt2phy( 2546 (void *)(uintptr_t)(vva_start + 2547 i + page_size)); 2548 if ((cur_phys_addr + page_size) != next_phys_addr) { 2549 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2550 mem_region_hpa[regionidx_hpa].guest_phys_address + 2551 k + page_size; 2552 mem_region_hpa[regionidx_hpa].memory_size 2553 = k + page_size; 2554 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2555 "phys addr end [%d]:(%p)\n", 2556 regionidx_hpa, 2557 (void *)(uintptr_t) 2558 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2559 LOG_DEBUG(VHOST_CONFIG, 2560 "in fill_hpa_regions: guest phys addr " 2561 "size [%d]:(%p)\n", 2562 regionidx_hpa, 2563 (void *)(uintptr_t) 2564 (mem_region_hpa[regionidx_hpa].memory_size)); 2565 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2566 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2567 ++regionidx_hpa; 2568 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2569 next_phys_addr - 2570 mem_region_hpa[regionidx_hpa].guest_phys_address; 2571 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2572 " phys addr start[%d]:(%p)\n", 2573 regionidx_hpa, 2574 (void *)(uintptr_t) 2575 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2576 LOG_DEBUG(VHOST_CONFIG, 2577 "in fill_hpa_regions: host phys addr " 2578 "start[%d]:(%p)\n", 2579 regionidx_hpa, 2580 (void *)(uintptr_t) 2581 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2582 k = 0; 2583 } else { 2584 k += page_size; 2585 } 2586 } 2587 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2588 = mem_region_hpa[regionidx_hpa].guest_phys_address 2589 + k + page_size; 2590 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2591 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2592 "[%d]:(%p)\n", regionidx_hpa, 2593 (void *)(uintptr_t) 2594 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2595 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2596 "[%d]:(%p)\n", regionidx_hpa, 2597 (void *)(uintptr_t) 2598 (mem_region_hpa[regionidx_hpa].memory_size)); 2599 ++regionidx_hpa; 2600 } 2601 return 
regionidx_hpa; 2602 } 2603 2604 /* 2605 * A new device is added to a data core. First the device is added to the main linked list 2606 * and then allocated to a specific data core. 2607 */ 2608 static int 2609 new_device (struct virtio_net *dev) 2610 { 2611 struct virtio_net_data_ll *ll_dev; 2612 int lcore, core_add = 0; 2613 uint32_t device_num_min = num_devices; 2614 struct vhost_dev *vdev; 2615 uint32_t regionidx; 2616 2617 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2618 if (vdev == NULL) { 2619 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2620 dev->device_fh); 2621 return -1; 2622 } 2623 vdev->dev = dev; 2624 dev->priv = vdev; 2625 2626 if (zero_copy) { 2627 vdev->nregions_hpa = dev->mem->nregions; 2628 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2629 vdev->nregions_hpa 2630 += check_hpa_regions( 2631 dev->mem->regions[regionidx].guest_phys_address 2632 + dev->mem->regions[regionidx].address_offset, 2633 dev->mem->regions[regionidx].memory_size); 2634 2635 } 2636 2637 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2638 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2639 CACHE_LINE_SIZE); 2640 if (vdev->regions_hpa == NULL) { 2641 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2642 rte_free(vdev); 2643 return -1; 2644 } 2645 2646 2647 if (fill_hpa_memory_regions( 2648 vdev->regions_hpa, dev->mem 2649 ) != vdev->nregions_hpa) { 2650 2651 RTE_LOG(ERR, VHOST_CONFIG, 2652 "hpa memory regions number mismatch: " 2653 "[%d]\n", vdev->nregions_hpa); 2654 rte_free(vdev->regions_hpa); 2655 rte_free(vdev); 2656 return -1; 2657 } 2658 } 2659 2660 2661 /* Add device to main ll */ 2662 ll_dev = get_data_ll_free_entry(&ll_root_free); 2663 if (ll_dev == NULL) { 2664 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2665 "of %d devices per core has been reached\n", 2666 dev->device_fh, num_devices); 2667 if (vdev->regions_hpa) 2668 rte_free(vdev->regions_hpa); 2669 rte_free(vdev); 2670 return -1; 2671 } 2672 ll_dev->vdev = vdev; 2673 add_data_ll_entry(&ll_root_used, ll_dev); 2674 vdev->vmdq_rx_q 2675 = dev->device_fh * (num_queues / num_devices); 2676 2677 if (zero_copy) { 2678 uint32_t index = vdev->vmdq_rx_q; 2679 uint32_t count_in_ring, i; 2680 struct mbuf_table *tx_q; 2681 2682 count_in_ring = rte_ring_count(vpool_array[index].ring); 2683 2684 LOG_DEBUG(VHOST_CONFIG, 2685 "(%"PRIu64") in new_device: mbuf count in mempool " 2686 "before attach is: %d\n", 2687 dev->device_fh, 2688 rte_mempool_count(vpool_array[index].pool)); 2689 LOG_DEBUG(VHOST_CONFIG, 2690 "(%"PRIu64") in new_device: mbuf count in ring " 2691 "before attach is : %d\n", 2692 dev->device_fh, count_in_ring); 2693 2694 /* 2695 * Attach all mbufs in vpool.ring and put back into vpool.pool.
*/ 2697 for (i = 0; i < count_in_ring; i++) 2698 attach_rxmbuf_zcp(dev); 2699 2700 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2701 "mempool after attach is: %d\n", 2702 dev->device_fh, 2703 rte_mempool_count(vpool_array[index].pool)); 2704 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2705 "ring after attach is : %d\n", 2706 dev->device_fh, 2707 rte_ring_count(vpool_array[index].ring)); 2708 2709 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2710 tx_q->txq_id = vdev->vmdq_rx_q; 2711 2712 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2713 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2714 2715 LOG_DEBUG(VHOST_CONFIG, 2716 "(%"PRIu64") In new_device: Failed to start " 2717 "tx queue:%d\n", 2718 dev->device_fh, vdev->vmdq_rx_q); 2719 2720 mbuf_destroy_zcp(vpool); 2721 rte_free(vdev->regions_hpa); 2722 rte_free(vdev); 2723 return -1; 2724 } 2725 2726 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2727 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2728 2729 LOG_DEBUG(VHOST_CONFIG, 2730 "(%"PRIu64") In new_device: Failed to start " 2731 "rx queue:%d\n", 2732 dev->device_fh, vdev->vmdq_rx_q); 2733 2734 /* Stop the TX queue. */ 2735 if (rte_eth_dev_tx_queue_stop(ports[0], 2736 vdev->vmdq_rx_q) != 0) { 2737 LOG_DEBUG(VHOST_CONFIG, 2738 "(%"PRIu64") In new_device: Failed to " 2739 "stop tx queue:%d\n", 2740 dev->device_fh, vdev->vmdq_rx_q); 2741 } 2742 2743 mbuf_destroy_zcp(vpool); 2744 rte_free(vdev->regions_hpa); 2745 rte_free(vdev); 2746 return -1; 2747 } 2748 2749 } 2750 2751 /* Reset the ready flag. */ 2752 vdev->ready = DEVICE_MAC_LEARNING; 2753 vdev->remove = 0; 2754 2755 /* Find a suitable lcore to add the device. */ 2756 RTE_LCORE_FOREACH_SLAVE(lcore) { 2757 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2758 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2759 core_add = lcore; 2760 } 2761 } 2762 /* Add device to lcore ll */ 2763 ll_dev->dev->coreid = core_add; 2764 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free); 2765 if (ll_dev == NULL) { 2766 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2767 vdev->ready = DEVICE_SAFE_REMOVE; 2768 destroy_device(dev); 2769 if (vdev->regions_hpa) 2770 rte_free(vdev->regions_hpa); 2771 rte_free(vdev); 2772 return -1; 2773 } 2774 ll_dev->vdev = vdev; 2775 vdev->coreid = core_add; 2776 2777 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev); 2778 2779 /* Initialize device stats */ 2780 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2781 2782 /* Disable notifications. */ 2783 set_irq_status(dev); 2784 lcore_info[vdev->coreid].lcore_ll->device_num++; 2785 dev->flags |= VIRTIO_DEV_RUNNING; 2786 2787 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2788 2789 return 0; 2790 } 2791 2792 /* 2793 * These callbacks allow devices to be added to the data core when configuration 2794 * has been fully completed. 2795 */ 2796 static const struct virtio_net_device_ops virtio_net_device_ops = 2797 { 2798 .new_device = new_device, 2799 .destroy_device = destroy_device, 2800 }; 2801 2802 /* 2803 * This is a thread that wakes up after a period to print stats if the user has 2804 * enabled them.
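 *
 * The screen is redrawn with raw ANSI escape sequences (27 is ESC):
 *
 *     clr      = ESC [ 2 J      -- clear the entire screen
 *     top_left = ESC [ 1 ; 1 H  -- move the cursor to row 1, column 1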
2805 */ 2806 static void 2807 print_stats(void) 2808 { 2809 struct virtio_net_data_ll *dev_ll; 2810 uint64_t tx_dropped, rx_dropped; 2811 uint64_t tx, tx_total, rx, rx_total; 2812 uint32_t device_fh; 2813 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2814 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2815 2816 while(1) { 2817 sleep(enable_stats); 2818 2819 /* Clear screen and move to top left */ 2820 printf("%s%s", clr, top_left); 2821 2822 printf("\nDevice statistics ===================================="); 2823 2824 dev_ll = ll_root_used; 2825 while (dev_ll != NULL) { 2826 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2827 tx_total = dev_statistics[device_fh].tx_total; 2828 tx = dev_statistics[device_fh].tx; 2829 tx_dropped = tx_total - tx; 2830 if (zero_copy == 0) { 2831 rx_total = rte_atomic64_read( 2832 &dev_statistics[device_fh].rx_total_atomic); 2833 rx = rte_atomic64_read( 2834 &dev_statistics[device_fh].rx_atomic); 2835 } else { 2836 rx_total = dev_statistics[device_fh].rx_total; 2837 rx = dev_statistics[device_fh].rx; 2838 } 2839 rx_dropped = rx_total - rx; 2840 2841 printf("\nStatistics for device %"PRIu32" ------------------------------" 2842 "\nTX total: %"PRIu64"" 2843 "\nTX dropped: %"PRIu64"" 2844 "\nTX successful: %"PRIu64"" 2845 "\nRX total: %"PRIu64"" 2846 "\nRX dropped: %"PRIu64"" 2847 "\nRX successful: %"PRIu64"", 2848 device_fh, 2849 tx_total, 2850 tx_dropped, 2851 tx, 2852 rx_total, 2853 rx_dropped, 2854 rx); 2855 2856 dev_ll = dev_ll->next; 2857 } 2858 printf("\n======================================================\n"); 2859 } 2860 } 2861 2862 static void 2863 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2864 char *ring_name, uint32_t nb_mbuf) 2865 { 2866 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2867 vpool_array[index].pool 2868 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2869 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2870 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2871 rte_pktmbuf_init, NULL, socket, 0); 2872 if (vpool_array[index].pool != NULL) { 2873 vpool_array[index].ring 2874 = rte_ring_create(ring_name, 2875 rte_align32pow2(nb_mbuf + 1), 2876 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2877 if (likely(vpool_array[index].ring != NULL)) { 2878 LOG_DEBUG(VHOST_CONFIG, 2879 "in setup_mempool_tbl: mbuf count in " 2880 "mempool is: %d\n", 2881 rte_mempool_count(vpool_array[index].pool)); 2882 LOG_DEBUG(VHOST_CONFIG, 2883 "in setup_mempool_tbl: mbuf count in " 2884 "ring is: %d\n", 2885 rte_ring_count(vpool_array[index].ring)); 2886 } else { 2887 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2888 ring_name); 2889 } 2890 2891 /* Need consider head room. */ 2892 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2893 } else { 2894 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2895 } 2896 } 2897 2898 2899 /* 2900 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2901 * device is also registered here to handle the IOCTLs. 
*/ 2903 int 2904 MAIN(int argc, char *argv[]) 2905 { 2906 struct rte_mempool *mbuf_pool = NULL; 2907 unsigned lcore_id, core_id = 0; 2908 unsigned nb_ports, valid_num_ports; 2909 int ret; 2910 uint8_t portid, queue_id = 0; 2911 static pthread_t tid; 2912 2913 /* init EAL */ 2914 ret = rte_eal_init(argc, argv); 2915 if (ret < 0) 2916 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2917 argc -= ret; 2918 argv += ret; 2919 2920 /* parse app arguments */ 2921 ret = us_vhost_parse_args(argc, argv); 2922 if (ret < 0) 2923 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2924 2925 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2926 if (rte_lcore_is_enabled(lcore_id)) 2927 lcore_ids[core_id ++] = lcore_id; 2928 2929 if (rte_lcore_count() > RTE_MAX_LCORE) 2930 rte_exit(EXIT_FAILURE, "Not enough cores\n"); 2931 2932 /* Set the number of switching cores available. */ 2933 num_switching_cores = rte_lcore_count()-1; 2934 2935 /* Get the number of physical ports. */ 2936 nb_ports = rte_eth_dev_count(); 2937 if (nb_ports > RTE_MAX_ETHPORTS) 2938 nb_ports = RTE_MAX_ETHPORTS; 2939 2940 /* 2941 * Update the global var NUM_PORTS and global array PORTS, 2942 * and get the value of VALID_NUM_PORTS according to the number of system ports. 2943 */ 2944 valid_num_ports = check_ports_num(nb_ports); 2945 2946 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2947 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, " 2948 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS); 2949 return -1; 2950 } 2951 2952 if (zero_copy == 0) { 2953 /* Create the mbuf pool. */ 2954 mbuf_pool = rte_mempool_create( 2955 "MBUF_POOL", 2956 NUM_MBUFS_PER_PORT 2957 * valid_num_ports, 2958 MBUF_SIZE, MBUF_CACHE_SIZE, 2959 sizeof(struct rte_pktmbuf_pool_private), 2960 rte_pktmbuf_pool_init, NULL, 2961 rte_pktmbuf_init, NULL, 2962 rte_socket_id(), 0); 2963 if (mbuf_pool == NULL) 2964 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2965 2966 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2967 vpool_array[queue_id].pool = mbuf_pool; 2968 2969 if (vm2vm_mode == VM2VM_HARDWARE) { 2970 /* Enable VT loop back to let the L2 switch do the forwarding. */ 2971 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2972 LOG_DEBUG(VHOST_CONFIG, 2973 "Enable loop back for L2 switch in vmdq.\n"); 2974 } 2975 } else { 2976 uint32_t nb_mbuf; 2977 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2978 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2979 2980 /* 2981 * Zero copy defers queue RX/TX start to the time when the guest 2982 * finishes its startup and packet buffers from that guest are 2983 * available.
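 *
 * Sizing sketch for the per-queue pools created below (hypothetical
 * example using the defaults in this file: num_rx_descriptor = 32,
 * MBUF_CACHE_SIZE_ZCP = 0, MAX_PKT_BURST = 32, and 3 switching cores):
 *
 *     nb_mbuf = 32 + 3 * 0 + 3 * 32 = 128 mbufs per RX pool.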
*/ 2985 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2986 rx_conf_default.rx_drop_en = 0; 2987 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2988 nb_mbuf = num_rx_descriptor 2989 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2990 + num_switching_cores * MAX_PKT_BURST; 2991 2992 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2993 snprintf(pool_name, sizeof(pool_name), 2994 "rxmbuf_pool_%u", queue_id); 2995 snprintf(ring_name, sizeof(ring_name), 2996 "rxmbuf_ring_%u", queue_id); 2997 setup_mempool_tbl(rte_socket_id(), queue_id, 2998 pool_name, ring_name, nb_mbuf); 2999 } 3000 3001 nb_mbuf = num_tx_descriptor 3002 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3003 + num_switching_cores * MAX_PKT_BURST; 3004 3005 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3006 snprintf(pool_name, sizeof(pool_name), 3007 "txmbuf_pool_%u", queue_id); 3008 snprintf(ring_name, sizeof(ring_name), 3009 "txmbuf_ring_%u", queue_id); 3010 setup_mempool_tbl(rte_socket_id(), 3011 (queue_id + MAX_QUEUES), 3012 pool_name, ring_name, nb_mbuf); 3013 } 3014 3015 if (vm2vm_mode == VM2VM_HARDWARE) { 3016 /* Enable VT loop back to let the L2 switch do the forwarding. */ 3017 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3018 LOG_DEBUG(VHOST_CONFIG, 3019 "Enable loop back for L2 switch in vmdq.\n"); 3020 } 3021 } 3022 /* Set log level. */ 3023 rte_set_log_level(LOG_LEVEL); 3024 3025 /* initialize all ports */ 3026 for (portid = 0; portid < nb_ports; portid++) { 3027 /* skip ports that are not enabled */ 3028 if ((enabled_port_mask & (1 << portid)) == 0) { 3029 RTE_LOG(INFO, VHOST_PORT, 3030 "Skipping disabled port %d\n", portid); 3031 continue; 3032 } 3033 if (port_init(portid) != 0) 3034 rte_exit(EXIT_FAILURE, 3035 "Cannot initialize network ports\n"); 3036 } 3037 3038 /* Initialise all linked lists. */ 3039 if (init_data_ll() == -1) 3040 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3041 3042 /* Initialize device stats */ 3043 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3044 3045 /* Enable stats if the user option is set. */ 3046 if (enable_stats) 3047 pthread_create(&tid, NULL, (void *)print_stats, NULL); 3048 3049 /* Launch all data cores. */ 3050 if (zero_copy == 0) { 3051 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3052 rte_eal_remote_launch(switch_worker, 3053 mbuf_pool, lcore_id); 3054 } 3055 } else { 3056 uint32_t count_in_mempool, index, i; 3057 for (index = 0; index < 2*MAX_QUEUES; index++) { 3058 /* For all RX and TX queues. */ 3059 count_in_mempool 3060 = rte_mempool_count(vpool_array[index].pool); 3061 3062 /* 3063 * Transfer all un-attached mbufs from vpool.pool 3064 * to vpool.ring. 3065 */ 3066 for (i = 0; i < count_in_mempool; i++) { 3067 struct rte_mbuf *mbuf 3068 = __rte_mbuf_raw_alloc( 3069 vpool_array[index].pool); 3070 rte_ring_sp_enqueue(vpool_array[index].ring, 3071 (void *)mbuf); 3072 } 3073 3074 LOG_DEBUG(VHOST_CONFIG, 3075 "in MAIN: mbuf count in mempool at initial " 3076 "is: %d\n", count_in_mempool); 3077 LOG_DEBUG(VHOST_CONFIG, 3078 "in MAIN: mbuf count in ring at initial is :" 3079 " %d\n", 3080 rte_ring_count(vpool_array[index].ring)); 3081 } 3082 3083 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3084 rte_eal_remote_launch(switch_worker_zcp, NULL, 3085 lcore_id); 3086 } 3087 3088 /* Register CUSE device to handle IOCTLs.
*/ 3089 ret = rte_vhost_driver_register((char *)&dev_basename); 3090 if (ret != 0) 3091 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3092 3093 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3094 3095 /* Start CUSE session. */ 3096 rte_vhost_driver_session_start(); 3097 return 0; 3098 3099 } 3100 3101