/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores*MAX_PKT_BURST) + \
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
				(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * For the zero copy implementation no frame data buffers need to be
 * allocated by the host; the guest allocates the frame data buffers and
 * vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need to be refined for the legacy and DPDK-based front
 * ends: take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
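
/*
 * Layout note (illustrative summary of how the array is used further down):
 * in the zero-copy path the first MAX_QUEUES entries of vpool_array back the
 * guest RX buffers (see attach_rxmbuf_zcp()), while entries
 * [MAX_QUEUES, 2*MAX_QUEUES) back the per-queue TX mbufs
 * (see virtio_tx_route_zcp()).
 */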

/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Specify the timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";


/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
	.rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty vmdq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is necessary for 1G NICs such as the I350; it fixes a
		 * bug where IPv4 forwarding in the guest could not forward
		 * packets from one virtio device to another virtio device.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/*
	 * The max pool number from dev_info is used to validate the pool
	 * number specified on the command line.
	 */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse number string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"	--vm2vm [0|1|2]\n"
	"	--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"	--dev-basename <name>\n"
	"	--nb-devices ND\n"
	"	-p PORTMASK: Set mask for ports to be used by application\n"
	"	--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"	--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"	--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
	"	--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
	"	--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"	--dev-basename: The basename to be used for the character device.\n"
	"	--zero-copy [0|1]: disable(default)/enable rx/tx "
		"zero copy\n"
	"	--rx-desc-num [0-N]: the number of descriptors on rx, "
		"used only when zero copy is enabled.\n"
	"	--tx-desc-num [0-N]: the number of descriptors on tx, "
		"used only when zero copy is enabled.\n",
	       prgname);
}
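
/*
 * Example invocation (illustrative only; the binary name and the EAL
 * -c/-n values depend on the build and the target machine):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 --stats 2
 *
 * i.e. enable port 0, software vm2vm switching, RX retries, and a stats
 * print interval of 2 seconds, as documented by us_vhost_usage() above.
 */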

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
						"zero copy vhost APP, please "
						"disable RTE_MBUF_REFCNT\n"
						"in config file and then rebuild DPDK "
						"core lib!\n"
						"Otherwise please disable zero copy "
						"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global variable num_ports and the ports array according to the
 * number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;


			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM,
 * and if so get its vlan tag and offset.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip reduces the packet length by the
			 * size of the vlan tag, so the packet length needs
			 * to be restored by adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	m->ol_flags = PKT_TX_VLAN_PKT;

	m->data_len += offset;
	m->pkt_len += offset;

	m->vlan_tci = vlan_tag;

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait
					 * and retry to avoid packet loss. Here MAX_PKT_BURST must
					 * be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count--)
							rte_pktmbuf_free(pkts_burst[tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets available ring indexes for zero copy rx;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts the descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offsets of buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be placed at the wrong
 * location in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}

/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed. It fetches mbufs
 * from vpool->pool, detaches them, and puts them into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function updates the used ring and related counters for zero copy rx.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a VLAN tag from any VM for an external packet,
		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
		 * during pool selection: the MAC address identifies it as an
		 * external packet that should go to the network, while the
		 * VLAN tag identifies it as a VM2VM packet that should be
		 * forwarded to another VM. The hardware cannot resolve such
		 * an ambiguous situation, so the packet would be lost.
1770 */ 1771 vlan_tag = external_pkt_default_vlan_tag; 1772 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1773 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1774 __rte_mbuf_raw_free(mbuf); 1775 return; 1776 } 1777 } 1778 1779 mbuf->nb_segs = m->nb_segs; 1780 mbuf->next = m->next; 1781 mbuf->data_len = m->data_len + offset; 1782 mbuf->pkt_len = mbuf->data_len; 1783 if (unlikely(need_copy)) { 1784 /* Copy the packet contents to the mbuf. */ 1785 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1786 rte_pktmbuf_mtod(m, void *), 1787 m->data_len); 1788 } else { 1789 mbuf->data_off = m->data_off; 1790 mbuf->buf_physaddr = m->buf_physaddr; 1791 mbuf->buf_addr = m->buf_addr; 1792 } 1793 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1794 mbuf->vlan_tci = vlan_tag; 1795 mbuf->l2_len = sizeof(struct ether_hdr); 1796 mbuf->l3_len = sizeof(struct ipv4_hdr); 1797 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1798 1799 tx_q->m_table[len] = mbuf; 1800 len++; 1801 1802 LOG_DEBUG(VHOST_DATA, 1803 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1804 dev->device_fh, 1805 mbuf->nb_segs, 1806 (mbuf->next == NULL) ? "null" : "non-null"); 1807 1808 if (enable_stats) { 1809 dev_statistics[dev->device_fh].tx_total++; 1810 dev_statistics[dev->device_fh].tx++; 1811 } 1812 1813 if (unlikely(len == MAX_PKT_BURST)) { 1814 m_table = (struct rte_mbuf **)tx_q->m_table; 1815 ret = rte_eth_tx_burst(ports[0], 1816 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1817 1818 /* 1819 * Free any buffers not handled by TX and update 1820 * the port stats. 1821 */ 1822 if (unlikely(ret < len)) { 1823 do { 1824 rte_pktmbuf_free(m_table[ret]); 1825 } while (++ret < len); 1826 } 1827 1828 len = 0; 1829 txmbuf_clean_zcp(dev, vpool); 1830 } 1831 1832 tx_q->len = len; 1833 1834 return; 1835 } 1836 1837 /* 1838 * This function TX all available packets in virtio TX queue for one 1839 * virtio-net device. If it is first packet, it learns MAC address and 1840 * setup VMDQ. 1841 */ 1842 static inline void __attribute__((always_inline)) 1843 virtio_dev_tx_zcp(struct virtio_net *dev) 1844 { 1845 struct rte_mbuf m; 1846 struct vhost_virtqueue *vq; 1847 struct vring_desc *desc; 1848 uint64_t buff_addr = 0, phys_addr; 1849 uint32_t head[MAX_PKT_BURST]; 1850 uint32_t i; 1851 uint16_t free_entries, packet_success = 0; 1852 uint16_t avail_idx; 1853 uint8_t need_copy = 0; 1854 hpa_type addr_type; 1855 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1856 1857 vq = dev->virtqueue[VIRTIO_TXQ]; 1858 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1859 1860 /* If there are no available buffers then return. */ 1861 if (vq->last_used_idx_res == avail_idx) 1862 return; 1863 1864 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1865 1866 /* Prefetch available ring to retrieve head indexes. */ 1867 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1868 1869 /* Get the number of free entries in the ring */ 1870 free_entries = (avail_idx - vq->last_used_idx_res); 1871 1872 /* Limit to MAX_PKT_BURST. */ 1873 free_entries 1874 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1875 1876 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1877 dev->device_fh, free_entries); 1878 1879 /* Retrieve all of the head indexes first to avoid caching issues. */ 1880 for (i = 0; i < free_entries; i++) 1881 head[i] 1882 = vq->avail->ring[(vq->last_used_idx_res + i) 1883 & (vq->size - 1)]; 1884 1885 vq->last_used_idx_res += free_entries; 1886 1887 /* Prefetch descriptor index. 
*/ 1888 rte_prefetch0(&vq->desc[head[packet_success]]); 1889 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1890 1891 while (packet_success < free_entries) { 1892 desc = &vq->desc[head[packet_success]]; 1893 1894 /* Discard first buffer as it is the virtio header */ 1895 desc = &vq->desc[desc->next]; 1896 1897 /* Buffer address translation. */ 1898 buff_addr = gpa_to_vva(dev, desc->addr); 1899 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1900 1901 if (likely(packet_success < (free_entries - 1))) 1902 /* Prefetch descriptor index. */ 1903 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1904 1905 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1906 RTE_LOG(ERR, VHOST_DATA, 1907 "(%"PRIu64") Invalid frame buffer address found" 1908 "when TX packets!\n", 1909 dev->device_fh); 1910 packet_success++; 1911 continue; 1912 } 1913 1914 /* Prefetch buffer address. */ 1915 rte_prefetch0((void *)(uintptr_t)buff_addr); 1916 1917 /* 1918 * Setup dummy mbuf. This is copied to a real mbuf if 1919 * transmitted out the physical port. 1920 */ 1921 m.data_len = desc->len; 1922 m.nb_segs = 1; 1923 m.next = NULL; 1924 m.data_off = 0; 1925 m.buf_addr = (void *)(uintptr_t)buff_addr; 1926 m.buf_physaddr = phys_addr; 1927 1928 /* 1929 * Check if the frame buffer address from guest crosses 1930 * sub-region or not. 1931 */ 1932 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1933 RTE_LOG(ERR, VHOST_DATA, 1934 "(%"PRIu64") Frame buffer address cross " 1935 "sub-regioin found when attaching TX frame " 1936 "buffer address!\n", 1937 dev->device_fh); 1938 need_copy = 1; 1939 } else 1940 need_copy = 0; 1941 1942 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1943 1944 /* 1945 * If this is the first received packet we need to learn 1946 * the MAC and setup VMDQ 1947 */ 1948 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1949 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1950 /* 1951 * Discard frame if device is scheduled for 1952 * removal or a duplicate MAC address is found. 1953 */ 1954 packet_success += free_entries; 1955 vq->last_used_idx += packet_success; 1956 break; 1957 } 1958 } 1959 1960 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1961 packet_success++; 1962 } 1963 } 1964 1965 /* 1966 * This function is called by each data core. It handles all RX/TX registered 1967 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1968 * addresses are compared with all devices in the main linked list. 1969 */ 1970 static int 1971 switch_worker_zcp(__attribute__((unused)) void *arg) 1972 { 1973 struct virtio_net *dev = NULL; 1974 struct vhost_dev *vdev = NULL; 1975 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1976 struct virtio_net_data_ll *dev_ll; 1977 struct mbuf_table *tx_q; 1978 volatile struct lcore_ll_info *lcore_ll; 1979 const uint64_t drain_tsc 1980 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 1981 * BURST_TX_DRAIN_US; 1982 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1983 unsigned ret; 1984 const uint16_t lcore_id = rte_lcore_id(); 1985 uint16_t count_in_ring, rx_count = 0; 1986 1987 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1988 1989 lcore_ll = lcore_info[lcore_id].lcore_ll; 1990 prev_tsc = 0; 1991 1992 while (1) { 1993 cur_tsc = rte_rdtsc(); 1994 1995 /* TX burst queue drain */ 1996 diff_tsc = cur_tsc - prev_tsc; 1997 if (unlikely(diff_tsc > drain_tsc)) { 1998 /* 1999 * Get mbuf from vpool.pool and detach mbuf and 2000 * put back into vpool.ring. 
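* (txmbuf_clean_zcp() below performs that recycling once any queued burst has been
* flushed.) The drain fires roughly every BURST_TX_DRAIN_US microseconds; with a
* 2 GHz TSC, for example, drain_tsc works out to about 2000 * 100 = 200000 cycles.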
2001 */ 2002 dev_ll = lcore_ll->ll_root_used; 2003 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2004 /* Get virtio device ID */ 2005 vdev = dev_ll->vdev; 2006 dev = vdev->dev; 2007 2008 if (likely(!vdev->remove)) { 2009 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2010 if (tx_q->len) { 2011 LOG_DEBUG(VHOST_DATA, 2012 "TX queue drained after timeout" 2013 " with burst size %u\n", 2014 tx_q->len); 2015 2016 /* 2017 * Tx any packets in the queue 2018 */ 2019 ret = rte_eth_tx_burst( 2020 ports[0], 2021 (uint16_t)tx_q->txq_id, 2022 (struct rte_mbuf **) 2023 tx_q->m_table, 2024 (uint16_t)tx_q->len); 2025 if (unlikely(ret < tx_q->len)) { 2026 do { 2027 rte_pktmbuf_free( 2028 tx_q->m_table[ret]); 2029 } while (++ret < tx_q->len); 2030 } 2031 tx_q->len = 0; 2032 2033 txmbuf_clean_zcp(dev, 2034 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2035 } 2036 } 2037 dev_ll = dev_ll->next; 2038 } 2039 prev_tsc = cur_tsc; 2040 } 2041 2042 rte_prefetch0(lcore_ll->ll_root_used); 2043 2044 /* 2045 * Inform the configuration core that we have exited the linked 2046 * list and that no devices are in use if requested. 2047 */ 2048 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2049 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2050 2051 /* Process devices */ 2052 dev_ll = lcore_ll->ll_root_used; 2053 2054 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2055 vdev = dev_ll->vdev; 2056 dev = vdev->dev; 2057 if (unlikely(vdev->remove)) { 2058 dev_ll = dev_ll->next; 2059 unlink_vmdq(vdev); 2060 vdev->ready = DEVICE_SAFE_REMOVE; 2061 continue; 2062 } 2063 2064 if (likely(vdev->ready == DEVICE_RX)) { 2065 uint32_t index = vdev->vmdq_rx_q; 2066 uint16_t i; 2067 count_in_ring 2068 = rte_ring_count(vpool_array[index].ring); 2069 uint16_t free_entries 2070 = (uint16_t)get_available_ring_num_zcp(dev); 2071 2072 /* 2073 * Attach all mbufs in vpool.ring and put back 2074 * into vpool.pool. 2075 */ 2076 for (i = 0; 2077 i < RTE_MIN(free_entries, 2078 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2079 i++) 2080 attach_rxmbuf_zcp(dev); 2081 2082 /* Handle guest RX */ 2083 rx_count = rte_eth_rx_burst(ports[0], 2084 vdev->vmdq_rx_q, pkts_burst, 2085 MAX_PKT_BURST); 2086 2087 if (rx_count) { 2088 ret_count = virtio_dev_rx_zcp(dev, 2089 pkts_burst, rx_count); 2090 if (enable_stats) { 2091 dev_statistics[dev->device_fh].rx_total 2092 += rx_count; 2093 dev_statistics[dev->device_fh].rx 2094 += ret_count; 2095 } 2096 while (likely(rx_count)) { 2097 rx_count--; 2098 pktmbuf_detach_zcp( 2099 pkts_burst[rx_count]); 2100 rte_ring_sp_enqueue( 2101 vpool_array[index].ring, 2102 (void *)pkts_burst[rx_count]); 2103 } 2104 } 2105 } 2106 2107 if (likely(!vdev->remove)) 2108 /* Handle guest TX */ 2109 virtio_dev_tx_zcp(dev); 2110 2111 /* Move to the next device in the list */ 2112 dev_ll = dev_ll->next; 2113 } 2114 } 2115 2116 return 0; 2117 } 2118 2119 2120 /* 2121 * Add an entry to a used linked list. A free entry must first be found 2122 * in the free linked list using get_data_ll_free_entry(); 2123 */ 2124 static void 2125 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2126 struct virtio_net_data_ll *ll_dev) 2127 { 2128 struct virtio_net_data_ll *ll = *ll_root_addr; 2129 2130 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2131 ll_dev->next = NULL; 2132 rte_compiler_barrier(); 2133 2134 /* If ll == NULL then this is the first device. */ 2135 if (ll) { 2136 /* Increment to the tail of the linked list. 
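* Appending is O(n) in the number of devices, which is acceptable because it only
* happens when a device is added; the compiler barrier above ensures a data core
* walking the list never sees the new entry before its next pointer has been cleared.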
*/ 2137 while ((ll->next != NULL) ) 2138 ll = ll->next; 2139 2140 ll->next = ll_dev; 2141 } else { 2142 *ll_root_addr = ll_dev; 2143 } 2144 } 2145 2146 /* 2147 * Remove an entry from a used linked list. The entry must then be added to 2148 * the free linked list using put_data_ll_free_entry(). 2149 */ 2150 static void 2151 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2152 struct virtio_net_data_ll *ll_dev, 2153 struct virtio_net_data_ll *ll_dev_last) 2154 { 2155 struct virtio_net_data_ll *ll = *ll_root_addr; 2156 2157 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2158 return; 2159 2160 if (ll_dev == ll) 2161 *ll_root_addr = ll_dev->next; 2162 else 2163 if (likely(ll_dev_last != NULL)) 2164 ll_dev_last->next = ll_dev->next; 2165 else 2166 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2167 } 2168 2169 /* 2170 * Find and return an entry from the free linked list. 2171 */ 2172 static struct virtio_net_data_ll * 2173 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2174 { 2175 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2176 struct virtio_net_data_ll *ll_dev; 2177 2178 if (ll_free == NULL) 2179 return NULL; 2180 2181 ll_dev = ll_free; 2182 *ll_root_addr = ll_free->next; 2183 2184 return ll_dev; 2185 } 2186 2187 /* 2188 * Place an entry back on to the free linked list. 2189 */ 2190 static void 2191 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2192 struct virtio_net_data_ll *ll_dev) 2193 { 2194 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2195 2196 if (ll_dev == NULL) 2197 return; 2198 2199 ll_dev->next = ll_free; 2200 *ll_root_addr = ll_dev; 2201 } 2202 2203 /* 2204 * Creates a linked list of a given size. 2205 */ 2206 static struct virtio_net_data_ll * 2207 alloc_data_ll(uint32_t size) 2208 { 2209 struct virtio_net_data_ll *ll_new; 2210 uint32_t i; 2211 2212 /* Malloc and then chain the linked list. */ 2213 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2214 if (ll_new == NULL) { 2215 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2216 return NULL; 2217 } 2218 2219 for (i = 0; i < size - 1; i++) { 2220 ll_new[i].vdev = NULL; 2221 ll_new[i].next = &ll_new[i+1]; 2222 } 2223 ll_new[i].next = NULL; 2224 2225 return (ll_new); 2226 } 2227 2228 /* 2229 * Create the main linked list along with each individual cores linked list. A used and a free list 2230 * are created to manage entries. 2231 */ 2232 static int 2233 init_data_ll (void) 2234 { 2235 int lcore; 2236 2237 RTE_LCORE_FOREACH_SLAVE(lcore) { 2238 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2239 if (lcore_info[lcore].lcore_ll == NULL) { 2240 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2241 return -1; 2242 } 2243 2244 lcore_info[lcore].lcore_ll->device_num = 0; 2245 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2246 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2247 if (num_devices % num_switching_cores) 2248 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2249 else 2250 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2251 } 2252 2253 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2254 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2255 2256 return 0; 2257 } 2258 2259 /* 2260 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2261 * occurs through the use of the lcore dev_removal_flag. 
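* Each data core acknowledges a removal request only from the top of its polling loop,
* after it has finished walking its device list, so waiting for every core to set
* ACK_DEV_REMOVAL guarantees the removed entry is no longer referenced.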
Device is made volatile here to avoid re-ordering 2262 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2263 */ 2264 static void 2265 destroy_device (volatile struct virtio_net *dev) 2266 { 2267 struct virtio_net_data_ll *ll_lcore_dev_cur; 2268 struct virtio_net_data_ll *ll_main_dev_cur; 2269 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2270 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2271 struct vhost_dev *vdev; 2272 int lcore; 2273 2274 dev->flags &= ~VIRTIO_DEV_RUNNING; 2275 2276 vdev = (struct vhost_dev *)dev->priv; 2277 /*set the remove flag. */ 2278 vdev->remove = 1; 2279 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2280 rte_pause(); 2281 } 2282 2283 /* Search for entry to be removed from lcore ll */ 2284 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2285 while (ll_lcore_dev_cur != NULL) { 2286 if (ll_lcore_dev_cur->vdev == vdev) { 2287 break; 2288 } else { 2289 ll_lcore_dev_last = ll_lcore_dev_cur; 2290 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2291 } 2292 } 2293 2294 if (ll_lcore_dev_cur == NULL) { 2295 RTE_LOG(ERR, VHOST_CONFIG, 2296 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2297 dev->device_fh); 2298 return; 2299 } 2300 2301 /* Search for entry to be removed from main ll */ 2302 ll_main_dev_cur = ll_root_used; 2303 ll_main_dev_last = NULL; 2304 while (ll_main_dev_cur != NULL) { 2305 if (ll_main_dev_cur->vdev == vdev) { 2306 break; 2307 } else { 2308 ll_main_dev_last = ll_main_dev_cur; 2309 ll_main_dev_cur = ll_main_dev_cur->next; 2310 } 2311 } 2312 2313 /* Remove entries from the lcore and main ll. */ 2314 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2315 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2316 2317 /* Set the dev_removal_flag on each lcore. */ 2318 RTE_LCORE_FOREACH_SLAVE(lcore) { 2319 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2320 } 2321 2322 /* 2323 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2324 * they can no longer access the device removed from the linked lists and that the devices 2325 * are no longer in use. 2326 */ 2327 RTE_LCORE_FOREACH_SLAVE(lcore) { 2328 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2329 rte_pause(); 2330 } 2331 } 2332 2333 /* Add the entries back to the lcore and main free ll.*/ 2334 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2335 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2336 2337 /* Decrement number of device on the lcore. */ 2338 lcore_info[vdev->coreid].lcore_ll->device_num--; 2339 2340 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2341 2342 if (zero_copy) { 2343 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2344 2345 /* Stop the RX queue. */ 2346 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2347 LOG_DEBUG(VHOST_CONFIG, 2348 "(%"PRIu64") In destroy_device: Failed to stop " 2349 "rx queue:%d\n", 2350 dev->device_fh, 2351 vdev->vmdq_rx_q); 2352 } 2353 2354 LOG_DEBUG(VHOST_CONFIG, 2355 "(%"PRIu64") in destroy_device: Start put mbuf in " 2356 "mempool back to ring for RX queue: %d\n", 2357 dev->device_fh, vdev->vmdq_rx_q); 2358 2359 mbuf_destroy_zcp(vpool); 2360 2361 /* Stop the TX queue. 
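* As with the RX queue above: under zero copy these mbufs reference the departing
* guest's buffers, so the queue is stopped and every mbuf is detached and returned to
* the ring before regions_hpa is freed.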
*/ 2362 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2363 LOG_DEBUG(VHOST_CONFIG, 2364 "(%"PRIu64") In destroy_device: Failed to " 2365 "stop tx queue:%d\n", 2366 dev->device_fh, vdev->vmdq_rx_q); 2367 } 2368 2369 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2370 2371 LOG_DEBUG(VHOST_CONFIG, 2372 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2373 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2374 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2375 dev->device_fh); 2376 2377 mbuf_destroy_zcp(vpool); 2378 rte_free(vdev->regions_hpa); 2379 } 2380 rte_free(vdev); 2381 2382 } 2383 2384 /* 2385 * Calculate the number of physically contiguous sub-regions within one 2386 * region whose vhost virtual address range is contiguous. The region 2387 * starts at vva_start and is 'size' bytes long. 2388 */ 2389 static uint32_t 2390 check_hpa_regions(uint64_t vva_start, uint64_t size) 2391 { 2392 uint32_t i, nregions = 0, page_size = getpagesize(); 2393 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2394 if (vva_start % page_size) { 2395 LOG_DEBUG(VHOST_CONFIG, 2396 "in check_continuous: vva start(%p) mod page_size(%d) " 2397 "has remainder\n", 2398 (void *)(uintptr_t)vva_start, page_size); 2399 return 0; 2400 } 2401 if (size % page_size) { 2402 LOG_DEBUG(VHOST_CONFIG, 2403 "in check_continuous: " 2404 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2405 size, page_size); 2406 return 0; 2407 } 2408 for (i = 0; i < size - page_size; i = i + page_size) { 2409 cur_phys_addr 2410 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2411 next_phys_addr = rte_mem_virt2phy( 2412 (void *)(uintptr_t)(vva_start + i + page_size)); 2413 if ((cur_phys_addr + page_size) != next_phys_addr) { 2414 ++nregions; 2415 LOG_DEBUG(VHOST_CONFIG, 2416 "in check_continuous: hva addr:(%p) is not " 2417 "continuous with hva addr:(%p), diff:%d\n", 2418 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2419 (void *)(uintptr_t)(vva_start + (uint64_t)i 2420 + page_size), page_size); 2421 LOG_DEBUG(VHOST_CONFIG, 2422 "in check_continuous: hpa addr:(%p) is not " 2423 "continuous with hpa addr:(%p), " 2424 "diff:(%"PRIu64")\n", 2425 (void *)(uintptr_t)cur_phys_addr, 2426 (void *)(uintptr_t)next_phys_addr, 2427 (next_phys_addr-cur_phys_addr)); 2428 } 2429 } 2430 return nregions; 2431 } 2432 2433 /* 2434 * Divide each region whose vhost virtual address range is contiguous into 2435 * sub-regions within which the host physical addresses are also contiguous, 2436 * and fill the offset (relative to the GPA), the size and other information 2437 * of each sub-region into regions_hpa.
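* Illustrative example (hypothetical addresses): a 16 KB guest region at GPA 0x10000
* whose third 4 KB page is not physically adjacent to the second would be described by
* two entries:
*   regions_hpa[0]: guest_phys_address 0x10000, guest_phys_address_end 0x12000,
*                   memory_size 0x2000, host_phys_addr_offset = HPA(page 0) - 0x10000
*   regions_hpa[1]: guest_phys_address 0x12000, guest_phys_address_end 0x14000,
*                   memory_size 0x2000, host_phys_addr_offset = HPA(page 2) - 0x12000
* so that within any sub-region HPA = GPA + host_phys_addr_offset.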
2438 */ 2439 static uint32_t 2440 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2441 { 2442 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2443 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2444 2445 if (mem_region_hpa == NULL) 2446 return 0; 2447 2448 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2449 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2450 virtio_memory->regions[regionidx].address_offset; 2451 mem_region_hpa[regionidx_hpa].guest_phys_address 2452 = virtio_memory->regions[regionidx].guest_phys_address; 2453 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2454 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2455 mem_region_hpa[regionidx_hpa].guest_phys_address; 2456 LOG_DEBUG(VHOST_CONFIG, 2457 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2458 regionidx_hpa, 2459 (void *)(uintptr_t) 2460 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2461 LOG_DEBUG(VHOST_CONFIG, 2462 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2463 regionidx_hpa, 2464 (void *)(uintptr_t) 2465 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2466 for (i = 0, k = 0; 2467 i < virtio_memory->regions[regionidx].memory_size - 2468 page_size; 2469 i += page_size) { 2470 cur_phys_addr = rte_mem_virt2phy( 2471 (void *)(uintptr_t)(vva_start + i)); 2472 next_phys_addr = rte_mem_virt2phy( 2473 (void *)(uintptr_t)(vva_start + 2474 i + page_size)); 2475 if ((cur_phys_addr + page_size) != next_phys_addr) { 2476 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2477 mem_region_hpa[regionidx_hpa].guest_phys_address + 2478 k + page_size; 2479 mem_region_hpa[regionidx_hpa].memory_size 2480 = k + page_size; 2481 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2482 "phys addr end [%d]:(%p)\n", 2483 regionidx_hpa, 2484 (void *)(uintptr_t) 2485 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2486 LOG_DEBUG(VHOST_CONFIG, 2487 "in fill_hpa_regions: guest phys addr " 2488 "size [%d]:(%p)\n", 2489 regionidx_hpa, 2490 (void *)(uintptr_t) 2491 (mem_region_hpa[regionidx_hpa].memory_size)); 2492 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2493 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2494 ++regionidx_hpa; 2495 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2496 next_phys_addr - 2497 mem_region_hpa[regionidx_hpa].guest_phys_address; 2498 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2499 " phys addr start[%d]:(%p)\n", 2500 regionidx_hpa, 2501 (void *)(uintptr_t) 2502 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2503 LOG_DEBUG(VHOST_CONFIG, 2504 "in fill_hpa_regions: host phys addr " 2505 "start[%d]:(%p)\n", 2506 regionidx_hpa, 2507 (void *)(uintptr_t) 2508 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2509 k = 0; 2510 } else { 2511 k += page_size; 2512 } 2513 } 2514 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2515 = mem_region_hpa[regionidx_hpa].guest_phys_address 2516 + k + page_size; 2517 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2518 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2519 "[%d]:(%p)\n", regionidx_hpa, 2520 (void *)(uintptr_t) 2521 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2522 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2523 "[%d]:(%p)\n", regionidx_hpa, 2524 (void *)(uintptr_t) 2525 (mem_region_hpa[regionidx_hpa].memory_size)); 2526 ++regionidx_hpa; 2527 } 2528 return 
regionidx_hpa; 2529 } 2530 2531 /* 2532 * A new device is added to a data core. First the device is added to the main linked list 2533 * and the allocated to a specific data core. 2534 */ 2535 static int 2536 new_device (struct virtio_net *dev) 2537 { 2538 struct virtio_net_data_ll *ll_dev; 2539 int lcore, core_add = 0; 2540 uint32_t device_num_min = num_devices; 2541 struct vhost_dev *vdev; 2542 uint32_t regionidx; 2543 2544 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2545 if (vdev == NULL) { 2546 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2547 dev->device_fh); 2548 return -1; 2549 } 2550 vdev->dev = dev; 2551 dev->priv = vdev; 2552 2553 if (zero_copy) { 2554 vdev->nregions_hpa = dev->mem->nregions; 2555 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2556 vdev->nregions_hpa 2557 += check_hpa_regions( 2558 dev->mem->regions[regionidx].guest_phys_address 2559 + dev->mem->regions[regionidx].address_offset, 2560 dev->mem->regions[regionidx].memory_size); 2561 2562 } 2563 2564 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2565 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2566 CACHE_LINE_SIZE); 2567 if (vdev->regions_hpa == NULL) { 2568 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2569 rte_free(vdev); 2570 return -1; 2571 } 2572 2573 2574 if (fill_hpa_memory_regions( 2575 vdev->regions_hpa, dev->mem 2576 ) != vdev->nregions_hpa) { 2577 2578 RTE_LOG(ERR, VHOST_CONFIG, 2579 "hpa memory regions number mismatch: " 2580 "[%d]\n", vdev->nregions_hpa); 2581 rte_free(vdev->regions_hpa); 2582 rte_free(vdev); 2583 return -1; 2584 } 2585 } 2586 2587 2588 /* Add device to main ll */ 2589 ll_dev = get_data_ll_free_entry(&ll_root_free); 2590 if (ll_dev == NULL) { 2591 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2592 "of %d devices per core has been reached\n", 2593 dev->device_fh, num_devices); 2594 if (vdev->regions_hpa) 2595 rte_free(vdev->regions_hpa); 2596 rte_free(vdev); 2597 return -1; 2598 } 2599 ll_dev->vdev = vdev; 2600 add_data_ll_entry(&ll_root_used, ll_dev); 2601 vdev->vmdq_rx_q 2602 = dev->device_fh * (num_queues / num_devices); 2603 2604 if (zero_copy) { 2605 uint32_t index = vdev->vmdq_rx_q; 2606 uint32_t count_in_ring, i; 2607 struct mbuf_table *tx_q; 2608 2609 count_in_ring = rte_ring_count(vpool_array[index].ring); 2610 2611 LOG_DEBUG(VHOST_CONFIG, 2612 "(%"PRIu64") in new_device: mbuf count in mempool " 2613 "before attach is: %d\n", 2614 dev->device_fh, 2615 rte_mempool_count(vpool_array[index].pool)); 2616 LOG_DEBUG(VHOST_CONFIG, 2617 "(%"PRIu64") in new_device: mbuf count in ring " 2618 "before attach is : %d\n", 2619 dev->device_fh, count_in_ring); 2620 2621 /* 2622 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
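* The intent is that each pre-allocated mbuf is bound to a guest-provided RX buffer
* here, so the hardware RX queue started below begins life fully populated with guest
* memory.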
2623 */ 2624 for (i = 0; i < count_in_ring; i++) 2625 attach_rxmbuf_zcp(dev); 2626 2627 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2628 "mempool after attach is: %d\n", 2629 dev->device_fh, 2630 rte_mempool_count(vpool_array[index].pool)); 2631 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2632 "ring after attach is : %d\n", 2633 dev->device_fh, 2634 rte_ring_count(vpool_array[index].ring)); 2635 2636 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2637 tx_q->txq_id = vdev->vmdq_rx_q; 2638 2639 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2640 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2641 2642 LOG_DEBUG(VHOST_CONFIG, 2643 "(%"PRIu64") In new_device: Failed to start " 2644 "tx queue:%d\n", 2645 dev->device_fh, vdev->vmdq_rx_q); 2646 2647 mbuf_destroy_zcp(vpool); 2648 rte_free(vdev->regions_hpa); 2649 rte_free(vdev); 2650 return -1; 2651 } 2652 2653 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2654 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2655 2656 LOG_DEBUG(VHOST_CONFIG, 2657 "(%"PRIu64") In new_device: Failed to start " 2658 "rx queue:%d\n", 2659 dev->device_fh, vdev->vmdq_rx_q); 2660 2661 /* Stop the TX queue. */ 2662 if (rte_eth_dev_tx_queue_stop(ports[0], 2663 vdev->vmdq_rx_q) != 0) { 2664 LOG_DEBUG(VHOST_CONFIG, 2665 "(%"PRIu64") In new_device: Failed to " 2666 "stop tx queue:%d\n", 2667 dev->device_fh, vdev->vmdq_rx_q); 2668 } 2669 2670 mbuf_destroy_zcp(vpool); 2671 rte_free(vdev->regions_hpa); 2672 rte_free(vdev); 2673 return -1; 2674 } 2675 2676 } 2677 2678 /* Reset the ready flag. */ 2679 vdev->ready = DEVICE_MAC_LEARNING; 2680 vdev->remove = 0; 2681 2682 /* Find a suitable lcore to add the device. */ 2683 RTE_LCORE_FOREACH_SLAVE(lcore) { 2684 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2685 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2686 core_add = lcore; 2687 } 2688 } 2689 /* Add device to lcore ll */ 2690 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2691 if (ll_dev == NULL) { 2692 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2693 vdev->ready = DEVICE_SAFE_REMOVE; 2694 destroy_device(dev); 2695 if (vdev->regions_hpa) 2696 rte_free(vdev->regions_hpa); 2697 rte_free(vdev); 2698 return -1; 2699 } 2700 ll_dev->vdev = vdev; 2701 vdev->coreid = core_add; 2702 2703 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2704 2705 /* Initialize device stats */ 2706 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2707 2708 /* Disable notifications. */ 2709 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2710 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2711 lcore_info[vdev->coreid].lcore_ll->device_num++; 2712 dev->flags |= VIRTIO_DEV_RUNNING; 2713 2714 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2715 2716 return 0; 2717 } 2718 2719 /* 2720 * These callbacks allow devices to be added to a data core once their 2721 * configuration has been fully completed. 2722 */ 2723 static const struct virtio_net_device_ops virtio_net_device_ops = 2724 { 2725 .new_device = new_device, 2726 .destroy_device = destroy_device, 2727 }; 2728 2729 /* 2730 * This is a thread that wakes up periodically to print stats if the user has 2731 * enabled them.
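* It sleeps for enable_stats seconds between refreshes; the RX counters are read with
* rte_atomic64_read() in the copy path but as plain fields under zero copy, matching
* how the data cores update them.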
2732 */ 2733 static void 2734 print_stats(void) 2735 { 2736 struct virtio_net_data_ll *dev_ll; 2737 uint64_t tx_dropped, rx_dropped; 2738 uint64_t tx, tx_total, rx, rx_total; 2739 uint32_t device_fh; 2740 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2741 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2742 2743 while(1) { 2744 sleep(enable_stats); 2745 2746 /* Clear screen and move to top left */ 2747 printf("%s%s", clr, top_left); 2748 2749 printf("\nDevice statistics ===================================="); 2750 2751 dev_ll = ll_root_used; 2752 while (dev_ll != NULL) { 2753 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2754 tx_total = dev_statistics[device_fh].tx_total; 2755 tx = dev_statistics[device_fh].tx; 2756 tx_dropped = tx_total - tx; 2757 if (zero_copy == 0) { 2758 rx_total = rte_atomic64_read( 2759 &dev_statistics[device_fh].rx_total_atomic); 2760 rx = rte_atomic64_read( 2761 &dev_statistics[device_fh].rx_atomic); 2762 } else { 2763 rx_total = dev_statistics[device_fh].rx_total; 2764 rx = dev_statistics[device_fh].rx; 2765 } 2766 rx_dropped = rx_total - rx; 2767 2768 printf("\nStatistics for device %"PRIu32" ------------------------------" 2769 "\nTX total: %"PRIu64"" 2770 "\nTX dropped: %"PRIu64"" 2771 "\nTX successful: %"PRIu64"" 2772 "\nRX total: %"PRIu64"" 2773 "\nRX dropped: %"PRIu64"" 2774 "\nRX successful: %"PRIu64"", 2775 device_fh, 2776 tx_total, 2777 tx_dropped, 2778 tx, 2779 rx_total, 2780 rx_dropped, 2781 rx); 2782 2783 dev_ll = dev_ll->next; 2784 } 2785 printf("\n======================================================\n"); 2786 } 2787 } 2788 2789 static void 2790 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2791 char *ring_name, uint32_t nb_mbuf) 2792 { 2793 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2794 vpool_array[index].pool 2795 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2796 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2797 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2798 rte_pktmbuf_init, NULL, socket, 0); 2799 if (vpool_array[index].pool != NULL) { 2800 vpool_array[index].ring 2801 = rte_ring_create(ring_name, 2802 rte_align32pow2(nb_mbuf + 1), 2803 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2804 if (likely(vpool_array[index].ring != NULL)) { 2805 LOG_DEBUG(VHOST_CONFIG, 2806 "in setup_mempool_tbl: mbuf count in " 2807 "mempool is: %d\n", 2808 rte_mempool_count(vpool_array[index].pool)); 2809 LOG_DEBUG(VHOST_CONFIG, 2810 "in setup_mempool_tbl: mbuf count in " 2811 "ring is: %d\n", 2812 rte_ring_count(vpool_array[index].ring)); 2813 } else { 2814 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2815 ring_name); 2816 } 2817 2818 /* Need consider head room. */ 2819 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2820 } else { 2821 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2822 } 2823 } 2824 2825 2826 /* 2827 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2828 * device is also registered here to handle the IOCTLs. 
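* Rough order of operations below: EAL init and argument parsing, mbuf pool creation
* (or per-queue pool/ring pairs for zero copy), port initialisation, linked-list and
* statistics setup, launching the data cores, and finally CUSE registration and the
* vhost session loop.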
2829 */ 2830 int 2831 MAIN(int argc, char *argv[]) 2832 { 2833 struct rte_mempool *mbuf_pool = NULL; 2834 unsigned lcore_id, core_id = 0; 2835 unsigned nb_ports, valid_num_ports; 2836 int ret; 2837 uint8_t portid, queue_id = 0; 2838 static pthread_t tid; 2839 2840 /* init EAL */ 2841 ret = rte_eal_init(argc, argv); 2842 if (ret < 0) 2843 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2844 argc -= ret; 2845 argv += ret; 2846 2847 /* parse app arguments */ 2848 ret = us_vhost_parse_args(argc, argv); 2849 if (ret < 0) 2850 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2851 2852 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2853 if (rte_lcore_is_enabled(lcore_id)) 2854 lcore_ids[core_id ++] = lcore_id; 2855 2856 if (rte_lcore_count() > RTE_MAX_LCORE) 2857 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2858 2859 /*set the number of swithcing cores available*/ 2860 num_switching_cores = rte_lcore_count()-1; 2861 2862 /* Get the number of physical ports. */ 2863 nb_ports = rte_eth_dev_count(); 2864 if (nb_ports > RTE_MAX_ETHPORTS) 2865 nb_ports = RTE_MAX_ETHPORTS; 2866 2867 /* 2868 * Update the global var NUM_PORTS and global array PORTS 2869 * and get value of var VALID_NUM_PORTS according to system ports number 2870 */ 2871 valid_num_ports = check_ports_num(nb_ports); 2872 2873 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2874 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2875 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2876 return -1; 2877 } 2878 2879 if (zero_copy == 0) { 2880 /* Create the mbuf pool. */ 2881 mbuf_pool = rte_mempool_create( 2882 "MBUF_POOL", 2883 NUM_MBUFS_PER_PORT 2884 * valid_num_ports, 2885 MBUF_SIZE, MBUF_CACHE_SIZE, 2886 sizeof(struct rte_pktmbuf_pool_private), 2887 rte_pktmbuf_pool_init, NULL, 2888 rte_pktmbuf_init, NULL, 2889 rte_socket_id(), 0); 2890 if (mbuf_pool == NULL) 2891 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2892 2893 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2894 vpool_array[queue_id].pool = mbuf_pool; 2895 2896 if (vm2vm_mode == VM2VM_HARDWARE) { 2897 /* Enable VT loop back to let L2 switch to do it. */ 2898 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2899 LOG_DEBUG(VHOST_CONFIG, 2900 "Enable loop back for L2 switch in vmdq.\n"); 2901 } 2902 } else { 2903 uint32_t nb_mbuf; 2904 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2905 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2906 2907 /* 2908 * Zero copy defers queue RX/TX start to the time when guest 2909 * finishes its startup and packet buffers from that guest are 2910 * available. 
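* This is what the deferred_start flags set below implement: the queues stay stopped
* when the port starts, and new_device() calls rte_eth_dev_rx_queue_start() /
* rte_eth_dev_tx_queue_start() once the guest's buffers have been attached.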
2911 */ 2912 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2913 rx_conf_default.rx_drop_en = 0; 2914 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2915 nb_mbuf = num_rx_descriptor 2916 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2917 + num_switching_cores * MAX_PKT_BURST; 2918 2919 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2920 snprintf(pool_name, sizeof(pool_name), 2921 "rxmbuf_pool_%u", queue_id); 2922 snprintf(ring_name, sizeof(ring_name), 2923 "rxmbuf_ring_%u", queue_id); 2924 setup_mempool_tbl(rte_socket_id(), queue_id, 2925 pool_name, ring_name, nb_mbuf); 2926 } 2927 2928 nb_mbuf = num_tx_descriptor 2929 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2930 + num_switching_cores * MAX_PKT_BURST; 2931 2932 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2933 snprintf(pool_name, sizeof(pool_name), 2934 "txmbuf_pool_%u", queue_id); 2935 snprintf(ring_name, sizeof(ring_name), 2936 "txmbuf_ring_%u", queue_id); 2937 setup_mempool_tbl(rte_socket_id(), 2938 (queue_id + MAX_QUEUES), 2939 pool_name, ring_name, nb_mbuf); 2940 } 2941 2942 if (vm2vm_mode == VM2VM_HARDWARE) { 2943 /* Enable VT loop back to let L2 switch to do it. */ 2944 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2945 LOG_DEBUG(VHOST_CONFIG, 2946 "Enable loop back for L2 switch in vmdq.\n"); 2947 } 2948 } 2949 /* Set log level. */ 2950 rte_set_log_level(LOG_LEVEL); 2951 2952 /* initialize all ports */ 2953 for (portid = 0; portid < nb_ports; portid++) { 2954 /* skip ports that are not enabled */ 2955 if ((enabled_port_mask & (1 << portid)) == 0) { 2956 RTE_LOG(INFO, VHOST_PORT, 2957 "Skipping disabled port %d\n", portid); 2958 continue; 2959 } 2960 if (port_init(portid) != 0) 2961 rte_exit(EXIT_FAILURE, 2962 "Cannot initialize network ports\n"); 2963 } 2964 2965 /* Initialise all linked lists. */ 2966 if (init_data_ll() == -1) 2967 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2968 2969 /* Initialize device stats */ 2970 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2971 2972 /* Enable stats if the user option is set. */ 2973 if (enable_stats) 2974 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2975 2976 /* Launch all data cores. */ 2977 if (zero_copy == 0) { 2978 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 2979 rte_eal_remote_launch(switch_worker, 2980 mbuf_pool, lcore_id); 2981 } 2982 } else { 2983 uint32_t count_in_mempool, index, i; 2984 for (index = 0; index < 2*MAX_QUEUES; index++) { 2985 /* For all RX and TX queues. */ 2986 count_in_mempool 2987 = rte_mempool_count(vpool_array[index].pool); 2988 2989 /* 2990 * Transfer all un-attached mbufs from vpool.pool 2991 * to vpoo.ring. 2992 */ 2993 for (i = 0; i < count_in_mempool; i++) { 2994 struct rte_mbuf *mbuf 2995 = __rte_mbuf_raw_alloc( 2996 vpool_array[index].pool); 2997 rte_ring_sp_enqueue(vpool_array[index].ring, 2998 (void *)mbuf); 2999 } 3000 3001 LOG_DEBUG(VHOST_CONFIG, 3002 "in MAIN: mbuf count in mempool at initial " 3003 "is: %d\n", count_in_mempool); 3004 LOG_DEBUG(VHOST_CONFIG, 3005 "in MAIN: mbuf count in ring at initial is :" 3006 " %d\n", 3007 rte_ring_count(vpool_array[index].ring)); 3008 } 3009 3010 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3011 rte_eal_remote_launch(switch_worker_zcp, NULL, 3012 lcore_id); 3013 } 3014 3015 if (mergeable == 0) 3016 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3017 3018 /* Register CUSE device to handle IOCTLs. 
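* (dev_basename is the character device basename supplied on the command line;
* rte_vhost_driver_session_start() below is expected to block while servicing the
* CUSE session, so all packet processing runs on the lcores launched above.)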
*/ 3019 ret = rte_vhost_driver_register((char *)&dev_basename); 3020 if (ret != 0) 3021 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3022 3023 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3024 3025 /* Start CUSE session. */ 3026 rte_vhost_driver_session_start(); 3027 return 0; 3028 3029 } 3030 3031