1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 128 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 /* 83 * RX and TX Prefetch, Host, and Write-back threshold values should be 84 * carefully set for optimal performance. Consult the network 85 * controller's datasheet and supporting DPDK documentation for guidance 86 * on how these parameters should be set. 87 */ 88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */ 89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */ 90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. 
 */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros may need refining for the legacy and DPDK-based front ends:
 * take the maximum number of available vring descriptors/entries from the
 * guest, subtract MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For legacy front end, 128 descriptors,
 * half for virtio header, another half for mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the hardware
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
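
/*
 * Note added for clarity (an observation from the code below, not a
 * documented contract): for zero copy the two halves of vpool_array are used
 * separately. vpool_array[q] backs guest RX for VMDQ queue q (see
 * attach_rxmbuf_zcp() and port_init()), while vpool_array[MAX_QUEUES + q]
 * backs guest TX for the same queue (see virtio_tx_route_zcp()).
 */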
/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in useconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
	.rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};
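
/*
 * Usage note added for clarity: the two default configurations above are only
 * consumed by the queue setup calls in port_init() further down, e.g.
 *
 *	rte_eth_rx_queue_setup(port, q, rx_ring_size,
 *			rte_eth_dev_socket_id(port), &rx_conf_default,
 *			vpool_array[q].pool);
 *
 * so threshold tuning for a different NIC/PMD only needs to happen here.
 */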
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could not
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
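
/*
 * Worked example (illustration only): with the tables above, vhost device
 * device_fh lands in VMDQ pool device_fh and is keyed by vlan_tags[device_fh],
 * so device 0 receives on VLAN 1000, device 1 on VLAN 1001, and so on.
 * get_eth_conf() below programs this pool/VLAN map and link_vmdq() registers
 * the learned MAC address into the matching pool.
 */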
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
578 */ 579 static int 580 us_vhost_parse_args(int argc, char **argv) 581 { 582 int opt, ret; 583 int option_index; 584 unsigned i; 585 const char *prgname = argv[0]; 586 static struct option long_option[] = { 587 {"vm2vm", required_argument, NULL, 0}, 588 {"rx-retry", required_argument, NULL, 0}, 589 {"rx-retry-delay", required_argument, NULL, 0}, 590 {"rx-retry-num", required_argument, NULL, 0}, 591 {"mergeable", required_argument, NULL, 0}, 592 {"stats", required_argument, NULL, 0}, 593 {"dev-basename", required_argument, NULL, 0}, 594 {"zero-copy", required_argument, NULL, 0}, 595 {"rx-desc-num", required_argument, NULL, 0}, 596 {"tx-desc-num", required_argument, NULL, 0}, 597 {NULL, 0, 0, 0}, 598 }; 599 600 /* Parse command line */ 601 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 602 switch (opt) { 603 /* Portmask */ 604 case 'p': 605 enabled_port_mask = parse_portmask(optarg); 606 if (enabled_port_mask == 0) { 607 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 608 us_vhost_usage(prgname); 609 return -1; 610 } 611 break; 612 613 case 0: 614 /* Enable/disable vm2vm comms. */ 615 if (!strncmp(long_option[option_index].name, "vm2vm", 616 MAX_LONG_OPT_SZ)) { 617 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 618 if (ret == -1) { 619 RTE_LOG(INFO, VHOST_CONFIG, 620 "Invalid argument for " 621 "vm2vm [0|1|2]\n"); 622 us_vhost_usage(prgname); 623 return -1; 624 } else { 625 vm2vm_mode = (vm2vm_type)ret; 626 } 627 } 628 629 /* Enable/disable retries on RX. */ 630 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 631 ret = parse_num_opt(optarg, 1); 632 if (ret == -1) { 633 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 634 us_vhost_usage(prgname); 635 return -1; 636 } else { 637 enable_retry = ret; 638 } 639 } 640 641 /* Specify the retries delay time (in useconds) on RX. */ 642 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 643 ret = parse_num_opt(optarg, INT32_MAX); 644 if (ret == -1) { 645 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 646 us_vhost_usage(prgname); 647 return -1; 648 } else { 649 burst_rx_delay_time = ret; 650 } 651 } 652 653 /* Specify the retries number on RX. */ 654 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 655 ret = parse_num_opt(optarg, INT32_MAX); 656 if (ret == -1) { 657 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 658 us_vhost_usage(prgname); 659 return -1; 660 } else { 661 burst_rx_retry_num = ret; 662 } 663 } 664 665 /* Enable/disable RX mergeable buffers. */ 666 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 667 ret = parse_num_opt(optarg, 1); 668 if (ret == -1) { 669 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 670 us_vhost_usage(prgname); 671 return -1; 672 } else { 673 mergeable = !!ret; 674 if (ret) { 675 vmdq_conf_default.rxmode.jumbo_frame = 1; 676 vmdq_conf_default.rxmode.max_rx_pkt_len 677 = JUMBO_FRAME_MAX_SIZE; 678 } 679 } 680 } 681 682 /* Enable/disable stats. */ 683 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 684 ret = parse_num_opt(optarg, INT32_MAX); 685 if (ret == -1) { 686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 687 us_vhost_usage(prgname); 688 return -1; 689 } else { 690 enable_stats = ret; 691 } 692 } 693 694 /* Set character device basename. 
*/ 695 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 696 if (us_vhost_parse_basename(optarg) == -1) { 697 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 698 us_vhost_usage(prgname); 699 return -1; 700 } 701 } 702 703 /* Enable/disable rx/tx zero copy. */ 704 if (!strncmp(long_option[option_index].name, 705 "zero-copy", MAX_LONG_OPT_SZ)) { 706 ret = parse_num_opt(optarg, 1); 707 if (ret == -1) { 708 RTE_LOG(INFO, VHOST_CONFIG, 709 "Invalid argument" 710 " for zero-copy [0|1]\n"); 711 us_vhost_usage(prgname); 712 return -1; 713 } else 714 zero_copy = ret; 715 716 if (zero_copy) { 717 #ifdef RTE_MBUF_REFCNT 718 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 719 "zero copy vhost APP, please " 720 "disable RTE_MBUF_REFCNT\n" 721 "in config file and then rebuild DPDK " 722 "core lib!\n" 723 "Otherwise please disable zero copy " 724 "flag in command line!\n"); 725 return -1; 726 #endif 727 } 728 } 729 730 /* Specify the descriptor number on RX. */ 731 if (!strncmp(long_option[option_index].name, 732 "rx-desc-num", MAX_LONG_OPT_SZ)) { 733 ret = parse_num_opt(optarg, MAX_RING_DESC); 734 if ((ret == -1) || (!POWEROF2(ret))) { 735 RTE_LOG(INFO, VHOST_CONFIG, 736 "Invalid argument for rx-desc-num[0-N]," 737 "power of 2 required.\n"); 738 us_vhost_usage(prgname); 739 return -1; 740 } else { 741 num_rx_descriptor = ret; 742 } 743 } 744 745 /* Specify the descriptor number on TX. */ 746 if (!strncmp(long_option[option_index].name, 747 "tx-desc-num", MAX_LONG_OPT_SZ)) { 748 ret = parse_num_opt(optarg, MAX_RING_DESC); 749 if ((ret == -1) || (!POWEROF2(ret))) { 750 RTE_LOG(INFO, VHOST_CONFIG, 751 "Invalid argument for tx-desc-num [0-N]," 752 "power of 2 required.\n"); 753 us_vhost_usage(prgname); 754 return -1; 755 } else { 756 num_tx_descriptor = ret; 757 } 758 } 759 760 break; 761 762 /* Invalid option - print options. 
 */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm,"
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame,"
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global variable num_ports and the array ports according to the
 * number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
853 */ 854 static inline uint64_t __attribute__((always_inline)) 855 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 856 uint32_t buf_len, hpa_type *addr_type) 857 { 858 struct virtio_memory_regions_hpa *region; 859 uint32_t regionidx; 860 uint64_t vhost_pa = 0; 861 862 *addr_type = PHYS_ADDR_INVALID; 863 864 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 865 region = &vdev->regions_hpa[regionidx]; 866 if ((guest_pa >= region->guest_phys_address) && 867 (guest_pa <= region->guest_phys_address_end)) { 868 vhost_pa = region->host_phys_addr_offset + guest_pa; 869 if (likely((guest_pa + buf_len - 1) 870 <= region->guest_phys_address_end)) 871 *addr_type = PHYS_ADDR_CONTINUOUS; 872 else 873 *addr_type = PHYS_ADDR_CROSS_SUBREG; 874 break; 875 } 876 } 877 878 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 879 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 880 (void *)(uintptr_t)vhost_pa); 881 882 return vhost_pa; 883 } 884 885 /* 886 * Compares a packet destination MAC address to a device MAC address. 887 */ 888 static inline int __attribute__((always_inline)) 889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 890 { 891 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 892 } 893 894 /* 895 * This function learns the MAC address of the device and registers this along with a 896 * vlan tag to a VMDQ. 897 */ 898 static int 899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 900 { 901 struct ether_hdr *pkt_hdr; 902 struct virtio_net_data_ll *dev_ll; 903 struct virtio_net *dev = vdev->dev; 904 int i, ret; 905 906 /* Learn MAC address of guest device from packet */ 907 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 908 909 dev_ll = ll_root_used; 910 911 while (dev_ll != NULL) { 912 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 913 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 914 return -1; 915 } 916 dev_ll = dev_ll->next; 917 } 918 919 for (i = 0; i < ETHER_ADDR_LEN; i++) 920 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 921 922 /* vlan_tag currently uses the device_id. */ 923 vdev->vlan_tag = vlan_tags[dev->device_fh]; 924 925 /* Print out VMDQ registration info. */ 926 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 927 dev->device_fh, 928 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 929 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 930 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 931 vdev->vlan_tag); 932 933 /* Register the MAC address. */ 934 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 935 if (ret) 936 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 937 dev->device_fh); 938 939 /* Enable stripping of the vlan tag as we handle routing. */ 940 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 941 942 /* Set device as ready for RX. */ 943 vdev->ready = DEVICE_RX; 944 945 return 0; 946 } 947 948 /* 949 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 950 * queue before disabling RX on the device. 
951 */ 952 static inline void 953 unlink_vmdq(struct vhost_dev *vdev) 954 { 955 unsigned i = 0; 956 unsigned rx_count; 957 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 958 959 if (vdev->ready == DEVICE_RX) { 960 /*clear MAC and VLAN settings*/ 961 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 962 for (i = 0; i < 6; i++) 963 vdev->mac_address.addr_bytes[i] = 0; 964 965 vdev->vlan_tag = 0; 966 967 /*Clear out the receive buffers*/ 968 rx_count = rte_eth_rx_burst(ports[0], 969 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 970 971 while (rx_count) { 972 for (i = 0; i < rx_count; i++) 973 rte_pktmbuf_free(pkts_burst[i]); 974 975 rx_count = rte_eth_rx_burst(ports[0], 976 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 977 } 978 979 vdev->ready = DEVICE_MAC_LEARNING; 980 } 981 } 982 983 /* 984 * Check if the packet destination MAC address is for a local device. If so then put 985 * the packet on that devices RX queue. If not then return. 986 */ 987 static inline int __attribute__((always_inline)) 988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 989 { 990 struct virtio_net_data_ll *dev_ll; 991 struct ether_hdr *pkt_hdr; 992 uint64_t ret = 0; 993 struct virtio_net *dev = vdev->dev; 994 struct virtio_net *tdev; /* destination virito device */ 995 996 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 997 998 /*get the used devices list*/ 999 dev_ll = ll_root_used; 1000 1001 while (dev_ll != NULL) { 1002 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1003 &dev_ll->vdev->mac_address)) { 1004 1005 /* Drop the packet if the TX packet is destined for the TX device. */ 1006 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1007 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1008 dev->device_fh); 1009 return 0; 1010 } 1011 tdev = dev_ll->vdev->dev; 1012 1013 1014 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1015 1016 if (unlikely(dev_ll->vdev->remove)) { 1017 /*drop the packet if the device is marked for removal*/ 1018 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1019 } else { 1020 /*send the packet to the local virtio device*/ 1021 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1022 if (enable_stats) { 1023 rte_atomic64_add( 1024 &dev_statistics[tdev->device_fh].rx_total_atomic, 1025 1); 1026 rte_atomic64_add( 1027 &dev_statistics[tdev->device_fh].rx_atomic, 1028 ret); 1029 dev_statistics[tdev->device_fh].tx_total++; 1030 dev_statistics[tdev->device_fh].tx += ret; 1031 } 1032 } 1033 1034 return 0; 1035 } 1036 dev_ll = dev_ll->next; 1037 } 1038 1039 return -1; 1040 } 1041 1042 /* 1043 * Check if the destination MAC of a packet is one local VM, 1044 * and get its vlan tag, and offset if it is. 1045 */ 1046 static inline int __attribute__((always_inline)) 1047 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1048 uint32_t *offset, uint16_t *vlan_tag) 1049 { 1050 struct virtio_net_data_ll *dev_ll = ll_root_used; 1051 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1052 1053 while (dev_ll != NULL) { 1054 if ((dev_ll->vdev->ready == DEVICE_RX) 1055 && ether_addr_cmp(&(pkt_hdr->d_addr), 1056 &dev_ll->vdev->mac_address)) { 1057 /* 1058 * Drop the packet if the TX packet is 1059 * destined for the TX device. 
1060 */ 1061 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1062 LOG_DEBUG(VHOST_DATA, 1063 "(%"PRIu64") TX: Source and destination" 1064 " MAC addresses are the same. Dropping " 1065 "packet.\n", 1066 dev_ll->vdev->dev->device_fh); 1067 return -1; 1068 } 1069 1070 /* 1071 * HW vlan strip will reduce the packet length 1072 * by minus length of vlan tag, so need restore 1073 * the packet length by plus it. 1074 */ 1075 *offset = VLAN_HLEN; 1076 *vlan_tag = 1077 (uint16_t) 1078 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1079 1080 LOG_DEBUG(VHOST_DATA, 1081 "(%"PRIu64") TX: pkt to local VM device id:" 1082 "(%"PRIu64") vlan tag: %d.\n", 1083 dev->device_fh, dev_ll->vdev->dev->device_fh, 1084 vlan_tag); 1085 1086 break; 1087 } 1088 dev_ll = dev_ll->next; 1089 } 1090 return 0; 1091 } 1092 1093 /* 1094 * This function routes the TX packet to the correct interface. This may be a local device 1095 * or the physical port. 1096 */ 1097 static inline void __attribute__((always_inline)) 1098 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1099 { 1100 struct mbuf_table *tx_q; 1101 struct rte_mbuf **m_table; 1102 unsigned len, ret, offset = 0; 1103 const uint16_t lcore_id = rte_lcore_id(); 1104 struct virtio_net *dev = vdev->dev; 1105 1106 /*check if destination is local VM*/ 1107 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1108 rte_pktmbuf_free(m); 1109 return; 1110 } 1111 1112 if (vm2vm_mode == VM2VM_HARDWARE) { 1113 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 || 1114 offset > rte_pktmbuf_tailroom(m)) { 1115 rte_pktmbuf_free(m); 1116 return; 1117 } 1118 } 1119 1120 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1121 1122 /*Add packet to the port tx queue*/ 1123 tx_q = &lcore_tx_queue[lcore_id]; 1124 len = tx_q->len; 1125 1126 m->ol_flags = PKT_TX_VLAN_PKT; 1127 1128 m->data_len += offset; 1129 m->pkt_len += offset; 1130 1131 m->vlan_tci = vlan_tag; 1132 1133 tx_q->m_table[len] = m; 1134 len++; 1135 if (enable_stats) { 1136 dev_statistics[dev->device_fh].tx_total++; 1137 dev_statistics[dev->device_fh].tx++; 1138 } 1139 1140 if (unlikely(len == MAX_PKT_BURST)) { 1141 m_table = (struct rte_mbuf **)tx_q->m_table; 1142 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1143 /* Free any buffers not handled by TX and update the port stats. */ 1144 if (unlikely(ret < len)) { 1145 do { 1146 rte_pktmbuf_free(m_table[ret]); 1147 } while (++ret < len); 1148 } 1149 1150 len = 0; 1151 } 1152 1153 tx_q->len = len; 1154 return; 1155 } 1156 /* 1157 * This function is called by each data core. It handles all RX/TX registered with the 1158 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1159 * with all devices in the main linked list. 
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* TX any packets in the queue. */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						(struct rte_mbuf **)tx_q->m_table,
						(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use, if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
						rx_count);
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count--)
							rte_pktmbuf_free(pkts_burst[tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets available ring indexes for zero copy RX;
 * it will retry up to 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
1319 */ 1320 static inline uint32_t __attribute__((always_inline)) 1321 get_available_ring_index_zcp(struct virtio_net *dev, 1322 uint16_t *res_base_idx, uint32_t count) 1323 { 1324 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1325 uint16_t avail_idx; 1326 uint32_t retry = 0; 1327 uint16_t free_entries; 1328 1329 *res_base_idx = vq->last_used_idx_res; 1330 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1331 free_entries = (avail_idx - *res_base_idx); 1332 1333 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1334 "avail idx: %d, " 1335 "res base idx:%d, free entries:%d\n", 1336 dev->device_fh, avail_idx, *res_base_idx, 1337 free_entries); 1338 1339 /* 1340 * If retry is enabled and the queue is full then we wait 1341 * and retry to avoid packet loss. 1342 */ 1343 if (enable_retry && unlikely(count > free_entries)) { 1344 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1345 rte_delay_us(burst_rx_delay_time); 1346 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1347 free_entries = (avail_idx - *res_base_idx); 1348 if (count <= free_entries) 1349 break; 1350 } 1351 } 1352 1353 /*check that we have enough buffers*/ 1354 if (unlikely(count > free_entries)) 1355 count = free_entries; 1356 1357 if (unlikely(count == 0)) { 1358 LOG_DEBUG(VHOST_DATA, 1359 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1360 "avail idx: %d, res base idx:%d, free entries:%d\n", 1361 dev->device_fh, avail_idx, 1362 *res_base_idx, free_entries); 1363 return 0; 1364 } 1365 1366 vq->last_used_idx_res = *res_base_idx + count; 1367 1368 return count; 1369 } 1370 1371 /* 1372 * This function put descriptor back to used list. 1373 */ 1374 static inline void __attribute__((always_inline)) 1375 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1376 { 1377 uint16_t res_cur_idx = vq->last_used_idx; 1378 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1379 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1380 rte_compiler_barrier(); 1381 *(volatile uint16_t *)&vq->used->idx += 1; 1382 vq->last_used_idx += 1; 1383 1384 /* Kick the guest if necessary. */ 1385 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1386 eventfd_write((int)vq->kickfd, 1); 1387 } 1388 1389 /* 1390 * This function get available descriptor from vitio vring and un-attached mbuf 1391 * from vpool->ring, and then attach them together. It needs adjust the offset 1392 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1393 * frame data may be put to wrong location in mbuf. 
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches each
 * mbuf from vpool->pool, detaches it and puts it into vpool->ring. It also
 * updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed. It fetches each
 * mbuf from vpool->pool, detaches it, and puts it into vpool->ring.
1587 */ 1588 static void mbuf_destroy_zcp(struct vpool *vpool) 1589 { 1590 struct rte_mbuf *mbuf = NULL; 1591 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1592 1593 LOG_DEBUG(VHOST_CONFIG, 1594 "in mbuf_destroy_zcp: mbuf count in mempool before " 1595 "mbuf_destroy_zcp is: %d\n", 1596 mbuf_count); 1597 LOG_DEBUG(VHOST_CONFIG, 1598 "in mbuf_destroy_zcp: mbuf count in ring before " 1599 "mbuf_destroy_zcp is : %d\n", 1600 rte_ring_count(vpool->ring)); 1601 1602 for (index = 0; index < mbuf_count; index++) { 1603 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1604 if (likely(mbuf != NULL)) { 1605 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1606 pktmbuf_detach_zcp(mbuf); 1607 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1608 } 1609 } 1610 1611 LOG_DEBUG(VHOST_CONFIG, 1612 "in mbuf_destroy_zcp: mbuf count in mempool after " 1613 "mbuf_destroy_zcp is: %d\n", 1614 rte_mempool_count(vpool->pool)); 1615 LOG_DEBUG(VHOST_CONFIG, 1616 "in mbuf_destroy_zcp: mbuf count in ring after " 1617 "mbuf_destroy_zcp is : %d\n", 1618 rte_ring_count(vpool->ring)); 1619 } 1620 1621 /* 1622 * This function update the use flag and counter. 1623 */ 1624 static inline uint32_t __attribute__((always_inline)) 1625 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1626 uint32_t count) 1627 { 1628 struct vhost_virtqueue *vq; 1629 struct vring_desc *desc; 1630 struct rte_mbuf *buff; 1631 /* The virtio_hdr is initialised to 0. */ 1632 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1633 = {{0, 0, 0, 0, 0, 0}, 0}; 1634 uint64_t buff_hdr_addr = 0; 1635 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1636 uint32_t head_idx, packet_success = 0; 1637 uint16_t res_cur_idx; 1638 1639 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1640 1641 if (count == 0) 1642 return 0; 1643 1644 vq = dev->virtqueue[VIRTIO_RXQ]; 1645 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1646 1647 res_cur_idx = vq->last_used_idx; 1648 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1649 dev->device_fh, res_cur_idx, res_cur_idx + count); 1650 1651 /* Retrieve all of the head indexes first to avoid caching issues. */ 1652 for (head_idx = 0; head_idx < count; head_idx++) 1653 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1654 1655 /*Prefetch descriptor index. */ 1656 rte_prefetch0(&vq->desc[head[packet_success]]); 1657 1658 while (packet_success != count) { 1659 /* Get descriptor from available ring */ 1660 desc = &vq->desc[head[packet_success]]; 1661 1662 buff = pkts[packet_success]; 1663 LOG_DEBUG(VHOST_DATA, 1664 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1665 "pkt[%d] descriptor idx: %d\n", 1666 dev->device_fh, packet_success, 1667 MBUF_HEADROOM_UINT32(buff)); 1668 1669 PRINT_PACKET(dev, 1670 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1671 + RTE_PKTMBUF_HEADROOM), 1672 rte_pktmbuf_data_len(buff), 0); 1673 1674 /* Buffer address translation for virtio header. */ 1675 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1676 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1677 1678 /* 1679 * If the descriptors are chained the header and data are 1680 * placed in separate buffers. 
1681 */ 1682 if (desc->flags & VRING_DESC_F_NEXT) { 1683 desc->len = vq->vhost_hlen; 1684 desc = &vq->desc[desc->next]; 1685 desc->len = rte_pktmbuf_data_len(buff); 1686 } else { 1687 desc->len = packet_len; 1688 } 1689 1690 /* Update used ring with desc information */ 1691 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1692 = head[packet_success]; 1693 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1694 = packet_len; 1695 res_cur_idx++; 1696 packet_success++; 1697 1698 /* A header is required per buffer. */ 1699 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1700 (const void *)&virtio_hdr, vq->vhost_hlen); 1701 1702 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1703 1704 if (likely(packet_success < count)) { 1705 /* Prefetch descriptor index. */ 1706 rte_prefetch0(&vq->desc[head[packet_success]]); 1707 } 1708 } 1709 1710 rte_compiler_barrier(); 1711 1712 LOG_DEBUG(VHOST_DATA, 1713 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1714 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1715 dev->device_fh, vq->last_used_idx, vq->used->idx); 1716 1717 *(volatile uint16_t *)&vq->used->idx += count; 1718 vq->last_used_idx += count; 1719 1720 LOG_DEBUG(VHOST_DATA, 1721 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1722 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1723 dev->device_fh, vq->last_used_idx, vq->used->idx); 1724 1725 /* Kick the guest if necessary. */ 1726 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1727 eventfd_write((int)vq->kickfd, 1); 1728 1729 return count; 1730 } 1731 1732 /* 1733 * This function routes the TX packet to the correct interface. 1734 * This may be a local device or the physical port. 1735 */ 1736 static inline void __attribute__((always_inline)) 1737 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1738 uint32_t desc_idx, uint8_t need_copy) 1739 { 1740 struct mbuf_table *tx_q; 1741 struct rte_mbuf **m_table; 1742 struct rte_mbuf *mbuf = NULL; 1743 unsigned len, ret, offset = 0; 1744 struct vpool *vpool; 1745 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1746 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1747 1748 /*Add packet to the port tx queue*/ 1749 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1750 len = tx_q->len; 1751 1752 /* Allocate an mbuf and populate the structure. */ 1753 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1754 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1755 if (unlikely(mbuf == NULL)) { 1756 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1757 RTE_LOG(ERR, VHOST_DATA, 1758 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1759 dev->device_fh); 1760 put_desc_to_used_list_zcp(vq, desc_idx); 1761 return; 1762 } 1763 1764 if (vm2vm_mode == VM2VM_HARDWARE) { 1765 /* Avoid using a vlan tag from any vm for external pkt, such as 1766 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1767 * selection, MAC address determines it as an external pkt 1768 * which should go to network, while vlan tag determine it as 1769 * a vm2vm pkt should forward to another vm. Hardware confuse 1770 * such a ambiguous situation, so pkt will lost. 
1771 */ 1772 vlan_tag = external_pkt_default_vlan_tag; 1773 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1774 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1775 __rte_mbuf_raw_free(mbuf); 1776 return; 1777 } 1778 } 1779 1780 mbuf->nb_segs = m->nb_segs; 1781 mbuf->next = m->next; 1782 mbuf->data_len = m->data_len + offset; 1783 mbuf->pkt_len = mbuf->data_len; 1784 if (unlikely(need_copy)) { 1785 /* Copy the packet contents to the mbuf. */ 1786 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1787 rte_pktmbuf_mtod(m, void *), 1788 m->data_len); 1789 } else { 1790 mbuf->data_off = m->data_off; 1791 mbuf->buf_physaddr = m->buf_physaddr; 1792 mbuf->buf_addr = m->buf_addr; 1793 } 1794 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1795 mbuf->vlan_tci = vlan_tag; 1796 mbuf->l2_len = sizeof(struct ether_hdr); 1797 mbuf->l3_len = sizeof(struct ipv4_hdr); 1798 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1799 1800 tx_q->m_table[len] = mbuf; 1801 len++; 1802 1803 LOG_DEBUG(VHOST_DATA, 1804 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1805 dev->device_fh, 1806 mbuf->nb_segs, 1807 (mbuf->next == NULL) ? "null" : "non-null"); 1808 1809 if (enable_stats) { 1810 dev_statistics[dev->device_fh].tx_total++; 1811 dev_statistics[dev->device_fh].tx++; 1812 } 1813 1814 if (unlikely(len == MAX_PKT_BURST)) { 1815 m_table = (struct rte_mbuf **)tx_q->m_table; 1816 ret = rte_eth_tx_burst(ports[0], 1817 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1818 1819 /* 1820 * Free any buffers not handled by TX and update 1821 * the port stats. 1822 */ 1823 if (unlikely(ret < len)) { 1824 do { 1825 rte_pktmbuf_free(m_table[ret]); 1826 } while (++ret < len); 1827 } 1828 1829 len = 0; 1830 txmbuf_clean_zcp(dev, vpool); 1831 } 1832 1833 tx_q->len = len; 1834 1835 return; 1836 } 1837 1838 /* 1839 * This function TX all available packets in virtio TX queue for one 1840 * virtio-net device. If it is first packet, it learns MAC address and 1841 * setup VMDQ. 1842 */ 1843 static inline void __attribute__((always_inline)) 1844 virtio_dev_tx_zcp(struct virtio_net *dev) 1845 { 1846 struct rte_mbuf m; 1847 struct vhost_virtqueue *vq; 1848 struct vring_desc *desc; 1849 uint64_t buff_addr = 0, phys_addr; 1850 uint32_t head[MAX_PKT_BURST]; 1851 uint32_t i; 1852 uint16_t free_entries, packet_success = 0; 1853 uint16_t avail_idx; 1854 uint8_t need_copy = 0; 1855 hpa_type addr_type; 1856 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1857 1858 vq = dev->virtqueue[VIRTIO_TXQ]; 1859 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1860 1861 /* If there are no available buffers then return. */ 1862 if (vq->last_used_idx_res == avail_idx) 1863 return; 1864 1865 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1866 1867 /* Prefetch available ring to retrieve head indexes. */ 1868 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1869 1870 /* Get the number of free entries in the ring */ 1871 free_entries = (avail_idx - vq->last_used_idx_res); 1872 1873 /* Limit to MAX_PKT_BURST. */ 1874 free_entries 1875 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1876 1877 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1878 dev->device_fh, free_entries); 1879 1880 /* Retrieve all of the head indexes first to avoid caching issues. */ 1881 for (i = 0; i < free_entries; i++) 1882 head[i] 1883 = vq->avail->ring[(vq->last_used_idx_res + i) 1884 & (vq->size - 1)]; 1885 1886 vq->last_used_idx_res += free_entries; 1887 1888 /* Prefetch descriptor index. 
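 * The vring size is a power of two, so the '& (vq->size - 1)' masks used
 * below implement the ring wrap (index modulo vq->size) for both the avail
 * and used rings.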
*/ 1889 rte_prefetch0(&vq->desc[head[packet_success]]); 1890 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1891 1892 while (packet_success < free_entries) { 1893 desc = &vq->desc[head[packet_success]]; 1894 1895 /* Discard first buffer as it is the virtio header */ 1896 desc = &vq->desc[desc->next]; 1897 1898 /* Buffer address translation. */ 1899 buff_addr = gpa_to_vva(dev, desc->addr); 1900 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1901 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1902 &addr_type); 1903 1904 if (likely(packet_success < (free_entries - 1))) 1905 /* Prefetch descriptor index. */ 1906 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1907 1908 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1909 RTE_LOG(ERR, VHOST_DATA, 1910 "(%"PRIu64") Invalid frame buffer address found" 1911 "when TX packets!\n", 1912 dev->device_fh); 1913 packet_success++; 1914 continue; 1915 } 1916 1917 /* Prefetch buffer address. */ 1918 rte_prefetch0((void *)(uintptr_t)buff_addr); 1919 1920 /* 1921 * Setup dummy mbuf. This is copied to a real mbuf if 1922 * transmitted out the physical port. 1923 */ 1924 m.data_len = desc->len; 1925 m.nb_segs = 1; 1926 m.next = NULL; 1927 m.data_off = 0; 1928 m.buf_addr = (void *)(uintptr_t)buff_addr; 1929 m.buf_physaddr = phys_addr; 1930 1931 /* 1932 * Check if the frame buffer address from guest crosses 1933 * sub-region or not. 1934 */ 1935 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1936 RTE_LOG(ERR, VHOST_DATA, 1937 "(%"PRIu64") Frame buffer address cross " 1938 "sub-regioin found when attaching TX frame " 1939 "buffer address!\n", 1940 dev->device_fh); 1941 need_copy = 1; 1942 } else 1943 need_copy = 0; 1944 1945 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1946 1947 /* 1948 * If this is the first received packet we need to learn 1949 * the MAC and setup VMDQ 1950 */ 1951 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1952 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1953 /* 1954 * Discard frame if device is scheduled for 1955 * removal or a duplicate MAC address is found. 1956 */ 1957 packet_success += free_entries; 1958 vq->last_used_idx += packet_success; 1959 break; 1960 } 1961 } 1962 1963 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1964 packet_success++; 1965 } 1966 } 1967 1968 /* 1969 * This function is called by each data core. It handles all RX/TX registered 1970 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1971 * addresses are compared with all devices in the main linked list. 
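 * The loop below first drains any per-queue TX burst table that has been
 * idle for roughly BURST_TX_DRAIN_US, then, for every device on this core's
 * list, re-attaches free mbufs to guest RX buffers, receives from the
 * device's VMDq queue straight into guest memory, and finally services the
 * guest TX ring.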
1972 */ 1973 static int 1974 switch_worker_zcp(__attribute__((unused)) void *arg) 1975 { 1976 struct virtio_net *dev = NULL; 1977 struct vhost_dev *vdev = NULL; 1978 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1979 struct virtio_net_data_ll *dev_ll; 1980 struct mbuf_table *tx_q; 1981 volatile struct lcore_ll_info *lcore_ll; 1982 const uint64_t drain_tsc 1983 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 1984 * BURST_TX_DRAIN_US; 1985 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1986 unsigned ret; 1987 const uint16_t lcore_id = rte_lcore_id(); 1988 uint16_t count_in_ring, rx_count = 0; 1989 1990 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1991 1992 lcore_ll = lcore_info[lcore_id].lcore_ll; 1993 prev_tsc = 0; 1994 1995 while (1) { 1996 cur_tsc = rte_rdtsc(); 1997 1998 /* TX burst queue drain */ 1999 diff_tsc = cur_tsc - prev_tsc; 2000 if (unlikely(diff_tsc > drain_tsc)) { 2001 /* 2002 * Get mbuf from vpool.pool and detach mbuf and 2003 * put back into vpool.ring. 2004 */ 2005 dev_ll = lcore_ll->ll_root_used; 2006 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2007 /* Get virtio device ID */ 2008 vdev = dev_ll->vdev; 2009 dev = vdev->dev; 2010 2011 if (likely(!vdev->remove)) { 2012 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2013 if (tx_q->len) { 2014 LOG_DEBUG(VHOST_DATA, 2015 "TX queue drained after timeout" 2016 " with burst size %u\n", 2017 tx_q->len); 2018 2019 /* 2020 * Tx any packets in the queue 2021 */ 2022 ret = rte_eth_tx_burst( 2023 ports[0], 2024 (uint16_t)tx_q->txq_id, 2025 (struct rte_mbuf **) 2026 tx_q->m_table, 2027 (uint16_t)tx_q->len); 2028 if (unlikely(ret < tx_q->len)) { 2029 do { 2030 rte_pktmbuf_free( 2031 tx_q->m_table[ret]); 2032 } while (++ret < tx_q->len); 2033 } 2034 tx_q->len = 0; 2035 2036 txmbuf_clean_zcp(dev, 2037 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2038 } 2039 } 2040 dev_ll = dev_ll->next; 2041 } 2042 prev_tsc = cur_tsc; 2043 } 2044 2045 rte_prefetch0(lcore_ll->ll_root_used); 2046 2047 /* 2048 * Inform the configuration core that we have exited the linked 2049 * list and that no devices are in use if requested. 2050 */ 2051 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2052 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2053 2054 /* Process devices */ 2055 dev_ll = lcore_ll->ll_root_used; 2056 2057 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2058 vdev = dev_ll->vdev; 2059 dev = vdev->dev; 2060 if (unlikely(vdev->remove)) { 2061 dev_ll = dev_ll->next; 2062 unlink_vmdq(vdev); 2063 vdev->ready = DEVICE_SAFE_REMOVE; 2064 continue; 2065 } 2066 2067 if (likely(vdev->ready == DEVICE_RX)) { 2068 uint32_t index = vdev->vmdq_rx_q; 2069 uint16_t i; 2070 count_in_ring 2071 = rte_ring_count(vpool_array[index].ring); 2072 uint16_t free_entries 2073 = (uint16_t)get_available_ring_num_zcp(dev); 2074 2075 /* 2076 * Attach all mbufs in vpool.ring and put back 2077 * into vpool.pool. 
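 * The number attached per pass is bounded by the guest's free RX entries,
 * the mbufs currently sitting in this queue's ring, and MAX_PKT_BURST
 * (see the RTE_MIN() pair below).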
2078 */ 2079 for (i = 0; 2080 i < RTE_MIN(free_entries, 2081 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2082 i++) 2083 attach_rxmbuf_zcp(dev); 2084 2085 /* Handle guest RX */ 2086 rx_count = rte_eth_rx_burst(ports[0], 2087 vdev->vmdq_rx_q, pkts_burst, 2088 MAX_PKT_BURST); 2089 2090 if (rx_count) { 2091 ret_count = virtio_dev_rx_zcp(dev, 2092 pkts_burst, rx_count); 2093 if (enable_stats) { 2094 dev_statistics[dev->device_fh].rx_total 2095 += rx_count; 2096 dev_statistics[dev->device_fh].rx 2097 += ret_count; 2098 } 2099 while (likely(rx_count)) { 2100 rx_count--; 2101 pktmbuf_detach_zcp( 2102 pkts_burst[rx_count]); 2103 rte_ring_sp_enqueue( 2104 vpool_array[index].ring, 2105 (void *)pkts_burst[rx_count]); 2106 } 2107 } 2108 } 2109 2110 if (likely(!vdev->remove)) 2111 /* Handle guest TX */ 2112 virtio_dev_tx_zcp(dev); 2113 2114 /* Move to the next device in the list */ 2115 dev_ll = dev_ll->next; 2116 } 2117 } 2118 2119 return 0; 2120 } 2121 2122 2123 /* 2124 * Add an entry to a used linked list. A free entry must first be found 2125 * in the free linked list using get_data_ll_free_entry(); 2126 */ 2127 static void 2128 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2129 struct virtio_net_data_ll *ll_dev) 2130 { 2131 struct virtio_net_data_ll *ll = *ll_root_addr; 2132 2133 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2134 ll_dev->next = NULL; 2135 rte_compiler_barrier(); 2136 2137 /* If ll == NULL then this is the first device. */ 2138 if (ll) { 2139 /* Increment to the tail of the linked list. */ 2140 while ((ll->next != NULL) ) 2141 ll = ll->next; 2142 2143 ll->next = ll_dev; 2144 } else { 2145 *ll_root_addr = ll_dev; 2146 } 2147 } 2148 2149 /* 2150 * Remove an entry from a used linked list. The entry must then be added to 2151 * the free linked list using put_data_ll_free_entry(). 2152 */ 2153 static void 2154 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2155 struct virtio_net_data_ll *ll_dev, 2156 struct virtio_net_data_ll *ll_dev_last) 2157 { 2158 struct virtio_net_data_ll *ll = *ll_root_addr; 2159 2160 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2161 return; 2162 2163 if (ll_dev == ll) 2164 *ll_root_addr = ll_dev->next; 2165 else 2166 if (likely(ll_dev_last != NULL)) 2167 ll_dev_last->next = ll_dev->next; 2168 else 2169 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2170 } 2171 2172 /* 2173 * Find and return an entry from the free linked list. 2174 */ 2175 static struct virtio_net_data_ll * 2176 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2177 { 2178 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2179 struct virtio_net_data_ll *ll_dev; 2180 2181 if (ll_free == NULL) 2182 return NULL; 2183 2184 ll_dev = ll_free; 2185 *ll_root_addr = ll_free->next; 2186 2187 return ll_dev; 2188 } 2189 2190 /* 2191 * Place an entry back on to the free linked list. 2192 */ 2193 static void 2194 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2195 struct virtio_net_data_ll *ll_dev) 2196 { 2197 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2198 2199 if (ll_dev == NULL) 2200 return; 2201 2202 ll_dev->next = ll_free; 2203 *ll_root_addr = ll_dev; 2204 } 2205 2206 /* 2207 * Creates a linked list of a given size. 2208 */ 2209 static struct virtio_net_data_ll * 2210 alloc_data_ll(uint32_t size) 2211 { 2212 struct virtio_net_data_ll *ll_new; 2213 uint32_t i; 2214 2215 /* Malloc and then chain the linked list. 
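 * All entries come from a single malloc'd array: entry i points at entry
 * i + 1 and the final entry's next pointer is NULL, giving the caller a
 * ready-made free list.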
*/ 2216 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2217 if (ll_new == NULL) { 2218 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2219 return NULL; 2220 } 2221 2222 for (i = 0; i < size - 1; i++) { 2223 ll_new[i].vdev = NULL; 2224 ll_new[i].next = &ll_new[i+1]; 2225 } 2226 ll_new[i].next = NULL; 2227 2228 return (ll_new); 2229 } 2230 2231 /* 2232 * Create the main linked list along with each individual cores linked list. A used and a free list 2233 * are created to manage entries. 2234 */ 2235 static int 2236 init_data_ll (void) 2237 { 2238 int lcore; 2239 2240 RTE_LCORE_FOREACH_SLAVE(lcore) { 2241 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2242 if (lcore_info[lcore].lcore_ll == NULL) { 2243 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2244 return -1; 2245 } 2246 2247 lcore_info[lcore].lcore_ll->device_num = 0; 2248 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2249 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2250 if (num_devices % num_switching_cores) 2251 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2252 else 2253 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2254 } 2255 2256 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2257 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2258 2259 return 0; 2260 } 2261 2262 /* 2263 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2264 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2265 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2266 */ 2267 static void 2268 destroy_device (volatile struct virtio_net *dev) 2269 { 2270 struct virtio_net_data_ll *ll_lcore_dev_cur; 2271 struct virtio_net_data_ll *ll_main_dev_cur; 2272 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2273 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2274 struct vhost_dev *vdev; 2275 int lcore; 2276 2277 dev->flags &= ~VIRTIO_DEV_RUNNING; 2278 2279 vdev = (struct vhost_dev *)dev->priv; 2280 /*set the remove flag. */ 2281 vdev->remove = 1; 2282 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2283 rte_pause(); 2284 } 2285 2286 /* Search for entry to be removed from lcore ll */ 2287 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2288 while (ll_lcore_dev_cur != NULL) { 2289 if (ll_lcore_dev_cur->vdev == vdev) { 2290 break; 2291 } else { 2292 ll_lcore_dev_last = ll_lcore_dev_cur; 2293 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2294 } 2295 } 2296 2297 if (ll_lcore_dev_cur == NULL) { 2298 RTE_LOG(ERR, VHOST_CONFIG, 2299 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2300 dev->device_fh); 2301 return; 2302 } 2303 2304 /* Search for entry to be removed from main ll */ 2305 ll_main_dev_cur = ll_root_used; 2306 ll_main_dev_last = NULL; 2307 while (ll_main_dev_cur != NULL) { 2308 if (ll_main_dev_cur->vdev == vdev) { 2309 break; 2310 } else { 2311 ll_main_dev_last = ll_main_dev_cur; 2312 ll_main_dev_cur = ll_main_dev_cur->next; 2313 } 2314 } 2315 2316 /* Remove entries from the lcore and main ll. */ 2317 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2318 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2319 2320 /* Set the dev_removal_flag on each lcore. 
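 * Each worker core resets the flag to ACK_DEV_REMOVAL the next time it is
 * outside the device list walk (see switch_worker_zcp), so the wait loop
 * below only completes once every core has stopped referencing the removed
 * entries.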
*/
2321 RTE_LCORE_FOREACH_SLAVE(lcore) {
2322 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2323 }
2324 
2325 /*
2326 * Once each core has set its dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2327 * the cores can no longer access the device removed from the linked lists and that
2328 * the device is no longer in use.
2329 */
2330 RTE_LCORE_FOREACH_SLAVE(lcore) {
2331 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2332 rte_pause();
2333 }
2334 }
2335 
2336 /* Add the entries back to the lcore and main free ll. */
2337 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2338 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2339 
2340 /* Decrement the number of devices on the lcore. */
2341 lcore_info[vdev->coreid].lcore_ll->device_num--;
2342 
2343 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2344 
2345 if (zero_copy) {
2346 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2347 
2348 /* Stop the RX queue. */
2349 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2350 LOG_DEBUG(VHOST_CONFIG,
2351 "(%"PRIu64") In destroy_device: Failed to stop "
2352 "rx queue:%d\n",
2353 dev->device_fh,
2354 vdev->vmdq_rx_q);
2355 }
2356 
2357 LOG_DEBUG(VHOST_CONFIG,
2358 "(%"PRIu64") in destroy_device: Start putting mbufs from "
2359 "mempool back to ring for RX queue: %d\n",
2360 dev->device_fh, vdev->vmdq_rx_q);
2361 
2362 mbuf_destroy_zcp(vpool);
2363 
2364 /* Stop the TX queue. */
2365 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2366 LOG_DEBUG(VHOST_CONFIG,
2367 "(%"PRIu64") In destroy_device: Failed to "
2368 "stop tx queue:%d\n",
2369 dev->device_fh, vdev->vmdq_rx_q);
2370 }
2371 
2372 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2373 
2374 LOG_DEBUG(VHOST_CONFIG,
2375 "(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2376 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2377 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2378 dev->device_fh);
2379 
2380 mbuf_destroy_zcp(vpool);
2381 rte_free(vdev->regions_hpa);
2382 }
2383 rte_free(vdev);
2384 
2385 }
2386 
2387 /*
2388 * Calculate the number of physically contiguous sub-regions for one particular
2389 * region whose vhost virtual address range is contiguous. The region starts
2390 * at vva_start and has the size given by the 'size' argument.
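 * Illustrative example (4 KB pages assumed): four consecutive virtual pages
 * mapping to host-physical pages P, P+4K, Q and Q+4K contain one contiguity
 * break, so the function returns 1; new_device() adds this count on top of
 * the base number of regions.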
2391 */ 2392 static uint32_t 2393 check_hpa_regions(uint64_t vva_start, uint64_t size) 2394 { 2395 uint32_t i, nregions = 0, page_size = getpagesize(); 2396 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2397 if (vva_start % page_size) { 2398 LOG_DEBUG(VHOST_CONFIG, 2399 "in check_countinous: vva start(%p) mod page_size(%d) " 2400 "has remainder\n", 2401 (void *)(uintptr_t)vva_start, page_size); 2402 return 0; 2403 } 2404 if (size % page_size) { 2405 LOG_DEBUG(VHOST_CONFIG, 2406 "in check_countinous: " 2407 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2408 size, page_size); 2409 return 0; 2410 } 2411 for (i = 0; i < size - page_size; i = i + page_size) { 2412 cur_phys_addr 2413 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2414 next_phys_addr = rte_mem_virt2phy( 2415 (void *)(uintptr_t)(vva_start + i + page_size)); 2416 if ((cur_phys_addr + page_size) != next_phys_addr) { 2417 ++nregions; 2418 LOG_DEBUG(VHOST_CONFIG, 2419 "in check_continuous: hva addr:(%p) is not " 2420 "continuous with hva addr:(%p), diff:%d\n", 2421 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2422 (void *)(uintptr_t)(vva_start + (uint64_t)i 2423 + page_size), page_size); 2424 LOG_DEBUG(VHOST_CONFIG, 2425 "in check_continuous: hpa addr:(%p) is not " 2426 "continuous with hpa addr:(%p), " 2427 "diff:(%"PRIu64")\n", 2428 (void *)(uintptr_t)cur_phys_addr, 2429 (void *)(uintptr_t)next_phys_addr, 2430 (next_phys_addr-cur_phys_addr)); 2431 } 2432 } 2433 return nregions; 2434 } 2435 2436 /* 2437 * Divide each region whose vhost virtual address is continous into a few 2438 * sub-regions, make sure the physical address within each sub-region are 2439 * continous. And fill offset(to GPA) and size etc. information of each 2440 * sub-region into regions_hpa. 2441 */ 2442 static uint32_t 2443 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2444 { 2445 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2446 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2447 2448 if (mem_region_hpa == NULL) 2449 return 0; 2450 2451 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2452 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2453 virtio_memory->regions[regionidx].address_offset; 2454 mem_region_hpa[regionidx_hpa].guest_phys_address 2455 = virtio_memory->regions[regionidx].guest_phys_address; 2456 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2457 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2458 mem_region_hpa[regionidx_hpa].guest_phys_address; 2459 LOG_DEBUG(VHOST_CONFIG, 2460 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2461 regionidx_hpa, 2462 (void *)(uintptr_t) 2463 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2464 LOG_DEBUG(VHOST_CONFIG, 2465 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2466 regionidx_hpa, 2467 (void *)(uintptr_t) 2468 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2469 for (i = 0, k = 0; 2470 i < virtio_memory->regions[regionidx].memory_size - 2471 page_size; 2472 i += page_size) { 2473 cur_phys_addr = rte_mem_virt2phy( 2474 (void *)(uintptr_t)(vva_start + i)); 2475 next_phys_addr = rte_mem_virt2phy( 2476 (void *)(uintptr_t)(vva_start + 2477 i + page_size)); 2478 if ((cur_phys_addr + page_size) != next_phys_addr) { 2479 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2480 mem_region_hpa[regionidx_hpa].guest_phys_address + 2481 k + page_size; 2482 mem_region_hpa[regionidx_hpa].memory_size 2483 = k + 
page_size;
2484 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2485 "phys addr end [%d]:(%p)\n",
2486 regionidx_hpa,
2487 (void *)(uintptr_t)
2488 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2489 LOG_DEBUG(VHOST_CONFIG,
2490 "in fill_hpa_regions: guest phys addr "
2491 "size [%d]:(%p)\n",
2492 regionidx_hpa,
2493 (void *)(uintptr_t)
2494 (mem_region_hpa[regionidx_hpa].memory_size));
2495 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2496 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2497 ++regionidx_hpa;
2498 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2499 next_phys_addr -
2500 mem_region_hpa[regionidx_hpa].guest_phys_address;
2501 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2502 " phys addr start[%d]:(%p)\n",
2503 regionidx_hpa,
2504 (void *)(uintptr_t)
2505 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2506 LOG_DEBUG(VHOST_CONFIG,
2507 "in fill_hpa_regions: host phys addr "
2508 "start[%d]:(%p)\n",
2509 regionidx_hpa,
2510 (void *)(uintptr_t)
2511 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2512 k = 0;
2513 } else {
2514 k += page_size;
2515 }
2516 }
2517 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2518 = mem_region_hpa[regionidx_hpa].guest_phys_address
2519 + k + page_size;
2520 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2521 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2522 "[%d]:(%p)\n", regionidx_hpa,
2523 (void *)(uintptr_t)
2524 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2525 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2526 "[%d]:(%p)\n", regionidx_hpa,
2527 (void *)(uintptr_t)
2528 (mem_region_hpa[regionidx_hpa].memory_size));
2529 ++regionidx_hpa;
2530 }
2531 return regionidx_hpa;
2532 }
2533 
2534 /*
2535 * A new device is added to a data core. First the device is added to the main linked list
2536 * and then allocated to a specific data core.
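 * The device's VMDq RX queue is vmdq_rx_q = device_fh * (num_queues /
 * num_devices), and the data core chosen is the one currently serving the
 * fewest devices. With zero copy enabled the queue's free mbufs are attached
 * to guest buffers and the deferred RX/TX queues are started here as well.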
2537 */ 2538 static int 2539 new_device (struct virtio_net *dev) 2540 { 2541 struct virtio_net_data_ll *ll_dev; 2542 int lcore, core_add = 0; 2543 uint32_t device_num_min = num_devices; 2544 struct vhost_dev *vdev; 2545 uint32_t regionidx; 2546 2547 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2548 if (vdev == NULL) { 2549 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2550 dev->device_fh); 2551 return -1; 2552 } 2553 vdev->dev = dev; 2554 dev->priv = vdev; 2555 2556 if (zero_copy) { 2557 vdev->nregions_hpa = dev->mem->nregions; 2558 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2559 vdev->nregions_hpa 2560 += check_hpa_regions( 2561 dev->mem->regions[regionidx].guest_phys_address 2562 + dev->mem->regions[regionidx].address_offset, 2563 dev->mem->regions[regionidx].memory_size); 2564 2565 } 2566 2567 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2568 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2569 CACHE_LINE_SIZE); 2570 if (vdev->regions_hpa == NULL) { 2571 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2572 rte_free(vdev); 2573 return -1; 2574 } 2575 2576 2577 if (fill_hpa_memory_regions( 2578 vdev->regions_hpa, dev->mem 2579 ) != vdev->nregions_hpa) { 2580 2581 RTE_LOG(ERR, VHOST_CONFIG, 2582 "hpa memory regions number mismatch: " 2583 "[%d]\n", vdev->nregions_hpa); 2584 rte_free(vdev->regions_hpa); 2585 rte_free(vdev); 2586 return -1; 2587 } 2588 } 2589 2590 2591 /* Add device to main ll */ 2592 ll_dev = get_data_ll_free_entry(&ll_root_free); 2593 if (ll_dev == NULL) { 2594 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2595 "of %d devices per core has been reached\n", 2596 dev->device_fh, num_devices); 2597 if (vdev->regions_hpa) 2598 rte_free(vdev->regions_hpa); 2599 rte_free(vdev); 2600 return -1; 2601 } 2602 ll_dev->vdev = vdev; 2603 add_data_ll_entry(&ll_root_used, ll_dev); 2604 vdev->vmdq_rx_q 2605 = dev->device_fh * (num_queues / num_devices); 2606 2607 if (zero_copy) { 2608 uint32_t index = vdev->vmdq_rx_q; 2609 uint32_t count_in_ring, i; 2610 struct mbuf_table *tx_q; 2611 2612 count_in_ring = rte_ring_count(vpool_array[index].ring); 2613 2614 LOG_DEBUG(VHOST_CONFIG, 2615 "(%"PRIu64") in new_device: mbuf count in mempool " 2616 "before attach is: %d\n", 2617 dev->device_fh, 2618 rte_mempool_count(vpool_array[index].pool)); 2619 LOG_DEBUG(VHOST_CONFIG, 2620 "(%"PRIu64") in new_device: mbuf count in ring " 2621 "before attach is : %d\n", 2622 dev->device_fh, count_in_ring); 2623 2624 /* 2625 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
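 * attach_rxmbuf_zcp() (defined earlier in this file) is expected to bind
 * each free mbuf to a guest RX descriptor's buffer so the NIC can DMA
 * directly into guest memory; pre-attaching everything here lets the
 * deferred RX queue start with a fully populated descriptor ring.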
2626 */
2627 for (i = 0; i < count_in_ring; i++)
2628 attach_rxmbuf_zcp(dev);
2629 
2630 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2631 "mempool after attach is: %d\n",
2632 dev->device_fh,
2633 rte_mempool_count(vpool_array[index].pool));
2634 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2635 "ring after attach is: %d\n",
2636 dev->device_fh,
2637 rte_ring_count(vpool_array[index].ring));
2638 
2639 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2640 tx_q->txq_id = vdev->vmdq_rx_q;
2641 
2642 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2643 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2644 
2645 LOG_DEBUG(VHOST_CONFIG,
2646 "(%"PRIu64") In new_device: Failed to start "
2647 "tx queue:%d\n",
2648 dev->device_fh, vdev->vmdq_rx_q);
2649 
2650 mbuf_destroy_zcp(vpool);
2651 rte_free(vdev->regions_hpa);
2652 rte_free(vdev);
2653 return -1;
2654 }
2655 
2656 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2657 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2658 
2659 LOG_DEBUG(VHOST_CONFIG,
2660 "(%"PRIu64") In new_device: Failed to start "
2661 "rx queue:%d\n",
2662 dev->device_fh, vdev->vmdq_rx_q);
2663 
2664 /* Stop the TX queue. */
2665 if (rte_eth_dev_tx_queue_stop(ports[0],
2666 vdev->vmdq_rx_q) != 0) {
2667 LOG_DEBUG(VHOST_CONFIG,
2668 "(%"PRIu64") In new_device: Failed to "
2669 "stop tx queue:%d\n",
2670 dev->device_fh, vdev->vmdq_rx_q);
2671 }
2672 
2673 mbuf_destroy_zcp(vpool);
2674 rte_free(vdev->regions_hpa);
2675 rte_free(vdev);
2676 return -1;
2677 }
2678 
2679 }
2680 
2681 /* Reset the ready flag. */
2682 vdev->ready = DEVICE_MAC_LEARNING;
2683 vdev->remove = 0;
2684 
2685 /* Find a suitable lcore to add the device. */
2686 RTE_LCORE_FOREACH_SLAVE(lcore) {
2687 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2688 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2689 core_add = lcore;
2690 }
2691 }
2692 /* Add device to lcore ll */
2693 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2694 if (ll_dev == NULL) {
2695 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2696 vdev->ready = DEVICE_SAFE_REMOVE;
2697 destroy_device(dev);
2698 if (vdev->regions_hpa)
2699 rte_free(vdev->regions_hpa);
2700 rte_free(vdev);
2701 return -1;
2702 }
2703 ll_dev->vdev = vdev;
2704 vdev->coreid = core_add;
2705 
2706 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2707 
2708 /* Initialize device stats */
2709 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2710 
2711 /* Disable notifications. */
2712 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2713 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2714 lcore_info[vdev->coreid].lcore_ll->device_num++;
2715 dev->flags |= VIRTIO_DEV_RUNNING;
2716 
2717 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2718 
2719 return 0;
2720 }
2721 
2722 /*
2723 * These callbacks allow devices to be added to the data core when configuration
2724 * has been fully completed.
2725 */
2726 static const struct virtio_net_device_ops virtio_net_device_ops =
2727 {
2728 .new_device = new_device,
2729 .destroy_device = destroy_device,
2730 };
2731 
2732 /*
2733 * This thread wakes up periodically and prints statistics if the user has
2734 * enabled them.
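 * The wake-up period is the enable_stats value itself (passed straight to
 * sleep(), in seconds); the copy path reads the rte_atomic64 RX counters
 * updated by the data cores, while the zero-copy path reads the plain
 * counters.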
2735 */ 2736 static void 2737 print_stats(void) 2738 { 2739 struct virtio_net_data_ll *dev_ll; 2740 uint64_t tx_dropped, rx_dropped; 2741 uint64_t tx, tx_total, rx, rx_total; 2742 uint32_t device_fh; 2743 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2744 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2745 2746 while(1) { 2747 sleep(enable_stats); 2748 2749 /* Clear screen and move to top left */ 2750 printf("%s%s", clr, top_left); 2751 2752 printf("\nDevice statistics ===================================="); 2753 2754 dev_ll = ll_root_used; 2755 while (dev_ll != NULL) { 2756 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2757 tx_total = dev_statistics[device_fh].tx_total; 2758 tx = dev_statistics[device_fh].tx; 2759 tx_dropped = tx_total - tx; 2760 if (zero_copy == 0) { 2761 rx_total = rte_atomic64_read( 2762 &dev_statistics[device_fh].rx_total_atomic); 2763 rx = rte_atomic64_read( 2764 &dev_statistics[device_fh].rx_atomic); 2765 } else { 2766 rx_total = dev_statistics[device_fh].rx_total; 2767 rx = dev_statistics[device_fh].rx; 2768 } 2769 rx_dropped = rx_total - rx; 2770 2771 printf("\nStatistics for device %"PRIu32" ------------------------------" 2772 "\nTX total: %"PRIu64"" 2773 "\nTX dropped: %"PRIu64"" 2774 "\nTX successful: %"PRIu64"" 2775 "\nRX total: %"PRIu64"" 2776 "\nRX dropped: %"PRIu64"" 2777 "\nRX successful: %"PRIu64"", 2778 device_fh, 2779 tx_total, 2780 tx_dropped, 2781 tx, 2782 rx_total, 2783 rx_dropped, 2784 rx); 2785 2786 dev_ll = dev_ll->next; 2787 } 2788 printf("\n======================================================\n"); 2789 } 2790 } 2791 2792 static void 2793 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2794 char *ring_name, uint32_t nb_mbuf) 2795 { 2796 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2797 vpool_array[index].pool 2798 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2799 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2800 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2801 rte_pktmbuf_init, NULL, socket, 0); 2802 if (vpool_array[index].pool != NULL) { 2803 vpool_array[index].ring 2804 = rte_ring_create(ring_name, 2805 rte_align32pow2(nb_mbuf + 1), 2806 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2807 if (likely(vpool_array[index].ring != NULL)) { 2808 LOG_DEBUG(VHOST_CONFIG, 2809 "in setup_mempool_tbl: mbuf count in " 2810 "mempool is: %d\n", 2811 rte_mempool_count(vpool_array[index].pool)); 2812 LOG_DEBUG(VHOST_CONFIG, 2813 "in setup_mempool_tbl: mbuf count in " 2814 "ring is: %d\n", 2815 rte_ring_count(vpool_array[index].ring)); 2816 } else { 2817 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2818 ring_name); 2819 } 2820 2821 /* Need consider head room. */ 2822 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2823 } else { 2824 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2825 } 2826 } 2827 2828 2829 /* 2830 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2831 * device is also registered here to handle the IOCTLs. 
2832 */ 2833 int 2834 MAIN(int argc, char *argv[]) 2835 { 2836 struct rte_mempool *mbuf_pool = NULL; 2837 unsigned lcore_id, core_id = 0; 2838 unsigned nb_ports, valid_num_ports; 2839 int ret; 2840 uint8_t portid, queue_id = 0; 2841 static pthread_t tid; 2842 2843 /* init EAL */ 2844 ret = rte_eal_init(argc, argv); 2845 if (ret < 0) 2846 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2847 argc -= ret; 2848 argv += ret; 2849 2850 /* parse app arguments */ 2851 ret = us_vhost_parse_args(argc, argv); 2852 if (ret < 0) 2853 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2854 2855 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2856 if (rte_lcore_is_enabled(lcore_id)) 2857 lcore_ids[core_id ++] = lcore_id; 2858 2859 if (rte_lcore_count() > RTE_MAX_LCORE) 2860 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2861 2862 /*set the number of swithcing cores available*/ 2863 num_switching_cores = rte_lcore_count()-1; 2864 2865 /* Get the number of physical ports. */ 2866 nb_ports = rte_eth_dev_count(); 2867 if (nb_ports > RTE_MAX_ETHPORTS) 2868 nb_ports = RTE_MAX_ETHPORTS; 2869 2870 /* 2871 * Update the global var NUM_PORTS and global array PORTS 2872 * and get value of var VALID_NUM_PORTS according to system ports number 2873 */ 2874 valid_num_ports = check_ports_num(nb_ports); 2875 2876 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2877 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2878 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2879 return -1; 2880 } 2881 2882 if (zero_copy == 0) { 2883 /* Create the mbuf pool. */ 2884 mbuf_pool = rte_mempool_create( 2885 "MBUF_POOL", 2886 NUM_MBUFS_PER_PORT 2887 * valid_num_ports, 2888 MBUF_SIZE, MBUF_CACHE_SIZE, 2889 sizeof(struct rte_pktmbuf_pool_private), 2890 rte_pktmbuf_pool_init, NULL, 2891 rte_pktmbuf_init, NULL, 2892 rte_socket_id(), 0); 2893 if (mbuf_pool == NULL) 2894 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2895 2896 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2897 vpool_array[queue_id].pool = mbuf_pool; 2898 2899 if (vm2vm_mode == VM2VM_HARDWARE) { 2900 /* Enable VT loop back to let L2 switch to do it. */ 2901 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2902 LOG_DEBUG(VHOST_CONFIG, 2903 "Enable loop back for L2 switch in vmdq.\n"); 2904 } 2905 } else { 2906 uint32_t nb_mbuf; 2907 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2908 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2909 2910 /* 2911 * Zero copy defers queue RX/TX start to the time when guest 2912 * finishes its startup and packet buffers from that guest are 2913 * available. 
2914 */ 2915 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2916 rx_conf_default.rx_drop_en = 0; 2917 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2918 nb_mbuf = num_rx_descriptor 2919 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2920 + num_switching_cores * MAX_PKT_BURST; 2921 2922 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2923 snprintf(pool_name, sizeof(pool_name), 2924 "rxmbuf_pool_%u", queue_id); 2925 snprintf(ring_name, sizeof(ring_name), 2926 "rxmbuf_ring_%u", queue_id); 2927 setup_mempool_tbl(rte_socket_id(), queue_id, 2928 pool_name, ring_name, nb_mbuf); 2929 } 2930 2931 nb_mbuf = num_tx_descriptor 2932 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2933 + num_switching_cores * MAX_PKT_BURST; 2934 2935 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2936 snprintf(pool_name, sizeof(pool_name), 2937 "txmbuf_pool_%u", queue_id); 2938 snprintf(ring_name, sizeof(ring_name), 2939 "txmbuf_ring_%u", queue_id); 2940 setup_mempool_tbl(rte_socket_id(), 2941 (queue_id + MAX_QUEUES), 2942 pool_name, ring_name, nb_mbuf); 2943 } 2944 2945 if (vm2vm_mode == VM2VM_HARDWARE) { 2946 /* Enable VT loop back to let L2 switch to do it. */ 2947 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2948 LOG_DEBUG(VHOST_CONFIG, 2949 "Enable loop back for L2 switch in vmdq.\n"); 2950 } 2951 } 2952 /* Set log level. */ 2953 rte_set_log_level(LOG_LEVEL); 2954 2955 /* initialize all ports */ 2956 for (portid = 0; portid < nb_ports; portid++) { 2957 /* skip ports that are not enabled */ 2958 if ((enabled_port_mask & (1 << portid)) == 0) { 2959 RTE_LOG(INFO, VHOST_PORT, 2960 "Skipping disabled port %d\n", portid); 2961 continue; 2962 } 2963 if (port_init(portid) != 0) 2964 rte_exit(EXIT_FAILURE, 2965 "Cannot initialize network ports\n"); 2966 } 2967 2968 /* Initialise all linked lists. */ 2969 if (init_data_ll() == -1) 2970 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2971 2972 /* Initialize device stats */ 2973 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2974 2975 /* Enable stats if the user option is set. */ 2976 if (enable_stats) 2977 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2978 2979 /* Launch all data cores. */ 2980 if (zero_copy == 0) { 2981 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 2982 rte_eal_remote_launch(switch_worker, 2983 mbuf_pool, lcore_id); 2984 } 2985 } else { 2986 uint32_t count_in_mempool, index, i; 2987 for (index = 0; index < 2*MAX_QUEUES; index++) { 2988 /* For all RX and TX queues. */ 2989 count_in_mempool 2990 = rte_mempool_count(vpool_array[index].pool); 2991 2992 /* 2993 * Transfer all un-attached mbufs from vpool.pool 2994 * to vpoo.ring. 2995 */ 2996 for (i = 0; i < count_in_mempool; i++) { 2997 struct rte_mbuf *mbuf 2998 = __rte_mbuf_raw_alloc( 2999 vpool_array[index].pool); 3000 rte_ring_sp_enqueue(vpool_array[index].ring, 3001 (void *)mbuf); 3002 } 3003 3004 LOG_DEBUG(VHOST_CONFIG, 3005 "in MAIN: mbuf count in mempool at initial " 3006 "is: %d\n", count_in_mempool); 3007 LOG_DEBUG(VHOST_CONFIG, 3008 "in MAIN: mbuf count in ring at initial is :" 3009 " %d\n", 3010 rte_ring_count(vpool_array[index].ring)); 3011 } 3012 3013 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3014 rte_eal_remote_launch(switch_worker_zcp, NULL, 3015 lcore_id); 3016 } 3017 3018 if (mergeable == 0) 3019 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3020 3021 /* Register CUSE device to handle IOCTLs. 
*/ 3022 ret = rte_vhost_driver_register((char *)&dev_basename); 3023 if (ret != 0) 3024 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3025 3026 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3027 3028 /* Start CUSE session. */ 3029 rte_vhost_driver_session_start(); 3030 return 0; 3031 3032 } 3033 3034