1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 128 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 /* 83 * RX and TX Prefetch, Host, and Write-back threshold values should be 84 * carefully set for optimal performance. Consult the network 85 * controller's datasheet and supporting DPDK documentation for guidance 86 * on how these parameters should be set. 87 */ 88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */ 89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */ 90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. 
*/ 91 92 /* 93 * These default values are optimized for use with the Intel(R) 82599 10 GbE 94 * Controller and the DPDK ixgbe PMD. Consider using other values for other 95 * network controllers and/or network drivers. 96 */ 97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */ 98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */ 99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */ 100 101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 102 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 103 104 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 105 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 106 107 #define JUMBO_FRAME_MAX_SIZE 0x2600 108 109 /* State of virtio device. */ 110 #define DEVICE_MAC_LEARNING 0 111 #define DEVICE_RX 1 112 #define DEVICE_SAFE_REMOVE 2 113 114 /* Config_core_flag status definitions. */ 115 #define REQUEST_DEV_REMOVAL 1 116 #define ACK_DEV_REMOVAL 0 117 118 /* Configurable number of RX/TX ring descriptors */ 119 #define RTE_TEST_RX_DESC_DEFAULT 1024 120 #define RTE_TEST_TX_DESC_DEFAULT 512 121 122 /* 123 * Need refine these 2 macros for legacy and DPDK based front end: 124 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 125 * And then adjust power 2. 126 */ 127 /* 128 * For legacy front end, 128 descriptors, 129 * half for virtio header, another half for mbuf. 130 */ 131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 133 134 /* Get first 4 bytes in mbuf headroom. */ 135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 136 + sizeof(struct rte_mbuf))) 137 138 /* true if x is a power of 2 */ 139 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 140 141 #define INVALID_PORT_ID 0xFF 142 143 /* Max number of devices. Limited by vmdq. */ 144 #define MAX_DEVICES 64 145 146 /* Size of buffers used for snprintfs. */ 147 #define MAX_PRINT_BUFF 6072 148 149 /* Maximum character device basename size. */ 150 #define MAX_BASENAME_SZ 10 151 152 /* Maximum long option length for option parsing. */ 153 #define MAX_LONG_OPT_SZ 64 154 155 /* Used to compare MAC addresses. */ 156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 157 158 /* Number of descriptors per cacheline. */ 159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc)) 160 161 /* mask of enabled ports */ 162 static uint32_t enabled_port_mask = 0; 163 164 /*Number of switching cores enabled*/ 165 static uint32_t num_switching_cores = 0; 166 167 /* number of devices/queues to support*/ 168 static uint32_t num_queues = 0; 169 uint32_t num_devices = 0; 170 171 /* 172 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 173 * disabled on default. 174 */ 175 static uint32_t zero_copy; 176 static int mergeable; 177 178 /* number of descriptors to apply*/ 179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 181 182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 183 #define MAX_RING_DESC 4096 184 185 struct vpool { 186 struct rte_mempool *pool; 187 struct rte_ring *ring; 188 uint32_t buf_size; 189 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 190 191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. 
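 * Roughly (inferred from the code below): VM2VM_SOFTWARE forwards VM-to-VM traffic in
 * software via rte_vhost_enqueue_burst() (see virtio_tx_local()), VM2VM_HARDWARE
 * re-tags the packet with the destination VM's VLAN and sends it out the physical
 * port so the NIC can switch it back to the destination pool, and VM2VM_DISABLED
 * treats every packet as external traffic.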
 */
typedef enum {
    VM2VM_DISABLED = 0,
    VM2VM_SOFTWARE = 1,
    VM2VM_HARDWARE = 2,
    VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
    PHYS_ADDR_CONTINUOUS = 0,
    PHYS_ADDR_CROSS_SUBREG = 1,
    PHYS_ADDR_INVALID = 2,
    PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Default configuration for RX and TX thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
    .rx_thresh = {
        .pthresh = RX_PTHRESH,
        .hthresh = RX_HTHRESH,
        .wthresh = RX_WTHRESH,
    },
    .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
    .tx_thresh = {
        .pthresh = TX_PTHRESH,
        .hthresh = TX_HTHRESH,
        .wthresh = TX_WTHRESH,
    },
    .tx_free_thresh = 0, /* Use PMD default values */
    .tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * VLAN stripping is necessary for 1G NICs such as the I350;
         * without it, IPv4 forwarding in the guest cannot forward
         * packets from one virtio device to another.
         */
        .hw_vlan_strip = 1, /**< VLAN strip enabled.
*/ 260 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 261 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 262 }, 263 264 .txmode = { 265 .mq_mode = ETH_MQ_TX_NONE, 266 }, 267 .rx_adv_conf = { 268 /* 269 * should be overridden separately in code with 270 * appropriate values 271 */ 272 .vmdq_rx_conf = { 273 .nb_queue_pools = ETH_8_POOLS, 274 .enable_default_pool = 0, 275 .default_pool = 0, 276 .nb_pool_maps = 0, 277 .pool_map = {{0, 0},}, 278 }, 279 }, 280 }; 281 282 static unsigned lcore_ids[RTE_MAX_LCORE]; 283 static uint8_t ports[RTE_MAX_ETHPORTS]; 284 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 285 286 static const uint16_t external_pkt_default_vlan_tag = 2000; 287 const uint16_t vlan_tags[] = { 288 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 289 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 290 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 291 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 292 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 293 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 294 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 295 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 296 }; 297 298 /* ethernet addresses of ports */ 299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 300 301 /* heads for the main used and free linked lists for the data path. */ 302 static struct virtio_net_data_ll *ll_root_used = NULL; 303 static struct virtio_net_data_ll *ll_root_free = NULL; 304 305 /* Array of data core structures containing information on individual core linked lists. */ 306 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 307 308 /* Used for queueing bursts of TX packets. */ 309 struct mbuf_table { 310 unsigned len; 311 unsigned txq_id; 312 struct rte_mbuf *m_table[MAX_PKT_BURST]; 313 }; 314 315 /* TX queue for each data core. */ 316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 317 318 /* TX queue fori each virtio device for zero copy. */ 319 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 320 321 /* Vlan header struct used to insert vlan tags on TX. */ 322 struct vlan_ethhdr { 323 unsigned char h_dest[ETH_ALEN]; 324 unsigned char h_source[ETH_ALEN]; 325 __be16 h_vlan_proto; 326 __be16 h_vlan_TCI; 327 __be16 h_vlan_encapsulated_proto; 328 }; 329 330 /* IPv4 Header */ 331 struct ipv4_hdr { 332 uint8_t version_ihl; /**< version and header length */ 333 uint8_t type_of_service; /**< type of service */ 334 uint16_t total_length; /**< length of packet */ 335 uint16_t packet_id; /**< packet ID */ 336 uint16_t fragment_offset; /**< fragmentation offset */ 337 uint8_t time_to_live; /**< time to live */ 338 uint8_t next_proto_id; /**< protocol ID */ 339 uint16_t hdr_checksum; /**< header checksum */ 340 uint32_t src_addr; /**< source address */ 341 uint32_t dst_addr; /**< destination address */ 342 } __attribute__((__packed__)); 343 344 /* Header lengths. */ 345 #define VLAN_HLEN 4 346 #define VLAN_ETH_HLEN 18 347 348 /* Per-device statistics struct */ 349 struct device_statistics { 350 uint64_t tx_total; 351 rte_atomic64_t rx_total_atomic; 352 uint64_t rx_total; 353 uint64_t tx; 354 rte_atomic64_t rx_atomic; 355 uint64_t rx; 356 } __rte_cache_aligned; 357 struct device_statistics dev_statistics[MAX_DEVICES]; 358 359 /* 360 * Builds up the correct configuration for VMDQ VLAN pool map 361 * according to the pool & queue limits. 
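 * For example, with num_devices = 8 the map becomes pool 0 <-> VLAN 1000,
 * pool 1 <-> VLAN 1001, ... pool 7 <-> VLAN 1007: one pool per device, with a
 * single bit set in each pool mask (see the loop over conf.pool_map[] below).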
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back =
        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
        sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
    return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, log an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
    if (num_devices > max_nb_devices) {
        RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
        return -1;
    }
    return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
    struct rte_eth_dev_info dev_info;
    struct rte_eth_conf port_conf;
    uint16_t rx_rings, tx_rings;
    uint16_t rx_ring_size, tx_ring_size;
    int retval;
    uint16_t q;

    /*
     * The max pool number from dev_info is used to validate the pool
     * number specified on the command line.
     */
    rte_eth_dev_info_get(port, &dev_info);

    /* Configure the number of supported virtio devices based on VMDQ limits. */
    num_devices = dev_info.max_vmdq_pools;
    num_queues = dev_info.max_rx_queues;

    if (zero_copy) {
        rx_ring_size = num_rx_descriptor;
        tx_ring_size = num_tx_descriptor;
        tx_rings = dev_info.max_tx_queues;
    } else {
        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();
    }

    retval = validate_num_devices(MAX_DEVICES);
    if (retval < 0)
        return retval;

    /* Get port configuration. */
    retval = get_eth_conf(&port_conf, num_devices);
    if (retval < 0)
        return retval;

    if (port >= rte_eth_dev_count())
        return -1;

    rx_rings = (uint16_t)num_queues;

    /* Configure ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    /* Setup the queues. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                rte_eth_dev_socket_id(port), &rx_conf_default,
                vpool_array[q].pool);
        if (retval < 0)
            return retval;
    }
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                rte_eth_dev_socket_id(port), &tx_conf_default);
        if (retval < 0)
            return retval;
    }

    /* Start the device.
*/ 465 retval = rte_eth_dev_start(port); 466 if (retval < 0) { 467 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 468 return retval; 469 } 470 471 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 472 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 473 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 474 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 475 (unsigned)port, 476 vmdq_ports_eth_addr[port].addr_bytes[0], 477 vmdq_ports_eth_addr[port].addr_bytes[1], 478 vmdq_ports_eth_addr[port].addr_bytes[2], 479 vmdq_ports_eth_addr[port].addr_bytes[3], 480 vmdq_ports_eth_addr[port].addr_bytes[4], 481 vmdq_ports_eth_addr[port].addr_bytes[5]); 482 483 return 0; 484 } 485 486 /* 487 * Set character device basename. 488 */ 489 static int 490 us_vhost_parse_basename(const char *q_arg) 491 { 492 /* parse number string */ 493 494 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 495 return -1; 496 else 497 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 498 499 return 0; 500 } 501 502 /* 503 * Parse the portmask provided at run time. 504 */ 505 static int 506 parse_portmask(const char *portmask) 507 { 508 char *end = NULL; 509 unsigned long pm; 510 511 errno = 0; 512 513 /* parse hexadecimal string */ 514 pm = strtoul(portmask, &end, 16); 515 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 516 return -1; 517 518 if (pm == 0) 519 return -1; 520 521 return pm; 522 523 } 524 525 /* 526 * Parse num options at run time. 527 */ 528 static int 529 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 530 { 531 char *end = NULL; 532 unsigned long num; 533 534 errno = 0; 535 536 /* parse unsigned int string */ 537 num = strtoul(q_arg, &end, 10); 538 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 539 return -1; 540 541 if (num > max_valid_value) 542 return -1; 543 544 return num; 545 546 } 547 548 /* 549 * Display usage 550 */ 551 static void 552 us_vhost_usage(const char *prgname) 553 { 554 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 555 " --vm2vm [0|1|2]\n" 556 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 557 " --dev-basename <name>\n" 558 " --nb-devices ND\n" 559 " -p PORTMASK: Set mask for ports to be used by application\n" 560 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 561 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 562 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 563 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 564 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 565 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 566 " --dev-basename: The basename to be used for the character device.\n" 567 " --zero-copy [0|1]: disable(default)/enable rx/tx " 568 "zero copy\n" 569 " --rx-desc-num [0-N]: the number of descriptors on rx, " 570 "used only when zero copy is enabled.\n" 571 " --tx-desc-num [0-N]: the number of descriptors on tx, " 572 "used only when zero copy is enabled.\n", 573 prgname); 574 } 575 576 /* 577 * Parse the arguments given in the command line of the application. 
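 * An illustrative invocation (assuming the example binary is built as
 * vhost-switch; EAL core/memory options depend on the platform) might be:
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2 --dev-basename vhost-net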
578 */ 579 static int 580 us_vhost_parse_args(int argc, char **argv) 581 { 582 int opt, ret; 583 int option_index; 584 unsigned i; 585 const char *prgname = argv[0]; 586 static struct option long_option[] = { 587 {"vm2vm", required_argument, NULL, 0}, 588 {"rx-retry", required_argument, NULL, 0}, 589 {"rx-retry-delay", required_argument, NULL, 0}, 590 {"rx-retry-num", required_argument, NULL, 0}, 591 {"mergeable", required_argument, NULL, 0}, 592 {"stats", required_argument, NULL, 0}, 593 {"dev-basename", required_argument, NULL, 0}, 594 {"zero-copy", required_argument, NULL, 0}, 595 {"rx-desc-num", required_argument, NULL, 0}, 596 {"tx-desc-num", required_argument, NULL, 0}, 597 {NULL, 0, 0, 0}, 598 }; 599 600 /* Parse command line */ 601 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 602 switch (opt) { 603 /* Portmask */ 604 case 'p': 605 enabled_port_mask = parse_portmask(optarg); 606 if (enabled_port_mask == 0) { 607 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 608 us_vhost_usage(prgname); 609 return -1; 610 } 611 break; 612 613 case 0: 614 /* Enable/disable vm2vm comms. */ 615 if (!strncmp(long_option[option_index].name, "vm2vm", 616 MAX_LONG_OPT_SZ)) { 617 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 618 if (ret == -1) { 619 RTE_LOG(INFO, VHOST_CONFIG, 620 "Invalid argument for " 621 "vm2vm [0|1|2]\n"); 622 us_vhost_usage(prgname); 623 return -1; 624 } else { 625 vm2vm_mode = (vm2vm_type)ret; 626 } 627 } 628 629 /* Enable/disable retries on RX. */ 630 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 631 ret = parse_num_opt(optarg, 1); 632 if (ret == -1) { 633 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 634 us_vhost_usage(prgname); 635 return -1; 636 } else { 637 enable_retry = ret; 638 } 639 } 640 641 /* Specify the retries delay time (in useconds) on RX. */ 642 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 643 ret = parse_num_opt(optarg, INT32_MAX); 644 if (ret == -1) { 645 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 646 us_vhost_usage(prgname); 647 return -1; 648 } else { 649 burst_rx_delay_time = ret; 650 } 651 } 652 653 /* Specify the retries number on RX. */ 654 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 655 ret = parse_num_opt(optarg, INT32_MAX); 656 if (ret == -1) { 657 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 658 us_vhost_usage(prgname); 659 return -1; 660 } else { 661 burst_rx_retry_num = ret; 662 } 663 } 664 665 /* Enable/disable RX mergeable buffers. */ 666 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 667 ret = parse_num_opt(optarg, 1); 668 if (ret == -1) { 669 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 670 us_vhost_usage(prgname); 671 return -1; 672 } else { 673 mergeable = !!ret; 674 if (ret) { 675 vmdq_conf_default.rxmode.jumbo_frame = 1; 676 vmdq_conf_default.rxmode.max_rx_pkt_len 677 = JUMBO_FRAME_MAX_SIZE; 678 } 679 } 680 } 681 682 /* Enable/disable stats. */ 683 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 684 ret = parse_num_opt(optarg, INT32_MAX); 685 if (ret == -1) { 686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 687 us_vhost_usage(prgname); 688 return -1; 689 } else { 690 enable_stats = ret; 691 } 692 } 693 694 /* Set character device basename. 
*/ 695 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 696 if (us_vhost_parse_basename(optarg) == -1) { 697 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 698 us_vhost_usage(prgname); 699 return -1; 700 } 701 } 702 703 /* Enable/disable rx/tx zero copy. */ 704 if (!strncmp(long_option[option_index].name, 705 "zero-copy", MAX_LONG_OPT_SZ)) { 706 ret = parse_num_opt(optarg, 1); 707 if (ret == -1) { 708 RTE_LOG(INFO, VHOST_CONFIG, 709 "Invalid argument" 710 " for zero-copy [0|1]\n"); 711 us_vhost_usage(prgname); 712 return -1; 713 } else 714 zero_copy = ret; 715 716 if (zero_copy) { 717 #ifdef RTE_MBUF_REFCNT 718 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 719 "zero copy vhost APP, please " 720 "disable RTE_MBUF_REFCNT\n" 721 "in config file and then rebuild DPDK " 722 "core lib!\n" 723 "Otherwise please disable zero copy " 724 "flag in command line!\n"); 725 return -1; 726 #endif 727 } 728 } 729 730 /* Specify the descriptor number on RX. */ 731 if (!strncmp(long_option[option_index].name, 732 "rx-desc-num", MAX_LONG_OPT_SZ)) { 733 ret = parse_num_opt(optarg, MAX_RING_DESC); 734 if ((ret == -1) || (!POWEROF2(ret))) { 735 RTE_LOG(INFO, VHOST_CONFIG, 736 "Invalid argument for rx-desc-num[0-N]," 737 "power of 2 required.\n"); 738 us_vhost_usage(prgname); 739 return -1; 740 } else { 741 num_rx_descriptor = ret; 742 } 743 } 744 745 /* Specify the descriptor number on TX. */ 746 if (!strncmp(long_option[option_index].name, 747 "tx-desc-num", MAX_LONG_OPT_SZ)) { 748 ret = parse_num_opt(optarg, MAX_RING_DESC); 749 if ((ret == -1) || (!POWEROF2(ret))) { 750 RTE_LOG(INFO, VHOST_CONFIG, 751 "Invalid argument for tx-desc-num [0-N]," 752 "power of 2 required.\n"); 753 us_vhost_usage(prgname); 754 return -1; 755 } else { 756 num_tx_descriptor = ret; 757 } 758 } 759 760 break; 761 762 /* Invalid option - print options. 
*/ 763 default: 764 us_vhost_usage(prgname); 765 return -1; 766 } 767 } 768 769 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 770 if (enabled_port_mask & (1 << i)) 771 ports[num_ports++] = (uint8_t)i; 772 } 773 774 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 775 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 776 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 777 return -1; 778 } 779 780 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 781 RTE_LOG(INFO, VHOST_PORT, 782 "Vhost zero copy doesn't support software vm2vm," 783 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 784 return -1; 785 } 786 787 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 788 RTE_LOG(INFO, VHOST_PORT, 789 "Vhost zero copy doesn't support jumbo frame," 790 "please specify '--mergeable 0' to disable the " 791 "mergeable feature.\n"); 792 return -1; 793 } 794 795 return 0; 796 } 797 798 /* 799 * Update the global var NUM_PORTS and array PORTS according to system ports number 800 * and return valid ports number 801 */ 802 static unsigned check_ports_num(unsigned nb_ports) 803 { 804 unsigned valid_num_ports = num_ports; 805 unsigned portid; 806 807 if (num_ports > nb_ports) { 808 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 809 num_ports, nb_ports); 810 num_ports = nb_ports; 811 } 812 813 for (portid = 0; portid < num_ports; portid ++) { 814 if (ports[portid] >= nb_ports) { 815 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 816 ports[portid], (nb_ports - 1)); 817 ports[portid] = INVALID_PORT_ID; 818 valid_num_ports--; 819 } 820 } 821 return valid_num_ports; 822 } 823 824 /* 825 * Macro to print out packet contents. Wrapped in debug define so that the 826 * data path is not effected when debug is disabled. 827 */ 828 #ifdef DEBUG 829 #define PRINT_PACKET(device, addr, size, header) do { \ 830 char *pkt_addr = (char*)(addr); \ 831 unsigned int index; \ 832 char packet[MAX_PRINT_BUFF]; \ 833 \ 834 if ((header)) \ 835 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 836 else \ 837 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 838 for (index = 0; index < (size); index++) { \ 839 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 840 "%02hhx ", pkt_addr[index]); \ 841 } \ 842 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 843 \ 844 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 845 } while(0) 846 #else 847 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 848 #endif 849 850 /* 851 * Function to convert guest physical addresses to vhost physical addresses. 852 * This is used to convert virtio buffer addresses. 
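 * On return, *addr_type reports whether the buffer of buf_len bytes fits
 * entirely inside one host memory region (PHYS_ADDR_CONTINUOUS), crosses a
 * sub-region boundary (PHYS_ADDR_CROSS_SUBREG), or could not be translated
 * at all (PHYS_ADDR_INVALID).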
853 */ 854 static inline uint64_t __attribute__((always_inline)) 855 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 856 uint32_t buf_len, hpa_type *addr_type) 857 { 858 struct virtio_memory_regions_hpa *region; 859 uint32_t regionidx; 860 uint64_t vhost_pa = 0; 861 862 *addr_type = PHYS_ADDR_INVALID; 863 864 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 865 region = &vdev->regions_hpa[regionidx]; 866 if ((guest_pa >= region->guest_phys_address) && 867 (guest_pa <= region->guest_phys_address_end)) { 868 vhost_pa = region->host_phys_addr_offset + guest_pa; 869 if (likely((guest_pa + buf_len - 1) 870 <= region->guest_phys_address_end)) 871 *addr_type = PHYS_ADDR_CONTINUOUS; 872 else 873 *addr_type = PHYS_ADDR_CROSS_SUBREG; 874 break; 875 } 876 } 877 878 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 879 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 880 (void *)(uintptr_t)vhost_pa); 881 882 return vhost_pa; 883 } 884 885 /* 886 * Compares a packet destination MAC address to a device MAC address. 887 */ 888 static inline int __attribute__((always_inline)) 889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 890 { 891 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 892 } 893 894 /* 895 * This function learns the MAC address of the device and registers this along with a 896 * vlan tag to a VMDQ. 897 */ 898 static int 899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 900 { 901 struct ether_hdr *pkt_hdr; 902 struct virtio_net_data_ll *dev_ll; 903 struct virtio_net *dev = vdev->dev; 904 int i, ret; 905 906 /* Learn MAC address of guest device from packet */ 907 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 908 909 dev_ll = ll_root_used; 910 911 while (dev_ll != NULL) { 912 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 913 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 914 return -1; 915 } 916 dev_ll = dev_ll->next; 917 } 918 919 for (i = 0; i < ETHER_ADDR_LEN; i++) 920 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 921 922 /* vlan_tag currently uses the device_id. */ 923 vdev->vlan_tag = vlan_tags[dev->device_fh]; 924 925 /* Print out VMDQ registration info. */ 926 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 927 dev->device_fh, 928 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 929 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 930 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 931 vdev->vlan_tag); 932 933 /* Register the MAC address. */ 934 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 935 if (ret) 936 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 937 dev->device_fh); 938 939 /* Enable stripping of the vlan tag as we handle routing. */ 940 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 941 942 /* Set device as ready for RX. */ 943 vdev->ready = DEVICE_RX; 944 945 return 0; 946 } 947 948 /* 949 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 950 * queue before disabling RX on the device. 
951 */ 952 static inline void 953 unlink_vmdq(struct vhost_dev *vdev) 954 { 955 unsigned i = 0; 956 unsigned rx_count; 957 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 958 959 if (vdev->ready == DEVICE_RX) { 960 /*clear MAC and VLAN settings*/ 961 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 962 for (i = 0; i < 6; i++) 963 vdev->mac_address.addr_bytes[i] = 0; 964 965 vdev->vlan_tag = 0; 966 967 /*Clear out the receive buffers*/ 968 rx_count = rte_eth_rx_burst(ports[0], 969 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 970 971 while (rx_count) { 972 for (i = 0; i < rx_count; i++) 973 rte_pktmbuf_free(pkts_burst[i]); 974 975 rx_count = rte_eth_rx_burst(ports[0], 976 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 977 } 978 979 vdev->ready = DEVICE_MAC_LEARNING; 980 } 981 } 982 983 /* 984 * Check if the packet destination MAC address is for a local device. If so then put 985 * the packet on that devices RX queue. If not then return. 986 */ 987 static inline unsigned __attribute__((always_inline)) 988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 989 { 990 struct virtio_net_data_ll *dev_ll; 991 struct ether_hdr *pkt_hdr; 992 uint64_t ret = 0; 993 struct virtio_net *dev = vdev->dev; 994 struct virtio_net *tdev; /* destination virito device */ 995 996 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 997 998 /*get the used devices list*/ 999 dev_ll = ll_root_used; 1000 1001 while (dev_ll != NULL) { 1002 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1003 &dev_ll->vdev->mac_address)) { 1004 1005 /* Drop the packet if the TX packet is destined for the TX device. */ 1006 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1007 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1008 dev->device_fh); 1009 return 0; 1010 } 1011 tdev = dev_ll->vdev->dev; 1012 1013 1014 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1015 1016 if (dev_ll->vdev->remove) { 1017 /*drop the packet if the device is marked for removal*/ 1018 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1019 } else { 1020 /*send the packet to the local virtio device*/ 1021 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1022 if (enable_stats) { 1023 rte_atomic64_add( 1024 &dev_statistics[tdev->device_fh].rx_total_atomic, 1025 1); 1026 rte_atomic64_add( 1027 &dev_statistics[tdev->device_fh].rx_atomic, 1028 ret); 1029 dev_statistics[tdev->device_fh].tx_total++; 1030 dev_statistics[tdev->device_fh].tx += ret; 1031 } 1032 } 1033 1034 return 0; 1035 } 1036 dev_ll = dev_ll->next; 1037 } 1038 1039 return -1; 1040 } 1041 1042 /* 1043 * This function routes the TX packet to the correct interface. This may be a local device 1044 * or the physical port. 
1045 */ 1046 static inline void __attribute__((always_inline)) 1047 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1048 { 1049 struct mbuf_table *tx_q; 1050 struct rte_mbuf **m_table; 1051 unsigned len, ret, offset = 0; 1052 const uint16_t lcore_id = rte_lcore_id(); 1053 struct virtio_net_data_ll *dev_ll = ll_root_used; 1054 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1055 struct virtio_net *dev = vdev->dev; 1056 1057 /*check if destination is local VM*/ 1058 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1059 rte_pktmbuf_free(m); 1060 return; 1061 } 1062 1063 if (vm2vm_mode == VM2VM_HARDWARE) { 1064 while (dev_ll != NULL) { 1065 if ((dev_ll->vdev->ready == DEVICE_RX) 1066 && ether_addr_cmp(&(pkt_hdr->d_addr), 1067 &dev_ll->vdev->mac_address)) { 1068 /* 1069 * Drop the packet if the TX packet is 1070 * destined for the TX device. 1071 */ 1072 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1073 LOG_DEBUG(VHOST_DATA, 1074 "(%"PRIu64") TX: Source and destination" 1075 " MAC addresses are the same. Dropping " 1076 "packet.\n", 1077 dev_ll->vdev->dev->device_fh); 1078 rte_pktmbuf_free(m); 1079 return; 1080 } 1081 offset = 4; 1082 vlan_tag = 1083 (uint16_t) 1084 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1085 1086 LOG_DEBUG(VHOST_DATA, 1087 "(%"PRIu64") TX: pkt to local VM device id:" 1088 "(%"PRIu64") vlan tag: %d.\n", 1089 dev->device_fh, dev_ll->vdev->dev->device_fh, 1090 vlan_tag); 1091 1092 break; 1093 } 1094 dev_ll = dev_ll->next; 1095 } 1096 } 1097 1098 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1099 1100 /*Add packet to the port tx queue*/ 1101 tx_q = &lcore_tx_queue[lcore_id]; 1102 len = tx_q->len; 1103 1104 m->ol_flags = PKT_TX_VLAN_PKT; 1105 /*FIXME: offset*/ 1106 m->data_len += offset; 1107 m->vlan_tci = vlan_tag; 1108 1109 tx_q->m_table[len] = m; 1110 len++; 1111 if (enable_stats) { 1112 dev_statistics[dev->device_fh].tx_total++; 1113 dev_statistics[dev->device_fh].tx++; 1114 } 1115 1116 if (unlikely(len == MAX_PKT_BURST)) { 1117 m_table = (struct rte_mbuf **)tx_q->m_table; 1118 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1119 /* Free any buffers not handled by TX and update the port stats. */ 1120 if (unlikely(ret < len)) { 1121 do { 1122 rte_pktmbuf_free(m_table[ret]); 1123 } while (++ret < len); 1124 } 1125 1126 len = 0; 1127 } 1128 1129 tx_q->len = len; 1130 return; 1131 } 1132 /* 1133 * This function is called by each data core. It handles all RX/TX registered with the 1134 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1135 * with all devices in the main linked list. 
1136 */ 1137 static int 1138 switch_worker(__attribute__((unused)) void *arg) 1139 { 1140 struct rte_mempool *mbuf_pool = arg; 1141 struct virtio_net *dev = NULL; 1142 struct vhost_dev *vdev = NULL; 1143 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1144 struct virtio_net_data_ll *dev_ll; 1145 struct mbuf_table *tx_q; 1146 volatile struct lcore_ll_info *lcore_ll; 1147 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1148 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1149 unsigned ret, i; 1150 const uint16_t lcore_id = rte_lcore_id(); 1151 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1152 uint16_t rx_count = 0; 1153 uint16_t tx_count; 1154 uint32_t retry = 0; 1155 1156 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1157 lcore_ll = lcore_info[lcore_id].lcore_ll; 1158 prev_tsc = 0; 1159 1160 tx_q = &lcore_tx_queue[lcore_id]; 1161 for (i = 0; i < num_cores; i ++) { 1162 if (lcore_ids[i] == lcore_id) { 1163 tx_q->txq_id = i; 1164 break; 1165 } 1166 } 1167 1168 while(1) { 1169 cur_tsc = rte_rdtsc(); 1170 /* 1171 * TX burst queue drain 1172 */ 1173 diff_tsc = cur_tsc - prev_tsc; 1174 if (unlikely(diff_tsc > drain_tsc)) { 1175 1176 if (tx_q->len) { 1177 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1178 1179 /*Tx any packets in the queue*/ 1180 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1181 (struct rte_mbuf **)tx_q->m_table, 1182 (uint16_t)tx_q->len); 1183 if (unlikely(ret < tx_q->len)) { 1184 do { 1185 rte_pktmbuf_free(tx_q->m_table[ret]); 1186 } while (++ret < tx_q->len); 1187 } 1188 1189 tx_q->len = 0; 1190 } 1191 1192 prev_tsc = cur_tsc; 1193 1194 } 1195 1196 rte_prefetch0(lcore_ll->ll_root_used); 1197 /* 1198 * Inform the configuration core that we have exited the linked list and that no devices are 1199 * in use if requested. 
1200 */ 1201 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1202 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1203 1204 /* 1205 * Process devices 1206 */ 1207 dev_ll = lcore_ll->ll_root_used; 1208 1209 while (dev_ll != NULL) { 1210 /*get virtio device ID*/ 1211 vdev = dev_ll->vdev; 1212 dev = vdev->dev; 1213 1214 if (vdev->remove) { 1215 dev_ll = dev_ll->next; 1216 unlink_vmdq(vdev); 1217 vdev->ready = DEVICE_SAFE_REMOVE; 1218 continue; 1219 } 1220 if (likely(vdev->ready == DEVICE_RX)) { 1221 /*Handle guest RX*/ 1222 rx_count = rte_eth_rx_burst(ports[0], 1223 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1224 1225 if (rx_count) { 1226 /* 1227 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1228 * Here MAX_PKT_BURST must be less than virtio queue size 1229 */ 1230 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1231 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1232 rte_delay_us(burst_rx_delay_time); 1233 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1234 break; 1235 } 1236 } 1237 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1238 if (enable_stats) { 1239 rte_atomic64_add( 1240 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1241 rx_count); 1242 rte_atomic64_add( 1243 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1244 } 1245 while (likely(rx_count)) { 1246 rx_count--; 1247 rte_pktmbuf_free(pkts_burst[rx_count]); 1248 } 1249 1250 } 1251 } 1252 1253 if (!vdev->remove) { 1254 /* Handle guest TX*/ 1255 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1256 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1257 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1258 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1259 while (tx_count--) 1260 rte_pktmbuf_free(pkts_burst[tx_count]); 1261 } 1262 } 1263 while (tx_count) 1264 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1265 } 1266 1267 /*move to the next device in the list*/ 1268 dev_ll = dev_ll->next; 1269 } 1270 } 1271 1272 return 0; 1273 } 1274 1275 /* 1276 * This function gets available ring number for zero copy rx. 1277 * Only one thread will call this funciton for a paticular virtio device, 1278 * so, it is designed as non-thread-safe function. 1279 */ 1280 static inline uint32_t __attribute__((always_inline)) 1281 get_available_ring_num_zcp(struct virtio_net *dev) 1282 { 1283 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1284 uint16_t avail_idx; 1285 1286 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1287 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1288 } 1289 1290 /* 1291 * This function gets available ring index for zero copy rx, 1292 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1293 * Only one thread will call this funciton for a paticular virtio device, 1294 * so, it is designed as non-thread-safe function. 
1295 */ 1296 static inline uint32_t __attribute__((always_inline)) 1297 get_available_ring_index_zcp(struct virtio_net *dev, 1298 uint16_t *res_base_idx, uint32_t count) 1299 { 1300 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1301 uint16_t avail_idx; 1302 uint32_t retry = 0; 1303 uint16_t free_entries; 1304 1305 *res_base_idx = vq->last_used_idx_res; 1306 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1307 free_entries = (avail_idx - *res_base_idx); 1308 1309 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1310 "avail idx: %d, " 1311 "res base idx:%d, free entries:%d\n", 1312 dev->device_fh, avail_idx, *res_base_idx, 1313 free_entries); 1314 1315 /* 1316 * If retry is enabled and the queue is full then we wait 1317 * and retry to avoid packet loss. 1318 */ 1319 if (enable_retry && unlikely(count > free_entries)) { 1320 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1321 rte_delay_us(burst_rx_delay_time); 1322 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1323 free_entries = (avail_idx - *res_base_idx); 1324 if (count <= free_entries) 1325 break; 1326 } 1327 } 1328 1329 /*check that we have enough buffers*/ 1330 if (unlikely(count > free_entries)) 1331 count = free_entries; 1332 1333 if (unlikely(count == 0)) { 1334 LOG_DEBUG(VHOST_DATA, 1335 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1336 "avail idx: %d, res base idx:%d, free entries:%d\n", 1337 dev->device_fh, avail_idx, 1338 *res_base_idx, free_entries); 1339 return 0; 1340 } 1341 1342 vq->last_used_idx_res = *res_base_idx + count; 1343 1344 return count; 1345 } 1346 1347 /* 1348 * This function put descriptor back to used list. 1349 */ 1350 static inline void __attribute__((always_inline)) 1351 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1352 { 1353 uint16_t res_cur_idx = vq->last_used_idx; 1354 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1355 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1356 rte_compiler_barrier(); 1357 *(volatile uint16_t *)&vq->used->idx += 1; 1358 vq->last_used_idx += 1; 1359 1360 /* Kick the guest if necessary. */ 1361 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1362 eventfd_write((int)vq->kickfd, 1); 1363 } 1364 1365 /* 1366 * This function get available descriptor from vitio vring and un-attached mbuf 1367 * from vpool->ring, and then attach them together. It needs adjust the offset 1368 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1369 * frame data may be put to wrong location in mbuf. 
1370 */ 1371 static inline void __attribute__((always_inline)) 1372 attach_rxmbuf_zcp(struct virtio_net *dev) 1373 { 1374 uint16_t res_base_idx, desc_idx; 1375 uint64_t buff_addr, phys_addr; 1376 struct vhost_virtqueue *vq; 1377 struct vring_desc *desc; 1378 struct rte_mbuf *mbuf = NULL; 1379 struct vpool *vpool; 1380 hpa_type addr_type; 1381 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1382 1383 vpool = &vpool_array[vdev->vmdq_rx_q]; 1384 vq = dev->virtqueue[VIRTIO_RXQ]; 1385 1386 do { 1387 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1388 1) != 1)) 1389 return; 1390 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1391 1392 desc = &vq->desc[desc_idx]; 1393 if (desc->flags & VRING_DESC_F_NEXT) { 1394 desc = &vq->desc[desc->next]; 1395 buff_addr = gpa_to_vva(dev, desc->addr); 1396 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1397 &addr_type); 1398 } else { 1399 buff_addr = gpa_to_vva(dev, 1400 desc->addr + vq->vhost_hlen); 1401 phys_addr = gpa_to_hpa(vdev, 1402 desc->addr + vq->vhost_hlen, 1403 desc->len, &addr_type); 1404 } 1405 1406 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1407 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1408 " address found when attaching RX frame buffer" 1409 " address!\n", dev->device_fh); 1410 put_desc_to_used_list_zcp(vq, desc_idx); 1411 continue; 1412 } 1413 1414 /* 1415 * Check if the frame buffer address from guest crosses 1416 * sub-region or not. 1417 */ 1418 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1419 RTE_LOG(ERR, VHOST_DATA, 1420 "(%"PRIu64") Frame buffer address cross " 1421 "sub-regioin found when attaching RX frame " 1422 "buffer address!\n", 1423 dev->device_fh); 1424 put_desc_to_used_list_zcp(vq, desc_idx); 1425 continue; 1426 } 1427 } while (unlikely(phys_addr == 0)); 1428 1429 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1430 if (unlikely(mbuf == NULL)) { 1431 LOG_DEBUG(VHOST_DATA, 1432 "(%"PRIu64") in attach_rxmbuf_zcp: " 1433 "ring_sc_dequeue fail.\n", 1434 dev->device_fh); 1435 put_desc_to_used_list_zcp(vq, desc_idx); 1436 return; 1437 } 1438 1439 if (unlikely(vpool->buf_size > desc->len)) { 1440 LOG_DEBUG(VHOST_DATA, 1441 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1442 "length(%d) of descriptor idx: %d less than room " 1443 "size required: %d\n", 1444 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1445 put_desc_to_used_list_zcp(vq, desc_idx); 1446 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1447 return; 1448 } 1449 1450 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1451 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1452 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1453 mbuf->data_len = desc->len; 1454 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1455 1456 LOG_DEBUG(VHOST_DATA, 1457 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1458 "descriptor idx:%d\n", 1459 dev->device_fh, res_base_idx, desc_idx); 1460 1461 __rte_mbuf_raw_free(mbuf); 1462 1463 return; 1464 } 1465 1466 /* 1467 * Detach an attched packet mbuf - 1468 * - restore original mbuf address and length values. 1469 * - reset pktmbuf data and data_len to their default values. 1470 * All other fields of the given packet mbuf will be left intact. 1471 * 1472 * @param m 1473 * The attached packet mbuf. 
1474 */ 1475 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1476 { 1477 const struct rte_mempool *mp = m->pool; 1478 void *buf = RTE_MBUF_TO_BADDR(m); 1479 uint32_t buf_ofs; 1480 uint32_t buf_len = mp->elt_size - sizeof(*m); 1481 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1482 1483 m->buf_addr = buf; 1484 m->buf_len = (uint16_t)buf_len; 1485 1486 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1487 RTE_PKTMBUF_HEADROOM : m->buf_len; 1488 m->data_off = buf_ofs; 1489 1490 m->data_len = 0; 1491 } 1492 1493 /* 1494 * This function is called after packets have been transimited. It fetchs mbuf 1495 * from vpool->pool, detached it and put into vpool->ring. It also update the 1496 * used index and kick the guest if necessary. 1497 */ 1498 static inline uint32_t __attribute__((always_inline)) 1499 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1500 { 1501 struct rte_mbuf *mbuf; 1502 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1503 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1504 uint32_t index = 0; 1505 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1506 1507 LOG_DEBUG(VHOST_DATA, 1508 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1509 "clean is: %d\n", 1510 dev->device_fh, mbuf_count); 1511 LOG_DEBUG(VHOST_DATA, 1512 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1513 "clean is : %d\n", 1514 dev->device_fh, rte_ring_count(vpool->ring)); 1515 1516 for (index = 0; index < mbuf_count; index++) { 1517 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1518 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1519 pktmbuf_detach_zcp(mbuf); 1520 rte_ring_sp_enqueue(vpool->ring, mbuf); 1521 1522 /* Update used index buffer information. */ 1523 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1524 vq->used->ring[used_idx].len = 0; 1525 1526 used_idx = (used_idx + 1) & (vq->size - 1); 1527 } 1528 1529 LOG_DEBUG(VHOST_DATA, 1530 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1531 "clean is: %d\n", 1532 dev->device_fh, rte_mempool_count(vpool->pool)); 1533 LOG_DEBUG(VHOST_DATA, 1534 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1535 "clean is : %d\n", 1536 dev->device_fh, rte_ring_count(vpool->ring)); 1537 LOG_DEBUG(VHOST_DATA, 1538 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1539 "vq->last_used_idx:%d\n", 1540 dev->device_fh, vq->last_used_idx); 1541 1542 vq->last_used_idx += mbuf_count; 1543 1544 LOG_DEBUG(VHOST_DATA, 1545 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1546 "vq->last_used_idx:%d\n", 1547 dev->device_fh, vq->last_used_idx); 1548 1549 rte_compiler_barrier(); 1550 1551 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1552 1553 /* Kick guest if required. */ 1554 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1555 eventfd_write((int)vq->kickfd, 1); 1556 1557 return 0; 1558 } 1559 1560 /* 1561 * This function is called when a virtio device is destroy. 1562 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1563 */ 1564 static void mbuf_destroy_zcp(struct vpool *vpool) 1565 { 1566 struct rte_mbuf *mbuf = NULL; 1567 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1568 1569 LOG_DEBUG(VHOST_CONFIG, 1570 "in mbuf_destroy_zcp: mbuf count in mempool before " 1571 "mbuf_destroy_zcp is: %d\n", 1572 mbuf_count); 1573 LOG_DEBUG(VHOST_CONFIG, 1574 "in mbuf_destroy_zcp: mbuf count in ring before " 1575 "mbuf_destroy_zcp is : %d\n", 1576 rte_ring_count(vpool->ring)); 1577 1578 for (index = 0; index < mbuf_count; index++) { 1579 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1580 if (likely(mbuf != NULL)) { 1581 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1582 pktmbuf_detach_zcp(mbuf); 1583 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1584 } 1585 } 1586 1587 LOG_DEBUG(VHOST_CONFIG, 1588 "in mbuf_destroy_zcp: mbuf count in mempool after " 1589 "mbuf_destroy_zcp is: %d\n", 1590 rte_mempool_count(vpool->pool)); 1591 LOG_DEBUG(VHOST_CONFIG, 1592 "in mbuf_destroy_zcp: mbuf count in ring after " 1593 "mbuf_destroy_zcp is : %d\n", 1594 rte_ring_count(vpool->ring)); 1595 } 1596 1597 /* 1598 * This function update the use flag and counter. 1599 */ 1600 static inline uint32_t __attribute__((always_inline)) 1601 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1602 uint32_t count) 1603 { 1604 struct vhost_virtqueue *vq; 1605 struct vring_desc *desc; 1606 struct rte_mbuf *buff; 1607 /* The virtio_hdr is initialised to 0. */ 1608 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1609 = {{0, 0, 0, 0, 0, 0}, 0}; 1610 uint64_t buff_hdr_addr = 0; 1611 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1612 uint32_t head_idx, packet_success = 0; 1613 uint16_t res_cur_idx; 1614 1615 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1616 1617 if (count == 0) 1618 return 0; 1619 1620 vq = dev->virtqueue[VIRTIO_RXQ]; 1621 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1622 1623 res_cur_idx = vq->last_used_idx; 1624 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1625 dev->device_fh, res_cur_idx, res_cur_idx + count); 1626 1627 /* Retrieve all of the head indexes first to avoid caching issues. */ 1628 for (head_idx = 0; head_idx < count; head_idx++) 1629 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1630 1631 /*Prefetch descriptor index. */ 1632 rte_prefetch0(&vq->desc[head[packet_success]]); 1633 1634 while (packet_success != count) { 1635 /* Get descriptor from available ring */ 1636 desc = &vq->desc[head[packet_success]]; 1637 1638 buff = pkts[packet_success]; 1639 LOG_DEBUG(VHOST_DATA, 1640 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1641 "pkt[%d] descriptor idx: %d\n", 1642 dev->device_fh, packet_success, 1643 MBUF_HEADROOM_UINT32(buff)); 1644 1645 PRINT_PACKET(dev, 1646 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1647 + RTE_PKTMBUF_HEADROOM), 1648 rte_pktmbuf_data_len(buff), 0); 1649 1650 /* Buffer address translation for virtio header. */ 1651 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1652 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1653 1654 /* 1655 * If the descriptors are chained the header and data are 1656 * placed in separate buffers. 
1657 */ 1658 if (desc->flags & VRING_DESC_F_NEXT) { 1659 desc->len = vq->vhost_hlen; 1660 desc = &vq->desc[desc->next]; 1661 desc->len = rte_pktmbuf_data_len(buff); 1662 } else { 1663 desc->len = packet_len; 1664 } 1665 1666 /* Update used ring with desc information */ 1667 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1668 = head[packet_success]; 1669 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1670 = packet_len; 1671 res_cur_idx++; 1672 packet_success++; 1673 1674 /* A header is required per buffer. */ 1675 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1676 (const void *)&virtio_hdr, vq->vhost_hlen); 1677 1678 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1679 1680 if (likely(packet_success < count)) { 1681 /* Prefetch descriptor index. */ 1682 rte_prefetch0(&vq->desc[head[packet_success]]); 1683 } 1684 } 1685 1686 rte_compiler_barrier(); 1687 1688 LOG_DEBUG(VHOST_DATA, 1689 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1690 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1691 dev->device_fh, vq->last_used_idx, vq->used->idx); 1692 1693 *(volatile uint16_t *)&vq->used->idx += count; 1694 vq->last_used_idx += count; 1695 1696 LOG_DEBUG(VHOST_DATA, 1697 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1698 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1699 dev->device_fh, vq->last_used_idx, vq->used->idx); 1700 1701 /* Kick the guest if necessary. */ 1702 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1703 eventfd_write((int)vq->kickfd, 1); 1704 1705 return count; 1706 } 1707 1708 /* 1709 * This function routes the TX packet to the correct interface. 1710 * This may be a local device or the physical port. 1711 */ 1712 static inline void __attribute__((always_inline)) 1713 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1714 uint32_t desc_idx, uint8_t need_copy) 1715 { 1716 struct mbuf_table *tx_q; 1717 struct rte_mbuf **m_table; 1718 struct rte_mbuf *mbuf = NULL; 1719 unsigned len, ret, offset = 0; 1720 struct vpool *vpool; 1721 struct virtio_net_data_ll *dev_ll = ll_root_used; 1722 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1723 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1724 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1725 1726 /*Add packet to the port tx queue*/ 1727 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1728 len = tx_q->len; 1729 1730 /* Allocate an mbuf and populate the structure. */ 1731 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1732 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1733 if (unlikely(mbuf == NULL)) { 1734 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1735 RTE_LOG(ERR, VHOST_DATA, 1736 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1737 dev->device_fh); 1738 put_desc_to_used_list_zcp(vq, desc_idx); 1739 return; 1740 } 1741 1742 if (vm2vm_mode == VM2VM_HARDWARE) { 1743 /* Avoid using a vlan tag from any vm for external pkt, such as 1744 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1745 * selection, MAC address determines it as an external pkt 1746 * which should go to network, while vlan tag determine it as 1747 * a vm2vm pkt should forward to another vm. Hardware confuse 1748 * such a ambiguous situation, so pkt will lost. 
1749 */ 1750 vlan_tag = external_pkt_default_vlan_tag; 1751 while (dev_ll != NULL) { 1752 if (likely(dev_ll->vdev->ready == DEVICE_RX) && 1753 ether_addr_cmp(&(pkt_hdr->d_addr), 1754 &dev_ll->vdev->mac_address)) { 1755 1756 /* 1757 * Drop the packet if the TX packet is destined 1758 * for the TX device. 1759 */ 1760 if (unlikely(dev_ll->vdev->dev->device_fh 1761 == dev->device_fh)) { 1762 LOG_DEBUG(VHOST_DATA, 1763 "(%"PRIu64") TX: Source and destination" 1764 "MAC addresses are the same. Dropping " 1765 "packet.\n", 1766 dev_ll->vdev->dev->device_fh); 1767 MBUF_HEADROOM_UINT32(mbuf) 1768 = (uint32_t)desc_idx; 1769 __rte_mbuf_raw_free(mbuf); 1770 return; 1771 } 1772 1773 /* 1774 * Packet length offset 4 bytes for HW vlan 1775 * strip when L2 switch back. 1776 */ 1777 offset = 4; 1778 vlan_tag = 1779 (uint16_t) 1780 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1781 1782 LOG_DEBUG(VHOST_DATA, 1783 "(%"PRIu64") TX: pkt to local VM device id:" 1784 "(%"PRIu64") vlan tag: %d.\n", 1785 dev->device_fh, dev_ll->vdev->dev->device_fh, 1786 vlan_tag); 1787 1788 break; 1789 } 1790 dev_ll = dev_ll->next; 1791 } 1792 } 1793 1794 mbuf->nb_segs = m->nb_segs; 1795 mbuf->next = m->next; 1796 mbuf->data_len = m->data_len + offset; 1797 mbuf->pkt_len = mbuf->data_len; 1798 if (unlikely(need_copy)) { 1799 /* Copy the packet contents to the mbuf. */ 1800 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1801 rte_pktmbuf_mtod(m, void *), 1802 m->data_len); 1803 } else { 1804 mbuf->data_off = m->data_off; 1805 mbuf->buf_physaddr = m->buf_physaddr; 1806 mbuf->buf_addr = m->buf_addr; 1807 } 1808 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1809 mbuf->vlan_tci = vlan_tag; 1810 mbuf->l2_len = sizeof(struct ether_hdr); 1811 mbuf->l3_len = sizeof(struct ipv4_hdr); 1812 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1813 1814 tx_q->m_table[len] = mbuf; 1815 len++; 1816 1817 LOG_DEBUG(VHOST_DATA, 1818 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1819 dev->device_fh, 1820 mbuf->nb_segs, 1821 (mbuf->next == NULL) ? "null" : "non-null"); 1822 1823 if (enable_stats) { 1824 dev_statistics[dev->device_fh].tx_total++; 1825 dev_statistics[dev->device_fh].tx++; 1826 } 1827 1828 if (unlikely(len == MAX_PKT_BURST)) { 1829 m_table = (struct rte_mbuf **)tx_q->m_table; 1830 ret = rte_eth_tx_burst(ports[0], 1831 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1832 1833 /* 1834 * Free any buffers not handled by TX and update 1835 * the port stats. 1836 */ 1837 if (unlikely(ret < len)) { 1838 do { 1839 rte_pktmbuf_free(m_table[ret]); 1840 } while (++ret < len); 1841 } 1842 1843 len = 0; 1844 txmbuf_clean_zcp(dev, vpool); 1845 } 1846 1847 tx_q->len = len; 1848 1849 return; 1850 } 1851 1852 /* 1853 * This function TX all available packets in virtio TX queue for one 1854 * virtio-net device. If it is first packet, it learns MAC address and 1855 * setup VMDQ. 1856 */ 1857 static inline void __attribute__((always_inline)) 1858 virtio_dev_tx_zcp(struct virtio_net *dev) 1859 { 1860 struct rte_mbuf m; 1861 struct vhost_virtqueue *vq; 1862 struct vring_desc *desc; 1863 uint64_t buff_addr = 0, phys_addr; 1864 uint32_t head[MAX_PKT_BURST]; 1865 uint32_t i; 1866 uint16_t free_entries, packet_success = 0; 1867 uint16_t avail_idx; 1868 uint8_t need_copy = 0; 1869 hpa_type addr_type; 1870 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1871 1872 vq = dev->virtqueue[VIRTIO_TXQ]; 1873 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1874 1875 /* If there are no available buffers then return. 
*/ 1876 if (vq->last_used_idx_res == avail_idx) 1877 return; 1878 1879 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1880 1881 /* Prefetch available ring to retrieve head indexes. */ 1882 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1883 1884 /* Get the number of free entries in the ring */ 1885 free_entries = (avail_idx - vq->last_used_idx_res); 1886 1887 /* Limit to MAX_PKT_BURST. */ 1888 free_entries 1889 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1890 1891 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1892 dev->device_fh, free_entries); 1893 1894 /* Retrieve all of the head indexes first to avoid caching issues. */ 1895 for (i = 0; i < free_entries; i++) 1896 head[i] 1897 = vq->avail->ring[(vq->last_used_idx_res + i) 1898 & (vq->size - 1)]; 1899 1900 vq->last_used_idx_res += free_entries; 1901 1902 /* Prefetch descriptor index. */ 1903 rte_prefetch0(&vq->desc[head[packet_success]]); 1904 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1905 1906 while (packet_success < free_entries) { 1907 desc = &vq->desc[head[packet_success]]; 1908 1909 /* Discard first buffer as it is the virtio header */ 1910 desc = &vq->desc[desc->next]; 1911 1912 /* Buffer address translation. */ 1913 buff_addr = gpa_to_vva(dev, desc->addr); 1914 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1915 1916 if (likely(packet_success < (free_entries - 1))) 1917 /* Prefetch descriptor index. */ 1918 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1919 1920 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1921 RTE_LOG(ERR, VHOST_DATA, 1922 "(%"PRIu64") Invalid frame buffer address found" 1923 "when TX packets!\n", 1924 dev->device_fh); 1925 packet_success++; 1926 continue; 1927 } 1928 1929 /* Prefetch buffer address. */ 1930 rte_prefetch0((void *)(uintptr_t)buff_addr); 1931 1932 /* 1933 * Setup dummy mbuf. This is copied to a real mbuf if 1934 * transmitted out the physical port. 1935 */ 1936 m.data_len = desc->len; 1937 m.nb_segs = 1; 1938 m.next = NULL; 1939 m.data_off = 0; 1940 m.buf_addr = (void *)(uintptr_t)buff_addr; 1941 m.buf_physaddr = phys_addr; 1942 1943 /* 1944 * Check if the frame buffer address from guest crosses 1945 * sub-region or not. 1946 */ 1947 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1948 RTE_LOG(ERR, VHOST_DATA, 1949 "(%"PRIu64") Frame buffer address cross " 1950 "sub-regioin found when attaching TX frame " 1951 "buffer address!\n", 1952 dev->device_fh); 1953 need_copy = 1; 1954 } else 1955 need_copy = 0; 1956 1957 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1958 1959 /* 1960 * If this is the first received packet we need to learn 1961 * the MAC and setup VMDQ 1962 */ 1963 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1964 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1965 /* 1966 * Discard frame if device is scheduled for 1967 * removal or a duplicate MAC address is found. 1968 */ 1969 packet_success += free_entries; 1970 vq->last_used_idx += packet_success; 1971 break; 1972 } 1973 } 1974 1975 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1976 packet_success++; 1977 } 1978 } 1979 1980 /* 1981 * This function is called by each data core. It handles all RX/TX registered 1982 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1983 * addresses are compared with all devices in the main linked list. 
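 * Each iteration: drain the per-queue TX table roughly every
 * BURST_TX_DRAIN_US, acknowledge any pending device-removal request,
 * then, for every ready device, attach guest RX buffers, receive a burst
 * from the physical port into them and service the guest TX queue.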
1984 */ 1985 static int 1986 switch_worker_zcp(__attribute__((unused)) void *arg) 1987 { 1988 struct virtio_net *dev = NULL; 1989 struct vhost_dev *vdev = NULL; 1990 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1991 struct virtio_net_data_ll *dev_ll; 1992 struct mbuf_table *tx_q; 1993 volatile struct lcore_ll_info *lcore_ll; 1994 const uint64_t drain_tsc 1995 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 1996 * BURST_TX_DRAIN_US; 1997 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1998 unsigned ret; 1999 const uint16_t lcore_id = rte_lcore_id(); 2000 uint16_t count_in_ring, rx_count = 0; 2001 2002 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2003 2004 lcore_ll = lcore_info[lcore_id].lcore_ll; 2005 prev_tsc = 0; 2006 2007 while (1) { 2008 cur_tsc = rte_rdtsc(); 2009 2010 /* TX burst queue drain */ 2011 diff_tsc = cur_tsc - prev_tsc; 2012 if (unlikely(diff_tsc > drain_tsc)) { 2013 /* 2014 * Get mbuf from vpool.pool and detach mbuf and 2015 * put back into vpool.ring. 2016 */ 2017 dev_ll = lcore_ll->ll_root_used; 2018 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2019 /* Get virtio device ID */ 2020 vdev = dev_ll->vdev; 2021 dev = vdev->dev; 2022 2023 if (likely(!vdev->remove)) { 2024 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2025 if (tx_q->len) { 2026 LOG_DEBUG(VHOST_DATA, 2027 "TX queue drained after timeout" 2028 " with burst size %u\n", 2029 tx_q->len); 2030 2031 /* 2032 * Tx any packets in the queue 2033 */ 2034 ret = rte_eth_tx_burst( 2035 ports[0], 2036 (uint16_t)tx_q->txq_id, 2037 (struct rte_mbuf **) 2038 tx_q->m_table, 2039 (uint16_t)tx_q->len); 2040 if (unlikely(ret < tx_q->len)) { 2041 do { 2042 rte_pktmbuf_free( 2043 tx_q->m_table[ret]); 2044 } while (++ret < tx_q->len); 2045 } 2046 tx_q->len = 0; 2047 2048 txmbuf_clean_zcp(dev, 2049 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2050 } 2051 } 2052 dev_ll = dev_ll->next; 2053 } 2054 prev_tsc = cur_tsc; 2055 } 2056 2057 rte_prefetch0(lcore_ll->ll_root_used); 2058 2059 /* 2060 * Inform the configuration core that we have exited the linked 2061 * list and that no devices are in use if requested. 2062 */ 2063 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2064 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2065 2066 /* Process devices */ 2067 dev_ll = lcore_ll->ll_root_used; 2068 2069 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2070 vdev = dev_ll->vdev; 2071 dev = vdev->dev; 2072 if (unlikely(vdev->remove)) { 2073 dev_ll = dev_ll->next; 2074 unlink_vmdq(vdev); 2075 vdev->ready = DEVICE_SAFE_REMOVE; 2076 continue; 2077 } 2078 2079 if (likely(vdev->ready == DEVICE_RX)) { 2080 uint32_t index = vdev->vmdq_rx_q; 2081 uint16_t i; 2082 count_in_ring 2083 = rte_ring_count(vpool_array[index].ring); 2084 uint16_t free_entries 2085 = (uint16_t)get_available_ring_num_zcp(dev); 2086 2087 /* 2088 * Attach all mbufs in vpool.ring and put back 2089 * into vpool.pool. 
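 * The number attached per iteration is bounded by the free entries
 * advertised by the guest, the mbufs currently in the ring and
 * MAX_PKT_BURST (see the nested RTE_MIN() below).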
2090 */ 2091 for (i = 0; 2092 i < RTE_MIN(free_entries, 2093 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2094 i++) 2095 attach_rxmbuf_zcp(dev); 2096 2097 /* Handle guest RX */ 2098 rx_count = rte_eth_rx_burst(ports[0], 2099 vdev->vmdq_rx_q, pkts_burst, 2100 MAX_PKT_BURST); 2101 2102 if (rx_count) { 2103 ret_count = virtio_dev_rx_zcp(dev, 2104 pkts_burst, rx_count); 2105 if (enable_stats) { 2106 dev_statistics[dev->device_fh].rx_total 2107 += rx_count; 2108 dev_statistics[dev->device_fh].rx 2109 += ret_count; 2110 } 2111 while (likely(rx_count)) { 2112 rx_count--; 2113 pktmbuf_detach_zcp( 2114 pkts_burst[rx_count]); 2115 rte_ring_sp_enqueue( 2116 vpool_array[index].ring, 2117 (void *)pkts_burst[rx_count]); 2118 } 2119 } 2120 } 2121 2122 if (likely(!vdev->remove)) 2123 /* Handle guest TX */ 2124 virtio_dev_tx_zcp(dev); 2125 2126 /* Move to the next device in the list */ 2127 dev_ll = dev_ll->next; 2128 } 2129 } 2130 2131 return 0; 2132 } 2133 2134 2135 /* 2136 * Add an entry to a used linked list. A free entry must first be found 2137 * in the free linked list using get_data_ll_free_entry(); 2138 */ 2139 static void 2140 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2141 struct virtio_net_data_ll *ll_dev) 2142 { 2143 struct virtio_net_data_ll *ll = *ll_root_addr; 2144 2145 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2146 ll_dev->next = NULL; 2147 rte_compiler_barrier(); 2148 2149 /* If ll == NULL then this is the first device. */ 2150 if (ll) { 2151 /* Increment to the tail of the linked list. */ 2152 while ((ll->next != NULL) ) 2153 ll = ll->next; 2154 2155 ll->next = ll_dev; 2156 } else { 2157 *ll_root_addr = ll_dev; 2158 } 2159 } 2160 2161 /* 2162 * Remove an entry from a used linked list. The entry must then be added to 2163 * the free linked list using put_data_ll_free_entry(). 2164 */ 2165 static void 2166 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2167 struct virtio_net_data_ll *ll_dev, 2168 struct virtio_net_data_ll *ll_dev_last) 2169 { 2170 struct virtio_net_data_ll *ll = *ll_root_addr; 2171 2172 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2173 return; 2174 2175 if (ll_dev == ll) 2176 *ll_root_addr = ll_dev->next; 2177 else 2178 if (likely(ll_dev_last != NULL)) 2179 ll_dev_last->next = ll_dev->next; 2180 else 2181 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2182 } 2183 2184 /* 2185 * Find and return an entry from the free linked list. 2186 */ 2187 static struct virtio_net_data_ll * 2188 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2189 { 2190 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2191 struct virtio_net_data_ll *ll_dev; 2192 2193 if (ll_free == NULL) 2194 return NULL; 2195 2196 ll_dev = ll_free; 2197 *ll_root_addr = ll_free->next; 2198 2199 return ll_dev; 2200 } 2201 2202 /* 2203 * Place an entry back on to the free linked list. 2204 */ 2205 static void 2206 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2207 struct virtio_net_data_ll *ll_dev) 2208 { 2209 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2210 2211 if (ll_dev == NULL) 2212 return; 2213 2214 ll_dev->next = ll_free; 2215 *ll_root_addr = ll_dev; 2216 } 2217 2218 /* 2219 * Creates a linked list of a given size. 2220 */ 2221 static struct virtio_net_data_ll * 2222 alloc_data_ll(uint32_t size) 2223 { 2224 struct virtio_net_data_ll *ll_new; 2225 uint32_t i; 2226 2227 /* Malloc and then chain the linked list. 
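 * The returned array is pre-chained through .next so it can be used
 * directly as a free list by get_data_ll_free_entry().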
*/ 2228 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2229 if (ll_new == NULL) { 2230 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2231 return NULL; 2232 } 2233 2234 for (i = 0; i < size - 1; i++) { 2235 ll_new[i].vdev = NULL; 2236 ll_new[i].next = &ll_new[i+1]; 2237 } 2238 ll_new[i].next = NULL; 2239 2240 return (ll_new); 2241 } 2242 2243 /* 2244 * Create the main linked list along with each individual core's linked list. A used list and a 2245 * free list are created to manage entries. 2246 */ 2247 static int 2248 init_data_ll (void) 2249 { 2250 int lcore; 2251 2252 RTE_LCORE_FOREACH_SLAVE(lcore) { 2253 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2254 if (lcore_info[lcore].lcore_ll == NULL) { 2255 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2256 return -1; 2257 } 2258 2259 lcore_info[lcore].lcore_ll->device_num = 0; 2260 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2261 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2262 if (num_devices % num_switching_cores) 2263 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2264 else 2265 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2266 } 2267 2268 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2269 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2270 2271 return 0; 2272 } 2273 2274 /* 2275 * Set virtqueue flags so that we do not receive interrupts. 2276 */ 2277 static void 2278 set_irq_status (struct virtio_net *dev) 2279 { 2280 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2281 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2282 } 2283 2284 /* 2285 * Remove a device from the specific data core linked list and from the main linked list. Synchronization 2286 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering 2287 * of dev->remove=1, which could cause an infinite loop in the rte_pause loop. 2288 */ 2289 static void 2290 destroy_device (volatile struct virtio_net *dev) 2291 { 2292 struct virtio_net_data_ll *ll_lcore_dev_cur; 2293 struct virtio_net_data_ll *ll_main_dev_cur; 2294 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2295 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2296 struct vhost_dev *vdev; 2297 int lcore; 2298 2299 dev->flags &= ~VIRTIO_DEV_RUNNING; 2300 2301 vdev = (struct vhost_dev *)dev->priv; 2302 /* Set the remove flag.
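 * The data core notices remove == 1 in switch_worker_zcp(), unlinks the
 * device from VMDq and sets ready to DEVICE_SAFE_REMOVE, which releases
 * the rte_pause() wait below.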
*/ 2303 vdev->remove = 1; 2304 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2305 rte_pause(); 2306 } 2307 2308 /* Search for entry to be removed from lcore ll */ 2309 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2310 while (ll_lcore_dev_cur != NULL) { 2311 if (ll_lcore_dev_cur->vdev == vdev) { 2312 break; 2313 } else { 2314 ll_lcore_dev_last = ll_lcore_dev_cur; 2315 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2316 } 2317 } 2318 2319 if (ll_lcore_dev_cur == NULL) { 2320 RTE_LOG(ERR, VHOST_CONFIG, 2321 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2322 dev->device_fh); 2323 return; 2324 } 2325 2326 /* Search for entry to be removed from main ll */ 2327 ll_main_dev_cur = ll_root_used; 2328 ll_main_dev_last = NULL; 2329 while (ll_main_dev_cur != NULL) { 2330 if (ll_main_dev_cur->vdev == vdev) { 2331 break; 2332 } else { 2333 ll_main_dev_last = ll_main_dev_cur; 2334 ll_main_dev_cur = ll_main_dev_cur->next; 2335 } 2336 } 2337 2338 /* Remove entries from the lcore and main ll. */ 2339 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2340 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2341 2342 /* Set the dev_removal_flag on each lcore. */ 2343 RTE_LCORE_FOREACH_SLAVE(lcore) { 2344 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2345 } 2346 2347 /* 2348 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2349 * they can no longer access the device removed from the linked lists and that the devices 2350 * are no longer in use. 2351 */ 2352 RTE_LCORE_FOREACH_SLAVE(lcore) { 2353 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2354 rte_pause(); 2355 } 2356 } 2357 2358 /* Add the entries back to the lcore and main free ll.*/ 2359 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2360 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2361 2362 /* Decrement number of device on the lcore. */ 2363 lcore_info[vdev->coreid].lcore_ll->device_num--; 2364 2365 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2366 2367 if (zero_copy) { 2368 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2369 2370 /* Stop the RX queue. */ 2371 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2372 LOG_DEBUG(VHOST_CONFIG, 2373 "(%"PRIu64") In destroy_device: Failed to stop " 2374 "rx queue:%d\n", 2375 dev->device_fh, 2376 vdev->vmdq_rx_q); 2377 } 2378 2379 LOG_DEBUG(VHOST_CONFIG, 2380 "(%"PRIu64") in destroy_device: Start put mbuf in " 2381 "mempool back to ring for RX queue: %d\n", 2382 dev->device_fh, vdev->vmdq_rx_q); 2383 2384 mbuf_destroy_zcp(vpool); 2385 2386 /* Stop the TX queue. 
*/ 2387 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2388 LOG_DEBUG(VHOST_CONFIG, 2389 "(%"PRIu64") In destroy_device: Failed to " 2390 "stop tx queue:%d\n", 2391 dev->device_fh, vdev->vmdq_rx_q); 2392 } 2393 2394 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2395 2396 LOG_DEBUG(VHOST_CONFIG, 2397 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2398 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2399 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2400 dev->device_fh); 2401 2402 mbuf_destroy_zcp(vpool); 2403 rte_free(vdev->regions_hpa); 2404 } 2405 rte_free(vdev); 2406 2407 } 2408 2409 /* 2410 * Calculate the number of physically contiguous sub-regions within one 2411 * region whose vhost virtual address range is contiguous. The region 2412 * starts at vva_start and is 'size' bytes long. 2413 */ 2414 static uint32_t 2415 check_hpa_regions(uint64_t vva_start, uint64_t size) 2416 { 2417 uint32_t i, nregions = 0, page_size = getpagesize(); 2418 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2419 if (vva_start % page_size) { 2420 LOG_DEBUG(VHOST_CONFIG, 2421 "in check_hpa_regions: vva start(%p) mod page_size(%d) " 2422 "has remainder\n", 2423 (void *)(uintptr_t)vva_start, page_size); 2424 return 0; 2425 } 2426 if (size % page_size) { 2427 LOG_DEBUG(VHOST_CONFIG, 2428 "in check_hpa_regions: " 2429 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2430 size, page_size); 2431 return 0; 2432 } 2433 for (i = 0; i < size - page_size; i = i + page_size) { 2434 cur_phys_addr 2435 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2436 next_phys_addr = rte_mem_virt2phy( 2437 (void *)(uintptr_t)(vva_start + i + page_size)); 2438 if ((cur_phys_addr + page_size) != next_phys_addr) { 2439 ++nregions; 2440 LOG_DEBUG(VHOST_CONFIG, 2441 "in check_hpa_regions: hva addr:(%p) is not " 2442 "contiguous with hva addr:(%p), diff:%d\n", 2443 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2444 (void *)(uintptr_t)(vva_start + (uint64_t)i 2445 + page_size), page_size); 2446 LOG_DEBUG(VHOST_CONFIG, 2447 "in check_hpa_regions: hpa addr:(%p) is not " 2448 "contiguous with hpa addr:(%p), " 2449 "diff:(%"PRIu64")\n", 2450 (void *)(uintptr_t)cur_phys_addr, 2451 (void *)(uintptr_t)next_phys_addr, 2452 (next_phys_addr-cur_phys_addr)); 2453 } 2454 } 2455 return nregions; 2456 } 2457 2458 /* 2459 * Divide each region whose vhost virtual address range is contiguous into 2460 * sub-regions such that the physical addresses within each sub-region are 2461 * contiguous, and fill the offset (to GPA), size and other information of 2462 * each sub-region into regions_hpa.
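 * For example, a 16 KB guest region whose 4 KB pages map to host
 * physical pages P, P+4K, Q, Q+4K is recorded as two 8 KB sub-regions,
 * each carrying its own host-physical-address offset.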
2463 */ 2464 static uint32_t 2465 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2466 { 2467 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2468 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2469 2470 if (mem_region_hpa == NULL) 2471 return 0; 2472 2473 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2474 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2475 virtio_memory->regions[regionidx].address_offset; 2476 mem_region_hpa[regionidx_hpa].guest_phys_address 2477 = virtio_memory->regions[regionidx].guest_phys_address; 2478 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2479 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2480 mem_region_hpa[regionidx_hpa].guest_phys_address; 2481 LOG_DEBUG(VHOST_CONFIG, 2482 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2483 regionidx_hpa, 2484 (void *)(uintptr_t) 2485 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2486 LOG_DEBUG(VHOST_CONFIG, 2487 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2488 regionidx_hpa, 2489 (void *)(uintptr_t) 2490 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2491 for (i = 0, k = 0; 2492 i < virtio_memory->regions[regionidx].memory_size - 2493 page_size; 2494 i += page_size) { 2495 cur_phys_addr = rte_mem_virt2phy( 2496 (void *)(uintptr_t)(vva_start + i)); 2497 next_phys_addr = rte_mem_virt2phy( 2498 (void *)(uintptr_t)(vva_start + 2499 i + page_size)); 2500 if ((cur_phys_addr + page_size) != next_phys_addr) { 2501 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2502 mem_region_hpa[regionidx_hpa].guest_phys_address + 2503 k + page_size; 2504 mem_region_hpa[regionidx_hpa].memory_size 2505 = k + page_size; 2506 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2507 "phys addr end [%d]:(%p)\n", 2508 regionidx_hpa, 2509 (void *)(uintptr_t) 2510 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2511 LOG_DEBUG(VHOST_CONFIG, 2512 "in fill_hpa_regions: guest phys addr " 2513 "size [%d]:(%p)\n", 2514 regionidx_hpa, 2515 (void *)(uintptr_t) 2516 (mem_region_hpa[regionidx_hpa].memory_size)); 2517 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2518 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2519 ++regionidx_hpa; 2520 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2521 next_phys_addr - 2522 mem_region_hpa[regionidx_hpa].guest_phys_address; 2523 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2524 " phys addr start[%d]:(%p)\n", 2525 regionidx_hpa, 2526 (void *)(uintptr_t) 2527 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2528 LOG_DEBUG(VHOST_CONFIG, 2529 "in fill_hpa_regions: host phys addr " 2530 "start[%d]:(%p)\n", 2531 regionidx_hpa, 2532 (void *)(uintptr_t) 2533 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2534 k = 0; 2535 } else { 2536 k += page_size; 2537 } 2538 } 2539 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2540 = mem_region_hpa[regionidx_hpa].guest_phys_address 2541 + k + page_size; 2542 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2543 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2544 "[%d]:(%p)\n", regionidx_hpa, 2545 (void *)(uintptr_t) 2546 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2547 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2548 "[%d]:(%p)\n", regionidx_hpa, 2549 (void *)(uintptr_t) 2550 (mem_region_hpa[regionidx_hpa].memory_size)); 2551 ++regionidx_hpa; 2552 } 2553 return 
regionidx_hpa; 2554 } 2555 2556 /* 2557 * A new device is added to a data core. First the device is added to the main linked list 2558 * and then allocated to a specific data core. 2559 */ 2560 static int 2561 new_device (struct virtio_net *dev) 2562 { 2563 struct virtio_net_data_ll *ll_dev; 2564 int lcore, core_add = 0; 2565 uint32_t device_num_min = num_devices; 2566 struct vhost_dev *vdev; 2567 uint32_t regionidx; 2568 2569 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2570 if (vdev == NULL) { 2571 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2572 dev->device_fh); 2573 return -1; 2574 } 2575 vdev->dev = dev; 2576 dev->priv = vdev; 2577 2578 if (zero_copy) { 2579 vdev->nregions_hpa = dev->mem->nregions; 2580 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2581 vdev->nregions_hpa 2582 += check_hpa_regions( 2583 dev->mem->regions[regionidx].guest_phys_address 2584 + dev->mem->regions[regionidx].address_offset, 2585 dev->mem->regions[regionidx].memory_size); 2586 2587 } 2588 2589 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2590 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2591 CACHE_LINE_SIZE); 2592 if (vdev->regions_hpa == NULL) { 2593 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2594 rte_free(vdev); 2595 return -1; 2596 } 2597 2598 2599 if (fill_hpa_memory_regions( 2600 vdev->regions_hpa, dev->mem 2601 ) != vdev->nregions_hpa) { 2602 2603 RTE_LOG(ERR, VHOST_CONFIG, 2604 "hpa memory regions number mismatch: " 2605 "[%d]\n", vdev->nregions_hpa); 2606 rte_free(vdev->regions_hpa); 2607 rte_free(vdev); 2608 return -1; 2609 } 2610 } 2611 2612 2613 /* Add device to main ll */ 2614 ll_dev = get_data_ll_free_entry(&ll_root_free); 2615 if (ll_dev == NULL) { 2616 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2617 "of %d devices per core has been reached\n", 2618 dev->device_fh, num_devices); 2619 if (vdev->regions_hpa) 2620 rte_free(vdev->regions_hpa); 2621 rte_free(vdev); 2622 return -1; 2623 } 2624 ll_dev->vdev = vdev; 2625 add_data_ll_entry(&ll_root_used, ll_dev); 2626 vdev->vmdq_rx_q 2627 = dev->device_fh * (num_queues / num_devices); 2628 2629 if (zero_copy) { 2630 uint32_t index = vdev->vmdq_rx_q; 2631 uint32_t count_in_ring, i; 2632 struct mbuf_table *tx_q; 2633 2634 count_in_ring = rte_ring_count(vpool_array[index].ring); 2635 2636 LOG_DEBUG(VHOST_CONFIG, 2637 "(%"PRIu64") in new_device: mbuf count in mempool " 2638 "before attach is: %d\n", 2639 dev->device_fh, 2640 rte_mempool_count(vpool_array[index].pool)); 2641 LOG_DEBUG(VHOST_CONFIG, 2642 "(%"PRIu64") in new_device: mbuf count in ring " 2643 "before attach is : %d\n", 2644 dev->device_fh, count_in_ring); 2645 2646 /* 2647 * Attach all mbufs in vpool.ring and put back into vpool.pool.
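 * At device start-up every free mbuf in this queue's ring is attached
 * to a guest RX buffer, so once the queue is started the NIC can DMA
 * received frames directly into guest memory.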
2648 */ 2649 for (i = 0; i < count_in_ring; i++) 2650 attach_rxmbuf_zcp(dev); 2651 2652 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2653 "mempool after attach is: %d\n", 2654 dev->device_fh, 2655 rte_mempool_count(vpool_array[index].pool)); 2656 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2657 "ring after attach is : %d\n", 2658 dev->device_fh, 2659 rte_ring_count(vpool_array[index].ring)); 2660 2661 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2662 tx_q->txq_id = vdev->vmdq_rx_q; 2663 2664 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2665 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2666 2667 LOG_DEBUG(VHOST_CONFIG, 2668 "(%"PRIu64") In new_device: Failed to start " 2669 "tx queue:%d\n", 2670 dev->device_fh, vdev->vmdq_rx_q); 2671 2672 mbuf_destroy_zcp(vpool); 2673 rte_free(vdev->regions_hpa); 2674 rte_free(vdev); 2675 return -1; 2676 } 2677 2678 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2679 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2680 2681 LOG_DEBUG(VHOST_CONFIG, 2682 "(%"PRIu64") In new_device: Failed to start " 2683 "rx queue:%d\n", 2684 dev->device_fh, vdev->vmdq_rx_q); 2685 2686 /* Stop the TX queue. */ 2687 if (rte_eth_dev_tx_queue_stop(ports[0], 2688 vdev->vmdq_rx_q) != 0) { 2689 LOG_DEBUG(VHOST_CONFIG, 2690 "(%"PRIu64") In new_device: Failed to " 2691 "stop tx queue:%d\n", 2692 dev->device_fh, vdev->vmdq_rx_q); 2693 } 2694 2695 mbuf_destroy_zcp(vpool); 2696 rte_free(vdev->regions_hpa); 2697 rte_free(vdev); 2698 return -1; 2699 } 2700 2701 } 2702 2703 /*reset ready flag*/ 2704 vdev->ready = DEVICE_MAC_LEARNING; 2705 vdev->remove = 0; 2706 2707 /* Find a suitable lcore to add the device. */ 2708 RTE_LCORE_FOREACH_SLAVE(lcore) { 2709 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2710 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2711 core_add = lcore; 2712 } 2713 } 2714 /* Add device to lcore ll */ 2715 ll_dev->dev->coreid = core_add; 2716 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free); 2717 if (ll_dev == NULL) { 2718 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2719 vdev->ready = DEVICE_SAFE_REMOVE; 2720 destroy_device(dev); 2721 if (vdev->regions_hpa) 2722 rte_free(vdev->regions_hpa); 2723 rte_free(vdev); 2724 return -1; 2725 } 2726 ll_dev->vdev = vdev; 2727 vdev->coreid = core_add; 2728 2729 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev); 2730 2731 /* Initialize device stats */ 2732 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2733 2734 /* Disable notifications. */ 2735 set_irq_status(dev); 2736 lcore_info[vdev->coreid].lcore_ll->device_num++; 2737 dev->flags |= VIRTIO_DEV_RUNNING; 2738 2739 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2740 2741 return 0; 2742 } 2743 2744 /* 2745 * These callback allow devices to be added to the data core when configuration 2746 * has been fully complete. 2747 */ 2748 static const struct virtio_net_device_ops virtio_net_device_ops = 2749 { 2750 .new_device = new_device, 2751 .destroy_device = destroy_device, 2752 }; 2753 2754 /* 2755 * This is a thread will wake up after a period to print stats if the user has 2756 * enabled them. 
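 * It runs in its own pthread, created from MAIN when enable_stats is
 * set, and refreshes the screen every enable_stats seconds.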
2757 */ 2758 static void 2759 print_stats(void) 2760 { 2761 struct virtio_net_data_ll *dev_ll; 2762 uint64_t tx_dropped, rx_dropped; 2763 uint64_t tx, tx_total, rx, rx_total; 2764 uint32_t device_fh; 2765 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2766 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2767 2768 while(1) { 2769 sleep(enable_stats); 2770 2771 /* Clear screen and move to top left */ 2772 printf("%s%s", clr, top_left); 2773 2774 printf("\nDevice statistics ===================================="); 2775 2776 dev_ll = ll_root_used; 2777 while (dev_ll != NULL) { 2778 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2779 tx_total = dev_statistics[device_fh].tx_total; 2780 tx = dev_statistics[device_fh].tx; 2781 tx_dropped = tx_total - tx; 2782 if (zero_copy == 0) { 2783 rx_total = rte_atomic64_read( 2784 &dev_statistics[device_fh].rx_total_atomic); 2785 rx = rte_atomic64_read( 2786 &dev_statistics[device_fh].rx_atomic); 2787 } else { 2788 rx_total = dev_statistics[device_fh].rx_total; 2789 rx = dev_statistics[device_fh].rx; 2790 } 2791 rx_dropped = rx_total - rx; 2792 2793 printf("\nStatistics for device %"PRIu32" ------------------------------" 2794 "\nTX total: %"PRIu64"" 2795 "\nTX dropped: %"PRIu64"" 2796 "\nTX successful: %"PRIu64"" 2797 "\nRX total: %"PRIu64"" 2798 "\nRX dropped: %"PRIu64"" 2799 "\nRX successful: %"PRIu64"", 2800 device_fh, 2801 tx_total, 2802 tx_dropped, 2803 tx, 2804 rx_total, 2805 rx_dropped, 2806 rx); 2807 2808 dev_ll = dev_ll->next; 2809 } 2810 printf("\n======================================================\n"); 2811 } 2812 } 2813 2814 static void 2815 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2816 char *ring_name, uint32_t nb_mbuf) 2817 { 2818 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2819 vpool_array[index].pool 2820 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2821 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2822 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2823 rte_pktmbuf_init, NULL, socket, 0); 2824 if (vpool_array[index].pool != NULL) { 2825 vpool_array[index].ring 2826 = rte_ring_create(ring_name, 2827 rte_align32pow2(nb_mbuf + 1), 2828 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2829 if (likely(vpool_array[index].ring != NULL)) { 2830 LOG_DEBUG(VHOST_CONFIG, 2831 "in setup_mempool_tbl: mbuf count in " 2832 "mempool is: %d\n", 2833 rte_mempool_count(vpool_array[index].pool)); 2834 LOG_DEBUG(VHOST_CONFIG, 2835 "in setup_mempool_tbl: mbuf count in " 2836 "ring is: %d\n", 2837 rte_ring_count(vpool_array[index].ring)); 2838 } else { 2839 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2840 ring_name); 2841 } 2842 2843 /* Need consider head room. */ 2844 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2845 } else { 2846 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2847 } 2848 } 2849 2850 2851 /* 2852 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2853 * device is also registered here to handle the IOCTLs. 
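 * Order of operations: EAL init, argument parsing, mbuf pool/ring set-up
 * (per-queue pools and rings when zero copy is enabled), port init,
 * linked-list init, worker launch, then CUSE registration and session
 * start.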
2854 */ 2855 int 2856 MAIN(int argc, char *argv[]) 2857 { 2858 struct rte_mempool *mbuf_pool = NULL; 2859 unsigned lcore_id, core_id = 0; 2860 unsigned nb_ports, valid_num_ports; 2861 int ret; 2862 uint8_t portid, queue_id = 0; 2863 static pthread_t tid; 2864 2865 /* init EAL */ 2866 ret = rte_eal_init(argc, argv); 2867 if (ret < 0) 2868 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2869 argc -= ret; 2870 argv += ret; 2871 2872 /* parse app arguments */ 2873 ret = us_vhost_parse_args(argc, argv); 2874 if (ret < 0) 2875 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2876 #ifdef RTE_IXGBE_INC_VECTOR 2877 if (mergeable == 1) { 2878 rte_exit(EXIT_FAILURE, 2879 "sorry, mergeable feature doesn't work with vec sg recv, " \ 2880 "please disable it in cfg as a workaround\n"); 2881 } 2882 #endif 2883 2884 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2885 if (rte_lcore_is_enabled(lcore_id)) 2886 lcore_ids[core_id ++] = lcore_id; 2887 2888 if (rte_lcore_count() > RTE_MAX_LCORE) 2889 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2890 2891 /*set the number of swithcing cores available*/ 2892 num_switching_cores = rte_lcore_count()-1; 2893 2894 /* Get the number of physical ports. */ 2895 nb_ports = rte_eth_dev_count(); 2896 if (nb_ports > RTE_MAX_ETHPORTS) 2897 nb_ports = RTE_MAX_ETHPORTS; 2898 2899 /* 2900 * Update the global var NUM_PORTS and global array PORTS 2901 * and get value of var VALID_NUM_PORTS according to system ports number 2902 */ 2903 valid_num_ports = check_ports_num(nb_ports); 2904 2905 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2906 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2907 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2908 return -1; 2909 } 2910 2911 if (zero_copy == 0) { 2912 /* Create the mbuf pool. */ 2913 mbuf_pool = rte_mempool_create( 2914 "MBUF_POOL", 2915 NUM_MBUFS_PER_PORT 2916 * valid_num_ports, 2917 MBUF_SIZE, MBUF_CACHE_SIZE, 2918 sizeof(struct rte_pktmbuf_pool_private), 2919 rte_pktmbuf_pool_init, NULL, 2920 rte_pktmbuf_init, NULL, 2921 rte_socket_id(), 0); 2922 if (mbuf_pool == NULL) 2923 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2924 2925 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2926 vpool_array[queue_id].pool = mbuf_pool; 2927 2928 if (vm2vm_mode == VM2VM_HARDWARE) { 2929 /* Enable VT loop back to let L2 switch to do it. */ 2930 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2931 LOG_DEBUG(VHOST_CONFIG, 2932 "Enable loop back for L2 switch in vmdq.\n"); 2933 } 2934 } else { 2935 uint32_t nb_mbuf; 2936 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2937 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2938 2939 /* 2940 * Zero copy defers queue RX/TX start to the time when guest 2941 * finishes its startup and packet buffers from that guest are 2942 * available. 
2943 */ 2944 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2945 rx_conf_default.rx_drop_en = 0; 2946 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2947 nb_mbuf = num_rx_descriptor 2948 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2949 + num_switching_cores * MAX_PKT_BURST; 2950 2951 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2952 snprintf(pool_name, sizeof(pool_name), 2953 "rxmbuf_pool_%u", queue_id); 2954 snprintf(ring_name, sizeof(ring_name), 2955 "rxmbuf_ring_%u", queue_id); 2956 setup_mempool_tbl(rte_socket_id(), queue_id, 2957 pool_name, ring_name, nb_mbuf); 2958 } 2959 2960 nb_mbuf = num_tx_descriptor 2961 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2962 + num_switching_cores * MAX_PKT_BURST; 2963 2964 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2965 snprintf(pool_name, sizeof(pool_name), 2966 "txmbuf_pool_%u", queue_id); 2967 snprintf(ring_name, sizeof(ring_name), 2968 "txmbuf_ring_%u", queue_id); 2969 setup_mempool_tbl(rte_socket_id(), 2970 (queue_id + MAX_QUEUES), 2971 pool_name, ring_name, nb_mbuf); 2972 } 2973 2974 if (vm2vm_mode == VM2VM_HARDWARE) { 2975 /* Enable VT loop back to let L2 switch to do it. */ 2976 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2977 LOG_DEBUG(VHOST_CONFIG, 2978 "Enable loop back for L2 switch in vmdq.\n"); 2979 } 2980 } 2981 /* Set log level. */ 2982 rte_set_log_level(LOG_LEVEL); 2983 2984 /* initialize all ports */ 2985 for (portid = 0; portid < nb_ports; portid++) { 2986 /* skip ports that are not enabled */ 2987 if ((enabled_port_mask & (1 << portid)) == 0) { 2988 RTE_LOG(INFO, VHOST_PORT, 2989 "Skipping disabled port %d\n", portid); 2990 continue; 2991 } 2992 if (port_init(portid) != 0) 2993 rte_exit(EXIT_FAILURE, 2994 "Cannot initialize network ports\n"); 2995 } 2996 2997 /* Initialise all linked lists. */ 2998 if (init_data_ll() == -1) 2999 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3000 3001 /* Initialize device stats */ 3002 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3003 3004 /* Enable stats if the user option is set. */ 3005 if (enable_stats) 3006 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3007 3008 /* Launch all data cores. */ 3009 if (zero_copy == 0) { 3010 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3011 rte_eal_remote_launch(switch_worker, 3012 mbuf_pool, lcore_id); 3013 } 3014 } else { 3015 uint32_t count_in_mempool, index, i; 3016 for (index = 0; index < 2*MAX_QUEUES; index++) { 3017 /* For all RX and TX queues. */ 3018 count_in_mempool 3019 = rte_mempool_count(vpool_array[index].pool); 3020 3021 /* 3022 * Transfer all un-attached mbufs from vpool.pool 3023 * to vpoo.ring. 3024 */ 3025 for (i = 0; i < count_in_mempool; i++) { 3026 struct rte_mbuf *mbuf 3027 = __rte_mbuf_raw_alloc( 3028 vpool_array[index].pool); 3029 rte_ring_sp_enqueue(vpool_array[index].ring, 3030 (void *)mbuf); 3031 } 3032 3033 LOG_DEBUG(VHOST_CONFIG, 3034 "in MAIN: mbuf count in mempool at initial " 3035 "is: %d\n", count_in_mempool); 3036 LOG_DEBUG(VHOST_CONFIG, 3037 "in MAIN: mbuf count in ring at initial is :" 3038 " %d\n", 3039 rte_ring_count(vpool_array[index].ring)); 3040 } 3041 3042 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3043 rte_eal_remote_launch(switch_worker_zcp, NULL, 3044 lcore_id); 3045 } 3046 3047 if (mergeable == 0) 3048 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3049 3050 /* Register CUSE device to handle IOCTLs. 
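 * rte_vhost_driver_register() registers the CUSE character device (using
 * dev_basename); guest control-plane requests then arrive as IOCTLs that
 * the vhost library dispatches to the callbacks registered below.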
*/ 3051 ret = rte_vhost_driver_register((char *)&dev_basename); 3052 if (ret != 0) 3053 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3054 3055 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3056 3057 /* Start CUSE session. */ 3058 rte_vhost_driver_session_start(); 3059 return 0; 3060 3061 } 3062 3063
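/*
 * Zero-copy mbuf lifecycle (summary of the paths above): mbufs start out
 * in the per-queue vpool ring; attach_rxmbuf_zcp() binds one to a guest
 * RX descriptor so that rte_eth_rx_burst() can DMA a received frame
 * straight into guest memory; after the used ring has been updated the
 * mbuf is detached with pktmbuf_detach_zcp() and returned to the ring.
 * On the TX path, txmbuf_clean_zcp() detaches transmitted mbufs and puts
 * them back into the corresponding TX ring.
 */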