1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 128 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 /* 83 * RX and TX Prefetch, Host, and Write-back threshold values should be 84 * carefully set for optimal performance. Consult the network 85 * controller's datasheet and supporting DPDK documentation for guidance 86 * on how these parameters should be set. 87 */ 88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */ 89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */ 90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. 
*/ 91 92 /* 93 * These default values are optimized for use with the Intel(R) 82599 10 GbE 94 * Controller and the DPDK ixgbe PMD. Consider using other values for other 95 * network controllers and/or network drivers. 96 */ 97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */ 98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */ 99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */ 100 101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 102 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 103 104 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 105 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 106 107 #define JUMBO_FRAME_MAX_SIZE 0x2600 108 109 /* State of virtio device. */ 110 #define DEVICE_MAC_LEARNING 0 111 #define DEVICE_RX 1 112 #define DEVICE_SAFE_REMOVE 2 113 114 /* Config_core_flag status definitions. */ 115 #define REQUEST_DEV_REMOVAL 1 116 #define ACK_DEV_REMOVAL 0 117 118 /* Configurable number of RX/TX ring descriptors */ 119 #define RTE_TEST_RX_DESC_DEFAULT 1024 120 #define RTE_TEST_TX_DESC_DEFAULT 512 121 122 /* 123 * Need refine these 2 macros for legacy and DPDK based front end: 124 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 125 * And then adjust power 2. 126 */ 127 /* 128 * For legacy front end, 128 descriptors, 129 * half for virtio header, another half for mbuf. 130 */ 131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 133 134 /* Get first 4 bytes in mbuf headroom. */ 135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 136 + sizeof(struct rte_mbuf))) 137 138 /* true if x is a power of 2 */ 139 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 140 141 #define INVALID_PORT_ID 0xFF 142 143 /* Max number of devices. Limited by vmdq. */ 144 #define MAX_DEVICES 64 145 146 /* Size of buffers used for snprintfs. */ 147 #define MAX_PRINT_BUFF 6072 148 149 /* Maximum character device basename size. */ 150 #define MAX_BASENAME_SZ 10 151 152 /* Maximum long option length for option parsing. */ 153 #define MAX_LONG_OPT_SZ 64 154 155 /* Used to compare MAC addresses. */ 156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 157 158 /* Number of descriptors per cacheline. */ 159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc)) 160 161 /* mask of enabled ports */ 162 static uint32_t enabled_port_mask = 0; 163 164 /*Number of switching cores enabled*/ 165 static uint32_t num_switching_cores = 0; 166 167 /* number of devices/queues to support*/ 168 static uint32_t num_queues = 0; 169 static uint32_t num_devices; 170 171 /* 172 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 173 * disabled on default. 174 */ 175 static uint32_t zero_copy; 176 static int mergeable; 177 178 /* number of descriptors to apply*/ 179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 181 182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 183 #define MAX_RING_DESC 4096 184 185 struct vpool { 186 struct rte_mempool *pool; 187 struct rte_ring *ring; 188 uint32_t buf_size; 189 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 190 191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. 
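 * VM2VM_SOFTWARE switches packets between guests in software on the host,
 * while VM2VM_HARDWARE sends them out through the NIC and relies on VMDQ
 * loopback to deliver them to the destination guest.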
 */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
	.rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could not
		 * forward packets from one virtio device to another.
		 */
		.hw_vlan_strip = 1, /**< VLAN strip enabled.
*/ 260 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 261 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 262 }, 263 264 .txmode = { 265 .mq_mode = ETH_MQ_TX_NONE, 266 }, 267 .rx_adv_conf = { 268 /* 269 * should be overridden separately in code with 270 * appropriate values 271 */ 272 .vmdq_rx_conf = { 273 .nb_queue_pools = ETH_8_POOLS, 274 .enable_default_pool = 0, 275 .default_pool = 0, 276 .nb_pool_maps = 0, 277 .pool_map = {{0, 0},}, 278 }, 279 }, 280 }; 281 282 static unsigned lcore_ids[RTE_MAX_LCORE]; 283 static uint8_t ports[RTE_MAX_ETHPORTS]; 284 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 285 286 static const uint16_t external_pkt_default_vlan_tag = 2000; 287 const uint16_t vlan_tags[] = { 288 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 289 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 290 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 291 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 292 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 293 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 294 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 295 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 296 }; 297 298 /* ethernet addresses of ports */ 299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 300 301 /* heads for the main used and free linked lists for the data path. */ 302 static struct virtio_net_data_ll *ll_root_used = NULL; 303 static struct virtio_net_data_ll *ll_root_free = NULL; 304 305 /* Array of data core structures containing information on individual core linked lists. */ 306 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 307 308 /* Used for queueing bursts of TX packets. */ 309 struct mbuf_table { 310 unsigned len; 311 unsigned txq_id; 312 struct rte_mbuf *m_table[MAX_PKT_BURST]; 313 }; 314 315 /* TX queue for each data core. */ 316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 317 318 /* TX queue fori each virtio device for zero copy. */ 319 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 320 321 /* Vlan header struct used to insert vlan tags on TX. */ 322 struct vlan_ethhdr { 323 unsigned char h_dest[ETH_ALEN]; 324 unsigned char h_source[ETH_ALEN]; 325 __be16 h_vlan_proto; 326 __be16 h_vlan_TCI; 327 __be16 h_vlan_encapsulated_proto; 328 }; 329 330 /* IPv4 Header */ 331 struct ipv4_hdr { 332 uint8_t version_ihl; /**< version and header length */ 333 uint8_t type_of_service; /**< type of service */ 334 uint16_t total_length; /**< length of packet */ 335 uint16_t packet_id; /**< packet ID */ 336 uint16_t fragment_offset; /**< fragmentation offset */ 337 uint8_t time_to_live; /**< time to live */ 338 uint8_t next_proto_id; /**< protocol ID */ 339 uint16_t hdr_checksum; /**< header checksum */ 340 uint32_t src_addr; /**< source address */ 341 uint32_t dst_addr; /**< destination address */ 342 } __attribute__((__packed__)); 343 344 /* Header lengths. */ 345 #define VLAN_HLEN 4 346 #define VLAN_ETH_HLEN 18 347 348 /* Per-device statistics struct */ 349 struct device_statistics { 350 uint64_t tx_total; 351 rte_atomic64_t rx_total_atomic; 352 uint64_t rx_total; 353 uint64_t tx; 354 rte_atomic64_t rx_atomic; 355 uint64_t rx; 356 } __rte_cache_aligned; 357 struct device_statistics dev_statistics[MAX_DEVICES]; 358 359 /* 360 * Builds up the correct configuration for VMDQ VLAN pool map 361 * according to the pool & queue limits. 
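 * One pool is created per virtio device, and each pool is mapped to a
 * distinct VLAN ID taken from the vlan_tags[] array.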
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
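	/*
	 * Note: rte_eth_dev_start() also starts the RX/TX queues configured
	 * above, so packets can start arriving in the per-queue mempools
	 * from this point on.
	 */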
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"	--vm2vm [0|1|2]\n"
	"	--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"	--dev-basename <name>\n"
	"	--nb-devices ND\n"
	"	-p PORTMASK: Set mask for ports to be used by application\n"
	"	--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"	--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"	--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
	"	--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
	"	--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"	--dev-basename: The basename to be used for the character device.\n"
	"	--zero-copy [0|1]: disable(default)/enable rx/tx zero copy\n"
	"	--rx-desc-num [0-N]: the number of descriptors on rx, used only when zero copy is enabled.\n"
	"	--tx-desc-num [0-N]: the number of descriptors on tx, used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
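 * Only the port mask is accepted as a short option (-p); all other options
 * are handled through the getopt_long() long_option table below.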
578 */ 579 static int 580 us_vhost_parse_args(int argc, char **argv) 581 { 582 int opt, ret; 583 int option_index; 584 unsigned i; 585 const char *prgname = argv[0]; 586 static struct option long_option[] = { 587 {"vm2vm", required_argument, NULL, 0}, 588 {"rx-retry", required_argument, NULL, 0}, 589 {"rx-retry-delay", required_argument, NULL, 0}, 590 {"rx-retry-num", required_argument, NULL, 0}, 591 {"mergeable", required_argument, NULL, 0}, 592 {"stats", required_argument, NULL, 0}, 593 {"dev-basename", required_argument, NULL, 0}, 594 {"zero-copy", required_argument, NULL, 0}, 595 {"rx-desc-num", required_argument, NULL, 0}, 596 {"tx-desc-num", required_argument, NULL, 0}, 597 {NULL, 0, 0, 0}, 598 }; 599 600 /* Parse command line */ 601 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 602 switch (opt) { 603 /* Portmask */ 604 case 'p': 605 enabled_port_mask = parse_portmask(optarg); 606 if (enabled_port_mask == 0) { 607 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 608 us_vhost_usage(prgname); 609 return -1; 610 } 611 break; 612 613 case 0: 614 /* Enable/disable vm2vm comms. */ 615 if (!strncmp(long_option[option_index].name, "vm2vm", 616 MAX_LONG_OPT_SZ)) { 617 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 618 if (ret == -1) { 619 RTE_LOG(INFO, VHOST_CONFIG, 620 "Invalid argument for " 621 "vm2vm [0|1|2]\n"); 622 us_vhost_usage(prgname); 623 return -1; 624 } else { 625 vm2vm_mode = (vm2vm_type)ret; 626 } 627 } 628 629 /* Enable/disable retries on RX. */ 630 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 631 ret = parse_num_opt(optarg, 1); 632 if (ret == -1) { 633 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 634 us_vhost_usage(prgname); 635 return -1; 636 } else { 637 enable_retry = ret; 638 } 639 } 640 641 /* Specify the retries delay time (in useconds) on RX. */ 642 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 643 ret = parse_num_opt(optarg, INT32_MAX); 644 if (ret == -1) { 645 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 646 us_vhost_usage(prgname); 647 return -1; 648 } else { 649 burst_rx_delay_time = ret; 650 } 651 } 652 653 /* Specify the retries number on RX. */ 654 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 655 ret = parse_num_opt(optarg, INT32_MAX); 656 if (ret == -1) { 657 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 658 us_vhost_usage(prgname); 659 return -1; 660 } else { 661 burst_rx_retry_num = ret; 662 } 663 } 664 665 /* Enable/disable RX mergeable buffers. */ 666 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 667 ret = parse_num_opt(optarg, 1); 668 if (ret == -1) { 669 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 670 us_vhost_usage(prgname); 671 return -1; 672 } else { 673 mergeable = !!ret; 674 if (ret) { 675 vmdq_conf_default.rxmode.jumbo_frame = 1; 676 vmdq_conf_default.rxmode.max_rx_pkt_len 677 = JUMBO_FRAME_MAX_SIZE; 678 } 679 } 680 } 681 682 /* Enable/disable stats. */ 683 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 684 ret = parse_num_opt(optarg, INT32_MAX); 685 if (ret == -1) { 686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 687 us_vhost_usage(prgname); 688 return -1; 689 } else { 690 enable_stats = ret; 691 } 692 } 693 694 /* Set character device basename. 
*/ 695 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 696 if (us_vhost_parse_basename(optarg) == -1) { 697 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 698 us_vhost_usage(prgname); 699 return -1; 700 } 701 } 702 703 /* Enable/disable rx/tx zero copy. */ 704 if (!strncmp(long_option[option_index].name, 705 "zero-copy", MAX_LONG_OPT_SZ)) { 706 ret = parse_num_opt(optarg, 1); 707 if (ret == -1) { 708 RTE_LOG(INFO, VHOST_CONFIG, 709 "Invalid argument" 710 " for zero-copy [0|1]\n"); 711 us_vhost_usage(prgname); 712 return -1; 713 } else 714 zero_copy = ret; 715 716 if (zero_copy) { 717 #ifdef RTE_MBUF_REFCNT 718 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 719 "zero copy vhost APP, please " 720 "disable RTE_MBUF_REFCNT\n" 721 "in config file and then rebuild DPDK " 722 "core lib!\n" 723 "Otherwise please disable zero copy " 724 "flag in command line!\n"); 725 return -1; 726 #endif 727 } 728 } 729 730 /* Specify the descriptor number on RX. */ 731 if (!strncmp(long_option[option_index].name, 732 "rx-desc-num", MAX_LONG_OPT_SZ)) { 733 ret = parse_num_opt(optarg, MAX_RING_DESC); 734 if ((ret == -1) || (!POWEROF2(ret))) { 735 RTE_LOG(INFO, VHOST_CONFIG, 736 "Invalid argument for rx-desc-num[0-N]," 737 "power of 2 required.\n"); 738 us_vhost_usage(prgname); 739 return -1; 740 } else { 741 num_rx_descriptor = ret; 742 } 743 } 744 745 /* Specify the descriptor number on TX. */ 746 if (!strncmp(long_option[option_index].name, 747 "tx-desc-num", MAX_LONG_OPT_SZ)) { 748 ret = parse_num_opt(optarg, MAX_RING_DESC); 749 if ((ret == -1) || (!POWEROF2(ret))) { 750 RTE_LOG(INFO, VHOST_CONFIG, 751 "Invalid argument for tx-desc-num [0-N]," 752 "power of 2 required.\n"); 753 us_vhost_usage(prgname); 754 return -1; 755 } else { 756 num_tx_descriptor = ret; 757 } 758 } 759 760 break; 761 762 /* Invalid option - print options. 
*/ 763 default: 764 us_vhost_usage(prgname); 765 return -1; 766 } 767 } 768 769 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 770 if (enabled_port_mask & (1 << i)) 771 ports[num_ports++] = (uint8_t)i; 772 } 773 774 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 775 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 776 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 777 return -1; 778 } 779 780 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 781 RTE_LOG(INFO, VHOST_PORT, 782 "Vhost zero copy doesn't support software vm2vm," 783 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 784 return -1; 785 } 786 787 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 788 RTE_LOG(INFO, VHOST_PORT, 789 "Vhost zero copy doesn't support jumbo frame," 790 "please specify '--mergeable 0' to disable the " 791 "mergeable feature.\n"); 792 return -1; 793 } 794 795 return 0; 796 } 797 798 /* 799 * Update the global var NUM_PORTS and array PORTS according to system ports number 800 * and return valid ports number 801 */ 802 static unsigned check_ports_num(unsigned nb_ports) 803 { 804 unsigned valid_num_ports = num_ports; 805 unsigned portid; 806 807 if (num_ports > nb_ports) { 808 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 809 num_ports, nb_ports); 810 num_ports = nb_ports; 811 } 812 813 for (portid = 0; portid < num_ports; portid ++) { 814 if (ports[portid] >= nb_ports) { 815 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 816 ports[portid], (nb_ports - 1)); 817 ports[portid] = INVALID_PORT_ID; 818 valid_num_ports--; 819 } 820 } 821 return valid_num_ports; 822 } 823 824 /* 825 * Macro to print out packet contents. Wrapped in debug define so that the 826 * data path is not effected when debug is disabled. 827 */ 828 #ifdef DEBUG 829 #define PRINT_PACKET(device, addr, size, header) do { \ 830 char *pkt_addr = (char*)(addr); \ 831 unsigned int index; \ 832 char packet[MAX_PRINT_BUFF]; \ 833 \ 834 if ((header)) \ 835 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 836 else \ 837 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 838 for (index = 0; index < (size); index++) { \ 839 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 840 "%02hhx ", pkt_addr[index]); \ 841 } \ 842 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 843 \ 844 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 845 } while(0) 846 #else 847 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 848 #endif 849 850 /* 851 * Function to convert guest physical addresses to vhost physical addresses. 852 * This is used to convert virtio buffer addresses. 
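 * The addr_type output parameter reports whether the translated buffer is
 * physically continuous or crosses a sub-region boundary, so callers can
 * decide whether the buffer can be used for zero copy directly.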
853 */ 854 static inline uint64_t __attribute__((always_inline)) 855 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 856 uint32_t buf_len, hpa_type *addr_type) 857 { 858 struct virtio_memory_regions_hpa *region; 859 uint32_t regionidx; 860 uint64_t vhost_pa = 0; 861 862 *addr_type = PHYS_ADDR_INVALID; 863 864 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 865 region = &vdev->regions_hpa[regionidx]; 866 if ((guest_pa >= region->guest_phys_address) && 867 (guest_pa <= region->guest_phys_address_end)) { 868 vhost_pa = region->host_phys_addr_offset + guest_pa; 869 if (likely((guest_pa + buf_len - 1) 870 <= region->guest_phys_address_end)) 871 *addr_type = PHYS_ADDR_CONTINUOUS; 872 else 873 *addr_type = PHYS_ADDR_CROSS_SUBREG; 874 break; 875 } 876 } 877 878 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 879 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 880 (void *)(uintptr_t)vhost_pa); 881 882 return vhost_pa; 883 } 884 885 /* 886 * Compares a packet destination MAC address to a device MAC address. 887 */ 888 static inline int __attribute__((always_inline)) 889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 890 { 891 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 892 } 893 894 /* 895 * This function learns the MAC address of the device and registers this along with a 896 * vlan tag to a VMDQ. 897 */ 898 static int 899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 900 { 901 struct ether_hdr *pkt_hdr; 902 struct virtio_net_data_ll *dev_ll; 903 struct virtio_net *dev = vdev->dev; 904 int i, ret; 905 906 /* Learn MAC address of guest device from packet */ 907 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 908 909 dev_ll = ll_root_used; 910 911 while (dev_ll != NULL) { 912 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 913 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 914 return -1; 915 } 916 dev_ll = dev_ll->next; 917 } 918 919 for (i = 0; i < ETHER_ADDR_LEN; i++) 920 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 921 922 /* vlan_tag currently uses the device_id. */ 923 vdev->vlan_tag = vlan_tags[dev->device_fh]; 924 925 /* Print out VMDQ registration info. */ 926 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 927 dev->device_fh, 928 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 929 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 930 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 931 vdev->vlan_tag); 932 933 /* Register the MAC address. */ 934 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 935 if (ret) 936 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 937 dev->device_fh); 938 939 /* Enable stripping of the vlan tag as we handle routing. */ 940 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 941 942 /* Set device as ready for RX. */ 943 vdev->ready = DEVICE_RX; 944 945 return 0; 946 } 947 948 /* 949 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 950 * queue before disabling RX on the device. 
951 */ 952 static inline void 953 unlink_vmdq(struct vhost_dev *vdev) 954 { 955 unsigned i = 0; 956 unsigned rx_count; 957 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 958 959 if (vdev->ready == DEVICE_RX) { 960 /*clear MAC and VLAN settings*/ 961 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 962 for (i = 0; i < 6; i++) 963 vdev->mac_address.addr_bytes[i] = 0; 964 965 vdev->vlan_tag = 0; 966 967 /*Clear out the receive buffers*/ 968 rx_count = rte_eth_rx_burst(ports[0], 969 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 970 971 while (rx_count) { 972 for (i = 0; i < rx_count; i++) 973 rte_pktmbuf_free(pkts_burst[i]); 974 975 rx_count = rte_eth_rx_burst(ports[0], 976 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 977 } 978 979 vdev->ready = DEVICE_MAC_LEARNING; 980 } 981 } 982 983 /* 984 * Check if the packet destination MAC address is for a local device. If so then put 985 * the packet on that devices RX queue. If not then return. 986 */ 987 static inline int __attribute__((always_inline)) 988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 989 { 990 struct virtio_net_data_ll *dev_ll; 991 struct ether_hdr *pkt_hdr; 992 uint64_t ret = 0; 993 struct virtio_net *dev = vdev->dev; 994 struct virtio_net *tdev; /* destination virito device */ 995 996 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 997 998 /*get the used devices list*/ 999 dev_ll = ll_root_used; 1000 1001 while (dev_ll != NULL) { 1002 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1003 &dev_ll->vdev->mac_address)) { 1004 1005 /* Drop the packet if the TX packet is destined for the TX device. */ 1006 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1007 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1008 dev->device_fh); 1009 return 0; 1010 } 1011 tdev = dev_ll->vdev->dev; 1012 1013 1014 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1015 1016 if (unlikely(dev_ll->vdev->remove)) { 1017 /*drop the packet if the device is marked for removal*/ 1018 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1019 } else { 1020 /*send the packet to the local virtio device*/ 1021 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1022 if (enable_stats) { 1023 rte_atomic64_add( 1024 &dev_statistics[tdev->device_fh].rx_total_atomic, 1025 1); 1026 rte_atomic64_add( 1027 &dev_statistics[tdev->device_fh].rx_atomic, 1028 ret); 1029 dev_statistics[tdev->device_fh].tx_total++; 1030 dev_statistics[tdev->device_fh].tx += ret; 1031 } 1032 } 1033 1034 return 0; 1035 } 1036 dev_ll = dev_ll->next; 1037 } 1038 1039 return -1; 1040 } 1041 1042 /* 1043 * This function routes the TX packet to the correct interface. This may be a local device 1044 * or the physical port. 
1045 */ 1046 static inline void __attribute__((always_inline)) 1047 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1048 { 1049 struct mbuf_table *tx_q; 1050 struct rte_mbuf **m_table; 1051 unsigned len, ret, offset = 0; 1052 const uint16_t lcore_id = rte_lcore_id(); 1053 struct virtio_net_data_ll *dev_ll = ll_root_used; 1054 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1055 struct virtio_net *dev = vdev->dev; 1056 1057 /*check if destination is local VM*/ 1058 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1059 rte_pktmbuf_free(m); 1060 return; 1061 } 1062 1063 if (vm2vm_mode == VM2VM_HARDWARE) { 1064 while (dev_ll != NULL) { 1065 if ((dev_ll->vdev->ready == DEVICE_RX) 1066 && ether_addr_cmp(&(pkt_hdr->d_addr), 1067 &dev_ll->vdev->mac_address)) { 1068 /* 1069 * Drop the packet if the TX packet is 1070 * destined for the TX device. 1071 */ 1072 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1073 LOG_DEBUG(VHOST_DATA, 1074 "(%"PRIu64") TX: Source and destination" 1075 " MAC addresses are the same. Dropping " 1076 "packet.\n", 1077 dev_ll->vdev->dev->device_fh); 1078 rte_pktmbuf_free(m); 1079 return; 1080 } 1081 1082 /* 1083 * HW vlan strip will reduce the packet length 1084 * by minus length of vlan tag, so need restore 1085 * the packet length by plus it. 1086 */ 1087 offset = VLAN_HLEN; 1088 vlan_tag = 1089 (uint16_t) 1090 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1091 1092 LOG_DEBUG(VHOST_DATA, 1093 "(%"PRIu64") TX: pkt to local VM device id:" 1094 "(%"PRIu64") vlan tag: %d.\n", 1095 dev->device_fh, dev_ll->vdev->dev->device_fh, 1096 vlan_tag); 1097 1098 break; 1099 } 1100 dev_ll = dev_ll->next; 1101 } 1102 } 1103 1104 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1105 1106 /*Add packet to the port tx queue*/ 1107 tx_q = &lcore_tx_queue[lcore_id]; 1108 len = tx_q->len; 1109 1110 m->ol_flags = PKT_TX_VLAN_PKT; 1111 1112 m->data_len += offset; 1113 m->pkt_len += offset; 1114 1115 m->vlan_tci = vlan_tag; 1116 1117 tx_q->m_table[len] = m; 1118 len++; 1119 if (enable_stats) { 1120 dev_statistics[dev->device_fh].tx_total++; 1121 dev_statistics[dev->device_fh].tx++; 1122 } 1123 1124 if (unlikely(len == MAX_PKT_BURST)) { 1125 m_table = (struct rte_mbuf **)tx_q->m_table; 1126 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1127 /* Free any buffers not handled by TX and update the port stats. */ 1128 if (unlikely(ret < len)) { 1129 do { 1130 rte_pktmbuf_free(m_table[ret]); 1131 } while (++ret < len); 1132 } 1133 1134 len = 0; 1135 } 1136 1137 tx_q->len = len; 1138 return; 1139 } 1140 /* 1141 * This function is called by each data core. It handles all RX/TX registered with the 1142 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1143 * with all devices in the main linked list. 
1144 */ 1145 static int 1146 switch_worker(__attribute__((unused)) void *arg) 1147 { 1148 struct rte_mempool *mbuf_pool = arg; 1149 struct virtio_net *dev = NULL; 1150 struct vhost_dev *vdev = NULL; 1151 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1152 struct virtio_net_data_ll *dev_ll; 1153 struct mbuf_table *tx_q; 1154 volatile struct lcore_ll_info *lcore_ll; 1155 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1156 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1157 unsigned ret, i; 1158 const uint16_t lcore_id = rte_lcore_id(); 1159 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1160 uint16_t rx_count = 0; 1161 uint16_t tx_count; 1162 uint32_t retry = 0; 1163 1164 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1165 lcore_ll = lcore_info[lcore_id].lcore_ll; 1166 prev_tsc = 0; 1167 1168 tx_q = &lcore_tx_queue[lcore_id]; 1169 for (i = 0; i < num_cores; i ++) { 1170 if (lcore_ids[i] == lcore_id) { 1171 tx_q->txq_id = i; 1172 break; 1173 } 1174 } 1175 1176 while(1) { 1177 cur_tsc = rte_rdtsc(); 1178 /* 1179 * TX burst queue drain 1180 */ 1181 diff_tsc = cur_tsc - prev_tsc; 1182 if (unlikely(diff_tsc > drain_tsc)) { 1183 1184 if (tx_q->len) { 1185 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1186 1187 /*Tx any packets in the queue*/ 1188 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1189 (struct rte_mbuf **)tx_q->m_table, 1190 (uint16_t)tx_q->len); 1191 if (unlikely(ret < tx_q->len)) { 1192 do { 1193 rte_pktmbuf_free(tx_q->m_table[ret]); 1194 } while (++ret < tx_q->len); 1195 } 1196 1197 tx_q->len = 0; 1198 } 1199 1200 prev_tsc = cur_tsc; 1201 1202 } 1203 1204 rte_prefetch0(lcore_ll->ll_root_used); 1205 /* 1206 * Inform the configuration core that we have exited the linked list and that no devices are 1207 * in use if requested. 
1208 */ 1209 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1210 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1211 1212 /* 1213 * Process devices 1214 */ 1215 dev_ll = lcore_ll->ll_root_used; 1216 1217 while (dev_ll != NULL) { 1218 /*get virtio device ID*/ 1219 vdev = dev_ll->vdev; 1220 dev = vdev->dev; 1221 1222 if (unlikely(vdev->remove)) { 1223 dev_ll = dev_ll->next; 1224 unlink_vmdq(vdev); 1225 vdev->ready = DEVICE_SAFE_REMOVE; 1226 continue; 1227 } 1228 if (likely(vdev->ready == DEVICE_RX)) { 1229 /*Handle guest RX*/ 1230 rx_count = rte_eth_rx_burst(ports[0], 1231 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1232 1233 if (rx_count) { 1234 /* 1235 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1236 * Here MAX_PKT_BURST must be less than virtio queue size 1237 */ 1238 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1239 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1240 rte_delay_us(burst_rx_delay_time); 1241 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1242 break; 1243 } 1244 } 1245 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1246 if (enable_stats) { 1247 rte_atomic64_add( 1248 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1249 rx_count); 1250 rte_atomic64_add( 1251 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1252 } 1253 while (likely(rx_count)) { 1254 rx_count--; 1255 rte_pktmbuf_free(pkts_burst[rx_count]); 1256 } 1257 1258 } 1259 } 1260 1261 if (likely(!vdev->remove)) { 1262 /* Handle guest TX*/ 1263 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1264 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1265 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1266 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1267 while (tx_count--) 1268 rte_pktmbuf_free(pkts_burst[tx_count]); 1269 } 1270 } 1271 while (tx_count) 1272 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1273 } 1274 1275 /*move to the next device in the list*/ 1276 dev_ll = dev_ll->next; 1277 } 1278 } 1279 1280 return 0; 1281 } 1282 1283 /* 1284 * This function gets available ring number for zero copy rx. 1285 * Only one thread will call this funciton for a paticular virtio device, 1286 * so, it is designed as non-thread-safe function. 1287 */ 1288 static inline uint32_t __attribute__((always_inline)) 1289 get_available_ring_num_zcp(struct virtio_net *dev) 1290 { 1291 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1292 uint16_t avail_idx; 1293 1294 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1295 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1296 } 1297 1298 /* 1299 * This function gets available ring index for zero copy rx, 1300 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1301 * Only one thread will call this funciton for a paticular virtio device, 1302 * so, it is designed as non-thread-safe function. 
1303 */ 1304 static inline uint32_t __attribute__((always_inline)) 1305 get_available_ring_index_zcp(struct virtio_net *dev, 1306 uint16_t *res_base_idx, uint32_t count) 1307 { 1308 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1309 uint16_t avail_idx; 1310 uint32_t retry = 0; 1311 uint16_t free_entries; 1312 1313 *res_base_idx = vq->last_used_idx_res; 1314 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1315 free_entries = (avail_idx - *res_base_idx); 1316 1317 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1318 "avail idx: %d, " 1319 "res base idx:%d, free entries:%d\n", 1320 dev->device_fh, avail_idx, *res_base_idx, 1321 free_entries); 1322 1323 /* 1324 * If retry is enabled and the queue is full then we wait 1325 * and retry to avoid packet loss. 1326 */ 1327 if (enable_retry && unlikely(count > free_entries)) { 1328 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1329 rte_delay_us(burst_rx_delay_time); 1330 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1331 free_entries = (avail_idx - *res_base_idx); 1332 if (count <= free_entries) 1333 break; 1334 } 1335 } 1336 1337 /*check that we have enough buffers*/ 1338 if (unlikely(count > free_entries)) 1339 count = free_entries; 1340 1341 if (unlikely(count == 0)) { 1342 LOG_DEBUG(VHOST_DATA, 1343 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1344 "avail idx: %d, res base idx:%d, free entries:%d\n", 1345 dev->device_fh, avail_idx, 1346 *res_base_idx, free_entries); 1347 return 0; 1348 } 1349 1350 vq->last_used_idx_res = *res_base_idx + count; 1351 1352 return count; 1353 } 1354 1355 /* 1356 * This function put descriptor back to used list. 1357 */ 1358 static inline void __attribute__((always_inline)) 1359 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1360 { 1361 uint16_t res_cur_idx = vq->last_used_idx; 1362 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1363 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1364 rte_compiler_barrier(); 1365 *(volatile uint16_t *)&vq->used->idx += 1; 1366 vq->last_used_idx += 1; 1367 1368 /* Kick the guest if necessary. */ 1369 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1370 eventfd_write((int)vq->kickfd, 1); 1371 } 1372 1373 /* 1374 * This function get available descriptor from vitio vring and un-attached mbuf 1375 * from vpool->ring, and then attach them together. It needs adjust the offset 1376 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1377 * frame data may be put to wrong location in mbuf. 
1378 */ 1379 static inline void __attribute__((always_inline)) 1380 attach_rxmbuf_zcp(struct virtio_net *dev) 1381 { 1382 uint16_t res_base_idx, desc_idx; 1383 uint64_t buff_addr, phys_addr; 1384 struct vhost_virtqueue *vq; 1385 struct vring_desc *desc; 1386 struct rte_mbuf *mbuf = NULL; 1387 struct vpool *vpool; 1388 hpa_type addr_type; 1389 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1390 1391 vpool = &vpool_array[vdev->vmdq_rx_q]; 1392 vq = dev->virtqueue[VIRTIO_RXQ]; 1393 1394 do { 1395 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1396 1) != 1)) 1397 return; 1398 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1399 1400 desc = &vq->desc[desc_idx]; 1401 if (desc->flags & VRING_DESC_F_NEXT) { 1402 desc = &vq->desc[desc->next]; 1403 buff_addr = gpa_to_vva(dev, desc->addr); 1404 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1405 &addr_type); 1406 } else { 1407 buff_addr = gpa_to_vva(dev, 1408 desc->addr + vq->vhost_hlen); 1409 phys_addr = gpa_to_hpa(vdev, 1410 desc->addr + vq->vhost_hlen, 1411 desc->len, &addr_type); 1412 } 1413 1414 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1415 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1416 " address found when attaching RX frame buffer" 1417 " address!\n", dev->device_fh); 1418 put_desc_to_used_list_zcp(vq, desc_idx); 1419 continue; 1420 } 1421 1422 /* 1423 * Check if the frame buffer address from guest crosses 1424 * sub-region or not. 1425 */ 1426 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1427 RTE_LOG(ERR, VHOST_DATA, 1428 "(%"PRIu64") Frame buffer address cross " 1429 "sub-regioin found when attaching RX frame " 1430 "buffer address!\n", 1431 dev->device_fh); 1432 put_desc_to_used_list_zcp(vq, desc_idx); 1433 continue; 1434 } 1435 } while (unlikely(phys_addr == 0)); 1436 1437 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1438 if (unlikely(mbuf == NULL)) { 1439 LOG_DEBUG(VHOST_DATA, 1440 "(%"PRIu64") in attach_rxmbuf_zcp: " 1441 "ring_sc_dequeue fail.\n", 1442 dev->device_fh); 1443 put_desc_to_used_list_zcp(vq, desc_idx); 1444 return; 1445 } 1446 1447 if (unlikely(vpool->buf_size > desc->len)) { 1448 LOG_DEBUG(VHOST_DATA, 1449 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1450 "length(%d) of descriptor idx: %d less than room " 1451 "size required: %d\n", 1452 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1453 put_desc_to_used_list_zcp(vq, desc_idx); 1454 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1455 return; 1456 } 1457 1458 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1459 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1460 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1461 mbuf->data_len = desc->len; 1462 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1463 1464 LOG_DEBUG(VHOST_DATA, 1465 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1466 "descriptor idx:%d\n", 1467 dev->device_fh, res_base_idx, desc_idx); 1468 1469 __rte_mbuf_raw_free(mbuf); 1470 1471 return; 1472 } 1473 1474 /* 1475 * Detach an attched packet mbuf - 1476 * - restore original mbuf address and length values. 1477 * - reset pktmbuf data and data_len to their default values. 1478 * All other fields of the given packet mbuf will be left intact. 1479 * 1480 * @param m 1481 * The attached packet mbuf. 
1482 */ 1483 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1484 { 1485 const struct rte_mempool *mp = m->pool; 1486 void *buf = RTE_MBUF_TO_BADDR(m); 1487 uint32_t buf_ofs; 1488 uint32_t buf_len = mp->elt_size - sizeof(*m); 1489 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1490 1491 m->buf_addr = buf; 1492 m->buf_len = (uint16_t)buf_len; 1493 1494 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1495 RTE_PKTMBUF_HEADROOM : m->buf_len; 1496 m->data_off = buf_ofs; 1497 1498 m->data_len = 0; 1499 } 1500 1501 /* 1502 * This function is called after packets have been transimited. It fetchs mbuf 1503 * from vpool->pool, detached it and put into vpool->ring. It also update the 1504 * used index and kick the guest if necessary. 1505 */ 1506 static inline uint32_t __attribute__((always_inline)) 1507 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1508 { 1509 struct rte_mbuf *mbuf; 1510 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1511 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1512 uint32_t index = 0; 1513 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1514 1515 LOG_DEBUG(VHOST_DATA, 1516 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1517 "clean is: %d\n", 1518 dev->device_fh, mbuf_count); 1519 LOG_DEBUG(VHOST_DATA, 1520 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1521 "clean is : %d\n", 1522 dev->device_fh, rte_ring_count(vpool->ring)); 1523 1524 for (index = 0; index < mbuf_count; index++) { 1525 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1526 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1527 pktmbuf_detach_zcp(mbuf); 1528 rte_ring_sp_enqueue(vpool->ring, mbuf); 1529 1530 /* Update used index buffer information. */ 1531 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1532 vq->used->ring[used_idx].len = 0; 1533 1534 used_idx = (used_idx + 1) & (vq->size - 1); 1535 } 1536 1537 LOG_DEBUG(VHOST_DATA, 1538 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1539 "clean is: %d\n", 1540 dev->device_fh, rte_mempool_count(vpool->pool)); 1541 LOG_DEBUG(VHOST_DATA, 1542 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1543 "clean is : %d\n", 1544 dev->device_fh, rte_ring_count(vpool->ring)); 1545 LOG_DEBUG(VHOST_DATA, 1546 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1547 "vq->last_used_idx:%d\n", 1548 dev->device_fh, vq->last_used_idx); 1549 1550 vq->last_used_idx += mbuf_count; 1551 1552 LOG_DEBUG(VHOST_DATA, 1553 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1554 "vq->last_used_idx:%d\n", 1555 dev->device_fh, vq->last_used_idx); 1556 1557 rte_compiler_barrier(); 1558 1559 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1560 1561 /* Kick guest if required. */ 1562 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1563 eventfd_write((int)vq->kickfd, 1); 1564 1565 return 0; 1566 } 1567 1568 /* 1569 * This function is called when a virtio device is destroy. 1570 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1571 */ 1572 static void mbuf_destroy_zcp(struct vpool *vpool) 1573 { 1574 struct rte_mbuf *mbuf = NULL; 1575 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1576 1577 LOG_DEBUG(VHOST_CONFIG, 1578 "in mbuf_destroy_zcp: mbuf count in mempool before " 1579 "mbuf_destroy_zcp is: %d\n", 1580 mbuf_count); 1581 LOG_DEBUG(VHOST_CONFIG, 1582 "in mbuf_destroy_zcp: mbuf count in ring before " 1583 "mbuf_destroy_zcp is : %d\n", 1584 rte_ring_count(vpool->ring)); 1585 1586 for (index = 0; index < mbuf_count; index++) { 1587 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1588 if (likely(mbuf != NULL)) { 1589 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1590 pktmbuf_detach_zcp(mbuf); 1591 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1592 } 1593 } 1594 1595 LOG_DEBUG(VHOST_CONFIG, 1596 "in mbuf_destroy_zcp: mbuf count in mempool after " 1597 "mbuf_destroy_zcp is: %d\n", 1598 rte_mempool_count(vpool->pool)); 1599 LOG_DEBUG(VHOST_CONFIG, 1600 "in mbuf_destroy_zcp: mbuf count in ring after " 1601 "mbuf_destroy_zcp is : %d\n", 1602 rte_ring_count(vpool->ring)); 1603 } 1604 1605 /* 1606 * This function update the use flag and counter. 1607 */ 1608 static inline uint32_t __attribute__((always_inline)) 1609 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1610 uint32_t count) 1611 { 1612 struct vhost_virtqueue *vq; 1613 struct vring_desc *desc; 1614 struct rte_mbuf *buff; 1615 /* The virtio_hdr is initialised to 0. */ 1616 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1617 = {{0, 0, 0, 0, 0, 0}, 0}; 1618 uint64_t buff_hdr_addr = 0; 1619 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1620 uint32_t head_idx, packet_success = 0; 1621 uint16_t res_cur_idx; 1622 1623 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1624 1625 if (count == 0) 1626 return 0; 1627 1628 vq = dev->virtqueue[VIRTIO_RXQ]; 1629 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1630 1631 res_cur_idx = vq->last_used_idx; 1632 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1633 dev->device_fh, res_cur_idx, res_cur_idx + count); 1634 1635 /* Retrieve all of the head indexes first to avoid caching issues. */ 1636 for (head_idx = 0; head_idx < count; head_idx++) 1637 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1638 1639 /*Prefetch descriptor index. */ 1640 rte_prefetch0(&vq->desc[head[packet_success]]); 1641 1642 while (packet_success != count) { 1643 /* Get descriptor from available ring */ 1644 desc = &vq->desc[head[packet_success]]; 1645 1646 buff = pkts[packet_success]; 1647 LOG_DEBUG(VHOST_DATA, 1648 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1649 "pkt[%d] descriptor idx: %d\n", 1650 dev->device_fh, packet_success, 1651 MBUF_HEADROOM_UINT32(buff)); 1652 1653 PRINT_PACKET(dev, 1654 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1655 + RTE_PKTMBUF_HEADROOM), 1656 rte_pktmbuf_data_len(buff), 0); 1657 1658 /* Buffer address translation for virtio header. */ 1659 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1660 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1661 1662 /* 1663 * If the descriptors are chained the header and data are 1664 * placed in separate buffers. 
1665 */ 1666 if (desc->flags & VRING_DESC_F_NEXT) { 1667 desc->len = vq->vhost_hlen; 1668 desc = &vq->desc[desc->next]; 1669 desc->len = rte_pktmbuf_data_len(buff); 1670 } else { 1671 desc->len = packet_len; 1672 } 1673 1674 /* Update used ring with desc information */ 1675 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1676 = head[packet_success]; 1677 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1678 = packet_len; 1679 res_cur_idx++; 1680 packet_success++; 1681 1682 /* A header is required per buffer. */ 1683 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1684 (const void *)&virtio_hdr, vq->vhost_hlen); 1685 1686 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1687 1688 if (likely(packet_success < count)) { 1689 /* Prefetch descriptor index. */ 1690 rte_prefetch0(&vq->desc[head[packet_success]]); 1691 } 1692 } 1693 1694 rte_compiler_barrier(); 1695 1696 LOG_DEBUG(VHOST_DATA, 1697 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1698 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1699 dev->device_fh, vq->last_used_idx, vq->used->idx); 1700 1701 *(volatile uint16_t *)&vq->used->idx += count; 1702 vq->last_used_idx += count; 1703 1704 LOG_DEBUG(VHOST_DATA, 1705 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1706 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1707 dev->device_fh, vq->last_used_idx, vq->used->idx); 1708 1709 /* Kick the guest if necessary. */ 1710 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1711 eventfd_write((int)vq->kickfd, 1); 1712 1713 return count; 1714 } 1715 1716 /* 1717 * This function routes the TX packet to the correct interface. 1718 * This may be a local device or the physical port. 1719 */ 1720 static inline void __attribute__((always_inline)) 1721 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1722 uint32_t desc_idx, uint8_t need_copy) 1723 { 1724 struct mbuf_table *tx_q; 1725 struct rte_mbuf **m_table; 1726 struct rte_mbuf *mbuf = NULL; 1727 unsigned len, ret, offset = 0; 1728 struct vpool *vpool; 1729 struct virtio_net_data_ll *dev_ll = ll_root_used; 1730 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1731 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1732 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1733 1734 /*Add packet to the port tx queue*/ 1735 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1736 len = tx_q->len; 1737 1738 /* Allocate an mbuf and populate the structure. */ 1739 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1740 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1741 if (unlikely(mbuf == NULL)) { 1742 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1743 RTE_LOG(ERR, VHOST_DATA, 1744 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1745 dev->device_fh); 1746 put_desc_to_used_list_zcp(vq, desc_idx); 1747 return; 1748 } 1749 1750 if (vm2vm_mode == VM2VM_HARDWARE) { 1751 /* Avoid using a vlan tag from any vm for external pkt, such as 1752 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1753 * selection, MAC address determines it as an external pkt 1754 * which should go to network, while vlan tag determine it as 1755 * a vm2vm pkt should forward to another vm. Hardware confuse 1756 * such a ambiguous situation, so pkt will lost. 
1757 */ 1758 vlan_tag = external_pkt_default_vlan_tag; 1759 while (dev_ll != NULL) { 1760 if (likely(dev_ll->vdev->ready == DEVICE_RX) && 1761 ether_addr_cmp(&(pkt_hdr->d_addr), 1762 &dev_ll->vdev->mac_address)) { 1763 1764 /* 1765 * Drop the packet if the TX packet is destined 1766 * for the TX device. 1767 */ 1768 if (unlikely(dev_ll->vdev->dev->device_fh 1769 == dev->device_fh)) { 1770 LOG_DEBUG(VHOST_DATA, 1771 "(%"PRIu64") TX: Source and destination" 1772 "MAC addresses are the same. Dropping " 1773 "packet.\n", 1774 dev_ll->vdev->dev->device_fh); 1775 MBUF_HEADROOM_UINT32(mbuf) 1776 = (uint32_t)desc_idx; 1777 __rte_mbuf_raw_free(mbuf); 1778 return; 1779 } 1780 1781 /* 1782 * Packet length offset 4 bytes for HW vlan 1783 * strip when L2 switch back. 1784 */ 1785 offset = 4; 1786 vlan_tag = 1787 (uint16_t) 1788 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1789 1790 LOG_DEBUG(VHOST_DATA, 1791 "(%"PRIu64") TX: pkt to local VM device id:" 1792 "(%"PRIu64") vlan tag: %d.\n", 1793 dev->device_fh, dev_ll->vdev->dev->device_fh, 1794 vlan_tag); 1795 1796 break; 1797 } 1798 dev_ll = dev_ll->next; 1799 } 1800 } 1801 1802 mbuf->nb_segs = m->nb_segs; 1803 mbuf->next = m->next; 1804 mbuf->data_len = m->data_len + offset; 1805 mbuf->pkt_len = mbuf->data_len; 1806 if (unlikely(need_copy)) { 1807 /* Copy the packet contents to the mbuf. */ 1808 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1809 rte_pktmbuf_mtod(m, void *), 1810 m->data_len); 1811 } else { 1812 mbuf->data_off = m->data_off; 1813 mbuf->buf_physaddr = m->buf_physaddr; 1814 mbuf->buf_addr = m->buf_addr; 1815 } 1816 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1817 mbuf->vlan_tci = vlan_tag; 1818 mbuf->l2_len = sizeof(struct ether_hdr); 1819 mbuf->l3_len = sizeof(struct ipv4_hdr); 1820 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1821 1822 tx_q->m_table[len] = mbuf; 1823 len++; 1824 1825 LOG_DEBUG(VHOST_DATA, 1826 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1827 dev->device_fh, 1828 mbuf->nb_segs, 1829 (mbuf->next == NULL) ? "null" : "non-null"); 1830 1831 if (enable_stats) { 1832 dev_statistics[dev->device_fh].tx_total++; 1833 dev_statistics[dev->device_fh].tx++; 1834 } 1835 1836 if (unlikely(len == MAX_PKT_BURST)) { 1837 m_table = (struct rte_mbuf **)tx_q->m_table; 1838 ret = rte_eth_tx_burst(ports[0], 1839 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1840 1841 /* 1842 * Free any buffers not handled by TX and update 1843 * the port stats. 1844 */ 1845 if (unlikely(ret < len)) { 1846 do { 1847 rte_pktmbuf_free(m_table[ret]); 1848 } while (++ret < len); 1849 } 1850 1851 len = 0; 1852 txmbuf_clean_zcp(dev, vpool); 1853 } 1854 1855 tx_q->len = len; 1856 1857 return; 1858 } 1859 1860 /* 1861 * This function TX all available packets in virtio TX queue for one 1862 * virtio-net device. If it is first packet, it learns MAC address and 1863 * setup VMDQ. 1864 */ 1865 static inline void __attribute__((always_inline)) 1866 virtio_dev_tx_zcp(struct virtio_net *dev) 1867 { 1868 struct rte_mbuf m; 1869 struct vhost_virtqueue *vq; 1870 struct vring_desc *desc; 1871 uint64_t buff_addr = 0, phys_addr; 1872 uint32_t head[MAX_PKT_BURST]; 1873 uint32_t i; 1874 uint16_t free_entries, packet_success = 0; 1875 uint16_t avail_idx; 1876 uint8_t need_copy = 0; 1877 hpa_type addr_type; 1878 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1879 1880 vq = dev->virtqueue[VIRTIO_TXQ]; 1881 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1882 1883 /* If there are no available buffers then return. 
*/ 1884 if (vq->last_used_idx_res == avail_idx) 1885 return; 1886 1887 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1888 1889 /* Prefetch available ring to retrieve head indexes. */ 1890 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1891 1892 /* Get the number of free entries in the ring */ 1893 free_entries = (avail_idx - vq->last_used_idx_res); 1894 1895 /* Limit to MAX_PKT_BURST. */ 1896 free_entries 1897 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1898 1899 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1900 dev->device_fh, free_entries); 1901 1902 /* Retrieve all of the head indexes first to avoid caching issues. */ 1903 for (i = 0; i < free_entries; i++) 1904 head[i] 1905 = vq->avail->ring[(vq->last_used_idx_res + i) 1906 & (vq->size - 1)]; 1907 1908 vq->last_used_idx_res += free_entries; 1909 1910 /* Prefetch descriptor index. */ 1911 rte_prefetch0(&vq->desc[head[packet_success]]); 1912 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1913 1914 while (packet_success < free_entries) { 1915 desc = &vq->desc[head[packet_success]]; 1916 1917 /* Discard first buffer as it is the virtio header */ 1918 desc = &vq->desc[desc->next]; 1919 1920 /* Buffer address translation. */ 1921 buff_addr = gpa_to_vva(dev, desc->addr); 1922 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1923 1924 if (likely(packet_success < (free_entries - 1))) 1925 /* Prefetch descriptor index. */ 1926 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1927 1928 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1929 RTE_LOG(ERR, VHOST_DATA, 1930 "(%"PRIu64") Invalid frame buffer address found" 1931 "when TX packets!\n", 1932 dev->device_fh); 1933 packet_success++; 1934 continue; 1935 } 1936 1937 /* Prefetch buffer address. */ 1938 rte_prefetch0((void *)(uintptr_t)buff_addr); 1939 1940 /* 1941 * Setup dummy mbuf. This is copied to a real mbuf if 1942 * transmitted out the physical port. 1943 */ 1944 m.data_len = desc->len; 1945 m.nb_segs = 1; 1946 m.next = NULL; 1947 m.data_off = 0; 1948 m.buf_addr = (void *)(uintptr_t)buff_addr; 1949 m.buf_physaddr = phys_addr; 1950 1951 /* 1952 * Check if the frame buffer address from guest crosses 1953 * sub-region or not. 1954 */ 1955 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1956 RTE_LOG(ERR, VHOST_DATA, 1957 "(%"PRIu64") Frame buffer address cross " 1958 "sub-regioin found when attaching TX frame " 1959 "buffer address!\n", 1960 dev->device_fh); 1961 need_copy = 1; 1962 } else 1963 need_copy = 0; 1964 1965 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1966 1967 /* 1968 * If this is the first received packet we need to learn 1969 * the MAC and setup VMDQ 1970 */ 1971 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1972 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1973 /* 1974 * Discard frame if device is scheduled for 1975 * removal or a duplicate MAC address is found. 1976 */ 1977 packet_success += free_entries; 1978 vq->last_used_idx += packet_success; 1979 break; 1980 } 1981 } 1982 1983 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1984 packet_success++; 1985 } 1986 } 1987 1988 /* 1989 * This function is called by each data core. It handles all RX/TX registered 1990 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1991 * addresses are compared with all devices in the main linked list. 
1992 */ 1993 static int 1994 switch_worker_zcp(__attribute__((unused)) void *arg) 1995 { 1996 struct virtio_net *dev = NULL; 1997 struct vhost_dev *vdev = NULL; 1998 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1999 struct virtio_net_data_ll *dev_ll; 2000 struct mbuf_table *tx_q; 2001 volatile struct lcore_ll_info *lcore_ll; 2002 const uint64_t drain_tsc 2003 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2004 * BURST_TX_DRAIN_US; 2005 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2006 unsigned ret; 2007 const uint16_t lcore_id = rte_lcore_id(); 2008 uint16_t count_in_ring, rx_count = 0; 2009 2010 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2011 2012 lcore_ll = lcore_info[lcore_id].lcore_ll; 2013 prev_tsc = 0; 2014 2015 while (1) { 2016 cur_tsc = rte_rdtsc(); 2017 2018 /* TX burst queue drain */ 2019 diff_tsc = cur_tsc - prev_tsc; 2020 if (unlikely(diff_tsc > drain_tsc)) { 2021 /* 2022 * Get mbuf from vpool.pool and detach mbuf and 2023 * put back into vpool.ring. 2024 */ 2025 dev_ll = lcore_ll->ll_root_used; 2026 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2027 /* Get virtio device ID */ 2028 vdev = dev_ll->vdev; 2029 dev = vdev->dev; 2030 2031 if (likely(!vdev->remove)) { 2032 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2033 if (tx_q->len) { 2034 LOG_DEBUG(VHOST_DATA, 2035 "TX queue drained after timeout" 2036 " with burst size %u\n", 2037 tx_q->len); 2038 2039 /* 2040 * Tx any packets in the queue 2041 */ 2042 ret = rte_eth_tx_burst( 2043 ports[0], 2044 (uint16_t)tx_q->txq_id, 2045 (struct rte_mbuf **) 2046 tx_q->m_table, 2047 (uint16_t)tx_q->len); 2048 if (unlikely(ret < tx_q->len)) { 2049 do { 2050 rte_pktmbuf_free( 2051 tx_q->m_table[ret]); 2052 } while (++ret < tx_q->len); 2053 } 2054 tx_q->len = 0; 2055 2056 txmbuf_clean_zcp(dev, 2057 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2058 } 2059 } 2060 dev_ll = dev_ll->next; 2061 } 2062 prev_tsc = cur_tsc; 2063 } 2064 2065 rte_prefetch0(lcore_ll->ll_root_used); 2066 2067 /* 2068 * Inform the configuration core that we have exited the linked 2069 * list and that no devices are in use if requested. 2070 */ 2071 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2072 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2073 2074 /* Process devices */ 2075 dev_ll = lcore_ll->ll_root_used; 2076 2077 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2078 vdev = dev_ll->vdev; 2079 dev = vdev->dev; 2080 if (unlikely(vdev->remove)) { 2081 dev_ll = dev_ll->next; 2082 unlink_vmdq(vdev); 2083 vdev->ready = DEVICE_SAFE_REMOVE; 2084 continue; 2085 } 2086 2087 if (likely(vdev->ready == DEVICE_RX)) { 2088 uint32_t index = vdev->vmdq_rx_q; 2089 uint16_t i; 2090 count_in_ring 2091 = rte_ring_count(vpool_array[index].ring); 2092 uint16_t free_entries 2093 = (uint16_t)get_available_ring_num_zcp(dev); 2094 2095 /* 2096 * Attach all mbufs in vpool.ring and put back 2097 * into vpool.pool. 
2098 */ 2099 for (i = 0; 2100 i < RTE_MIN(free_entries, 2101 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2102 i++) 2103 attach_rxmbuf_zcp(dev); 2104 2105 /* Handle guest RX */ 2106 rx_count = rte_eth_rx_burst(ports[0], 2107 vdev->vmdq_rx_q, pkts_burst, 2108 MAX_PKT_BURST); 2109 2110 if (rx_count) { 2111 ret_count = virtio_dev_rx_zcp(dev, 2112 pkts_burst, rx_count); 2113 if (enable_stats) { 2114 dev_statistics[dev->device_fh].rx_total 2115 += rx_count; 2116 dev_statistics[dev->device_fh].rx 2117 += ret_count; 2118 } 2119 while (likely(rx_count)) { 2120 rx_count--; 2121 pktmbuf_detach_zcp( 2122 pkts_burst[rx_count]); 2123 rte_ring_sp_enqueue( 2124 vpool_array[index].ring, 2125 (void *)pkts_burst[rx_count]); 2126 } 2127 } 2128 } 2129 2130 if (likely(!vdev->remove)) 2131 /* Handle guest TX */ 2132 virtio_dev_tx_zcp(dev); 2133 2134 /* Move to the next device in the list */ 2135 dev_ll = dev_ll->next; 2136 } 2137 } 2138 2139 return 0; 2140 } 2141 2142 2143 /* 2144 * Add an entry to a used linked list. A free entry must first be found 2145 * in the free linked list using get_data_ll_free_entry(); 2146 */ 2147 static void 2148 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2149 struct virtio_net_data_ll *ll_dev) 2150 { 2151 struct virtio_net_data_ll *ll = *ll_root_addr; 2152 2153 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2154 ll_dev->next = NULL; 2155 rte_compiler_barrier(); 2156 2157 /* If ll == NULL then this is the first device. */ 2158 if (ll) { 2159 /* Increment to the tail of the linked list. */ 2160 while ((ll->next != NULL) ) 2161 ll = ll->next; 2162 2163 ll->next = ll_dev; 2164 } else { 2165 *ll_root_addr = ll_dev; 2166 } 2167 } 2168 2169 /* 2170 * Remove an entry from a used linked list. The entry must then be added to 2171 * the free linked list using put_data_ll_free_entry(). 2172 */ 2173 static void 2174 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2175 struct virtio_net_data_ll *ll_dev, 2176 struct virtio_net_data_ll *ll_dev_last) 2177 { 2178 struct virtio_net_data_ll *ll = *ll_root_addr; 2179 2180 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2181 return; 2182 2183 if (ll_dev == ll) 2184 *ll_root_addr = ll_dev->next; 2185 else 2186 if (likely(ll_dev_last != NULL)) 2187 ll_dev_last->next = ll_dev->next; 2188 else 2189 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2190 } 2191 2192 /* 2193 * Find and return an entry from the free linked list. 2194 */ 2195 static struct virtio_net_data_ll * 2196 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2197 { 2198 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2199 struct virtio_net_data_ll *ll_dev; 2200 2201 if (ll_free == NULL) 2202 return NULL; 2203 2204 ll_dev = ll_free; 2205 *ll_root_addr = ll_free->next; 2206 2207 return ll_dev; 2208 } 2209 2210 /* 2211 * Place an entry back on to the free linked list. 2212 */ 2213 static void 2214 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2215 struct virtio_net_data_ll *ll_dev) 2216 { 2217 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2218 2219 if (ll_dev == NULL) 2220 return; 2221 2222 ll_dev->next = ll_free; 2223 *ll_root_addr = ll_dev; 2224 } 2225 2226 /* 2227 * Creates a linked list of a given size. 2228 */ 2229 static struct virtio_net_data_ll * 2230 alloc_data_ll(uint32_t size) 2231 { 2232 struct virtio_net_data_ll *ll_new; 2233 uint32_t i; 2234 2235 /* Malloc and then chain the linked list. 
*/ 2236 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2237 if (ll_new == NULL) { 2238 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2239 return NULL; 2240 } 2241 2242 for (i = 0; i < size - 1; i++) { 2243 ll_new[i].vdev = NULL; 2244 ll_new[i].next = &ll_new[i+1]; 2245 } 2246 ll_new[i].next = NULL; 2247 2248 return (ll_new); 2249 } 2250 2251 /* 2252 * Create the main linked list along with each individual cores linked list. A used and a free list 2253 * are created to manage entries. 2254 */ 2255 static int 2256 init_data_ll (void) 2257 { 2258 int lcore; 2259 2260 RTE_LCORE_FOREACH_SLAVE(lcore) { 2261 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2262 if (lcore_info[lcore].lcore_ll == NULL) { 2263 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2264 return -1; 2265 } 2266 2267 lcore_info[lcore].lcore_ll->device_num = 0; 2268 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2269 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2270 if (num_devices % num_switching_cores) 2271 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2272 else 2273 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2274 } 2275 2276 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2277 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2278 2279 return 0; 2280 } 2281 2282 /* 2283 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2284 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2285 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2286 */ 2287 static void 2288 destroy_device (volatile struct virtio_net *dev) 2289 { 2290 struct virtio_net_data_ll *ll_lcore_dev_cur; 2291 struct virtio_net_data_ll *ll_main_dev_cur; 2292 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2293 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2294 struct vhost_dev *vdev; 2295 int lcore; 2296 2297 dev->flags &= ~VIRTIO_DEV_RUNNING; 2298 2299 vdev = (struct vhost_dev *)dev->priv; 2300 /*set the remove flag. */ 2301 vdev->remove = 1; 2302 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2303 rte_pause(); 2304 } 2305 2306 /* Search for entry to be removed from lcore ll */ 2307 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2308 while (ll_lcore_dev_cur != NULL) { 2309 if (ll_lcore_dev_cur->vdev == vdev) { 2310 break; 2311 } else { 2312 ll_lcore_dev_last = ll_lcore_dev_cur; 2313 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2314 } 2315 } 2316 2317 if (ll_lcore_dev_cur == NULL) { 2318 RTE_LOG(ERR, VHOST_CONFIG, 2319 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2320 dev->device_fh); 2321 return; 2322 } 2323 2324 /* Search for entry to be removed from main ll */ 2325 ll_main_dev_cur = ll_root_used; 2326 ll_main_dev_last = NULL; 2327 while (ll_main_dev_cur != NULL) { 2328 if (ll_main_dev_cur->vdev == vdev) { 2329 break; 2330 } else { 2331 ll_main_dev_last = ll_main_dev_cur; 2332 ll_main_dev_cur = ll_main_dev_cur->next; 2333 } 2334 } 2335 2336 /* Remove entries from the lcore and main ll. */ 2337 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2338 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2339 2340 /* Set the dev_removal_flag on each lcore. 
*/ 2341 RTE_LCORE_FOREACH_SLAVE(lcore) { 2342 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2343 } 2344 2345 /* 2346 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2347 * they can no longer access the device removed from the linked lists and that the devices 2348 * are no longer in use. 2349 */ 2350 RTE_LCORE_FOREACH_SLAVE(lcore) { 2351 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2352 rte_pause(); 2353 } 2354 } 2355 2356 /* Add the entries back to the lcore and main free ll.*/ 2357 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2358 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2359 2360 /* Decrement the number of devices on the lcore. */ 2361 lcore_info[vdev->coreid].lcore_ll->device_num--; 2362 2363 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2364 2365 if (zero_copy) { 2366 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2367 2368 /* Stop the RX queue. */ 2369 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2370 LOG_DEBUG(VHOST_CONFIG, 2371 "(%"PRIu64") In destroy_device: Failed to stop " 2372 "rx queue:%d\n", 2373 dev->device_fh, 2374 vdev->vmdq_rx_q); 2375 } 2376 2377 LOG_DEBUG(VHOST_CONFIG, 2378 "(%"PRIu64") in destroy_device: Start put mbuf in " 2379 "mempool back to ring for RX queue: %d\n", 2380 dev->device_fh, vdev->vmdq_rx_q); 2381 2382 mbuf_destroy_zcp(vpool); 2383 2384 /* Stop the TX queue. */ 2385 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2386 LOG_DEBUG(VHOST_CONFIG, 2387 "(%"PRIu64") In destroy_device: Failed to " 2388 "stop tx queue:%d\n", 2389 dev->device_fh, vdev->vmdq_rx_q); 2390 } 2391 2392 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2393 2394 LOG_DEBUG(VHOST_CONFIG, 2395 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2396 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2397 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2398 dev->device_fh); 2399 2400 mbuf_destroy_zcp(vpool); 2401 rte_free(vdev->regions_hpa); 2402 } 2403 rte_free(vdev); 2404 2405 } 2406 2407 /* 2408 * Calculate the count of physically contiguous regions within one particular 2409 * region whose vhost virtual address range is contiguous. The particular 2410 * region starts at vva_start, with a size of 'size' bytes.
2411 */ 2412 static uint32_t 2413 check_hpa_regions(uint64_t vva_start, uint64_t size) 2414 { 2415 uint32_t i, nregions = 0, page_size = getpagesize(); 2416 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2417 if (vva_start % page_size) { 2418 LOG_DEBUG(VHOST_CONFIG, 2419 "in check_countinous: vva start(%p) mod page_size(%d) " 2420 "has remainder\n", 2421 (void *)(uintptr_t)vva_start, page_size); 2422 return 0; 2423 } 2424 if (size % page_size) { 2425 LOG_DEBUG(VHOST_CONFIG, 2426 "in check_countinous: " 2427 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2428 size, page_size); 2429 return 0; 2430 } 2431 for (i = 0; i < size - page_size; i = i + page_size) { 2432 cur_phys_addr 2433 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2434 next_phys_addr = rte_mem_virt2phy( 2435 (void *)(uintptr_t)(vva_start + i + page_size)); 2436 if ((cur_phys_addr + page_size) != next_phys_addr) { 2437 ++nregions; 2438 LOG_DEBUG(VHOST_CONFIG, 2439 "in check_continuous: hva addr:(%p) is not " 2440 "continuous with hva addr:(%p), diff:%d\n", 2441 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2442 (void *)(uintptr_t)(vva_start + (uint64_t)i 2443 + page_size), page_size); 2444 LOG_DEBUG(VHOST_CONFIG, 2445 "in check_continuous: hpa addr:(%p) is not " 2446 "continuous with hpa addr:(%p), " 2447 "diff:(%"PRIu64")\n", 2448 (void *)(uintptr_t)cur_phys_addr, 2449 (void *)(uintptr_t)next_phys_addr, 2450 (next_phys_addr-cur_phys_addr)); 2451 } 2452 } 2453 return nregions; 2454 } 2455 2456 /* 2457 * Divide each region whose vhost virtual address is continous into a few 2458 * sub-regions, make sure the physical address within each sub-region are 2459 * continous. And fill offset(to GPA) and size etc. information of each 2460 * sub-region into regions_hpa. 2461 */ 2462 static uint32_t 2463 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2464 { 2465 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2466 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2467 2468 if (mem_region_hpa == NULL) 2469 return 0; 2470 2471 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2472 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2473 virtio_memory->regions[regionidx].address_offset; 2474 mem_region_hpa[regionidx_hpa].guest_phys_address 2475 = virtio_memory->regions[regionidx].guest_phys_address; 2476 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2477 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2478 mem_region_hpa[regionidx_hpa].guest_phys_address; 2479 LOG_DEBUG(VHOST_CONFIG, 2480 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2481 regionidx_hpa, 2482 (void *)(uintptr_t) 2483 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2484 LOG_DEBUG(VHOST_CONFIG, 2485 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2486 regionidx_hpa, 2487 (void *)(uintptr_t) 2488 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2489 for (i = 0, k = 0; 2490 i < virtio_memory->regions[regionidx].memory_size - 2491 page_size; 2492 i += page_size) { 2493 cur_phys_addr = rte_mem_virt2phy( 2494 (void *)(uintptr_t)(vva_start + i)); 2495 next_phys_addr = rte_mem_virt2phy( 2496 (void *)(uintptr_t)(vva_start + 2497 i + page_size)); 2498 if ((cur_phys_addr + page_size) != next_phys_addr) { 2499 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2500 mem_region_hpa[regionidx_hpa].guest_phys_address + 2501 k + page_size; 2502 mem_region_hpa[regionidx_hpa].memory_size 2503 = k + 
page_size; 2504 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2505 "phys addr end [%d]:(%p)\n", 2506 regionidx_hpa, 2507 (void *)(uintptr_t) 2508 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2509 LOG_DEBUG(VHOST_CONFIG, 2510 "in fill_hpa_regions: guest phys addr " 2511 "size [%d]:(%p)\n", 2512 regionidx_hpa, 2513 (void *)(uintptr_t) 2514 (mem_region_hpa[regionidx_hpa].memory_size)); 2515 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2516 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2517 ++regionidx_hpa; 2518 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2519 next_phys_addr - 2520 mem_region_hpa[regionidx_hpa].guest_phys_address; 2521 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2522 " phys addr start[%d]:(%p)\n", 2523 regionidx_hpa, 2524 (void *)(uintptr_t) 2525 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2526 LOG_DEBUG(VHOST_CONFIG, 2527 "in fill_hpa_regions: host phys addr " 2528 "start[%d]:(%p)\n", 2529 regionidx_hpa, 2530 (void *)(uintptr_t) 2531 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2532 k = 0; 2533 } else { 2534 k += page_size; 2535 } 2536 } 2537 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2538 = mem_region_hpa[regionidx_hpa].guest_phys_address 2539 + k + page_size; 2540 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2541 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2542 "[%d]:(%p)\n", regionidx_hpa, 2543 (void *)(uintptr_t) 2544 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2545 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2546 "[%d]:(%p)\n", regionidx_hpa, 2547 (void *)(uintptr_t) 2548 (mem_region_hpa[regionidx_hpa].memory_size)); 2549 ++regionidx_hpa; 2550 } 2551 return regionidx_hpa; 2552 } 2553 2554 /* 2555 * A new device is added to a data core. First the device is added to the main linked list 2556 * and the allocated to a specific data core. 
2557 */ 2558 static int 2559 new_device (struct virtio_net *dev) 2560 { 2561 struct virtio_net_data_ll *ll_dev; 2562 int lcore, core_add = 0; 2563 uint32_t device_num_min = num_devices; 2564 struct vhost_dev *vdev; 2565 uint32_t regionidx; 2566 2567 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2568 if (vdev == NULL) { 2569 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2570 dev->device_fh); 2571 return -1; 2572 } 2573 vdev->dev = dev; 2574 dev->priv = vdev; 2575 2576 if (zero_copy) { 2577 vdev->nregions_hpa = dev->mem->nregions; 2578 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2579 vdev->nregions_hpa 2580 += check_hpa_regions( 2581 dev->mem->regions[regionidx].guest_phys_address 2582 + dev->mem->regions[regionidx].address_offset, 2583 dev->mem->regions[regionidx].memory_size); 2584 2585 } 2586 2587 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2588 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2589 CACHE_LINE_SIZE); 2590 if (vdev->regions_hpa == NULL) { 2591 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2592 rte_free(vdev); 2593 return -1; 2594 } 2595 2596 2597 if (fill_hpa_memory_regions( 2598 vdev->regions_hpa, dev->mem 2599 ) != vdev->nregions_hpa) { 2600 2601 RTE_LOG(ERR, VHOST_CONFIG, 2602 "hpa memory regions number mismatch: " 2603 "[%d]\n", vdev->nregions_hpa); 2604 rte_free(vdev->regions_hpa); 2605 rte_free(vdev); 2606 return -1; 2607 } 2608 } 2609 2610 2611 /* Add device to main ll */ 2612 ll_dev = get_data_ll_free_entry(&ll_root_free); 2613 if (ll_dev == NULL) { 2614 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2615 "of %d devices per core has been reached\n", 2616 dev->device_fh, num_devices); 2617 if (vdev->regions_hpa) 2618 rte_free(vdev->regions_hpa); 2619 rte_free(vdev); 2620 return -1; 2621 } 2622 ll_dev->vdev = vdev; 2623 add_data_ll_entry(&ll_root_used, ll_dev); 2624 vdev->vmdq_rx_q 2625 = dev->device_fh * (num_queues / num_devices); 2626 2627 if (zero_copy) { 2628 uint32_t index = vdev->vmdq_rx_q; 2629 uint32_t count_in_ring, i; 2630 struct mbuf_table *tx_q; 2631 2632 count_in_ring = rte_ring_count(vpool_array[index].ring); 2633 2634 LOG_DEBUG(VHOST_CONFIG, 2635 "(%"PRIu64") in new_device: mbuf count in mempool " 2636 "before attach is: %d\n", 2637 dev->device_fh, 2638 rte_mempool_count(vpool_array[index].pool)); 2639 LOG_DEBUG(VHOST_CONFIG, 2640 "(%"PRIu64") in new_device: mbuf count in ring " 2641 "before attach is : %d\n", 2642 dev->device_fh, count_in_ring); 2643 2644 /* 2645 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
2646 */ 2647 for (i = 0; i < count_in_ring; i++) 2648 attach_rxmbuf_zcp(dev); 2649 2650 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2651 "mempool after attach is: %d\n", 2652 dev->device_fh, 2653 rte_mempool_count(vpool_array[index].pool)); 2654 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2655 "ring after attach is : %d\n", 2656 dev->device_fh, 2657 rte_ring_count(vpool_array[index].ring)); 2658 2659 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2660 tx_q->txq_id = vdev->vmdq_rx_q; 2661 2662 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2663 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2664 2665 LOG_DEBUG(VHOST_CONFIG, 2666 "(%"PRIu64") In new_device: Failed to start " 2667 "tx queue:%d\n", 2668 dev->device_fh, vdev->vmdq_rx_q); 2669 2670 mbuf_destroy_zcp(vpool); 2671 rte_free(vdev->regions_hpa); 2672 rte_free(vdev); 2673 return -1; 2674 } 2675 2676 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2677 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2678 2679 LOG_DEBUG(VHOST_CONFIG, 2680 "(%"PRIu64") In new_device: Failed to start " 2681 "rx queue:%d\n", 2682 dev->device_fh, vdev->vmdq_rx_q); 2683 2684 /* Stop the TX queue. */ 2685 if (rte_eth_dev_tx_queue_stop(ports[0], 2686 vdev->vmdq_rx_q) != 0) { 2687 LOG_DEBUG(VHOST_CONFIG, 2688 "(%"PRIu64") In new_device: Failed to " 2689 "stop tx queue:%d\n", 2690 dev->device_fh, vdev->vmdq_rx_q); 2691 } 2692 2693 mbuf_destroy_zcp(vpool); 2694 rte_free(vdev->regions_hpa); 2695 rte_free(vdev); 2696 return -1; 2697 } 2698 2699 } 2700 2701 /*reset ready flag*/ 2702 vdev->ready = DEVICE_MAC_LEARNING; 2703 vdev->remove = 0; 2704 2705 /* Find a suitable lcore to add the device. */ 2706 RTE_LCORE_FOREACH_SLAVE(lcore) { 2707 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2708 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2709 core_add = lcore; 2710 } 2711 } 2712 /* Add device to lcore ll */ 2713 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2714 if (ll_dev == NULL) { 2715 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2716 vdev->ready = DEVICE_SAFE_REMOVE; 2717 destroy_device(dev); 2718 if (vdev->regions_hpa) 2719 rte_free(vdev->regions_hpa); 2720 rte_free(vdev); 2721 return -1; 2722 } 2723 ll_dev->vdev = vdev; 2724 vdev->coreid = core_add; 2725 2726 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2727 2728 /* Initialize device stats */ 2729 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2730 2731 /* Disable notifications. */ 2732 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2733 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2734 lcore_info[vdev->coreid].lcore_ll->device_num++; 2735 dev->flags |= VIRTIO_DEV_RUNNING; 2736 2737 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2738 2739 return 0; 2740 } 2741 2742 /* 2743 * These callback allow devices to be added to the data core when configuration 2744 * has been fully complete. 2745 */ 2746 static const struct virtio_net_device_ops virtio_net_device_ops = 2747 { 2748 .new_device = new_device, 2749 .destroy_device = destroy_device, 2750 }; 2751 2752 /* 2753 * This is a thread will wake up after a period to print stats if the user has 2754 * enabled them. 
2755 */ 2756 static void 2757 print_stats(void) 2758 { 2759 struct virtio_net_data_ll *dev_ll; 2760 uint64_t tx_dropped, rx_dropped; 2761 uint64_t tx, tx_total, rx, rx_total; 2762 uint32_t device_fh; 2763 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2764 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2765 2766 while(1) { 2767 sleep(enable_stats); 2768 2769 /* Clear screen and move to top left */ 2770 printf("%s%s", clr, top_left); 2771 2772 printf("\nDevice statistics ===================================="); 2773 2774 dev_ll = ll_root_used; 2775 while (dev_ll != NULL) { 2776 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2777 tx_total = dev_statistics[device_fh].tx_total; 2778 tx = dev_statistics[device_fh].tx; 2779 tx_dropped = tx_total - tx; 2780 if (zero_copy == 0) { 2781 rx_total = rte_atomic64_read( 2782 &dev_statistics[device_fh].rx_total_atomic); 2783 rx = rte_atomic64_read( 2784 &dev_statistics[device_fh].rx_atomic); 2785 } else { 2786 rx_total = dev_statistics[device_fh].rx_total; 2787 rx = dev_statistics[device_fh].rx; 2788 } 2789 rx_dropped = rx_total - rx; 2790 2791 printf("\nStatistics for device %"PRIu32" ------------------------------" 2792 "\nTX total: %"PRIu64"" 2793 "\nTX dropped: %"PRIu64"" 2794 "\nTX successful: %"PRIu64"" 2795 "\nRX total: %"PRIu64"" 2796 "\nRX dropped: %"PRIu64"" 2797 "\nRX successful: %"PRIu64"", 2798 device_fh, 2799 tx_total, 2800 tx_dropped, 2801 tx, 2802 rx_total, 2803 rx_dropped, 2804 rx); 2805 2806 dev_ll = dev_ll->next; 2807 } 2808 printf("\n======================================================\n"); 2809 } 2810 } 2811 2812 static void 2813 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2814 char *ring_name, uint32_t nb_mbuf) 2815 { 2816 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2817 vpool_array[index].pool 2818 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2819 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2820 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2821 rte_pktmbuf_init, NULL, socket, 0); 2822 if (vpool_array[index].pool != NULL) { 2823 vpool_array[index].ring 2824 = rte_ring_create(ring_name, 2825 rte_align32pow2(nb_mbuf + 1), 2826 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2827 if (likely(vpool_array[index].ring != NULL)) { 2828 LOG_DEBUG(VHOST_CONFIG, 2829 "in setup_mempool_tbl: mbuf count in " 2830 "mempool is: %d\n", 2831 rte_mempool_count(vpool_array[index].pool)); 2832 LOG_DEBUG(VHOST_CONFIG, 2833 "in setup_mempool_tbl: mbuf count in " 2834 "ring is: %d\n", 2835 rte_ring_count(vpool_array[index].ring)); 2836 } else { 2837 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2838 ring_name); 2839 } 2840 2841 /* Need consider head room. */ 2842 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2843 } else { 2844 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2845 } 2846 } 2847 2848 2849 /* 2850 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2851 * device is also registered here to handle the IOCTLs. 
2852 */ 2853 int 2854 MAIN(int argc, char *argv[]) 2855 { 2856 struct rte_mempool *mbuf_pool = NULL; 2857 unsigned lcore_id, core_id = 0; 2858 unsigned nb_ports, valid_num_ports; 2859 int ret; 2860 uint8_t portid, queue_id = 0; 2861 static pthread_t tid; 2862 2863 /* init EAL */ 2864 ret = rte_eal_init(argc, argv); 2865 if (ret < 0) 2866 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2867 argc -= ret; 2868 argv += ret; 2869 2870 /* parse app arguments */ 2871 ret = us_vhost_parse_args(argc, argv); 2872 if (ret < 0) 2873 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2874 2875 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2876 if (rte_lcore_is_enabled(lcore_id)) 2877 lcore_ids[core_id ++] = lcore_id; 2878 2879 if (rte_lcore_count() > RTE_MAX_LCORE) 2880 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2881 2882 /*set the number of swithcing cores available*/ 2883 num_switching_cores = rte_lcore_count()-1; 2884 2885 /* Get the number of physical ports. */ 2886 nb_ports = rte_eth_dev_count(); 2887 if (nb_ports > RTE_MAX_ETHPORTS) 2888 nb_ports = RTE_MAX_ETHPORTS; 2889 2890 /* 2891 * Update the global var NUM_PORTS and global array PORTS 2892 * and get value of var VALID_NUM_PORTS according to system ports number 2893 */ 2894 valid_num_ports = check_ports_num(nb_ports); 2895 2896 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2897 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2898 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2899 return -1; 2900 } 2901 2902 if (zero_copy == 0) { 2903 /* Create the mbuf pool. */ 2904 mbuf_pool = rte_mempool_create( 2905 "MBUF_POOL", 2906 NUM_MBUFS_PER_PORT 2907 * valid_num_ports, 2908 MBUF_SIZE, MBUF_CACHE_SIZE, 2909 sizeof(struct rte_pktmbuf_pool_private), 2910 rte_pktmbuf_pool_init, NULL, 2911 rte_pktmbuf_init, NULL, 2912 rte_socket_id(), 0); 2913 if (mbuf_pool == NULL) 2914 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2915 2916 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2917 vpool_array[queue_id].pool = mbuf_pool; 2918 2919 if (vm2vm_mode == VM2VM_HARDWARE) { 2920 /* Enable VT loop back to let L2 switch to do it. */ 2921 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2922 LOG_DEBUG(VHOST_CONFIG, 2923 "Enable loop back for L2 switch in vmdq.\n"); 2924 } 2925 } else { 2926 uint32_t nb_mbuf; 2927 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2928 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2929 2930 /* 2931 * Zero copy defers queue RX/TX start to the time when guest 2932 * finishes its startup and packet buffers from that guest are 2933 * available. 
2934 */ 2935 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2936 rx_conf_default.rx_drop_en = 0; 2937 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2938 nb_mbuf = num_rx_descriptor 2939 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2940 + num_switching_cores * MAX_PKT_BURST; 2941 2942 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2943 snprintf(pool_name, sizeof(pool_name), 2944 "rxmbuf_pool_%u", queue_id); 2945 snprintf(ring_name, sizeof(ring_name), 2946 "rxmbuf_ring_%u", queue_id); 2947 setup_mempool_tbl(rte_socket_id(), queue_id, 2948 pool_name, ring_name, nb_mbuf); 2949 } 2950 2951 nb_mbuf = num_tx_descriptor 2952 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2953 + num_switching_cores * MAX_PKT_BURST; 2954 2955 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2956 snprintf(pool_name, sizeof(pool_name), 2957 "txmbuf_pool_%u", queue_id); 2958 snprintf(ring_name, sizeof(ring_name), 2959 "txmbuf_ring_%u", queue_id); 2960 setup_mempool_tbl(rte_socket_id(), 2961 (queue_id + MAX_QUEUES), 2962 pool_name, ring_name, nb_mbuf); 2963 } 2964 2965 if (vm2vm_mode == VM2VM_HARDWARE) { 2966 /* Enable VT loop back to let L2 switch to do it. */ 2967 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2968 LOG_DEBUG(VHOST_CONFIG, 2969 "Enable loop back for L2 switch in vmdq.\n"); 2970 } 2971 } 2972 /* Set log level. */ 2973 rte_set_log_level(LOG_LEVEL); 2974 2975 /* initialize all ports */ 2976 for (portid = 0; portid < nb_ports; portid++) { 2977 /* skip ports that are not enabled */ 2978 if ((enabled_port_mask & (1 << portid)) == 0) { 2979 RTE_LOG(INFO, VHOST_PORT, 2980 "Skipping disabled port %d\n", portid); 2981 continue; 2982 } 2983 if (port_init(portid) != 0) 2984 rte_exit(EXIT_FAILURE, 2985 "Cannot initialize network ports\n"); 2986 } 2987 2988 /* Initialise all linked lists. */ 2989 if (init_data_ll() == -1) 2990 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2991 2992 /* Initialize device stats */ 2993 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2994 2995 /* Enable stats if the user option is set. */ 2996 if (enable_stats) 2997 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2998 2999 /* Launch all data cores. */ 3000 if (zero_copy == 0) { 3001 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3002 rte_eal_remote_launch(switch_worker, 3003 mbuf_pool, lcore_id); 3004 } 3005 } else { 3006 uint32_t count_in_mempool, index, i; 3007 for (index = 0; index < 2*MAX_QUEUES; index++) { 3008 /* For all RX and TX queues. */ 3009 count_in_mempool 3010 = rte_mempool_count(vpool_array[index].pool); 3011 3012 /* 3013 * Transfer all un-attached mbufs from vpool.pool 3014 * to vpoo.ring. 3015 */ 3016 for (i = 0; i < count_in_mempool; i++) { 3017 struct rte_mbuf *mbuf 3018 = __rte_mbuf_raw_alloc( 3019 vpool_array[index].pool); 3020 rte_ring_sp_enqueue(vpool_array[index].ring, 3021 (void *)mbuf); 3022 } 3023 3024 LOG_DEBUG(VHOST_CONFIG, 3025 "in MAIN: mbuf count in mempool at initial " 3026 "is: %d\n", count_in_mempool); 3027 LOG_DEBUG(VHOST_CONFIG, 3028 "in MAIN: mbuf count in ring at initial is :" 3029 " %d\n", 3030 rte_ring_count(vpool_array[index].ring)); 3031 } 3032 3033 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3034 rte_eal_remote_launch(switch_worker_zcp, NULL, 3035 lcore_id); 3036 } 3037 3038 if (mergeable == 0) 3039 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3040 3041 /* Register CUSE device to handle IOCTLs. 
*/ 3042 ret = rte_vhost_driver_register((char *)&dev_basename); 3043 if (ret != 0) 3044 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3045 3046 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3047 3048 /* Start CUSE session. */ 3049 rte_vhost_driver_session_start(); 3050 return 0; 3051 3052 } 3053 3054
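/*
 * Illustrative sketch (not part of the original sample): the zero-copy RX
 * path in virtio_dev_rx_zcp() fills the used ring directly from the head
 * indexes stashed in the mbuf headroom. The hypothetical helper below
 * restates that pattern in isolation: write the id/len pairs, issue a
 * compiler barrier so the entries are visible before the index, publish
 * used->idx, and kick the guest only if it has not suppressed interrupts.
 * The function name and parameters are assumptions made for the sketch.
 */
static inline void
used_ring_publish_sketch(struct vhost_virtqueue *vq, const uint32_t *heads,
	const uint32_t *lens, uint16_t count)
{
	uint16_t used_idx = vq->last_used_idx;
	uint16_t i;

	for (i = 0; i < count; i++) {
		/* The vring size is a power of two, so mask instead of modulo. */
		vq->used->ring[used_idx & (vq->size - 1)].id = heads[i];
		vq->used->ring[used_idx & (vq->size - 1)].len = lens[i];
		used_idx++;
	}

	/* Make sure the ring entries are written before the index is published. */
	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx = used_idx;

	/* Notify the guest unless it has disabled interrupts for this ring. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}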
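/*
 * Illustrative sketch (assumed helper, not in the original sample):
 * virtio_dev_tx_zcp() avoids copying guest frames by pointing a stack mbuf
 * straight at the translated guest buffer; the frame is only copied later if
 * it crosses a physical sub-region. The helper below restates that setup,
 * using only the fields the sample itself touches.
 */
static inline void
dummy_mbuf_setup_sketch(struct rte_mbuf *m, uint64_t buff_addr,
	uint64_t phys_addr, uint16_t len)
{
	/* Single-segment mbuf whose data buffer lives in guest memory. */
	m->data_len = len;
	m->nb_segs = 1;
	m->next = NULL;
	m->data_off = 0;
	m->buf_addr = (void *)(uintptr_t)buff_addr;
	m->buf_physaddr = phys_addr;
}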
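/*
 * Illustrative sketch (assumed helper): both virtio_tx_route_zcp() and the
 * timeout path in switch_worker_zcp() use the same drain idiom: hand the
 * staged mbuf table to the PMD and free whatever the hardware queue did not
 * accept, so no buffer is leaked when the TX ring is full.
 */
static inline void
tx_drain_sketch(uint8_t port_id, uint16_t txq_id, struct rte_mbuf **m_table,
	uint16_t len)
{
	uint16_t sent;

	if (len == 0)
		return;

	sent = rte_eth_tx_burst(port_id, txq_id, m_table, len);

	/* Free the packets the PMD could not enqueue. */
	while (sent < len)
		rte_pktmbuf_free(m_table[sent++]);
}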
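/*
 * Illustrative sketch (assumed helper): switch_worker_zcp() converts
 * BURST_TX_DRAIN_US into TSC cycles once and then compares rte_rdtsc()
 * deltas against it, keeping the hot loop free of divisions. A minimal
 * restatement of that timing check:
 */
static inline int
drain_timer_expired_sketch(uint64_t *prev_tsc)
{
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
		/ US_PER_S * BURST_TX_DRAIN_US;
	uint64_t cur_tsc = rte_rdtsc();

	if (cur_tsc - *prev_tsc > drain_tsc) {
		*prev_tsc = cur_tsc;
		return 1; /* time to drain the staged TX queues */
	}

	return 0;
}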
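/*
 * Illustrative sketch (assumed helper): how the linked-list helpers above
 * are meant to be combined. An entry is taken from a free list, published on
 * a used list, and later returned, locating its predecessor first exactly as
 * destroy_device() does. 'free_root' and 'used_root' are placeholder list
 * heads, not the sample's globals.
 */
static inline void
ll_move_entry_sketch(struct virtio_net_data_ll **free_root,
	struct virtio_net_data_ll **used_root, struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *entry, *cur, *last = NULL;

	/* Take an entry off the free list and attach the device. */
	entry = get_data_ll_free_entry(free_root);
	if (entry == NULL)
		return;
	entry->vdev = vdev;
	add_data_ll_entry(used_root, entry);

	/* Later: find the entry and its predecessor, then recycle it. */
	cur = *used_root;
	while ((cur != NULL) && (cur != entry)) {
		last = cur;
		cur = cur->next;
	}
	if (cur != NULL) {
		rm_data_ll_entry(used_root, cur, last);
		put_data_ll_free_entry(free_root, cur);
	}
}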
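/*
 * Illustrative sketch (assumed helper): the core test behind
 * check_hpa_regions() and fill_hpa_memory_regions(). Two virtually adjacent
 * pages belong to the same host-physical sub-region only when their physical
 * addresses are exactly one page apart.
 */
static inline int
pages_physically_adjacent_sketch(uint64_t vva, uint32_t page_size)
{
	phys_addr_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
	phys_addr_t next = rte_mem_virt2phy(
		(void *)(uintptr_t)(vva + page_size));

	return (cur + page_size) == next;
}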
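/*
 * Illustrative sketch (assumed helper): at start-up MAIN() moves every free
 * mbuf of a zero-copy pool onto the companion ring so that the RX path can
 * later attach guest buffers to them. This sketch uses the public
 * rte_pktmbuf_alloc() instead of the internal __rte_mbuf_raw_alloc() call
 * the sample relies on, which also resets the mbuf; the recycling idea is
 * the same.
 */
static inline void
park_free_mbufs_sketch(struct vpool *vp)
{
	uint32_t i, count = rte_mempool_count(vp->pool);

	for (i = 0; i < count; i++) {
		struct rte_mbuf *m = rte_pktmbuf_alloc(vp->pool);

		if (m != NULL)
			rte_ring_sp_enqueue(vp->ring, (void *)m);
	}
}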
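/*
 * Illustrative sketch (assumed usage): creating one extra RX-side pool/ring
 * pair with setup_mempool_tbl(), in the same style as the loops in MAIN().
 * The names, the spare index and the mbuf count are placeholders chosen for
 * the sketch; setup_mempool_tbl() itself terminates the program on failure.
 */
static inline void
setup_one_zcp_queue_sketch(uint32_t spare_index)
{
	char pool_name[RTE_MEMPOOL_NAMESIZE];
	char ring_name[RTE_MEMPOOL_NAMESIZE];
	uint32_t nb_mbuf = RTE_TEST_RX_DESC_DEFAULT_ZCP + MAX_PKT_BURST;

	snprintf(pool_name, sizeof(pool_name), "rxmbuf_pool_sketch_%u",
		spare_index);
	snprintf(ring_name, sizeof(ring_name), "rxmbuf_ring_sketch_%u",
		spare_index);
	setup_mempool_tbl(rte_socket_id(), spare_index, pool_name, ring_name,
		nb_mbuf);
}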