1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 128 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 /* 83 * RX and TX Prefetch, Host, and Write-back threshold values should be 84 * carefully set for optimal performance. Consult the network 85 * controller's datasheet and supporting DPDK documentation for guidance 86 * on how these parameters should be set. 87 */ 88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */ 89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */ 90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */ 91 92 /* 93 * These default values are optimized for use with the Intel(R) 82599 10 GbE 94 * Controller and the DPDK ixgbe PMD. Consider using other values for other 95 * network controllers and/or network drivers. 96 */ 97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */ 98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */ 99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */ 100 101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 102 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 103 104 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 105 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 106 107 #define JUMBO_FRAME_MAX_SIZE 0x2600 108 109 /* State of virtio device. */ 110 #define DEVICE_MAC_LEARNING 0 111 #define DEVICE_RX 1 112 #define DEVICE_SAFE_REMOVE 2 113 114 /* Config_core_flag status definitions. */ 115 #define REQUEST_DEV_REMOVAL 1 116 #define ACK_DEV_REMOVAL 0 117 118 /* Configurable number of RX/TX ring descriptors */ 119 #define RTE_TEST_RX_DESC_DEFAULT 1024 120 #define RTE_TEST_TX_DESC_DEFAULT 512 121 122 /* 123 * Need refine these 2 macros for legacy and DPDK based front end: 124 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 125 * And then adjust power 2. 126 */ 127 /* 128 * For legacy front end, 128 descriptors, 129 * half for virtio header, another half for mbuf. 130 */ 131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 133 134 /* Get first 4 bytes in mbuf headroom. */ 135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 136 + sizeof(struct rte_mbuf))) 137 138 /* true if x is a power of 2 */ 139 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 140 141 #define INVALID_PORT_ID 0xFF 142 143 /* Max number of devices. Limited by vmdq. */ 144 #define MAX_DEVICES 64 145 146 /* Size of buffers used for snprintfs. */ 147 #define MAX_PRINT_BUFF 6072 148 149 /* Maximum character device basename size. */ 150 #define MAX_BASENAME_SZ 10 151 152 /* Maximum long option length for option parsing. */ 153 #define MAX_LONG_OPT_SZ 64 154 155 /* Used to compare MAC addresses. */ 156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 157 158 /* Number of descriptors per cacheline. */ 159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc)) 160 161 /* mask of enabled ports */ 162 static uint32_t enabled_port_mask = 0; 163 164 /* Promiscuous mode */ 165 static uint32_t promiscuous; 166 167 /*Number of switching cores enabled*/ 168 static uint32_t num_switching_cores = 0; 169 170 /* number of devices/queues to support*/ 171 static uint32_t num_queues = 0; 172 static uint32_t num_devices; 173 174 /* 175 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 176 * disabled on default. 177 */ 178 static uint32_t zero_copy; 179 static int mergeable; 180 181 /* number of descriptors to apply*/ 182 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 183 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 184 185 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 186 #define MAX_RING_DESC 4096 187 188 struct vpool { 189 struct rte_mempool *pool; 190 struct rte_ring *ring; 191 uint32_t buf_size; 192 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 193 194 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 195 typedef enum { 196 VM2VM_DISABLED = 0, 197 VM2VM_SOFTWARE = 1, 198 VM2VM_HARDWARE = 2, 199 VM2VM_LAST 200 } vm2vm_type; 201 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 202 203 /* The type of host physical address translated from guest physical address. */ 204 typedef enum { 205 PHYS_ADDR_CONTINUOUS = 0, 206 PHYS_ADDR_CROSS_SUBREG = 1, 207 PHYS_ADDR_INVALID = 2, 208 PHYS_ADDR_LAST 209 } hpa_type; 210 211 /* Enable stats. */ 212 static uint32_t enable_stats = 0; 213 /* Enable retries on RX. */ 214 static uint32_t enable_retry = 1; 215 /* Specify timeout (in useconds) between retries on RX. */ 216 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 217 /* Specify the number of retries on RX. */ 218 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 219 220 /* Character device basename. Can be set by user. */ 221 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; 222 223 224 /* Default configuration for rx and tx thresholds etc. */ 225 static struct rte_eth_rxconf rx_conf_default = { 226 .rx_thresh = { 227 .pthresh = RX_PTHRESH, 228 .hthresh = RX_HTHRESH, 229 .wthresh = RX_WTHRESH, 230 }, 231 .rx_drop_en = 1, 232 }; 233 234 /* 235 * These default values are optimized for use with the Intel(R) 82599 10 GbE 236 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other 237 * network controllers and/or network drivers. 238 */ 239 static struct rte_eth_txconf tx_conf_default = { 240 .tx_thresh = { 241 .pthresh = TX_PTHRESH, 242 .hthresh = TX_HTHRESH, 243 .wthresh = TX_WTHRESH, 244 }, 245 .tx_free_thresh = 0, /* Use PMD default values */ 246 .tx_rs_thresh = 0, /* Use PMD default values */ 247 }; 248 249 /* empty vmdq configuration structure. Filled in programatically */ 250 static struct rte_eth_conf vmdq_conf_default = { 251 .rxmode = { 252 .mq_mode = ETH_MQ_RX_VMDQ_ONLY, 253 .split_hdr_size = 0, 254 .header_split = 0, /**< Header Split disabled */ 255 .hw_ip_checksum = 0, /**< IP checksum offload disabled */ 256 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 257 /* 258 * It is necessary for 1G NIC such as I350, 259 * this fixes bug of ipv4 forwarding in guest can't 260 * forward pakets from one virtio dev to another virtio dev. 261 */ 262 .hw_vlan_strip = 1, /**< VLAN strip enabled. */ 263 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 264 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 265 }, 266 267 .txmode = { 268 .mq_mode = ETH_MQ_TX_NONE, 269 }, 270 .rx_adv_conf = { 271 /* 272 * should be overridden separately in code with 273 * appropriate values 274 */ 275 .vmdq_rx_conf = { 276 .nb_queue_pools = ETH_8_POOLS, 277 .enable_default_pool = 0, 278 .default_pool = 0, 279 .nb_pool_maps = 0, 280 .pool_map = {{0, 0},}, 281 }, 282 }, 283 }; 284 285 static unsigned lcore_ids[RTE_MAX_LCORE]; 286 static uint8_t ports[RTE_MAX_ETHPORTS]; 287 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 288 289 static const uint16_t external_pkt_default_vlan_tag = 2000; 290 const uint16_t vlan_tags[] = { 291 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 292 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 293 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 294 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 295 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 296 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 297 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 298 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 299 }; 300 301 /* ethernet addresses of ports */ 302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 303 304 /* heads for the main used and free linked lists for the data path. */ 305 static struct virtio_net_data_ll *ll_root_used = NULL; 306 static struct virtio_net_data_ll *ll_root_free = NULL; 307 308 /* Array of data core structures containing information on individual core linked lists. */ 309 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 310 311 /* Used for queueing bursts of TX packets. */ 312 struct mbuf_table { 313 unsigned len; 314 unsigned txq_id; 315 struct rte_mbuf *m_table[MAX_PKT_BURST]; 316 }; 317 318 /* TX queue for each data core. */ 319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 320 321 /* TX queue fori each virtio device for zero copy. */ 322 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 323 324 /* Vlan header struct used to insert vlan tags on TX. */ 325 struct vlan_ethhdr { 326 unsigned char h_dest[ETH_ALEN]; 327 unsigned char h_source[ETH_ALEN]; 328 __be16 h_vlan_proto; 329 __be16 h_vlan_TCI; 330 __be16 h_vlan_encapsulated_proto; 331 }; 332 333 /* IPv4 Header */ 334 struct ipv4_hdr { 335 uint8_t version_ihl; /**< version and header length */ 336 uint8_t type_of_service; /**< type of service */ 337 uint16_t total_length; /**< length of packet */ 338 uint16_t packet_id; /**< packet ID */ 339 uint16_t fragment_offset; /**< fragmentation offset */ 340 uint8_t time_to_live; /**< time to live */ 341 uint8_t next_proto_id; /**< protocol ID */ 342 uint16_t hdr_checksum; /**< header checksum */ 343 uint32_t src_addr; /**< source address */ 344 uint32_t dst_addr; /**< destination address */ 345 } __attribute__((__packed__)); 346 347 /* Header lengths. */ 348 #define VLAN_HLEN 4 349 #define VLAN_ETH_HLEN 18 350 351 /* Per-device statistics struct */ 352 struct device_statistics { 353 uint64_t tx_total; 354 rte_atomic64_t rx_total_atomic; 355 uint64_t rx_total; 356 uint64_t tx; 357 rte_atomic64_t rx_atomic; 358 uint64_t rx; 359 } __rte_cache_aligned; 360 struct device_statistics dev_statistics[MAX_DEVICES]; 361 362 /* 363 * Builds up the correct configuration for VMDQ VLAN pool map 364 * according to the pool & queue limits. 365 */ 366 static inline int 367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 368 { 369 struct rte_eth_vmdq_rx_conf conf; 370 struct rte_eth_vmdq_rx_conf *def_conf = 371 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 372 unsigned i; 373 374 memset(&conf, 0, sizeof(conf)); 375 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 376 conf.nb_pool_maps = num_devices; 377 conf.enable_loop_back = def_conf->enable_loop_back; 378 conf.rx_mode = def_conf->rx_mode; 379 380 for (i = 0; i < conf.nb_pool_maps; i++) { 381 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 382 conf.pool_map[i].pools = (1UL << i); 383 } 384 385 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 386 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, 387 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 388 return 0; 389 } 390 391 /* 392 * Validate the device number according to the max pool number gotten form 393 * dev_info. If the device number is invalid, give the error message and 394 * return -1. Each device must have its own pool. 395 */ 396 static inline int 397 validate_num_devices(uint32_t max_nb_devices) 398 { 399 if (num_devices > max_nb_devices) { 400 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 401 return -1; 402 } 403 return 0; 404 } 405 406 /* 407 * Initialises a given port using global settings and with the rx buffers 408 * coming from the mbuf_pool passed as parameter 409 */ 410 static inline int 411 port_init(uint8_t port) 412 { 413 struct rte_eth_dev_info dev_info; 414 struct rte_eth_conf port_conf; 415 uint16_t rx_rings, tx_rings; 416 uint16_t rx_ring_size, tx_ring_size; 417 int retval; 418 uint16_t q; 419 420 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 421 rte_eth_dev_info_get (port, &dev_info); 422 423 /*configure the number of supported virtio devices based on VMDQ limits */ 424 num_devices = dev_info.max_vmdq_pools; 425 num_queues = dev_info.max_rx_queues; 426 427 if (zero_copy) { 428 rx_ring_size = num_rx_descriptor; 429 tx_ring_size = num_tx_descriptor; 430 tx_rings = dev_info.max_tx_queues; 431 } else { 432 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 433 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 434 tx_rings = (uint16_t)rte_lcore_count(); 435 } 436 437 retval = validate_num_devices(MAX_DEVICES); 438 if (retval < 0) 439 return retval; 440 441 /* Get port configuration. */ 442 retval = get_eth_conf(&port_conf, num_devices); 443 if (retval < 0) 444 return retval; 445 446 if (port >= rte_eth_dev_count()) return -1; 447 448 rx_rings = (uint16_t)num_queues, 449 /* Configure ethernet device. */ 450 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 451 if (retval != 0) 452 return retval; 453 454 /* Setup the queues. */ 455 for (q = 0; q < rx_rings; q ++) { 456 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 457 rte_eth_dev_socket_id(port), &rx_conf_default, 458 vpool_array[q].pool); 459 if (retval < 0) 460 return retval; 461 } 462 for (q = 0; q < tx_rings; q ++) { 463 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 464 rte_eth_dev_socket_id(port), &tx_conf_default); 465 if (retval < 0) 466 return retval; 467 } 468 469 /* Start the device. */ 470 retval = rte_eth_dev_start(port); 471 if (retval < 0) { 472 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 473 return retval; 474 } 475 476 if (promiscuous) 477 rte_eth_promiscuous_enable(port); 478 479 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 480 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 481 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 482 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 483 (unsigned)port, 484 vmdq_ports_eth_addr[port].addr_bytes[0], 485 vmdq_ports_eth_addr[port].addr_bytes[1], 486 vmdq_ports_eth_addr[port].addr_bytes[2], 487 vmdq_ports_eth_addr[port].addr_bytes[3], 488 vmdq_ports_eth_addr[port].addr_bytes[4], 489 vmdq_ports_eth_addr[port].addr_bytes[5]); 490 491 return 0; 492 } 493 494 /* 495 * Set character device basename. 496 */ 497 static int 498 us_vhost_parse_basename(const char *q_arg) 499 { 500 /* parse number string */ 501 502 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 503 return -1; 504 else 505 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 506 507 return 0; 508 } 509 510 /* 511 * Parse the portmask provided at run time. 512 */ 513 static int 514 parse_portmask(const char *portmask) 515 { 516 char *end = NULL; 517 unsigned long pm; 518 519 errno = 0; 520 521 /* parse hexadecimal string */ 522 pm = strtoul(portmask, &end, 16); 523 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 524 return -1; 525 526 if (pm == 0) 527 return -1; 528 529 return pm; 530 531 } 532 533 /* 534 * Parse num options at run time. 535 */ 536 static int 537 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 538 { 539 char *end = NULL; 540 unsigned long num; 541 542 errno = 0; 543 544 /* parse unsigned int string */ 545 num = strtoul(q_arg, &end, 10); 546 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 547 return -1; 548 549 if (num > max_valid_value) 550 return -1; 551 552 return num; 553 554 } 555 556 /* 557 * Display usage 558 */ 559 static void 560 us_vhost_usage(const char *prgname) 561 { 562 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 563 " --vm2vm [0|1|2]\n" 564 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 565 " --dev-basename <name>\n" 566 " --nb-devices ND\n" 567 " -p PORTMASK: Set mask for ports to be used by application\n" 568 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 569 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 570 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 571 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 572 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 573 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 574 " --dev-basename: The basename to be used for the character device.\n" 575 " --zero-copy [0|1]: disable(default)/enable rx/tx " 576 "zero copy\n" 577 " --rx-desc-num [0-N]: the number of descriptors on rx, " 578 "used only when zero copy is enabled.\n" 579 " --tx-desc-num [0-N]: the number of descriptors on tx, " 580 "used only when zero copy is enabled.\n", 581 prgname); 582 } 583 584 /* 585 * Parse the arguments given in the command line of the application. 586 */ 587 static int 588 us_vhost_parse_args(int argc, char **argv) 589 { 590 int opt, ret; 591 int option_index; 592 unsigned i; 593 const char *prgname = argv[0]; 594 static struct option long_option[] = { 595 {"vm2vm", required_argument, NULL, 0}, 596 {"rx-retry", required_argument, NULL, 0}, 597 {"rx-retry-delay", required_argument, NULL, 0}, 598 {"rx-retry-num", required_argument, NULL, 0}, 599 {"mergeable", required_argument, NULL, 0}, 600 {"stats", required_argument, NULL, 0}, 601 {"dev-basename", required_argument, NULL, 0}, 602 {"zero-copy", required_argument, NULL, 0}, 603 {"rx-desc-num", required_argument, NULL, 0}, 604 {"tx-desc-num", required_argument, NULL, 0}, 605 {NULL, 0, 0, 0}, 606 }; 607 608 /* Parse command line */ 609 while ((opt = getopt_long(argc, argv, "p:P", 610 long_option, &option_index)) != EOF) { 611 switch (opt) { 612 /* Portmask */ 613 case 'p': 614 enabled_port_mask = parse_portmask(optarg); 615 if (enabled_port_mask == 0) { 616 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 617 us_vhost_usage(prgname); 618 return -1; 619 } 620 break; 621 622 case 'P': 623 promiscuous = 1; 624 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 625 ETH_VMDQ_ACCEPT_BROADCAST | 626 ETH_VMDQ_ACCEPT_MULTICAST; 627 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 628 629 break; 630 631 case 0: 632 /* Enable/disable vm2vm comms. */ 633 if (!strncmp(long_option[option_index].name, "vm2vm", 634 MAX_LONG_OPT_SZ)) { 635 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 636 if (ret == -1) { 637 RTE_LOG(INFO, VHOST_CONFIG, 638 "Invalid argument for " 639 "vm2vm [0|1|2]\n"); 640 us_vhost_usage(prgname); 641 return -1; 642 } else { 643 vm2vm_mode = (vm2vm_type)ret; 644 } 645 } 646 647 /* Enable/disable retries on RX. */ 648 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 649 ret = parse_num_opt(optarg, 1); 650 if (ret == -1) { 651 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 652 us_vhost_usage(prgname); 653 return -1; 654 } else { 655 enable_retry = ret; 656 } 657 } 658 659 /* Specify the retries delay time (in useconds) on RX. */ 660 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 661 ret = parse_num_opt(optarg, INT32_MAX); 662 if (ret == -1) { 663 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 664 us_vhost_usage(prgname); 665 return -1; 666 } else { 667 burst_rx_delay_time = ret; 668 } 669 } 670 671 /* Specify the retries number on RX. */ 672 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 673 ret = parse_num_opt(optarg, INT32_MAX); 674 if (ret == -1) { 675 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 676 us_vhost_usage(prgname); 677 return -1; 678 } else { 679 burst_rx_retry_num = ret; 680 } 681 } 682 683 /* Enable/disable RX mergeable buffers. */ 684 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 685 ret = parse_num_opt(optarg, 1); 686 if (ret == -1) { 687 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 688 us_vhost_usage(prgname); 689 return -1; 690 } else { 691 mergeable = !!ret; 692 if (ret) { 693 vmdq_conf_default.rxmode.jumbo_frame = 1; 694 vmdq_conf_default.rxmode.max_rx_pkt_len 695 = JUMBO_FRAME_MAX_SIZE; 696 } 697 } 698 } 699 700 /* Enable/disable stats. */ 701 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 702 ret = parse_num_opt(optarg, INT32_MAX); 703 if (ret == -1) { 704 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 705 us_vhost_usage(prgname); 706 return -1; 707 } else { 708 enable_stats = ret; 709 } 710 } 711 712 /* Set character device basename. */ 713 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 714 if (us_vhost_parse_basename(optarg) == -1) { 715 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 716 us_vhost_usage(prgname); 717 return -1; 718 } 719 } 720 721 /* Enable/disable rx/tx zero copy. */ 722 if (!strncmp(long_option[option_index].name, 723 "zero-copy", MAX_LONG_OPT_SZ)) { 724 ret = parse_num_opt(optarg, 1); 725 if (ret == -1) { 726 RTE_LOG(INFO, VHOST_CONFIG, 727 "Invalid argument" 728 " for zero-copy [0|1]\n"); 729 us_vhost_usage(prgname); 730 return -1; 731 } else 732 zero_copy = ret; 733 734 if (zero_copy) { 735 #ifdef RTE_MBUF_REFCNT 736 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 737 "zero copy vhost APP, please " 738 "disable RTE_MBUF_REFCNT\n" 739 "in config file and then rebuild DPDK " 740 "core lib!\n" 741 "Otherwise please disable zero copy " 742 "flag in command line!\n"); 743 return -1; 744 #endif 745 } 746 } 747 748 /* Specify the descriptor number on RX. */ 749 if (!strncmp(long_option[option_index].name, 750 "rx-desc-num", MAX_LONG_OPT_SZ)) { 751 ret = parse_num_opt(optarg, MAX_RING_DESC); 752 if ((ret == -1) || (!POWEROF2(ret))) { 753 RTE_LOG(INFO, VHOST_CONFIG, 754 "Invalid argument for rx-desc-num[0-N]," 755 "power of 2 required.\n"); 756 us_vhost_usage(prgname); 757 return -1; 758 } else { 759 num_rx_descriptor = ret; 760 } 761 } 762 763 /* Specify the descriptor number on TX. */ 764 if (!strncmp(long_option[option_index].name, 765 "tx-desc-num", MAX_LONG_OPT_SZ)) { 766 ret = parse_num_opt(optarg, MAX_RING_DESC); 767 if ((ret == -1) || (!POWEROF2(ret))) { 768 RTE_LOG(INFO, VHOST_CONFIG, 769 "Invalid argument for tx-desc-num [0-N]," 770 "power of 2 required.\n"); 771 us_vhost_usage(prgname); 772 return -1; 773 } else { 774 num_tx_descriptor = ret; 775 } 776 } 777 778 break; 779 780 /* Invalid option - print options. */ 781 default: 782 us_vhost_usage(prgname); 783 return -1; 784 } 785 } 786 787 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 788 if (enabled_port_mask & (1 << i)) 789 ports[num_ports++] = (uint8_t)i; 790 } 791 792 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 793 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 794 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 795 return -1; 796 } 797 798 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 799 RTE_LOG(INFO, VHOST_PORT, 800 "Vhost zero copy doesn't support software vm2vm," 801 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 802 return -1; 803 } 804 805 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 806 RTE_LOG(INFO, VHOST_PORT, 807 "Vhost zero copy doesn't support jumbo frame," 808 "please specify '--mergeable 0' to disable the " 809 "mergeable feature.\n"); 810 return -1; 811 } 812 813 return 0; 814 } 815 816 /* 817 * Update the global var NUM_PORTS and array PORTS according to system ports number 818 * and return valid ports number 819 */ 820 static unsigned check_ports_num(unsigned nb_ports) 821 { 822 unsigned valid_num_ports = num_ports; 823 unsigned portid; 824 825 if (num_ports > nb_ports) { 826 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 827 num_ports, nb_ports); 828 num_ports = nb_ports; 829 } 830 831 for (portid = 0; portid < num_ports; portid ++) { 832 if (ports[portid] >= nb_ports) { 833 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 834 ports[portid], (nb_ports - 1)); 835 ports[portid] = INVALID_PORT_ID; 836 valid_num_ports--; 837 } 838 } 839 return valid_num_ports; 840 } 841 842 /* 843 * Macro to print out packet contents. Wrapped in debug define so that the 844 * data path is not effected when debug is disabled. 845 */ 846 #ifdef DEBUG 847 #define PRINT_PACKET(device, addr, size, header) do { \ 848 char *pkt_addr = (char*)(addr); \ 849 unsigned int index; \ 850 char packet[MAX_PRINT_BUFF]; \ 851 \ 852 if ((header)) \ 853 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 854 else \ 855 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 856 for (index = 0; index < (size); index++) { \ 857 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 858 "%02hhx ", pkt_addr[index]); \ 859 } \ 860 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 861 \ 862 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 863 } while(0) 864 #else 865 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 866 #endif 867 868 /* 869 * Function to convert guest physical addresses to vhost physical addresses. 870 * This is used to convert virtio buffer addresses. 871 */ 872 static inline uint64_t __attribute__((always_inline)) 873 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 874 uint32_t buf_len, hpa_type *addr_type) 875 { 876 struct virtio_memory_regions_hpa *region; 877 uint32_t regionidx; 878 uint64_t vhost_pa = 0; 879 880 *addr_type = PHYS_ADDR_INVALID; 881 882 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 883 region = &vdev->regions_hpa[regionidx]; 884 if ((guest_pa >= region->guest_phys_address) && 885 (guest_pa <= region->guest_phys_address_end)) { 886 vhost_pa = region->host_phys_addr_offset + guest_pa; 887 if (likely((guest_pa + buf_len - 1) 888 <= region->guest_phys_address_end)) 889 *addr_type = PHYS_ADDR_CONTINUOUS; 890 else 891 *addr_type = PHYS_ADDR_CROSS_SUBREG; 892 break; 893 } 894 } 895 896 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 897 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 898 (void *)(uintptr_t)vhost_pa); 899 900 return vhost_pa; 901 } 902 903 /* 904 * Compares a packet destination MAC address to a device MAC address. 905 */ 906 static inline int __attribute__((always_inline)) 907 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 908 { 909 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 910 } 911 912 /* 913 * This function learns the MAC address of the device and registers this along with a 914 * vlan tag to a VMDQ. 915 */ 916 static int 917 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 918 { 919 struct ether_hdr *pkt_hdr; 920 struct virtio_net_data_ll *dev_ll; 921 struct virtio_net *dev = vdev->dev; 922 int i, ret; 923 924 /* Learn MAC address of guest device from packet */ 925 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 926 927 dev_ll = ll_root_used; 928 929 while (dev_ll != NULL) { 930 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 931 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 932 return -1; 933 } 934 dev_ll = dev_ll->next; 935 } 936 937 for (i = 0; i < ETHER_ADDR_LEN; i++) 938 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 939 940 /* vlan_tag currently uses the device_id. */ 941 vdev->vlan_tag = vlan_tags[dev->device_fh]; 942 943 /* Print out VMDQ registration info. */ 944 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 945 dev->device_fh, 946 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 947 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 948 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 949 vdev->vlan_tag); 950 951 /* Register the MAC address. */ 952 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 953 if (ret) 954 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 955 dev->device_fh); 956 957 /* Enable stripping of the vlan tag as we handle routing. */ 958 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 959 960 /* Set device as ready for RX. */ 961 vdev->ready = DEVICE_RX; 962 963 return 0; 964 } 965 966 /* 967 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 968 * queue before disabling RX on the device. 969 */ 970 static inline void 971 unlink_vmdq(struct vhost_dev *vdev) 972 { 973 unsigned i = 0; 974 unsigned rx_count; 975 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 976 977 if (vdev->ready == DEVICE_RX) { 978 /*clear MAC and VLAN settings*/ 979 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 980 for (i = 0; i < 6; i++) 981 vdev->mac_address.addr_bytes[i] = 0; 982 983 vdev->vlan_tag = 0; 984 985 /*Clear out the receive buffers*/ 986 rx_count = rte_eth_rx_burst(ports[0], 987 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 988 989 while (rx_count) { 990 for (i = 0; i < rx_count; i++) 991 rte_pktmbuf_free(pkts_burst[i]); 992 993 rx_count = rte_eth_rx_burst(ports[0], 994 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 995 } 996 997 vdev->ready = DEVICE_MAC_LEARNING; 998 } 999 } 1000 1001 /* 1002 * Check if the packet destination MAC address is for a local device. If so then put 1003 * the packet on that devices RX queue. If not then return. 1004 */ 1005 static inline int __attribute__((always_inline)) 1006 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1007 { 1008 struct virtio_net_data_ll *dev_ll; 1009 struct ether_hdr *pkt_hdr; 1010 uint64_t ret = 0; 1011 struct virtio_net *dev = vdev->dev; 1012 struct virtio_net *tdev; /* destination virito device */ 1013 1014 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1015 1016 /*get the used devices list*/ 1017 dev_ll = ll_root_used; 1018 1019 while (dev_ll != NULL) { 1020 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1021 &dev_ll->vdev->mac_address)) { 1022 1023 /* Drop the packet if the TX packet is destined for the TX device. */ 1024 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1025 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1026 dev->device_fh); 1027 return 0; 1028 } 1029 tdev = dev_ll->vdev->dev; 1030 1031 1032 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1033 1034 if (unlikely(dev_ll->vdev->remove)) { 1035 /*drop the packet if the device is marked for removal*/ 1036 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1037 } else { 1038 /*send the packet to the local virtio device*/ 1039 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1040 if (enable_stats) { 1041 rte_atomic64_add( 1042 &dev_statistics[tdev->device_fh].rx_total_atomic, 1043 1); 1044 rte_atomic64_add( 1045 &dev_statistics[tdev->device_fh].rx_atomic, 1046 ret); 1047 dev_statistics[tdev->device_fh].tx_total++; 1048 dev_statistics[tdev->device_fh].tx += ret; 1049 } 1050 } 1051 1052 return 0; 1053 } 1054 dev_ll = dev_ll->next; 1055 } 1056 1057 return -1; 1058 } 1059 1060 /* 1061 * Check if the destination MAC of a packet is one local VM, 1062 * and get its vlan tag, and offset if it is. 1063 */ 1064 static inline int __attribute__((always_inline)) 1065 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1066 uint32_t *offset, uint16_t *vlan_tag) 1067 { 1068 struct virtio_net_data_ll *dev_ll = ll_root_used; 1069 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1070 1071 while (dev_ll != NULL) { 1072 if ((dev_ll->vdev->ready == DEVICE_RX) 1073 && ether_addr_cmp(&(pkt_hdr->d_addr), 1074 &dev_ll->vdev->mac_address)) { 1075 /* 1076 * Drop the packet if the TX packet is 1077 * destined for the TX device. 1078 */ 1079 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1080 LOG_DEBUG(VHOST_DATA, 1081 "(%"PRIu64") TX: Source and destination" 1082 " MAC addresses are the same. Dropping " 1083 "packet.\n", 1084 dev_ll->vdev->dev->device_fh); 1085 return -1; 1086 } 1087 1088 /* 1089 * HW vlan strip will reduce the packet length 1090 * by minus length of vlan tag, so need restore 1091 * the packet length by plus it. 1092 */ 1093 *offset = VLAN_HLEN; 1094 *vlan_tag = 1095 (uint16_t) 1096 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1097 1098 LOG_DEBUG(VHOST_DATA, 1099 "(%"PRIu64") TX: pkt to local VM device id:" 1100 "(%"PRIu64") vlan tag: %d.\n", 1101 dev->device_fh, dev_ll->vdev->dev->device_fh, 1102 vlan_tag); 1103 1104 break; 1105 } 1106 dev_ll = dev_ll->next; 1107 } 1108 return 0; 1109 } 1110 1111 /* 1112 * This function routes the TX packet to the correct interface. This may be a local device 1113 * or the physical port. 1114 */ 1115 static inline void __attribute__((always_inline)) 1116 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1117 { 1118 struct mbuf_table *tx_q; 1119 struct rte_mbuf **m_table; 1120 unsigned len, ret, offset = 0; 1121 const uint16_t lcore_id = rte_lcore_id(); 1122 struct virtio_net *dev = vdev->dev; 1123 1124 /*check if destination is local VM*/ 1125 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1126 rte_pktmbuf_free(m); 1127 return; 1128 } 1129 1130 if (vm2vm_mode == VM2VM_HARDWARE) { 1131 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 || 1132 offset > rte_pktmbuf_tailroom(m)) { 1133 rte_pktmbuf_free(m); 1134 return; 1135 } 1136 } 1137 1138 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1139 1140 /*Add packet to the port tx queue*/ 1141 tx_q = &lcore_tx_queue[lcore_id]; 1142 len = tx_q->len; 1143 1144 m->ol_flags = PKT_TX_VLAN_PKT; 1145 1146 m->data_len += offset; 1147 m->pkt_len += offset; 1148 1149 m->vlan_tci = vlan_tag; 1150 1151 tx_q->m_table[len] = m; 1152 len++; 1153 if (enable_stats) { 1154 dev_statistics[dev->device_fh].tx_total++; 1155 dev_statistics[dev->device_fh].tx++; 1156 } 1157 1158 if (unlikely(len == MAX_PKT_BURST)) { 1159 m_table = (struct rte_mbuf **)tx_q->m_table; 1160 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1161 /* Free any buffers not handled by TX and update the port stats. */ 1162 if (unlikely(ret < len)) { 1163 do { 1164 rte_pktmbuf_free(m_table[ret]); 1165 } while (++ret < len); 1166 } 1167 1168 len = 0; 1169 } 1170 1171 tx_q->len = len; 1172 return; 1173 } 1174 /* 1175 * This function is called by each data core. It handles all RX/TX registered with the 1176 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1177 * with all devices in the main linked list. 1178 */ 1179 static int 1180 switch_worker(__attribute__((unused)) void *arg) 1181 { 1182 struct rte_mempool *mbuf_pool = arg; 1183 struct virtio_net *dev = NULL; 1184 struct vhost_dev *vdev = NULL; 1185 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1186 struct virtio_net_data_ll *dev_ll; 1187 struct mbuf_table *tx_q; 1188 volatile struct lcore_ll_info *lcore_ll; 1189 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1190 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1191 unsigned ret, i; 1192 const uint16_t lcore_id = rte_lcore_id(); 1193 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1194 uint16_t rx_count = 0; 1195 uint16_t tx_count; 1196 uint32_t retry = 0; 1197 1198 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1199 lcore_ll = lcore_info[lcore_id].lcore_ll; 1200 prev_tsc = 0; 1201 1202 tx_q = &lcore_tx_queue[lcore_id]; 1203 for (i = 0; i < num_cores; i ++) { 1204 if (lcore_ids[i] == lcore_id) { 1205 tx_q->txq_id = i; 1206 break; 1207 } 1208 } 1209 1210 while(1) { 1211 cur_tsc = rte_rdtsc(); 1212 /* 1213 * TX burst queue drain 1214 */ 1215 diff_tsc = cur_tsc - prev_tsc; 1216 if (unlikely(diff_tsc > drain_tsc)) { 1217 1218 if (tx_q->len) { 1219 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1220 1221 /*Tx any packets in the queue*/ 1222 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1223 (struct rte_mbuf **)tx_q->m_table, 1224 (uint16_t)tx_q->len); 1225 if (unlikely(ret < tx_q->len)) { 1226 do { 1227 rte_pktmbuf_free(tx_q->m_table[ret]); 1228 } while (++ret < tx_q->len); 1229 } 1230 1231 tx_q->len = 0; 1232 } 1233 1234 prev_tsc = cur_tsc; 1235 1236 } 1237 1238 rte_prefetch0(lcore_ll->ll_root_used); 1239 /* 1240 * Inform the configuration core that we have exited the linked list and that no devices are 1241 * in use if requested. 1242 */ 1243 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1244 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1245 1246 /* 1247 * Process devices 1248 */ 1249 dev_ll = lcore_ll->ll_root_used; 1250 1251 while (dev_ll != NULL) { 1252 /*get virtio device ID*/ 1253 vdev = dev_ll->vdev; 1254 dev = vdev->dev; 1255 1256 if (unlikely(vdev->remove)) { 1257 dev_ll = dev_ll->next; 1258 unlink_vmdq(vdev); 1259 vdev->ready = DEVICE_SAFE_REMOVE; 1260 continue; 1261 } 1262 if (likely(vdev->ready == DEVICE_RX)) { 1263 /*Handle guest RX*/ 1264 rx_count = rte_eth_rx_burst(ports[0], 1265 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1266 1267 if (rx_count) { 1268 /* 1269 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1270 * Here MAX_PKT_BURST must be less than virtio queue size 1271 */ 1272 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1273 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1274 rte_delay_us(burst_rx_delay_time); 1275 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1276 break; 1277 } 1278 } 1279 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1280 if (enable_stats) { 1281 rte_atomic64_add( 1282 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1283 rx_count); 1284 rte_atomic64_add( 1285 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1286 } 1287 while (likely(rx_count)) { 1288 rx_count--; 1289 rte_pktmbuf_free(pkts_burst[rx_count]); 1290 } 1291 1292 } 1293 } 1294 1295 if (likely(!vdev->remove)) { 1296 /* Handle guest TX*/ 1297 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1298 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1299 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1300 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1301 while (tx_count--) 1302 rte_pktmbuf_free(pkts_burst[tx_count]); 1303 } 1304 } 1305 while (tx_count) 1306 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1307 } 1308 1309 /*move to the next device in the list*/ 1310 dev_ll = dev_ll->next; 1311 } 1312 } 1313 1314 return 0; 1315 } 1316 1317 /* 1318 * This function gets available ring number for zero copy rx. 1319 * Only one thread will call this funciton for a paticular virtio device, 1320 * so, it is designed as non-thread-safe function. 1321 */ 1322 static inline uint32_t __attribute__((always_inline)) 1323 get_available_ring_num_zcp(struct virtio_net *dev) 1324 { 1325 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1326 uint16_t avail_idx; 1327 1328 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1329 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1330 } 1331 1332 /* 1333 * This function gets available ring index for zero copy rx, 1334 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1335 * Only one thread will call this funciton for a paticular virtio device, 1336 * so, it is designed as non-thread-safe function. 1337 */ 1338 static inline uint32_t __attribute__((always_inline)) 1339 get_available_ring_index_zcp(struct virtio_net *dev, 1340 uint16_t *res_base_idx, uint32_t count) 1341 { 1342 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1343 uint16_t avail_idx; 1344 uint32_t retry = 0; 1345 uint16_t free_entries; 1346 1347 *res_base_idx = vq->last_used_idx_res; 1348 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1349 free_entries = (avail_idx - *res_base_idx); 1350 1351 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1352 "avail idx: %d, " 1353 "res base idx:%d, free entries:%d\n", 1354 dev->device_fh, avail_idx, *res_base_idx, 1355 free_entries); 1356 1357 /* 1358 * If retry is enabled and the queue is full then we wait 1359 * and retry to avoid packet loss. 1360 */ 1361 if (enable_retry && unlikely(count > free_entries)) { 1362 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1363 rte_delay_us(burst_rx_delay_time); 1364 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1365 free_entries = (avail_idx - *res_base_idx); 1366 if (count <= free_entries) 1367 break; 1368 } 1369 } 1370 1371 /*check that we have enough buffers*/ 1372 if (unlikely(count > free_entries)) 1373 count = free_entries; 1374 1375 if (unlikely(count == 0)) { 1376 LOG_DEBUG(VHOST_DATA, 1377 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1378 "avail idx: %d, res base idx:%d, free entries:%d\n", 1379 dev->device_fh, avail_idx, 1380 *res_base_idx, free_entries); 1381 return 0; 1382 } 1383 1384 vq->last_used_idx_res = *res_base_idx + count; 1385 1386 return count; 1387 } 1388 1389 /* 1390 * This function put descriptor back to used list. 1391 */ 1392 static inline void __attribute__((always_inline)) 1393 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1394 { 1395 uint16_t res_cur_idx = vq->last_used_idx; 1396 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1397 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1398 rte_compiler_barrier(); 1399 *(volatile uint16_t *)&vq->used->idx += 1; 1400 vq->last_used_idx += 1; 1401 1402 /* Kick the guest if necessary. */ 1403 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1404 eventfd_write((int)vq->kickfd, 1); 1405 } 1406 1407 /* 1408 * This function get available descriptor from vitio vring and un-attached mbuf 1409 * from vpool->ring, and then attach them together. It needs adjust the offset 1410 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1411 * frame data may be put to wrong location in mbuf. 1412 */ 1413 static inline void __attribute__((always_inline)) 1414 attach_rxmbuf_zcp(struct virtio_net *dev) 1415 { 1416 uint16_t res_base_idx, desc_idx; 1417 uint64_t buff_addr, phys_addr; 1418 struct vhost_virtqueue *vq; 1419 struct vring_desc *desc; 1420 struct rte_mbuf *mbuf = NULL; 1421 struct vpool *vpool; 1422 hpa_type addr_type; 1423 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1424 1425 vpool = &vpool_array[vdev->vmdq_rx_q]; 1426 vq = dev->virtqueue[VIRTIO_RXQ]; 1427 1428 do { 1429 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1430 1) != 1)) 1431 return; 1432 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1433 1434 desc = &vq->desc[desc_idx]; 1435 if (desc->flags & VRING_DESC_F_NEXT) { 1436 desc = &vq->desc[desc->next]; 1437 buff_addr = gpa_to_vva(dev, desc->addr); 1438 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1439 &addr_type); 1440 } else { 1441 buff_addr = gpa_to_vva(dev, 1442 desc->addr + vq->vhost_hlen); 1443 phys_addr = gpa_to_hpa(vdev, 1444 desc->addr + vq->vhost_hlen, 1445 desc->len, &addr_type); 1446 } 1447 1448 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1449 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1450 " address found when attaching RX frame buffer" 1451 " address!\n", dev->device_fh); 1452 put_desc_to_used_list_zcp(vq, desc_idx); 1453 continue; 1454 } 1455 1456 /* 1457 * Check if the frame buffer address from guest crosses 1458 * sub-region or not. 1459 */ 1460 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1461 RTE_LOG(ERR, VHOST_DATA, 1462 "(%"PRIu64") Frame buffer address cross " 1463 "sub-regioin found when attaching RX frame " 1464 "buffer address!\n", 1465 dev->device_fh); 1466 put_desc_to_used_list_zcp(vq, desc_idx); 1467 continue; 1468 } 1469 } while (unlikely(phys_addr == 0)); 1470 1471 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1472 if (unlikely(mbuf == NULL)) { 1473 LOG_DEBUG(VHOST_DATA, 1474 "(%"PRIu64") in attach_rxmbuf_zcp: " 1475 "ring_sc_dequeue fail.\n", 1476 dev->device_fh); 1477 put_desc_to_used_list_zcp(vq, desc_idx); 1478 return; 1479 } 1480 1481 if (unlikely(vpool->buf_size > desc->len)) { 1482 LOG_DEBUG(VHOST_DATA, 1483 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1484 "length(%d) of descriptor idx: %d less than room " 1485 "size required: %d\n", 1486 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1487 put_desc_to_used_list_zcp(vq, desc_idx); 1488 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1489 return; 1490 } 1491 1492 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1493 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1494 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1495 mbuf->data_len = desc->len; 1496 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1497 1498 LOG_DEBUG(VHOST_DATA, 1499 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1500 "descriptor idx:%d\n", 1501 dev->device_fh, res_base_idx, desc_idx); 1502 1503 __rte_mbuf_raw_free(mbuf); 1504 1505 return; 1506 } 1507 1508 /* 1509 * Detach an attched packet mbuf - 1510 * - restore original mbuf address and length values. 1511 * - reset pktmbuf data and data_len to their default values. 1512 * All other fields of the given packet mbuf will be left intact. 1513 * 1514 * @param m 1515 * The attached packet mbuf. 1516 */ 1517 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1518 { 1519 const struct rte_mempool *mp = m->pool; 1520 void *buf = RTE_MBUF_TO_BADDR(m); 1521 uint32_t buf_ofs; 1522 uint32_t buf_len = mp->elt_size - sizeof(*m); 1523 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1524 1525 m->buf_addr = buf; 1526 m->buf_len = (uint16_t)buf_len; 1527 1528 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1529 RTE_PKTMBUF_HEADROOM : m->buf_len; 1530 m->data_off = buf_ofs; 1531 1532 m->data_len = 0; 1533 } 1534 1535 /* 1536 * This function is called after packets have been transimited. It fetchs mbuf 1537 * from vpool->pool, detached it and put into vpool->ring. It also update the 1538 * used index and kick the guest if necessary. 1539 */ 1540 static inline uint32_t __attribute__((always_inline)) 1541 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1542 { 1543 struct rte_mbuf *mbuf; 1544 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1545 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1546 uint32_t index = 0; 1547 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1548 1549 LOG_DEBUG(VHOST_DATA, 1550 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1551 "clean is: %d\n", 1552 dev->device_fh, mbuf_count); 1553 LOG_DEBUG(VHOST_DATA, 1554 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1555 "clean is : %d\n", 1556 dev->device_fh, rte_ring_count(vpool->ring)); 1557 1558 for (index = 0; index < mbuf_count; index++) { 1559 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1560 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1561 pktmbuf_detach_zcp(mbuf); 1562 rte_ring_sp_enqueue(vpool->ring, mbuf); 1563 1564 /* Update used index buffer information. */ 1565 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1566 vq->used->ring[used_idx].len = 0; 1567 1568 used_idx = (used_idx + 1) & (vq->size - 1); 1569 } 1570 1571 LOG_DEBUG(VHOST_DATA, 1572 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1573 "clean is: %d\n", 1574 dev->device_fh, rte_mempool_count(vpool->pool)); 1575 LOG_DEBUG(VHOST_DATA, 1576 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1577 "clean is : %d\n", 1578 dev->device_fh, rte_ring_count(vpool->ring)); 1579 LOG_DEBUG(VHOST_DATA, 1580 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1581 "vq->last_used_idx:%d\n", 1582 dev->device_fh, vq->last_used_idx); 1583 1584 vq->last_used_idx += mbuf_count; 1585 1586 LOG_DEBUG(VHOST_DATA, 1587 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1588 "vq->last_used_idx:%d\n", 1589 dev->device_fh, vq->last_used_idx); 1590 1591 rte_compiler_barrier(); 1592 1593 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1594 1595 /* Kick guest if required. */ 1596 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1597 eventfd_write((int)vq->kickfd, 1); 1598 1599 return 0; 1600 } 1601 1602 /* 1603 * This function is called when a virtio device is destroy. 1604 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 1605 */ 1606 static void mbuf_destroy_zcp(struct vpool *vpool) 1607 { 1608 struct rte_mbuf *mbuf = NULL; 1609 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1610 1611 LOG_DEBUG(VHOST_CONFIG, 1612 "in mbuf_destroy_zcp: mbuf count in mempool before " 1613 "mbuf_destroy_zcp is: %d\n", 1614 mbuf_count); 1615 LOG_DEBUG(VHOST_CONFIG, 1616 "in mbuf_destroy_zcp: mbuf count in ring before " 1617 "mbuf_destroy_zcp is : %d\n", 1618 rte_ring_count(vpool->ring)); 1619 1620 for (index = 0; index < mbuf_count; index++) { 1621 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1622 if (likely(mbuf != NULL)) { 1623 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1624 pktmbuf_detach_zcp(mbuf); 1625 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1626 } 1627 } 1628 1629 LOG_DEBUG(VHOST_CONFIG, 1630 "in mbuf_destroy_zcp: mbuf count in mempool after " 1631 "mbuf_destroy_zcp is: %d\n", 1632 rte_mempool_count(vpool->pool)); 1633 LOG_DEBUG(VHOST_CONFIG, 1634 "in mbuf_destroy_zcp: mbuf count in ring after " 1635 "mbuf_destroy_zcp is : %d\n", 1636 rte_ring_count(vpool->ring)); 1637 } 1638 1639 /* 1640 * This function update the use flag and counter. 1641 */ 1642 static inline uint32_t __attribute__((always_inline)) 1643 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1644 uint32_t count) 1645 { 1646 struct vhost_virtqueue *vq; 1647 struct vring_desc *desc; 1648 struct rte_mbuf *buff; 1649 /* The virtio_hdr is initialised to 0. */ 1650 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1651 = {{0, 0, 0, 0, 0, 0}, 0}; 1652 uint64_t buff_hdr_addr = 0; 1653 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1654 uint32_t head_idx, packet_success = 0; 1655 uint16_t res_cur_idx; 1656 1657 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1658 1659 if (count == 0) 1660 return 0; 1661 1662 vq = dev->virtqueue[VIRTIO_RXQ]; 1663 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1664 1665 res_cur_idx = vq->last_used_idx; 1666 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1667 dev->device_fh, res_cur_idx, res_cur_idx + count); 1668 1669 /* Retrieve all of the head indexes first to avoid caching issues. */ 1670 for (head_idx = 0; head_idx < count; head_idx++) 1671 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1672 1673 /*Prefetch descriptor index. */ 1674 rte_prefetch0(&vq->desc[head[packet_success]]); 1675 1676 while (packet_success != count) { 1677 /* Get descriptor from available ring */ 1678 desc = &vq->desc[head[packet_success]]; 1679 1680 buff = pkts[packet_success]; 1681 LOG_DEBUG(VHOST_DATA, 1682 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1683 "pkt[%d] descriptor idx: %d\n", 1684 dev->device_fh, packet_success, 1685 MBUF_HEADROOM_UINT32(buff)); 1686 1687 PRINT_PACKET(dev, 1688 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1689 + RTE_PKTMBUF_HEADROOM), 1690 rte_pktmbuf_data_len(buff), 0); 1691 1692 /* Buffer address translation for virtio header. */ 1693 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1694 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1695 1696 /* 1697 * If the descriptors are chained the header and data are 1698 * placed in separate buffers. 1699 */ 1700 if (desc->flags & VRING_DESC_F_NEXT) { 1701 desc->len = vq->vhost_hlen; 1702 desc = &vq->desc[desc->next]; 1703 desc->len = rte_pktmbuf_data_len(buff); 1704 } else { 1705 desc->len = packet_len; 1706 } 1707 1708 /* Update used ring with desc information */ 1709 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1710 = head[packet_success]; 1711 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1712 = packet_len; 1713 res_cur_idx++; 1714 packet_success++; 1715 1716 /* A header is required per buffer. */ 1717 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1718 (const void *)&virtio_hdr, vq->vhost_hlen); 1719 1720 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1721 1722 if (likely(packet_success < count)) { 1723 /* Prefetch descriptor index. */ 1724 rte_prefetch0(&vq->desc[head[packet_success]]); 1725 } 1726 } 1727 1728 rte_compiler_barrier(); 1729 1730 LOG_DEBUG(VHOST_DATA, 1731 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1732 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1733 dev->device_fh, vq->last_used_idx, vq->used->idx); 1734 1735 *(volatile uint16_t *)&vq->used->idx += count; 1736 vq->last_used_idx += count; 1737 1738 LOG_DEBUG(VHOST_DATA, 1739 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1740 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1741 dev->device_fh, vq->last_used_idx, vq->used->idx); 1742 1743 /* Kick the guest if necessary. */ 1744 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1745 eventfd_write((int)vq->kickfd, 1); 1746 1747 return count; 1748 } 1749 1750 /* 1751 * This function routes the TX packet to the correct interface. 1752 * This may be a local device or the physical port. 1753 */ 1754 static inline void __attribute__((always_inline)) 1755 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1756 uint32_t desc_idx, uint8_t need_copy) 1757 { 1758 struct mbuf_table *tx_q; 1759 struct rte_mbuf **m_table; 1760 struct rte_mbuf *mbuf = NULL; 1761 unsigned len, ret, offset = 0; 1762 struct vpool *vpool; 1763 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1764 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1765 1766 /*Add packet to the port tx queue*/ 1767 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1768 len = tx_q->len; 1769 1770 /* Allocate an mbuf and populate the structure. */ 1771 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1772 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1773 if (unlikely(mbuf == NULL)) { 1774 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1775 RTE_LOG(ERR, VHOST_DATA, 1776 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1777 dev->device_fh); 1778 put_desc_to_used_list_zcp(vq, desc_idx); 1779 return; 1780 } 1781 1782 if (vm2vm_mode == VM2VM_HARDWARE) { 1783 /* Avoid using a vlan tag from any vm for external pkt, such as 1784 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1785 * selection, MAC address determines it as an external pkt 1786 * which should go to network, while vlan tag determine it as 1787 * a vm2vm pkt should forward to another vm. Hardware confuse 1788 * such a ambiguous situation, so pkt will lost. 1789 */ 1790 vlan_tag = external_pkt_default_vlan_tag; 1791 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1792 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1793 __rte_mbuf_raw_free(mbuf); 1794 return; 1795 } 1796 } 1797 1798 mbuf->nb_segs = m->nb_segs; 1799 mbuf->next = m->next; 1800 mbuf->data_len = m->data_len + offset; 1801 mbuf->pkt_len = mbuf->data_len; 1802 if (unlikely(need_copy)) { 1803 /* Copy the packet contents to the mbuf. */ 1804 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1805 rte_pktmbuf_mtod(m, void *), 1806 m->data_len); 1807 } else { 1808 mbuf->data_off = m->data_off; 1809 mbuf->buf_physaddr = m->buf_physaddr; 1810 mbuf->buf_addr = m->buf_addr; 1811 } 1812 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1813 mbuf->vlan_tci = vlan_tag; 1814 mbuf->l2_len = sizeof(struct ether_hdr); 1815 mbuf->l3_len = sizeof(struct ipv4_hdr); 1816 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1817 1818 tx_q->m_table[len] = mbuf; 1819 len++; 1820 1821 LOG_DEBUG(VHOST_DATA, 1822 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1823 dev->device_fh, 1824 mbuf->nb_segs, 1825 (mbuf->next == NULL) ? "null" : "non-null"); 1826 1827 if (enable_stats) { 1828 dev_statistics[dev->device_fh].tx_total++; 1829 dev_statistics[dev->device_fh].tx++; 1830 } 1831 1832 if (unlikely(len == MAX_PKT_BURST)) { 1833 m_table = (struct rte_mbuf **)tx_q->m_table; 1834 ret = rte_eth_tx_burst(ports[0], 1835 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1836 1837 /* 1838 * Free any buffers not handled by TX and update 1839 * the port stats. 1840 */ 1841 if (unlikely(ret < len)) { 1842 do { 1843 rte_pktmbuf_free(m_table[ret]); 1844 } while (++ret < len); 1845 } 1846 1847 len = 0; 1848 txmbuf_clean_zcp(dev, vpool); 1849 } 1850 1851 tx_q->len = len; 1852 1853 return; 1854 } 1855 1856 /* 1857 * This function TX all available packets in virtio TX queue for one 1858 * virtio-net device. If it is first packet, it learns MAC address and 1859 * setup VMDQ. 1860 */ 1861 static inline void __attribute__((always_inline)) 1862 virtio_dev_tx_zcp(struct virtio_net *dev) 1863 { 1864 struct rte_mbuf m; 1865 struct vhost_virtqueue *vq; 1866 struct vring_desc *desc; 1867 uint64_t buff_addr = 0, phys_addr; 1868 uint32_t head[MAX_PKT_BURST]; 1869 uint32_t i; 1870 uint16_t free_entries, packet_success = 0; 1871 uint16_t avail_idx; 1872 uint8_t need_copy = 0; 1873 hpa_type addr_type; 1874 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1875 1876 vq = dev->virtqueue[VIRTIO_TXQ]; 1877 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1878 1879 /* If there are no available buffers then return. */ 1880 if (vq->last_used_idx_res == avail_idx) 1881 return; 1882 1883 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1884 1885 /* Prefetch available ring to retrieve head indexes. */ 1886 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1887 1888 /* Get the number of free entries in the ring */ 1889 free_entries = (avail_idx - vq->last_used_idx_res); 1890 1891 /* Limit to MAX_PKT_BURST. */ 1892 free_entries 1893 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1894 1895 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1896 dev->device_fh, free_entries); 1897 1898 /* Retrieve all of the head indexes first to avoid caching issues. */ 1899 for (i = 0; i < free_entries; i++) 1900 head[i] 1901 = vq->avail->ring[(vq->last_used_idx_res + i) 1902 & (vq->size - 1)]; 1903 1904 vq->last_used_idx_res += free_entries; 1905 1906 /* Prefetch descriptor index. */ 1907 rte_prefetch0(&vq->desc[head[packet_success]]); 1908 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1909 1910 while (packet_success < free_entries) { 1911 desc = &vq->desc[head[packet_success]]; 1912 1913 /* Discard first buffer as it is the virtio header */ 1914 desc = &vq->desc[desc->next]; 1915 1916 /* Buffer address translation. */ 1917 buff_addr = gpa_to_vva(dev, desc->addr); 1918 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1919 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1920 &addr_type); 1921 1922 if (likely(packet_success < (free_entries - 1))) 1923 /* Prefetch descriptor index. */ 1924 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1925 1926 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1927 RTE_LOG(ERR, VHOST_DATA, 1928 "(%"PRIu64") Invalid frame buffer address found" 1929 "when TX packets!\n", 1930 dev->device_fh); 1931 packet_success++; 1932 continue; 1933 } 1934 1935 /* Prefetch buffer address. */ 1936 rte_prefetch0((void *)(uintptr_t)buff_addr); 1937 1938 /* 1939 * Setup dummy mbuf. This is copied to a real mbuf if 1940 * transmitted out the physical port. 1941 */ 1942 m.data_len = desc->len; 1943 m.nb_segs = 1; 1944 m.next = NULL; 1945 m.data_off = 0; 1946 m.buf_addr = (void *)(uintptr_t)buff_addr; 1947 m.buf_physaddr = phys_addr; 1948 1949 /* 1950 * Check if the frame buffer address from guest crosses 1951 * sub-region or not. 1952 */ 1953 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1954 RTE_LOG(ERR, VHOST_DATA, 1955 "(%"PRIu64") Frame buffer address cross " 1956 "sub-regioin found when attaching TX frame " 1957 "buffer address!\n", 1958 dev->device_fh); 1959 need_copy = 1; 1960 } else 1961 need_copy = 0; 1962 1963 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1964 1965 /* 1966 * If this is the first received packet we need to learn 1967 * the MAC and setup VMDQ 1968 */ 1969 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1970 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1971 /* 1972 * Discard frame if device is scheduled for 1973 * removal or a duplicate MAC address is found. 1974 */ 1975 packet_success += free_entries; 1976 vq->last_used_idx += packet_success; 1977 break; 1978 } 1979 } 1980 1981 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1982 packet_success++; 1983 } 1984 } 1985 1986 /* 1987 * This function is called by each data core. It handles all RX/TX registered 1988 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1989 * addresses are compared with all devices in the main linked list. 1990 */ 1991 static int 1992 switch_worker_zcp(__attribute__((unused)) void *arg) 1993 { 1994 struct virtio_net *dev = NULL; 1995 struct vhost_dev *vdev = NULL; 1996 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1997 struct virtio_net_data_ll *dev_ll; 1998 struct mbuf_table *tx_q; 1999 volatile struct lcore_ll_info *lcore_ll; 2000 const uint64_t drain_tsc 2001 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2002 * BURST_TX_DRAIN_US; 2003 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2004 unsigned ret; 2005 const uint16_t lcore_id = rte_lcore_id(); 2006 uint16_t count_in_ring, rx_count = 0; 2007 2008 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2009 2010 lcore_ll = lcore_info[lcore_id].lcore_ll; 2011 prev_tsc = 0; 2012 2013 while (1) { 2014 cur_tsc = rte_rdtsc(); 2015 2016 /* TX burst queue drain */ 2017 diff_tsc = cur_tsc - prev_tsc; 2018 if (unlikely(diff_tsc > drain_tsc)) { 2019 /* 2020 * Get mbuf from vpool.pool and detach mbuf and 2021 * put back into vpool.ring. 2022 */ 2023 dev_ll = lcore_ll->ll_root_used; 2024 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2025 /* Get virtio device ID */ 2026 vdev = dev_ll->vdev; 2027 dev = vdev->dev; 2028 2029 if (likely(!vdev->remove)) { 2030 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2031 if (tx_q->len) { 2032 LOG_DEBUG(VHOST_DATA, 2033 "TX queue drained after timeout" 2034 " with burst size %u\n", 2035 tx_q->len); 2036 2037 /* 2038 * Tx any packets in the queue 2039 */ 2040 ret = rte_eth_tx_burst( 2041 ports[0], 2042 (uint16_t)tx_q->txq_id, 2043 (struct rte_mbuf **) 2044 tx_q->m_table, 2045 (uint16_t)tx_q->len); 2046 if (unlikely(ret < tx_q->len)) { 2047 do { 2048 rte_pktmbuf_free( 2049 tx_q->m_table[ret]); 2050 } while (++ret < tx_q->len); 2051 } 2052 tx_q->len = 0; 2053 2054 txmbuf_clean_zcp(dev, 2055 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2056 } 2057 } 2058 dev_ll = dev_ll->next; 2059 } 2060 prev_tsc = cur_tsc; 2061 } 2062 2063 rte_prefetch0(lcore_ll->ll_root_used); 2064 2065 /* 2066 * Inform the configuration core that we have exited the linked 2067 * list and that no devices are in use if requested. 2068 */ 2069 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2070 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2071 2072 /* Process devices */ 2073 dev_ll = lcore_ll->ll_root_used; 2074 2075 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2076 vdev = dev_ll->vdev; 2077 dev = vdev->dev; 2078 if (unlikely(vdev->remove)) { 2079 dev_ll = dev_ll->next; 2080 unlink_vmdq(vdev); 2081 vdev->ready = DEVICE_SAFE_REMOVE; 2082 continue; 2083 } 2084 2085 if (likely(vdev->ready == DEVICE_RX)) { 2086 uint32_t index = vdev->vmdq_rx_q; 2087 uint16_t i; 2088 count_in_ring 2089 = rte_ring_count(vpool_array[index].ring); 2090 uint16_t free_entries 2091 = (uint16_t)get_available_ring_num_zcp(dev); 2092 2093 /* 2094 * Attach all mbufs in vpool.ring and put back 2095 * into vpool.pool. 2096 */ 2097 for (i = 0; 2098 i < RTE_MIN(free_entries, 2099 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2100 i++) 2101 attach_rxmbuf_zcp(dev); 2102 2103 /* Handle guest RX */ 2104 rx_count = rte_eth_rx_burst(ports[0], 2105 vdev->vmdq_rx_q, pkts_burst, 2106 MAX_PKT_BURST); 2107 2108 if (rx_count) { 2109 ret_count = virtio_dev_rx_zcp(dev, 2110 pkts_burst, rx_count); 2111 if (enable_stats) { 2112 dev_statistics[dev->device_fh].rx_total 2113 += rx_count; 2114 dev_statistics[dev->device_fh].rx 2115 += ret_count; 2116 } 2117 while (likely(rx_count)) { 2118 rx_count--; 2119 pktmbuf_detach_zcp( 2120 pkts_burst[rx_count]); 2121 rte_ring_sp_enqueue( 2122 vpool_array[index].ring, 2123 (void *)pkts_burst[rx_count]); 2124 } 2125 } 2126 } 2127 2128 if (likely(!vdev->remove)) 2129 /* Handle guest TX */ 2130 virtio_dev_tx_zcp(dev); 2131 2132 /* Move to the next device in the list */ 2133 dev_ll = dev_ll->next; 2134 } 2135 } 2136 2137 return 0; 2138 } 2139 2140 2141 /* 2142 * Add an entry to a used linked list. A free entry must first be found 2143 * in the free linked list using get_data_ll_free_entry(); 2144 */ 2145 static void 2146 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2147 struct virtio_net_data_ll *ll_dev) 2148 { 2149 struct virtio_net_data_ll *ll = *ll_root_addr; 2150 2151 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2152 ll_dev->next = NULL; 2153 rte_compiler_barrier(); 2154 2155 /* If ll == NULL then this is the first device. */ 2156 if (ll) { 2157 /* Increment to the tail of the linked list. */ 2158 while ((ll->next != NULL) ) 2159 ll = ll->next; 2160 2161 ll->next = ll_dev; 2162 } else { 2163 *ll_root_addr = ll_dev; 2164 } 2165 } 2166 2167 /* 2168 * Remove an entry from a used linked list. The entry must then be added to 2169 * the free linked list using put_data_ll_free_entry(). 2170 */ 2171 static void 2172 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2173 struct virtio_net_data_ll *ll_dev, 2174 struct virtio_net_data_ll *ll_dev_last) 2175 { 2176 struct virtio_net_data_ll *ll = *ll_root_addr; 2177 2178 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2179 return; 2180 2181 if (ll_dev == ll) 2182 *ll_root_addr = ll_dev->next; 2183 else 2184 if (likely(ll_dev_last != NULL)) 2185 ll_dev_last->next = ll_dev->next; 2186 else 2187 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2188 } 2189 2190 /* 2191 * Find and return an entry from the free linked list. 2192 */ 2193 static struct virtio_net_data_ll * 2194 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2195 { 2196 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2197 struct virtio_net_data_ll *ll_dev; 2198 2199 if (ll_free == NULL) 2200 return NULL; 2201 2202 ll_dev = ll_free; 2203 *ll_root_addr = ll_free->next; 2204 2205 return ll_dev; 2206 } 2207 2208 /* 2209 * Place an entry back on to the free linked list. 2210 */ 2211 static void 2212 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2213 struct virtio_net_data_ll *ll_dev) 2214 { 2215 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2216 2217 if (ll_dev == NULL) 2218 return; 2219 2220 ll_dev->next = ll_free; 2221 *ll_root_addr = ll_dev; 2222 } 2223 2224 /* 2225 * Creates a linked list of a given size. 2226 */ 2227 static struct virtio_net_data_ll * 2228 alloc_data_ll(uint32_t size) 2229 { 2230 struct virtio_net_data_ll *ll_new; 2231 uint32_t i; 2232 2233 /* Malloc and then chain the linked list. */ 2234 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2235 if (ll_new == NULL) { 2236 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2237 return NULL; 2238 } 2239 2240 for (i = 0; i < size - 1; i++) { 2241 ll_new[i].vdev = NULL; 2242 ll_new[i].next = &ll_new[i+1]; 2243 } 2244 ll_new[i].next = NULL; 2245 2246 return (ll_new); 2247 } 2248 2249 /* 2250 * Create the main linked list along with each individual cores linked list. A used and a free list 2251 * are created to manage entries. 2252 */ 2253 static int 2254 init_data_ll (void) 2255 { 2256 int lcore; 2257 2258 RTE_LCORE_FOREACH_SLAVE(lcore) { 2259 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2260 if (lcore_info[lcore].lcore_ll == NULL) { 2261 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2262 return -1; 2263 } 2264 2265 lcore_info[lcore].lcore_ll->device_num = 0; 2266 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2267 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2268 if (num_devices % num_switching_cores) 2269 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2270 else 2271 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2272 } 2273 2274 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2275 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2276 2277 return 0; 2278 } 2279 2280 /* 2281 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2282 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2283 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2284 */ 2285 static void 2286 destroy_device (volatile struct virtio_net *dev) 2287 { 2288 struct virtio_net_data_ll *ll_lcore_dev_cur; 2289 struct virtio_net_data_ll *ll_main_dev_cur; 2290 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2291 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2292 struct vhost_dev *vdev; 2293 int lcore; 2294 2295 dev->flags &= ~VIRTIO_DEV_RUNNING; 2296 2297 vdev = (struct vhost_dev *)dev->priv; 2298 /*set the remove flag. */ 2299 vdev->remove = 1; 2300 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2301 rte_pause(); 2302 } 2303 2304 /* Search for entry to be removed from lcore ll */ 2305 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2306 while (ll_lcore_dev_cur != NULL) { 2307 if (ll_lcore_dev_cur->vdev == vdev) { 2308 break; 2309 } else { 2310 ll_lcore_dev_last = ll_lcore_dev_cur; 2311 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2312 } 2313 } 2314 2315 if (ll_lcore_dev_cur == NULL) { 2316 RTE_LOG(ERR, VHOST_CONFIG, 2317 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2318 dev->device_fh); 2319 return; 2320 } 2321 2322 /* Search for entry to be removed from main ll */ 2323 ll_main_dev_cur = ll_root_used; 2324 ll_main_dev_last = NULL; 2325 while (ll_main_dev_cur != NULL) { 2326 if (ll_main_dev_cur->vdev == vdev) { 2327 break; 2328 } else { 2329 ll_main_dev_last = ll_main_dev_cur; 2330 ll_main_dev_cur = ll_main_dev_cur->next; 2331 } 2332 } 2333 2334 /* Remove entries from the lcore and main ll. */ 2335 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2336 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2337 2338 /* Set the dev_removal_flag on each lcore. */ 2339 RTE_LCORE_FOREACH_SLAVE(lcore) { 2340 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2341 } 2342 2343 /* 2344 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2345 * they can no longer access the device removed from the linked lists and that the devices 2346 * are no longer in use. 2347 */ 2348 RTE_LCORE_FOREACH_SLAVE(lcore) { 2349 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2350 rte_pause(); 2351 } 2352 } 2353 2354 /* Add the entries back to the lcore and main free ll.*/ 2355 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2356 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2357 2358 /* Decrement number of device on the lcore. */ 2359 lcore_info[vdev->coreid].lcore_ll->device_num--; 2360 2361 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2362 2363 if (zero_copy) { 2364 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2365 2366 /* Stop the RX queue. */ 2367 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2368 LOG_DEBUG(VHOST_CONFIG, 2369 "(%"PRIu64") In destroy_device: Failed to stop " 2370 "rx queue:%d\n", 2371 dev->device_fh, 2372 vdev->vmdq_rx_q); 2373 } 2374 2375 LOG_DEBUG(VHOST_CONFIG, 2376 "(%"PRIu64") in destroy_device: Start put mbuf in " 2377 "mempool back to ring for RX queue: %d\n", 2378 dev->device_fh, vdev->vmdq_rx_q); 2379 2380 mbuf_destroy_zcp(vpool); 2381 2382 /* Stop the TX queue. */ 2383 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2384 LOG_DEBUG(VHOST_CONFIG, 2385 "(%"PRIu64") In destroy_device: Failed to " 2386 "stop tx queue:%d\n", 2387 dev->device_fh, vdev->vmdq_rx_q); 2388 } 2389 2390 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2391 2392 LOG_DEBUG(VHOST_CONFIG, 2393 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2394 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2395 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2396 dev->device_fh); 2397 2398 mbuf_destroy_zcp(vpool); 2399 rte_free(vdev->regions_hpa); 2400 } 2401 rte_free(vdev); 2402 2403 } 2404 2405 /* 2406 * Calculate the region count of physical continous regions for one particular 2407 * region of whose vhost virtual address is continous. The particular region 2408 * start from vva_start, with size of 'size' in argument. 2409 */ 2410 static uint32_t 2411 check_hpa_regions(uint64_t vva_start, uint64_t size) 2412 { 2413 uint32_t i, nregions = 0, page_size = getpagesize(); 2414 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2415 if (vva_start % page_size) { 2416 LOG_DEBUG(VHOST_CONFIG, 2417 "in check_countinous: vva start(%p) mod page_size(%d) " 2418 "has remainder\n", 2419 (void *)(uintptr_t)vva_start, page_size); 2420 return 0; 2421 } 2422 if (size % page_size) { 2423 LOG_DEBUG(VHOST_CONFIG, 2424 "in check_countinous: " 2425 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2426 size, page_size); 2427 return 0; 2428 } 2429 for (i = 0; i < size - page_size; i = i + page_size) { 2430 cur_phys_addr 2431 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2432 next_phys_addr = rte_mem_virt2phy( 2433 (void *)(uintptr_t)(vva_start + i + page_size)); 2434 if ((cur_phys_addr + page_size) != next_phys_addr) { 2435 ++nregions; 2436 LOG_DEBUG(VHOST_CONFIG, 2437 "in check_continuous: hva addr:(%p) is not " 2438 "continuous with hva addr:(%p), diff:%d\n", 2439 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2440 (void *)(uintptr_t)(vva_start + (uint64_t)i 2441 + page_size), page_size); 2442 LOG_DEBUG(VHOST_CONFIG, 2443 "in check_continuous: hpa addr:(%p) is not " 2444 "continuous with hpa addr:(%p), " 2445 "diff:(%"PRIu64")\n", 2446 (void *)(uintptr_t)cur_phys_addr, 2447 (void *)(uintptr_t)next_phys_addr, 2448 (next_phys_addr-cur_phys_addr)); 2449 } 2450 } 2451 return nregions; 2452 } 2453 2454 /* 2455 * Divide each region whose vhost virtual address is continous into a few 2456 * sub-regions, make sure the physical address within each sub-region are 2457 * continous. And fill offset(to GPA) and size etc. information of each 2458 * sub-region into regions_hpa. 2459 */ 2460 static uint32_t 2461 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2462 { 2463 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2464 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2465 2466 if (mem_region_hpa == NULL) 2467 return 0; 2468 2469 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2470 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2471 virtio_memory->regions[regionidx].address_offset; 2472 mem_region_hpa[regionidx_hpa].guest_phys_address 2473 = virtio_memory->regions[regionidx].guest_phys_address; 2474 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2475 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2476 mem_region_hpa[regionidx_hpa].guest_phys_address; 2477 LOG_DEBUG(VHOST_CONFIG, 2478 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2479 regionidx_hpa, 2480 (void *)(uintptr_t) 2481 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2482 LOG_DEBUG(VHOST_CONFIG, 2483 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2484 regionidx_hpa, 2485 (void *)(uintptr_t) 2486 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2487 for (i = 0, k = 0; 2488 i < virtio_memory->regions[regionidx].memory_size - 2489 page_size; 2490 i += page_size) { 2491 cur_phys_addr = rte_mem_virt2phy( 2492 (void *)(uintptr_t)(vva_start + i)); 2493 next_phys_addr = rte_mem_virt2phy( 2494 (void *)(uintptr_t)(vva_start + 2495 i + page_size)); 2496 if ((cur_phys_addr + page_size) != next_phys_addr) { 2497 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2498 mem_region_hpa[regionidx_hpa].guest_phys_address + 2499 k + page_size; 2500 mem_region_hpa[regionidx_hpa].memory_size 2501 = k + page_size; 2502 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2503 "phys addr end [%d]:(%p)\n", 2504 regionidx_hpa, 2505 (void *)(uintptr_t) 2506 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2507 LOG_DEBUG(VHOST_CONFIG, 2508 "in fill_hpa_regions: guest phys addr " 2509 "size [%d]:(%p)\n", 2510 regionidx_hpa, 2511 (void *)(uintptr_t) 2512 (mem_region_hpa[regionidx_hpa].memory_size)); 2513 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2514 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2515 ++regionidx_hpa; 2516 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2517 next_phys_addr - 2518 mem_region_hpa[regionidx_hpa].guest_phys_address; 2519 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2520 " phys addr start[%d]:(%p)\n", 2521 regionidx_hpa, 2522 (void *)(uintptr_t) 2523 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2524 LOG_DEBUG(VHOST_CONFIG, 2525 "in fill_hpa_regions: host phys addr " 2526 "start[%d]:(%p)\n", 2527 regionidx_hpa, 2528 (void *)(uintptr_t) 2529 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2530 k = 0; 2531 } else { 2532 k += page_size; 2533 } 2534 } 2535 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2536 = mem_region_hpa[regionidx_hpa].guest_phys_address 2537 + k + page_size; 2538 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2539 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2540 "[%d]:(%p)\n", regionidx_hpa, 2541 (void *)(uintptr_t) 2542 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2543 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2544 "[%d]:(%p)\n", regionidx_hpa, 2545 (void *)(uintptr_t) 2546 (mem_region_hpa[regionidx_hpa].memory_size)); 2547 ++regionidx_hpa; 2548 } 2549 return regionidx_hpa; 2550 } 2551 2552 /* 2553 * A new device is added to a data core. First the device is added to the main linked list 2554 * and the allocated to a specific data core. 2555 */ 2556 static int 2557 new_device (struct virtio_net *dev) 2558 { 2559 struct virtio_net_data_ll *ll_dev; 2560 int lcore, core_add = 0; 2561 uint32_t device_num_min = num_devices; 2562 struct vhost_dev *vdev; 2563 uint32_t regionidx; 2564 2565 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2566 if (vdev == NULL) { 2567 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2568 dev->device_fh); 2569 return -1; 2570 } 2571 vdev->dev = dev; 2572 dev->priv = vdev; 2573 2574 if (zero_copy) { 2575 vdev->nregions_hpa = dev->mem->nregions; 2576 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2577 vdev->nregions_hpa 2578 += check_hpa_regions( 2579 dev->mem->regions[regionidx].guest_phys_address 2580 + dev->mem->regions[regionidx].address_offset, 2581 dev->mem->regions[regionidx].memory_size); 2582 2583 } 2584 2585 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2586 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2587 CACHE_LINE_SIZE); 2588 if (vdev->regions_hpa == NULL) { 2589 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2590 rte_free(vdev); 2591 return -1; 2592 } 2593 2594 2595 if (fill_hpa_memory_regions( 2596 vdev->regions_hpa, dev->mem 2597 ) != vdev->nregions_hpa) { 2598 2599 RTE_LOG(ERR, VHOST_CONFIG, 2600 "hpa memory regions number mismatch: " 2601 "[%d]\n", vdev->nregions_hpa); 2602 rte_free(vdev->regions_hpa); 2603 rte_free(vdev); 2604 return -1; 2605 } 2606 } 2607 2608 2609 /* Add device to main ll */ 2610 ll_dev = get_data_ll_free_entry(&ll_root_free); 2611 if (ll_dev == NULL) { 2612 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2613 "of %d devices per core has been reached\n", 2614 dev->device_fh, num_devices); 2615 if (vdev->regions_hpa) 2616 rte_free(vdev->regions_hpa); 2617 rte_free(vdev); 2618 return -1; 2619 } 2620 ll_dev->vdev = vdev; 2621 add_data_ll_entry(&ll_root_used, ll_dev); 2622 vdev->vmdq_rx_q 2623 = dev->device_fh * (num_queues / num_devices); 2624 2625 if (zero_copy) { 2626 uint32_t index = vdev->vmdq_rx_q; 2627 uint32_t count_in_ring, i; 2628 struct mbuf_table *tx_q; 2629 2630 count_in_ring = rte_ring_count(vpool_array[index].ring); 2631 2632 LOG_DEBUG(VHOST_CONFIG, 2633 "(%"PRIu64") in new_device: mbuf count in mempool " 2634 "before attach is: %d\n", 2635 dev->device_fh, 2636 rte_mempool_count(vpool_array[index].pool)); 2637 LOG_DEBUG(VHOST_CONFIG, 2638 "(%"PRIu64") in new_device: mbuf count in ring " 2639 "before attach is : %d\n", 2640 dev->device_fh, count_in_ring); 2641 2642 /* 2643 * Attach all mbufs in vpool.ring and put back intovpool.pool. 2644 */ 2645 for (i = 0; i < count_in_ring; i++) 2646 attach_rxmbuf_zcp(dev); 2647 2648 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2649 "mempool after attach is: %d\n", 2650 dev->device_fh, 2651 rte_mempool_count(vpool_array[index].pool)); 2652 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2653 "ring after attach is : %d\n", 2654 dev->device_fh, 2655 rte_ring_count(vpool_array[index].ring)); 2656 2657 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2658 tx_q->txq_id = vdev->vmdq_rx_q; 2659 2660 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2661 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2662 2663 LOG_DEBUG(VHOST_CONFIG, 2664 "(%"PRIu64") In new_device: Failed to start " 2665 "tx queue:%d\n", 2666 dev->device_fh, vdev->vmdq_rx_q); 2667 2668 mbuf_destroy_zcp(vpool); 2669 rte_free(vdev->regions_hpa); 2670 rte_free(vdev); 2671 return -1; 2672 } 2673 2674 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2675 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2676 2677 LOG_DEBUG(VHOST_CONFIG, 2678 "(%"PRIu64") In new_device: Failed to start " 2679 "rx queue:%d\n", 2680 dev->device_fh, vdev->vmdq_rx_q); 2681 2682 /* Stop the TX queue. */ 2683 if (rte_eth_dev_tx_queue_stop(ports[0], 2684 vdev->vmdq_rx_q) != 0) { 2685 LOG_DEBUG(VHOST_CONFIG, 2686 "(%"PRIu64") In new_device: Failed to " 2687 "stop tx queue:%d\n", 2688 dev->device_fh, vdev->vmdq_rx_q); 2689 } 2690 2691 mbuf_destroy_zcp(vpool); 2692 rte_free(vdev->regions_hpa); 2693 rte_free(vdev); 2694 return -1; 2695 } 2696 2697 } 2698 2699 /*reset ready flag*/ 2700 vdev->ready = DEVICE_MAC_LEARNING; 2701 vdev->remove = 0; 2702 2703 /* Find a suitable lcore to add the device. */ 2704 RTE_LCORE_FOREACH_SLAVE(lcore) { 2705 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2706 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2707 core_add = lcore; 2708 } 2709 } 2710 /* Add device to lcore ll */ 2711 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2712 if (ll_dev == NULL) { 2713 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2714 vdev->ready = DEVICE_SAFE_REMOVE; 2715 destroy_device(dev); 2716 if (vdev->regions_hpa) 2717 rte_free(vdev->regions_hpa); 2718 rte_free(vdev); 2719 return -1; 2720 } 2721 ll_dev->vdev = vdev; 2722 vdev->coreid = core_add; 2723 2724 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2725 2726 /* Initialize device stats */ 2727 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2728 2729 /* Disable notifications. */ 2730 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2731 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2732 lcore_info[vdev->coreid].lcore_ll->device_num++; 2733 dev->flags |= VIRTIO_DEV_RUNNING; 2734 2735 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2736 2737 return 0; 2738 } 2739 2740 /* 2741 * These callback allow devices to be added to the data core when configuration 2742 * has been fully complete. 2743 */ 2744 static const struct virtio_net_device_ops virtio_net_device_ops = 2745 { 2746 .new_device = new_device, 2747 .destroy_device = destroy_device, 2748 }; 2749 2750 /* 2751 * This is a thread will wake up after a period to print stats if the user has 2752 * enabled them. 2753 */ 2754 static void 2755 print_stats(void) 2756 { 2757 struct virtio_net_data_ll *dev_ll; 2758 uint64_t tx_dropped, rx_dropped; 2759 uint64_t tx, tx_total, rx, rx_total; 2760 uint32_t device_fh; 2761 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2762 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2763 2764 while(1) { 2765 sleep(enable_stats); 2766 2767 /* Clear screen and move to top left */ 2768 printf("%s%s", clr, top_left); 2769 2770 printf("\nDevice statistics ===================================="); 2771 2772 dev_ll = ll_root_used; 2773 while (dev_ll != NULL) { 2774 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2775 tx_total = dev_statistics[device_fh].tx_total; 2776 tx = dev_statistics[device_fh].tx; 2777 tx_dropped = tx_total - tx; 2778 if (zero_copy == 0) { 2779 rx_total = rte_atomic64_read( 2780 &dev_statistics[device_fh].rx_total_atomic); 2781 rx = rte_atomic64_read( 2782 &dev_statistics[device_fh].rx_atomic); 2783 } else { 2784 rx_total = dev_statistics[device_fh].rx_total; 2785 rx = dev_statistics[device_fh].rx; 2786 } 2787 rx_dropped = rx_total - rx; 2788 2789 printf("\nStatistics for device %"PRIu32" ------------------------------" 2790 "\nTX total: %"PRIu64"" 2791 "\nTX dropped: %"PRIu64"" 2792 "\nTX successful: %"PRIu64"" 2793 "\nRX total: %"PRIu64"" 2794 "\nRX dropped: %"PRIu64"" 2795 "\nRX successful: %"PRIu64"", 2796 device_fh, 2797 tx_total, 2798 tx_dropped, 2799 tx, 2800 rx_total, 2801 rx_dropped, 2802 rx); 2803 2804 dev_ll = dev_ll->next; 2805 } 2806 printf("\n======================================================\n"); 2807 } 2808 } 2809 2810 static void 2811 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2812 char *ring_name, uint32_t nb_mbuf) 2813 { 2814 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2815 vpool_array[index].pool 2816 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2817 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2818 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2819 rte_pktmbuf_init, NULL, socket, 0); 2820 if (vpool_array[index].pool != NULL) { 2821 vpool_array[index].ring 2822 = rte_ring_create(ring_name, 2823 rte_align32pow2(nb_mbuf + 1), 2824 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2825 if (likely(vpool_array[index].ring != NULL)) { 2826 LOG_DEBUG(VHOST_CONFIG, 2827 "in setup_mempool_tbl: mbuf count in " 2828 "mempool is: %d\n", 2829 rte_mempool_count(vpool_array[index].pool)); 2830 LOG_DEBUG(VHOST_CONFIG, 2831 "in setup_mempool_tbl: mbuf count in " 2832 "ring is: %d\n", 2833 rte_ring_count(vpool_array[index].ring)); 2834 } else { 2835 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2836 ring_name); 2837 } 2838 2839 /* Need consider head room. */ 2840 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2841 } else { 2842 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2843 } 2844 } 2845 2846 2847 /* 2848 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2849 * device is also registered here to handle the IOCTLs. 2850 */ 2851 int 2852 MAIN(int argc, char *argv[]) 2853 { 2854 struct rte_mempool *mbuf_pool = NULL; 2855 unsigned lcore_id, core_id = 0; 2856 unsigned nb_ports, valid_num_ports; 2857 int ret; 2858 uint8_t portid, queue_id = 0; 2859 static pthread_t tid; 2860 2861 /* init EAL */ 2862 ret = rte_eal_init(argc, argv); 2863 if (ret < 0) 2864 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2865 argc -= ret; 2866 argv += ret; 2867 2868 /* parse app arguments */ 2869 ret = us_vhost_parse_args(argc, argv); 2870 if (ret < 0) 2871 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2872 2873 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2874 if (rte_lcore_is_enabled(lcore_id)) 2875 lcore_ids[core_id ++] = lcore_id; 2876 2877 if (rte_lcore_count() > RTE_MAX_LCORE) 2878 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2879 2880 /*set the number of swithcing cores available*/ 2881 num_switching_cores = rte_lcore_count()-1; 2882 2883 /* Get the number of physical ports. */ 2884 nb_ports = rte_eth_dev_count(); 2885 if (nb_ports > RTE_MAX_ETHPORTS) 2886 nb_ports = RTE_MAX_ETHPORTS; 2887 2888 /* 2889 * Update the global var NUM_PORTS and global array PORTS 2890 * and get value of var VALID_NUM_PORTS according to system ports number 2891 */ 2892 valid_num_ports = check_ports_num(nb_ports); 2893 2894 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2895 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2896 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2897 return -1; 2898 } 2899 2900 if (zero_copy == 0) { 2901 /* Create the mbuf pool. */ 2902 mbuf_pool = rte_mempool_create( 2903 "MBUF_POOL", 2904 NUM_MBUFS_PER_PORT 2905 * valid_num_ports, 2906 MBUF_SIZE, MBUF_CACHE_SIZE, 2907 sizeof(struct rte_pktmbuf_pool_private), 2908 rte_pktmbuf_pool_init, NULL, 2909 rte_pktmbuf_init, NULL, 2910 rte_socket_id(), 0); 2911 if (mbuf_pool == NULL) 2912 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2913 2914 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2915 vpool_array[queue_id].pool = mbuf_pool; 2916 2917 if (vm2vm_mode == VM2VM_HARDWARE) { 2918 /* Enable VT loop back to let L2 switch to do it. */ 2919 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2920 LOG_DEBUG(VHOST_CONFIG, 2921 "Enable loop back for L2 switch in vmdq.\n"); 2922 } 2923 } else { 2924 uint32_t nb_mbuf; 2925 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2926 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2927 2928 /* 2929 * Zero copy defers queue RX/TX start to the time when guest 2930 * finishes its startup and packet buffers from that guest are 2931 * available. 2932 */ 2933 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2934 rx_conf_default.rx_drop_en = 0; 2935 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2936 nb_mbuf = num_rx_descriptor 2937 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2938 + num_switching_cores * MAX_PKT_BURST; 2939 2940 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2941 snprintf(pool_name, sizeof(pool_name), 2942 "rxmbuf_pool_%u", queue_id); 2943 snprintf(ring_name, sizeof(ring_name), 2944 "rxmbuf_ring_%u", queue_id); 2945 setup_mempool_tbl(rte_socket_id(), queue_id, 2946 pool_name, ring_name, nb_mbuf); 2947 } 2948 2949 nb_mbuf = num_tx_descriptor 2950 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2951 + num_switching_cores * MAX_PKT_BURST; 2952 2953 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2954 snprintf(pool_name, sizeof(pool_name), 2955 "txmbuf_pool_%u", queue_id); 2956 snprintf(ring_name, sizeof(ring_name), 2957 "txmbuf_ring_%u", queue_id); 2958 setup_mempool_tbl(rte_socket_id(), 2959 (queue_id + MAX_QUEUES), 2960 pool_name, ring_name, nb_mbuf); 2961 } 2962 2963 if (vm2vm_mode == VM2VM_HARDWARE) { 2964 /* Enable VT loop back to let L2 switch to do it. */ 2965 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2966 LOG_DEBUG(VHOST_CONFIG, 2967 "Enable loop back for L2 switch in vmdq.\n"); 2968 } 2969 } 2970 /* Set log level. */ 2971 rte_set_log_level(LOG_LEVEL); 2972 2973 /* initialize all ports */ 2974 for (portid = 0; portid < nb_ports; portid++) { 2975 /* skip ports that are not enabled */ 2976 if ((enabled_port_mask & (1 << portid)) == 0) { 2977 RTE_LOG(INFO, VHOST_PORT, 2978 "Skipping disabled port %d\n", portid); 2979 continue; 2980 } 2981 if (port_init(portid) != 0) 2982 rte_exit(EXIT_FAILURE, 2983 "Cannot initialize network ports\n"); 2984 } 2985 2986 /* Initialise all linked lists. */ 2987 if (init_data_ll() == -1) 2988 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2989 2990 /* Initialize device stats */ 2991 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2992 2993 /* Enable stats if the user option is set. */ 2994 if (enable_stats) 2995 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2996 2997 /* Launch all data cores. */ 2998 if (zero_copy == 0) { 2999 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3000 rte_eal_remote_launch(switch_worker, 3001 mbuf_pool, lcore_id); 3002 } 3003 } else { 3004 uint32_t count_in_mempool, index, i; 3005 for (index = 0; index < 2*MAX_QUEUES; index++) { 3006 /* For all RX and TX queues. */ 3007 count_in_mempool 3008 = rte_mempool_count(vpool_array[index].pool); 3009 3010 /* 3011 * Transfer all un-attached mbufs from vpool.pool 3012 * to vpoo.ring. 3013 */ 3014 for (i = 0; i < count_in_mempool; i++) { 3015 struct rte_mbuf *mbuf 3016 = __rte_mbuf_raw_alloc( 3017 vpool_array[index].pool); 3018 rte_ring_sp_enqueue(vpool_array[index].ring, 3019 (void *)mbuf); 3020 } 3021 3022 LOG_DEBUG(VHOST_CONFIG, 3023 "in MAIN: mbuf count in mempool at initial " 3024 "is: %d\n", count_in_mempool); 3025 LOG_DEBUG(VHOST_CONFIG, 3026 "in MAIN: mbuf count in ring at initial is :" 3027 " %d\n", 3028 rte_ring_count(vpool_array[index].ring)); 3029 } 3030 3031 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3032 rte_eal_remote_launch(switch_worker_zcp, NULL, 3033 lcore_id); 3034 } 3035 3036 if (mergeable == 0) 3037 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3038 3039 /* Register CUSE device to handle IOCTLs. */ 3040 ret = rte_vhost_driver_register((char *)&dev_basename); 3041 if (ret != 0) 3042 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3043 3044 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3045 3046 /* Start CUSE session. */ 3047 rte_vhost_driver_session_start(); 3048 return 0; 3049 3050 } 3051 3052