/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 128

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +	\
			    (num_switching_cores*MAX_PKT_BURST) +	\
			    (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
			    (num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
		       + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default value of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default value of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default value of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default value of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default value of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default value of TX write-back threshold reg. */

#define MAX_PKT_BURST 32      /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16  /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15   /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4    /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL     0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and the DPDK-based front
 * ends: take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then adjust down to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* Mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the guest's
 * own buffers via the HW descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* Number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
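
/*
 * Note added for clarity (not part of the original description): vpool_array
 * is sized 2 * MAX_QUEUES because, in zero copy mode, entries
 * [0, MAX_QUEUES) back the RX queues (see port_init() and attach_rxmbuf_zcp())
 * while entries [MAX_QUEUES, 2 * MAX_QUEUES) back the per-queue TX path
 * (see virtio_tx_route_zcp()).
 */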

/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
	.rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for
 * other network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0,   /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;     /**< version and header length */
	uint8_t  type_of_service; /**< type of service */
	uint16_t total_length;    /**< length of packet */
	uint16_t packet_id;       /**< packet ID */
	uint16_t fragment_offset; /**< fragmentation offset */
	uint8_t  time_to_live;    /**< time to live */
	uint8_t  next_proto_id;   /**< protocol ID */
	uint16_t hdr_checksum;    /**< header checksum */
	uint32_t src_addr;        /**< source address */
	uint32_t dst_addr;        /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN     4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
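
/*
 * Illustrative note (added, not in the original sources): with the
 * vlan_tags[] table above, get_eth_conf() below maps pool i to VLAN ID
 * vlan_tags[i], so for example pool 0 receives VLAN 1000 and pool 7
 * receives VLAN 1007 when eight devices are configured.
 */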

/*
 * Builds up the correct configuration for the VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified on the command line */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}
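
	/*
	 * Illustrative note (added): each RX queue q above pulls its receive
	 * buffers from vpool_array[q].pool; whether those pools hold ordinary
	 * mbufs or guest-backed zero copy buffers is decided elsewhere in the
	 * application when the pools are created.
	 */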

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		(unsigned)port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Only effective if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable RX/TX "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on RX, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on TX, "
			"used only when zero copy is enabled.\n",
	       prgname);
}
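
/*
 * Hypothetical usage example (added, not from the original sources), based
 * on the options documented above; the binary name and EAL core/memory
 * options are illustrative:
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 *
 * EAL options come before the '--' separator; the application options after
 * it are handled by us_vhost_parse_args() below.
 */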
581 */ 582 static int 583 us_vhost_parse_args(int argc, char **argv) 584 { 585 int opt, ret; 586 int option_index; 587 unsigned i; 588 const char *prgname = argv[0]; 589 static struct option long_option[] = { 590 {"vm2vm", required_argument, NULL, 0}, 591 {"rx-retry", required_argument, NULL, 0}, 592 {"rx-retry-delay", required_argument, NULL, 0}, 593 {"rx-retry-num", required_argument, NULL, 0}, 594 {"mergeable", required_argument, NULL, 0}, 595 {"stats", required_argument, NULL, 0}, 596 {"dev-basename", required_argument, NULL, 0}, 597 {"zero-copy", required_argument, NULL, 0}, 598 {"rx-desc-num", required_argument, NULL, 0}, 599 {"tx-desc-num", required_argument, NULL, 0}, 600 {NULL, 0, 0, 0}, 601 }; 602 603 /* Parse command line */ 604 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 605 switch (opt) { 606 /* Portmask */ 607 case 'p': 608 enabled_port_mask = parse_portmask(optarg); 609 if (enabled_port_mask == 0) { 610 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 611 us_vhost_usage(prgname); 612 return -1; 613 } 614 break; 615 616 case 0: 617 /* Enable/disable vm2vm comms. */ 618 if (!strncmp(long_option[option_index].name, "vm2vm", 619 MAX_LONG_OPT_SZ)) { 620 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 621 if (ret == -1) { 622 RTE_LOG(INFO, VHOST_CONFIG, 623 "Invalid argument for " 624 "vm2vm [0|1|2]\n"); 625 us_vhost_usage(prgname); 626 return -1; 627 } else { 628 vm2vm_mode = (vm2vm_type)ret; 629 } 630 } 631 632 /* Enable/disable retries on RX. */ 633 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 634 ret = parse_num_opt(optarg, 1); 635 if (ret == -1) { 636 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 637 us_vhost_usage(prgname); 638 return -1; 639 } else { 640 enable_retry = ret; 641 } 642 } 643 644 /* Specify the retries delay time (in useconds) on RX. */ 645 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 646 ret = parse_num_opt(optarg, INT32_MAX); 647 if (ret == -1) { 648 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 649 us_vhost_usage(prgname); 650 return -1; 651 } else { 652 burst_rx_delay_time = ret; 653 } 654 } 655 656 /* Specify the retries number on RX. */ 657 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 658 ret = parse_num_opt(optarg, INT32_MAX); 659 if (ret == -1) { 660 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 661 us_vhost_usage(prgname); 662 return -1; 663 } else { 664 burst_rx_retry_num = ret; 665 } 666 } 667 668 /* Enable/disable RX mergeable buffers. */ 669 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 670 ret = parse_num_opt(optarg, 1); 671 if (ret == -1) { 672 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } else { 676 if (ret) { 677 vmdq_conf_default.rxmode.jumbo_frame = 1; 678 vmdq_conf_default.rxmode.max_rx_pkt_len 679 = JUMBO_FRAME_MAX_SIZE; 680 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF); 681 } 682 } 683 } 684 685 /* Enable/disable stats. */ 686 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 687 ret = parse_num_opt(optarg, INT32_MAX); 688 if (ret == -1) { 689 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 690 us_vhost_usage(prgname); 691 return -1; 692 } else { 693 enable_stats = ret; 694 } 695 } 696 697 /* Set character device basename. 

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable RX/TX zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
						"zero copy vhost APP, please "
						"disable RTE_MBUF_REFCNT\n"
						"in config file and then rebuild DPDK "
						"core lib!\n"
						"Otherwise please disable zero copy "
						"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
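
/*
 * Illustrative note (added): PRINT_PACKET is only compiled in when DEBUG is
 * defined. A typical call from the data path looks like
 *
 *   PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
 *
 * which dumps desc->len bytes at buff_addr as hex, prefixed with the device
 * fh, into a single LOG_DEBUG line.
 */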
856 */ 857 static inline uint64_t __attribute__((always_inline)) 858 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 859 uint32_t buf_len, hpa_type *addr_type) 860 { 861 struct virtio_memory_regions_hpa *region; 862 uint32_t regionidx; 863 uint64_t vhost_pa = 0; 864 865 *addr_type = PHYS_ADDR_INVALID; 866 867 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 868 region = &vdev->regions_hpa[regionidx]; 869 if ((guest_pa >= region->guest_phys_address) && 870 (guest_pa <= region->guest_phys_address_end)) { 871 vhost_pa = region->host_phys_addr_offset + guest_pa; 872 if (likely((guest_pa + buf_len - 1) 873 <= region->guest_phys_address_end)) 874 *addr_type = PHYS_ADDR_CONTINUOUS; 875 else 876 *addr_type = PHYS_ADDR_CROSS_SUBREG; 877 break; 878 } 879 } 880 881 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 882 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 883 (void *)(uintptr_t)vhost_pa); 884 885 return vhost_pa; 886 } 887 888 /* 889 * Compares a packet destination MAC address to a device MAC address. 890 */ 891 static inline int __attribute__((always_inline)) 892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 893 { 894 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 895 } 896 897 /* 898 * This function learns the MAC address of the device and registers this along with a 899 * vlan tag to a VMDQ. 900 */ 901 static int 902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 903 { 904 struct ether_hdr *pkt_hdr; 905 struct virtio_net_data_ll *dev_ll; 906 struct virtio_net *dev = vdev->dev; 907 int i, ret; 908 909 /* Learn MAC address of guest device from packet */ 910 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 911 912 dev_ll = ll_root_used; 913 914 while (dev_ll != NULL) { 915 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 916 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 917 return -1; 918 } 919 dev_ll = dev_ll->next; 920 } 921 922 for (i = 0; i < ETHER_ADDR_LEN; i++) 923 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 924 925 /* vlan_tag currently uses the device_id. */ 926 vdev->vlan_tag = vlan_tags[dev->device_fh]; 927 928 /* Print out VMDQ registration info. */ 929 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 930 dev->device_fh, 931 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 932 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 933 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 934 vdev->vlan_tag); 935 936 /* Register the MAC address. */ 937 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 938 if (ret) 939 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 940 dev->device_fh); 941 942 /* Enable stripping of the vlan tag as we handle routing. */ 943 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 944 945 /* Set device as ready for RX. */ 946 vdev->ready = DEVICE_RX; 947 948 return 0; 949 } 950 951 /* 952 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 953 * queue before disabling RX on the device. 
954 */ 955 static inline void 956 unlink_vmdq(struct vhost_dev *vdev) 957 { 958 unsigned i = 0; 959 unsigned rx_count; 960 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 961 962 if (vdev->ready == DEVICE_RX) { 963 /*clear MAC and VLAN settings*/ 964 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 965 for (i = 0; i < 6; i++) 966 vdev->mac_address.addr_bytes[i] = 0; 967 968 vdev->vlan_tag = 0; 969 970 /*Clear out the receive buffers*/ 971 rx_count = rte_eth_rx_burst(ports[0], 972 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 973 974 while (rx_count) { 975 for (i = 0; i < rx_count; i++) 976 rte_pktmbuf_free(pkts_burst[i]); 977 978 rx_count = rte_eth_rx_burst(ports[0], 979 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 980 } 981 982 vdev->ready = DEVICE_MAC_LEARNING; 983 } 984 } 985 986 /* 987 * Check if the packet destination MAC address is for a local device. If so then put 988 * the packet on that devices RX queue. If not then return. 989 */ 990 static inline unsigned __attribute__((always_inline)) 991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 992 { 993 struct virtio_net_data_ll *dev_ll; 994 struct ether_hdr *pkt_hdr; 995 uint64_t ret = 0; 996 struct virtio_net *dev = vdev->dev; 997 struct virtio_net *tdev; /* destination virito device */ 998 999 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1000 1001 /*get the used devices list*/ 1002 dev_ll = ll_root_used; 1003 1004 while (dev_ll != NULL) { 1005 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1006 &dev_ll->vdev->mac_address)) { 1007 1008 /* Drop the packet if the TX packet is destined for the TX device. */ 1009 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1010 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1011 dev->device_fh); 1012 return 0; 1013 } 1014 tdev = dev_ll->vdev->dev; 1015 1016 1017 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1018 1019 if (dev_ll->vdev->remove) { 1020 /*drop the packet if the device is marked for removal*/ 1021 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1022 } else { 1023 /*send the packet to the local virtio device*/ 1024 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1025 if (enable_stats) { 1026 rte_atomic64_add( 1027 &dev_statistics[tdev->device_fh].rx_total_atomic, 1028 1); 1029 rte_atomic64_add( 1030 &dev_statistics[tdev->device_fh].rx_atomic, 1031 ret); 1032 dev_statistics[tdev->device_fh].tx_total++; 1033 dev_statistics[tdev->device_fh].tx += ret; 1034 } 1035 } 1036 1037 return 0; 1038 } 1039 dev_ll = dev_ll->next; 1040 } 1041 1042 return -1; 1043 } 1044 1045 /* 1046 * This function routes the TX packet to the correct interface. This may be a local device 1047 * or the physical port. 
1048 */ 1049 static inline void __attribute__((always_inline)) 1050 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1051 { 1052 struct mbuf_table *tx_q; 1053 struct rte_mbuf **m_table; 1054 unsigned len, ret, offset = 0; 1055 const uint16_t lcore_id = rte_lcore_id(); 1056 struct virtio_net_data_ll *dev_ll = ll_root_used; 1057 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1058 struct virtio_net *dev = vdev->dev; 1059 1060 /*check if destination is local VM*/ 1061 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1062 rte_pktmbuf_free(m); 1063 return; 1064 } 1065 1066 if (vm2vm_mode == VM2VM_HARDWARE) { 1067 while (dev_ll != NULL) { 1068 if ((dev_ll->vdev->ready == DEVICE_RX) 1069 && ether_addr_cmp(&(pkt_hdr->d_addr), 1070 &dev_ll->vdev->mac_address)) { 1071 /* 1072 * Drop the packet if the TX packet is 1073 * destined for the TX device. 1074 */ 1075 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1076 LOG_DEBUG(VHOST_DATA, 1077 "(%"PRIu64") TX: Source and destination" 1078 " MAC addresses are the same. Dropping " 1079 "packet.\n", 1080 dev_ll->vdev->dev->device_fh); 1081 rte_pktmbuf_free(m); 1082 return; 1083 } 1084 offset = 4; 1085 vlan_tag = 1086 (uint16_t) 1087 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1088 1089 LOG_DEBUG(VHOST_DATA, 1090 "(%"PRIu64") TX: pkt to local VM device id:" 1091 "(%"PRIu64") vlan tag: %d.\n", 1092 dev->device_fh, dev_ll->vdev->dev->device_fh, 1093 vlan_tag); 1094 1095 break; 1096 } 1097 dev_ll = dev_ll->next; 1098 } 1099 } 1100 1101 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1102 1103 /*Add packet to the port tx queue*/ 1104 tx_q = &lcore_tx_queue[lcore_id]; 1105 len = tx_q->len; 1106 1107 m->ol_flags = PKT_TX_VLAN_PKT; 1108 /*FIXME: offset*/ 1109 m->data_len += offset; 1110 m->vlan_tci = vlan_tag; 1111 1112 tx_q->m_table[len] = m; 1113 len++; 1114 if (enable_stats) { 1115 dev_statistics[dev->device_fh].tx_total++; 1116 dev_statistics[dev->device_fh].tx++; 1117 } 1118 1119 if (unlikely(len == MAX_PKT_BURST)) { 1120 m_table = (struct rte_mbuf **)tx_q->m_table; 1121 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1122 /* Free any buffers not handled by TX and update the port stats. */ 1123 if (unlikely(ret < len)) { 1124 do { 1125 rte_pktmbuf_free(m_table[ret]); 1126 } while (++ret < len); 1127 } 1128 1129 len = 0; 1130 } 1131 1132 tx_q->len = len; 1133 return; 1134 } 1135 /* 1136 * This function is called by each data core. It handles all RX/TX registered with the 1137 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1138 * with all devices in the main linked list. 
1139 */ 1140 static int 1141 switch_worker(__attribute__((unused)) void *arg) 1142 { 1143 struct rte_mempool *mbuf_pool = arg; 1144 struct virtio_net *dev = NULL; 1145 struct vhost_dev *vdev = NULL; 1146 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1147 struct virtio_net_data_ll *dev_ll; 1148 struct mbuf_table *tx_q; 1149 volatile struct lcore_ll_info *lcore_ll; 1150 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1151 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1152 unsigned ret, i; 1153 const uint16_t lcore_id = rte_lcore_id(); 1154 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1155 uint16_t rx_count = 0; 1156 uint16_t tx_count; 1157 uint32_t retry = 0; 1158 1159 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1160 lcore_ll = lcore_info[lcore_id].lcore_ll; 1161 prev_tsc = 0; 1162 1163 tx_q = &lcore_tx_queue[lcore_id]; 1164 for (i = 0; i < num_cores; i ++) { 1165 if (lcore_ids[i] == lcore_id) { 1166 tx_q->txq_id = i; 1167 break; 1168 } 1169 } 1170 1171 while(1) { 1172 cur_tsc = rte_rdtsc(); 1173 /* 1174 * TX burst queue drain 1175 */ 1176 diff_tsc = cur_tsc - prev_tsc; 1177 if (unlikely(diff_tsc > drain_tsc)) { 1178 1179 if (tx_q->len) { 1180 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1181 1182 /*Tx any packets in the queue*/ 1183 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1184 (struct rte_mbuf **)tx_q->m_table, 1185 (uint16_t)tx_q->len); 1186 if (unlikely(ret < tx_q->len)) { 1187 do { 1188 rte_pktmbuf_free(tx_q->m_table[ret]); 1189 } while (++ret < tx_q->len); 1190 } 1191 1192 tx_q->len = 0; 1193 } 1194 1195 prev_tsc = cur_tsc; 1196 1197 } 1198 1199 rte_prefetch0(lcore_ll->ll_root_used); 1200 /* 1201 * Inform the configuration core that we have exited the linked list and that no devices are 1202 * in use if requested. 
1203 */ 1204 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1205 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1206 1207 /* 1208 * Process devices 1209 */ 1210 dev_ll = lcore_ll->ll_root_used; 1211 1212 while (dev_ll != NULL) { 1213 /*get virtio device ID*/ 1214 vdev = dev_ll->vdev; 1215 dev = vdev->dev; 1216 1217 if (vdev->remove) { 1218 dev_ll = dev_ll->next; 1219 unlink_vmdq(vdev); 1220 vdev->ready = DEVICE_SAFE_REMOVE; 1221 continue; 1222 } 1223 if (likely(vdev->ready == DEVICE_RX)) { 1224 /*Handle guest RX*/ 1225 rx_count = rte_eth_rx_burst(ports[0], 1226 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1227 1228 if (rx_count) { 1229 /* 1230 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1231 * Here MAX_PKT_BURST must be less than virtio queue size 1232 */ 1233 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1234 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1235 rte_delay_us(burst_rx_delay_time); 1236 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1237 break; 1238 } 1239 } 1240 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1241 if (enable_stats) { 1242 rte_atomic64_add( 1243 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1244 rx_count); 1245 rte_atomic64_add( 1246 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1247 } 1248 while (likely(rx_count)) { 1249 rx_count--; 1250 rte_pktmbuf_free(pkts_burst[rx_count]); 1251 } 1252 1253 } 1254 } 1255 1256 if (!vdev->remove) { 1257 /* Handle guest TX*/ 1258 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1259 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1260 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1261 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1262 while (tx_count--) 1263 rte_pktmbuf_free(pkts_burst[tx_count]); 1264 } 1265 } 1266 while (tx_count) 1267 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1268 } 1269 1270 /*move to the next device in the list*/ 1271 dev_ll = dev_ll->next; 1272 } 1273 } 1274 1275 return 0; 1276 } 1277 1278 /* 1279 * This function gets available ring number for zero copy rx. 1280 * Only one thread will call this funciton for a paticular virtio device, 1281 * so, it is designed as non-thread-safe function. 1282 */ 1283 static inline uint32_t __attribute__((always_inline)) 1284 get_available_ring_num_zcp(struct virtio_net *dev) 1285 { 1286 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1287 uint16_t avail_idx; 1288 1289 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1290 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1291 } 1292 1293 /* 1294 * This function gets available ring index for zero copy rx, 1295 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1296 * Only one thread will call this funciton for a paticular virtio device, 1297 * so, it is designed as non-thread-safe function. 
1298 */ 1299 static inline uint32_t __attribute__((always_inline)) 1300 get_available_ring_index_zcp(struct virtio_net *dev, 1301 uint16_t *res_base_idx, uint32_t count) 1302 { 1303 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1304 uint16_t avail_idx; 1305 uint32_t retry = 0; 1306 uint16_t free_entries; 1307 1308 *res_base_idx = vq->last_used_idx_res; 1309 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1310 free_entries = (avail_idx - *res_base_idx); 1311 1312 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1313 "avail idx: %d, " 1314 "res base idx:%d, free entries:%d\n", 1315 dev->device_fh, avail_idx, *res_base_idx, 1316 free_entries); 1317 1318 /* 1319 * If retry is enabled and the queue is full then we wait 1320 * and retry to avoid packet loss. 1321 */ 1322 if (enable_retry && unlikely(count > free_entries)) { 1323 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1324 rte_delay_us(burst_rx_delay_time); 1325 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1326 free_entries = (avail_idx - *res_base_idx); 1327 if (count <= free_entries) 1328 break; 1329 } 1330 } 1331 1332 /*check that we have enough buffers*/ 1333 if (unlikely(count > free_entries)) 1334 count = free_entries; 1335 1336 if (unlikely(count == 0)) { 1337 LOG_DEBUG(VHOST_DATA, 1338 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1339 "avail idx: %d, res base idx:%d, free entries:%d\n", 1340 dev->device_fh, avail_idx, 1341 *res_base_idx, free_entries); 1342 return 0; 1343 } 1344 1345 vq->last_used_idx_res = *res_base_idx + count; 1346 1347 return count; 1348 } 1349 1350 /* 1351 * This function put descriptor back to used list. 1352 */ 1353 static inline void __attribute__((always_inline)) 1354 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1355 { 1356 uint16_t res_cur_idx = vq->last_used_idx; 1357 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1358 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1359 rte_compiler_barrier(); 1360 *(volatile uint16_t *)&vq->used->idx += 1; 1361 vq->last_used_idx += 1; 1362 1363 /* Kick the guest if necessary. */ 1364 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1365 eventfd_write((int)vq->kickfd, 1); 1366 } 1367 1368 /* 1369 * This function get available descriptor from vitio vring and un-attached mbuf 1370 * from vpool->ring, and then attach them together. It needs adjust the offset 1371 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1372 * frame data may be put to wrong location in mbuf. 
1373 */ 1374 static inline void __attribute__((always_inline)) 1375 attach_rxmbuf_zcp(struct virtio_net *dev) 1376 { 1377 uint16_t res_base_idx, desc_idx; 1378 uint64_t buff_addr, phys_addr; 1379 struct vhost_virtqueue *vq; 1380 struct vring_desc *desc; 1381 struct rte_mbuf *mbuf = NULL; 1382 struct vpool *vpool; 1383 hpa_type addr_type; 1384 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1385 1386 vpool = &vpool_array[vdev->vmdq_rx_q]; 1387 vq = dev->virtqueue[VIRTIO_RXQ]; 1388 1389 do { 1390 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1391 1) != 1)) 1392 return; 1393 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1394 1395 desc = &vq->desc[desc_idx]; 1396 if (desc->flags & VRING_DESC_F_NEXT) { 1397 desc = &vq->desc[desc->next]; 1398 buff_addr = gpa_to_vva(dev, desc->addr); 1399 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1400 &addr_type); 1401 } else { 1402 buff_addr = gpa_to_vva(dev, 1403 desc->addr + vq->vhost_hlen); 1404 phys_addr = gpa_to_hpa(vdev, 1405 desc->addr + vq->vhost_hlen, 1406 desc->len, &addr_type); 1407 } 1408 1409 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1410 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1411 " address found when attaching RX frame buffer" 1412 " address!\n", dev->device_fh); 1413 put_desc_to_used_list_zcp(vq, desc_idx); 1414 continue; 1415 } 1416 1417 /* 1418 * Check if the frame buffer address from guest crosses 1419 * sub-region or not. 1420 */ 1421 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1422 RTE_LOG(ERR, VHOST_DATA, 1423 "(%"PRIu64") Frame buffer address cross " 1424 "sub-regioin found when attaching RX frame " 1425 "buffer address!\n", 1426 dev->device_fh); 1427 put_desc_to_used_list_zcp(vq, desc_idx); 1428 continue; 1429 } 1430 } while (unlikely(phys_addr == 0)); 1431 1432 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1433 if (unlikely(mbuf == NULL)) { 1434 LOG_DEBUG(VHOST_DATA, 1435 "(%"PRIu64") in attach_rxmbuf_zcp: " 1436 "ring_sc_dequeue fail.\n", 1437 dev->device_fh); 1438 put_desc_to_used_list_zcp(vq, desc_idx); 1439 return; 1440 } 1441 1442 if (unlikely(vpool->buf_size > desc->len)) { 1443 LOG_DEBUG(VHOST_DATA, 1444 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1445 "length(%d) of descriptor idx: %d less than room " 1446 "size required: %d\n", 1447 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1448 put_desc_to_used_list_zcp(vq, desc_idx); 1449 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1450 return; 1451 } 1452 1453 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1454 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1455 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1456 mbuf->data_len = desc->len; 1457 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1458 1459 LOG_DEBUG(VHOST_DATA, 1460 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1461 "descriptor idx:%d\n", 1462 dev->device_fh, res_base_idx, desc_idx); 1463 1464 __rte_mbuf_raw_free(mbuf); 1465 1466 return; 1467 } 1468 1469 /* 1470 * Detach an attched packet mbuf - 1471 * - restore original mbuf address and length values. 1472 * - reset pktmbuf data and data_len to their default values. 1473 * All other fields of the given packet mbuf will be left intact. 1474 * 1475 * @param m 1476 * The attached packet mbuf. 
1477 */ 1478 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1479 { 1480 const struct rte_mempool *mp = m->pool; 1481 void *buf = RTE_MBUF_TO_BADDR(m); 1482 uint32_t buf_ofs; 1483 uint32_t buf_len = mp->elt_size - sizeof(*m); 1484 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1485 1486 m->buf_addr = buf; 1487 m->buf_len = (uint16_t)buf_len; 1488 1489 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1490 RTE_PKTMBUF_HEADROOM : m->buf_len; 1491 m->data_off = buf_ofs; 1492 1493 m->data_len = 0; 1494 } 1495 1496 /* 1497 * This function is called after packets have been transimited. It fetchs mbuf 1498 * from vpool->pool, detached it and put into vpool->ring. It also update the 1499 * used index and kick the guest if necessary. 1500 */ 1501 static inline uint32_t __attribute__((always_inline)) 1502 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1503 { 1504 struct rte_mbuf *mbuf; 1505 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1506 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1507 uint32_t index = 0; 1508 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1509 1510 LOG_DEBUG(VHOST_DATA, 1511 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1512 "clean is: %d\n", 1513 dev->device_fh, mbuf_count); 1514 LOG_DEBUG(VHOST_DATA, 1515 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1516 "clean is : %d\n", 1517 dev->device_fh, rte_ring_count(vpool->ring)); 1518 1519 for (index = 0; index < mbuf_count; index++) { 1520 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1521 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1522 pktmbuf_detach_zcp(mbuf); 1523 rte_ring_sp_enqueue(vpool->ring, mbuf); 1524 1525 /* Update used index buffer information. */ 1526 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1527 vq->used->ring[used_idx].len = 0; 1528 1529 used_idx = (used_idx + 1) & (vq->size - 1); 1530 } 1531 1532 LOG_DEBUG(VHOST_DATA, 1533 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1534 "clean is: %d\n", 1535 dev->device_fh, rte_mempool_count(vpool->pool)); 1536 LOG_DEBUG(VHOST_DATA, 1537 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1538 "clean is : %d\n", 1539 dev->device_fh, rte_ring_count(vpool->ring)); 1540 LOG_DEBUG(VHOST_DATA, 1541 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1542 "vq->last_used_idx:%d\n", 1543 dev->device_fh, vq->last_used_idx); 1544 1545 vq->last_used_idx += mbuf_count; 1546 1547 LOG_DEBUG(VHOST_DATA, 1548 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1549 "vq->last_used_idx:%d\n", 1550 dev->device_fh, vq->last_used_idx); 1551 1552 rte_compiler_barrier(); 1553 1554 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1555 1556 /* Kick guest if required. */ 1557 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1558 eventfd_write((int)vq->kickfd, 1); 1559 1560 return 0; 1561 } 1562 1563 /* 1564 * This function is called when a virtio device is destroy. 1565 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1566 */ 1567 static void mbuf_destroy_zcp(struct vpool *vpool) 1568 { 1569 struct rte_mbuf *mbuf = NULL; 1570 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1571 1572 LOG_DEBUG(VHOST_CONFIG, 1573 "in mbuf_destroy_zcp: mbuf count in mempool before " 1574 "mbuf_destroy_zcp is: %d\n", 1575 mbuf_count); 1576 LOG_DEBUG(VHOST_CONFIG, 1577 "in mbuf_destroy_zcp: mbuf count in ring before " 1578 "mbuf_destroy_zcp is : %d\n", 1579 rte_ring_count(vpool->ring)); 1580 1581 for (index = 0; index < mbuf_count; index++) { 1582 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1583 if (likely(mbuf != NULL)) { 1584 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1585 pktmbuf_detach_zcp(mbuf); 1586 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1587 } 1588 } 1589 1590 LOG_DEBUG(VHOST_CONFIG, 1591 "in mbuf_destroy_zcp: mbuf count in mempool after " 1592 "mbuf_destroy_zcp is: %d\n", 1593 rte_mempool_count(vpool->pool)); 1594 LOG_DEBUG(VHOST_CONFIG, 1595 "in mbuf_destroy_zcp: mbuf count in ring after " 1596 "mbuf_destroy_zcp is : %d\n", 1597 rte_ring_count(vpool->ring)); 1598 } 1599 1600 /* 1601 * This function update the use flag and counter. 1602 */ 1603 static inline uint32_t __attribute__((always_inline)) 1604 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1605 uint32_t count) 1606 { 1607 struct vhost_virtqueue *vq; 1608 struct vring_desc *desc; 1609 struct rte_mbuf *buff; 1610 /* The virtio_hdr is initialised to 0. */ 1611 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1612 = {{0, 0, 0, 0, 0, 0}, 0}; 1613 uint64_t buff_hdr_addr = 0; 1614 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1615 uint32_t head_idx, packet_success = 0; 1616 uint16_t res_cur_idx; 1617 1618 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1619 1620 if (count == 0) 1621 return 0; 1622 1623 vq = dev->virtqueue[VIRTIO_RXQ]; 1624 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1625 1626 res_cur_idx = vq->last_used_idx; 1627 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1628 dev->device_fh, res_cur_idx, res_cur_idx + count); 1629 1630 /* Retrieve all of the head indexes first to avoid caching issues. */ 1631 for (head_idx = 0; head_idx < count; head_idx++) 1632 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1633 1634 /*Prefetch descriptor index. */ 1635 rte_prefetch0(&vq->desc[head[packet_success]]); 1636 1637 while (packet_success != count) { 1638 /* Get descriptor from available ring */ 1639 desc = &vq->desc[head[packet_success]]; 1640 1641 buff = pkts[packet_success]; 1642 LOG_DEBUG(VHOST_DATA, 1643 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1644 "pkt[%d] descriptor idx: %d\n", 1645 dev->device_fh, packet_success, 1646 MBUF_HEADROOM_UINT32(buff)); 1647 1648 PRINT_PACKET(dev, 1649 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1650 + RTE_PKTMBUF_HEADROOM), 1651 rte_pktmbuf_data_len(buff), 0); 1652 1653 /* Buffer address translation for virtio header. */ 1654 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1655 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1656 1657 /* 1658 * If the descriptors are chained the header and data are 1659 * placed in separate buffers. 
1660 */ 1661 if (desc->flags & VRING_DESC_F_NEXT) { 1662 desc->len = vq->vhost_hlen; 1663 desc = &vq->desc[desc->next]; 1664 desc->len = rte_pktmbuf_data_len(buff); 1665 } else { 1666 desc->len = packet_len; 1667 } 1668 1669 /* Update used ring with desc information */ 1670 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1671 = head[packet_success]; 1672 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1673 = packet_len; 1674 res_cur_idx++; 1675 packet_success++; 1676 1677 /* A header is required per buffer. */ 1678 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1679 (const void *)&virtio_hdr, vq->vhost_hlen); 1680 1681 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1682 1683 if (likely(packet_success < count)) { 1684 /* Prefetch descriptor index. */ 1685 rte_prefetch0(&vq->desc[head[packet_success]]); 1686 } 1687 } 1688 1689 rte_compiler_barrier(); 1690 1691 LOG_DEBUG(VHOST_DATA, 1692 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1693 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1694 dev->device_fh, vq->last_used_idx, vq->used->idx); 1695 1696 *(volatile uint16_t *)&vq->used->idx += count; 1697 vq->last_used_idx += count; 1698 1699 LOG_DEBUG(VHOST_DATA, 1700 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1701 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1702 dev->device_fh, vq->last_used_idx, vq->used->idx); 1703 1704 /* Kick the guest if necessary. */ 1705 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1706 eventfd_write((int)vq->kickfd, 1); 1707 1708 return count; 1709 } 1710 1711 /* 1712 * This function routes the TX packet to the correct interface. 1713 * This may be a local device or the physical port. 1714 */ 1715 static inline void __attribute__((always_inline)) 1716 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1717 uint32_t desc_idx, uint8_t need_copy) 1718 { 1719 struct mbuf_table *tx_q; 1720 struct rte_mbuf **m_table; 1721 struct rte_mbuf *mbuf = NULL; 1722 unsigned len, ret, offset = 0; 1723 struct vpool *vpool; 1724 struct virtio_net_data_ll *dev_ll = ll_root_used; 1725 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1726 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1727 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1728 1729 /*Add packet to the port tx queue*/ 1730 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1731 len = tx_q->len; 1732 1733 /* Allocate an mbuf and populate the structure. */ 1734 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1735 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1736 if (unlikely(mbuf == NULL)) { 1737 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1738 RTE_LOG(ERR, VHOST_DATA, 1739 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1740 dev->device_fh); 1741 put_desc_to_used_list_zcp(vq, desc_idx); 1742 return; 1743 } 1744 1745 if (vm2vm_mode == VM2VM_HARDWARE) { 1746 /* Avoid using a vlan tag from any vm for external pkt, such as 1747 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1748 * selection, MAC address determines it as an external pkt 1749 * which should go to network, while vlan tag determine it as 1750 * a vm2vm pkt should forward to another vm. Hardware confuse 1751 * such a ambiguous situation, so pkt will lost. 
1752 */ 1753 vlan_tag = external_pkt_default_vlan_tag; 1754 while (dev_ll != NULL) { 1755 if (likely(dev_ll->vdev->ready == DEVICE_RX) && 1756 ether_addr_cmp(&(pkt_hdr->d_addr), 1757 &dev_ll->vdev->mac_address)) { 1758 1759 /* 1760 * Drop the packet if the TX packet is destined 1761 * for the TX device. 1762 */ 1763 if (unlikely(dev_ll->vdev->dev->device_fh 1764 == dev->device_fh)) { 1765 LOG_DEBUG(VHOST_DATA, 1766 "(%"PRIu64") TX: Source and destination" 1767 "MAC addresses are the same. Dropping " 1768 "packet.\n", 1769 dev_ll->vdev->dev->device_fh); 1770 MBUF_HEADROOM_UINT32(mbuf) 1771 = (uint32_t)desc_idx; 1772 __rte_mbuf_raw_free(mbuf); 1773 return; 1774 } 1775 1776 /* 1777 * Packet length offset 4 bytes for HW vlan 1778 * strip when L2 switch back. 1779 */ 1780 offset = 4; 1781 vlan_tag = 1782 (uint16_t) 1783 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1784 1785 LOG_DEBUG(VHOST_DATA, 1786 "(%"PRIu64") TX: pkt to local VM device id:" 1787 "(%"PRIu64") vlan tag: %d.\n", 1788 dev->device_fh, dev_ll->vdev->dev->device_fh, 1789 vlan_tag); 1790 1791 break; 1792 } 1793 dev_ll = dev_ll->next; 1794 } 1795 } 1796 1797 mbuf->nb_segs = m->nb_segs; 1798 mbuf->next = m->next; 1799 mbuf->data_len = m->data_len + offset; 1800 mbuf->pkt_len = mbuf->data_len; 1801 if (unlikely(need_copy)) { 1802 /* Copy the packet contents to the mbuf. */ 1803 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1804 rte_pktmbuf_mtod(m, void *), 1805 m->data_len); 1806 } else { 1807 mbuf->data_off = m->data_off; 1808 mbuf->buf_physaddr = m->buf_physaddr; 1809 mbuf->buf_addr = m->buf_addr; 1810 } 1811 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1812 mbuf->vlan_tci = vlan_tag; 1813 mbuf->l2_len = sizeof(struct ether_hdr); 1814 mbuf->l3_len = sizeof(struct ipv4_hdr); 1815 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1816 1817 tx_q->m_table[len] = mbuf; 1818 len++; 1819 1820 LOG_DEBUG(VHOST_DATA, 1821 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1822 dev->device_fh, 1823 mbuf->nb_segs, 1824 (mbuf->next == NULL) ? "null" : "non-null"); 1825 1826 if (enable_stats) { 1827 dev_statistics[dev->device_fh].tx_total++; 1828 dev_statistics[dev->device_fh].tx++; 1829 } 1830 1831 if (unlikely(len == MAX_PKT_BURST)) { 1832 m_table = (struct rte_mbuf **)tx_q->m_table; 1833 ret = rte_eth_tx_burst(ports[0], 1834 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1835 1836 /* 1837 * Free any buffers not handled by TX and update 1838 * the port stats. 1839 */ 1840 if (unlikely(ret < len)) { 1841 do { 1842 rte_pktmbuf_free(m_table[ret]); 1843 } while (++ret < len); 1844 } 1845 1846 len = 0; 1847 txmbuf_clean_zcp(dev, vpool); 1848 } 1849 1850 tx_q->len = len; 1851 1852 return; 1853 } 1854 1855 /* 1856 * This function TX all available packets in virtio TX queue for one 1857 * virtio-net device. If it is first packet, it learns MAC address and 1858 * setup VMDQ. 1859 */ 1860 static inline void __attribute__((always_inline)) 1861 virtio_dev_tx_zcp(struct virtio_net *dev) 1862 { 1863 struct rte_mbuf m; 1864 struct vhost_virtqueue *vq; 1865 struct vring_desc *desc; 1866 uint64_t buff_addr = 0, phys_addr; 1867 uint32_t head[MAX_PKT_BURST]; 1868 uint32_t i; 1869 uint16_t free_entries, packet_success = 0; 1870 uint16_t avail_idx; 1871 uint8_t need_copy = 0; 1872 hpa_type addr_type; 1873 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1874 1875 vq = dev->virtqueue[VIRTIO_TXQ]; 1876 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1877 1878 /* If there are no available buffers then return. 
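	 * last_used_idx_res records how far into the available ring this
	 * core has already reserved entries, so when it equals the guest's
	 * avail->idx there is nothing new to dequeue.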
*/ 1879 if (vq->last_used_idx_res == avail_idx) 1880 return; 1881 1882 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1883 1884 /* Prefetch available ring to retrieve head indexes. */ 1885 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1886 1887 /* Get the number of free entries in the ring */ 1888 free_entries = (avail_idx - vq->last_used_idx_res); 1889 1890 /* Limit to MAX_PKT_BURST. */ 1891 free_entries 1892 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1893 1894 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1895 dev->device_fh, free_entries); 1896 1897 /* Retrieve all of the head indexes first to avoid caching issues. */ 1898 for (i = 0; i < free_entries; i++) 1899 head[i] 1900 = vq->avail->ring[(vq->last_used_idx_res + i) 1901 & (vq->size - 1)]; 1902 1903 vq->last_used_idx_res += free_entries; 1904 1905 /* Prefetch descriptor index. */ 1906 rte_prefetch0(&vq->desc[head[packet_success]]); 1907 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1908 1909 while (packet_success < free_entries) { 1910 desc = &vq->desc[head[packet_success]]; 1911 1912 /* Discard first buffer as it is the virtio header */ 1913 desc = &vq->desc[desc->next]; 1914 1915 /* Buffer address translation. */ 1916 buff_addr = gpa_to_vva(dev, desc->addr); 1917 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1918 1919 if (likely(packet_success < (free_entries - 1))) 1920 /* Prefetch descriptor index. */ 1921 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1922 1923 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1924 RTE_LOG(ERR, VHOST_DATA, 1925 "(%"PRIu64") Invalid frame buffer address found" 1926 "when TX packets!\n", 1927 dev->device_fh); 1928 packet_success++; 1929 continue; 1930 } 1931 1932 /* Prefetch buffer address. */ 1933 rte_prefetch0((void *)(uintptr_t)buff_addr); 1934 1935 /* 1936 * Setup dummy mbuf. This is copied to a real mbuf if 1937 * transmitted out the physical port. 1938 */ 1939 m.data_len = desc->len; 1940 m.nb_segs = 1; 1941 m.next = NULL; 1942 m.data_off = 0; 1943 m.buf_addr = (void *)(uintptr_t)buff_addr; 1944 m.buf_physaddr = phys_addr; 1945 1946 /* 1947 * Check if the frame buffer address from guest crosses 1948 * sub-region or not. 1949 */ 1950 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1951 RTE_LOG(ERR, VHOST_DATA, 1952 "(%"PRIu64") Frame buffer address cross " 1953 "sub-regioin found when attaching TX frame " 1954 "buffer address!\n", 1955 dev->device_fh); 1956 need_copy = 1; 1957 } else 1958 need_copy = 0; 1959 1960 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1961 1962 /* 1963 * If this is the first received packet we need to learn 1964 * the MAC and setup VMDQ 1965 */ 1966 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1967 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1968 /* 1969 * Discard frame if device is scheduled for 1970 * removal or a duplicate MAC address is found. 1971 */ 1972 packet_success += free_entries; 1973 vq->last_used_idx += packet_success; 1974 break; 1975 } 1976 } 1977 1978 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1979 packet_success++; 1980 } 1981 } 1982 1983 /* 1984 * This function is called by each data core. It handles all RX/TX registered 1985 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1986 * addresses are compared with all devices in the main linked list. 
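 * The loop below also drains any partially filled TX burst once
 * BURST_TX_DRAIN_US microseconds (drain_tsc, converted to TSC cycles)
 * have elapsed, so small bursts are not held back indefinitely.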
1987 */ 1988 static int 1989 switch_worker_zcp(__attribute__((unused)) void *arg) 1990 { 1991 struct virtio_net *dev = NULL; 1992 struct vhost_dev *vdev = NULL; 1993 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1994 struct virtio_net_data_ll *dev_ll; 1995 struct mbuf_table *tx_q; 1996 volatile struct lcore_ll_info *lcore_ll; 1997 const uint64_t drain_tsc 1998 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 1999 * BURST_TX_DRAIN_US; 2000 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2001 unsigned ret; 2002 const uint16_t lcore_id = rte_lcore_id(); 2003 uint16_t count_in_ring, rx_count = 0; 2004 2005 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2006 2007 lcore_ll = lcore_info[lcore_id].lcore_ll; 2008 prev_tsc = 0; 2009 2010 while (1) { 2011 cur_tsc = rte_rdtsc(); 2012 2013 /* TX burst queue drain */ 2014 diff_tsc = cur_tsc - prev_tsc; 2015 if (unlikely(diff_tsc > drain_tsc)) { 2016 /* 2017 * Get mbuf from vpool.pool and detach mbuf and 2018 * put back into vpool.ring. 2019 */ 2020 dev_ll = lcore_ll->ll_root_used; 2021 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2022 /* Get virtio device ID */ 2023 vdev = dev_ll->vdev; 2024 dev = vdev->dev; 2025 2026 if (likely(!vdev->remove)) { 2027 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2028 if (tx_q->len) { 2029 LOG_DEBUG(VHOST_DATA, 2030 "TX queue drained after timeout" 2031 " with burst size %u\n", 2032 tx_q->len); 2033 2034 /* 2035 * Tx any packets in the queue 2036 */ 2037 ret = rte_eth_tx_burst( 2038 ports[0], 2039 (uint16_t)tx_q->txq_id, 2040 (struct rte_mbuf **) 2041 tx_q->m_table, 2042 (uint16_t)tx_q->len); 2043 if (unlikely(ret < tx_q->len)) { 2044 do { 2045 rte_pktmbuf_free( 2046 tx_q->m_table[ret]); 2047 } while (++ret < tx_q->len); 2048 } 2049 tx_q->len = 0; 2050 2051 txmbuf_clean_zcp(dev, 2052 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2053 } 2054 } 2055 dev_ll = dev_ll->next; 2056 } 2057 prev_tsc = cur_tsc; 2058 } 2059 2060 rte_prefetch0(lcore_ll->ll_root_used); 2061 2062 /* 2063 * Inform the configuration core that we have exited the linked 2064 * list and that no devices are in use if requested. 2065 */ 2066 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2067 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2068 2069 /* Process devices */ 2070 dev_ll = lcore_ll->ll_root_used; 2071 2072 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2073 vdev = dev_ll->vdev; 2074 dev = vdev->dev; 2075 if (unlikely(vdev->remove)) { 2076 dev_ll = dev_ll->next; 2077 unlink_vmdq(vdev); 2078 vdev->ready = DEVICE_SAFE_REMOVE; 2079 continue; 2080 } 2081 2082 if (likely(vdev->ready == DEVICE_RX)) { 2083 uint32_t index = vdev->vmdq_rx_q; 2084 uint16_t i; 2085 count_in_ring 2086 = rte_ring_count(vpool_array[index].ring); 2087 uint16_t free_entries 2088 = (uint16_t)get_available_ring_num_zcp(dev); 2089 2090 /* 2091 * Attach all mbufs in vpool.ring and put back 2092 * into vpool.pool. 
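			 * The number of buffers attached per pass is capped at
			 * min(free_entries, count_in_ring, MAX_PKT_BURST), so
			 * no more guest descriptors are consumed than the
			 * guest has actually made available.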
2093 */ 2094 for (i = 0; 2095 i < RTE_MIN(free_entries, 2096 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2097 i++) 2098 attach_rxmbuf_zcp(dev); 2099 2100 /* Handle guest RX */ 2101 rx_count = rte_eth_rx_burst(ports[0], 2102 vdev->vmdq_rx_q, pkts_burst, 2103 MAX_PKT_BURST); 2104 2105 if (rx_count) { 2106 ret_count = virtio_dev_rx_zcp(dev, 2107 pkts_burst, rx_count); 2108 if (enable_stats) { 2109 dev_statistics[dev->device_fh].rx_total 2110 += rx_count; 2111 dev_statistics[dev->device_fh].rx 2112 += ret_count; 2113 } 2114 while (likely(rx_count)) { 2115 rx_count--; 2116 pktmbuf_detach_zcp( 2117 pkts_burst[rx_count]); 2118 rte_ring_sp_enqueue( 2119 vpool_array[index].ring, 2120 (void *)pkts_burst[rx_count]); 2121 } 2122 } 2123 } 2124 2125 if (likely(!vdev->remove)) 2126 /* Handle guest TX */ 2127 virtio_dev_tx_zcp(dev); 2128 2129 /* Move to the next device in the list */ 2130 dev_ll = dev_ll->next; 2131 } 2132 } 2133 2134 return 0; 2135 } 2136 2137 2138 /* 2139 * Add an entry to a used linked list. A free entry must first be found 2140 * in the free linked list using get_data_ll_free_entry(); 2141 */ 2142 static void 2143 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2144 struct virtio_net_data_ll *ll_dev) 2145 { 2146 struct virtio_net_data_ll *ll = *ll_root_addr; 2147 2148 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2149 ll_dev->next = NULL; 2150 rte_compiler_barrier(); 2151 2152 /* If ll == NULL then this is the first device. */ 2153 if (ll) { 2154 /* Increment to the tail of the linked list. */ 2155 while ((ll->next != NULL) ) 2156 ll = ll->next; 2157 2158 ll->next = ll_dev; 2159 } else { 2160 *ll_root_addr = ll_dev; 2161 } 2162 } 2163 2164 /* 2165 * Remove an entry from a used linked list. The entry must then be added to 2166 * the free linked list using put_data_ll_free_entry(). 2167 */ 2168 static void 2169 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2170 struct virtio_net_data_ll *ll_dev, 2171 struct virtio_net_data_ll *ll_dev_last) 2172 { 2173 struct virtio_net_data_ll *ll = *ll_root_addr; 2174 2175 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2176 return; 2177 2178 if (ll_dev == ll) 2179 *ll_root_addr = ll_dev->next; 2180 else 2181 if (likely(ll_dev_last != NULL)) 2182 ll_dev_last->next = ll_dev->next; 2183 else 2184 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2185 } 2186 2187 /* 2188 * Find and return an entry from the free linked list. 2189 */ 2190 static struct virtio_net_data_ll * 2191 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2192 { 2193 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2194 struct virtio_net_data_ll *ll_dev; 2195 2196 if (ll_free == NULL) 2197 return NULL; 2198 2199 ll_dev = ll_free; 2200 *ll_root_addr = ll_free->next; 2201 2202 return ll_dev; 2203 } 2204 2205 /* 2206 * Place an entry back on to the free linked list. 2207 */ 2208 static void 2209 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2210 struct virtio_net_data_ll *ll_dev) 2211 { 2212 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2213 2214 if (ll_dev == NULL) 2215 return; 2216 2217 ll_dev->next = ll_free; 2218 *ll_root_addr = ll_dev; 2219 } 2220 2221 /* 2222 * Creates a linked list of a given size. 2223 */ 2224 static struct virtio_net_data_ll * 2225 alloc_data_ll(uint32_t size) 2226 { 2227 struct virtio_net_data_ll *ll_new; 2228 uint32_t i; 2229 2230 /* Malloc and then chain the linked list. 
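	 * A single allocation backs all entries; the next pointers simply
	 * chain the array elements together. Illustrative use of the lists
	 * built from this array (a sketch, not part of the sample; some_vdev
	 * is a hypothetical device):
	 *
	 *	struct virtio_net_data_ll *e =
	 *		get_data_ll_free_entry(&ll_root_free);
	 *	if (e != NULL) {
	 *		e->vdev = some_vdev;
	 *		add_data_ll_entry(&ll_root_used, e);
	 *	}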
	 */
	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
	if (ll_new == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
		return NULL;
	}

	for (i = 0; i < size - 1; i++) {
		ll_new[i].vdev = NULL;
		ll_new[i].next = &ll_new[i+1];
	}
	ll_new[i].next = NULL;

	return ll_new;
}

/*
 * Create the main linked list along with each individual core's linked list.
 * A used and a free list are created to manage entries.
 */
static int
init_data_ll(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
		if (lcore_info[lcore].lcore_ll == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
			return -1;
		}

		lcore_info[lcore].lcore_ll->device_num = 0;
		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
		if (num_devices % num_switching_cores)
			lcore_info[lcore].lcore_ll->ll_root_free =
				alloc_data_ll((num_devices / num_switching_cores) + 1);
		else
			lcore_info[lcore].lcore_ll->ll_root_free =
				alloc_data_ll(num_devices / num_switching_cores);
	}

	/* Allocate devices up to a maximum of MAX_DEVICES. */
	ll_root_free = alloc_data_ll(MIN(num_devices, MAX_DEVICES));

	return 0;
}

/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
static void
set_irq_status(struct virtio_net *dev)
{
	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
}

/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. Synchronization occurs through the use of the lcore
 * dev_removal_flag. The device is made volatile here to avoid re-ordering of
 * dev->remove = 1, which could cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_lcore_dev_cur;
	struct virtio_net_data_ll *ll_main_dev_cur;
	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
	struct virtio_net_data_ll *ll_main_dev_last = NULL;
	struct vhost_dev *vdev;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;

	/* Set the remove flag.
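	 * The data core notices vdev->remove in switch_worker_zcp(), unlinks
	 * the device from VMDQ and sets ready to DEVICE_SAFE_REMOVE, which
	 * releases the rte_pause() wait below.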
*/ 2306 vdev->remove = 1; 2307 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2308 rte_pause(); 2309 } 2310 2311 /* Search for entry to be removed from lcore ll */ 2312 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2313 while (ll_lcore_dev_cur != NULL) { 2314 if (ll_lcore_dev_cur->vdev == vdev) { 2315 break; 2316 } else { 2317 ll_lcore_dev_last = ll_lcore_dev_cur; 2318 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2319 } 2320 } 2321 2322 if (ll_lcore_dev_cur == NULL) { 2323 RTE_LOG(ERR, VHOST_CONFIG, 2324 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2325 dev->device_fh); 2326 return; 2327 } 2328 2329 /* Search for entry to be removed from main ll */ 2330 ll_main_dev_cur = ll_root_used; 2331 ll_main_dev_last = NULL; 2332 while (ll_main_dev_cur != NULL) { 2333 if (ll_main_dev_cur->vdev == vdev) { 2334 break; 2335 } else { 2336 ll_main_dev_last = ll_main_dev_cur; 2337 ll_main_dev_cur = ll_main_dev_cur->next; 2338 } 2339 } 2340 2341 /* Remove entries from the lcore and main ll. */ 2342 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2343 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2344 2345 /* Set the dev_removal_flag on each lcore. */ 2346 RTE_LCORE_FOREACH_SLAVE(lcore) { 2347 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2348 } 2349 2350 /* 2351 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2352 * they can no longer access the device removed from the linked lists and that the devices 2353 * are no longer in use. 2354 */ 2355 RTE_LCORE_FOREACH_SLAVE(lcore) { 2356 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2357 rte_pause(); 2358 } 2359 } 2360 2361 /* Add the entries back to the lcore and main free ll.*/ 2362 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2363 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2364 2365 /* Decrement number of device on the lcore. */ 2366 lcore_info[vdev->coreid].lcore_ll->device_num--; 2367 2368 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2369 2370 if (zero_copy) { 2371 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2372 2373 /* Stop the RX queue. */ 2374 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2375 LOG_DEBUG(VHOST_CONFIG, 2376 "(%"PRIu64") In destroy_device: Failed to stop " 2377 "rx queue:%d\n", 2378 dev->device_fh, 2379 vdev->vmdq_rx_q); 2380 } 2381 2382 LOG_DEBUG(VHOST_CONFIG, 2383 "(%"PRIu64") in destroy_device: Start put mbuf in " 2384 "mempool back to ring for RX queue: %d\n", 2385 dev->device_fh, vdev->vmdq_rx_q); 2386 2387 mbuf_destroy_zcp(vpool); 2388 2389 /* Stop the TX queue. 
		 */
		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to "
				"stop tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);
		}

		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
			dev->device_fh);

		mbuf_destroy_zcp(vpool);
		rte_free(vdev->regions_hpa);
	}
	rte_free(vdev);
}

/*
 * Calculate the number of physically contiguous sub-regions within one
 * particular region whose vhost virtual address range is contiguous. The
 * region starts at vva_start and is 'size' bytes long.
 */
static uint32_t
check_hpa_regions(uint64_t vva_start, uint64_t size)
{
	uint32_t i, nregions = 0, page_size = getpagesize();
	uint64_t cur_phys_addr = 0, next_phys_addr = 0;

	if (vva_start % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
			"has remainder\n",
			(void *)(uintptr_t)vva_start, page_size);
		return 0;
	}
	if (size % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_hpa_regions: "
			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
			size, page_size);
		return 0;
	}
	for (i = 0; i < size - page_size; i = i + page_size) {
		cur_phys_addr
			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
		next_phys_addr = rte_mem_virt2phy(
			(void *)(uintptr_t)(vva_start + i + page_size));
		if ((cur_phys_addr + page_size) != next_phys_addr) {
			++nregions;
			LOG_DEBUG(VHOST_CONFIG,
				"in check_hpa_regions: hva addr:(%p) is not "
				"contiguous with hva addr:(%p), diff:%d\n",
				(void *)(uintptr_t)(vva_start + (uint64_t)i),
				(void *)(uintptr_t)(vva_start + (uint64_t)i
				+ page_size), page_size);
			LOG_DEBUG(VHOST_CONFIG,
				"in check_hpa_regions: hpa addr:(%p) is not "
				"contiguous with hpa addr:(%p), "
				"diff:(%"PRIu64")\n",
				(void *)(uintptr_t)cur_phys_addr,
				(void *)(uintptr_t)next_phys_addr,
				(next_phys_addr - cur_phys_addr));
		}
	}
	return nregions;
}

/*
 * Divide each region whose vhost virtual address range is contiguous into
 * sub-regions within which the physical addresses are also contiguous, and
 * fill the offset (to GPA), size and other information of each sub-region
 * into regions_hpa.
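 * Within a sub-region the translation is a fixed offset:
 * host physical address = guest physical address + host_phys_addr_offset,
 * where the offset is derived from rte_mem_virt2phy() at the start of the
 * sub-region.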
2466 */ 2467 static uint32_t 2468 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2469 { 2470 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2471 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2472 2473 if (mem_region_hpa == NULL) 2474 return 0; 2475 2476 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2477 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2478 virtio_memory->regions[regionidx].address_offset; 2479 mem_region_hpa[regionidx_hpa].guest_phys_address 2480 = virtio_memory->regions[regionidx].guest_phys_address; 2481 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2482 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2483 mem_region_hpa[regionidx_hpa].guest_phys_address; 2484 LOG_DEBUG(VHOST_CONFIG, 2485 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2486 regionidx_hpa, 2487 (void *)(uintptr_t) 2488 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2489 LOG_DEBUG(VHOST_CONFIG, 2490 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2491 regionidx_hpa, 2492 (void *)(uintptr_t) 2493 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2494 for (i = 0, k = 0; 2495 i < virtio_memory->regions[regionidx].memory_size - 2496 page_size; 2497 i += page_size) { 2498 cur_phys_addr = rte_mem_virt2phy( 2499 (void *)(uintptr_t)(vva_start + i)); 2500 next_phys_addr = rte_mem_virt2phy( 2501 (void *)(uintptr_t)(vva_start + 2502 i + page_size)); 2503 if ((cur_phys_addr + page_size) != next_phys_addr) { 2504 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2505 mem_region_hpa[regionidx_hpa].guest_phys_address + 2506 k + page_size; 2507 mem_region_hpa[regionidx_hpa].memory_size 2508 = k + page_size; 2509 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2510 "phys addr end [%d]:(%p)\n", 2511 regionidx_hpa, 2512 (void *)(uintptr_t) 2513 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2514 LOG_DEBUG(VHOST_CONFIG, 2515 "in fill_hpa_regions: guest phys addr " 2516 "size [%d]:(%p)\n", 2517 regionidx_hpa, 2518 (void *)(uintptr_t) 2519 (mem_region_hpa[regionidx_hpa].memory_size)); 2520 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2521 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2522 ++regionidx_hpa; 2523 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2524 next_phys_addr - 2525 mem_region_hpa[regionidx_hpa].guest_phys_address; 2526 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2527 " phys addr start[%d]:(%p)\n", 2528 regionidx_hpa, 2529 (void *)(uintptr_t) 2530 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2531 LOG_DEBUG(VHOST_CONFIG, 2532 "in fill_hpa_regions: host phys addr " 2533 "start[%d]:(%p)\n", 2534 regionidx_hpa, 2535 (void *)(uintptr_t) 2536 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2537 k = 0; 2538 } else { 2539 k += page_size; 2540 } 2541 } 2542 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2543 = mem_region_hpa[regionidx_hpa].guest_phys_address 2544 + k + page_size; 2545 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2546 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2547 "[%d]:(%p)\n", regionidx_hpa, 2548 (void *)(uintptr_t) 2549 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2550 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2551 "[%d]:(%p)\n", regionidx_hpa, 2552 (void *)(uintptr_t) 2553 (mem_region_hpa[regionidx_hpa].memory_size)); 2554 ++regionidx_hpa; 2555 } 2556 return 
regionidx_hpa; 2557 } 2558 2559 /* 2560 * A new device is added to a data core. First the device is added to the main linked list 2561 * and the allocated to a specific data core. 2562 */ 2563 static int 2564 new_device (struct virtio_net *dev) 2565 { 2566 struct virtio_net_data_ll *ll_dev; 2567 int lcore, core_add = 0; 2568 uint32_t device_num_min = num_devices; 2569 struct vhost_dev *vdev; 2570 uint32_t regionidx; 2571 2572 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2573 if (vdev == NULL) { 2574 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2575 dev->device_fh); 2576 return -1; 2577 } 2578 vdev->dev = dev; 2579 dev->priv = vdev; 2580 2581 if (zero_copy) { 2582 vdev->nregions_hpa = dev->mem->nregions; 2583 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2584 vdev->nregions_hpa 2585 += check_hpa_regions( 2586 dev->mem->regions[regionidx].guest_phys_address 2587 + dev->mem->regions[regionidx].address_offset, 2588 dev->mem->regions[regionidx].memory_size); 2589 2590 } 2591 2592 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2593 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2594 CACHE_LINE_SIZE); 2595 if (vdev->regions_hpa == NULL) { 2596 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2597 rte_free(vdev); 2598 return -1; 2599 } 2600 2601 2602 if (fill_hpa_memory_regions( 2603 vdev->regions_hpa, dev->mem 2604 ) != vdev->nregions_hpa) { 2605 2606 RTE_LOG(ERR, VHOST_CONFIG, 2607 "hpa memory regions number mismatch: " 2608 "[%d]\n", vdev->nregions_hpa); 2609 rte_free(vdev->regions_hpa); 2610 rte_free(vdev); 2611 return -1; 2612 } 2613 } 2614 2615 2616 /* Add device to main ll */ 2617 ll_dev = get_data_ll_free_entry(&ll_root_free); 2618 if (ll_dev == NULL) { 2619 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2620 "of %d devices per core has been reached\n", 2621 dev->device_fh, num_devices); 2622 if (vdev->regions_hpa) 2623 rte_free(vdev->regions_hpa); 2624 rte_free(vdev); 2625 return -1; 2626 } 2627 ll_dev->vdev = vdev; 2628 add_data_ll_entry(&ll_root_used, ll_dev); 2629 vdev->vmdq_rx_q 2630 = dev->device_fh * (num_queues / num_devices); 2631 2632 if (zero_copy) { 2633 uint32_t index = vdev->vmdq_rx_q; 2634 uint32_t count_in_ring, i; 2635 struct mbuf_table *tx_q; 2636 2637 count_in_ring = rte_ring_count(vpool_array[index].ring); 2638 2639 LOG_DEBUG(VHOST_CONFIG, 2640 "(%"PRIu64") in new_device: mbuf count in mempool " 2641 "before attach is: %d\n", 2642 dev->device_fh, 2643 rte_mempool_count(vpool_array[index].pool)); 2644 LOG_DEBUG(VHOST_CONFIG, 2645 "(%"PRIu64") in new_device: mbuf count in ring " 2646 "before attach is : %d\n", 2647 dev->device_fh, count_in_ring); 2648 2649 /* 2650 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
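		 * Attaching every buffer in the ring up front primes the RX
		 * descriptors with the guest's frame buffers before the
		 * deferred RX/TX queues for this device are started below.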
2651 */ 2652 for (i = 0; i < count_in_ring; i++) 2653 attach_rxmbuf_zcp(dev); 2654 2655 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2656 "mempool after attach is: %d\n", 2657 dev->device_fh, 2658 rte_mempool_count(vpool_array[index].pool)); 2659 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2660 "ring after attach is : %d\n", 2661 dev->device_fh, 2662 rte_ring_count(vpool_array[index].ring)); 2663 2664 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2665 tx_q->txq_id = vdev->vmdq_rx_q; 2666 2667 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2668 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2669 2670 LOG_DEBUG(VHOST_CONFIG, 2671 "(%"PRIu64") In new_device: Failed to start " 2672 "tx queue:%d\n", 2673 dev->device_fh, vdev->vmdq_rx_q); 2674 2675 mbuf_destroy_zcp(vpool); 2676 rte_free(vdev->regions_hpa); 2677 rte_free(vdev); 2678 return -1; 2679 } 2680 2681 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2682 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2683 2684 LOG_DEBUG(VHOST_CONFIG, 2685 "(%"PRIu64") In new_device: Failed to start " 2686 "rx queue:%d\n", 2687 dev->device_fh, vdev->vmdq_rx_q); 2688 2689 /* Stop the TX queue. */ 2690 if (rte_eth_dev_tx_queue_stop(ports[0], 2691 vdev->vmdq_rx_q) != 0) { 2692 LOG_DEBUG(VHOST_CONFIG, 2693 "(%"PRIu64") In new_device: Failed to " 2694 "stop tx queue:%d\n", 2695 dev->device_fh, vdev->vmdq_rx_q); 2696 } 2697 2698 mbuf_destroy_zcp(vpool); 2699 rte_free(vdev->regions_hpa); 2700 rte_free(vdev); 2701 return -1; 2702 } 2703 2704 } 2705 2706 /*reset ready flag*/ 2707 vdev->ready = DEVICE_MAC_LEARNING; 2708 vdev->remove = 0; 2709 2710 /* Find a suitable lcore to add the device. */ 2711 RTE_LCORE_FOREACH_SLAVE(lcore) { 2712 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2713 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2714 core_add = lcore; 2715 } 2716 } 2717 /* Add device to lcore ll */ 2718 ll_dev->dev->coreid = core_add; 2719 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free); 2720 if (ll_dev == NULL) { 2721 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2722 vdev->ready = DEVICE_SAFE_REMOVE; 2723 destroy_device(dev); 2724 if (vdev->regions_hpa) 2725 rte_free(vdev->regions_hpa); 2726 rte_free(vdev); 2727 return -1; 2728 } 2729 ll_dev->vdev = vdev; 2730 vdev->coreid = core_add; 2731 2732 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev); 2733 2734 /* Initialize device stats */ 2735 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2736 2737 /* Disable notifications. */ 2738 set_irq_status(dev); 2739 lcore_info[vdev->coreid].lcore_ll->device_num++; 2740 dev->flags |= VIRTIO_DEV_RUNNING; 2741 2742 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2743 2744 return 0; 2745 } 2746 2747 /* 2748 * These callback allow devices to be added to the data core when configuration 2749 * has been fully complete. 2750 */ 2751 static const struct virtio_net_device_ops virtio_net_device_ops = 2752 { 2753 .new_device = new_device, 2754 .destroy_device = destroy_device, 2755 }; 2756 2757 /* 2758 * This is a thread will wake up after a period to print stats if the user has 2759 * enabled them. 
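 * The sleep interval is enable_stats seconds. The copy path reads its RX
 * counters with rte_atomic64_read(), while the zero-copy path uses the
 * plain rx/rx_total counters updated by the owning data core.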
2760 */ 2761 static void 2762 print_stats(void) 2763 { 2764 struct virtio_net_data_ll *dev_ll; 2765 uint64_t tx_dropped, rx_dropped; 2766 uint64_t tx, tx_total, rx, rx_total; 2767 uint32_t device_fh; 2768 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2769 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2770 2771 while(1) { 2772 sleep(enable_stats); 2773 2774 /* Clear screen and move to top left */ 2775 printf("%s%s", clr, top_left); 2776 2777 printf("\nDevice statistics ===================================="); 2778 2779 dev_ll = ll_root_used; 2780 while (dev_ll != NULL) { 2781 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2782 tx_total = dev_statistics[device_fh].tx_total; 2783 tx = dev_statistics[device_fh].tx; 2784 tx_dropped = tx_total - tx; 2785 if (zero_copy == 0) { 2786 rx_total = rte_atomic64_read( 2787 &dev_statistics[device_fh].rx_total_atomic); 2788 rx = rte_atomic64_read( 2789 &dev_statistics[device_fh].rx_atomic); 2790 } else { 2791 rx_total = dev_statistics[device_fh].rx_total; 2792 rx = dev_statistics[device_fh].rx; 2793 } 2794 rx_dropped = rx_total - rx; 2795 2796 printf("\nStatistics for device %"PRIu32" ------------------------------" 2797 "\nTX total: %"PRIu64"" 2798 "\nTX dropped: %"PRIu64"" 2799 "\nTX successful: %"PRIu64"" 2800 "\nRX total: %"PRIu64"" 2801 "\nRX dropped: %"PRIu64"" 2802 "\nRX successful: %"PRIu64"", 2803 device_fh, 2804 tx_total, 2805 tx_dropped, 2806 tx, 2807 rx_total, 2808 rx_dropped, 2809 rx); 2810 2811 dev_ll = dev_ll->next; 2812 } 2813 printf("\n======================================================\n"); 2814 } 2815 } 2816 2817 static void 2818 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2819 char *ring_name, uint32_t nb_mbuf) 2820 { 2821 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2822 vpool_array[index].pool 2823 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2824 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2825 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2826 rte_pktmbuf_init, NULL, socket, 0); 2827 if (vpool_array[index].pool != NULL) { 2828 vpool_array[index].ring 2829 = rte_ring_create(ring_name, 2830 rte_align32pow2(nb_mbuf + 1), 2831 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2832 if (likely(vpool_array[index].ring != NULL)) { 2833 LOG_DEBUG(VHOST_CONFIG, 2834 "in setup_mempool_tbl: mbuf count in " 2835 "mempool is: %d\n", 2836 rte_mempool_count(vpool_array[index].pool)); 2837 LOG_DEBUG(VHOST_CONFIG, 2838 "in setup_mempool_tbl: mbuf count in " 2839 "ring is: %d\n", 2840 rte_ring_count(vpool_array[index].ring)); 2841 } else { 2842 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2843 ring_name); 2844 } 2845 2846 /* Need consider head room. */ 2847 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2848 } else { 2849 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2850 } 2851 } 2852 2853 2854 /* 2855 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2856 * device is also registered here to handle the IOCTLs. 
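 * Initialisation proceeds as: EAL init, argument parsing, mempool/ring
 * setup (per-queue pools and rings in zero-copy mode, one shared mbuf pool
 * otherwise), port init, linked-list setup, optional stats thread, data
 * core launch and finally CUSE registration.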
2857 */ 2858 int 2859 MAIN(int argc, char *argv[]) 2860 { 2861 struct rte_mempool *mbuf_pool = NULL; 2862 unsigned lcore_id, core_id = 0; 2863 unsigned nb_ports, valid_num_ports; 2864 int ret; 2865 uint8_t portid, queue_id = 0; 2866 static pthread_t tid; 2867 2868 /* init EAL */ 2869 ret = rte_eal_init(argc, argv); 2870 if (ret < 0) 2871 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2872 argc -= ret; 2873 argv += ret; 2874 2875 /* parse app arguments */ 2876 ret = us_vhost_parse_args(argc, argv); 2877 if (ret < 0) 2878 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2879 2880 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2881 if (rte_lcore_is_enabled(lcore_id)) 2882 lcore_ids[core_id ++] = lcore_id; 2883 2884 if (rte_lcore_count() > RTE_MAX_LCORE) 2885 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2886 2887 /*set the number of swithcing cores available*/ 2888 num_switching_cores = rte_lcore_count()-1; 2889 2890 /* Get the number of physical ports. */ 2891 nb_ports = rte_eth_dev_count(); 2892 if (nb_ports > RTE_MAX_ETHPORTS) 2893 nb_ports = RTE_MAX_ETHPORTS; 2894 2895 /* 2896 * Update the global var NUM_PORTS and global array PORTS 2897 * and get value of var VALID_NUM_PORTS according to system ports number 2898 */ 2899 valid_num_ports = check_ports_num(nb_ports); 2900 2901 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2902 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2903 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2904 return -1; 2905 } 2906 2907 if (zero_copy == 0) { 2908 /* Create the mbuf pool. */ 2909 mbuf_pool = rte_mempool_create( 2910 "MBUF_POOL", 2911 NUM_MBUFS_PER_PORT 2912 * valid_num_ports, 2913 MBUF_SIZE, MBUF_CACHE_SIZE, 2914 sizeof(struct rte_pktmbuf_pool_private), 2915 rte_pktmbuf_pool_init, NULL, 2916 rte_pktmbuf_init, NULL, 2917 rte_socket_id(), 0); 2918 if (mbuf_pool == NULL) 2919 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2920 2921 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2922 vpool_array[queue_id].pool = mbuf_pool; 2923 2924 if (vm2vm_mode == VM2VM_HARDWARE) { 2925 /* Enable VT loop back to let L2 switch to do it. */ 2926 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2927 LOG_DEBUG(VHOST_CONFIG, 2928 "Enable loop back for L2 switch in vmdq.\n"); 2929 } 2930 } else { 2931 uint32_t nb_mbuf; 2932 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2933 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2934 2935 /* 2936 * Zero copy defers queue RX/TX start to the time when guest 2937 * finishes its startup and packet buffers from that guest are 2938 * available. 
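		 * The rx/tx deferred-start flags set just below keep the
		 * queues stopped when the port is started; new_device()
		 * starts them explicitly once the guest's buffers have been
		 * attached.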
2939 */ 2940 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2941 rx_conf_default.rx_drop_en = 0; 2942 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2943 nb_mbuf = num_rx_descriptor 2944 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2945 + num_switching_cores * MAX_PKT_BURST; 2946 2947 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2948 snprintf(pool_name, sizeof(pool_name), 2949 "rxmbuf_pool_%u", queue_id); 2950 snprintf(ring_name, sizeof(ring_name), 2951 "rxmbuf_ring_%u", queue_id); 2952 setup_mempool_tbl(rte_socket_id(), queue_id, 2953 pool_name, ring_name, nb_mbuf); 2954 } 2955 2956 nb_mbuf = num_tx_descriptor 2957 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2958 + num_switching_cores * MAX_PKT_BURST; 2959 2960 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2961 snprintf(pool_name, sizeof(pool_name), 2962 "txmbuf_pool_%u", queue_id); 2963 snprintf(ring_name, sizeof(ring_name), 2964 "txmbuf_ring_%u", queue_id); 2965 setup_mempool_tbl(rte_socket_id(), 2966 (queue_id + MAX_QUEUES), 2967 pool_name, ring_name, nb_mbuf); 2968 } 2969 2970 if (vm2vm_mode == VM2VM_HARDWARE) { 2971 /* Enable VT loop back to let L2 switch to do it. */ 2972 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2973 LOG_DEBUG(VHOST_CONFIG, 2974 "Enable loop back for L2 switch in vmdq.\n"); 2975 } 2976 } 2977 /* Set log level. */ 2978 rte_set_log_level(LOG_LEVEL); 2979 2980 /* initialize all ports */ 2981 for (portid = 0; portid < nb_ports; portid++) { 2982 /* skip ports that are not enabled */ 2983 if ((enabled_port_mask & (1 << portid)) == 0) { 2984 RTE_LOG(INFO, VHOST_PORT, 2985 "Skipping disabled port %d\n", portid); 2986 continue; 2987 } 2988 if (port_init(portid) != 0) 2989 rte_exit(EXIT_FAILURE, 2990 "Cannot initialize network ports\n"); 2991 } 2992 2993 /* Initialise all linked lists. */ 2994 if (init_data_ll() == -1) 2995 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2996 2997 /* Initialize device stats */ 2998 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2999 3000 /* Enable stats if the user option is set. */ 3001 if (enable_stats) 3002 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3003 3004 /* Launch all data cores. */ 3005 if (zero_copy == 0) { 3006 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3007 rte_eal_remote_launch(switch_worker, 3008 mbuf_pool, lcore_id); 3009 } 3010 } else { 3011 uint32_t count_in_mempool, index, i; 3012 for (index = 0; index < 2*MAX_QUEUES; index++) { 3013 /* For all RX and TX queues. */ 3014 count_in_mempool 3015 = rte_mempool_count(vpool_array[index].pool); 3016 3017 /* 3018 * Transfer all un-attached mbufs from vpool.pool 3019 * to vpoo.ring. 3020 */ 3021 for (i = 0; i < count_in_mempool; i++) { 3022 struct rte_mbuf *mbuf 3023 = __rte_mbuf_raw_alloc( 3024 vpool_array[index].pool); 3025 rte_ring_sp_enqueue(vpool_array[index].ring, 3026 (void *)mbuf); 3027 } 3028 3029 LOG_DEBUG(VHOST_CONFIG, 3030 "in MAIN: mbuf count in mempool at initial " 3031 "is: %d\n", count_in_mempool); 3032 LOG_DEBUG(VHOST_CONFIG, 3033 "in MAIN: mbuf count in ring at initial is :" 3034 " %d\n", 3035 rte_ring_count(vpool_array[index].ring)); 3036 } 3037 3038 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3039 rte_eal_remote_launch(switch_worker_zcp, NULL, 3040 lcore_id); 3041 } 3042 3043 /* Register CUSE device to handle IOCTLs. 
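	 * rte_vhost_driver_register() creates the character device named by
	 * dev_basename; the new_device/destroy_device callbacks registered
	 * next are invoked when a guest connects to or disconnects from it.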
	 */
	ret = rte_vhost_driver_register((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	/* Start CUSE session. */
	rte_vhost_driver_session_start();
	return 0;
}
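
/*
 * Illustrative sketch (not part of the sample): the zero-copy path stores
 * the guest descriptor index in the first four bytes of an mbuf's headroom
 * via MBUF_HEADROOM_UINT32(), so the index can be recovered when the mbuf
 * comes back from the NIC. A hypothetical round trip (the desc_idx value is
 * an assumed example):
 *
 *	uint32_t desc_idx = 42;
 *	MBUF_HEADROOM_UINT32(mbuf) = desc_idx;
 *	... mbuf travels through the NIC queues ...
 *	uint32_t recovered = MBUF_HEADROOM_UINT32(mbuf);
 *	put_desc_to_used_list_zcp(vq, recovered);
 */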