1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 53 #include "main.h" 54 #include "virtio-net.h" 55 #include "vhost-net-cdev.h" 56 57 #define MAX_QUEUES 128 58 59 /* the maximum number of external ports supported */ 60 #define MAX_SUP_PORTS 1 61 62 /* 63 * Calculate the number of buffers needed per port 64 */ 65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 66 (num_switching_cores*MAX_PKT_BURST) + \ 67 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 68 (num_switching_cores*MBUF_CACHE_SIZE)) 69 70 #define MBUF_CACHE_SIZE 128 71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 72 73 /* 74 * No frame data buffer allocated from host are required for zero copy 75 * implementation, guest will allocate the frame data buffer, and vhost 76 * directly use it. 77 */ 78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 80 + RTE_PKTMBUF_HEADROOM) 81 #define MBUF_CACHE_SIZE_ZCP 0 82 83 /* 84 * RX and TX Prefetch, Host, and Write-back threshold values should be 85 * carefully set for optimal performance. Consult the network 86 * controller's datasheet and supporting DPDK documentation for guidance 87 * on how these parameters should be set. 88 */ 89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */ 90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. 
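 *
 * Illustrative sizing of NUM_MBUFS_PER_PORT defined above (the figure
 * below assumes num_switching_cores = 8; the real value depends on the
 * cores enabled on the command line):
 *
 *   128 * 1024 (queues x RX descriptors)  = 131072
 * +   8 *   32 (cores  x RX burst)        =    256
 * +   8 *  512 (cores  x TX descriptors)  =   4096
 * +   8 *  128 (cores  x mbuf cache)      =   1024
 *                                         = 136448 mbufs per port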
*/ 91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */ 92 93 /* 94 * These default values are optimized for use with the Intel(R) 82599 10 GbE 95 * Controller and the DPDK ixgbe PMD. Consider using other values for other 96 * network controllers and/or network drivers. 97 */ 98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */ 99 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */ 100 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */ 101 102 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 103 #define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Set to 1 due to performance issue. */ 104 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 105 106 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 107 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 108 109 #define JUMBO_FRAME_MAX_SIZE 0x2600 110 111 /* State of virtio device. */ 112 #define DEVICE_MAC_LEARNING 0 113 #define DEVICE_RX 1 114 #define DEVICE_SAFE_REMOVE 2 115 116 /* Config_core_flag status definitions. */ 117 #define REQUEST_DEV_REMOVAL 1 118 #define ACK_DEV_REMOVAL 0 119 120 /* Configurable number of RX/TX ring descriptors */ 121 #define RTE_TEST_RX_DESC_DEFAULT 1024 122 #define RTE_TEST_TX_DESC_DEFAULT 512 123 124 /* 125 * Need refine these 2 macros for legacy and DPDK based front end: 126 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 127 * And then adjust power 2. 128 */ 129 /* 130 * For legacy front end, 128 descriptors, 131 * half for virtio header, another half for mbuf. 132 */ 133 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 134 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 135 136 /* Get first 4 bytes in mbuf headroom. */ 137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 138 + sizeof(struct rte_mbuf))) 139 140 /* true if x is a power of 2 */ 141 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 142 143 #define INVALID_PORT_ID 0xFF 144 145 /* Max number of devices. Limited by vmdq. */ 146 #define MAX_DEVICES 64 147 148 /* Size of buffers used for snprintfs. */ 149 #define MAX_PRINT_BUFF 6072 150 151 /* Maximum character device basename size. */ 152 #define MAX_BASENAME_SZ 10 153 154 /* Maximum long option length for option parsing. */ 155 #define MAX_LONG_OPT_SZ 64 156 157 /* Used to compare MAC addresses. */ 158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 159 160 /* Number of descriptors per cacheline. */ 161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc)) 162 163 /* mask of enabled ports */ 164 static uint32_t enabled_port_mask = 0; 165 166 /*Number of switching cores enabled*/ 167 static uint32_t num_switching_cores = 0; 168 169 /* number of devices/queues to support*/ 170 static uint32_t num_queues = 0; 171 uint32_t num_devices = 0; 172 173 /* 174 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 175 * disabled on default. 176 */ 177 static uint32_t zero_copy; 178 179 /* number of descriptors to apply*/ 180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 182 183 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 184 #define MAX_RING_DESC 4096 185 186 struct vpool { 187 struct rte_mempool *pool; 188 struct rte_ring *ring; 189 uint32_t buf_size; 190 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 191 192 /* Enable VM2VM communications. 
If this is disabled then the MAC address comparison is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
	.rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is needed for 1G NICs such as the I350; it fixes a bug
		 * where IPv4 forwarding in the guest could not forward packets
		 * from one virtio device to another.
		 */
		.hw_vlan_strip = 1, /**< VLAN strip enabled.
*/ 266 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 267 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 268 }, 269 270 .txmode = { 271 .mq_mode = ETH_MQ_TX_NONE, 272 }, 273 .rx_adv_conf = { 274 /* 275 * should be overridden separately in code with 276 * appropriate values 277 */ 278 .vmdq_rx_conf = { 279 .nb_queue_pools = ETH_8_POOLS, 280 .enable_default_pool = 0, 281 .default_pool = 0, 282 .nb_pool_maps = 0, 283 .pool_map = {{0, 0},}, 284 }, 285 }, 286 }; 287 288 static unsigned lcore_ids[RTE_MAX_LCORE]; 289 static uint8_t ports[RTE_MAX_ETHPORTS]; 290 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 291 292 static const uint16_t external_pkt_default_vlan_tag = 2000; 293 const uint16_t vlan_tags[] = { 294 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 295 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 296 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 297 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 298 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 299 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 300 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 301 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 302 }; 303 304 /* ethernet addresses of ports */ 305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 306 307 /* heads for the main used and free linked lists for the data path. */ 308 static struct virtio_net_data_ll *ll_root_used = NULL; 309 static struct virtio_net_data_ll *ll_root_free = NULL; 310 311 /* Array of data core structures containing information on individual core linked lists. */ 312 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 313 314 /* Used for queueing bursts of TX packets. */ 315 struct mbuf_table { 316 unsigned len; 317 unsigned txq_id; 318 struct rte_mbuf *m_table[MAX_PKT_BURST]; 319 }; 320 321 /* TX queue for each data core. */ 322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 323 324 /* TX queue fori each virtio device for zero copy. */ 325 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 326 327 /* Vlan header struct used to insert vlan tags on TX. */ 328 struct vlan_ethhdr { 329 unsigned char h_dest[ETH_ALEN]; 330 unsigned char h_source[ETH_ALEN]; 331 __be16 h_vlan_proto; 332 __be16 h_vlan_TCI; 333 __be16 h_vlan_encapsulated_proto; 334 }; 335 336 /* IPv4 Header */ 337 struct ipv4_hdr { 338 uint8_t version_ihl; /**< version and header length */ 339 uint8_t type_of_service; /**< type of service */ 340 uint16_t total_length; /**< length of packet */ 341 uint16_t packet_id; /**< packet ID */ 342 uint16_t fragment_offset; /**< fragmentation offset */ 343 uint8_t time_to_live; /**< time to live */ 344 uint8_t next_proto_id; /**< protocol ID */ 345 uint16_t hdr_checksum; /**< header checksum */ 346 uint32_t src_addr; /**< source address */ 347 uint32_t dst_addr; /**< destination address */ 348 } __attribute__((__packed__)); 349 350 /* Header lengths. */ 351 #define VLAN_HLEN 4 352 #define VLAN_ETH_HLEN 18 353 354 /* Per-device statistics struct */ 355 struct device_statistics { 356 uint64_t tx_total; 357 rte_atomic64_t rx_total_atomic; 358 uint64_t rx_total; 359 uint64_t tx; 360 rte_atomic64_t rx_atomic; 361 uint64_t rx; 362 } __rte_cache_aligned; 363 struct device_statistics dev_statistics[MAX_DEVICES]; 364 365 /* 366 * Builds up the correct configuration for VMDQ VLAN pool map 367 * according to the pool & queue limits. 
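 *
 * For illustration, if num_devices is 8 the map built below would be
 * (values follow from the vlan_tags[] table and the 1UL << i pool mask):
 *
 *   nb_queue_pools = 8
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *   pool_map[1] = { .vlan_id = 1001, .pools = 0x02 }
 *   ...
 *   pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }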
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device.
 */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"  --vm2vm [0|1|2]\n"
	"  --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"  --dev-basename <name> --dev-index [0-N]\n"
	"  --nb-devices ND\n"
	"  -p PORTMASK: Set mask for ports to be used by application\n"
	"  --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"  --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"  --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if RX retries are enabled\n"
	"  --rx-retry-num [0-N]: the number of retries on RX. Only effective if RX retries are enabled\n"
	"  --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"  --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"  --dev-basename: The basename to be used for the character device.\n"
	"  --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
	"  --zero-copy [0|1]: disable(default)/enable RX/TX zero copy\n"
	"  --rx-desc-num [0-N]: the number of descriptors on RX, used only when zero copy is enabled.\n"
	"  --tx-desc-num [0-N]: the number of descriptors on TX, used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
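 *
 * Purely illustrative invocation (the binary name, core mask and memory
 * channel count are placeholders and depend on the build and platform):
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 \
 *       --rx-retry 1 --rx-retry-delay 15 --rx-retry-num 4 \
 *       --mergeable 0 --stats 2 --dev-basename vhost-net --dev-index 0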
585 */ 586 static int 587 us_vhost_parse_args(int argc, char **argv) 588 { 589 int opt, ret; 590 int option_index; 591 unsigned i; 592 const char *prgname = argv[0]; 593 static struct option long_option[] = { 594 {"vm2vm", required_argument, NULL, 0}, 595 {"rx-retry", required_argument, NULL, 0}, 596 {"rx-retry-delay", required_argument, NULL, 0}, 597 {"rx-retry-num", required_argument, NULL, 0}, 598 {"mergeable", required_argument, NULL, 0}, 599 {"stats", required_argument, NULL, 0}, 600 {"dev-basename", required_argument, NULL, 0}, 601 {"dev-index", required_argument, NULL, 0}, 602 {"zero-copy", required_argument, NULL, 0}, 603 {"rx-desc-num", required_argument, NULL, 0}, 604 {"tx-desc-num", required_argument, NULL, 0}, 605 {NULL, 0, 0, 0}, 606 }; 607 608 /* Parse command line */ 609 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 610 switch (opt) { 611 /* Portmask */ 612 case 'p': 613 enabled_port_mask = parse_portmask(optarg); 614 if (enabled_port_mask == 0) { 615 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 616 us_vhost_usage(prgname); 617 return -1; 618 } 619 break; 620 621 case 0: 622 /* Enable/disable vm2vm comms. */ 623 if (!strncmp(long_option[option_index].name, "vm2vm", 624 MAX_LONG_OPT_SZ)) { 625 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 626 if (ret == -1) { 627 RTE_LOG(INFO, VHOST_CONFIG, 628 "Invalid argument for " 629 "vm2vm [0|1|2]\n"); 630 us_vhost_usage(prgname); 631 return -1; 632 } else { 633 vm2vm_mode = (vm2vm_type)ret; 634 } 635 } 636 637 /* Enable/disable retries on RX. */ 638 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 639 ret = parse_num_opt(optarg, 1); 640 if (ret == -1) { 641 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 642 us_vhost_usage(prgname); 643 return -1; 644 } else { 645 enable_retry = ret; 646 } 647 } 648 649 /* Specify the retries delay time (in useconds) on RX. */ 650 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 651 ret = parse_num_opt(optarg, INT32_MAX); 652 if (ret == -1) { 653 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 654 us_vhost_usage(prgname); 655 return -1; 656 } else { 657 burst_rx_delay_time = ret; 658 } 659 } 660 661 /* Specify the retries number on RX. */ 662 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 663 ret = parse_num_opt(optarg, INT32_MAX); 664 if (ret == -1) { 665 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 666 us_vhost_usage(prgname); 667 return -1; 668 } else { 669 burst_rx_retry_num = ret; 670 } 671 } 672 673 /* Enable/disable RX mergeable buffers. */ 674 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 675 ret = parse_num_opt(optarg, 1); 676 if (ret == -1) { 677 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 678 us_vhost_usage(prgname); 679 return -1; 680 } else { 681 if (ret) { 682 vmdq_conf_default.rxmode.jumbo_frame = 1; 683 vmdq_conf_default.rxmode.max_rx_pkt_len 684 = JUMBO_FRAME_MAX_SIZE; 685 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF); 686 } 687 } 688 } 689 690 /* Enable/disable stats. 
*/ 691 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 692 ret = parse_num_opt(optarg, INT32_MAX); 693 if (ret == -1) { 694 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 695 us_vhost_usage(prgname); 696 return -1; 697 } else { 698 enable_stats = ret; 699 } 700 } 701 702 /* Set character device basename. */ 703 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 704 if (us_vhost_parse_basename(optarg) == -1) { 705 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 706 us_vhost_usage(prgname); 707 return -1; 708 } 709 } 710 711 /* Set character device index. */ 712 if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) { 713 ret = parse_num_opt(optarg, INT32_MAX); 714 if (ret == -1) { 715 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n"); 716 us_vhost_usage(prgname); 717 return -1; 718 } else 719 dev_index = ret; 720 } 721 722 /* Enable/disable rx/tx zero copy. */ 723 if (!strncmp(long_option[option_index].name, 724 "zero-copy", MAX_LONG_OPT_SZ)) { 725 ret = parse_num_opt(optarg, 1); 726 if (ret == -1) { 727 RTE_LOG(INFO, VHOST_CONFIG, 728 "Invalid argument" 729 " for zero-copy [0|1]\n"); 730 us_vhost_usage(prgname); 731 return -1; 732 } else 733 zero_copy = ret; 734 735 if (zero_copy) { 736 #ifdef RTE_MBUF_REFCNT 737 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 738 "zero copy vhost APP, please " 739 "disable RTE_MBUF_REFCNT\n" 740 "in config file and then rebuild DPDK " 741 "core lib!\n" 742 "Otherwise please disable zero copy " 743 "flag in command line!\n"); 744 return -1; 745 #endif 746 } 747 } 748 749 /* Specify the descriptor number on RX. */ 750 if (!strncmp(long_option[option_index].name, 751 "rx-desc-num", MAX_LONG_OPT_SZ)) { 752 ret = parse_num_opt(optarg, MAX_RING_DESC); 753 if ((ret == -1) || (!POWEROF2(ret))) { 754 RTE_LOG(INFO, VHOST_CONFIG, 755 "Invalid argument for rx-desc-num[0-N]," 756 "power of 2 required.\n"); 757 us_vhost_usage(prgname); 758 return -1; 759 } else { 760 num_rx_descriptor = ret; 761 } 762 } 763 764 /* Specify the descriptor number on TX. */ 765 if (!strncmp(long_option[option_index].name, 766 "tx-desc-num", MAX_LONG_OPT_SZ)) { 767 ret = parse_num_opt(optarg, MAX_RING_DESC); 768 if ((ret == -1) || (!POWEROF2(ret))) { 769 RTE_LOG(INFO, VHOST_CONFIG, 770 "Invalid argument for tx-desc-num [0-N]," 771 "power of 2 required.\n"); 772 us_vhost_usage(prgname); 773 return -1; 774 } else { 775 num_tx_descriptor = ret; 776 } 777 } 778 779 break; 780 781 /* Invalid option - print options. 
*/ 782 default: 783 us_vhost_usage(prgname); 784 return -1; 785 } 786 } 787 788 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 789 if (enabled_port_mask & (1 << i)) 790 ports[num_ports++] = (uint8_t)i; 791 } 792 793 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 794 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 795 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 796 return -1; 797 } 798 799 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 800 RTE_LOG(INFO, VHOST_PORT, 801 "Vhost zero copy doesn't support software vm2vm," 802 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 803 return -1; 804 } 805 806 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 807 RTE_LOG(INFO, VHOST_PORT, 808 "Vhost zero copy doesn't support jumbo frame," 809 "please specify '--mergeable 0' to disable the " 810 "mergeable feature.\n"); 811 return -1; 812 } 813 814 return 0; 815 } 816 817 /* 818 * Update the global var NUM_PORTS and array PORTS according to system ports number 819 * and return valid ports number 820 */ 821 static unsigned check_ports_num(unsigned nb_ports) 822 { 823 unsigned valid_num_ports = num_ports; 824 unsigned portid; 825 826 if (num_ports > nb_ports) { 827 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 828 num_ports, nb_ports); 829 num_ports = nb_ports; 830 } 831 832 for (portid = 0; portid < num_ports; portid ++) { 833 if (ports[portid] >= nb_ports) { 834 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 835 ports[portid], (nb_ports - 1)); 836 ports[portid] = INVALID_PORT_ID; 837 valid_num_ports--; 838 } 839 } 840 return valid_num_ports; 841 } 842 843 /* 844 * Macro to print out packet contents. Wrapped in debug define so that the 845 * data path is not effected when debug is disabled. 846 */ 847 #ifdef DEBUG 848 #define PRINT_PACKET(device, addr, size, header) do { \ 849 char *pkt_addr = (char*)(addr); \ 850 unsigned int index; \ 851 char packet[MAX_PRINT_BUFF]; \ 852 \ 853 if ((header)) \ 854 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 855 else \ 856 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 857 for (index = 0; index < (size); index++) { \ 858 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 859 "%02hhx ", pkt_addr[index]); \ 860 } \ 861 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 862 \ 863 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 864 } while(0) 865 #else 866 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 867 #endif 868 869 /* 870 * Function to convert guest physical addresses to vhost virtual addresses. This 871 * is used to convert virtio buffer addresses. 
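 *
 * Each region records guest_phys_address, guest_phys_address_end and an
 * address_offset (host virtual base minus guest physical base), so a GPA
 * inside the region maps to address_offset + GPA. As a purely
 * illustrative example, a region starting at GPA 0x40000000 that is
 * mapped at VVA 0x7f0000000000 has address_offset 0x7effc0000000, and
 * GPA 0x40001000 translates to VVA 0x7f0000001000.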
872 */ 873 static inline uint64_t __attribute__((always_inline)) 874 gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) 875 { 876 struct virtio_memory_regions *region; 877 uint32_t regionidx; 878 uint64_t vhost_va = 0; 879 880 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 881 region = &dev->mem->regions[regionidx]; 882 if ((guest_pa >= region->guest_phys_address) && 883 (guest_pa <= region->guest_phys_address_end)) { 884 vhost_va = region->address_offset + guest_pa; 885 break; 886 } 887 } 888 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n", 889 dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va); 890 891 return vhost_va; 892 } 893 894 /* 895 * Function to convert guest physical addresses to vhost physical addresses. 896 * This is used to convert virtio buffer addresses. 897 */ 898 static inline uint64_t __attribute__((always_inline)) 899 gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa, 900 uint32_t buf_len, hpa_type *addr_type) 901 { 902 struct virtio_memory_regions_hpa *region; 903 uint32_t regionidx; 904 uint64_t vhost_pa = 0; 905 906 *addr_type = PHYS_ADDR_INVALID; 907 908 for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) { 909 region = &dev->mem->regions_hpa[regionidx]; 910 if ((guest_pa >= region->guest_phys_address) && 911 (guest_pa <= region->guest_phys_address_end)) { 912 vhost_pa = region->host_phys_addr_offset + guest_pa; 913 if (likely((guest_pa + buf_len - 1) 914 <= region->guest_phys_address_end)) 915 *addr_type = PHYS_ADDR_CONTINUOUS; 916 else 917 *addr_type = PHYS_ADDR_CROSS_SUBREG; 918 break; 919 } 920 } 921 922 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 923 dev->device_fh, (void *)(uintptr_t)guest_pa, 924 (void *)(uintptr_t)vhost_pa); 925 926 return vhost_pa; 927 } 928 929 /* 930 * This function adds buffers to the virtio devices RX virtqueue. Buffers can 931 * be received from the physical port or from another virtio device. A packet 932 * count is returned to indicate the number of packets that were succesfully 933 * added to the RX queue. This function works when mergeable is disabled. 934 */ 935 static inline uint32_t __attribute__((always_inline)) 936 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) 937 { 938 struct vhost_virtqueue *vq; 939 struct vring_desc *desc; 940 struct rte_mbuf *buff; 941 /* The virtio_hdr is initialised to 0. */ 942 struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0}; 943 uint64_t buff_addr = 0; 944 uint64_t buff_hdr_addr = 0; 945 uint32_t head[MAX_PKT_BURST], packet_len = 0; 946 uint32_t head_idx, packet_success = 0; 947 uint32_t retry = 0; 948 uint16_t avail_idx, res_cur_idx; 949 uint16_t res_base_idx, res_end_idx; 950 uint16_t free_entries; 951 uint8_t success = 0; 952 953 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 954 vq = dev->virtqueue[VIRTIO_RXQ]; 955 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 956 957 /* As many data cores may want access to available buffers, they need to be reserved. */ 958 do { 959 res_base_idx = vq->last_used_idx_res; 960 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 961 962 free_entries = (avail_idx - res_base_idx); 963 /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. 
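 *
 * With the default settings (burst_rx_delay_time = BURST_RX_WAIT_US =
 * 15 us and burst_rx_retry_num = BURST_RX_RETRIES = 4) the retry loop
 * below blocks for at most about 4 * 15 = 60 us per burst before the
 * burst is truncated to the entries that are actually free.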
*/ 964 if (enable_retry && unlikely(count > free_entries)) { 965 for (retry = 0; retry < burst_rx_retry_num; retry++) { 966 rte_delay_us(burst_rx_delay_time); 967 avail_idx = 968 *((volatile uint16_t *)&vq->avail->idx); 969 free_entries = (avail_idx - res_base_idx); 970 if (count <= free_entries) 971 break; 972 } 973 } 974 975 /*check that we have enough buffers*/ 976 if (unlikely(count > free_entries)) 977 count = free_entries; 978 979 if (count == 0) 980 return 0; 981 982 res_end_idx = res_base_idx + count; 983 /* vq->last_used_idx_res is atomically updated. */ 984 success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx, 985 res_end_idx); 986 } while (unlikely(success == 0)); 987 res_cur_idx = res_base_idx; 988 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx); 989 990 /* Prefetch available ring to retrieve indexes. */ 991 rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); 992 993 /* Retrieve all of the head indexes first to avoid caching issues. */ 994 for (head_idx = 0; head_idx < count; head_idx++) 995 head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; 996 997 /*Prefetch descriptor index. */ 998 rte_prefetch0(&vq->desc[head[packet_success]]); 999 1000 while (res_cur_idx != res_end_idx) { 1001 /* Get descriptor from available ring */ 1002 desc = &vq->desc[head[packet_success]]; 1003 1004 buff = pkts[packet_success]; 1005 1006 /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */ 1007 buff_addr = gpa_to_vva(dev, desc->addr); 1008 /* Prefetch buffer address. */ 1009 rte_prefetch0((void*)(uintptr_t)buff_addr); 1010 1011 /* Copy virtio_hdr to packet and increment buffer address */ 1012 buff_hdr_addr = buff_addr; 1013 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1014 1015 /* 1016 * If the descriptors are chained the header and data are 1017 * placed in separate buffers. 1018 */ 1019 if (desc->flags & VRING_DESC_F_NEXT) { 1020 desc->len = vq->vhost_hlen; 1021 desc = &vq->desc[desc->next]; 1022 /* Buffer address translation. */ 1023 buff_addr = gpa_to_vva(dev, desc->addr); 1024 desc->len = rte_pktmbuf_data_len(buff); 1025 } else { 1026 buff_addr += vq->vhost_hlen; 1027 desc->len = packet_len; 1028 } 1029 1030 /* Update used ring with desc information */ 1031 vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success]; 1032 vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len; 1033 1034 /* Copy mbuf data to buffer */ 1035 rte_memcpy((void *)(uintptr_t)buff_addr, 1036 rte_pktmbuf_mtod(buff, const void *), 1037 rte_pktmbuf_data_len(buff)); 1038 PRINT_PACKET(dev, (uintptr_t)buff_addr, 1039 rte_pktmbuf_data_len(buff), 0); 1040 1041 res_cur_idx++; 1042 packet_success++; 1043 1044 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1045 (const void *)&virtio_hdr, vq->vhost_hlen); 1046 1047 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1048 1049 if (res_cur_idx < res_end_idx) { 1050 /* Prefetch descriptor index. */ 1051 rte_prefetch0(&vq->desc[head[packet_success]]); 1052 } 1053 } 1054 1055 rte_compiler_barrier(); 1056 1057 /* Wait until it's our turn to add our buffer to the used ring. */ 1058 while (unlikely(vq->last_used_idx != res_base_idx)) 1059 rte_pause(); 1060 1061 *(volatile uint16_t *)&vq->used->idx += count; 1062 vq->last_used_idx = res_end_idx; 1063 1064 /* Kick the guest if necessary. 
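 *
 * The guest may set VRING_AVAIL_F_NO_INTERRUPT in the available ring
 * flags to suppress notifications; when the flag is clear, the eventfd
 * write below signals the guest that new entries have been added to the
 * used ring.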
*/ 1065 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1066 eventfd_write((int)vq->kickfd, 1); 1067 return count; 1068 } 1069 1070 static inline uint32_t __attribute__((always_inline)) 1071 copy_from_mbuf_to_vring(struct virtio_net *dev, 1072 uint16_t res_base_idx, uint16_t res_end_idx, 1073 struct rte_mbuf *pkt) 1074 { 1075 uint32_t vec_idx = 0; 1076 uint32_t entry_success = 0; 1077 struct vhost_virtqueue *vq; 1078 /* The virtio_hdr is initialised to 0. */ 1079 struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { 1080 {0, 0, 0, 0, 0, 0}, 0}; 1081 uint16_t cur_idx = res_base_idx; 1082 uint64_t vb_addr = 0; 1083 uint64_t vb_hdr_addr = 0; 1084 uint32_t seg_offset = 0; 1085 uint32_t vb_offset = 0; 1086 uint32_t seg_avail; 1087 uint32_t vb_avail; 1088 uint32_t cpy_len, entry_len; 1089 1090 if (pkt == NULL) 1091 return 0; 1092 1093 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " 1094 "End Index %d\n", 1095 dev->device_fh, cur_idx, res_end_idx); 1096 1097 /* 1098 * Convert from gpa to vva 1099 * (guest physical addr -> vhost virtual addr) 1100 */ 1101 vq = dev->virtqueue[VIRTIO_RXQ]; 1102 vb_addr = 1103 gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); 1104 vb_hdr_addr = vb_addr; 1105 1106 /* Prefetch buffer address. */ 1107 rte_prefetch0((void *)(uintptr_t)vb_addr); 1108 1109 virtio_hdr.num_buffers = res_end_idx - res_base_idx; 1110 1111 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", 1112 dev->device_fh, virtio_hdr.num_buffers); 1113 1114 rte_memcpy((void *)(uintptr_t)vb_hdr_addr, 1115 (const void *)&virtio_hdr, vq->vhost_hlen); 1116 1117 PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); 1118 1119 seg_avail = rte_pktmbuf_data_len(pkt); 1120 vb_offset = vq->vhost_hlen; 1121 vb_avail = 1122 vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; 1123 1124 entry_len = vq->vhost_hlen; 1125 1126 if (vb_avail == 0) { 1127 uint32_t desc_idx = 1128 vq->buf_vec[vec_idx].desc_idx; 1129 vq->desc[desc_idx].len = vq->vhost_hlen; 1130 1131 if ((vq->desc[desc_idx].flags 1132 & VRING_DESC_F_NEXT) == 0) { 1133 /* Update used ring with desc information */ 1134 vq->used->ring[cur_idx & (vq->size - 1)].id 1135 = vq->buf_vec[vec_idx].desc_idx; 1136 vq->used->ring[cur_idx & (vq->size - 1)].len 1137 = entry_len; 1138 1139 entry_len = 0; 1140 cur_idx++; 1141 entry_success++; 1142 } 1143 1144 vec_idx++; 1145 vb_addr = 1146 gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); 1147 1148 /* Prefetch buffer address. */ 1149 rte_prefetch0((void *)(uintptr_t)vb_addr); 1150 vb_offset = 0; 1151 vb_avail = vq->buf_vec[vec_idx].buf_len; 1152 } 1153 1154 cpy_len = RTE_MIN(vb_avail, seg_avail); 1155 1156 while (cpy_len > 0) { 1157 /* Copy mbuf data to vring buffer */ 1158 rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), 1159 (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), 1160 cpy_len); 1161 1162 PRINT_PACKET(dev, 1163 (uintptr_t)(vb_addr + vb_offset), 1164 cpy_len, 0); 1165 1166 seg_offset += cpy_len; 1167 vb_offset += cpy_len; 1168 seg_avail -= cpy_len; 1169 vb_avail -= cpy_len; 1170 entry_len += cpy_len; 1171 1172 if (seg_avail != 0) { 1173 /* 1174 * The virtio buffer in this vring 1175 * entry reach to its end. 1176 * But the segment doesn't complete. 
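 *
 * In other words: the current guest buffer taken from buf_vec[] is full
 * while the mbuf segment still has data left. If the descriptor chain
 * ends here, the used-ring entry is closed out and a fresh one is
 * started; either way the copy continues into the next guest buffer.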
1177 */ 1178 if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & 1179 VRING_DESC_F_NEXT) == 0) { 1180 /* Update used ring with desc information */ 1181 vq->used->ring[cur_idx & (vq->size - 1)].id 1182 = vq->buf_vec[vec_idx].desc_idx; 1183 vq->used->ring[cur_idx & (vq->size - 1)].len 1184 = entry_len; 1185 entry_len = 0; 1186 cur_idx++; 1187 entry_success++; 1188 } 1189 1190 vec_idx++; 1191 vb_addr = gpa_to_vva(dev, 1192 vq->buf_vec[vec_idx].buf_addr); 1193 vb_offset = 0; 1194 vb_avail = vq->buf_vec[vec_idx].buf_len; 1195 cpy_len = RTE_MIN(vb_avail, seg_avail); 1196 } else { 1197 /* 1198 * This current segment complete, need continue to 1199 * check if the whole packet complete or not. 1200 */ 1201 pkt = pkt->next; 1202 if (pkt != NULL) { 1203 /* 1204 * There are more segments. 1205 */ 1206 if (vb_avail == 0) { 1207 /* 1208 * This current buffer from vring is 1209 * used up, need fetch next buffer 1210 * from buf_vec. 1211 */ 1212 uint32_t desc_idx = 1213 vq->buf_vec[vec_idx].desc_idx; 1214 vq->desc[desc_idx].len = vb_offset; 1215 1216 if ((vq->desc[desc_idx].flags & 1217 VRING_DESC_F_NEXT) == 0) { 1218 uint16_t wrapped_idx = 1219 cur_idx & (vq->size - 1); 1220 /* 1221 * Update used ring with the 1222 * descriptor information 1223 */ 1224 vq->used->ring[wrapped_idx].id 1225 = desc_idx; 1226 vq->used->ring[wrapped_idx].len 1227 = entry_len; 1228 entry_success++; 1229 entry_len = 0; 1230 cur_idx++; 1231 } 1232 1233 /* Get next buffer from buf_vec. */ 1234 vec_idx++; 1235 vb_addr = gpa_to_vva(dev, 1236 vq->buf_vec[vec_idx].buf_addr); 1237 vb_avail = 1238 vq->buf_vec[vec_idx].buf_len; 1239 vb_offset = 0; 1240 } 1241 1242 seg_offset = 0; 1243 seg_avail = rte_pktmbuf_data_len(pkt); 1244 cpy_len = RTE_MIN(vb_avail, seg_avail); 1245 } else { 1246 /* 1247 * This whole packet completes. 1248 */ 1249 uint32_t desc_idx = 1250 vq->buf_vec[vec_idx].desc_idx; 1251 vq->desc[desc_idx].len = vb_offset; 1252 1253 while (vq->desc[desc_idx].flags & 1254 VRING_DESC_F_NEXT) { 1255 desc_idx = vq->desc[desc_idx].next; 1256 vq->desc[desc_idx].len = 0; 1257 } 1258 1259 /* Update used ring with desc information */ 1260 vq->used->ring[cur_idx & (vq->size - 1)].id 1261 = vq->buf_vec[vec_idx].desc_idx; 1262 vq->used->ring[cur_idx & (vq->size - 1)].len 1263 = entry_len; 1264 entry_len = 0; 1265 cur_idx++; 1266 entry_success++; 1267 seg_avail = 0; 1268 cpy_len = RTE_MIN(vb_avail, seg_avail); 1269 } 1270 } 1271 } 1272 1273 return entry_success; 1274 } 1275 1276 /* 1277 * This function adds buffers to the virtio devices RX virtqueue. Buffers can 1278 * be received from the physical port or from another virtio device. A packet 1279 * count is returned to indicate the number of packets that were succesfully 1280 * added to the RX queue. This function works for mergeable RX. 
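 *
 * Rough per-packet flow (a summary of the code below, not additional
 * behaviour): reserve enough available-ring entries to cover
 * pkt_len + vhost_hlen bytes of descriptor space, record the backing
 * descriptors in vq->buf_vec[], copy the mbuf chain into those buffers
 * with copy_from_mbuf_to_vring() (which also sets
 * virtio_net_hdr_mrg_rxbuf.num_buffers to the number of entries used),
 * and finally publish the entries on the used ring and kick the guest.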
1281 */ 1282 static inline uint32_t __attribute__((always_inline)) 1283 virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts, 1284 uint32_t count) 1285 { 1286 struct vhost_virtqueue *vq; 1287 uint32_t pkt_idx = 0, entry_success = 0; 1288 uint32_t retry = 0; 1289 uint16_t avail_idx, res_cur_idx; 1290 uint16_t res_base_idx, res_end_idx; 1291 uint8_t success = 0; 1292 1293 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", 1294 dev->device_fh); 1295 vq = dev->virtqueue[VIRTIO_RXQ]; 1296 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); 1297 1298 if (count == 0) 1299 return 0; 1300 1301 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { 1302 uint32_t secure_len = 0; 1303 uint16_t need_cnt; 1304 uint32_t vec_idx = 0; 1305 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; 1306 uint16_t i, id; 1307 1308 do { 1309 /* 1310 * As many data cores may want access to available 1311 * buffers, they need to be reserved. 1312 */ 1313 res_base_idx = vq->last_used_idx_res; 1314 res_cur_idx = res_base_idx; 1315 1316 do { 1317 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1318 if (unlikely(res_cur_idx == avail_idx)) { 1319 /* 1320 * If retry is enabled and the queue is 1321 * full then we wait and retry to avoid 1322 * packet loss. 1323 */ 1324 if (enable_retry) { 1325 uint8_t cont = 0; 1326 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1327 rte_delay_us(burst_rx_delay_time); 1328 avail_idx = 1329 *((volatile uint16_t *)&vq->avail->idx); 1330 if (likely(res_cur_idx != avail_idx)) { 1331 cont = 1; 1332 break; 1333 } 1334 } 1335 if (cont == 1) 1336 continue; 1337 } 1338 1339 LOG_DEBUG(VHOST_DATA, 1340 "(%"PRIu64") Failed " 1341 "to get enough desc from " 1342 "vring\n", 1343 dev->device_fh); 1344 return pkt_idx; 1345 } else { 1346 uint16_t wrapped_idx = 1347 (res_cur_idx) & (vq->size - 1); 1348 uint32_t idx = 1349 vq->avail->ring[wrapped_idx]; 1350 uint8_t next_desc; 1351 1352 do { 1353 next_desc = 0; 1354 secure_len += vq->desc[idx].len; 1355 if (vq->desc[idx].flags & 1356 VRING_DESC_F_NEXT) { 1357 idx = vq->desc[idx].next; 1358 next_desc = 1; 1359 } 1360 } while (next_desc); 1361 1362 res_cur_idx++; 1363 } 1364 } while (pkt_len > secure_len); 1365 1366 /* vq->last_used_idx_res is atomically updated. */ 1367 success = rte_atomic16_cmpset(&vq->last_used_idx_res, 1368 res_base_idx, 1369 res_cur_idx); 1370 } while (success == 0); 1371 1372 id = res_base_idx; 1373 need_cnt = res_cur_idx - res_base_idx; 1374 1375 for (i = 0; i < need_cnt; i++, id++) { 1376 uint16_t wrapped_idx = id & (vq->size - 1); 1377 uint32_t idx = vq->avail->ring[wrapped_idx]; 1378 uint8_t next_desc; 1379 do { 1380 next_desc = 0; 1381 vq->buf_vec[vec_idx].buf_addr = 1382 vq->desc[idx].addr; 1383 vq->buf_vec[vec_idx].buf_len = 1384 vq->desc[idx].len; 1385 vq->buf_vec[vec_idx].desc_idx = idx; 1386 vec_idx++; 1387 1388 if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { 1389 idx = vq->desc[idx].next; 1390 next_desc = 1; 1391 } 1392 } while (next_desc); 1393 } 1394 1395 res_end_idx = res_cur_idx; 1396 1397 entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, 1398 res_end_idx, pkts[pkt_idx]); 1399 1400 rte_compiler_barrier(); 1401 1402 /* 1403 * Wait until it's our turn to add our buffer 1404 * to the used ring. 1405 */ 1406 while (unlikely(vq->last_used_idx != res_base_idx)) 1407 rte_pause(); 1408 1409 *(volatile uint16_t *)&vq->used->idx += entry_success; 1410 vq->last_used_idx = res_end_idx; 1411 1412 /* Kick the guest if necessary. 
*/ 1413 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1414 eventfd_write((int)vq->kickfd, 1); 1415 } 1416 1417 return count; 1418 } 1419 1420 /* 1421 * Compares a packet destination MAC address to a device MAC address. 1422 */ 1423 static inline int __attribute__((always_inline)) 1424 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 1425 { 1426 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 1427 } 1428 1429 /* 1430 * This function learns the MAC address of the device and registers this along with a 1431 * vlan tag to a VMDQ. 1432 */ 1433 static int 1434 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m) 1435 { 1436 struct ether_hdr *pkt_hdr; 1437 struct virtio_net_data_ll *dev_ll; 1438 int i, ret; 1439 1440 /* Learn MAC address of guest device from packet */ 1441 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1442 1443 dev_ll = ll_root_used; 1444 1445 while (dev_ll != NULL) { 1446 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) { 1447 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 1448 return -1; 1449 } 1450 dev_ll = dev_ll->next; 1451 } 1452 1453 for (i = 0; i < ETHER_ADDR_LEN; i++) 1454 dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 1455 1456 /* vlan_tag currently uses the device_id. */ 1457 dev->vlan_tag = vlan_tags[dev->device_fh]; 1458 1459 /* Print out VMDQ registration info. */ 1460 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 1461 dev->device_fh, 1462 dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1], 1463 dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3], 1464 dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5], 1465 dev->vlan_tag); 1466 1467 /* Register the MAC address. */ 1468 ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh); 1469 if (ret) 1470 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 1471 dev->device_fh); 1472 1473 /* Enable stripping of the vlan tag as we handle routing. */ 1474 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1); 1475 1476 /* Set device as ready for RX. */ 1477 dev->ready = DEVICE_RX; 1478 1479 return 0; 1480 } 1481 1482 /* 1483 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 1484 * queue before disabling RX on the device. 1485 */ 1486 static inline void 1487 unlink_vmdq(struct virtio_net *dev) 1488 { 1489 unsigned i = 0; 1490 unsigned rx_count; 1491 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1492 1493 if (dev->ready == DEVICE_RX) { 1494 /*clear MAC and VLAN settings*/ 1495 rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address); 1496 for (i = 0; i < 6; i++) 1497 dev->mac_address.addr_bytes[i] = 0; 1498 1499 dev->vlan_tag = 0; 1500 1501 /*Clear out the receive buffers*/ 1502 rx_count = rte_eth_rx_burst(ports[0], 1503 (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1504 1505 while (rx_count) { 1506 for (i = 0; i < rx_count; i++) 1507 rte_pktmbuf_free(pkts_burst[i]); 1508 1509 rx_count = rte_eth_rx_burst(ports[0], 1510 (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1511 } 1512 1513 dev->ready = DEVICE_MAC_LEARNING; 1514 } 1515 } 1516 1517 /* 1518 * Check if the packet destination MAC address is for a local device. If so then put 1519 * the packet on that devices RX queue. If not then return. 
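 *
 * Return value (as used by virtio_tx_route() below): 0 means the packet
 * was consumed locally (delivered to another virtio device, or dropped
 * on purpose), -1 means no local device owns the destination MAC and the
 * packet should go out of the physical port.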
1520 */ 1521 static inline unsigned __attribute__((always_inline)) 1522 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m) 1523 { 1524 struct virtio_net_data_ll *dev_ll; 1525 struct ether_hdr *pkt_hdr; 1526 uint64_t ret = 0; 1527 1528 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1529 1530 /*get the used devices list*/ 1531 dev_ll = ll_root_used; 1532 1533 while (dev_ll != NULL) { 1534 if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1535 &dev_ll->dev->mac_address)) { 1536 1537 /* Drop the packet if the TX packet is destined for the TX device. */ 1538 if (dev_ll->dev->device_fh == dev->device_fh) { 1539 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1540 dev_ll->dev->device_fh); 1541 return 0; 1542 } 1543 1544 1545 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh); 1546 1547 if (dev_ll->dev->remove) { 1548 /*drop the packet if the device is marked for removal*/ 1549 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh); 1550 } else { 1551 uint32_t mergeable = 1552 dev_ll->dev->features & 1553 (1 << VIRTIO_NET_F_MRG_RXBUF); 1554 1555 /*send the packet to the local virtio device*/ 1556 if (likely(mergeable == 0)) 1557 ret = virtio_dev_rx(dev_ll->dev, &m, 1); 1558 else 1559 ret = virtio_dev_merge_rx(dev_ll->dev, 1560 &m, 1); 1561 1562 if (enable_stats) { 1563 rte_atomic64_add( 1564 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic, 1565 1); 1566 rte_atomic64_add( 1567 &dev_statistics[dev_ll->dev->device_fh].rx_atomic, 1568 ret); 1569 dev_statistics[dev->device_fh].tx_total++; 1570 dev_statistics[dev->device_fh].tx += ret; 1571 } 1572 } 1573 1574 return 0; 1575 } 1576 dev_ll = dev_ll->next; 1577 } 1578 1579 return -1; 1580 } 1581 1582 /* 1583 * This function routes the TX packet to the correct interface. This may be a local device 1584 * or the physical port. 1585 */ 1586 static inline void __attribute__((always_inline)) 1587 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag) 1588 { 1589 struct mbuf_table *tx_q; 1590 struct vlan_ethhdr *vlan_hdr; 1591 struct rte_mbuf **m_table; 1592 struct rte_mbuf *mbuf, *prev; 1593 unsigned len, ret, offset = 0; 1594 const uint16_t lcore_id = rte_lcore_id(); 1595 struct virtio_net_data_ll *dev_ll = ll_root_used; 1596 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1597 1598 /*check if destination is local VM*/ 1599 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0)) 1600 return; 1601 1602 if (vm2vm_mode == VM2VM_HARDWARE) { 1603 while (dev_ll != NULL) { 1604 if ((dev_ll->dev->ready == DEVICE_RX) 1605 && ether_addr_cmp(&(pkt_hdr->d_addr), 1606 &dev_ll->dev->mac_address)) { 1607 /* 1608 * Drop the packet if the TX packet is 1609 * destined for the TX device. 1610 */ 1611 if (dev_ll->dev->device_fh == dev->device_fh) { 1612 LOG_DEBUG(VHOST_DATA, 1613 "(%"PRIu64") TX: Source and destination" 1614 " MAC addresses are the same. 
Dropping " 1615 "packet.\n", 1616 dev_ll->dev->device_fh); 1617 return; 1618 } 1619 offset = 4; 1620 vlan_tag = 1621 (uint16_t) 1622 vlan_tags[(uint16_t)dev_ll->dev->device_fh]; 1623 1624 LOG_DEBUG(VHOST_DATA, 1625 "(%"PRIu64") TX: pkt to local VM device id:" 1626 "(%"PRIu64") vlan tag: %d.\n", 1627 dev->device_fh, dev_ll->dev->device_fh, 1628 vlan_tag); 1629 1630 break; 1631 } 1632 dev_ll = dev_ll->next; 1633 } 1634 } 1635 1636 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1637 1638 /*Add packet to the port tx queue*/ 1639 tx_q = &lcore_tx_queue[lcore_id]; 1640 len = tx_q->len; 1641 1642 /* Allocate an mbuf and populate the structure. */ 1643 mbuf = rte_pktmbuf_alloc(mbuf_pool); 1644 if (unlikely(mbuf == NULL)) { 1645 RTE_LOG(ERR, VHOST_DATA, 1646 "Failed to allocate memory for mbuf.\n"); 1647 return; 1648 } 1649 1650 mbuf->data_len = m->data_len + VLAN_HLEN + offset; 1651 mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset; 1652 mbuf->nb_segs = m->nb_segs; 1653 1654 /* Copy ethernet header to mbuf. */ 1655 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1656 rte_pktmbuf_mtod(m, const void *), 1657 ETH_HLEN); 1658 1659 1660 /* Setup vlan header. Bytes need to be re-ordered for network with htons()*/ 1661 vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *); 1662 vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto; 1663 vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q); 1664 vlan_hdr->h_vlan_TCI = htons(vlan_tag); 1665 1666 /* Copy the remaining packet contents to the mbuf. */ 1667 rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN), 1668 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN), 1669 (m->data_len - ETH_HLEN)); 1670 1671 /* Copy the remaining segments for the whole packet. */ 1672 prev = mbuf; 1673 while (m->next) { 1674 /* Allocate an mbuf and populate the structure. */ 1675 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool); 1676 if (unlikely(next_mbuf == NULL)) { 1677 rte_pktmbuf_free(mbuf); 1678 RTE_LOG(ERR, VHOST_DATA, 1679 "Failed to allocate memory for mbuf.\n"); 1680 return; 1681 } 1682 1683 m = m->next; 1684 prev->next = next_mbuf; 1685 prev = next_mbuf; 1686 next_mbuf->data_len = m->data_len; 1687 1688 /* Copy data to next mbuf. */ 1689 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *), 1690 rte_pktmbuf_mtod(m, const void *), m->data_len); 1691 } 1692 1693 tx_q->m_table[len] = mbuf; 1694 len++; 1695 if (enable_stats) { 1696 dev_statistics[dev->device_fh].tx_total++; 1697 dev_statistics[dev->device_fh].tx++; 1698 } 1699 1700 if (unlikely(len == MAX_PKT_BURST)) { 1701 m_table = (struct rte_mbuf **)tx_q->m_table; 1702 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1703 /* Free any buffers not handled by TX and update the port stats. 
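 *
 * rte_eth_tx_burst() may accept fewer packets than requested; it returns
 * the number actually queued for transmission. For example, if ret is 28
 * and len is 32, m_table[28] through m_table[31] were not accepted and
 * are freed by the loop below so they are not leaked.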
*/ 1704 if (unlikely(ret < len)) { 1705 do { 1706 rte_pktmbuf_free(m_table[ret]); 1707 } while (++ret < len); 1708 } 1709 1710 len = 0; 1711 } 1712 1713 tx_q->len = len; 1714 return; 1715 } 1716 1717 static inline void __attribute__((always_inline)) 1718 virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool) 1719 { 1720 struct rte_mbuf m; 1721 struct vhost_virtqueue *vq; 1722 struct vring_desc *desc; 1723 uint64_t buff_addr = 0; 1724 uint32_t head[MAX_PKT_BURST]; 1725 uint32_t used_idx; 1726 uint32_t i; 1727 uint16_t free_entries, packet_success = 0; 1728 uint16_t avail_idx; 1729 1730 vq = dev->virtqueue[VIRTIO_TXQ]; 1731 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1732 1733 /* If there are no available buffers then return. */ 1734 if (vq->last_used_idx == avail_idx) 1735 return; 1736 1737 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1738 1739 /* Prefetch available ring to retrieve head indexes. */ 1740 rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); 1741 1742 /*get the number of free entries in the ring*/ 1743 free_entries = (avail_idx - vq->last_used_idx); 1744 1745 /* Limit to MAX_PKT_BURST. */ 1746 if (free_entries > MAX_PKT_BURST) 1747 free_entries = MAX_PKT_BURST; 1748 1749 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries); 1750 /* Retrieve all of the head indexes first to avoid caching issues. */ 1751 for (i = 0; i < free_entries; i++) 1752 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; 1753 1754 /* Prefetch descriptor index. */ 1755 rte_prefetch0(&vq->desc[head[packet_success]]); 1756 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1757 1758 while (packet_success < free_entries) { 1759 desc = &vq->desc[head[packet_success]]; 1760 1761 /* Discard first buffer as it is the virtio header */ 1762 desc = &vq->desc[desc->next]; 1763 1764 /* Buffer address translation. */ 1765 buff_addr = gpa_to_vva(dev, desc->addr); 1766 /* Prefetch buffer address. */ 1767 rte_prefetch0((void*)(uintptr_t)buff_addr); 1768 1769 used_idx = vq->last_used_idx & (vq->size - 1); 1770 1771 if (packet_success < (free_entries - 1)) { 1772 /* Prefetch descriptor index. */ 1773 rte_prefetch0(&vq->desc[head[packet_success+1]]); 1774 rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); 1775 } 1776 1777 /* Update used index buffer information. */ 1778 vq->used->ring[used_idx].id = head[packet_success]; 1779 vq->used->ring[used_idx].len = 0; 1780 1781 /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */ 1782 m.data_len = desc->len; 1783 m.pkt_len = desc->len; 1784 m.data_off = 0; 1785 1786 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1787 1788 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1789 if (dev->ready == DEVICE_MAC_LEARNING) { 1790 if (dev->remove || (link_vmdq(dev, &m) == -1)) { 1791 /*discard frame if device is scheduled for removal or a duplicate MAC address is found. */ 1792 packet_success += free_entries; 1793 vq->last_used_idx += packet_success; 1794 break; 1795 } 1796 } 1797 virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh); 1798 1799 vq->last_used_idx++; 1800 packet_success++; 1801 } 1802 1803 rte_compiler_barrier(); 1804 vq->used->idx += packet_success; 1805 /* Kick guest if required. 
*/ 1806 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1807 eventfd_write((int)vq->kickfd, 1); 1808 } 1809 1810 /* This function works for TX packets with mergeable feature enabled. */ 1811 static inline void __attribute__((always_inline)) 1812 virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool) 1813 { 1814 struct rte_mbuf *m, *prev; 1815 struct vhost_virtqueue *vq; 1816 struct vring_desc *desc; 1817 uint64_t vb_addr = 0; 1818 uint32_t head[MAX_PKT_BURST]; 1819 uint32_t used_idx; 1820 uint32_t i; 1821 uint16_t free_entries, entry_success = 0; 1822 uint16_t avail_idx; 1823 uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf) 1824 + RTE_PKTMBUF_HEADROOM); 1825 1826 vq = dev->virtqueue[VIRTIO_TXQ]; 1827 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1828 1829 /* If there are no available buffers then return. */ 1830 if (vq->last_used_idx == avail_idx) 1831 return; 1832 1833 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n", 1834 dev->device_fh); 1835 1836 /* Prefetch available ring to retrieve head indexes. */ 1837 rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); 1838 1839 /*get the number of free entries in the ring*/ 1840 free_entries = (avail_idx - vq->last_used_idx); 1841 1842 /* Limit to MAX_PKT_BURST. */ 1843 free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); 1844 1845 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1846 dev->device_fh, free_entries); 1847 /* Retrieve all of the head indexes first to avoid caching issues. */ 1848 for (i = 0; i < free_entries; i++) 1849 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; 1850 1851 /* Prefetch descriptor index. */ 1852 rte_prefetch0(&vq->desc[head[entry_success]]); 1853 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1854 1855 while (entry_success < free_entries) { 1856 uint32_t vb_avail, vb_offset; 1857 uint32_t seg_avail, seg_offset; 1858 uint32_t cpy_len; 1859 uint32_t seg_num = 0; 1860 struct rte_mbuf *cur; 1861 uint8_t alloc_err = 0; 1862 1863 desc = &vq->desc[head[entry_success]]; 1864 1865 /* Discard first buffer as it is the virtio header */ 1866 desc = &vq->desc[desc->next]; 1867 1868 /* Buffer address translation. */ 1869 vb_addr = gpa_to_vva(dev, desc->addr); 1870 /* Prefetch buffer address. */ 1871 rte_prefetch0((void *)(uintptr_t)vb_addr); 1872 1873 used_idx = vq->last_used_idx & (vq->size - 1); 1874 1875 if (entry_success < (free_entries - 1)) { 1876 /* Prefetch descriptor index. */ 1877 rte_prefetch0(&vq->desc[head[entry_success+1]]); 1878 rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); 1879 } 1880 1881 /* Update used index buffer information. */ 1882 vq->used->ring[used_idx].id = head[entry_success]; 1883 vq->used->ring[used_idx].len = 0; 1884 1885 vb_offset = 0; 1886 vb_avail = desc->len; 1887 seg_offset = 0; 1888 seg_avail = buf_size; 1889 cpy_len = RTE_MIN(vb_avail, seg_avail); 1890 1891 PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); 1892 1893 /* Allocate an mbuf and populate the structure. 
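 * Each host mbuf segment can hold at most buf_size bytes (MBUF_SIZE minus
 * the rte_mbuf header and headroom, computed at the top of this function),
 * so the copy loop that follows moves RTE_MIN(vb_avail, seg_avail) bytes at
 * a time and chains a fresh mbuf whenever the current segment fills up
 * while the guest descriptor still has data left.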
*/ 1894 m = rte_pktmbuf_alloc(mbuf_pool); 1895 if (unlikely(m == NULL)) { 1896 RTE_LOG(ERR, VHOST_DATA, 1897 "Failed to allocate memory for mbuf.\n"); 1898 return; 1899 } 1900 1901 seg_num++; 1902 cur = m; 1903 prev = m; 1904 while (cpy_len != 0) { 1905 rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), 1906 (void *)((uintptr_t)(vb_addr + vb_offset)), 1907 cpy_len); 1908 1909 seg_offset += cpy_len; 1910 vb_offset += cpy_len; 1911 vb_avail -= cpy_len; 1912 seg_avail -= cpy_len; 1913 1914 if (vb_avail != 0) { 1915 /* 1916 * The segment reachs to its end, 1917 * while the virtio buffer in TX vring has 1918 * more data to be copied. 1919 */ 1920 cur->data_len = seg_offset; 1921 m->pkt_len += seg_offset; 1922 /* Allocate mbuf and populate the structure. */ 1923 cur = rte_pktmbuf_alloc(mbuf_pool); 1924 if (unlikely(cur == NULL)) { 1925 RTE_LOG(ERR, VHOST_DATA, "Failed to " 1926 "allocate memory for mbuf.\n"); 1927 rte_pktmbuf_free(m); 1928 alloc_err = 1; 1929 break; 1930 } 1931 1932 seg_num++; 1933 prev->next = cur; 1934 prev = cur; 1935 seg_offset = 0; 1936 seg_avail = buf_size; 1937 } else { 1938 if (desc->flags & VRING_DESC_F_NEXT) { 1939 /* 1940 * There are more virtio buffers in 1941 * same vring entry need to be copied. 1942 */ 1943 if (seg_avail == 0) { 1944 /* 1945 * The current segment hasn't 1946 * room to accomodate more 1947 * data. 1948 */ 1949 cur->data_len = seg_offset; 1950 m->pkt_len += seg_offset; 1951 /* 1952 * Allocate an mbuf and 1953 * populate the structure. 1954 */ 1955 cur = rte_pktmbuf_alloc(mbuf_pool); 1956 if (unlikely(cur == NULL)) { 1957 RTE_LOG(ERR, 1958 VHOST_DATA, 1959 "Failed to " 1960 "allocate memory " 1961 "for mbuf\n"); 1962 rte_pktmbuf_free(m); 1963 alloc_err = 1; 1964 break; 1965 } 1966 seg_num++; 1967 prev->next = cur; 1968 prev = cur; 1969 seg_offset = 0; 1970 seg_avail = buf_size; 1971 } 1972 1973 desc = &vq->desc[desc->next]; 1974 1975 /* Buffer address translation. */ 1976 vb_addr = gpa_to_vva(dev, desc->addr); 1977 /* Prefetch buffer address. */ 1978 rte_prefetch0((void *)(uintptr_t)vb_addr); 1979 vb_offset = 0; 1980 vb_avail = desc->len; 1981 1982 PRINT_PACKET(dev, (uintptr_t)vb_addr, 1983 desc->len, 0); 1984 } else { 1985 /* The whole packet completes. */ 1986 cur->data_len = seg_offset; 1987 m->pkt_len += seg_offset; 1988 vb_avail = 0; 1989 } 1990 } 1991 1992 cpy_len = RTE_MIN(vb_avail, seg_avail); 1993 } 1994 1995 if (unlikely(alloc_err == 1)) 1996 break; 1997 1998 m->nb_segs = seg_num; 1999 2000 /* 2001 * If this is the first received packet we need to learn 2002 * the MAC and setup VMDQ 2003 */ 2004 if (dev->ready == DEVICE_MAC_LEARNING) { 2005 if (dev->remove || (link_vmdq(dev, m) == -1)) { 2006 /* 2007 * Discard frame if device is scheduled for 2008 * removal or a duplicate MAC address is found. 2009 */ 2010 entry_success = free_entries; 2011 vq->last_used_idx += entry_success; 2012 rte_pktmbuf_free(m); 2013 break; 2014 } 2015 } 2016 2017 virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh); 2018 vq->last_used_idx++; 2019 entry_success++; 2020 rte_pktmbuf_free(m); 2021 } 2022 2023 rte_compiler_barrier(); 2024 vq->used->idx += entry_success; 2025 /* Kick guest if required. */ 2026 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 2027 eventfd_write((int)vq->kickfd, 1); 2028 2029 } 2030 2031 /* 2032 * This function is called by each data core. It handles all RX/TX registered with the 2033 * core. For TX the specific lcore linked list is used. 
For RX, MAC addresses are compared 2034 * with all devices in the main linked list. 2035 */ 2036 static int 2037 switch_worker(__attribute__((unused)) void *arg) 2038 { 2039 struct rte_mempool *mbuf_pool = arg; 2040 struct virtio_net *dev = NULL; 2041 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2042 struct virtio_net_data_ll *dev_ll; 2043 struct mbuf_table *tx_q; 2044 volatile struct lcore_ll_info *lcore_ll; 2045 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 2046 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2047 unsigned ret, i; 2048 const uint16_t lcore_id = rte_lcore_id(); 2049 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 2050 uint16_t rx_count = 0; 2051 uint32_t mergeable = 0; 2052 2053 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2054 lcore_ll = lcore_info[lcore_id].lcore_ll; 2055 prev_tsc = 0; 2056 2057 tx_q = &lcore_tx_queue[lcore_id]; 2058 for (i = 0; i < num_cores; i ++) { 2059 if (lcore_ids[i] == lcore_id) { 2060 tx_q->txq_id = i; 2061 break; 2062 } 2063 } 2064 2065 while(1) { 2066 cur_tsc = rte_rdtsc(); 2067 /* 2068 * TX burst queue drain 2069 */ 2070 diff_tsc = cur_tsc - prev_tsc; 2071 if (unlikely(diff_tsc > drain_tsc)) { 2072 2073 if (tx_q->len) { 2074 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 2075 2076 /*Tx any packets in the queue*/ 2077 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 2078 (struct rte_mbuf **)tx_q->m_table, 2079 (uint16_t)tx_q->len); 2080 if (unlikely(ret < tx_q->len)) { 2081 do { 2082 rte_pktmbuf_free(tx_q->m_table[ret]); 2083 } while (++ret < tx_q->len); 2084 } 2085 2086 tx_q->len = 0; 2087 } 2088 2089 prev_tsc = cur_tsc; 2090 2091 } 2092 2093 rte_prefetch0(lcore_ll->ll_root_used); 2094 /* 2095 * Inform the configuration core that we have exited the linked list and that no devices are 2096 * in use if requested. 2097 */ 2098 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2099 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2100 2101 /* 2102 * Process devices 2103 */ 2104 dev_ll = lcore_ll->ll_root_used; 2105 2106 while (dev_ll != NULL) { 2107 /*get virtio device ID*/ 2108 dev = dev_ll->dev; 2109 mergeable = 2110 dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); 2111 2112 if (dev->remove) { 2113 dev_ll = dev_ll->next; 2114 unlink_vmdq(dev); 2115 dev->ready = DEVICE_SAFE_REMOVE; 2116 continue; 2117 } 2118 if (likely(dev->ready == DEVICE_RX)) { 2119 /*Handle guest RX*/ 2120 rx_count = rte_eth_rx_burst(ports[0], 2121 (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 2122 2123 if (rx_count) { 2124 if (likely(mergeable == 0)) 2125 ret_count = 2126 virtio_dev_rx(dev, 2127 pkts_burst, rx_count); 2128 else 2129 ret_count = 2130 virtio_dev_merge_rx(dev, 2131 pkts_burst, rx_count); 2132 2133 if (enable_stats) { 2134 rte_atomic64_add( 2135 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic, 2136 rx_count); 2137 rte_atomic64_add( 2138 &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count); 2139 } 2140 while (likely(rx_count)) { 2141 rx_count--; 2142 rte_pktmbuf_free(pkts_burst[rx_count]); 2143 } 2144 2145 } 2146 } 2147 2148 if (!dev->remove) { 2149 /*Handle guest TX*/ 2150 if (likely(mergeable == 0)) 2151 virtio_dev_tx(dev, mbuf_pool); 2152 else 2153 virtio_dev_merge_tx(dev, mbuf_pool); 2154 } 2155 2156 /*move to the next device in the list*/ 2157 dev_ll = dev_ll->next; 2158 } 2159 } 2160 2161 return 0; 2162 } 2163 2164 /* 2165 * This function gets available ring number for zero copy rx. 
2166 * Only one thread will call this function for a particular virtio device, 2167 * so it is designed as a non-thread-safe function. 2168 */ 2169 static inline uint32_t __attribute__((always_inline)) 2170 get_available_ring_num_zcp(struct virtio_net *dev) 2171 { 2172 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 2173 uint16_t avail_idx; 2174 2175 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 2176 return (uint32_t)(avail_idx - vq->last_used_idx_res); 2177 } 2178 2179 /* 2180 * This function gets available ring indexes for zero copy RX, 2181 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes. 2182 * Only one thread will call this function for a particular virtio device, 2183 * so it is designed as a non-thread-safe function. 2184 */ 2185 static inline uint32_t __attribute__((always_inline)) 2186 get_available_ring_index_zcp(struct virtio_net *dev, 2187 uint16_t *res_base_idx, uint32_t count) 2188 { 2189 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 2190 uint16_t avail_idx; 2191 uint32_t retry = 0; 2192 uint16_t free_entries; 2193 2194 *res_base_idx = vq->last_used_idx_res; 2195 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 2196 free_entries = (avail_idx - *res_base_idx); 2197 2198 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 2199 "avail idx: %d, " 2200 "res base idx:%d, free entries:%d\n", 2201 dev->device_fh, avail_idx, *res_base_idx, 2202 free_entries); 2203 2204 /* 2205 * If retry is enabled and the queue is full then we wait 2206 * and retry to avoid packet loss. 2207 */ 2208 if (enable_retry && unlikely(count > free_entries)) { 2209 for (retry = 0; retry < burst_rx_retry_num; retry++) { 2210 rte_delay_us(burst_rx_delay_time); 2211 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 2212 free_entries = (avail_idx - *res_base_idx); 2213 if (count <= free_entries) 2214 break; 2215 } 2216 } 2217 2218 /* Check that we have enough buffers. */ 2219 if (unlikely(count > free_entries)) 2220 count = free_entries; 2221 2222 if (unlikely(count == 0)) { 2223 LOG_DEBUG(VHOST_DATA, 2224 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 2225 "avail idx: %d, res base idx:%d, free entries:%d\n", 2226 dev->device_fh, avail_idx, 2227 *res_base_idx, free_entries); 2228 return 0; 2229 } 2230 2231 vq->last_used_idx_res = *res_base_idx + count; 2232 2233 return count; 2234 } 2235 2236 /* 2237 * This function puts a descriptor back on the used list. 2238 */ 2239 static inline void __attribute__((always_inline)) 2240 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 2241 { 2242 uint16_t res_cur_idx = vq->last_used_idx; 2243 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 2244 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 2245 rte_compiler_barrier(); 2246 *(volatile uint16_t *)&vq->used->idx += 1; 2247 vq->last_used_idx += 1; 2248 2249 /* Kick the guest if necessary. */ 2250 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 2251 eventfd_write((int)vq->kickfd, 1); 2252 } 2253 2254 /* 2255 * This function gets an available descriptor from the virtio vring and an 2256 * un-attached mbuf from vpool->ring, and then attaches them together. The 2257 * offsets of buff_addr and phys_addr must be adjusted according to the PMD 2258 * implementation, otherwise the frame data may be placed at the wrong location in the mbuf.
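 * In this sample the adjustment simply re-creates a standard headroom in
 * front of the guest frame; further down this function does, in effect:
 *
 *     mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
 *     mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
 *     mbuf->data_off = RTE_PKTMBUF_HEADROOM;
 *
 * so that buf_addr + data_off still points at the start of the frame data
 * inside guest memory.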
2259 */ 2260 static inline void __attribute__((always_inline)) 2261 attach_rxmbuf_zcp(struct virtio_net *dev) 2262 { 2263 uint16_t res_base_idx, desc_idx; 2264 uint64_t buff_addr, phys_addr; 2265 struct vhost_virtqueue *vq; 2266 struct vring_desc *desc; 2267 struct rte_mbuf *mbuf = NULL; 2268 struct vpool *vpool; 2269 hpa_type addr_type; 2270 2271 vpool = &vpool_array[dev->vmdq_rx_q]; 2272 vq = dev->virtqueue[VIRTIO_RXQ]; 2273 2274 do { 2275 if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx, 2276 1) != 1)) 2277 return; 2278 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 2279 2280 desc = &vq->desc[desc_idx]; 2281 if (desc->flags & VRING_DESC_F_NEXT) { 2282 desc = &vq->desc[desc->next]; 2283 buff_addr = gpa_to_vva(dev, desc->addr); 2284 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, 2285 &addr_type); 2286 } else { 2287 buff_addr = gpa_to_vva(dev, 2288 desc->addr + vq->vhost_hlen); 2289 phys_addr = gpa_to_hpa(dev, 2290 desc->addr + vq->vhost_hlen, 2291 desc->len, &addr_type); 2292 } 2293 2294 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 2295 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 2296 " address found when attaching RX frame buffer" 2297 " address!\n", dev->device_fh); 2298 put_desc_to_used_list_zcp(vq, desc_idx); 2299 continue; 2300 } 2301 2302 /* 2303 * Check if the frame buffer address from guest crosses 2304 * sub-region or not. 2305 */ 2306 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2307 RTE_LOG(ERR, VHOST_DATA, 2308 "(%"PRIu64") Frame buffer address cross " 2309 "sub-regioin found when attaching RX frame " 2310 "buffer address!\n", 2311 dev->device_fh); 2312 put_desc_to_used_list_zcp(vq, desc_idx); 2313 continue; 2314 } 2315 } while (unlikely(phys_addr == 0)); 2316 2317 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 2318 if (unlikely(mbuf == NULL)) { 2319 LOG_DEBUG(VHOST_DATA, 2320 "(%"PRIu64") in attach_rxmbuf_zcp: " 2321 "ring_sc_dequeue fail.\n", 2322 dev->device_fh); 2323 put_desc_to_used_list_zcp(vq, desc_idx); 2324 return; 2325 } 2326 2327 if (unlikely(vpool->buf_size > desc->len)) { 2328 LOG_DEBUG(VHOST_DATA, 2329 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 2330 "length(%d) of descriptor idx: %d less than room " 2331 "size required: %d\n", 2332 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 2333 put_desc_to_used_list_zcp(vq, desc_idx); 2334 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 2335 return; 2336 } 2337 2338 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 2339 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 2340 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 2341 mbuf->data_len = desc->len; 2342 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 2343 2344 LOG_DEBUG(VHOST_DATA, 2345 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 2346 "descriptor idx:%d\n", 2347 dev->device_fh, res_base_idx, desc_idx); 2348 2349 __rte_mbuf_raw_free(mbuf); 2350 2351 return; 2352 } 2353 2354 /* 2355 * Detach an attched packet mbuf - 2356 * - restore original mbuf address and length values. 2357 * - reset pktmbuf data and data_len to their default values. 2358 * All other fields of the given packet mbuf will be left intact. 2359 * 2360 * @param m 2361 * The attached packet mbuf. 
2362 */ 2363 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 2364 { 2365 const struct rte_mempool *mp = m->pool; 2366 void *buf = RTE_MBUF_TO_BADDR(m); 2367 uint32_t buf_ofs; 2368 uint32_t buf_len = mp->elt_size - sizeof(*m); 2369 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 2370 2371 m->buf_addr = buf; 2372 m->buf_len = (uint16_t)buf_len; 2373 2374 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 2375 RTE_PKTMBUF_HEADROOM : m->buf_len; 2376 m->data_off = buf_ofs; 2377 2378 m->data_len = 0; 2379 } 2380 2381 /* 2382 * This function is called after packets have been transimited. It fetchs mbuf 2383 * from vpool->pool, detached it and put into vpool->ring. It also update the 2384 * used index and kick the guest if necessary. 2385 */ 2386 static inline uint32_t __attribute__((always_inline)) 2387 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 2388 { 2389 struct rte_mbuf *mbuf; 2390 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 2391 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 2392 uint32_t index = 0; 2393 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 2394 2395 LOG_DEBUG(VHOST_DATA, 2396 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 2397 "clean is: %d\n", 2398 dev->device_fh, mbuf_count); 2399 LOG_DEBUG(VHOST_DATA, 2400 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 2401 "clean is : %d\n", 2402 dev->device_fh, rte_ring_count(vpool->ring)); 2403 2404 for (index = 0; index < mbuf_count; index++) { 2405 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 2406 if (likely(RTE_MBUF_INDIRECT(mbuf))) 2407 pktmbuf_detach_zcp(mbuf); 2408 rte_ring_sp_enqueue(vpool->ring, mbuf); 2409 2410 /* Update used index buffer information. */ 2411 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 2412 vq->used->ring[used_idx].len = 0; 2413 2414 used_idx = (used_idx + 1) & (vq->size - 1); 2415 } 2416 2417 LOG_DEBUG(VHOST_DATA, 2418 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 2419 "clean is: %d\n", 2420 dev->device_fh, rte_mempool_count(vpool->pool)); 2421 LOG_DEBUG(VHOST_DATA, 2422 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 2423 "clean is : %d\n", 2424 dev->device_fh, rte_ring_count(vpool->ring)); 2425 LOG_DEBUG(VHOST_DATA, 2426 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 2427 "vq->last_used_idx:%d\n", 2428 dev->device_fh, vq->last_used_idx); 2429 2430 vq->last_used_idx += mbuf_count; 2431 2432 LOG_DEBUG(VHOST_DATA, 2433 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 2434 "vq->last_used_idx:%d\n", 2435 dev->device_fh, vq->last_used_idx); 2436 2437 rte_compiler_barrier(); 2438 2439 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 2440 2441 /* Kick guest if required. */ 2442 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 2443 eventfd_write((int)vq->kickfd, 1); 2444 2445 return 0; 2446 } 2447 2448 /* 2449 * This function is called when a virtio device is destroy. 2450 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
2451 */ 2452 static void mbuf_destroy_zcp(struct vpool *vpool) 2453 { 2454 struct rte_mbuf *mbuf = NULL; 2455 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 2456 2457 LOG_DEBUG(VHOST_CONFIG, 2458 "in mbuf_destroy_zcp: mbuf count in mempool before " 2459 "mbuf_destroy_zcp is: %d\n", 2460 mbuf_count); 2461 LOG_DEBUG(VHOST_CONFIG, 2462 "in mbuf_destroy_zcp: mbuf count in ring before " 2463 "mbuf_destroy_zcp is : %d\n", 2464 rte_ring_count(vpool->ring)); 2465 2466 for (index = 0; index < mbuf_count; index++) { 2467 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 2468 if (likely(mbuf != NULL)) { 2469 if (likely(RTE_MBUF_INDIRECT(mbuf))) 2470 pktmbuf_detach_zcp(mbuf); 2471 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 2472 } 2473 } 2474 2475 LOG_DEBUG(VHOST_CONFIG, 2476 "in mbuf_destroy_zcp: mbuf count in mempool after " 2477 "mbuf_destroy_zcp is: %d\n", 2478 rte_mempool_count(vpool->pool)); 2479 LOG_DEBUG(VHOST_CONFIG, 2480 "in mbuf_destroy_zcp: mbuf count in ring after " 2481 "mbuf_destroy_zcp is : %d\n", 2482 rte_ring_count(vpool->ring)); 2483 } 2484 2485 /* 2486 * This function update the use flag and counter. 2487 */ 2488 static inline uint32_t __attribute__((always_inline)) 2489 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 2490 uint32_t count) 2491 { 2492 struct vhost_virtqueue *vq; 2493 struct vring_desc *desc; 2494 struct rte_mbuf *buff; 2495 /* The virtio_hdr is initialised to 0. */ 2496 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 2497 = {{0, 0, 0, 0, 0, 0}, 0}; 2498 uint64_t buff_hdr_addr = 0; 2499 uint32_t head[MAX_PKT_BURST], packet_len = 0; 2500 uint32_t head_idx, packet_success = 0; 2501 uint16_t res_cur_idx; 2502 2503 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 2504 2505 if (count == 0) 2506 return 0; 2507 2508 vq = dev->virtqueue[VIRTIO_RXQ]; 2509 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 2510 2511 res_cur_idx = vq->last_used_idx; 2512 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 2513 dev->device_fh, res_cur_idx, res_cur_idx + count); 2514 2515 /* Retrieve all of the head indexes first to avoid caching issues. */ 2516 for (head_idx = 0; head_idx < count; head_idx++) 2517 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 2518 2519 /*Prefetch descriptor index. */ 2520 rte_prefetch0(&vq->desc[head[packet_success]]); 2521 2522 while (packet_success != count) { 2523 /* Get descriptor from available ring */ 2524 desc = &vq->desc[head[packet_success]]; 2525 2526 buff = pkts[packet_success]; 2527 LOG_DEBUG(VHOST_DATA, 2528 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 2529 "pkt[%d] descriptor idx: %d\n", 2530 dev->device_fh, packet_success, 2531 MBUF_HEADROOM_UINT32(buff)); 2532 2533 PRINT_PACKET(dev, 2534 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 2535 + RTE_PKTMBUF_HEADROOM), 2536 rte_pktmbuf_data_len(buff), 0); 2537 2538 /* Buffer address translation for virtio header. */ 2539 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 2540 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 2541 2542 /* 2543 * If the descriptors are chained the header and data are 2544 * placed in separate buffers. 
2545 */ 2546 if (desc->flags & VRING_DESC_F_NEXT) { 2547 desc->len = vq->vhost_hlen; 2548 desc = &vq->desc[desc->next]; 2549 desc->len = rte_pktmbuf_data_len(buff); 2550 } else { 2551 desc->len = packet_len; 2552 } 2553 2554 /* Update used ring with desc information */ 2555 vq->used->ring[res_cur_idx & (vq->size - 1)].id 2556 = head[packet_success]; 2557 vq->used->ring[res_cur_idx & (vq->size - 1)].len 2558 = packet_len; 2559 res_cur_idx++; 2560 packet_success++; 2561 2562 /* A header is required per buffer. */ 2563 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 2564 (const void *)&virtio_hdr, vq->vhost_hlen); 2565 2566 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 2567 2568 if (likely(packet_success < count)) { 2569 /* Prefetch descriptor index. */ 2570 rte_prefetch0(&vq->desc[head[packet_success]]); 2571 } 2572 } 2573 2574 rte_compiler_barrier(); 2575 2576 LOG_DEBUG(VHOST_DATA, 2577 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 2578 "vq.last_used_idx: %d, vq->used->idx: %d\n", 2579 dev->device_fh, vq->last_used_idx, vq->used->idx); 2580 2581 *(volatile uint16_t *)&vq->used->idx += count; 2582 vq->last_used_idx += count; 2583 2584 LOG_DEBUG(VHOST_DATA, 2585 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 2586 "vq.last_used_idx: %d, vq->used->idx: %d\n", 2587 dev->device_fh, vq->last_used_idx, vq->used->idx); 2588 2589 /* Kick the guest if necessary. */ 2590 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 2591 eventfd_write((int)vq->kickfd, 1); 2592 2593 return count; 2594 } 2595 2596 /* 2597 * This function routes the TX packet to the correct interface. 2598 * This may be a local device or the physical port. 2599 */ 2600 static inline void __attribute__((always_inline)) 2601 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 2602 uint32_t desc_idx, uint8_t need_copy) 2603 { 2604 struct mbuf_table *tx_q; 2605 struct rte_mbuf **m_table; 2606 struct rte_mbuf *mbuf = NULL; 2607 unsigned len, ret, offset = 0; 2608 struct vpool *vpool; 2609 struct virtio_net_data_ll *dev_ll = ll_root_used; 2610 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 2611 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 2612 2613 /*Add packet to the port tx queue*/ 2614 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q]; 2615 len = tx_q->len; 2616 2617 /* Allocate an mbuf and populate the structure. */ 2618 vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q]; 2619 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 2620 if (unlikely(mbuf == NULL)) { 2621 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 2622 RTE_LOG(ERR, VHOST_DATA, 2623 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 2624 dev->device_fh); 2625 put_desc_to_used_list_zcp(vq, desc_idx); 2626 return; 2627 } 2628 2629 if (vm2vm_mode == VM2VM_HARDWARE) { 2630 /* Avoid using a vlan tag from any vm for external pkt, such as 2631 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 2632 * selection, MAC address determines it as an external pkt 2633 * which should go to network, while vlan tag determine it as 2634 * a vm2vm pkt should forward to another vm. Hardware confuse 2635 * such a ambiguous situation, so pkt will lost. 2636 */ 2637 vlan_tag = external_pkt_default_vlan_tag; 2638 while (dev_ll != NULL) { 2639 if (likely(dev_ll->dev->ready == DEVICE_RX) && 2640 ether_addr_cmp(&(pkt_hdr->d_addr), 2641 &dev_ll->dev->mac_address)) { 2642 2643 /* 2644 * Drop the packet if the TX packet is destined 2645 * for the TX device. 
2646 */ 2647 if (unlikely(dev_ll->dev->device_fh 2648 == dev->device_fh)) { 2649 LOG_DEBUG(VHOST_DATA, 2650 "(%"PRIu64") TX: Source and destination" 2651 "MAC addresses are the same. Dropping " 2652 "packet.\n", 2653 dev_ll->dev->device_fh); 2654 MBUF_HEADROOM_UINT32(mbuf) 2655 = (uint32_t)desc_idx; 2656 __rte_mbuf_raw_free(mbuf); 2657 return; 2658 } 2659 2660 /* 2661 * Packet length offset 4 bytes for HW vlan 2662 * strip when L2 switch back. 2663 */ 2664 offset = 4; 2665 vlan_tag = 2666 (uint16_t) 2667 vlan_tags[(uint16_t)dev_ll->dev->device_fh]; 2668 2669 LOG_DEBUG(VHOST_DATA, 2670 "(%"PRIu64") TX: pkt to local VM device id:" 2671 "(%"PRIu64") vlan tag: %d.\n", 2672 dev->device_fh, dev_ll->dev->device_fh, 2673 vlan_tag); 2674 2675 break; 2676 } 2677 dev_ll = dev_ll->next; 2678 } 2679 } 2680 2681 mbuf->nb_segs = m->nb_segs; 2682 mbuf->next = m->next; 2683 mbuf->data_len = m->data_len + offset; 2684 mbuf->pkt_len = mbuf->data_len; 2685 if (unlikely(need_copy)) { 2686 /* Copy the packet contents to the mbuf. */ 2687 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 2688 rte_pktmbuf_mtod(m, void *), 2689 m->data_len); 2690 } else { 2691 mbuf->data_off = m->data_off; 2692 mbuf->buf_physaddr = m->buf_physaddr; 2693 mbuf->buf_addr = m->buf_addr; 2694 } 2695 mbuf->ol_flags = PKT_TX_VLAN_PKT; 2696 mbuf->vlan_tci = vlan_tag; 2697 mbuf->l2_len = sizeof(struct ether_hdr); 2698 mbuf->l3_len = sizeof(struct ipv4_hdr); 2699 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 2700 2701 tx_q->m_table[len] = mbuf; 2702 len++; 2703 2704 LOG_DEBUG(VHOST_DATA, 2705 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 2706 dev->device_fh, 2707 mbuf->nb_segs, 2708 (mbuf->next == NULL) ? "null" : "non-null"); 2709 2710 if (enable_stats) { 2711 dev_statistics[dev->device_fh].tx_total++; 2712 dev_statistics[dev->device_fh].tx++; 2713 } 2714 2715 if (unlikely(len == MAX_PKT_BURST)) { 2716 m_table = (struct rte_mbuf **)tx_q->m_table; 2717 ret = rte_eth_tx_burst(ports[0], 2718 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 2719 2720 /* 2721 * Free any buffers not handled by TX and update 2722 * the port stats. 2723 */ 2724 if (unlikely(ret < len)) { 2725 do { 2726 rte_pktmbuf_free(m_table[ret]); 2727 } while (++ret < len); 2728 } 2729 2730 len = 0; 2731 txmbuf_clean_zcp(dev, vpool); 2732 } 2733 2734 tx_q->len = len; 2735 2736 return; 2737 } 2738 2739 /* 2740 * This function TX all available packets in virtio TX queue for one 2741 * virtio-net device. If it is first packet, it learns MAC address and 2742 * setup VMDQ. 2743 */ 2744 static inline void __attribute__((always_inline)) 2745 virtio_dev_tx_zcp(struct virtio_net *dev) 2746 { 2747 struct rte_mbuf m; 2748 struct vhost_virtqueue *vq; 2749 struct vring_desc *desc; 2750 uint64_t buff_addr = 0, phys_addr; 2751 uint32_t head[MAX_PKT_BURST]; 2752 uint32_t i; 2753 uint16_t free_entries, packet_success = 0; 2754 uint16_t avail_idx; 2755 uint8_t need_copy = 0; 2756 hpa_type addr_type; 2757 2758 vq = dev->virtqueue[VIRTIO_TXQ]; 2759 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 2760 2761 /* If there are no available buffers then return. */ 2762 if (vq->last_used_idx_res == avail_idx) 2763 return; 2764 2765 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 2766 2767 /* Prefetch available ring to retrieve head indexes. 
*/ 2768 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 2769 2770 /* Get the number of free entries in the ring */ 2771 free_entries = (avail_idx - vq->last_used_idx_res); 2772 2773 /* Limit to MAX_PKT_BURST. */ 2774 free_entries 2775 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 2776 2777 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 2778 dev->device_fh, free_entries); 2779 2780 /* Retrieve all of the head indexes first to avoid caching issues. */ 2781 for (i = 0; i < free_entries; i++) 2782 head[i] 2783 = vq->avail->ring[(vq->last_used_idx_res + i) 2784 & (vq->size - 1)]; 2785 2786 vq->last_used_idx_res += free_entries; 2787 2788 /* Prefetch descriptor index. */ 2789 rte_prefetch0(&vq->desc[head[packet_success]]); 2790 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 2791 2792 while (packet_success < free_entries) { 2793 desc = &vq->desc[head[packet_success]]; 2794 2795 /* Discard first buffer as it is the virtio header */ 2796 desc = &vq->desc[desc->next]; 2797 2798 /* Buffer address translation. */ 2799 buff_addr = gpa_to_vva(dev, desc->addr); 2800 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type); 2801 2802 if (likely(packet_success < (free_entries - 1))) 2803 /* Prefetch descriptor index. */ 2804 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 2805 2806 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 2807 RTE_LOG(ERR, VHOST_DATA, 2808 "(%"PRIu64") Invalid frame buffer address found" 2809 "when TX packets!\n", 2810 dev->device_fh); 2811 packet_success++; 2812 continue; 2813 } 2814 2815 /* Prefetch buffer address. */ 2816 rte_prefetch0((void *)(uintptr_t)buff_addr); 2817 2818 /* 2819 * Setup dummy mbuf. This is copied to a real mbuf if 2820 * transmitted out the physical port. 2821 */ 2822 m.data_len = desc->len; 2823 m.nb_segs = 1; 2824 m.next = NULL; 2825 m.data_off = 0; 2826 m.buf_addr = (void *)(uintptr_t)buff_addr; 2827 m.buf_physaddr = phys_addr; 2828 2829 /* 2830 * Check if the frame buffer address from guest crosses 2831 * sub-region or not. 2832 */ 2833 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2834 RTE_LOG(ERR, VHOST_DATA, 2835 "(%"PRIu64") Frame buffer address cross " 2836 "sub-regioin found when attaching TX frame " 2837 "buffer address!\n", 2838 dev->device_fh); 2839 need_copy = 1; 2840 } else 2841 need_copy = 0; 2842 2843 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2844 2845 /* 2846 * If this is the first received packet we need to learn 2847 * the MAC and setup VMDQ 2848 */ 2849 if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) { 2850 if (dev->remove || (link_vmdq(dev, &m) == -1)) { 2851 /* 2852 * Discard frame if device is scheduled for 2853 * removal or a duplicate MAC address is found. 2854 */ 2855 packet_success += free_entries; 2856 vq->last_used_idx += packet_success; 2857 break; 2858 } 2859 } 2860 2861 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2862 packet_success++; 2863 } 2864 } 2865 2866 /* 2867 * This function is called by each data core. It handles all RX/TX registered 2868 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2869 * addresses are compared with all devices in the main linked list. 
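 * The TX drain interval is expressed in TSC cycles: drain_tsc below is
 * rte_get_tsc_hz() rounded up to whole cycles per microsecond and then
 * multiplied by BURST_TX_DRAIN_US, i.e. roughly BURST_TX_DRAIN_US
 * microseconds worth of timestamp-counter ticks between forced flushes of
 * a partially filled TX queue.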
2870 */ 2871 static int 2872 switch_worker_zcp(__attribute__((unused)) void *arg) 2873 { 2874 struct virtio_net *dev = NULL; 2875 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2876 struct virtio_net_data_ll *dev_ll; 2877 struct mbuf_table *tx_q; 2878 volatile struct lcore_ll_info *lcore_ll; 2879 const uint64_t drain_tsc 2880 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2881 * BURST_TX_DRAIN_US; 2882 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2883 unsigned ret; 2884 const uint16_t lcore_id = rte_lcore_id(); 2885 uint16_t count_in_ring, rx_count = 0; 2886 2887 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2888 2889 lcore_ll = lcore_info[lcore_id].lcore_ll; 2890 prev_tsc = 0; 2891 2892 while (1) { 2893 cur_tsc = rte_rdtsc(); 2894 2895 /* TX burst queue drain */ 2896 diff_tsc = cur_tsc - prev_tsc; 2897 if (unlikely(diff_tsc > drain_tsc)) { 2898 /* 2899 * Get mbuf from vpool.pool and detach mbuf and 2900 * put back into vpool.ring. 2901 */ 2902 dev_ll = lcore_ll->ll_root_used; 2903 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) { 2904 /* Get virtio device ID */ 2905 dev = dev_ll->dev; 2906 2907 if (likely(!dev->remove)) { 2908 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q]; 2909 if (tx_q->len) { 2910 LOG_DEBUG(VHOST_DATA, 2911 "TX queue drained after timeout" 2912 " with burst size %u\n", 2913 tx_q->len); 2914 2915 /* 2916 * Tx any packets in the queue 2917 */ 2918 ret = rte_eth_tx_burst( 2919 ports[0], 2920 (uint16_t)tx_q->txq_id, 2921 (struct rte_mbuf **) 2922 tx_q->m_table, 2923 (uint16_t)tx_q->len); 2924 if (unlikely(ret < tx_q->len)) { 2925 do { 2926 rte_pktmbuf_free( 2927 tx_q->m_table[ret]); 2928 } while (++ret < tx_q->len); 2929 } 2930 tx_q->len = 0; 2931 2932 txmbuf_clean_zcp(dev, 2933 &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]); 2934 } 2935 } 2936 dev_ll = dev_ll->next; 2937 } 2938 prev_tsc = cur_tsc; 2939 } 2940 2941 rte_prefetch0(lcore_ll->ll_root_used); 2942 2943 /* 2944 * Inform the configuration core that we have exited the linked 2945 * list and that no devices are in use if requested. 2946 */ 2947 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2948 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2949 2950 /* Process devices */ 2951 dev_ll = lcore_ll->ll_root_used; 2952 2953 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) { 2954 dev = dev_ll->dev; 2955 if (unlikely(dev->remove)) { 2956 dev_ll = dev_ll->next; 2957 unlink_vmdq(dev); 2958 dev->ready = DEVICE_SAFE_REMOVE; 2959 continue; 2960 } 2961 2962 if (likely(dev->ready == DEVICE_RX)) { 2963 uint32_t index = dev->vmdq_rx_q; 2964 uint16_t i; 2965 count_in_ring 2966 = rte_ring_count(vpool_array[index].ring); 2967 uint16_t free_entries 2968 = (uint16_t)get_available_ring_num_zcp(dev); 2969 2970 /* 2971 * Attach all mbufs in vpool.ring and put back 2972 * into vpool.pool. 
2973 */ 2974 for (i = 0; 2975 i < RTE_MIN(free_entries, 2976 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2977 i++) 2978 attach_rxmbuf_zcp(dev); 2979 2980 /* Handle guest RX */ 2981 rx_count = rte_eth_rx_burst(ports[0], 2982 (uint16_t)dev->vmdq_rx_q, pkts_burst, 2983 MAX_PKT_BURST); 2984 2985 if (rx_count) { 2986 ret_count = virtio_dev_rx_zcp(dev, 2987 pkts_burst, rx_count); 2988 if (enable_stats) { 2989 dev_statistics[dev->device_fh].rx_total 2990 += rx_count; 2991 dev_statistics[dev->device_fh].rx 2992 += ret_count; 2993 } 2994 while (likely(rx_count)) { 2995 rx_count--; 2996 pktmbuf_detach_zcp( 2997 pkts_burst[rx_count]); 2998 rte_ring_sp_enqueue( 2999 vpool_array[index].ring, 3000 (void *)pkts_burst[rx_count]); 3001 } 3002 } 3003 } 3004 3005 if (likely(!dev->remove)) 3006 /* Handle guest TX */ 3007 virtio_dev_tx_zcp(dev); 3008 3009 /* Move to the next device in the list */ 3010 dev_ll = dev_ll->next; 3011 } 3012 } 3013 3014 return 0; 3015 } 3016 3017 3018 /* 3019 * Add an entry to a used linked list. A free entry must first be found 3020 * in the free linked list using get_data_ll_free_entry(); 3021 */ 3022 static void 3023 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 3024 struct virtio_net_data_ll *ll_dev) 3025 { 3026 struct virtio_net_data_ll *ll = *ll_root_addr; 3027 3028 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 3029 ll_dev->next = NULL; 3030 rte_compiler_barrier(); 3031 3032 /* If ll == NULL then this is the first device. */ 3033 if (ll) { 3034 /* Increment to the tail of the linked list. */ 3035 while ((ll->next != NULL) ) 3036 ll = ll->next; 3037 3038 ll->next = ll_dev; 3039 } else { 3040 *ll_root_addr = ll_dev; 3041 } 3042 } 3043 3044 /* 3045 * Remove an entry from a used linked list. The entry must then be added to 3046 * the free linked list using put_data_ll_free_entry(). 3047 */ 3048 static void 3049 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 3050 struct virtio_net_data_ll *ll_dev, 3051 struct virtio_net_data_ll *ll_dev_last) 3052 { 3053 struct virtio_net_data_ll *ll = *ll_root_addr; 3054 3055 if (unlikely((ll == NULL) || (ll_dev == NULL))) 3056 return; 3057 3058 if (ll_dev == ll) 3059 *ll_root_addr = ll_dev->next; 3060 else 3061 if (likely(ll_dev_last != NULL)) 3062 ll_dev_last->next = ll_dev->next; 3063 else 3064 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 3065 } 3066 3067 /* 3068 * Find and return an entry from the free linked list. 3069 */ 3070 static struct virtio_net_data_ll * 3071 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 3072 { 3073 struct virtio_net_data_ll *ll_free = *ll_root_addr; 3074 struct virtio_net_data_ll *ll_dev; 3075 3076 if (ll_free == NULL) 3077 return NULL; 3078 3079 ll_dev = ll_free; 3080 *ll_root_addr = ll_free->next; 3081 3082 return ll_dev; 3083 } 3084 3085 /* 3086 * Place an entry back on to the free linked list. 3087 */ 3088 static void 3089 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 3090 struct virtio_net_data_ll *ll_dev) 3091 { 3092 struct virtio_net_data_ll *ll_free = *ll_root_addr; 3093 3094 if (ll_dev == NULL) 3095 return; 3096 3097 ll_dev->next = ll_free; 3098 *ll_root_addr = ll_dev; 3099 } 3100 3101 /* 3102 * Creates a linked list of a given size. 3103 */ 3104 static struct virtio_net_data_ll * 3105 alloc_data_ll(uint32_t size) 3106 { 3107 struct virtio_net_data_ll *ll_new; 3108 uint32_t i; 3109 3110 /* Malloc and then chain the linked list. 
*/ 3111 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 3112 if (ll_new == NULL) { 3113 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 3114 return NULL; 3115 } 3116 3117 for (i = 0; i < size - 1; i++) { 3118 ll_new[i].dev = NULL; 3119 ll_new[i].next = &ll_new[i+1]; 3120 } 3121 ll_new[i].next = NULL; 3122 3123 return (ll_new); 3124 } 3125 3126 /* 3127 * Create the main linked list along with each individual cores linked list. A used and a free list 3128 * are created to manage entries. 3129 */ 3130 static int 3131 init_data_ll (void) 3132 { 3133 int lcore; 3134 3135 RTE_LCORE_FOREACH_SLAVE(lcore) { 3136 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 3137 if (lcore_info[lcore].lcore_ll == NULL) { 3138 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 3139 return -1; 3140 } 3141 3142 lcore_info[lcore].lcore_ll->device_num = 0; 3143 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 3144 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 3145 if (num_devices % num_switching_cores) 3146 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 3147 else 3148 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 3149 } 3150 3151 /* Allocate devices up to a maximum of MAX_DEVICES. */ 3152 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 3153 3154 return 0; 3155 } 3156 3157 /* 3158 * Set virtqueue flags so that we do not receive interrupts. 3159 */ 3160 static void 3161 set_irq_status (struct virtio_net *dev) 3162 { 3163 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 3164 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 3165 } 3166 3167 /* 3168 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 3169 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 3170 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 3171 */ 3172 static void 3173 destroy_device (volatile struct virtio_net *dev) 3174 { 3175 struct virtio_net_data_ll *ll_lcore_dev_cur; 3176 struct virtio_net_data_ll *ll_main_dev_cur; 3177 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 3178 struct virtio_net_data_ll *ll_main_dev_last = NULL; 3179 int lcore; 3180 3181 dev->flags &= ~VIRTIO_DEV_RUNNING; 3182 3183 /*set the remove flag. */ 3184 dev->remove = 1; 3185 3186 while(dev->ready != DEVICE_SAFE_REMOVE) { 3187 rte_pause(); 3188 } 3189 3190 /* Search for entry to be removed from lcore ll */ 3191 ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used; 3192 while (ll_lcore_dev_cur != NULL) { 3193 if (ll_lcore_dev_cur->dev == dev) { 3194 break; 3195 } else { 3196 ll_lcore_dev_last = ll_lcore_dev_cur; 3197 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 3198 } 3199 } 3200 3201 if (ll_lcore_dev_cur == NULL) { 3202 RTE_LOG(ERR, VHOST_CONFIG, 3203 "(%"PRIu64") Failed to find the dev to be destroy.\n", 3204 dev->device_fh); 3205 return; 3206 } 3207 3208 /* Search for entry to be removed from main ll */ 3209 ll_main_dev_cur = ll_root_used; 3210 ll_main_dev_last = NULL; 3211 while (ll_main_dev_cur != NULL) { 3212 if (ll_main_dev_cur->dev == dev) { 3213 break; 3214 } else { 3215 ll_main_dev_last = ll_main_dev_cur; 3216 ll_main_dev_cur = ll_main_dev_cur->next; 3217 } 3218 } 3219 3220 /* Remove entries from the lcore and main ll. 
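 * The removal itself is done without locks: each data core is asked to
 * acknowledge through its dev_removal_flag (REQUEST_DEV_REMOVAL followed by
 * ACK_DEV_REMOVAL) before the entries are returned to the free lists, which
 * guarantees that no core still holds a pointer into the removed entries.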
*/ 3221 rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 3222 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 3223 3224 /* Set the dev_removal_flag on each lcore. */ 3225 RTE_LCORE_FOREACH_SLAVE(lcore) { 3226 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 3227 } 3228 3229 /* 3230 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 3231 * they can no longer access the device removed from the linked lists and that the devices 3232 * are no longer in use. 3233 */ 3234 RTE_LCORE_FOREACH_SLAVE(lcore) { 3235 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 3236 rte_pause(); 3237 } 3238 } 3239 3240 /* Add the entries back to the lcore and main free ll.*/ 3241 put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 3242 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 3243 3244 /* Decrement number of device on the lcore. */ 3245 lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--; 3246 3247 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 3248 3249 if (zero_copy) { 3250 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q]; 3251 3252 /* Stop the RX queue. */ 3253 if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) { 3254 LOG_DEBUG(VHOST_CONFIG, 3255 "(%"PRIu64") In destroy_device: Failed to stop " 3256 "rx queue:%d\n", 3257 dev->device_fh, 3258 dev->vmdq_rx_q); 3259 } 3260 3261 LOG_DEBUG(VHOST_CONFIG, 3262 "(%"PRIu64") in destroy_device: Start put mbuf in " 3263 "mempool back to ring for RX queue: %d\n", 3264 dev->device_fh, dev->vmdq_rx_q); 3265 3266 mbuf_destroy_zcp(vpool); 3267 3268 /* Stop the TX queue. */ 3269 if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) { 3270 LOG_DEBUG(VHOST_CONFIG, 3271 "(%"PRIu64") In destroy_device: Failed to " 3272 "stop tx queue:%d\n", 3273 dev->device_fh, dev->vmdq_rx_q); 3274 } 3275 3276 vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES]; 3277 3278 LOG_DEBUG(VHOST_CONFIG, 3279 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 3280 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 3281 dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES), 3282 dev->device_fh); 3283 3284 mbuf_destroy_zcp(vpool); 3285 } 3286 3287 } 3288 3289 /* 3290 * A new device is added to a data core. First the device is added to the main linked list 3291 * and the allocated to a specific data core. 3292 */ 3293 static int 3294 new_device (struct virtio_net *dev) 3295 { 3296 struct virtio_net_data_ll *ll_dev; 3297 int lcore, core_add = 0; 3298 uint32_t device_num_min = num_devices; 3299 3300 /* Add device to main ll */ 3301 ll_dev = get_data_ll_free_entry(&ll_root_free); 3302 if (ll_dev == NULL) { 3303 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. 
Device limit " 3304 "of %d devices per core has been reached\n", 3305 dev->device_fh, num_devices); 3306 return -1; 3307 } 3308 ll_dev->dev = dev; 3309 add_data_ll_entry(&ll_root_used, ll_dev); 3310 ll_dev->dev->vmdq_rx_q 3311 = ll_dev->dev->device_fh * (num_queues / num_devices); 3312 3313 if (zero_copy) { 3314 uint32_t index = ll_dev->dev->vmdq_rx_q; 3315 uint32_t count_in_ring, i; 3316 struct mbuf_table *tx_q; 3317 3318 count_in_ring = rte_ring_count(vpool_array[index].ring); 3319 3320 LOG_DEBUG(VHOST_CONFIG, 3321 "(%"PRIu64") in new_device: mbuf count in mempool " 3322 "before attach is: %d\n", 3323 dev->device_fh, 3324 rte_mempool_count(vpool_array[index].pool)); 3325 LOG_DEBUG(VHOST_CONFIG, 3326 "(%"PRIu64") in new_device: mbuf count in ring " 3327 "before attach is : %d\n", 3328 dev->device_fh, count_in_ring); 3329 3330 /* 3331 * Attach all mbufs in vpool.ring and put back intovpool.pool. 3332 */ 3333 for (i = 0; i < count_in_ring; i++) 3334 attach_rxmbuf_zcp(dev); 3335 3336 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 3337 "mempool after attach is: %d\n", 3338 dev->device_fh, 3339 rte_mempool_count(vpool_array[index].pool)); 3340 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 3341 "ring after attach is : %d\n", 3342 dev->device_fh, 3343 rte_ring_count(vpool_array[index].ring)); 3344 3345 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q]; 3346 tx_q->txq_id = dev->vmdq_rx_q; 3347 3348 if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) { 3349 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q]; 3350 3351 LOG_DEBUG(VHOST_CONFIG, 3352 "(%"PRIu64") In new_device: Failed to start " 3353 "tx queue:%d\n", 3354 dev->device_fh, dev->vmdq_rx_q); 3355 3356 mbuf_destroy_zcp(vpool); 3357 return -1; 3358 } 3359 3360 if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) { 3361 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q]; 3362 3363 LOG_DEBUG(VHOST_CONFIG, 3364 "(%"PRIu64") In new_device: Failed to start " 3365 "rx queue:%d\n", 3366 dev->device_fh, dev->vmdq_rx_q); 3367 3368 /* Stop the TX queue. */ 3369 if (rte_eth_dev_tx_queue_stop(ports[0], 3370 dev->vmdq_rx_q) != 0) { 3371 LOG_DEBUG(VHOST_CONFIG, 3372 "(%"PRIu64") In new_device: Failed to " 3373 "stop tx queue:%d\n", 3374 dev->device_fh, dev->vmdq_rx_q); 3375 } 3376 3377 mbuf_destroy_zcp(vpool); 3378 return -1; 3379 } 3380 3381 } 3382 3383 /*reset ready flag*/ 3384 dev->ready = DEVICE_MAC_LEARNING; 3385 dev->remove = 0; 3386 3387 /* Find a suitable lcore to add the device. */ 3388 RTE_LCORE_FOREACH_SLAVE(lcore) { 3389 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 3390 device_num_min = lcore_info[lcore].lcore_ll->device_num; 3391 core_add = lcore; 3392 } 3393 } 3394 /* Add device to lcore ll */ 3395 ll_dev->dev->coreid = core_add; 3396 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free); 3397 if (ll_dev == NULL) { 3398 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 3399 dev->ready = DEVICE_SAFE_REMOVE; 3400 destroy_device(dev); 3401 return -1; 3402 } 3403 ll_dev->dev = dev; 3404 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev); 3405 3406 /* Initialize device stats */ 3407 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 3408 3409 /* Disable notifications. 
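 * Setting VRING_USED_F_NO_NOTIFY on both used rings tells the guest that it
 * does not need to kick the host for every avail-ring update; the data
 * cores poll the rings instead (see switch_worker()/switch_worker_zcp()).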
*/ 3410 set_irq_status(dev); 3411 lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++; 3412 dev->flags |= VIRTIO_DEV_RUNNING; 3413 3414 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid); 3415 3416 return 0; 3417 } 3418 3419 /* 3420 * These callback allow devices to be added to the data core when configuration 3421 * has been fully complete. 3422 */ 3423 static const struct virtio_net_device_ops virtio_net_device_ops = 3424 { 3425 .new_device = new_device, 3426 .destroy_device = destroy_device, 3427 }; 3428 3429 /* 3430 * This is a thread will wake up after a period to print stats if the user has 3431 * enabled them. 3432 */ 3433 static void 3434 print_stats(void) 3435 { 3436 struct virtio_net_data_ll *dev_ll; 3437 uint64_t tx_dropped, rx_dropped; 3438 uint64_t tx, tx_total, rx, rx_total; 3439 uint32_t device_fh; 3440 const char clr[] = { 27, '[', '2', 'J', '\0' }; 3441 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 3442 3443 while(1) { 3444 sleep(enable_stats); 3445 3446 /* Clear screen and move to top left */ 3447 printf("%s%s", clr, top_left); 3448 3449 printf("\nDevice statistics ===================================="); 3450 3451 dev_ll = ll_root_used; 3452 while (dev_ll != NULL) { 3453 device_fh = (uint32_t)dev_ll->dev->device_fh; 3454 tx_total = dev_statistics[device_fh].tx_total; 3455 tx = dev_statistics[device_fh].tx; 3456 tx_dropped = tx_total - tx; 3457 if (zero_copy == 0) { 3458 rx_total = rte_atomic64_read( 3459 &dev_statistics[device_fh].rx_total_atomic); 3460 rx = rte_atomic64_read( 3461 &dev_statistics[device_fh].rx_atomic); 3462 } else { 3463 rx_total = dev_statistics[device_fh].rx_total; 3464 rx = dev_statistics[device_fh].rx; 3465 } 3466 rx_dropped = rx_total - rx; 3467 3468 printf("\nStatistics for device %"PRIu32" ------------------------------" 3469 "\nTX total: %"PRIu64"" 3470 "\nTX dropped: %"PRIu64"" 3471 "\nTX successful: %"PRIu64"" 3472 "\nRX total: %"PRIu64"" 3473 "\nRX dropped: %"PRIu64"" 3474 "\nRX successful: %"PRIu64"", 3475 device_fh, 3476 tx_total, 3477 tx_dropped, 3478 tx, 3479 rx_total, 3480 rx_dropped, 3481 rx); 3482 3483 dev_ll = dev_ll->next; 3484 } 3485 printf("\n======================================================\n"); 3486 } 3487 } 3488 3489 static void 3490 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 3491 char *ring_name, uint32_t nb_mbuf) 3492 { 3493 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 3494 vpool_array[index].pool 3495 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 3496 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 3497 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 3498 rte_pktmbuf_init, NULL, socket, 0); 3499 if (vpool_array[index].pool != NULL) { 3500 vpool_array[index].ring 3501 = rte_ring_create(ring_name, 3502 rte_align32pow2(nb_mbuf + 1), 3503 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 3504 if (likely(vpool_array[index].ring != NULL)) { 3505 LOG_DEBUG(VHOST_CONFIG, 3506 "in setup_mempool_tbl: mbuf count in " 3507 "mempool is: %d\n", 3508 rte_mempool_count(vpool_array[index].pool)); 3509 LOG_DEBUG(VHOST_CONFIG, 3510 "in setup_mempool_tbl: mbuf count in " 3511 "ring is: %d\n", 3512 rte_ring_count(vpool_array[index].ring)); 3513 } else { 3514 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 3515 ring_name); 3516 } 3517 3518 /* Need consider head room. 
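 * roomsize above is VIRTIO_DESCRIPTOR_LEN_ZCP plus RTE_PKTMBUF_HEADROOM, so
 * after subtracting the headroom the usable buffer size recorded in
 * vpool_array[index].buf_size is exactly VIRTIO_DESCRIPTOR_LEN_ZCP bytes,
 * which attach_rxmbuf_zcp() later checks against the guest descriptor
 * length.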
*/ 3519 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 3520 } else { 3521 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 3522 } 3523 } 3524 3525 3526 /* 3527 * Main function, does initialisation and calls the per-lcore functions. The CUSE 3528 * device is also registered here to handle the IOCTLs. 3529 */ 3530 int 3531 MAIN(int argc, char *argv[]) 3532 { 3533 struct rte_mempool *mbuf_pool = NULL; 3534 unsigned lcore_id, core_id = 0; 3535 unsigned nb_ports, valid_num_ports; 3536 int ret; 3537 uint8_t portid, queue_id = 0; 3538 static pthread_t tid; 3539 3540 /* init EAL */ 3541 ret = rte_eal_init(argc, argv); 3542 if (ret < 0) 3543 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 3544 argc -= ret; 3545 argv += ret; 3546 3547 /* parse app arguments */ 3548 ret = us_vhost_parse_args(argc, argv); 3549 if (ret < 0) 3550 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 3551 3552 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 3553 if (rte_lcore_is_enabled(lcore_id)) 3554 lcore_ids[core_id ++] = lcore_id; 3555 3556 if (rte_lcore_count() > RTE_MAX_LCORE) 3557 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 3558 3559 /*set the number of swithcing cores available*/ 3560 num_switching_cores = rte_lcore_count()-1; 3561 3562 /* Get the number of physical ports. */ 3563 nb_ports = rte_eth_dev_count(); 3564 if (nb_ports > RTE_MAX_ETHPORTS) 3565 nb_ports = RTE_MAX_ETHPORTS; 3566 3567 /* 3568 * Update the global var NUM_PORTS and global array PORTS 3569 * and get value of var VALID_NUM_PORTS according to system ports number 3570 */ 3571 valid_num_ports = check_ports_num(nb_ports); 3572 3573 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 3574 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 3575 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 3576 return -1; 3577 } 3578 3579 if (zero_copy == 0) { 3580 /* Create the mbuf pool. */ 3581 mbuf_pool = rte_mempool_create( 3582 "MBUF_POOL", 3583 NUM_MBUFS_PER_PORT 3584 * valid_num_ports, 3585 MBUF_SIZE, MBUF_CACHE_SIZE, 3586 sizeof(struct rte_pktmbuf_pool_private), 3587 rte_pktmbuf_pool_init, NULL, 3588 rte_pktmbuf_init, NULL, 3589 rte_socket_id(), 0); 3590 if (mbuf_pool == NULL) 3591 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 3592 3593 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 3594 vpool_array[queue_id].pool = mbuf_pool; 3595 3596 if (vm2vm_mode == VM2VM_HARDWARE) { 3597 /* Enable VT loop back to let L2 switch to do it. 
*/ 3598 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3599 LOG_DEBUG(VHOST_CONFIG, 3600 "Enable loop back for L2 switch in vmdq.\n"); 3601 } 3602 } else { 3603 uint32_t nb_mbuf; 3604 char pool_name[RTE_MEMPOOL_NAMESIZE]; 3605 char ring_name[RTE_MEMPOOL_NAMESIZE]; 3606 3607 rx_conf_default.start_rx_per_q = (uint8_t)zero_copy; 3608 rx_conf_default.rx_drop_en = 0; 3609 tx_conf_default.start_tx_per_q = (uint8_t)zero_copy; 3610 nb_mbuf = num_rx_descriptor 3611 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3612 + num_switching_cores * MAX_PKT_BURST; 3613 3614 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3615 snprintf(pool_name, sizeof(pool_name), 3616 "rxmbuf_pool_%u", queue_id); 3617 snprintf(ring_name, sizeof(ring_name), 3618 "rxmbuf_ring_%u", queue_id); 3619 setup_mempool_tbl(rte_socket_id(), queue_id, 3620 pool_name, ring_name, nb_mbuf); 3621 } 3622 3623 nb_mbuf = num_tx_descriptor 3624 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3625 + num_switching_cores * MAX_PKT_BURST; 3626 3627 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3628 snprintf(pool_name, sizeof(pool_name), 3629 "txmbuf_pool_%u", queue_id); 3630 snprintf(ring_name, sizeof(ring_name), 3631 "txmbuf_ring_%u", queue_id); 3632 setup_mempool_tbl(rte_socket_id(), 3633 (queue_id + MAX_QUEUES), 3634 pool_name, ring_name, nb_mbuf); 3635 } 3636 3637 if (vm2vm_mode == VM2VM_HARDWARE) { 3638 /* Enable VT loop back to let L2 switch to do it. */ 3639 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3640 LOG_DEBUG(VHOST_CONFIG, 3641 "Enable loop back for L2 switch in vmdq.\n"); 3642 } 3643 } 3644 /* Set log level. */ 3645 rte_set_log_level(LOG_LEVEL); 3646 3647 /* initialize all ports */ 3648 for (portid = 0; portid < nb_ports; portid++) { 3649 /* skip ports that are not enabled */ 3650 if ((enabled_port_mask & (1 << portid)) == 0) { 3651 RTE_LOG(INFO, VHOST_PORT, 3652 "Skipping disabled port %d\n", portid); 3653 continue; 3654 } 3655 if (port_init(portid) != 0) 3656 rte_exit(EXIT_FAILURE, 3657 "Cannot initialize network ports\n"); 3658 } 3659 3660 /* Initialise all linked lists. */ 3661 if (init_data_ll() == -1) 3662 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3663 3664 /* Initialize device stats */ 3665 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3666 3667 /* Enable stats if the user option is set. */ 3668 if (enable_stats) 3669 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3670 3671 /* Launch all data cores. */ 3672 if (zero_copy == 0) { 3673 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3674 rte_eal_remote_launch(switch_worker, 3675 mbuf_pool, lcore_id); 3676 } 3677 } else { 3678 uint32_t count_in_mempool, index, i; 3679 for (index = 0; index < 2*MAX_QUEUES; index++) { 3680 /* For all RX and TX queues. */ 3681 count_in_mempool 3682 = rte_mempool_count(vpool_array[index].pool); 3683 3684 /* 3685 * Transfer all un-attached mbufs from vpool.pool 3686 * to vpoo.ring. 
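 * At start-up every mbuf is pulled out of the mempool once with
 * __rte_mbuf_raw_alloc() and parked on the corresponding ring, so that
 * attach_rxmbuf_zcp() can later dequeue un-attached mbufs on the data path
 * without touching the mempool allocator.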
3687 */ 3688 for (i = 0; i < count_in_mempool; i++) { 3689 struct rte_mbuf *mbuf 3690 = __rte_mbuf_raw_alloc( 3691 vpool_array[index].pool); 3692 rte_ring_sp_enqueue(vpool_array[index].ring, 3693 (void *)mbuf); 3694 } 3695 3696 LOG_DEBUG(VHOST_CONFIG, 3697 "in MAIN: mbuf count in mempool at initial " 3698 "is: %d\n", count_in_mempool); 3699 LOG_DEBUG(VHOST_CONFIG, 3700 "in MAIN: mbuf count in ring at initial is :" 3701 " %d\n", 3702 rte_ring_count(vpool_array[index].ring)); 3703 } 3704 3705 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3706 rte_eal_remote_launch(switch_worker_zcp, NULL, 3707 lcore_id); 3708 } 3709 3710 /* Register CUSE device to handle IOCTLs. */ 3711 ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks()); 3712 if (ret != 0) 3713 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3714 3715 init_virtio_net(&virtio_net_device_ops); 3716 3717 /* Start CUSE session. */ 3718 start_cuse_session_loop(); 3719 return 0; 3720 3721 } 3722 3723