/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "main.h"
#include "virtio-net.h"
#include "vhost-net-cdev.h"

#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
                            (num_switching_cores*MAX_PKT_BURST) + \
                            (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
                            (num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
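/*
 * Illustrative sizing only (not from the original source): with the defaults
 * below (MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024,
 * RTE_TEST_TX_DESC_DEFAULT = 512, MAX_PKT_BURST = 32, MBUF_CACHE_SIZE = 128)
 * and, say, two switching cores, NUM_MBUFS_PER_PORT evaluates to
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs per port.
 */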
/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
                       + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32      /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16  /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4  /* Number of retries on RX. */

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * Need to refine these 2 macros for the legacy and DPDK based front ends:
 * max vring avail descriptors/entries from guest - MAX_PKT_BURST,
 * then adjust to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors,
 * half for the virtio header, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                                    + sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the guest
 * buffers referenced by the hardware descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
    struct rte_mempool *pool;
    struct rte_ring *ring;
    uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
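/*
 * Note (added for clarity): each vpool pairs a mempool of mbuf shells with a
 * ring of currently unattached mbufs and records the usable buffer size. The
 * array is sized MAX_QUEUES + MAX_QUEUES, presumably so the zero copy RX and
 * TX paths can each have their own entry per VMDQ queue.
 */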
/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
typedef enum {
    VM2VM_DISABLED = 0,
    VM2VM_SOFTWARE = 1,
    VM2VM_HARDWARE = 2,
    VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
    PHYS_ADDR_CONTINUOUS = 0,
    PHYS_ADDR_CROSS_SUBREG = 1,
    PHYS_ADDR_INVALID = 2,
    PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
    .rx_thresh = {
        .pthresh = RX_PTHRESH,
        .hthresh = RX_HTHRESH,
        .wthresh = RX_WTHRESH,
    },
    .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
    .tx_thresh = {
        .pthresh = TX_PTHRESH,
        .hthresh = TX_HTHRESH,
        .wthresh = TX_WTHRESH,
    },
    .tx_free_thresh = 0, /* Use PMD default values */
    .tx_rs_thresh = 0,   /* Use PMD default values */
};
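/*
 * Note (added for clarity): rx_conf_default and tx_conf_default above are
 * passed unchanged to every rte_eth_rx_queue_setup()/rte_eth_tx_queue_setup()
 * call in port_init() below; rx_drop_en = 1 asks the NIC to drop incoming
 * packets when no RX descriptors are available instead of stalling the queue.
 */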
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * VLAN strip is necessary for 1G NICs such as the I350; it fixes a
         * bug where IPv4 forwarding in the guest could not forward packets
         * from one virtio device to another.
         */
        .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
    },

    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .rx_adv_conf = {
        /*
         * should be overridden separately in code with
         * appropriate values
         */
        .vmdq_rx_conf = {
            .nb_queue_pools = ETH_8_POOLS,
            .enable_default_pool = 0,
            .default_pool = 0,
            .nb_pool_maps = 0,
            .pool_map = {{0, 0},},
        },
    },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
    1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
    1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
    1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
    1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
    1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
    1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
    unsigned len;
    unsigned txq_id;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
    unsigned char h_dest[ETH_ALEN];
    unsigned char h_source[ETH_ALEN];
    __be16 h_vlan_proto;
    __be16 h_vlan_TCI;
    __be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
    uint8_t  version_ihl;     /**< version and header length */
    uint8_t  type_of_service; /**< type of service */
    uint16_t total_length;    /**< length of packet */
    uint16_t packet_id;       /**< packet ID */
    uint16_t fragment_offset; /**< fragmentation offset */
    uint8_t  time_to_live;    /**< time to live */
    uint8_t  next_proto_id;   /**< protocol ID */
    uint16_t hdr_checksum;    /**< header checksum */
    uint32_t src_addr;        /**< source address */
    uint32_t dst_addr;        /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
    uint64_t tx_total;
    rte_atomic64_t rx_total_atomic;
    uint64_t rx_total;
    uint64_t tx;
    rte_atomic64_t rx_atomic;
    uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
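/*
 * Note (added for clarity): the RX counters are atomics because a device's RX
 * virtqueue can be filled both by the core polling the physical port and by
 * other cores forwarding VM2VM traffic via virtio_tx_local(); the TX counters
 * are only ever updated by the core that owns the transmitting device.
 */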
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back =
        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
           sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
    return 0;
}
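/*
 * Note (added for clarity): with the mapping above, VMDQ pool i accepts frames
 * tagged with vlan_tags[i] (i.e. VLAN 1000 + i), so virtio device i, whose
 * vlan_tag is assigned from the same table in link_vmdq(), receives exactly
 * the traffic carrying its own tag.
 */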
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
    if (num_devices > max_nb_devices) {
        RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
        return -1;
    }
    return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
    struct rte_eth_dev_info dev_info;
    struct rte_eth_conf port_conf;
    uint16_t rx_rings, tx_rings;
    uint16_t rx_ring_size, tx_ring_size;
    int retval;
    uint16_t q;

    /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
    rte_eth_dev_info_get(port, &dev_info);

    /* configure the number of supported virtio devices based on VMDQ limits */
    num_devices = dev_info.max_vmdq_pools;
    num_queues = dev_info.max_rx_queues;

    if (zero_copy) {
        rx_ring_size = num_rx_descriptor;
        tx_ring_size = num_tx_descriptor;
        tx_rings = dev_info.max_tx_queues;
    } else {
        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();
    }

    retval = validate_num_devices(MAX_DEVICES);
    if (retval < 0)
        return retval;

    /* Get port configuration. */
    retval = get_eth_conf(&port_conf, num_devices);
    if (retval < 0)
        return retval;

    if (port >= rte_eth_dev_count())
        return -1;

    rx_rings = (uint16_t)num_queues;

    /* Configure ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    /* Setup the queues. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                        rte_eth_dev_socket_id(port), &rx_conf_default,
                        vpool_array[q].pool);
        if (retval < 0)
            return retval;
    }
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                        rte_eth_dev_socket_id(port), &tx_conf_default);
        if (retval < 0)
            return retval;
    }

    /* Start the device. */
    retval = rte_eth_dev_start(port);
    if (retval < 0) {
        RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
        return retval;
    }

    rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
    RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
    RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
        (unsigned)port,
        vmdq_ports_eth_addr[port].addr_bytes[0],
        vmdq_ports_eth_addr[port].addr_bytes[1],
        vmdq_ports_eth_addr[port].addr_bytes[2],
        vmdq_ports_eth_addr[port].addr_bytes[3],
        vmdq_ports_eth_addr[port].addr_bytes[4],
        vmdq_ports_eth_addr[port].addr_bytes[5]);

    return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
    /* parse and store the basename string */
    if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
        return -1;
    else
        snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

    return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
    char *end = NULL;
    unsigned long pm;

    errno = 0;

    /* parse hexadecimal string */
    pm = strtoul(portmask, &end, 16);
    if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (pm == 0)
        return -1;

    return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
    char *end = NULL;
    unsigned long num;

    errno = 0;

    /* parse unsigned int string */
    num = strtoul(q_arg, &end, 10);
    if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (num > max_valid_value)
        return -1;

    return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
    RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "  --vm2vm [0|1|2]\n"
        "  --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "  --dev-basename <name> --dev-index [0-N]\n"
        "  --nb-devices ND\n"
        "  -p PORTMASK: Set mask for ports to be used by application\n"
        "  --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "  --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
        "  --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
        "  --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
        "  --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "  --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "  --dev-basename: The basename to be used for the character device.\n"
        "  --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
        "  --zero-copy [0|1]: disable(default)/enable rx/tx "
        "zero copy\n"
        "  --rx-desc-num [0-N]: the number of descriptors on rx, "
        "used only when zero copy is enabled.\n"
        "  --tx-desc-num [0-N]: the number of descriptors on tx, "
        "used only when zero copy is enabled.\n",
        prgname);
}
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
    int opt, ret;
    int option_index;
    unsigned i;
    const char *prgname = argv[0];
    static struct option long_option[] = {
        {"vm2vm", required_argument, NULL, 0},
        {"rx-retry", required_argument, NULL, 0},
        {"rx-retry-delay", required_argument, NULL, 0},
        {"rx-retry-num", required_argument, NULL, 0},
        {"mergeable", required_argument, NULL, 0},
        {"stats", required_argument, NULL, 0},
        {"dev-basename", required_argument, NULL, 0},
        {"dev-index", required_argument, NULL, 0},
        {"zero-copy", required_argument, NULL, 0},
        {"rx-desc-num", required_argument, NULL, 0},
        {"tx-desc-num", required_argument, NULL, 0},
        {NULL, 0, 0, 0},
    };

    /* Parse command line */
    while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
        switch (opt) {
        /* Portmask */
        case 'p':
            enabled_port_mask = parse_portmask(optarg);
            if (enabled_port_mask == 0) {
                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                us_vhost_usage(prgname);
                return -1;
            }
            break;

        case 0:
            /* Enable/disable vm2vm comms. */
            if (!strncmp(long_option[option_index].name, "vm2vm",
                MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for "
                        "vm2vm [0|1|2]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    vm2vm_mode = (vm2vm_type)ret;
                }
            }

            /* Enable/disable retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_retry = ret;
                }
            }

            /* Specify the retry delay time (in microseconds) on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_delay_time = ret;
                }
            }

            /* Specify the number of retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_retry_num = ret;
                }
            }

            /* Enable/disable RX mergeable buffers. */
            if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    if (ret)
                        VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
                }
            }

            /* Enable/disable stats. */
            if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_stats = ret;
                }
            }
            /* Set character device basename. */
            if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                if (us_vhost_parse_basename(optarg) == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                    us_vhost_usage(prgname);
                    return -1;
                }
            }

            /* Set character device index. */
            if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    dev_index = ret;
            }

            /* Enable/disable rx/tx zero copy. */
            if (!strncmp(long_option[option_index].name,
                "zero-copy", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument"
                        " for zero-copy [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    zero_copy = ret;

                if (zero_copy) {
#ifdef RTE_MBUF_SCATTER_GATHER
                    RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                        "the zero copy vhost app, please "
                        "disable RTE_MBUF_SCATTER_GATHER\n"
                        "in the config file and then rebuild the DPDK "
                        "core lib!\n"
                        "Otherwise please disable the zero copy "
                        "flag in the command line!\n");
                    return -1;
#endif
                }
            }

            /* Specify the descriptor number on RX. */
            if (!strncmp(long_option[option_index].name,
                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for rx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_rx_descriptor = ret;
                }
            }

            /* Specify the descriptor number on TX. */
            if (!strncmp(long_option[option_index].name,
                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for tx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_tx_descriptor = ret;
                }
            }

            break;
        /* Invalid option - print options. */
        default:
            us_vhost_usage(prgname);
            return -1;
        }
    }

    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
        if (enabled_port_mask & (1 << i))
            ports[num_ports++] = (uint8_t)i;
    }

    if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
            "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
        return -1;
    }

    if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
        RTE_LOG(INFO, VHOST_PORT,
            "Vhost zero copy doesn't support software vm2vm, "
            "please specify 'vm2vm 2' to use hardware vm2vm.\n");
        return -1;
    }

    return 0;
}

/*
 * Update the global vars num_ports and ports according to the number of
 * system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
    unsigned valid_num_ports = num_ports;
    unsigned portid;

    if (num_ports > nb_ports) {
        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
            num_ports, nb_ports);
        num_ports = nb_ports;
    }

    for (portid = 0; portid < num_ports; portid++) {
        if (ports[portid] >= nb_ports) {
            RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                ports[portid], (nb_ports - 1));
            ports[portid] = INVALID_PORT_ID;
            valid_num_ports--;
        }
    }
    return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
    char *pkt_addr = (char *)(addr); \
    unsigned int index; \
    char packet[MAX_PRINT_BUFF]; \
    \
    if ((header)) \
        snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
    else \
        snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
    for (index = 0; index < (size); index++) { \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
            "%02hhx ", pkt_addr[index]); \
    } \
    snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
    \
    LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost virtual addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
{
    struct virtio_memory_regions *region;
    uint32_t regionidx;
    uint64_t vhost_va = 0;

    for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
        region = &dev->mem->regions[regionidx];
        if ((guest_pa >= region->guest_phys_address) &&
            (guest_pa <= region->guest_phys_address_end)) {
            vhost_va = region->address_offset + guest_pa;
            break;
        }
    }
    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n",
        dev->device_fh, (void *)(uintptr_t)guest_pa, (void *)(uintptr_t)vhost_va);

    return vhost_va;
}
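/*
 * Note (added for clarity): gpa_to_vva() returns 0 when the guest physical
 * address does not fall inside any registered memory region, since vhost_va
 * is only updated on a successful region match; callers treat the returned
 * value as the host virtual address of the virtio buffer.
 */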
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
    uint32_t buf_len, hpa_type *addr_type)
{
    struct virtio_memory_regions_hpa *region;
    uint32_t regionidx;
    uint64_t vhost_pa = 0;

    *addr_type = PHYS_ADDR_INVALID;

    for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
        region = &dev->mem->regions_hpa[regionidx];
        if ((guest_pa >= region->guest_phys_address) &&
            (guest_pa <= region->guest_phys_address_end)) {
            vhost_pa = region->host_phys_addr_offset + guest_pa;
            if (likely((guest_pa + buf_len - 1)
                <= region->guest_phys_address_end))
                *addr_type = PHYS_ADDR_CONTINUOUS;
            else
                *addr_type = PHYS_ADDR_CROSS_SUBREG;
            break;
        }
    }

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
        dev->device_fh, (void *)(uintptr_t)guest_pa,
        (void *)(uintptr_t)vhost_pa);

    return vhost_pa;
}

/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
{
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *buff;
    /* The virtio_hdr is initialised to 0. */
    struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
    uint64_t buff_addr = 0;
    uint64_t buff_hdr_addr = 0;
    uint32_t head[MAX_PKT_BURST], packet_len = 0;
    uint32_t head_idx, packet_success = 0;
    uint32_t mergeable, mrg_count = 0;
    uint32_t retry = 0;
    uint16_t avail_idx, res_cur_idx;
    uint16_t res_base_idx, res_end_idx;
    uint16_t free_entries;
    uint8_t success = 0;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
    vq = dev->virtqueue[VIRTIO_RXQ];
    count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
    /* As many data cores may want access to available buffers, they need to be reserved. */
    do {
        res_base_idx = vq->last_used_idx_res;
        avail_idx = *((volatile uint16_t *)&vq->avail->idx);

        free_entries = (avail_idx - res_base_idx);
        /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
        if (enable_retry && unlikely(count > free_entries)) {
            for (retry = 0; retry < burst_rx_retry_num; retry++) {
                rte_delay_us(burst_rx_delay_time);
                avail_idx =
                    *((volatile uint16_t *)&vq->avail->idx);
                free_entries = (avail_idx - res_base_idx);
                if (count <= free_entries)
                    break;
            }
        }

        /* check that we have enough buffers */
        if (unlikely(count > free_entries))
            count = free_entries;

        if (count == 0)
            return 0;

        res_end_idx = res_base_idx + count;
        /* vq->last_used_idx_res is atomically updated. */
        success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
                        res_end_idx);
    } while (unlikely(success == 0));
    res_cur_idx = res_base_idx;
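    /*
     * Note (added for clarity): once the cmpset above succeeds, this core
     * exclusively owns the available-ring slots in [res_base_idx,
     * res_end_idx); any competing core re-reads last_used_idx_res and
     * reserves a disjoint range on its next loop iteration.
     */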
    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);

    /* Prefetch available ring to retrieve indexes. */
    rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

    /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
    mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (head_idx = 0; head_idx < count; head_idx++)
        head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);

    while (res_cur_idx != res_end_idx) {
        /* Get descriptor from available ring */
        desc = &vq->desc[head[packet_success]];

        buff = pkts[packet_success];

        /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
        buff_addr = gpa_to_vva(dev, desc->addr);
        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        if (mergeable && (mrg_count != 0)) {
            desc->len = packet_len = rte_pktmbuf_data_len(buff);
        } else {
            /* Copy virtio_hdr to packet and increment buffer address */
            buff_hdr_addr = buff_addr;
            packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

            /*
             * If the descriptors are chained the header and data are
             * placed in separate buffers.
             */
            if (desc->flags & VRING_DESC_F_NEXT) {
                desc->len = vq->vhost_hlen;
                desc = &vq->desc[desc->next];
                /* Buffer address translation. */
                buff_addr = gpa_to_vva(dev, desc->addr);
                desc->len = rte_pktmbuf_data_len(buff);
            } else {
                buff_addr += vq->vhost_hlen;
                desc->len = packet_len;
            }
        }

        PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);

        /* Update used ring with desc information */
        vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
        vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;

        /* Copy mbuf data to buffer */
        rte_memcpy((void *)(uintptr_t)buff_addr, (const void *)buff->pkt.data, rte_pktmbuf_data_len(buff));

        res_cur_idx++;
        packet_success++;

        /* If mergeable is disabled then a header is required per buffer. */
        if (!mergeable) {
            rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
            PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
        } else {
            mrg_count++;
            /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
            if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
                virtio_hdr.num_buffers = mrg_count;
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
                rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
                PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
                mrg_count = 0;
            }
        }
        if (res_cur_idx < res_end_idx) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success]]);
        }
    }

    rte_compiler_barrier();

    /* Wait until it's our turn to add our buffer to the used ring. */
    while (unlikely(vq->last_used_idx != res_base_idx))
        rte_pause();

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx = res_end_idx;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);
    return count;
}
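/*
 * Note (added for clarity): the spin on vq->last_used_idx above makes cores
 * publish their reserved ranges to the used ring in reservation order, so the
 * guest never observes a gap in used->idx even when several cores enqueue to
 * the same device concurrently.
 */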
/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
    return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along
 * with a vlan tag to a VMDQ.
 */
static int
link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
{
    struct ether_hdr *pkt_hdr;
    struct virtio_net_data_ll *dev_ll;
    int i, ret;

    /* Learn MAC address of guest device from packet */
    pkt_hdr = (struct ether_hdr *)m->pkt.data;

    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
            RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
            return -1;
        }
        dev_ll = dev_ll->next;
    }

    for (i = 0; i < ETHER_ADDR_LEN; i++)
        dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

    /* vlan_tag currently uses the device_id. */
    dev->vlan_tag = vlan_tags[dev->device_fh];

    /* Print out VMDQ registration info. */
    RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
        dev->device_fh,
        dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
        dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
        dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
        dev->vlan_tag);

    /* Register the MAC address. */
    ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
    if (ret)
        RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
            dev->device_fh);

    /* Enable stripping of the vlan tag as we handle routing. */
    rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);

    /* Set device as ready for RX. */
    dev->ready = DEVICE_RX;

    return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct virtio_net *dev)
{
    unsigned i = 0;
    unsigned rx_count;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

    if (dev->ready == DEVICE_RX) {
        /* clear MAC and VLAN settings */
        rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
        for (i = 0; i < 6; i++)
            dev->mac_address.addr_bytes[i] = 0;

        dev->vlan_tag = 0;

        /* Clear out the receive buffers */
        rx_count = rte_eth_rx_burst(ports[0],
            (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

        while (rx_count) {
            for (i = 0; i < rx_count; i++)
                rte_pktmbuf_free(pkts_burst[i]);

            rx_count = rte_eth_rx_burst(ports[0],
                (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
        }

        dev->ready = DEVICE_MAC_LEARNING;
    }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
{
    struct virtio_net_data_ll *dev_ll;
    struct ether_hdr *pkt_hdr;
    uint64_t ret = 0;

    pkt_hdr = (struct ether_hdr *)m->pkt.data;

    /* get the used devices list */
    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
            &dev_ll->dev->mac_address)) {

            /* Drop the packet if the TX packet is destined for the TX device. */
            if (dev_ll->dev->device_fh == dev->device_fh) {
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                    dev_ll->dev->device_fh);
                return 0;
            }

            LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);

            if (dev_ll->dev->remove) {
                /* drop the packet if the device is marked for removal */
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
            } else {
                /* send the packet to the local virtio device */
                ret = virtio_dev_rx(dev_ll->dev, &m, 1);
                if (enable_stats) {
                    rte_atomic64_add(
                        &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
                        1);
                    rte_atomic64_add(
                        &dev_statistics[dev_ll->dev->device_fh].rx_atomic,
                        ret);
                    dev_statistics[dev->device_fh].tx_total++;
                    dev_statistics[dev->device_fh].tx += ret;
                }
            }

            return 0;
        }
        dev_ll = dev_ll->next;
    }

    return -1;
}

/*
 * This function routes the TX packet to the correct interface. This may be a
 * local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct virtio_net *dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
    struct mbuf_table *tx_q;
    struct vlan_ethhdr *vlan_hdr;
    struct rte_mbuf **m_table;
    struct rte_mbuf *mbuf;
    unsigned len, ret, offset = 0;
    const uint16_t lcore_id = rte_lcore_id();
    struct virtio_net_data_ll *dev_ll = ll_root_used;
    struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;

    /* check if destination is local VM */
    if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
        return;

    if (vm2vm_mode == VM2VM_HARDWARE) {
        while (dev_ll != NULL) {
            if ((dev_ll->dev->ready == DEVICE_RX)
                && ether_addr_cmp(&(pkt_hdr->d_addr),
                    &dev_ll->dev->mac_address)) {
                /*
                 * Drop the packet if the TX packet is
                 * destined for the TX device.
                 */
                if (dev_ll->dev->device_fh == dev->device_fh) {
                    LOG_DEBUG(VHOST_DATA,
                        "(%"PRIu64") TX: Source and destination"
                        " MAC addresses are the same. Dropping "
                        "packet.\n",
                        dev_ll->dev->device_fh);
                    return;
                }
                offset = 4;
                vlan_tag =
                    (uint16_t)
                    vlan_tags[(uint16_t)dev_ll->dev->device_fh];

                LOG_DEBUG(VHOST_DATA,
                    "(%"PRIu64") TX: pkt to local VM device id:"
                    "(%"PRIu64") vlan tag: %d.\n",
                    dev->device_fh, dev_ll->dev->device_fh,
                    vlan_tag);

                break;
            }
            dev_ll = dev_ll->next;
        }
    }

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

    /* Add packet to the port tx queue */
    tx_q = &lcore_tx_queue[lcore_id];
    len = tx_q->len;
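    /*
     * Note (added for clarity): the guest buffer referenced by 'm' cannot be
     * handed to the NIC directly on this path, so the frame is copied below
     * into a freshly allocated host mbuf that is VLAN_HLEN (+ offset) bytes
     * longer, leaving room to insert the 802.1Q tag between the Ethernet
     * header and the original payload.
     */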
    /* Allocate an mbuf and populate the structure. */
    mbuf = rte_pktmbuf_alloc(mbuf_pool);
    if (unlikely(mbuf == NULL)) {
        RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
        return;
    }

    mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
    mbuf->pkt.pkt_len = mbuf->pkt.data_len;

    /* Copy ethernet header to mbuf. */
    rte_memcpy((void *)mbuf->pkt.data, (const void *)m->pkt.data, ETH_HLEN);

    /* Setup vlan header. Bytes are reordered to network byte order with htons(). */
    vlan_hdr = (struct vlan_ethhdr *)mbuf->pkt.data;
    vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
    vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
    vlan_hdr->h_vlan_TCI = htons(vlan_tag);

    /* Copy the remaining packet contents to the mbuf. */
    rte_memcpy((void *)((uint8_t *)mbuf->pkt.data + VLAN_ETH_HLEN),
        (const void *)((uint8_t *)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
    tx_q->m_table[len] = mbuf;
    len++;
    if (enable_stats) {
        dev_statistics[dev->device_fh].tx_total++;
        dev_statistics[dev->device_fh].tx++;
    }

    if (unlikely(len == MAX_PKT_BURST)) {
        m_table = (struct rte_mbuf **)tx_q->m_table;
        ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
        /* Free any buffers not handled by TX and update the port stats. */
        if (unlikely(ret < len)) {
            do {
                rte_pktmbuf_free(m_table[ret]);
            } while (++ret < len);
        }

        len = 0;
    }

    tx_q->len = len;
    return;
}
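/*
 * Note (added for clarity): virtio_dev_tx() below drains the guest TX
 * virtqueue. Each descriptor's data buffer is wrapped in a stack-allocated
 * dummy mbuf (no copy at this stage) and handed to virtio_tx_route(), which
 * either delivers it to another local virtio device or copies it into a real
 * mbuf for transmission on the physical port; the used ring is updated as the
 * descriptors are consumed.
 */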
static inline void __attribute__((always_inline))
virtio_dev_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
{
    struct rte_mbuf m;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    uint64_t buff_addr = 0;
    uint32_t head[MAX_PKT_BURST];
    uint32_t used_idx;
    uint32_t i;
    uint16_t free_entries, packet_success = 0;
    uint16_t avail_idx;

    vq = dev->virtqueue[VIRTIO_TXQ];
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);

    /* If there are no available buffers then return. */
    if (vq->last_used_idx == avail_idx)
        return;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

    /* Prefetch available ring to retrieve head indexes. */
    rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

    /* get the number of free entries in the ring */
    free_entries = (avail_idx - vq->last_used_idx);

    /* Limit to MAX_PKT_BURST. */
    if (free_entries > MAX_PKT_BURST)
        free_entries = MAX_PKT_BURST;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (i = 0; i < free_entries; i++)
        head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

    while (packet_success < free_entries) {
        desc = &vq->desc[head[packet_success]];

        /* Discard first buffer as it is the virtio header */
        desc = &vq->desc[desc->next];

        /* Buffer address translation. */
        buff_addr = gpa_to_vva(dev, desc->addr);
        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        used_idx = vq->last_used_idx & (vq->size - 1);

        if (packet_success < (free_entries - 1)) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success+1]]);
            rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
        }

        /* Update used index buffer information. */
        vq->used->ring[used_idx].id = head[packet_success];
        vq->used->ring[used_idx].len = 0;

        /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
        m.pkt.data_len = desc->len;
        m.pkt.data = (void *)(uintptr_t)buff_addr;

        PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

        /* If this is the first received packet we need to learn the MAC and setup VMDQ */
        if (dev->ready == DEVICE_MAC_LEARNING) {
            if (dev->remove || (link_vmdq(dev, &m) == -1)) {
                /* discard frame if device is scheduled for removal or a duplicate MAC address is found. */
                packet_success += free_entries;
                vq->last_used_idx += packet_success;
                break;
            }
        }
        virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);

        vq->last_used_idx++;
        packet_success++;
    }

    rte_compiler_barrier();
    vq->used->idx += packet_success;
    /* Kick guest if required. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);
}

/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
    struct rte_mempool *mbuf_pool = arg;
    struct virtio_net *dev = NULL;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct virtio_net_data_ll *dev_ll;
    struct mbuf_table *tx_q;
    volatile struct lcore_ll_info *lcore_ll;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
    uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
    unsigned ret, i;
    const uint16_t lcore_id = rte_lcore_id();
    const uint16_t num_cores = (uint16_t)rte_lcore_count();
    uint16_t rx_count = 0;

    RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
    lcore_ll = lcore_info[lcore_id].lcore_ll;
    prev_tsc = 0;

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < num_cores; i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {

            if (tx_q->len) {
                LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                /* Tx any packets in the queue */
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                        (struct rte_mbuf **)tx_q->m_table,
                        (uint16_t)tx_q->len);
                if (unlikely(ret < tx_q->len)) {
                    do {
                        rte_pktmbuf_free(tx_q->m_table[ret]);
                    } while (++ret < tx_q->len);
                }

                tx_q->len = 0;
            }

            prev_tsc = cur_tsc;

        }

        rte_prefetch0(lcore_ll->ll_root_used);
        /*
         * Inform the configuration core that we have exited the linked list
         * and that no devices are in use if requested.
         */
        if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

        /*
         * Process devices
         */
        dev_ll = lcore_ll->ll_root_used;

        while (dev_ll != NULL) {
            /* get virtio device ID */
            dev = dev_ll->dev;

            if (dev->remove) {
                dev_ll = dev_ll->next;
                unlink_vmdq(dev);
                dev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }
            if (likely(dev->ready == DEVICE_RX)) {
                /* Handle guest RX */
                rx_count = rte_eth_rx_burst(ports[0],
                    (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                if (rx_count) {
                    ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
                    if (enable_stats) {
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
                            rx_count);
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
                    }
                    while (likely(rx_count)) {
                        rx_count--;
                        rte_pktmbuf_free_seg(pkts_burst[rx_count]);
                    }

                }
            }

            if (!dev->remove)
                /* Handle guest TX */
                virtio_dev_tx(dev, mbuf_pool);

            /* move to the next device in the list */
            dev_ll = dev_ll->next;
        }
    }

    return 0;
}

/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;

    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
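/*
 * Note (added for clarity): in the zero copy path, last_used_idx_res marks
 * the point up to which available-ring entries have already been reserved,
 * so the count returned above only covers entries that have not yet been
 * claimed by a previous reservation.
 */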
/*
 * This function gets an available ring index for zero copy rx; it will retry
 * 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
    uint16_t *res_base_idx, uint32_t count)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;
    uint32_t retry = 0;
    uint16_t free_entries;

    *res_base_idx = vq->last_used_idx_res;
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    free_entries = (avail_idx - *res_base_idx);

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
        "avail idx: %d, "
        "res base idx:%d, free entries:%d\n",
        dev->device_fh, avail_idx, *res_base_idx,
        free_entries);

    /*
     * If retry is enabled and the queue is full then we wait
     * and retry to avoid packet loss.
     */
    if (enable_retry && unlikely(count > free_entries)) {
        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            avail_idx = *((volatile uint16_t *)&vq->avail->idx);
            free_entries = (avail_idx - *res_base_idx);
            if (count <= free_entries)
                break;
        }
    }

    /* check that we have enough buffers */
    if (unlikely(count > free_entries))
        count = free_entries;

    if (unlikely(count == 0)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") Fail in get_available_ring_index_zcp: "
            "avail idx: %d, res base idx:%d, free entries:%d\n",
            dev->device_fh, avail_idx,
            *res_base_idx, free_entries);
        return 0;
    }

    vq->last_used_idx_res = *res_base_idx + count;

    return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
    uint16_t res_cur_idx = vq->last_used_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
    rte_compiler_barrier();
    *(volatile uint16_t *)&vq->used->idx += 1;
    vq->last_used_idx += 1;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);
}
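/*
 * Note (added for clarity): the zero copy RX flow below avoids copying frame
 * data into host buffers. attach_rxmbuf_zcp() points an idle mbuf from
 * vpool->ring at the guest-provided descriptor buffer (translated to host
 * virtual and physical addresses), so the PMD DMAs received frames straight
 * into guest memory.
 */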
/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset of buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
    uint16_t res_base_idx, desc_idx;
    uint64_t buff_addr, phys_addr;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *mbuf = NULL;
    struct vpool *vpool;
    hpa_type addr_type;

    vpool = &vpool_array[dev->vmdq_rx_q];
    vq = dev->virtqueue[VIRTIO_RXQ];

    do {
        if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
            1) != 1))
            return;
        desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

        desc = &vq->desc[desc_idx];
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc = &vq->desc[desc->next];
            buff_addr = gpa_to_vva(dev, desc->addr);
            phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
                    &addr_type);
        } else {
            buff_addr = gpa_to_vva(dev,
                    desc->addr + vq->vhost_hlen);
            phys_addr = gpa_to_hpa(dev,
                    desc->addr + vq->vhost_hlen,
                    desc->len, &addr_type);
        }

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
                " address found when attaching RX frame buffer"
                " address!\n", dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }

        /*
         * Check if the frame buffer address from guest crosses
         * sub-region or not.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Frame buffer address cross "
                "sub-region found when attaching RX frame "
                "buffer address!\n",
                dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }
    } while (unlikely(phys_addr == 0));

    rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
    if (unlikely(mbuf == NULL)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: "
            "ring_sc_dequeue fail.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (unlikely(vpool->buf_size > desc->len)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
            "length(%d) of descriptor idx: %d less than room "
            "size required: %d\n",
            dev->device_fh, desc->len, desc_idx, vpool->buf_size);
        put_desc_to_used_list_zcp(vq, desc_idx);
        rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
        return;
    }

    mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
    mbuf->pkt.data = (void *)(uintptr_t)(buff_addr);
    mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
    mbuf->pkt.data_len = desc->len;
    MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
        "descriptor idx:%d\n",
        dev->device_fh, res_base_idx, desc_idx);

    __rte_mbuf_raw_free(mbuf);

    return;
}

/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
    const struct rte_mempool *mp = m->pool;
    void *buf = RTE_MBUF_TO_BADDR(m);
    uint32_t buf_ofs;
    uint32_t buf_len = mp->elt_size - sizeof(*m);
    m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

    m->buf_addr = buf;
    m->buf_len = (uint16_t)buf_len;

    buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
            RTE_PKTMBUF_HEADROOM : m->buf_len;
    m->pkt.data = (char *)m->buf_addr + buf_ofs;

    m->pkt.data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
1762  */
1763 static inline uint32_t __attribute__((always_inline))
1764 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1765 {
1766 	struct rte_mbuf *mbuf;
1767 	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1768 	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1769 	uint32_t index = 0;
1770 	uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1771 
1772 	LOG_DEBUG(VHOST_DATA,
1773 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1774 		"clean is: %d\n",
1775 		dev->device_fh, mbuf_count);
1776 	LOG_DEBUG(VHOST_DATA,
1777 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1778 		"clean is : %d\n",
1779 		dev->device_fh, rte_ring_count(vpool->ring));
1780 
1781 	for (index = 0; index < mbuf_count; index++) {
1782 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1783 		if (likely(RTE_MBUF_INDIRECT(mbuf)))
1784 			pktmbuf_detach_zcp(mbuf);
1785 		rte_ring_sp_enqueue(vpool->ring, mbuf);
1786 
1787 		/* Update used index buffer information. */
1788 		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1789 		vq->used->ring[used_idx].len = 0;
1790 
1791 		used_idx = (used_idx + 1) & (vq->size - 1);
1792 	}
1793 
1794 	LOG_DEBUG(VHOST_DATA,
1795 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1796 		"clean is: %d\n",
1797 		dev->device_fh, rte_mempool_count(vpool->pool));
1798 	LOG_DEBUG(VHOST_DATA,
1799 		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1800 		"clean is : %d\n",
1801 		dev->device_fh, rte_ring_count(vpool->ring));
1802 	LOG_DEBUG(VHOST_DATA,
1803 		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
1804 		"vq->last_used_idx:%d\n",
1805 		dev->device_fh, vq->last_used_idx);
1806 
1807 	vq->last_used_idx += mbuf_count;
1808 
1809 	LOG_DEBUG(VHOST_DATA,
1810 		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
1811 		"vq->last_used_idx:%d\n",
1812 		dev->device_fh, vq->last_used_idx);
1813 
1814 	rte_compiler_barrier();
1815 
1816 	*(volatile uint16_t *)&vq->used->idx += mbuf_count;
1817 
1818 	/* Kick guest if required. */
1819 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1820 		eventfd_write((int)vq->kickfd, 1);
1821 
1822 	return 0;
1823 }
1824 
1825 /*
1826  * This function is called when a virtio device is destroyed. It fetches each
1827  * mbuf from vpool->pool, detaches it, and puts it back into vpool->ring.
1828  */
1829 static void mbuf_destroy_zcp(struct vpool *vpool)
1830 {
1831 	struct rte_mbuf *mbuf = NULL;
1832 	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1833 
1834 	LOG_DEBUG(VHOST_CONFIG,
1835 		"in mbuf_destroy_zcp: mbuf count in mempool before "
1836 		"mbuf_destroy_zcp is: %d\n",
1837 		mbuf_count);
1838 	LOG_DEBUG(VHOST_CONFIG,
1839 		"in mbuf_destroy_zcp: mbuf count in ring before "
1840 		"mbuf_destroy_zcp is : %d\n",
1841 		rte_ring_count(vpool->ring));
1842 
1843 	for (index = 0; index < mbuf_count; index++) {
1844 		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1845 		if (likely(mbuf != NULL)) {
1846 			if (likely(RTE_MBUF_INDIRECT(mbuf)))
1847 				pktmbuf_detach_zcp(mbuf);
1848 			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1849 		}
1850 	}
1851 
1852 	LOG_DEBUG(VHOST_CONFIG,
1853 		"in mbuf_destroy_zcp: mbuf count in mempool after "
1854 		"mbuf_destroy_zcp is: %d\n",
1855 		rte_mempool_count(vpool->pool));
1856 	LOG_DEBUG(VHOST_CONFIG,
1857 		"in mbuf_destroy_zcp: mbuf count in ring after "
1858 		"mbuf_destroy_zcp is : %d\n",
1859 		rte_ring_count(vpool->ring));
1860 }
1861 
1862 /*
1863  * This function updates the used ring and its counters for the zero-copy RX path.
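 * In the zero-copy case the mbufs passed in already reference guest
 * buffers that were attached by attach_rxmbuf_zcp(), so only the virtio
 * header and the used ring entries need to be written back here.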
1864 */ 1865 static inline uint32_t __attribute__((always_inline)) 1866 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1867 uint32_t count) 1868 { 1869 struct vhost_virtqueue *vq; 1870 struct vring_desc *desc; 1871 struct rte_mbuf *buff; 1872 /* The virtio_hdr is initialised to 0. */ 1873 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1874 = {{0, 0, 0, 0, 0, 0}, 0}; 1875 uint64_t buff_hdr_addr = 0; 1876 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1877 uint32_t head_idx, packet_success = 0; 1878 uint16_t res_cur_idx; 1879 1880 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1881 1882 if (count == 0) 1883 return 0; 1884 1885 vq = dev->virtqueue[VIRTIO_RXQ]; 1886 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1887 1888 res_cur_idx = vq->last_used_idx; 1889 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1890 dev->device_fh, res_cur_idx, res_cur_idx + count); 1891 1892 /* Retrieve all of the head indexes first to avoid caching issues. */ 1893 for (head_idx = 0; head_idx < count; head_idx++) 1894 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1895 1896 /*Prefetch descriptor index. */ 1897 rte_prefetch0(&vq->desc[head[packet_success]]); 1898 1899 while (packet_success != count) { 1900 /* Get descriptor from available ring */ 1901 desc = &vq->desc[head[packet_success]]; 1902 1903 buff = pkts[packet_success]; 1904 LOG_DEBUG(VHOST_DATA, 1905 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1906 "pkt[%d] descriptor idx: %d\n", 1907 dev->device_fh, packet_success, 1908 MBUF_HEADROOM_UINT32(buff)); 1909 1910 PRINT_PACKET(dev, 1911 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1912 + RTE_PKTMBUF_HEADROOM), 1913 rte_pktmbuf_data_len(buff), 0); 1914 1915 /* Buffer address translation for virtio header. */ 1916 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1917 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1918 1919 /* 1920 * If the descriptors are chained the header and data are 1921 * placed in separate buffers. 1922 */ 1923 if (desc->flags & VRING_DESC_F_NEXT) { 1924 desc->len = vq->vhost_hlen; 1925 desc = &vq->desc[desc->next]; 1926 desc->len = rte_pktmbuf_data_len(buff); 1927 } else { 1928 desc->len = packet_len; 1929 } 1930 1931 /* Update used ring with desc information */ 1932 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1933 = head[packet_success]; 1934 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1935 = packet_len; 1936 res_cur_idx++; 1937 packet_success++; 1938 1939 /* A header is required per buffer. */ 1940 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1941 (const void *)&virtio_hdr, vq->vhost_hlen); 1942 1943 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1944 1945 if (likely(packet_success < count)) { 1946 /* Prefetch descriptor index. */ 1947 rte_prefetch0(&vq->desc[head[packet_success]]); 1948 } 1949 } 1950 1951 rte_compiler_barrier(); 1952 1953 LOG_DEBUG(VHOST_DATA, 1954 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1955 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1956 dev->device_fh, vq->last_used_idx, vq->used->idx); 1957 1958 *(volatile uint16_t *)&vq->used->idx += count; 1959 vq->last_used_idx += count; 1960 1961 LOG_DEBUG(VHOST_DATA, 1962 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1963 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1964 dev->device_fh, vq->last_used_idx, vq->used->idx); 1965 1966 /* Kick the guest if necessary. 
	 */
1967 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1968 		eventfd_write((int)vq->kickfd, 1);
1969 
1970 	return count;
1971 }
1972 
1973 /*
1974  * This function routes the TX packet to the correct interface.
1975  * This may be a local device or the physical port.
1976  */
1977 static inline void __attribute__((always_inline))
1978 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1979 	uint32_t desc_idx, uint8_t need_copy)
1980 {
1981 	struct mbuf_table *tx_q;
1982 	struct rte_mbuf **m_table;
1983 	struct rte_mbuf *mbuf = NULL;
1984 	unsigned len, ret, offset = 0;
1985 	struct vpool *vpool;
1986 	struct virtio_net_data_ll *dev_ll = ll_root_used;
1987 	struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
1988 	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1989 
1990 	/* Add packet to the port TX queue. */
1991 	tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
1992 	len = tx_q->len;
1993 
1994 	/* Allocate an mbuf and populate the structure. */
1995 	vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
1996 	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1997 	if (unlikely(mbuf == NULL)) {
1998 		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1999 		RTE_LOG(ERR, VHOST_DATA,
2000 			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
2001 			dev->device_fh);
2002 		put_desc_to_used_list_zcp(vq, desc_idx);
2003 		return;
2004 	}
2005 
2006 	if (vm2vm_mode == VM2VM_HARDWARE) {
2007 		/* Avoid using a VLAN tag from any VM for an external packet,
2008 		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts with
2009 		 * pool selection: the MAC address identifies it as an external
2010 		 * packet that should go out to the network, while the VLAN tag
2011 		 * identifies it as a VM2VM packet that should be forwarded to
2012 		 * another VM. The hardware cannot resolve such an ambiguous situation, so the packet would be lost.
2013 		 */
2014 		vlan_tag = external_pkt_default_vlan_tag;
2015 		while (dev_ll != NULL) {
2016 			if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2017 				ether_addr_cmp(&(pkt_hdr->d_addr),
2018 				&dev_ll->dev->mac_address)) {
2019 
2020 				/*
2021 				 * Drop the packet if the TX packet is destined
2022 				 * for the TX device.
2023 				 */
2024 				if (unlikely(dev_ll->dev->device_fh
2025 					== dev->device_fh)) {
2026 					LOG_DEBUG(VHOST_DATA,
2027 					"(%"PRIu64") TX: Source and destination "
2028 					"MAC addresses are the same. Dropping "
2029 					"packet.\n",
2030 					dev_ll->dev->device_fh);
2031 					MBUF_HEADROOM_UINT32(mbuf)
2032 						= (uint32_t)desc_idx;
2033 					__rte_mbuf_raw_free(mbuf);
2034 					return;
2035 				}
2036 
2037 				/*
2038 				 * Offset the packet length by 4 bytes for the HW
2039 				 * VLAN tag stripped when the L2 switch loops back.
2040 				 */
2041 				offset = 4;
2042 				vlan_tag =
2043 					(uint16_t)
2044 					vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2045 
2046 				LOG_DEBUG(VHOST_DATA,
2047 				"(%"PRIu64") TX: pkt to local VM device id:"
2048 				"(%"PRIu64") vlan tag: %d.\n",
2049 				dev->device_fh, dev_ll->dev->device_fh,
2050 				vlan_tag);
2051 
2052 				break;
2053 			}
2054 			dev_ll = dev_ll->next;
2055 		}
2056 	}
2057 
2058 	mbuf->pkt.nb_segs = m->pkt.nb_segs;
2059 	mbuf->pkt.next = m->pkt.next;
2060 	mbuf->pkt.data_len = m->pkt.data_len + offset;
2061 	mbuf->pkt.pkt_len = mbuf->pkt.data_len;
2062 	if (unlikely(need_copy)) {
2063 		/* Copy the packet contents to the mbuf.
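		 * This only happens when need_copy is set, i.e. when the guest
		 * buffer crosses a host memory sub-region and cannot be used
		 * for zero copy.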
		 */
2064 		rte_memcpy((void *)((uint8_t *)mbuf->pkt.data),
2065 			(const void *) ((uint8_t *)m->pkt.data),
2066 			m->pkt.data_len);
2067 	} else {
2068 		mbuf->pkt.data = m->pkt.data;
2069 		mbuf->buf_physaddr = m->buf_physaddr;
2070 		mbuf->buf_addr = m->buf_addr;
2071 	}
2072 	mbuf->ol_flags = PKT_TX_VLAN_PKT;
2073 	mbuf->pkt.vlan_macip.f.vlan_tci = vlan_tag;
2074 	mbuf->pkt.vlan_macip.f.l2_len = sizeof(struct ether_hdr);
2075 	mbuf->pkt.vlan_macip.f.l3_len = sizeof(struct ipv4_hdr);
2076 	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2077 
2078 	tx_q->m_table[len] = mbuf;
2079 	len++;
2080 
2081 	LOG_DEBUG(VHOST_DATA,
2082 		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2083 		dev->device_fh,
2084 		mbuf->pkt.nb_segs,
2085 		(mbuf->pkt.next == NULL) ? "null" : "non-null");
2086 
2087 	if (enable_stats) {
2088 		dev_statistics[dev->device_fh].tx_total++;
2089 		dev_statistics[dev->device_fh].tx++;
2090 	}
2091 
2092 	if (unlikely(len == MAX_PKT_BURST)) {
2093 		m_table = (struct rte_mbuf **)tx_q->m_table;
2094 		ret = rte_eth_tx_burst(ports[0],
2095 			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2096 
2097 		/*
2098 		 * Free any buffers not handled by TX and update
2099 		 * the port stats.
2100 		 */
2101 		if (unlikely(ret < len)) {
2102 			do {
2103 				rte_pktmbuf_free(m_table[ret]);
2104 			} while (++ret < len);
2105 		}
2106 
2107 		len = 0;
2108 		txmbuf_clean_zcp(dev, vpool);
2109 	}
2110 
2111 	tx_q->len = len;
2112 
2113 	return;
2114 }
2115 
2116 /*
2117  * This function transmits all available packets in the virtio TX queue of one
2118  * virtio-net device. If it is the first packet, it learns the MAC address and
2119  * sets up VMDQ.
2120  */
2121 static inline void __attribute__((always_inline))
2122 virtio_dev_tx_zcp(struct virtio_net *dev)
2123 {
2124 	struct rte_mbuf m;
2125 	struct vhost_virtqueue *vq;
2126 	struct vring_desc *desc;
2127 	uint64_t buff_addr = 0, phys_addr;
2128 	uint32_t head[MAX_PKT_BURST];
2129 	uint32_t i;
2130 	uint16_t free_entries, packet_success = 0;
2131 	uint16_t avail_idx;
2132 	uint8_t need_copy = 0;
2133 	hpa_type addr_type;
2134 
2135 	vq = dev->virtqueue[VIRTIO_TXQ];
2136 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2137 
2138 	/* If there are no available buffers then return. */
2139 	if (vq->last_used_idx_res == avail_idx)
2140 		return;
2141 
2142 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
2143 
2144 	/* Prefetch available ring to retrieve head indexes. */
2145 	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2146 
2147 	/* Get the number of free entries in the ring */
2148 	free_entries = (avail_idx - vq->last_used_idx_res);
2149 
2150 	/* Limit to MAX_PKT_BURST. */
2151 	free_entries
2152 		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2153 
2154 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2155 		dev->device_fh, free_entries);
2156 
2157 	/* Retrieve all of the head indexes first to avoid caching issues. */
2158 	for (i = 0; i < free_entries; i++)
2159 		head[i]
2160 			= vq->avail->ring[(vq->last_used_idx_res + i)
2161 			& (vq->size - 1)];
2162 
2163 	vq->last_used_idx_res += free_entries;
2164 
2165 	/* Prefetch descriptor index. */
2166 	rte_prefetch0(&vq->desc[head[packet_success]]);
2167 	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2168 
2169 	while (packet_success < free_entries) {
2170 		desc = &vq->desc[head[packet_success]];
2171 
2172 		/* Discard first buffer as it is the virtio header */
2173 		desc = &vq->desc[desc->next];
2174 
2175 		/* Buffer address translation.
		 */
2176 		buff_addr = gpa_to_vva(dev, desc->addr);
2177 		phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2178 
2179 		if (likely(packet_success < (free_entries - 1)))
2180 			/* Prefetch descriptor index. */
2181 			rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2182 
2183 		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2184 			RTE_LOG(ERR, VHOST_DATA,
2185 				"(%"PRIu64") Invalid frame buffer address found "
2186 				"when transmitting packets!\n",
2187 				dev->device_fh);
2188 			packet_success++;
2189 			continue;
2190 		}
2191 
2192 		/* Prefetch buffer address. */
2193 		rte_prefetch0((void *)(uintptr_t)buff_addr);
2194 
2195 		/*
2196 		 * Setup dummy mbuf. This is copied to a real mbuf if
2197 		 * transmitted out the physical port.
2198 		 */
2199 		m.pkt.data_len = desc->len;
2200 		m.pkt.nb_segs = 1;
2201 		m.pkt.next = NULL;
2202 		m.pkt.data = (void *)(uintptr_t)buff_addr;
2203 		m.buf_addr = m.pkt.data;
2204 		m.buf_physaddr = phys_addr;
2205 
2206 		/*
2207 		 * Check if the frame buffer address from guest crosses
2208 		 * sub-region or not.
2209 		 */
2210 		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2211 			RTE_LOG(ERR, VHOST_DATA,
2212 				"(%"PRIu64") Frame buffer address cross "
2213 				"sub-region found when attaching TX frame "
2214 				"buffer address!\n",
2215 				dev->device_fh);
2216 			need_copy = 1;
2217 		} else
2218 			need_copy = 0;
2219 
2220 		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2221 
2222 		/*
2223 		 * If this is the first received packet we need to learn
2224 		 * the MAC and set up VMDQ.
2225 		 */
2226 		if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2227 			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2228 				/*
2229 				 * Discard frame if device is scheduled for
2230 				 * removal or a duplicate MAC address is found.
2231 				 */
2232 				packet_success += free_entries;
2233 				vq->last_used_idx += packet_success;
2234 				break;
2235 			}
2236 		}
2237 
2238 		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2239 		packet_success++;
2240 	}
2241 }
2242 
2243 /*
2244  * This function is called by each data core. It handles all RX/TX registered
2245  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2246  * addresses are compared with all devices in the main linked list.
2247  */
2248 static int
2249 switch_worker_zcp(__attribute__((unused)) void *arg)
2250 {
2251 	struct virtio_net *dev = NULL;
2252 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2253 	struct virtio_net_data_ll *dev_ll;
2254 	struct mbuf_table *tx_q;
2255 	volatile struct lcore_ll_info *lcore_ll;
2256 	const uint64_t drain_tsc
2257 		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2258 		* BURST_TX_DRAIN_US;
2259 	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2260 	unsigned ret;
2261 	const uint16_t lcore_id = rte_lcore_id();
2262 	uint16_t count_in_ring, rx_count = 0;
2263 
2264 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2265 
2266 	lcore_ll = lcore_info[lcore_id].lcore_ll;
2267 	prev_tsc = 0;
2268 
2269 	while (1) {
2270 		cur_tsc = rte_rdtsc();
2271 
2272 		/* TX burst queue drain */
2273 		diff_tsc = cur_tsc - prev_tsc;
2274 		if (unlikely(diff_tsc > drain_tsc)) {
2275 			/*
2276 			 * Get mbufs from vpool.pool, detach them and
2277 			 * put them back into vpool.ring.
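			 * This is done by txmbuf_clean_zcp() once the drained
			 * packets have been handed to the NIC, so the completed
			 * TX descriptors are returned to the guest.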
2278 */ 2279 dev_ll = lcore_ll->ll_root_used; 2280 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) { 2281 /* Get virtio device ID */ 2282 dev = dev_ll->dev; 2283 2284 if (likely(!dev->remove)) { 2285 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q]; 2286 if (tx_q->len) { 2287 LOG_DEBUG(VHOST_DATA, 2288 "TX queue drained after timeout" 2289 " with burst size %u\n", 2290 tx_q->len); 2291 2292 /* 2293 * Tx any packets in the queue 2294 */ 2295 ret = rte_eth_tx_burst( 2296 ports[0], 2297 (uint16_t)tx_q->txq_id, 2298 (struct rte_mbuf **) 2299 tx_q->m_table, 2300 (uint16_t)tx_q->len); 2301 if (unlikely(ret < tx_q->len)) { 2302 do { 2303 rte_pktmbuf_free( 2304 tx_q->m_table[ret]); 2305 } while (++ret < tx_q->len); 2306 } 2307 tx_q->len = 0; 2308 2309 txmbuf_clean_zcp(dev, 2310 &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]); 2311 } 2312 } 2313 dev_ll = dev_ll->next; 2314 } 2315 prev_tsc = cur_tsc; 2316 } 2317 2318 rte_prefetch0(lcore_ll->ll_root_used); 2319 2320 /* 2321 * Inform the configuration core that we have exited the linked 2322 * list and that no devices are in use if requested. 2323 */ 2324 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2325 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2326 2327 /* Process devices */ 2328 dev_ll = lcore_ll->ll_root_used; 2329 2330 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) { 2331 dev = dev_ll->dev; 2332 if (unlikely(dev->remove)) { 2333 dev_ll = dev_ll->next; 2334 unlink_vmdq(dev); 2335 dev->ready = DEVICE_SAFE_REMOVE; 2336 continue; 2337 } 2338 2339 if (likely(dev->ready == DEVICE_RX)) { 2340 uint32_t index = dev->vmdq_rx_q; 2341 uint16_t i; 2342 count_in_ring 2343 = rte_ring_count(vpool_array[index].ring); 2344 uint16_t free_entries 2345 = (uint16_t)get_available_ring_num_zcp(dev); 2346 2347 /* 2348 * Attach all mbufs in vpool.ring and put back 2349 * into vpool.pool. 2350 */ 2351 for (i = 0; 2352 i < RTE_MIN(free_entries, 2353 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2354 i++) 2355 attach_rxmbuf_zcp(dev); 2356 2357 /* Handle guest RX */ 2358 rx_count = rte_eth_rx_burst(ports[0], 2359 (uint16_t)dev->vmdq_rx_q, pkts_burst, 2360 MAX_PKT_BURST); 2361 2362 if (rx_count) { 2363 ret_count = virtio_dev_rx_zcp(dev, 2364 pkts_burst, rx_count); 2365 if (enable_stats) { 2366 dev_statistics[dev->device_fh].rx_total 2367 += rx_count; 2368 dev_statistics[dev->device_fh].rx 2369 += ret_count; 2370 } 2371 while (likely(rx_count)) { 2372 rx_count--; 2373 pktmbuf_detach_zcp( 2374 pkts_burst[rx_count]); 2375 rte_ring_sp_enqueue( 2376 vpool_array[index].ring, 2377 (void *)pkts_burst[rx_count]); 2378 } 2379 } 2380 } 2381 2382 if (likely(!dev->remove)) 2383 /* Handle guest TX */ 2384 virtio_dev_tx_zcp(dev); 2385 2386 /* Move to the next device in the list */ 2387 dev_ll = dev_ll->next; 2388 } 2389 } 2390 2391 return 0; 2392 } 2393 2394 2395 /* 2396 * Add an entry to a used linked list. A free entry must first be found 2397 * in the free linked list using get_data_ll_free_entry(); 2398 */ 2399 static void 2400 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2401 struct virtio_net_data_ll *ll_dev) 2402 { 2403 struct virtio_net_data_ll *ll = *ll_root_addr; 2404 2405 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2406 ll_dev->next = NULL; 2407 rte_compiler_barrier(); 2408 2409 /* If ll == NULL then this is the first device. */ 2410 if (ll) { 2411 /* Increment to the tail of the linked list. 
*/ 2412 while ((ll->next != NULL) ) 2413 ll = ll->next; 2414 2415 ll->next = ll_dev; 2416 } else { 2417 *ll_root_addr = ll_dev; 2418 } 2419 } 2420 2421 /* 2422 * Remove an entry from a used linked list. The entry must then be added to 2423 * the free linked list using put_data_ll_free_entry(). 2424 */ 2425 static void 2426 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2427 struct virtio_net_data_ll *ll_dev, 2428 struct virtio_net_data_ll *ll_dev_last) 2429 { 2430 struct virtio_net_data_ll *ll = *ll_root_addr; 2431 2432 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2433 return; 2434 2435 if (ll_dev == ll) 2436 *ll_root_addr = ll_dev->next; 2437 else 2438 if (likely(ll_dev_last != NULL)) 2439 ll_dev_last->next = ll_dev->next; 2440 else 2441 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2442 } 2443 2444 /* 2445 * Find and return an entry from the free linked list. 2446 */ 2447 static struct virtio_net_data_ll * 2448 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2449 { 2450 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2451 struct virtio_net_data_ll *ll_dev; 2452 2453 if (ll_free == NULL) 2454 return NULL; 2455 2456 ll_dev = ll_free; 2457 *ll_root_addr = ll_free->next; 2458 2459 return ll_dev; 2460 } 2461 2462 /* 2463 * Place an entry back on to the free linked list. 2464 */ 2465 static void 2466 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2467 struct virtio_net_data_ll *ll_dev) 2468 { 2469 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2470 2471 if (ll_dev == NULL) 2472 return; 2473 2474 ll_dev->next = ll_free; 2475 *ll_root_addr = ll_dev; 2476 } 2477 2478 /* 2479 * Creates a linked list of a given size. 2480 */ 2481 static struct virtio_net_data_ll * 2482 alloc_data_ll(uint32_t size) 2483 { 2484 struct virtio_net_data_ll *ll_new; 2485 uint32_t i; 2486 2487 /* Malloc and then chain the linked list. */ 2488 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2489 if (ll_new == NULL) { 2490 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2491 return NULL; 2492 } 2493 2494 for (i = 0; i < size - 1; i++) { 2495 ll_new[i].dev = NULL; 2496 ll_new[i].next = &ll_new[i+1]; 2497 } 2498 ll_new[i].next = NULL; 2499 2500 return (ll_new); 2501 } 2502 2503 /* 2504 * Create the main linked list along with each individual cores linked list. A used and a free list 2505 * are created to manage entries. 2506 */ 2507 static int 2508 init_data_ll (void) 2509 { 2510 int lcore; 2511 2512 RTE_LCORE_FOREACH_SLAVE(lcore) { 2513 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2514 if (lcore_info[lcore].lcore_ll == NULL) { 2515 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2516 return -1; 2517 } 2518 2519 lcore_info[lcore].lcore_ll->device_num = 0; 2520 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2521 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2522 if (num_devices % num_switching_cores) 2523 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2524 else 2525 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2526 } 2527 2528 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2529 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2530 2531 return 0; 2532 } 2533 2534 /* 2535 * Set virtqueue flags so that we do not receive interrupts. 
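 * Setting VRING_USED_F_NO_NOTIFY in used->flags asks the guest not to
 * kick the host when it adds buffers, since the data cores poll the
 * virtqueues instead of waiting for notifications.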
2536  */
2537 static void
2538 set_irq_status (struct virtio_net *dev)
2539 {
2540 	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2541 	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2542 }
2543 
2544 /*
2545  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2546  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2547  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2548  */
2549 static void
2550 destroy_device (volatile struct virtio_net *dev)
2551 {
2552 	struct virtio_net_data_ll *ll_lcore_dev_cur;
2553 	struct virtio_net_data_ll *ll_main_dev_cur;
2554 	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2555 	struct virtio_net_data_ll *ll_main_dev_last = NULL;
2556 	int lcore;
2557 
2558 	dev->flags &= ~VIRTIO_DEV_RUNNING;
2559 
2560 	/* Set the remove flag. */
2561 	dev->remove = 1;
2562 
2563 	while(dev->ready != DEVICE_SAFE_REMOVE) {
2564 		rte_pause();
2565 	}
2566 
2567 	/* Search for entry to be removed from lcore ll */
2568 	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
2569 	while (ll_lcore_dev_cur != NULL) {
2570 		if (ll_lcore_dev_cur->dev == dev) {
2571 			break;
2572 		} else {
2573 			ll_lcore_dev_last = ll_lcore_dev_cur;
2574 			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2575 		}
2576 	}
2577 
2578 	if (ll_lcore_dev_cur == NULL) {
2579 		RTE_LOG(ERR, VHOST_CONFIG,
2580 			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
2581 			dev->device_fh);
2582 		return;
2583 	}
2584 
2585 	/* Search for entry to be removed from main ll */
2586 	ll_main_dev_cur = ll_root_used;
2587 	ll_main_dev_last = NULL;
2588 	while (ll_main_dev_cur != NULL) {
2589 		if (ll_main_dev_cur->dev == dev) {
2590 			break;
2591 		} else {
2592 			ll_main_dev_last = ll_main_dev_cur;
2593 			ll_main_dev_cur = ll_main_dev_cur->next;
2594 		}
2595 	}
2596 
2597 	/* Remove entries from the lcore and main ll. */
2598 	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2599 	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2600 
2601 	/* Set the dev_removal_flag on each lcore. */
2602 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2603 		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2604 	}
2605 
2606 	/*
2607 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2608 	 * they can no longer access the device removed from the linked lists and that the devices
2609 	 * are no longer in use.
2610 	 */
2611 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2612 		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2613 			rte_pause();
2614 		}
2615 	}
2616 
2617 	/* Add the entries back to the lcore and main free ll. */
2618 	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2619 	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2620 
2621 	/* Decrement the number of devices on the lcore. */
2622 	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
2623 
2624 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2625 
2626 	if (zero_copy) {
2627 		struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2628 
2629 		/* Stop the RX queue.
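		 * The RX and TX queues are stopped before mbuf_destroy_zcp()
		 * drains the attached buffers back to the ring, so no
		 * descriptor that still references guest memory is left in
		 * flight while the device is being torn down.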
		 */
2630 		if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2631 			LOG_DEBUG(VHOST_CONFIG,
2632 				"(%"PRIu64") In destroy_device: Failed to stop "
2633 				"rx queue:%d\n",
2634 				dev->device_fh,
2635 				dev->vmdq_rx_q);
2636 		}
2637 
2638 		LOG_DEBUG(VHOST_CONFIG,
2639 			"(%"PRIu64") in destroy_device: Start putting mbufs in "
2640 			"mempool back to ring for RX queue: %d\n",
2641 			dev->device_fh, dev->vmdq_rx_q);
2642 
2643 		mbuf_destroy_zcp(vpool);
2644 
2645 		/* Stop the TX queue. */
2646 		if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2647 			LOG_DEBUG(VHOST_CONFIG,
2648 				"(%"PRIu64") In destroy_device: Failed to "
2649 				"stop tx queue:%d\n",
2650 				dev->device_fh, dev->vmdq_rx_q);
2651 		}
2652 
2653 		vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
2654 
2655 		LOG_DEBUG(VHOST_CONFIG,
2656 			"(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2657 			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2658 			dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
2659 			dev->device_fh);
2660 
2661 		mbuf_destroy_zcp(vpool);
2662 	}
2663 
2664 }
2665 
2666 /*
2667  * A new device is added to a data core. First the device is added to the main linked list
2668  * and then allocated to a specific data core.
2669  */
2670 static int
2671 new_device (struct virtio_net *dev)
2672 {
2673 	struct virtio_net_data_ll *ll_dev;
2674 	int lcore, core_add = 0;
2675 	uint32_t device_num_min = num_devices;
2676 
2677 	/* Add device to main ll */
2678 	ll_dev = get_data_ll_free_entry(&ll_root_free);
2679 	if (ll_dev == NULL) {
2680 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2681 			"of %d devices per core has been reached\n",
2682 			dev->device_fh, num_devices);
2683 		return -1;
2684 	}
2685 	ll_dev->dev = dev;
2686 	add_data_ll_entry(&ll_root_used, ll_dev);
2687 	ll_dev->dev->vmdq_rx_q
2688 		= ll_dev->dev->device_fh * (num_queues / num_devices);
2689 
2690 	if (zero_copy) {
2691 		uint32_t index = ll_dev->dev->vmdq_rx_q;
2692 		uint32_t count_in_ring, i;
2693 		struct mbuf_table *tx_q;
2694 
2695 		count_in_ring = rte_ring_count(vpool_array[index].ring);
2696 
2697 		LOG_DEBUG(VHOST_CONFIG,
2698 			"(%"PRIu64") in new_device: mbuf count in mempool "
2699 			"before attach is: %d\n",
2700 			dev->device_fh,
2701 			rte_mempool_count(vpool_array[index].pool));
2702 		LOG_DEBUG(VHOST_CONFIG,
2703 			"(%"PRIu64") in new_device: mbuf count in ring "
2704 			"before attach is : %d\n",
2705 			dev->device_fh, count_in_ring);
2706 
2707 		/*
2708 		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2709 		 */
2710 		for (i = 0; i < count_in_ring; i++)
2711 			attach_rxmbuf_zcp(dev);
2712 
2713 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2714 			"mempool after attach is: %d\n",
2715 			dev->device_fh,
2716 			rte_mempool_count(vpool_array[index].pool));
2717 		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2718 			"ring after attach is : %d\n",
2719 			dev->device_fh,
2720 			rte_ring_count(vpool_array[index].ring));
2721 
2722 		tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2723 		tx_q->txq_id = dev->vmdq_rx_q;
2724 
2725 		if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2726 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2727 
2728 			LOG_DEBUG(VHOST_CONFIG,
2729 				"(%"PRIu64") In new_device: Failed to start "
2730 				"tx queue:%d\n",
2731 				dev->device_fh, dev->vmdq_rx_q);
2732 
2733 			mbuf_destroy_zcp(vpool);
2734 			return -1;
2735 		}
2736 
2737 		if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2738 			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2739 
2740 			LOG_DEBUG(VHOST_CONFIG,
2741 				"(%"PRIu64") In new_device: Failed to start "
2742 				"rx queue:%d\n",
2743 				dev->device_fh, dev->vmdq_rx_q);
2744 
2745 			/* Stop the TX queue. */
2746 			if (rte_eth_dev_tx_queue_stop(ports[0],
2747 				dev->vmdq_rx_q) != 0) {
2748 				LOG_DEBUG(VHOST_CONFIG,
2749 					"(%"PRIu64") In new_device: Failed to "
2750 					"stop tx queue:%d\n",
2751 					dev->device_fh, dev->vmdq_rx_q);
2752 			}
2753 
2754 			mbuf_destroy_zcp(vpool);
2755 			return -1;
2756 		}
2757 
2758 	}
2759 
2760 	/* Reset the ready flag. */
2761 	dev->ready = DEVICE_MAC_LEARNING;
2762 	dev->remove = 0;
2763 
2764 	/* Find a suitable lcore to add the device. */
2765 	RTE_LCORE_FOREACH_SLAVE(lcore) {
2766 		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2767 			device_num_min = lcore_info[lcore].lcore_ll->device_num;
2768 			core_add = lcore;
2769 		}
2770 	}
2771 	/* Add device to lcore ll */
2772 	ll_dev->dev->coreid = core_add;
2773 	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2774 	if (ll_dev == NULL) {
2775 		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2776 		dev->ready = DEVICE_SAFE_REMOVE;
2777 		destroy_device(dev);
2778 		return -1;
2779 	}
2780 	ll_dev->dev = dev;
2781 	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2782 
2783 	/* Initialize device stats */
2784 	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2785 
2786 	/* Disable notifications. */
2787 	set_irq_status(dev);
2788 	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
2789 	dev->flags |= VIRTIO_DEV_RUNNING;
2790 
2791 	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
2792 
2793 	return 0;
2794 }
2795 
2796 /*
2797  * These callbacks allow devices to be added to the data core when configuration
2798  * has fully completed.
2799  */
2800 static const struct virtio_net_device_ops virtio_net_device_ops =
2801 {
2802 	.new_device = new_device,
2803 	.destroy_device = destroy_device,
2804 };
2805 
2806 /*
2807  * This is a thread that will wake up after a period to print stats if the user
2808  * has enabled them.
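 * The sleep interval is the configured enable_stats value, in seconds.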
2809 */ 2810 static void 2811 print_stats(void) 2812 { 2813 struct virtio_net_data_ll *dev_ll; 2814 uint64_t tx_dropped, rx_dropped; 2815 uint64_t tx, tx_total, rx, rx_total; 2816 uint32_t device_fh; 2817 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2818 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2819 2820 while(1) { 2821 sleep(enable_stats); 2822 2823 /* Clear screen and move to top left */ 2824 printf("%s%s", clr, top_left); 2825 2826 printf("\nDevice statistics ===================================="); 2827 2828 dev_ll = ll_root_used; 2829 while (dev_ll != NULL) { 2830 device_fh = (uint32_t)dev_ll->dev->device_fh; 2831 tx_total = dev_statistics[device_fh].tx_total; 2832 tx = dev_statistics[device_fh].tx; 2833 tx_dropped = tx_total - tx; 2834 if (zero_copy == 0) { 2835 rx_total = rte_atomic64_read( 2836 &dev_statistics[device_fh].rx_total_atomic); 2837 rx = rte_atomic64_read( 2838 &dev_statistics[device_fh].rx_atomic); 2839 } else { 2840 rx_total = dev_statistics[device_fh].rx_total; 2841 rx = dev_statistics[device_fh].rx; 2842 } 2843 rx_dropped = rx_total - rx; 2844 2845 printf("\nStatistics for device %"PRIu32" ------------------------------" 2846 "\nTX total: %"PRIu64"" 2847 "\nTX dropped: %"PRIu64"" 2848 "\nTX successful: %"PRIu64"" 2849 "\nRX total: %"PRIu64"" 2850 "\nRX dropped: %"PRIu64"" 2851 "\nRX successful: %"PRIu64"", 2852 device_fh, 2853 tx_total, 2854 tx_dropped, 2855 tx, 2856 rx_total, 2857 rx_dropped, 2858 rx); 2859 2860 dev_ll = dev_ll->next; 2861 } 2862 printf("\n======================================================\n"); 2863 } 2864 } 2865 2866 static void 2867 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2868 char *ring_name, uint32_t nb_mbuf) 2869 { 2870 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2871 vpool_array[index].pool 2872 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2873 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2874 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2875 rte_pktmbuf_init, NULL, socket, 0); 2876 if (vpool_array[index].pool != NULL) { 2877 vpool_array[index].ring 2878 = rte_ring_create(ring_name, 2879 rte_align32pow2(nb_mbuf + 1), 2880 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2881 if (likely(vpool_array[index].ring != NULL)) { 2882 LOG_DEBUG(VHOST_CONFIG, 2883 "in setup_mempool_tbl: mbuf count in " 2884 "mempool is: %d\n", 2885 rte_mempool_count(vpool_array[index].pool)); 2886 LOG_DEBUG(VHOST_CONFIG, 2887 "in setup_mempool_tbl: mbuf count in " 2888 "ring is: %d\n", 2889 rte_ring_count(vpool_array[index].ring)); 2890 } else { 2891 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2892 ring_name); 2893 } 2894 2895 /* Need consider head room. */ 2896 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2897 } else { 2898 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2899 } 2900 } 2901 2902 2903 /* 2904 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2905 * device is also registered here to handle the IOCTLs. 
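 * Rough order of operations: EAL initialisation and argument parsing, PCI
 * probe, mempool/ring creation (depending on zero-copy mode), port
 * initialisation, linked list setup, the optional stats thread, launching
 * the data cores, and finally CUSE device registration and the CUSE
 * session loop.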
2906 */ 2907 int 2908 MAIN(int argc, char *argv[]) 2909 { 2910 struct rte_mempool *mbuf_pool = NULL; 2911 unsigned lcore_id, core_id = 0; 2912 unsigned nb_ports, valid_num_ports; 2913 int ret; 2914 uint8_t portid, queue_id = 0; 2915 static pthread_t tid; 2916 2917 /* init EAL */ 2918 ret = rte_eal_init(argc, argv); 2919 if (ret < 0) 2920 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2921 argc -= ret; 2922 argv += ret; 2923 2924 /* parse app arguments */ 2925 ret = us_vhost_parse_args(argc, argv); 2926 if (ret < 0) 2927 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2928 2929 if (rte_eal_pci_probe() != 0) 2930 rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n"); 2931 2932 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2933 if (rte_lcore_is_enabled(lcore_id)) 2934 lcore_ids[core_id ++] = lcore_id; 2935 2936 if (rte_lcore_count() > RTE_MAX_LCORE) 2937 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2938 2939 /*set the number of swithcing cores available*/ 2940 num_switching_cores = rte_lcore_count()-1; 2941 2942 /* Get the number of physical ports. */ 2943 nb_ports = rte_eth_dev_count(); 2944 if (nb_ports > RTE_MAX_ETHPORTS) 2945 nb_ports = RTE_MAX_ETHPORTS; 2946 2947 /* 2948 * Update the global var NUM_PORTS and global array PORTS 2949 * and get value of var VALID_NUM_PORTS according to system ports number 2950 */ 2951 valid_num_ports = check_ports_num(nb_ports); 2952 2953 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2954 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2955 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2956 return -1; 2957 } 2958 2959 if (zero_copy == 0) { 2960 /* Create the mbuf pool. */ 2961 mbuf_pool = rte_mempool_create( 2962 "MBUF_POOL", 2963 NUM_MBUFS_PER_PORT 2964 * valid_num_ports, 2965 MBUF_SIZE, MBUF_CACHE_SIZE, 2966 sizeof(struct rte_pktmbuf_pool_private), 2967 rte_pktmbuf_pool_init, NULL, 2968 rte_pktmbuf_init, NULL, 2969 rte_socket_id(), 0); 2970 if (mbuf_pool == NULL) 2971 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2972 2973 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2974 vpool_array[queue_id].pool = mbuf_pool; 2975 2976 if (vm2vm_mode == VM2VM_HARDWARE) { 2977 /* Enable VT loop back to let L2 switch to do it. 
*/ 2978 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2979 LOG_DEBUG(VHOST_CONFIG, 2980 "Enable loop back for L2 switch in vmdq.\n"); 2981 } 2982 } else { 2983 uint32_t nb_mbuf; 2984 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2985 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2986 2987 rx_conf_default.start_rx_per_q = (uint8_t)zero_copy; 2988 rx_conf_default.rx_drop_en = 0; 2989 tx_conf_default.start_tx_per_q = (uint8_t)zero_copy; 2990 nb_mbuf = num_rx_descriptor 2991 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2992 + num_switching_cores * MAX_PKT_BURST; 2993 2994 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2995 snprintf(pool_name, sizeof(pool_name), 2996 "rxmbuf_pool_%u", queue_id); 2997 snprintf(ring_name, sizeof(ring_name), 2998 "rxmbuf_ring_%u", queue_id); 2999 setup_mempool_tbl(rte_socket_id(), queue_id, 3000 pool_name, ring_name, nb_mbuf); 3001 } 3002 3003 nb_mbuf = num_tx_descriptor 3004 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3005 + num_switching_cores * MAX_PKT_BURST; 3006 3007 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3008 snprintf(pool_name, sizeof(pool_name), 3009 "txmbuf_pool_%u", queue_id); 3010 snprintf(ring_name, sizeof(ring_name), 3011 "txmbuf_ring_%u", queue_id); 3012 setup_mempool_tbl(rte_socket_id(), 3013 (queue_id + MAX_QUEUES), 3014 pool_name, ring_name, nb_mbuf); 3015 } 3016 3017 if (vm2vm_mode == VM2VM_HARDWARE) { 3018 /* Enable VT loop back to let L2 switch to do it. */ 3019 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3020 LOG_DEBUG(VHOST_CONFIG, 3021 "Enable loop back for L2 switch in vmdq.\n"); 3022 } 3023 } 3024 /* Set log level. */ 3025 rte_set_log_level(LOG_LEVEL); 3026 3027 /* initialize all ports */ 3028 for (portid = 0; portid < nb_ports; portid++) { 3029 /* skip ports that are not enabled */ 3030 if ((enabled_port_mask & (1 << portid)) == 0) { 3031 RTE_LOG(INFO, VHOST_PORT, 3032 "Skipping disabled port %d\n", portid); 3033 continue; 3034 } 3035 if (port_init(portid) != 0) 3036 rte_exit(EXIT_FAILURE, 3037 "Cannot initialize network ports\n"); 3038 } 3039 3040 /* Initialise all linked lists. */ 3041 if (init_data_ll() == -1) 3042 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3043 3044 /* Initialize device stats */ 3045 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3046 3047 /* Enable stats if the user option is set. */ 3048 if (enable_stats) 3049 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3050 3051 /* Launch all data cores. */ 3052 if (zero_copy == 0) { 3053 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3054 rte_eal_remote_launch(switch_worker, 3055 mbuf_pool, lcore_id); 3056 } 3057 } else { 3058 uint32_t count_in_mempool, index, i; 3059 for (index = 0; index < 2*MAX_QUEUES; index++) { 3060 /* For all RX and TX queues. */ 3061 count_in_mempool 3062 = rte_mempool_count(vpool_array[index].pool); 3063 3064 /* 3065 * Transfer all un-attached mbufs from vpool.pool 3066 * to vpoo.ring. 
3067 */ 3068 for (i = 0; i < count_in_mempool; i++) { 3069 struct rte_mbuf *mbuf 3070 = __rte_mbuf_raw_alloc( 3071 vpool_array[index].pool); 3072 rte_ring_sp_enqueue(vpool_array[index].ring, 3073 (void *)mbuf); 3074 } 3075 3076 LOG_DEBUG(VHOST_CONFIG, 3077 "in MAIN: mbuf count in mempool at initial " 3078 "is: %d\n", count_in_mempool); 3079 LOG_DEBUG(VHOST_CONFIG, 3080 "in MAIN: mbuf count in ring at initial is :" 3081 " %d\n", 3082 rte_ring_count(vpool_array[index].ring)); 3083 } 3084 3085 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3086 rte_eal_remote_launch(switch_worker_zcp, NULL, 3087 lcore_id); 3088 } 3089 3090 /* Register CUSE device to handle IOCTLs. */ 3091 ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks()); 3092 if (ret != 0) 3093 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3094 3095 init_virtio_net(&virtio_net_device_ops); 3096 3097 /* Start CUSE session. */ 3098 start_cuse_session_loop(); 3099 return 0; 3100 3101 } 3102 3103