/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "main.h"
#include "virtio-net.h"
#include "vhost-net-cdev.h"

#define MAX_QUEUES 128

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +	\
				(num_switching_cores*MAX_PKT_BURST) +	\
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
				(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16	/* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and the DPDK based front end:
 * max vring avail descriptors/entries from guest - MAX_PKT_BURST,
 * then adjusted to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32	/* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64	/* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support. */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMAed directly to and from the
 * hardware descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* Number of descriptors to apply. */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
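/*
 * Zero copy bookkeeping (as used later in this file): vpool entry q backs
 * the RX path for VMDQ queue q, while entry MAX_QUEUES + q backs the TX
 * path for the same queue (see attach_rxmbuf_zcp() and
 * virtio_tx_route_zcp()). The vring descriptor index an mbuf is attached
 * to is stashed in its headroom via MBUF_HEADROOM_UINT32() so it can be
 * returned to the used ring later.
 */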
/*
 * Enable VM2VM communications. If this is disabled then the MAC address
 * compare is skipped.
 */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
	.rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest cannot
		 * forward packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * Should be overridden separately in code with
		 * appropriate values.
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports. */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char	h_dest[ETH_ALEN];
	unsigned char	h_source[ETH_ALEN];
	__be16		h_vlan_proto;
	__be16		h_vlan_TCI;
	__be16		h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN	4
#define VLAN_ETH_HLEN	18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number gotten from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"	--vm2vm [0|1|2]\n"
	"	--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"	--dev-basename <name> --dev-index [0-N]\n"
	"	--nb-devices ND\n"
	"	-p PORTMASK: Set mask for ports to be used by application\n"
	"	--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"	--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"	--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if rx retries are enabled\n"
	"	--rx-retry-num [0-N]: the number of retries on rx. Only effective if rx retries are enabled\n"
	"	--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"	--dev-basename: The basename to be used for the character device.\n"
	"	--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
	"	--zero-copy [0|1]: disable(default)/enable rx/tx "
		"zero copy\n"
	"	--rx-desc-num [0-N]: the number of descriptors on rx, "
		"used only when zero copy is enabled.\n"
	"	--tx-desc-num [0-N]: the number of descriptors on tx, "
		"used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"dev-index", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
						VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Set character device index. */
			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					dev_index = ret;
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * ports in the system and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
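/*
 * The zero copy data path needs host physical addresses so guest buffers
 * can be handed to the NIC directly. gpa_to_hpa() below reports, via
 * hpa_type, whether a buffer is physically contiguous; buffers that cross
 * a sub-region boundary (PHYS_ADDR_CROSS_SUBREG) are skipped on the RX
 * path (attach_rxmbuf_zcp()) and copied on the TX path
 * (virtio_dev_tx_zcp() sets need_copy).
 */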
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings. */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* Get the used devices list. */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (dev_ll->vdev->remove) {
				/* Drop the packet if the device is marked for removal. */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				uint32_t mergeable =
					dev_ll->dev->features &
					(1 << VIRTIO_NET_F_MRG_RXBUF);

				/* Send the packet to the local virtio device. */
				if (likely(mergeable == 0))
					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
				else
					ret = virtio_dev_merge_rx(dev_ll->dev,
						&m, 1);

				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}
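/*
 * Note on VM2VM modes: in VM2VM_SOFTWARE mode virtio_tx_local() above
 * delivers the packet straight into the destination guest's RX virtqueue.
 * In VM2VM_HARDWARE mode the packet is instead tagged with the destination
 * VM's VLAN (see below) and looped back by the NIC, relying on the VMDQ
 * loopback setting copied in get_eth_conf().
 */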
/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct vlan_ethhdr *vlan_hdr;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf, *prev;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	struct virtio_net *dev = vdev->dev;

	/* Check if the destination is a local VM. */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (vm2vm_mode == VM2VM_HARDWARE) {
		while (dev_ll != NULL) {
			if ((dev_ll->vdev->ready == DEVICE_RX)
				&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
				/*
				 * Drop the packet if the TX packet is
				 * destined for the TX device.
				 */
				if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
					LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->device_fh);
					return;
				}
				offset = 4;
				vlan_tag =
					(uint16_t)
					vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

				LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				vlan_tag);

				break;
			}
			dev_ll = dev_ll->next;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue. */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	mbuf = rte_pktmbuf_alloc(mbuf_pool);
	if (unlikely(mbuf == NULL)) {
		RTE_LOG(ERR, VHOST_DATA,
			"Failed to allocate memory for mbuf.\n");
		return;
	}

	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
	mbuf->nb_segs = m->nb_segs;

	/* Copy ethernet header to mbuf. */
	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
		rte_pktmbuf_mtod(m, const void *),
		ETH_HLEN);

	/* Setup vlan header. Bytes need to be re-ordered for network with htons(). */
	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
	vlan_hdr->h_vlan_TCI = htons(vlan_tag);

	/* Copy the remaining packet contents to the mbuf. */
	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
		(m->data_len - ETH_HLEN));

	/* Copy the remaining segments for the whole packet. */
	prev = mbuf;
	while (m->next) {
		/* Allocate an mbuf and populate the structure. */
		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(next_mbuf == NULL)) {
			rte_pktmbuf_free(mbuf);
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			return;
		}

		m = m->next;
		prev->next = next_mbuf;
		prev = next_mbuf;
		next_mbuf->data_len = m->data_len;

		/* Copy data to next mbuf. */
		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
			rte_pktmbuf_mtod(m, const void *), m->data_len);
	}

	tx_q->m_table[len] = mbuf;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint32_t mergeable = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue. */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						(struct rte_mbuf **)tx_q->m_table,
						(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* Get virtio device ID. */
			vdev = dev_ll->vdev;
			dev = vdev->dev;
			mergeable =
				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);

			if (vdev->remove) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX. */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					if (likely(mergeable == 0))
						ret_count =
							virtio_dev_rx(dev,
							pkts_burst, rx_count);
					else
						ret_count =
							virtio_dev_merge_rx(dev,
							pkts_burst, rx_count);

					if (enable_stats) {
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
						rx_count);
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (!vdev->remove) {
				/* Handle guest TX. */
				if (likely(mergeable == 0))
					virtio_dev_tx(dev, mbuf_pool);
				else
					virtio_dev_merge_tx(dev, mbuf_pool);
			}

			/* Move to the next device in the list. */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets the available ring index for zero copy rx;
 * it will retry 'burst_rx_retry_num' times till it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* Check that we have enough buffers. */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put in the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}

/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}
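/*
 * Zero copy mbuf lifecycle: free mbufs wait, detached, on vpool->ring.
 * attach_rxmbuf_zcp() dequeues one and points it at a guest descriptor,
 * while txmbuf_clean_zcp() and mbuf_destroy_zcp() pull mbufs back out of
 * vpool->pool, detach them with pktmbuf_detach_zcp() and return them to
 * the ring.
 */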
/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them and puts them into
 * vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function fills the used ring entries for the zero copy RX path and
 * updates the used index.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue. */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a vlan tag from any vm for an external pkt,
		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
		 * with pool selection: the MAC address identifies it as an
		 * external pkt that should go to the network, while the vlan
		 * tag identifies it as a vm2vm pkt that should be forwarded
		 * to another vm. The hardware cannot resolve such an
		 * ambiguous situation, so the pkt would be lost.
		 */
		vlan_tag = external_pkt_default_vlan_tag;
		while (dev_ll != NULL) {
			if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
				ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

				/*
				 * Drop the packet if the TX packet is destined
				 * for the TX device.
				 */
				if (unlikely(dev_ll->vdev->dev->device_fh
					== dev->device_fh)) {
					LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination "
					"MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
					MBUF_HEADROOM_UINT32(mbuf)
						= (uint32_t)desc_idx;
					__rte_mbuf_raw_free(mbuf);
					return;
				}

				/*
				 * Packet length offset 4 bytes for HW vlan
				 * strip when L2 switch back.
				 */
				offset = 4;
				vlan_tag =
					(uint16_t)
					vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

				LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				vlan_tag);

				break;
			}
			dev_ll = dev_ll->next;
		}
	}

	mbuf->nb_segs = m->nb_segs;
	mbuf->next = m->next;
	mbuf->data_len = m->data_len + offset;
	mbuf->pkt_len = mbuf->data_len;
	if (unlikely(need_copy)) {
		/* Copy the packet contents to the mbuf. */
*/ 1865 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1866 rte_pktmbuf_mtod(m, void *), 1867 m->data_len); 1868 } else { 1869 mbuf->data_off = m->data_off; 1870 mbuf->buf_physaddr = m->buf_physaddr; 1871 mbuf->buf_addr = m->buf_addr; 1872 } 1873 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1874 mbuf->vlan_tci = vlan_tag; 1875 mbuf->l2_len = sizeof(struct ether_hdr); 1876 mbuf->l3_len = sizeof(struct ipv4_hdr); 1877 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1878 1879 tx_q->m_table[len] = mbuf; 1880 len++; 1881 1882 LOG_DEBUG(VHOST_DATA, 1883 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1884 dev->device_fh, 1885 mbuf->nb_segs, 1886 (mbuf->next == NULL) ? "null" : "non-null"); 1887 1888 if (enable_stats) { 1889 dev_statistics[dev->device_fh].tx_total++; 1890 dev_statistics[dev->device_fh].tx++; 1891 } 1892 1893 if (unlikely(len == MAX_PKT_BURST)) { 1894 m_table = (struct rte_mbuf **)tx_q->m_table; 1895 ret = rte_eth_tx_burst(ports[0], 1896 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1897 1898 /* 1899 * Free any buffers not handled by TX and update 1900 * the port stats. 1901 */ 1902 if (unlikely(ret < len)) { 1903 do { 1904 rte_pktmbuf_free(m_table[ret]); 1905 } while (++ret < len); 1906 } 1907 1908 len = 0; 1909 txmbuf_clean_zcp(dev, vpool); 1910 } 1911 1912 tx_q->len = len; 1913 1914 return; 1915 } 1916 1917 /* 1918 * This function transmits all available packets in the virtio TX queue of 1919 * one virtio-net device. If it is the first packet, it learns the MAC 1920 * address and sets up the VMDQ queue. 1921 */ 1922 static inline void __attribute__((always_inline)) 1923 virtio_dev_tx_zcp(struct virtio_net *dev) 1924 { 1925 struct rte_mbuf m; 1926 struct vhost_virtqueue *vq; 1927 struct vring_desc *desc; 1928 uint64_t buff_addr = 0, phys_addr; 1929 uint32_t head[MAX_PKT_BURST]; 1930 uint32_t i; 1931 uint16_t free_entries, packet_success = 0; 1932 uint16_t avail_idx; 1933 uint8_t need_copy = 0; 1934 hpa_type addr_type; 1935 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1936 1937 vq = dev->virtqueue[VIRTIO_TXQ]; 1938 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1939 1940 /* If there are no available buffers then return. */ 1941 if (vq->last_used_idx_res == avail_idx) 1942 return; 1943 1944 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1945 1946 /* Prefetch available ring to retrieve head indexes. */ 1947 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1948 1949 /* Get the number of free entries in the ring */ 1950 free_entries = (avail_idx - vq->last_used_idx_res); 1951 1952 /* Limit to MAX_PKT_BURST. */ 1953 free_entries 1954 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1955 1956 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1957 dev->device_fh, free_entries); 1958 1959 /* Retrieve all of the head indexes first to avoid caching issues. */ 1960 for (i = 0; i < free_entries; i++) 1961 head[i] 1962 = vq->avail->ring[(vq->last_used_idx_res + i) 1963 & (vq->size - 1)]; 1964 1965 vq->last_used_idx_res += free_entries; 1966 1967 /* Prefetch descriptor index. */ 1968 rte_prefetch0(&vq->desc[head[packet_success]]); 1969 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1970 1971 while (packet_success < free_entries) { 1972 desc = &vq->desc[head[packet_success]]; 1973 1974 /* Discard the first buffer as it is the virtio header. */ 1975 desc = &vq->desc[desc->next]; 1976 1977 /* Buffer address translation.
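* desc->addr is a guest physical address: gpa_to_vva() converts it to a host virtual address for the dummy mbuf, while gpa_to_hpa() provides the host physical address used for zero-copy DMA.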
*/ 1978 buff_addr = gpa_to_vva(dev, desc->addr); 1979 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1980 1981 if (likely(packet_success < (free_entries - 1))) 1982 /* Prefetch descriptor index. */ 1983 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1984 1985 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1986 RTE_LOG(ERR, VHOST_DATA, 1987 "(%"PRIu64") Invalid frame buffer address found " 1988 "when transmitting packets!\n", 1989 dev->device_fh); 1990 packet_success++; 1991 continue; 1992 } 1993 1994 /* Prefetch buffer address. */ 1995 rte_prefetch0((void *)(uintptr_t)buff_addr); 1996 1997 /* 1998 * Set up a dummy mbuf. This is copied to a real mbuf if 1999 * transmitted out of the physical port. 2000 */ 2001 m.data_len = desc->len; 2002 m.nb_segs = 1; 2003 m.next = NULL; 2004 m.data_off = 0; 2005 m.buf_addr = (void *)(uintptr_t)buff_addr; 2006 m.buf_physaddr = phys_addr; 2007 2008 /* 2009 * Check whether the frame buffer address from the guest crosses 2010 * a sub-region boundary or not. 2011 */ 2012 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2013 RTE_LOG(ERR, VHOST_DATA, 2014 "(%"PRIu64") Frame buffer address crossing a " 2015 "sub-region found when attaching TX frame " 2016 "buffer address!\n", 2017 dev->device_fh); 2018 need_copy = 1; 2019 } else 2020 need_copy = 0; 2021 2022 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2023 2024 /* 2025 * If this is the first received packet we need to learn 2026 * the MAC address and set up VMDQ. 2027 */ 2028 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2029 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2030 /* 2031 * Discard frame if device is scheduled for 2032 * removal or a duplicate MAC address is found. 2033 */ 2034 packet_success += free_entries; 2035 vq->last_used_idx += packet_success; 2036 break; 2037 } 2038 } 2039 2040 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2041 packet_success++; 2042 } 2043 } 2044 2045 /* 2046 * This function is called by each data core. It handles all RX/TX registered 2047 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2048 * addresses are compared with all devices in the main linked list. 2049 */ 2050 static int 2051 switch_worker_zcp(__attribute__((unused)) void *arg) 2052 { 2053 struct virtio_net *dev = NULL; 2054 struct vhost_dev *vdev = NULL; 2055 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2056 struct virtio_net_data_ll *dev_ll; 2057 struct mbuf_table *tx_q; 2058 volatile struct lcore_ll_info *lcore_ll; 2059 const uint64_t drain_tsc 2060 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2061 * BURST_TX_DRAIN_US; 2062 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2063 unsigned ret; 2064 const uint16_t lcore_id = rte_lcore_id(); 2065 uint16_t count_in_ring, rx_count = 0; 2066 2067 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id); 2068 2069 lcore_ll = lcore_info[lcore_id].lcore_ll; 2070 prev_tsc = 0; 2071 2072 while (1) { 2073 cur_tsc = rte_rdtsc(); 2074 2075 /* TX burst queue drain */ 2076 diff_tsc = cur_tsc - prev_tsc; 2077 if (unlikely(diff_tsc > drain_tsc)) { 2078 /* 2079 * Get mbufs from vpool.pool, detach them and 2080 * put them back into vpool.ring.
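* The drain below also transmits any packets still queued in the zero-copy TX queue once the BURST_TX_DRAIN_US interval has elapsed.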
2081 */ 2082 dev_ll = lcore_ll->ll_root_used; 2083 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2084 /* Get virtio device ID */ 2085 vdev = dev_ll->vdev; 2086 dev = vdev->dev; 2087 2088 if (likely(!vdev->remove)) { 2089 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2090 if (tx_q->len) { 2091 LOG_DEBUG(VHOST_DATA, 2092 "TX queue drained after timeout" 2093 " with burst size %u\n", 2094 tx_q->len); 2095 2096 /* 2097 * Tx any packets in the queue 2098 */ 2099 ret = rte_eth_tx_burst( 2100 ports[0], 2101 (uint16_t)tx_q->txq_id, 2102 (struct rte_mbuf **) 2103 tx_q->m_table, 2104 (uint16_t)tx_q->len); 2105 if (unlikely(ret < tx_q->len)) { 2106 do { 2107 rte_pktmbuf_free( 2108 tx_q->m_table[ret]); 2109 } while (++ret < tx_q->len); 2110 } 2111 tx_q->len = 0; 2112 2113 txmbuf_clean_zcp(dev, 2114 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2115 } 2116 } 2117 dev_ll = dev_ll->next; 2118 } 2119 prev_tsc = cur_tsc; 2120 } 2121 2122 rte_prefetch0(lcore_ll->ll_root_used); 2123 2124 /* 2125 * Inform the configuration core that we have exited the linked 2126 * list and that no devices are in use if requested. 2127 */ 2128 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2129 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2130 2131 /* Process devices */ 2132 dev_ll = lcore_ll->ll_root_used; 2133 2134 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2135 vdev = dev_ll->vdev; 2136 dev = vdev->dev; 2137 if (unlikely(vdev->remove)) { 2138 dev_ll = dev_ll->next; 2139 unlink_vmdq(vdev); 2140 vdev->ready = DEVICE_SAFE_REMOVE; 2141 continue; 2142 } 2143 2144 if (likely(vdev->ready == DEVICE_RX)) { 2145 uint32_t index = vdev->vmdq_rx_q; 2146 uint16_t i; 2147 count_in_ring 2148 = rte_ring_count(vpool_array[index].ring); 2149 uint16_t free_entries 2150 = (uint16_t)get_available_ring_num_zcp(dev); 2151 2152 /* 2153 * Attach all mbufs in vpool.ring and put back 2154 * into vpool.pool. 2155 */ 2156 for (i = 0; 2157 i < RTE_MIN(free_entries, 2158 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2159 i++) 2160 attach_rxmbuf_zcp(dev); 2161 2162 /* Handle guest RX */ 2163 rx_count = rte_eth_rx_burst(ports[0], 2164 vdev->vmdq_rx_q, pkts_burst, 2165 MAX_PKT_BURST); 2166 2167 if (rx_count) { 2168 ret_count = virtio_dev_rx_zcp(dev, 2169 pkts_burst, rx_count); 2170 if (enable_stats) { 2171 dev_statistics[dev->device_fh].rx_total 2172 += rx_count; 2173 dev_statistics[dev->device_fh].rx 2174 += ret_count; 2175 } 2176 while (likely(rx_count)) { 2177 rx_count--; 2178 pktmbuf_detach_zcp( 2179 pkts_burst[rx_count]); 2180 rte_ring_sp_enqueue( 2181 vpool_array[index].ring, 2182 (void *)pkts_burst[rx_count]); 2183 } 2184 } 2185 } 2186 2187 if (likely(!vdev->remove)) 2188 /* Handle guest TX */ 2189 virtio_dev_tx_zcp(dev); 2190 2191 /* Move to the next device in the list */ 2192 dev_ll = dev_ll->next; 2193 } 2194 } 2195 2196 return 0; 2197 } 2198 2199 2200 /* 2201 * Add an entry to a used linked list. A free entry must first be found 2202 * in the free linked list using get_data_ll_free_entry(); 2203 */ 2204 static void 2205 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2206 struct virtio_net_data_ll *ll_dev) 2207 { 2208 struct virtio_net_data_ll *ll = *ll_root_addr; 2209 2210 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2211 ll_dev->next = NULL; 2212 rte_compiler_barrier(); 2213 2214 /* If ll == NULL then this is the first device. */ 2215 if (ll) { 2216 /* Increment to the tail of the linked list. 
*/ 2217 while (ll->next != NULL) 2218 ll = ll->next; 2219 2220 ll->next = ll_dev; 2221 } else { 2222 *ll_root_addr = ll_dev; 2223 } 2224 } 2225 2226 /* 2227 * Remove an entry from a used linked list. The entry must then be added to 2228 * the free linked list using put_data_ll_free_entry(). 2229 */ 2230 static void 2231 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2232 struct virtio_net_data_ll *ll_dev, 2233 struct virtio_net_data_ll *ll_dev_last) 2234 { 2235 struct virtio_net_data_ll *ll = *ll_root_addr; 2236 2237 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2238 return; 2239 2240 if (ll_dev == ll) 2241 *ll_root_addr = ll_dev->next; 2242 else 2243 if (likely(ll_dev_last != NULL)) 2244 ll_dev_last->next = ll_dev->next; 2245 else 2246 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n"); 2247 } 2248 2249 /* 2250 * Find and return an entry from the free linked list. 2251 */ 2252 static struct virtio_net_data_ll * 2253 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2254 { 2255 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2256 struct virtio_net_data_ll *ll_dev; 2257 2258 if (ll_free == NULL) 2259 return NULL; 2260 2261 ll_dev = ll_free; 2262 *ll_root_addr = ll_free->next; 2263 2264 return ll_dev; 2265 } 2266 2267 /* 2268 * Place an entry back on to the free linked list. 2269 */ 2270 static void 2271 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2272 struct virtio_net_data_ll *ll_dev) 2273 { 2274 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2275 2276 if (ll_dev == NULL) 2277 return; 2278 2279 ll_dev->next = ll_free; 2280 *ll_root_addr = ll_dev; 2281 } 2282 2283 /* 2284 * Creates a linked list of a given size. 2285 */ 2286 static struct virtio_net_data_ll * 2287 alloc_data_ll(uint32_t size) 2288 { 2289 struct virtio_net_data_ll *ll_new; 2290 uint32_t i; 2291 2292 /* Malloc and then chain the linked list. */ 2293 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2294 if (ll_new == NULL) { 2295 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2296 return NULL; 2297 } 2298 2299 for (i = 0; i < size - 1; i++) { 2300 ll_new[i].vdev = NULL; 2301 ll_new[i].next = &ll_new[i+1]; 2302 } 2303 ll_new[i].next = NULL; 2304 2305 return (ll_new); 2306 } 2307 2308 /* 2309 * Create the main linked list along with each individual core's linked list. A used and a free list 2310 * are created to manage entries. 2311 */ 2312 static int 2313 init_data_ll (void) 2314 { 2315 int lcore; 2316 2317 RTE_LCORE_FOREACH_SLAVE(lcore) { 2318 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2319 if (lcore_info[lcore].lcore_ll == NULL) { 2320 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2321 return -1; 2322 } 2323 2324 lcore_info[lcore].lcore_ll->device_num = 0; 2325 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2326 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2327 if (num_devices % num_switching_cores) 2328 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2329 else 2330 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2331 } 2332 2333 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2334 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2335 2336 return 0; 2337 } 2338 2339 /* 2340 * Set virtqueue flags so that we do not receive interrupts.
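* Writing VRING_USED_F_NO_NOTIFY into the used ring flags of both virtqueues tells the guest that it does not need to notify the host on every avail ring update.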
2341 */ 2342 static void 2343 set_irq_status (struct virtio_net *dev) 2344 { 2345 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2346 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2347 } 2348 2349 /* 2350 * Remove a device from the specific data core linked list and from the main linked list. Synchronization 2351 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering 2352 * of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop. 2353 */ 2354 static void 2355 destroy_device (volatile struct virtio_net *dev) 2356 { 2357 struct virtio_net_data_ll *ll_lcore_dev_cur; 2358 struct virtio_net_data_ll *ll_main_dev_cur; 2359 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2360 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2361 struct vhost_dev *vdev; 2362 int lcore; 2363 2364 dev->flags &= ~VIRTIO_DEV_RUNNING; 2365 2366 vdev = (struct vhost_dev *)dev->priv; 2367 /* Set the remove flag. */ 2368 vdev->remove = 1; 2369 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2370 rte_pause(); 2371 } 2372 2373 /* Search for entry to be removed from lcore ll */ 2374 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2375 while (ll_lcore_dev_cur != NULL) { 2376 if (ll_lcore_dev_cur->vdev == vdev) { 2377 break; 2378 } else { 2379 ll_lcore_dev_last = ll_lcore_dev_cur; 2380 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2381 } 2382 } 2383 2384 if (ll_lcore_dev_cur == NULL) { 2385 RTE_LOG(ERR, VHOST_CONFIG, 2386 "(%"PRIu64") Failed to find the dev to be destroyed.\n", 2387 dev->device_fh); 2388 return; 2389 } 2390 2391 /* Search for entry to be removed from main ll */ 2392 ll_main_dev_cur = ll_root_used; 2393 ll_main_dev_last = NULL; 2394 while (ll_main_dev_cur != NULL) { 2395 if (ll_main_dev_cur->vdev == vdev) { 2396 break; 2397 } else { 2398 ll_main_dev_last = ll_main_dev_cur; 2399 ll_main_dev_cur = ll_main_dev_cur->next; 2400 } 2401 } 2402 2403 /* Remove entries from the lcore and main ll. */ 2404 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2405 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2406 2407 /* Set the dev_removal_flag on each lcore. */ 2408 RTE_LCORE_FOREACH_SLAVE(lcore) { 2409 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2410 } 2411 2412 /* 2413 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2414 * they can no longer access the device removed from the linked lists and that the devices 2415 * are no longer in use. 2416 */ 2417 RTE_LCORE_FOREACH_SLAVE(lcore) { 2418 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2419 rte_pause(); 2420 } 2421 } 2422 2423 /* Add the entries back to the lcore and main free ll.*/ 2424 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2425 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2426 2427 /* Decrement the number of devices on the lcore. */ 2428 lcore_info[vdev->coreid].lcore_ll->device_num--; 2429 2430 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2431 2432 if (zero_copy) { 2433 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2434 2435 /* Stop the RX queue.
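* In zero-copy mode each device owns a dedicated VMDQ RX/TX queue pair, so both queues are stopped and their mbufs are returned from the mempool to the ring before the device memory is freed.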
*/ 2436 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2437 LOG_DEBUG(VHOST_CONFIG, 2438 "(%"PRIu64") In destroy_device: Failed to stop " 2439 "rx queue:%d\n", 2440 dev->device_fh, 2441 vdev->vmdq_rx_q); 2442 } 2443 2444 LOG_DEBUG(VHOST_CONFIG, 2445 "(%"PRIu64") in destroy_device: Start put mbuf in " 2446 "mempool back to ring for RX queue: %d\n", 2447 dev->device_fh, vdev->vmdq_rx_q); 2448 2449 mbuf_destroy_zcp(vpool); 2450 2451 /* Stop the TX queue. */ 2452 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2453 LOG_DEBUG(VHOST_CONFIG, 2454 "(%"PRIu64") In destroy_device: Failed to " 2455 "stop tx queue:%d\n", 2456 dev->device_fh, vdev->vmdq_rx_q); 2457 } 2458 2459 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2460 2461 LOG_DEBUG(VHOST_CONFIG, 2462 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2463 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2464 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2465 dev->device_fh); 2466 2467 mbuf_destroy_zcp(vpool); 2468 } 2469 rte_free(vdev); 2470 2471 } 2472 2473 /* 2474 * A new device is added to a data core. First the device is added to the main linked list 2475 * and then allocated to a specific data core. 2476 */ 2477 static int 2478 new_device (struct virtio_net *dev) 2479 { 2480 struct virtio_net_data_ll *ll_dev; 2481 int lcore, core_add = 0; 2482 uint32_t device_num_min = num_devices; 2483 struct vhost_dev *vdev; 2484 2485 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2486 if (vdev == NULL) { 2487 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2488 dev->device_fh); 2489 return -1; 2490 } 2491 vdev->dev = dev; 2492 dev->priv = vdev; 2493 2494 /* Add device to main ll */ 2495 ll_dev = get_data_ll_free_entry(&ll_root_free); 2496 if (ll_dev == NULL) { 2497 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2498 "of %d devices per core has been reached\n", 2499 dev->device_fh, num_devices); 2500 rte_free(vdev); 2501 return -1; 2502 } 2503 ll_dev->vdev = vdev; 2504 add_data_ll_entry(&ll_root_used, ll_dev); 2505 vdev->vmdq_rx_q 2506 = dev->device_fh * (num_queues / num_devices); 2507 2508 if (zero_copy) { 2509 uint32_t index = vdev->vmdq_rx_q; 2510 uint32_t count_in_ring, i; 2511 struct mbuf_table *tx_q; 2512 2513 count_in_ring = rte_ring_count(vpool_array[index].ring); 2514 2515 LOG_DEBUG(VHOST_CONFIG, 2516 "(%"PRIu64") in new_device: mbuf count in mempool " 2517 "before attach is: %d\n", 2518 dev->device_fh, 2519 rte_mempool_count(vpool_array[index].pool)); 2520 LOG_DEBUG(VHOST_CONFIG, 2521 "(%"PRIu64") in new_device: mbuf count in ring " 2522 "before attach is : %d\n", 2523 dev->device_fh, count_in_ring); 2524 2525 /* 2526 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
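* Attaching maps each mbuf in the ring onto a guest-supplied descriptor buffer so that received frames are placed directly into guest memory.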
2527 */ 2528 for (i = 0; i < count_in_ring; i++) 2529 attach_rxmbuf_zcp(dev); 2530 2531 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2532 "mempool after attach is: %d\n", 2533 dev->device_fh, 2534 rte_mempool_count(vpool_array[index].pool)); 2535 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2536 "ring after attach is : %d\n", 2537 dev->device_fh, 2538 rte_ring_count(vpool_array[index].ring)); 2539 2540 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2541 tx_q->txq_id = vdev->vmdq_rx_q; 2542 2543 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2544 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2545 2546 LOG_DEBUG(VHOST_CONFIG, 2547 "(%"PRIu64") In new_device: Failed to start " 2548 "tx queue:%d\n", 2549 dev->device_fh, vdev->vmdq_rx_q); 2550 2551 mbuf_destroy_zcp(vpool); 2552 rte_free(vdev); 2553 return -1; 2554 } 2555 2556 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2557 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2558 2559 LOG_DEBUG(VHOST_CONFIG, 2560 "(%"PRIu64") In new_device: Failed to start " 2561 "rx queue:%d\n", 2562 dev->device_fh, vdev->vmdq_rx_q); 2563 2564 /* Stop the TX queue. */ 2565 if (rte_eth_dev_tx_queue_stop(ports[0], 2566 vdev->vmdq_rx_q) != 0) { 2567 LOG_DEBUG(VHOST_CONFIG, 2568 "(%"PRIu64") In new_device: Failed to " 2569 "stop tx queue:%d\n", 2570 dev->device_fh, vdev->vmdq_rx_q); 2571 } 2572 2573 mbuf_destroy_zcp(vpool); 2574 rte_free(vdev); 2575 return -1; 2576 } 2577 2578 } 2579 2580 /* Reset the ready flag. */ 2581 vdev->ready = DEVICE_MAC_LEARNING; 2582 vdev->remove = 0; 2583 2584 /* Find a suitable lcore to add the device. */ 2585 RTE_LCORE_FOREACH_SLAVE(lcore) { 2586 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2587 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2588 core_add = lcore; 2589 } 2590 } 2591 /* Add device to lcore ll */ 2592 ll_dev->dev->coreid = core_add; 2593 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free); 2594 if (ll_dev == NULL) { 2595 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2596 vdev->ready = DEVICE_SAFE_REMOVE; 2597 destroy_device(dev); 2598 rte_free(vdev); 2599 return -1; 2600 } 2601 ll_dev->vdev = vdev; 2602 vdev->coreid = core_add; 2603 2604 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev); 2605 2606 /* Initialize device stats */ 2607 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2608 2609 /* Disable notifications. */ 2610 set_irq_status(dev); 2611 lcore_info[vdev->coreid].lcore_ll->device_num++; 2612 dev->flags |= VIRTIO_DEV_RUNNING; 2613 2614 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2615 2616 return 0; 2617 } 2618 2619 /* 2620 * These callbacks allow devices to be added to the data core when configuration 2621 * has been fully completed. 2622 */ 2623 static const struct virtio_net_device_ops virtio_net_device_ops = 2624 { 2625 .new_device = new_device, 2626 .destroy_device = destroy_device, 2627 }; 2628 2629 /* 2630 * This is a thread that will wake up periodically to print statistics if the 2631 * user has enabled them.
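* The sleep period equals the enable_stats value supplied by the user, so the statistics are refreshed every enable_stats seconds.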
2632 */ 2633 static void 2634 print_stats(void) 2635 { 2636 struct virtio_net_data_ll *dev_ll; 2637 uint64_t tx_dropped, rx_dropped; 2638 uint64_t tx, tx_total, rx, rx_total; 2639 uint32_t device_fh; 2640 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2641 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2642 2643 while(1) { 2644 sleep(enable_stats); 2645 2646 /* Clear screen and move to top left */ 2647 printf("%s%s", clr, top_left); 2648 2649 printf("\nDevice statistics ===================================="); 2650 2651 dev_ll = ll_root_used; 2652 while (dev_ll != NULL) { 2653 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2654 tx_total = dev_statistics[device_fh].tx_total; 2655 tx = dev_statistics[device_fh].tx; 2656 tx_dropped = tx_total - tx; 2657 if (zero_copy == 0) { 2658 rx_total = rte_atomic64_read( 2659 &dev_statistics[device_fh].rx_total_atomic); 2660 rx = rte_atomic64_read( 2661 &dev_statistics[device_fh].rx_atomic); 2662 } else { 2663 rx_total = dev_statistics[device_fh].rx_total; 2664 rx = dev_statistics[device_fh].rx; 2665 } 2666 rx_dropped = rx_total - rx; 2667 2668 printf("\nStatistics for device %"PRIu32" ------------------------------" 2669 "\nTX total: %"PRIu64"" 2670 "\nTX dropped: %"PRIu64"" 2671 "\nTX successful: %"PRIu64"" 2672 "\nRX total: %"PRIu64"" 2673 "\nRX dropped: %"PRIu64"" 2674 "\nRX successful: %"PRIu64"", 2675 device_fh, 2676 tx_total, 2677 tx_dropped, 2678 tx, 2679 rx_total, 2680 rx_dropped, 2681 rx); 2682 2683 dev_ll = dev_ll->next; 2684 } 2685 printf("\n======================================================\n"); 2686 } 2687 } 2688 2689 static void 2690 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2691 char *ring_name, uint32_t nb_mbuf) 2692 { 2693 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2694 vpool_array[index].pool 2695 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2696 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2697 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2698 rte_pktmbuf_init, NULL, socket, 0); 2699 if (vpool_array[index].pool != NULL) { 2700 vpool_array[index].ring 2701 = rte_ring_create(ring_name, 2702 rte_align32pow2(nb_mbuf + 1), 2703 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2704 if (likely(vpool_array[index].ring != NULL)) { 2705 LOG_DEBUG(VHOST_CONFIG, 2706 "in setup_mempool_tbl: mbuf count in " 2707 "mempool is: %d\n", 2708 rte_mempool_count(vpool_array[index].pool)); 2709 LOG_DEBUG(VHOST_CONFIG, 2710 "in setup_mempool_tbl: mbuf count in " 2711 "ring is: %d\n", 2712 rte_ring_count(vpool_array[index].ring)); 2713 } else { 2714 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2715 ring_name); 2716 } 2717 2718 /* Need consider head room. */ 2719 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2720 } else { 2721 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2722 } 2723 } 2724 2725 2726 /* 2727 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2728 * device is also registered here to handle the IOCTLs. 
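* Initialisation order: EAL, application arguments, mbuf pools (one shared pool, or per-queue pool/ring pairs when zero copy is enabled), physical ports, the per-core linked lists and the stats thread, and finally the data cores are launched.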
2729 */ 2730 int 2731 MAIN(int argc, char *argv[]) 2732 { 2733 struct rte_mempool *mbuf_pool = NULL; 2734 unsigned lcore_id, core_id = 0; 2735 unsigned nb_ports, valid_num_ports; 2736 int ret; 2737 uint8_t portid, queue_id = 0; 2738 static pthread_t tid; 2739 2740 /* init EAL */ 2741 ret = rte_eal_init(argc, argv); 2742 if (ret < 0) 2743 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2744 argc -= ret; 2745 argv += ret; 2746 2747 /* parse app arguments */ 2748 ret = us_vhost_parse_args(argc, argv); 2749 if (ret < 0) 2750 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2751 2752 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2753 if (rte_lcore_is_enabled(lcore_id)) 2754 lcore_ids[core_id ++] = lcore_id; 2755 2756 if (rte_lcore_count() > RTE_MAX_LCORE) 2757 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2758 2759 /* Set the number of switching cores available. */ 2760 num_switching_cores = rte_lcore_count()-1; 2761 2762 /* Get the number of physical ports. */ 2763 nb_ports = rte_eth_dev_count(); 2764 if (nb_ports > RTE_MAX_ETHPORTS) 2765 nb_ports = RTE_MAX_ETHPORTS; 2766 2767 /* 2768 * Update the global var NUM_PORTS and global array PORTS, 2769 * and get the value of var VALID_NUM_PORTS according to the system port number. 2770 */ 2771 valid_num_ports = check_ports_num(nb_ports); 2772 2773 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2774 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, " 2775 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS); 2776 return -1; 2777 } 2778 2779 if (zero_copy == 0) { 2780 /* Create the mbuf pool. */ 2781 mbuf_pool = rte_mempool_create( 2782 "MBUF_POOL", 2783 NUM_MBUFS_PER_PORT 2784 * valid_num_ports, 2785 MBUF_SIZE, MBUF_CACHE_SIZE, 2786 sizeof(struct rte_pktmbuf_pool_private), 2787 rte_pktmbuf_pool_init, NULL, 2788 rte_pktmbuf_init, NULL, 2789 rte_socket_id(), 0); 2790 if (mbuf_pool == NULL) 2791 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2792 2793 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2794 vpool_array[queue_id].pool = mbuf_pool; 2795 2796 if (vm2vm_mode == VM2VM_HARDWARE) { 2797 /* Enable VT loopback so the L2 switch does the forwarding. */ 2798 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2799 LOG_DEBUG(VHOST_CONFIG, 2800 "Enable loop back for L2 switch in vmdq.\n"); 2801 } 2802 } else { 2803 uint32_t nb_mbuf; 2804 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2805 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2806 2807 /* 2808 * Zero copy defers RX/TX queue start until the guest has 2809 * finished its startup and packet buffers from that guest are 2810 * available.
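* The rx/tx deferred_start flags are therefore set here and the queues are only started later, in new_device().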
2811 */ 2812 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2813 rx_conf_default.rx_drop_en = 0; 2814 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2815 nb_mbuf = num_rx_descriptor 2816 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2817 + num_switching_cores * MAX_PKT_BURST; 2818 2819 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2820 snprintf(pool_name, sizeof(pool_name), 2821 "rxmbuf_pool_%u", queue_id); 2822 snprintf(ring_name, sizeof(ring_name), 2823 "rxmbuf_ring_%u", queue_id); 2824 setup_mempool_tbl(rte_socket_id(), queue_id, 2825 pool_name, ring_name, nb_mbuf); 2826 } 2827 2828 nb_mbuf = num_tx_descriptor 2829 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2830 + num_switching_cores * MAX_PKT_BURST; 2831 2832 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2833 snprintf(pool_name, sizeof(pool_name), 2834 "txmbuf_pool_%u", queue_id); 2835 snprintf(ring_name, sizeof(ring_name), 2836 "txmbuf_ring_%u", queue_id); 2837 setup_mempool_tbl(rte_socket_id(), 2838 (queue_id + MAX_QUEUES), 2839 pool_name, ring_name, nb_mbuf); 2840 } 2841 2842 if (vm2vm_mode == VM2VM_HARDWARE) { 2843 /* Enable VT loopback so the L2 switch does the forwarding. */ 2844 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2845 LOG_DEBUG(VHOST_CONFIG, 2846 "Enable loop back for L2 switch in vmdq.\n"); 2847 } 2848 } 2849 /* Set log level. */ 2850 rte_set_log_level(LOG_LEVEL); 2851 2852 /* initialize all ports */ 2853 for (portid = 0; portid < nb_ports; portid++) { 2854 /* skip ports that are not enabled */ 2855 if ((enabled_port_mask & (1 << portid)) == 0) { 2856 RTE_LOG(INFO, VHOST_PORT, 2857 "Skipping disabled port %d\n", portid); 2858 continue; 2859 } 2860 if (port_init(portid) != 0) 2861 rte_exit(EXIT_FAILURE, 2862 "Cannot initialize network ports\n"); 2863 } 2864 2865 /* Initialise all linked lists. */ 2866 if (init_data_ll() == -1) 2867 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2868 2869 /* Initialize device stats */ 2870 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2871 2872 /* Enable stats if the user option is set. */ 2873 if (enable_stats) 2874 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2875 2876 /* Launch all data cores. */ 2877 if (zero_copy == 0) { 2878 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 2879 rte_eal_remote_launch(switch_worker, 2880 mbuf_pool, lcore_id); 2881 } 2882 } else { 2883 uint32_t count_in_mempool, index, i; 2884 for (index = 0; index < 2*MAX_QUEUES; index++) { 2885 /* For all RX and TX queues. */ 2886 count_in_mempool 2887 = rte_mempool_count(vpool_array[index].pool); 2888 2889 /* 2890 * Transfer all un-attached mbufs from vpool.pool 2891 * to vpool.ring. 2892 */ 2893 for (i = 0; i < count_in_mempool; i++) { 2894 struct rte_mbuf *mbuf 2895 = __rte_mbuf_raw_alloc( 2896 vpool_array[index].pool); 2897 rte_ring_sp_enqueue(vpool_array[index].ring, 2898 (void *)mbuf); 2899 } 2900 2901 LOG_DEBUG(VHOST_CONFIG, 2902 "in MAIN: mbuf count in mempool at initial " 2903 "is: %d\n", count_in_mempool); 2904 LOG_DEBUG(VHOST_CONFIG, 2905 "in MAIN: mbuf count in ring at initial is :" 2906 " %d\n", 2907 rte_ring_count(vpool_array[index].ring)); 2908 } 2909 2910 RTE_LCORE_FOREACH_SLAVE(lcore_id) 2911 rte_eal_remote_launch(switch_worker_zcp, NULL, 2912 lcore_id); 2913 } 2914 2915 /* Register CUSE device to handle IOCTLs.
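* The ioctl requests arriving on this character device are dispatched to the callbacks returned by get_virtio_net_callbacks(), while new_device/destroy_device above are invoked when a virtio device comes up or is removed.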
*/ 2916 ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks()); 2917 if (ret != 0) 2918 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 2919 2920 init_virtio_net(&virtio_net_device_ops); 2921 2922 /* Start CUSE session. */ 2923 start_cuse_session_loop(); 2924 return 0; 2925 2926 } 2927 2928