/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 128

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +	\
				(num_switching_cores * MAX_PKT_BURST) +	\
				(num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
				(num_switching_cores * MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg.
*/ 91 92 /* 93 * These default values are optimized for use with the Intel(R) 82599 10 GbE 94 * Controller and the DPDK ixgbe PMD. Consider using other values for other 95 * network controllers and/or network drivers. 96 */ 97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */ 98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */ 99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */ 100 101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 102 #define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Set to 1 due to performance issue. */ 103 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 104 105 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 106 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 107 108 #define JUMBO_FRAME_MAX_SIZE 0x2600 109 110 /* State of virtio device. */ 111 #define DEVICE_MAC_LEARNING 0 112 #define DEVICE_RX 1 113 #define DEVICE_SAFE_REMOVE 2 114 115 /* Config_core_flag status definitions. */ 116 #define REQUEST_DEV_REMOVAL 1 117 #define ACK_DEV_REMOVAL 0 118 119 /* Configurable number of RX/TX ring descriptors */ 120 #define RTE_TEST_RX_DESC_DEFAULT 1024 121 #define RTE_TEST_TX_DESC_DEFAULT 512 122 123 /* 124 * Need refine these 2 macros for legacy and DPDK based front end: 125 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 126 * And then adjust power 2. 127 */ 128 /* 129 * For legacy front end, 128 descriptors, 130 * half for virtio header, another half for mbuf. 131 */ 132 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 133 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 134 135 /* Get first 4 bytes in mbuf headroom. */ 136 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 137 + sizeof(struct rte_mbuf))) 138 139 /* true if x is a power of 2 */ 140 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 141 142 #define INVALID_PORT_ID 0xFF 143 144 /* Max number of devices. Limited by vmdq. */ 145 #define MAX_DEVICES 64 146 147 /* Size of buffers used for snprintfs. */ 148 #define MAX_PRINT_BUFF 6072 149 150 /* Maximum character device basename size. */ 151 #define MAX_BASENAME_SZ 10 152 153 /* Maximum long option length for option parsing. */ 154 #define MAX_LONG_OPT_SZ 64 155 156 /* Used to compare MAC addresses. */ 157 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 158 159 /* Number of descriptors per cacheline. */ 160 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc)) 161 162 /* mask of enabled ports */ 163 static uint32_t enabled_port_mask = 0; 164 165 /*Number of switching cores enabled*/ 166 static uint32_t num_switching_cores = 0; 167 168 /* number of devices/queues to support*/ 169 static uint32_t num_queues = 0; 170 uint32_t num_devices = 0; 171 172 /* 173 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 174 * disabled on default. 175 */ 176 static uint32_t zero_copy; 177 178 /* number of descriptors to apply*/ 179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 181 182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 183 #define MAX_RING_DESC 4096 184 185 struct vpool { 186 struct rte_mempool *pool; 187 struct rte_ring *ring; 188 uint32_t buf_size; 189 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 190 191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. 
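 * VM2VM_SOFTWARE forwards guest-to-guest traffic in software by comparing
 * the destination MAC against every known device; VM2VM_HARDWARE instead
 * rewrites the VLAN tag and lets the NIC's VMDQ pool selection loop the
 * frame back. Illustrative sketch of how the mode gates the software fast
 * path (mirrors the check performed later in virtio_tx_route()):
 *
 *	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
 *		return;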
*/ 192 typedef enum { 193 VM2VM_DISABLED = 0, 194 VM2VM_SOFTWARE = 1, 195 VM2VM_HARDWARE = 2, 196 VM2VM_LAST 197 } vm2vm_type; 198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 199 200 /* The type of host physical address translated from guest physical address. */ 201 typedef enum { 202 PHYS_ADDR_CONTINUOUS = 0, 203 PHYS_ADDR_CROSS_SUBREG = 1, 204 PHYS_ADDR_INVALID = 2, 205 PHYS_ADDR_LAST 206 } hpa_type; 207 208 /* Enable stats. */ 209 static uint32_t enable_stats = 0; 210 /* Enable retries on RX. */ 211 static uint32_t enable_retry = 1; 212 /* Specify timeout (in useconds) between retries on RX. */ 213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 214 /* Specify the number of retries on RX. */ 215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 216 217 /* Character device basename. Can be set by user. */ 218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; 219 220 221 /* This can be set by the user so it is made available here. */ 222 extern uint64_t VHOST_FEATURES; 223 224 /* Default configuration for rx and tx thresholds etc. */ 225 static struct rte_eth_rxconf rx_conf_default = { 226 .rx_thresh = { 227 .pthresh = RX_PTHRESH, 228 .hthresh = RX_HTHRESH, 229 .wthresh = RX_WTHRESH, 230 }, 231 .rx_drop_en = 1, 232 }; 233 234 /* 235 * These default values are optimized for use with the Intel(R) 82599 10 GbE 236 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other 237 * network controllers and/or network drivers. 238 */ 239 static struct rte_eth_txconf tx_conf_default = { 240 .tx_thresh = { 241 .pthresh = TX_PTHRESH, 242 .hthresh = TX_HTHRESH, 243 .wthresh = TX_WTHRESH, 244 }, 245 .tx_free_thresh = 0, /* Use PMD default values */ 246 .tx_rs_thresh = 0, /* Use PMD default values */ 247 }; 248 249 /* empty vmdq configuration structure. Filled in programatically */ 250 static struct rte_eth_conf vmdq_conf_default = { 251 .rxmode = { 252 .mq_mode = ETH_MQ_RX_VMDQ_ONLY, 253 .split_hdr_size = 0, 254 .header_split = 0, /**< Header Split disabled */ 255 .hw_ip_checksum = 0, /**< IP checksum offload disabled */ 256 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 257 /* 258 * It is necessary for 1G NIC such as I350, 259 * this fixes bug of ipv4 forwarding in guest can't 260 * forward pakets from one virtio dev to another virtio dev. 261 */ 262 .hw_vlan_strip = 1, /**< VLAN strip enabled. 
*/ 263 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 264 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 265 }, 266 267 .txmode = { 268 .mq_mode = ETH_MQ_TX_NONE, 269 }, 270 .rx_adv_conf = { 271 /* 272 * should be overridden separately in code with 273 * appropriate values 274 */ 275 .vmdq_rx_conf = { 276 .nb_queue_pools = ETH_8_POOLS, 277 .enable_default_pool = 0, 278 .default_pool = 0, 279 .nb_pool_maps = 0, 280 .pool_map = {{0, 0},}, 281 }, 282 }, 283 }; 284 285 static unsigned lcore_ids[RTE_MAX_LCORE]; 286 static uint8_t ports[RTE_MAX_ETHPORTS]; 287 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 288 289 static const uint16_t external_pkt_default_vlan_tag = 2000; 290 const uint16_t vlan_tags[] = { 291 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 292 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 293 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 294 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 295 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 296 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 297 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 298 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 299 }; 300 301 /* ethernet addresses of ports */ 302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 303 304 /* heads for the main used and free linked lists for the data path. */ 305 static struct virtio_net_data_ll *ll_root_used = NULL; 306 static struct virtio_net_data_ll *ll_root_free = NULL; 307 308 /* Array of data core structures containing information on individual core linked lists. */ 309 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 310 311 /* Used for queueing bursts of TX packets. */ 312 struct mbuf_table { 313 unsigned len; 314 unsigned txq_id; 315 struct rte_mbuf *m_table[MAX_PKT_BURST]; 316 }; 317 318 /* TX queue for each data core. */ 319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 320 321 /* TX queue fori each virtio device for zero copy. */ 322 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 323 324 /* Vlan header struct used to insert vlan tags on TX. */ 325 struct vlan_ethhdr { 326 unsigned char h_dest[ETH_ALEN]; 327 unsigned char h_source[ETH_ALEN]; 328 __be16 h_vlan_proto; 329 __be16 h_vlan_TCI; 330 __be16 h_vlan_encapsulated_proto; 331 }; 332 333 /* IPv4 Header */ 334 struct ipv4_hdr { 335 uint8_t version_ihl; /**< version and header length */ 336 uint8_t type_of_service; /**< type of service */ 337 uint16_t total_length; /**< length of packet */ 338 uint16_t packet_id; /**< packet ID */ 339 uint16_t fragment_offset; /**< fragmentation offset */ 340 uint8_t time_to_live; /**< time to live */ 341 uint8_t next_proto_id; /**< protocol ID */ 342 uint16_t hdr_checksum; /**< header checksum */ 343 uint32_t src_addr; /**< source address */ 344 uint32_t dst_addr; /**< destination address */ 345 } __attribute__((__packed__)); 346 347 /* Header lengths. */ 348 #define VLAN_HLEN 4 349 #define VLAN_ETH_HLEN 18 350 351 /* Per-device statistics struct */ 352 struct device_statistics { 353 uint64_t tx_total; 354 rte_atomic64_t rx_total_atomic; 355 uint64_t rx_total; 356 uint64_t tx; 357 rte_atomic64_t rx_atomic; 358 uint64_t rx; 359 } __rte_cache_aligned; 360 struct device_statistics dev_statistics[MAX_DEVICES]; 361 362 /* 363 * Builds up the correct configuration for VMDQ VLAN pool map 364 * according to the pool & queue limits. 
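 * Each device i is given its own pool and the VLAN tag vlan_tags[i], so the
 * NIC can steer received traffic per guest. A minimal sketch of the resulting
 * map, assuming num_devices pools (mirrors the loop in the function below):
 *
 *	for (i = 0; i < num_devices; i++) {
 *		conf.pool_map[i].vlan_id = vlan_tags[i];
 *		conf.pool_map[i].pools = 1UL << i;
 *	}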
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number gotten from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in the command line */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device.
*/ 468 retval = rte_eth_dev_start(port); 469 if (retval < 0) { 470 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 471 return retval; 472 } 473 474 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 475 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 476 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 477 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 478 (unsigned)port, 479 vmdq_ports_eth_addr[port].addr_bytes[0], 480 vmdq_ports_eth_addr[port].addr_bytes[1], 481 vmdq_ports_eth_addr[port].addr_bytes[2], 482 vmdq_ports_eth_addr[port].addr_bytes[3], 483 vmdq_ports_eth_addr[port].addr_bytes[4], 484 vmdq_ports_eth_addr[port].addr_bytes[5]); 485 486 return 0; 487 } 488 489 /* 490 * Set character device basename. 491 */ 492 static int 493 us_vhost_parse_basename(const char *q_arg) 494 { 495 /* parse number string */ 496 497 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 498 return -1; 499 else 500 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 501 502 return 0; 503 } 504 505 /* 506 * Parse the portmask provided at run time. 507 */ 508 static int 509 parse_portmask(const char *portmask) 510 { 511 char *end = NULL; 512 unsigned long pm; 513 514 errno = 0; 515 516 /* parse hexadecimal string */ 517 pm = strtoul(portmask, &end, 16); 518 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 519 return -1; 520 521 if (pm == 0) 522 return -1; 523 524 return pm; 525 526 } 527 528 /* 529 * Parse num options at run time. 530 */ 531 static int 532 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 533 { 534 char *end = NULL; 535 unsigned long num; 536 537 errno = 0; 538 539 /* parse unsigned int string */ 540 num = strtoul(q_arg, &end, 10); 541 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 542 return -1; 543 544 if (num > max_valid_value) 545 return -1; 546 547 return num; 548 549 } 550 551 /* 552 * Display usage 553 */ 554 static void 555 us_vhost_usage(const char *prgname) 556 { 557 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 558 " --vm2vm [0|1|2]\n" 559 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 560 " --dev-basename <name>\n" 561 " --nb-devices ND\n" 562 " -p PORTMASK: Set mask for ports to be used by application\n" 563 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 564 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 565 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 566 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 567 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 568 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 569 " --dev-basename: The basename to be used for the character device.\n" 570 " --zero-copy [0|1]: disable(default)/enable rx/tx " 571 "zero copy\n" 572 " --rx-desc-num [0-N]: the number of descriptors on rx, " 573 "used only when zero copy is enabled.\n" 574 " --tx-desc-num [0-N]: the number of descriptors on tx, " 575 "used only when zero copy is enabled.\n", 576 prgname); 577 } 578 579 /* 580 * Parse the arguments given in the command line of the application. 
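 * For example, a typical invocation might look like the following
 * (illustrative values only; the binary name depends on the build):
 *
 *	./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2 \
 *		--dev-basename vhost-net
 *
 * Everything after "--" is handled here; the rest is consumed by the EAL.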
581 */ 582 static int 583 us_vhost_parse_args(int argc, char **argv) 584 { 585 int opt, ret; 586 int option_index; 587 unsigned i; 588 const char *prgname = argv[0]; 589 static struct option long_option[] = { 590 {"vm2vm", required_argument, NULL, 0}, 591 {"rx-retry", required_argument, NULL, 0}, 592 {"rx-retry-delay", required_argument, NULL, 0}, 593 {"rx-retry-num", required_argument, NULL, 0}, 594 {"mergeable", required_argument, NULL, 0}, 595 {"stats", required_argument, NULL, 0}, 596 {"dev-basename", required_argument, NULL, 0}, 597 {"zero-copy", required_argument, NULL, 0}, 598 {"rx-desc-num", required_argument, NULL, 0}, 599 {"tx-desc-num", required_argument, NULL, 0}, 600 {NULL, 0, 0, 0}, 601 }; 602 603 /* Parse command line */ 604 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) { 605 switch (opt) { 606 /* Portmask */ 607 case 'p': 608 enabled_port_mask = parse_portmask(optarg); 609 if (enabled_port_mask == 0) { 610 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 611 us_vhost_usage(prgname); 612 return -1; 613 } 614 break; 615 616 case 0: 617 /* Enable/disable vm2vm comms. */ 618 if (!strncmp(long_option[option_index].name, "vm2vm", 619 MAX_LONG_OPT_SZ)) { 620 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 621 if (ret == -1) { 622 RTE_LOG(INFO, VHOST_CONFIG, 623 "Invalid argument for " 624 "vm2vm [0|1|2]\n"); 625 us_vhost_usage(prgname); 626 return -1; 627 } else { 628 vm2vm_mode = (vm2vm_type)ret; 629 } 630 } 631 632 /* Enable/disable retries on RX. */ 633 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 634 ret = parse_num_opt(optarg, 1); 635 if (ret == -1) { 636 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 637 us_vhost_usage(prgname); 638 return -1; 639 } else { 640 enable_retry = ret; 641 } 642 } 643 644 /* Specify the retries delay time (in useconds) on RX. */ 645 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 646 ret = parse_num_opt(optarg, INT32_MAX); 647 if (ret == -1) { 648 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 649 us_vhost_usage(prgname); 650 return -1; 651 } else { 652 burst_rx_delay_time = ret; 653 } 654 } 655 656 /* Specify the retries number on RX. */ 657 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 658 ret = parse_num_opt(optarg, INT32_MAX); 659 if (ret == -1) { 660 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 661 us_vhost_usage(prgname); 662 return -1; 663 } else { 664 burst_rx_retry_num = ret; 665 } 666 } 667 668 /* Enable/disable RX mergeable buffers. */ 669 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 670 ret = parse_num_opt(optarg, 1); 671 if (ret == -1) { 672 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } else { 676 if (ret) { 677 vmdq_conf_default.rxmode.jumbo_frame = 1; 678 vmdq_conf_default.rxmode.max_rx_pkt_len 679 = JUMBO_FRAME_MAX_SIZE; 680 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF); 681 } 682 } 683 } 684 685 /* Enable/disable stats. */ 686 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 687 ret = parse_num_opt(optarg, INT32_MAX); 688 if (ret == -1) { 689 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 690 us_vhost_usage(prgname); 691 return -1; 692 } else { 693 enable_stats = ret; 694 } 695 } 696 697 /* Set character device basename. 
*/ 698 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 699 if (us_vhost_parse_basename(optarg) == -1) { 700 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 701 us_vhost_usage(prgname); 702 return -1; 703 } 704 } 705 706 /* Enable/disable rx/tx zero copy. */ 707 if (!strncmp(long_option[option_index].name, 708 "zero-copy", MAX_LONG_OPT_SZ)) { 709 ret = parse_num_opt(optarg, 1); 710 if (ret == -1) { 711 RTE_LOG(INFO, VHOST_CONFIG, 712 "Invalid argument" 713 " for zero-copy [0|1]\n"); 714 us_vhost_usage(prgname); 715 return -1; 716 } else 717 zero_copy = ret; 718 719 if (zero_copy) { 720 #ifdef RTE_MBUF_REFCNT 721 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 722 "zero copy vhost APP, please " 723 "disable RTE_MBUF_REFCNT\n" 724 "in config file and then rebuild DPDK " 725 "core lib!\n" 726 "Otherwise please disable zero copy " 727 "flag in command line!\n"); 728 return -1; 729 #endif 730 } 731 } 732 733 /* Specify the descriptor number on RX. */ 734 if (!strncmp(long_option[option_index].name, 735 "rx-desc-num", MAX_LONG_OPT_SZ)) { 736 ret = parse_num_opt(optarg, MAX_RING_DESC); 737 if ((ret == -1) || (!POWEROF2(ret))) { 738 RTE_LOG(INFO, VHOST_CONFIG, 739 "Invalid argument for rx-desc-num[0-N]," 740 "power of 2 required.\n"); 741 us_vhost_usage(prgname); 742 return -1; 743 } else { 744 num_rx_descriptor = ret; 745 } 746 } 747 748 /* Specify the descriptor number on TX. */ 749 if (!strncmp(long_option[option_index].name, 750 "tx-desc-num", MAX_LONG_OPT_SZ)) { 751 ret = parse_num_opt(optarg, MAX_RING_DESC); 752 if ((ret == -1) || (!POWEROF2(ret))) { 753 RTE_LOG(INFO, VHOST_CONFIG, 754 "Invalid argument for tx-desc-num [0-N]," 755 "power of 2 required.\n"); 756 us_vhost_usage(prgname); 757 return -1; 758 } else { 759 num_tx_descriptor = ret; 760 } 761 } 762 763 break; 764 765 /* Invalid option - print options. 
		 */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
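 * The translation walks the per-device region table: a buffer that fits
 * entirely inside one region is reported as PHYS_ADDR_CONTINUOUS, one that
 * spans a region boundary as PHYS_ADDR_CROSS_SUBREG, and an unmatched
 * address as PHYS_ADDR_INVALID, in which case the caller must drop the
 * descriptor. Illustrative use, with names borrowed from the callers below:
 *
 *	hpa_type t;
 *	uint64_t hpa = gpa_to_hpa(vdev, desc->addr, desc->len, &t);
 *	if (t == PHYS_ADDR_INVALID)
 *		return;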
856 */ 857 static inline uint64_t __attribute__((always_inline)) 858 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 859 uint32_t buf_len, hpa_type *addr_type) 860 { 861 struct virtio_memory_regions_hpa *region; 862 uint32_t regionidx; 863 uint64_t vhost_pa = 0; 864 865 *addr_type = PHYS_ADDR_INVALID; 866 867 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 868 region = &vdev->regions_hpa[regionidx]; 869 if ((guest_pa >= region->guest_phys_address) && 870 (guest_pa <= region->guest_phys_address_end)) { 871 vhost_pa = region->host_phys_addr_offset + guest_pa; 872 if (likely((guest_pa + buf_len - 1) 873 <= region->guest_phys_address_end)) 874 *addr_type = PHYS_ADDR_CONTINUOUS; 875 else 876 *addr_type = PHYS_ADDR_CROSS_SUBREG; 877 break; 878 } 879 } 880 881 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 882 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 883 (void *)(uintptr_t)vhost_pa); 884 885 return vhost_pa; 886 } 887 888 /* 889 * Compares a packet destination MAC address to a device MAC address. 890 */ 891 static inline int __attribute__((always_inline)) 892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 893 { 894 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 895 } 896 897 /* 898 * This function learns the MAC address of the device and registers this along with a 899 * vlan tag to a VMDQ. 900 */ 901 static int 902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 903 { 904 struct ether_hdr *pkt_hdr; 905 struct virtio_net_data_ll *dev_ll; 906 struct virtio_net *dev = vdev->dev; 907 int i, ret; 908 909 /* Learn MAC address of guest device from packet */ 910 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 911 912 dev_ll = ll_root_used; 913 914 while (dev_ll != NULL) { 915 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 916 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 917 return -1; 918 } 919 dev_ll = dev_ll->next; 920 } 921 922 for (i = 0; i < ETHER_ADDR_LEN; i++) 923 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 924 925 /* vlan_tag currently uses the device_id. */ 926 vdev->vlan_tag = vlan_tags[dev->device_fh]; 927 928 /* Print out VMDQ registration info. */ 929 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 930 dev->device_fh, 931 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 932 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 933 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 934 vdev->vlan_tag); 935 936 /* Register the MAC address. */ 937 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh); 938 if (ret) 939 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 940 dev->device_fh); 941 942 /* Enable stripping of the vlan tag as we handle routing. */ 943 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 944 945 /* Set device as ready for RX. */ 946 vdev->ready = DEVICE_RX; 947 948 return 0; 949 } 950 951 /* 952 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 953 * queue before disabling RX on the device. 
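 * The draining step matters because the PMD may still hold descriptors for
 * this pool: rte_eth_rx_burst() is called in a loop and every returned mbuf
 * is freed until the queue reads empty, as in this sketch of the loop used
 * below:
 *
 *	while ((n = rte_eth_rx_burst(port, q, pkts, MAX_PKT_BURST)) != 0)
 *		while (n)
 *			rte_pktmbuf_free(pkts[--n]);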
954 */ 955 static inline void 956 unlink_vmdq(struct vhost_dev *vdev) 957 { 958 unsigned i = 0; 959 unsigned rx_count; 960 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 961 962 if (vdev->ready == DEVICE_RX) { 963 /*clear MAC and VLAN settings*/ 964 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 965 for (i = 0; i < 6; i++) 966 vdev->mac_address.addr_bytes[i] = 0; 967 968 vdev->vlan_tag = 0; 969 970 /*Clear out the receive buffers*/ 971 rx_count = rte_eth_rx_burst(ports[0], 972 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 973 974 while (rx_count) { 975 for (i = 0; i < rx_count; i++) 976 rte_pktmbuf_free(pkts_burst[i]); 977 978 rx_count = rte_eth_rx_burst(ports[0], 979 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 980 } 981 982 vdev->ready = DEVICE_MAC_LEARNING; 983 } 984 } 985 986 /* 987 * Check if the packet destination MAC address is for a local device. If so then put 988 * the packet on that devices RX queue. If not then return. 989 */ 990 static inline unsigned __attribute__((always_inline)) 991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 992 { 993 struct virtio_net_data_ll *dev_ll; 994 struct ether_hdr *pkt_hdr; 995 uint64_t ret = 0; 996 struct virtio_net *dev = vdev->dev; 997 struct virtio_net *tdev; /* destination virito device */ 998 999 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1000 1001 /*get the used devices list*/ 1002 dev_ll = ll_root_used; 1003 1004 while (dev_ll != NULL) { 1005 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1006 &dev_ll->vdev->mac_address)) { 1007 1008 /* Drop the packet if the TX packet is destined for the TX device. */ 1009 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1010 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1011 dev->device_fh); 1012 return 0; 1013 } 1014 tdev = dev_ll->vdev->dev; 1015 1016 1017 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1018 1019 if (dev_ll->vdev->remove) { 1020 /*drop the packet if the device is marked for removal*/ 1021 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1022 } else { 1023 uint32_t mergeable = 1024 dev_ll->dev->features & 1025 (1 << VIRTIO_NET_F_MRG_RXBUF); 1026 1027 /*send the packet to the local virtio device*/ 1028 if (likely(mergeable == 0)) 1029 ret = virtio_dev_rx(dev_ll->dev, &m, 1); 1030 else 1031 ret = virtio_dev_merge_rx(dev_ll->dev, 1032 &m, 1); 1033 1034 if (enable_stats) { 1035 rte_atomic64_add( 1036 &dev_statistics[tdev->device_fh].rx_total_atomic, 1037 1); 1038 rte_atomic64_add( 1039 &dev_statistics[tdev->device_fh].rx_atomic, 1040 ret); 1041 dev_statistics[tdev->device_fh].tx_total++; 1042 dev_statistics[tdev->device_fh].tx += ret; 1043 } 1044 } 1045 1046 return 0; 1047 } 1048 dev_ll = dev_ll->next; 1049 } 1050 1051 return -1; 1052 } 1053 1054 /* 1055 * This function routes the TX packet to the correct interface. This may be a local device 1056 * or the physical port. 
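 * With --vm2vm 1 the packet may be delivered directly to another guest via
 * virtio_tx_local(); with --vm2vm 2 only the VLAN tag is rewritten and the
 * NIC performs the switching. Otherwise the frame is 802.1Q-tagged and
 * queued on the physical port, e.g. (mirrors the tagging code below):
 *
 *	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
 *	vlan_hdr->h_vlan_TCI = htons(vlan_tag);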
1057 */ 1058 static inline void __attribute__((always_inline)) 1059 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag) 1060 { 1061 struct mbuf_table *tx_q; 1062 struct vlan_ethhdr *vlan_hdr; 1063 struct rte_mbuf **m_table; 1064 struct rte_mbuf *mbuf, *prev; 1065 unsigned len, ret, offset = 0; 1066 const uint16_t lcore_id = rte_lcore_id(); 1067 struct virtio_net_data_ll *dev_ll = ll_root_used; 1068 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1069 struct virtio_net *dev = vdev->dev; 1070 1071 /*check if destination is local VM*/ 1072 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) 1073 return; 1074 1075 if (vm2vm_mode == VM2VM_HARDWARE) { 1076 while (dev_ll != NULL) { 1077 if ((dev_ll->vdev->ready == DEVICE_RX) 1078 && ether_addr_cmp(&(pkt_hdr->d_addr), 1079 &dev_ll->vdev->mac_address)) { 1080 /* 1081 * Drop the packet if the TX packet is 1082 * destined for the TX device. 1083 */ 1084 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1085 LOG_DEBUG(VHOST_DATA, 1086 "(%"PRIu64") TX: Source and destination" 1087 " MAC addresses are the same. Dropping " 1088 "packet.\n", 1089 dev_ll->vdev->device_fh); 1090 return; 1091 } 1092 offset = 4; 1093 vlan_tag = 1094 (uint16_t) 1095 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1096 1097 LOG_DEBUG(VHOST_DATA, 1098 "(%"PRIu64") TX: pkt to local VM device id:" 1099 "(%"PRIu64") vlan tag: %d.\n", 1100 dev->device_fh, dev_ll->vdev->dev->device_fh, 1101 vlan_tag); 1102 1103 break; 1104 } 1105 dev_ll = dev_ll->next; 1106 } 1107 } 1108 1109 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1110 1111 /*Add packet to the port tx queue*/ 1112 tx_q = &lcore_tx_queue[lcore_id]; 1113 len = tx_q->len; 1114 1115 /* Allocate an mbuf and populate the structure. */ 1116 mbuf = rte_pktmbuf_alloc(mbuf_pool); 1117 if (unlikely(mbuf == NULL)) { 1118 RTE_LOG(ERR, VHOST_DATA, 1119 "Failed to allocate memory for mbuf.\n"); 1120 return; 1121 } 1122 1123 mbuf->data_len = m->data_len + VLAN_HLEN + offset; 1124 mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset; 1125 mbuf->nb_segs = m->nb_segs; 1126 1127 /* Copy ethernet header to mbuf. */ 1128 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1129 rte_pktmbuf_mtod(m, const void *), 1130 ETH_HLEN); 1131 1132 1133 /* Setup vlan header. Bytes need to be re-ordered for network with htons()*/ 1134 vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *); 1135 vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto; 1136 vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q); 1137 vlan_hdr->h_vlan_TCI = htons(vlan_tag); 1138 1139 /* Copy the remaining packet contents to the mbuf. */ 1140 rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN), 1141 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN), 1142 (m->data_len - ETH_HLEN)); 1143 1144 /* Copy the remaining segments for the whole packet. */ 1145 prev = mbuf; 1146 while (m->next) { 1147 /* Allocate an mbuf and populate the structure. */ 1148 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool); 1149 if (unlikely(next_mbuf == NULL)) { 1150 rte_pktmbuf_free(mbuf); 1151 RTE_LOG(ERR, VHOST_DATA, 1152 "Failed to allocate memory for mbuf.\n"); 1153 return; 1154 } 1155 1156 m = m->next; 1157 prev->next = next_mbuf; 1158 prev = next_mbuf; 1159 next_mbuf->data_len = m->data_len; 1160 1161 /* Copy data to next mbuf. 
*/ 1162 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *), 1163 rte_pktmbuf_mtod(m, const void *), m->data_len); 1164 } 1165 1166 tx_q->m_table[len] = mbuf; 1167 len++; 1168 if (enable_stats) { 1169 dev_statistics[dev->device_fh].tx_total++; 1170 dev_statistics[dev->device_fh].tx++; 1171 } 1172 1173 if (unlikely(len == MAX_PKT_BURST)) { 1174 m_table = (struct rte_mbuf **)tx_q->m_table; 1175 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1176 /* Free any buffers not handled by TX and update the port stats. */ 1177 if (unlikely(ret < len)) { 1178 do { 1179 rte_pktmbuf_free(m_table[ret]); 1180 } while (++ret < len); 1181 } 1182 1183 len = 0; 1184 } 1185 1186 tx_q->len = len; 1187 return; 1188 } 1189 /* 1190 * This function is called by each data core. It handles all RX/TX registered with the 1191 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1192 * with all devices in the main linked list. 1193 */ 1194 static int 1195 switch_worker(__attribute__((unused)) void *arg) 1196 { 1197 struct rte_mempool *mbuf_pool = arg; 1198 struct virtio_net *dev = NULL; 1199 struct vhost_dev *vdev = NULL; 1200 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1201 struct virtio_net_data_ll *dev_ll; 1202 struct mbuf_table *tx_q; 1203 volatile struct lcore_ll_info *lcore_ll; 1204 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1205 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1206 unsigned ret, i; 1207 const uint16_t lcore_id = rte_lcore_id(); 1208 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1209 uint16_t rx_count = 0; 1210 uint32_t mergeable = 0; 1211 1212 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1213 lcore_ll = lcore_info[lcore_id].lcore_ll; 1214 prev_tsc = 0; 1215 1216 tx_q = &lcore_tx_queue[lcore_id]; 1217 for (i = 0; i < num_cores; i ++) { 1218 if (lcore_ids[i] == lcore_id) { 1219 tx_q->txq_id = i; 1220 break; 1221 } 1222 } 1223 1224 while(1) { 1225 cur_tsc = rte_rdtsc(); 1226 /* 1227 * TX burst queue drain 1228 */ 1229 diff_tsc = cur_tsc - prev_tsc; 1230 if (unlikely(diff_tsc > drain_tsc)) { 1231 1232 if (tx_q->len) { 1233 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1234 1235 /*Tx any packets in the queue*/ 1236 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1237 (struct rte_mbuf **)tx_q->m_table, 1238 (uint16_t)tx_q->len); 1239 if (unlikely(ret < tx_q->len)) { 1240 do { 1241 rte_pktmbuf_free(tx_q->m_table[ret]); 1242 } while (++ret < tx_q->len); 1243 } 1244 1245 tx_q->len = 0; 1246 } 1247 1248 prev_tsc = cur_tsc; 1249 1250 } 1251 1252 rte_prefetch0(lcore_ll->ll_root_used); 1253 /* 1254 * Inform the configuration core that we have exited the linked list and that no devices are 1255 * in use if requested. 
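		 * The handshake is a simple flag: the requesting core sets
		 * dev_removal_flag to REQUEST_DEV_REMOVAL and waits for this
		 * data core to write ACK_DEV_REMOVAL back, which happens only
		 * here, outside any walk of the device list. The requesting
		 * side (in the device removal path, not shown here) is
		 * roughly:
		 *
		 *	lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
		 *	while (lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
		 *		rte_pause();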
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;
			mergeable =
				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);

			if (vdev->remove) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					if (likely(mergeable == 0))
						ret_count =
							virtio_dev_rx(dev,
								pkts_burst, rx_count);
					else
						ret_count =
							virtio_dev_merge_rx(dev,
								pkts_burst, rx_count);

					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (!vdev->remove) {
				/* Handle guest TX */
				if (likely(mergeable == 0))
					virtio_dev_tx(dev, mbuf_pool);
				else
					virtio_dev_merge_tx(dev, mbuf_pool);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets available ring indexes for zero copy RX;
 * it will retry 'burst_rx_retry_num' times until it gets enough indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
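	 * The worst-case extra latency is bounded by
	 * burst_rx_retry_num * burst_rx_delay_time microseconds, i.e. with
	 * the default settings:
	 *
	 *	4 retries * 15 us = 60 us per call.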
1367 */ 1368 if (enable_retry && unlikely(count > free_entries)) { 1369 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1370 rte_delay_us(burst_rx_delay_time); 1371 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1372 free_entries = (avail_idx - *res_base_idx); 1373 if (count <= free_entries) 1374 break; 1375 } 1376 } 1377 1378 /*check that we have enough buffers*/ 1379 if (unlikely(count > free_entries)) 1380 count = free_entries; 1381 1382 if (unlikely(count == 0)) { 1383 LOG_DEBUG(VHOST_DATA, 1384 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1385 "avail idx: %d, res base idx:%d, free entries:%d\n", 1386 dev->device_fh, avail_idx, 1387 *res_base_idx, free_entries); 1388 return 0; 1389 } 1390 1391 vq->last_used_idx_res = *res_base_idx + count; 1392 1393 return count; 1394 } 1395 1396 /* 1397 * This function put descriptor back to used list. 1398 */ 1399 static inline void __attribute__((always_inline)) 1400 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1401 { 1402 uint16_t res_cur_idx = vq->last_used_idx; 1403 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1404 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1405 rte_compiler_barrier(); 1406 *(volatile uint16_t *)&vq->used->idx += 1; 1407 vq->last_used_idx += 1; 1408 1409 /* Kick the guest if necessary. */ 1410 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1411 eventfd_write((int)vq->kickfd, 1); 1412 } 1413 1414 /* 1415 * This function get available descriptor from vitio vring and un-attached mbuf 1416 * from vpool->ring, and then attach them together. It needs adjust the offset 1417 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1418 * frame data may be put to wrong location in mbuf. 1419 */ 1420 static inline void __attribute__((always_inline)) 1421 attach_rxmbuf_zcp(struct virtio_net *dev) 1422 { 1423 uint16_t res_base_idx, desc_idx; 1424 uint64_t buff_addr, phys_addr; 1425 struct vhost_virtqueue *vq; 1426 struct vring_desc *desc; 1427 struct rte_mbuf *mbuf = NULL; 1428 struct vpool *vpool; 1429 hpa_type addr_type; 1430 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1431 1432 vpool = &vpool_array[vdev->vmdq_rx_q]; 1433 vq = dev->virtqueue[VIRTIO_RXQ]; 1434 1435 do { 1436 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1437 1) != 1)) 1438 return; 1439 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1440 1441 desc = &vq->desc[desc_idx]; 1442 if (desc->flags & VRING_DESC_F_NEXT) { 1443 desc = &vq->desc[desc->next]; 1444 buff_addr = gpa_to_vva(dev, desc->addr); 1445 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1446 &addr_type); 1447 } else { 1448 buff_addr = gpa_to_vva(dev, 1449 desc->addr + vq->vhost_hlen); 1450 phys_addr = gpa_to_hpa(vdev, 1451 desc->addr + vq->vhost_hlen, 1452 desc->len, &addr_type); 1453 } 1454 1455 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1456 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1457 " address found when attaching RX frame buffer" 1458 " address!\n", dev->device_fh); 1459 put_desc_to_used_list_zcp(vq, desc_idx); 1460 continue; 1461 } 1462 1463 /* 1464 * Check if the frame buffer address from guest crosses 1465 * sub-region or not. 
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}

/*
 * Detach an attached packet mbuf:
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 * All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
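 * Every transmitted zero copy mbuf carries its guest descriptor index in the
 * first four bytes of its headroom (MBUF_HEADROOM_UINT32), which is how this
 * routine knows which descriptor to return to the used ring, e.g. (mirrors
 * the loop below):
 *
 *	vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
 *	vq->used->ring[used_idx].len = 0;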
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them and puts them into
 * vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function enqueues zero copy RX packets to the guest: it updates the
 * used ring entries and counters for each packet.
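 * In the zero copy case the frame data already sits in guest memory, so only
 * the virtio header is written and the used ring is filled in; no payload
 * copy takes place. The descriptor index for each mbuf was stashed in its
 * headroom by attach_rxmbuf_zcp(), e.g. (mirrors the code below):
 *
 *	head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);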
1648 */ 1649 static inline uint32_t __attribute__((always_inline)) 1650 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1651 uint32_t count) 1652 { 1653 struct vhost_virtqueue *vq; 1654 struct vring_desc *desc; 1655 struct rte_mbuf *buff; 1656 /* The virtio_hdr is initialised to 0. */ 1657 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1658 = {{0, 0, 0, 0, 0, 0}, 0}; 1659 uint64_t buff_hdr_addr = 0; 1660 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1661 uint32_t head_idx, packet_success = 0; 1662 uint16_t res_cur_idx; 1663 1664 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1665 1666 if (count == 0) 1667 return 0; 1668 1669 vq = dev->virtqueue[VIRTIO_RXQ]; 1670 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1671 1672 res_cur_idx = vq->last_used_idx; 1673 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1674 dev->device_fh, res_cur_idx, res_cur_idx + count); 1675 1676 /* Retrieve all of the head indexes first to avoid caching issues. */ 1677 for (head_idx = 0; head_idx < count; head_idx++) 1678 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1679 1680 /*Prefetch descriptor index. */ 1681 rte_prefetch0(&vq->desc[head[packet_success]]); 1682 1683 while (packet_success != count) { 1684 /* Get descriptor from available ring */ 1685 desc = &vq->desc[head[packet_success]]; 1686 1687 buff = pkts[packet_success]; 1688 LOG_DEBUG(VHOST_DATA, 1689 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1690 "pkt[%d] descriptor idx: %d\n", 1691 dev->device_fh, packet_success, 1692 MBUF_HEADROOM_UINT32(buff)); 1693 1694 PRINT_PACKET(dev, 1695 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1696 + RTE_PKTMBUF_HEADROOM), 1697 rte_pktmbuf_data_len(buff), 0); 1698 1699 /* Buffer address translation for virtio header. */ 1700 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1701 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1702 1703 /* 1704 * If the descriptors are chained the header and data are 1705 * placed in separate buffers. 1706 */ 1707 if (desc->flags & VRING_DESC_F_NEXT) { 1708 desc->len = vq->vhost_hlen; 1709 desc = &vq->desc[desc->next]; 1710 desc->len = rte_pktmbuf_data_len(buff); 1711 } else { 1712 desc->len = packet_len; 1713 } 1714 1715 /* Update used ring with desc information */ 1716 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1717 = head[packet_success]; 1718 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1719 = packet_len; 1720 res_cur_idx++; 1721 packet_success++; 1722 1723 /* A header is required per buffer. */ 1724 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1725 (const void *)&virtio_hdr, vq->vhost_hlen); 1726 1727 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1728 1729 if (likely(packet_success < count)) { 1730 /* Prefetch descriptor index. */ 1731 rte_prefetch0(&vq->desc[head[packet_success]]); 1732 } 1733 } 1734 1735 rte_compiler_barrier(); 1736 1737 LOG_DEBUG(VHOST_DATA, 1738 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1739 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1740 dev->device_fh, vq->last_used_idx, vq->used->idx); 1741 1742 *(volatile uint16_t *)&vq->used->idx += count; 1743 vq->last_used_idx += count; 1744 1745 LOG_DEBUG(VHOST_DATA, 1746 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1747 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1748 dev->device_fh, vq->last_used_idx, vq->used->idx); 1749 1750 /* Kick the guest if necessary. 
*/ 1751 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1752 eventfd_write((int)vq->kickfd, 1); 1753 1754 return count; 1755 } 1756 1757 /* 1758 * This function routes the TX packet to the correct interface. 1759 * This may be a local device or the physical port. 1760 */ 1761 static inline void __attribute__((always_inline)) 1762 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1763 uint32_t desc_idx, uint8_t need_copy) 1764 { 1765 struct mbuf_table *tx_q; 1766 struct rte_mbuf **m_table; 1767 struct rte_mbuf *mbuf = NULL; 1768 unsigned len, ret, offset = 0; 1769 struct vpool *vpool; 1770 struct virtio_net_data_ll *dev_ll = ll_root_used; 1771 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1772 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1773 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1774 1775 /*Add packet to the port tx queue*/ 1776 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1777 len = tx_q->len; 1778 1779 /* Allocate an mbuf and populate the structure. */ 1780 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1781 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1782 if (unlikely(mbuf == NULL)) { 1783 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1784 RTE_LOG(ERR, VHOST_DATA, 1785 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1786 dev->device_fh); 1787 put_desc_to_used_list_zcp(vq, desc_idx); 1788 return; 1789 } 1790 1791 if (vm2vm_mode == VM2VM_HARDWARE) { 1792 /* Avoid using a vlan tag from any vm for external pkt, such as 1793 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1794 * selection, MAC address determines it as an external pkt 1795 * which should go to network, while vlan tag determine it as 1796 * a vm2vm pkt should forward to another vm. Hardware confuse 1797 * such a ambiguous situation, so pkt will lost. 1798 */ 1799 vlan_tag = external_pkt_default_vlan_tag; 1800 while (dev_ll != NULL) { 1801 if (likely(dev_ll->vdev->ready == DEVICE_RX) && 1802 ether_addr_cmp(&(pkt_hdr->d_addr), 1803 &dev_ll->vdev->mac_address)) { 1804 1805 /* 1806 * Drop the packet if the TX packet is destined 1807 * for the TX device. 1808 */ 1809 if (unlikely(dev_ll->vdev->dev->device_fh 1810 == dev->device_fh)) { 1811 LOG_DEBUG(VHOST_DATA, 1812 "(%"PRIu64") TX: Source and destination" 1813 "MAC addresses are the same. Dropping " 1814 "packet.\n", 1815 dev_ll->vdev->dev->device_fh); 1816 MBUF_HEADROOM_UINT32(mbuf) 1817 = (uint32_t)desc_idx; 1818 __rte_mbuf_raw_free(mbuf); 1819 return; 1820 } 1821 1822 /* 1823 * Packet length offset 4 bytes for HW vlan 1824 * strip when L2 switch back. 1825 */ 1826 offset = 4; 1827 vlan_tag = 1828 (uint16_t) 1829 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1830 1831 LOG_DEBUG(VHOST_DATA, 1832 "(%"PRIu64") TX: pkt to local VM device id:" 1833 "(%"PRIu64") vlan tag: %d.\n", 1834 dev->device_fh, dev_ll->vdev->dev->device_fh, 1835 vlan_tag); 1836 1837 break; 1838 } 1839 dev_ll = dev_ll->next; 1840 } 1841 } 1842 1843 mbuf->nb_segs = m->nb_segs; 1844 mbuf->next = m->next; 1845 mbuf->data_len = m->data_len + offset; 1846 mbuf->pkt_len = mbuf->data_len; 1847 if (unlikely(need_copy)) { 1848 /* Copy the packet contents to the mbuf. 
*/ 1849 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1850 rte_pktmbuf_mtod(m, void *), 1851 m->data_len); 1852 } else { 1853 mbuf->data_off = m->data_off; 1854 mbuf->buf_physaddr = m->buf_physaddr; 1855 mbuf->buf_addr = m->buf_addr; 1856 } 1857 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1858 mbuf->vlan_tci = vlan_tag; 1859 mbuf->l2_len = sizeof(struct ether_hdr); 1860 mbuf->l3_len = sizeof(struct ipv4_hdr); 1861 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1862 1863 tx_q->m_table[len] = mbuf; 1864 len++; 1865 1866 LOG_DEBUG(VHOST_DATA, 1867 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1868 dev->device_fh, 1869 mbuf->nb_segs, 1870 (mbuf->next == NULL) ? "null" : "non-null"); 1871 1872 if (enable_stats) { 1873 dev_statistics[dev->device_fh].tx_total++; 1874 dev_statistics[dev->device_fh].tx++; 1875 } 1876 1877 if (unlikely(len == MAX_PKT_BURST)) { 1878 m_table = (struct rte_mbuf **)tx_q->m_table; 1879 ret = rte_eth_tx_burst(ports[0], 1880 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1881 1882 /* 1883 * Free any buffers not handled by TX and update 1884 * the port stats. 1885 */ 1886 if (unlikely(ret < len)) { 1887 do { 1888 rte_pktmbuf_free(m_table[ret]); 1889 } while (++ret < len); 1890 } 1891 1892 len = 0; 1893 txmbuf_clean_zcp(dev, vpool); 1894 } 1895 1896 tx_q->len = len; 1897 1898 return; 1899 } 1900 1901 /* 1902 * This function transmits all available packets in the virtio TX queue for 1903 * one virtio-net device. On the first packet it also learns the MAC address 1904 * and sets up the VMDQ queue. 1905 */ 1906 static inline void __attribute__((always_inline)) 1907 virtio_dev_tx_zcp(struct virtio_net *dev) 1908 { 1909 struct rte_mbuf m; 1910 struct vhost_virtqueue *vq; 1911 struct vring_desc *desc; 1912 uint64_t buff_addr = 0, phys_addr; 1913 uint32_t head[MAX_PKT_BURST]; 1914 uint32_t i; 1915 uint16_t free_entries, packet_success = 0; 1916 uint16_t avail_idx; 1917 uint8_t need_copy = 0; 1918 hpa_type addr_type; 1919 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1920 1921 vq = dev->virtqueue[VIRTIO_TXQ]; 1922 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1923 1924 /* If there are no available buffers then return. */ 1925 if (vq->last_used_idx_res == avail_idx) 1926 return; 1927 1928 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1929 1930 /* Prefetch available ring to retrieve head indexes. */ 1931 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1932 1933 /* Get the number of free entries in the ring */ 1934 free_entries = (avail_idx - vq->last_used_idx_res); 1935 1936 /* Limit to MAX_PKT_BURST. */ 1937 free_entries 1938 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1939 1940 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1941 dev->device_fh, free_entries); 1942 1943 /* Retrieve all of the head indexes first to avoid caching issues. */ 1944 for (i = 0; i < free_entries; i++) 1945 head[i] 1946 = vq->avail->ring[(vq->last_used_idx_res + i) 1947 & (vq->size - 1)]; 1948 1949 vq->last_used_idx_res += free_entries; 1950 1951 /* Prefetch descriptor index. */ 1952 rte_prefetch0(&vq->desc[head[packet_success]]); 1953 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1954 1955 while (packet_success < free_entries) { 1956 desc = &vq->desc[head[packet_success]]; 1957 1958 /* Discard first buffer as it is the virtio header */ 1959 desc = &vq->desc[desc->next]; 1960 1961 /* Buffer address translation.
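 * desc->addr is a guest physical address (GPA); translate it both to a
 * vhost virtual address (for CPU access from this process) and to a host
 * physical address (for NIC DMA), noting whether it stays within a single
 * contiguous sub-region.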
*/ 1962 buff_addr = gpa_to_vva(dev, desc->addr); 1963 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type); 1964 1965 if (likely(packet_success < (free_entries - 1))) 1966 /* Prefetch descriptor index. */ 1967 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1968 1969 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1970 RTE_LOG(ERR, VHOST_DATA, 1971 "(%"PRIu64") Invalid frame buffer address found " 1972 "when transmitting packets!\n", 1973 dev->device_fh); 1974 packet_success++; 1975 continue; 1976 } 1977 1978 /* Prefetch buffer address. */ 1979 rte_prefetch0((void *)(uintptr_t)buff_addr); 1980 1981 /* 1982 * Set up a dummy mbuf. Its contents are copied to a real mbuf if 1983 * it is transmitted out of the physical port. 1984 */ 1985 m.data_len = desc->len; 1986 m.nb_segs = 1; 1987 m.next = NULL; 1988 m.data_off = 0; 1989 m.buf_addr = (void *)(uintptr_t)buff_addr; 1990 m.buf_physaddr = phys_addr; 1991 1992 /* 1993 * Check whether the frame buffer address from the guest crosses 1994 * a sub-region boundary or not. 1995 */ 1996 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1997 RTE_LOG(ERR, VHOST_DATA, 1998 "(%"PRIu64") Frame buffer address crossing a " 1999 "sub-region found when attaching TX frame " 2000 "buffer address!\n", 2001 dev->device_fh); 2002 need_copy = 1; 2003 } else 2004 need_copy = 0; 2005 2006 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2007 2008 /* 2009 * If this is the first received packet we need to learn 2010 * the MAC address and set up VMDQ. 2011 */ 2012 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2013 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2014 /* 2015 * Discard frame if device is scheduled for 2016 * removal or a duplicate MAC address is found. 2017 */ 2018 packet_success += free_entries; 2019 vq->last_used_idx += packet_success; 2020 break; 2021 } 2022 } 2023 2024 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2025 packet_success++; 2026 } 2027 } 2028 2029 /* 2030 * This function is called by each data core. It handles all RX/TX registered 2031 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2032 * addresses are compared with all devices in the main linked list. 2033 */ 2034 static int 2035 switch_worker_zcp(__attribute__((unused)) void *arg) 2036 { 2037 struct virtio_net *dev = NULL; 2038 struct vhost_dev *vdev = NULL; 2039 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2040 struct virtio_net_data_ll *dev_ll; 2041 struct mbuf_table *tx_q; 2042 volatile struct lcore_ll_info *lcore_ll; 2043 const uint64_t drain_tsc 2044 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2045 * BURST_TX_DRAIN_US; 2046 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2047 unsigned ret; 2048 const uint16_t lcore_id = rte_lcore_id(); 2049 uint16_t count_in_ring, rx_count = 0; 2050 2051 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id); 2052 2053 lcore_ll = lcore_info[lcore_id].lcore_ll; 2054 prev_tsc = 0; 2055 2056 while (1) { 2057 cur_tsc = rte_rdtsc(); 2058 2059 /* TX burst queue drain */ 2060 diff_tsc = cur_tsc - prev_tsc; 2061 if (unlikely(diff_tsc > drain_tsc)) { 2062 /* 2063 * Get mbuf from vpool.pool and detach mbuf and 2064 * put back into vpool.ring.
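 * This is the periodic TX drain: if a queue has held packets for longer
 * than BURST_TX_DRAIN_US, transmit them now and recycle the completed
 * mbufs through txmbuf_clean_zcp().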
2065 */ 2066 dev_ll = lcore_ll->ll_root_used; 2067 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2068 /* Get virtio device ID */ 2069 vdev = dev_ll->vdev; 2070 dev = vdev->dev; 2071 2072 if (likely(!vdev->remove)) { 2073 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2074 if (tx_q->len) { 2075 LOG_DEBUG(VHOST_DATA, 2076 "TX queue drained after timeout" 2077 " with burst size %u\n", 2078 tx_q->len); 2079 2080 /* 2081 * Tx any packets in the queue 2082 */ 2083 ret = rte_eth_tx_burst( 2084 ports[0], 2085 (uint16_t)tx_q->txq_id, 2086 (struct rte_mbuf **) 2087 tx_q->m_table, 2088 (uint16_t)tx_q->len); 2089 if (unlikely(ret < tx_q->len)) { 2090 do { 2091 rte_pktmbuf_free( 2092 tx_q->m_table[ret]); 2093 } while (++ret < tx_q->len); 2094 } 2095 tx_q->len = 0; 2096 2097 txmbuf_clean_zcp(dev, 2098 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2099 } 2100 } 2101 dev_ll = dev_ll->next; 2102 } 2103 prev_tsc = cur_tsc; 2104 } 2105 2106 rte_prefetch0(lcore_ll->ll_root_used); 2107 2108 /* 2109 * Inform the configuration core that we have exited the linked 2110 * list and that no devices are in use if requested. 2111 */ 2112 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2113 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2114 2115 /* Process devices */ 2116 dev_ll = lcore_ll->ll_root_used; 2117 2118 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2119 vdev = dev_ll->vdev; 2120 dev = vdev->dev; 2121 if (unlikely(vdev->remove)) { 2122 dev_ll = dev_ll->next; 2123 unlink_vmdq(vdev); 2124 vdev->ready = DEVICE_SAFE_REMOVE; 2125 continue; 2126 } 2127 2128 if (likely(vdev->ready == DEVICE_RX)) { 2129 uint32_t index = vdev->vmdq_rx_q; 2130 uint16_t i; 2131 count_in_ring 2132 = rte_ring_count(vpool_array[index].ring); 2133 uint16_t free_entries 2134 = (uint16_t)get_available_ring_num_zcp(dev); 2135 2136 /* 2137 * Attach all mbufs in vpool.ring and put back 2138 * into vpool.pool. 2139 */ 2140 for (i = 0; 2141 i < RTE_MIN(free_entries, 2142 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2143 i++) 2144 attach_rxmbuf_zcp(dev); 2145 2146 /* Handle guest RX */ 2147 rx_count = rte_eth_rx_burst(ports[0], 2148 vdev->vmdq_rx_q, pkts_burst, 2149 MAX_PKT_BURST); 2150 2151 if (rx_count) { 2152 ret_count = virtio_dev_rx_zcp(dev, 2153 pkts_burst, rx_count); 2154 if (enable_stats) { 2155 dev_statistics[dev->device_fh].rx_total 2156 += rx_count; 2157 dev_statistics[dev->device_fh].rx 2158 += ret_count; 2159 } 2160 while (likely(rx_count)) { 2161 rx_count--; 2162 pktmbuf_detach_zcp( 2163 pkts_burst[rx_count]); 2164 rte_ring_sp_enqueue( 2165 vpool_array[index].ring, 2166 (void *)pkts_burst[rx_count]); 2167 } 2168 } 2169 } 2170 2171 if (likely(!vdev->remove)) 2172 /* Handle guest TX */ 2173 virtio_dev_tx_zcp(dev); 2174 2175 /* Move to the next device in the list */ 2176 dev_ll = dev_ll->next; 2177 } 2178 } 2179 2180 return 0; 2181 } 2182 2183 2184 /* 2185 * Add an entry to a used linked list. A free entry must first be found 2186 * in the free linked list using get_data_ll_free_entry(); 2187 */ 2188 static void 2189 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2190 struct virtio_net_data_ll *ll_dev) 2191 { 2192 struct virtio_net_data_ll *ll = *ll_root_addr; 2193 2194 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2195 ll_dev->next = NULL; 2196 rte_compiler_barrier(); 2197 2198 /* If ll == NULL then this is the first device. */ 2199 if (ll) { 2200 /* Increment to the tail of the linked list. 
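 * New entries are appended at the tail, after the barrier above, so that
 * data cores walking the list never observe a half-initialised node.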
*/ 2201 while (ll->next != NULL) 2202 ll = ll->next; 2203 2204 ll->next = ll_dev; 2205 } else { 2206 *ll_root_addr = ll_dev; 2207 } 2208 } 2209 2210 /* 2211 * Remove an entry from a used linked list. The entry must then be added to 2212 * the free linked list using put_data_ll_free_entry(). 2213 */ 2214 static void 2215 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2216 struct virtio_net_data_ll *ll_dev, 2217 struct virtio_net_data_ll *ll_dev_last) 2218 { 2219 struct virtio_net_data_ll *ll = *ll_root_addr; 2220 2221 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2222 return; 2223 2224 if (ll_dev == ll) 2225 *ll_root_addr = ll_dev->next; 2226 else 2227 if (likely(ll_dev_last != NULL)) 2228 ll_dev_last->next = ll_dev->next; 2229 else 2230 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n"); 2231 } 2232 2233 /* 2234 * Find and return an entry from the free linked list. 2235 */ 2236 static struct virtio_net_data_ll * 2237 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2238 { 2239 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2240 struct virtio_net_data_ll *ll_dev; 2241 2242 if (ll_free == NULL) 2243 return NULL; 2244 2245 ll_dev = ll_free; 2246 *ll_root_addr = ll_free->next; 2247 2248 return ll_dev; 2249 } 2250 2251 /* 2252 * Place an entry back on to the free linked list. 2253 */ 2254 static void 2255 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2256 struct virtio_net_data_ll *ll_dev) 2257 { 2258 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2259 2260 if (ll_dev == NULL) 2261 return; 2262 2263 ll_dev->next = ll_free; 2264 *ll_root_addr = ll_dev; 2265 } 2266 2267 /* 2268 * Creates a linked list of a given size. 2269 */ 2270 static struct virtio_net_data_ll * 2271 alloc_data_ll(uint32_t size) 2272 { 2273 struct virtio_net_data_ll *ll_new; 2274 uint32_t i; 2275 2276 /* Malloc and then chain the linked list. */ 2277 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2278 if (ll_new == NULL) { 2279 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2280 return NULL; 2281 } 2282 2283 for (i = 0; i < size - 1; i++) { 2284 ll_new[i].vdev = NULL; 2285 ll_new[i].next = &ll_new[i+1]; 2286 } 2287 ll_new[i].next = NULL; 2288 2289 return (ll_new); 2290 } 2291 2292 /* 2293 * Create the main linked list along with each individual core's linked list. A used and a free list 2294 * are created to manage entries. 2295 */ 2296 static int 2297 init_data_ll(void) 2298 { 2299 int lcore; 2300 2301 RTE_LCORE_FOREACH_SLAVE(lcore) { 2302 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2303 if (lcore_info[lcore].lcore_ll == NULL) { 2304 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2305 return -1; 2306 } 2307 2308 lcore_info[lcore].lcore_ll->device_num = 0; 2309 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2310 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2311 if (num_devices % num_switching_cores) 2312 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2313 else 2314 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2315 } 2316 2317 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2318 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2319 2320 return 0; 2321 } 2322 2323 /* 2324 * Set virtqueue flags so that we do not receive interrupts.
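 * Setting VRING_USED_F_NO_NOTIFY in each used ring tells the guest driver
 * not to kick the host for every packet; the data cores poll the rings
 * instead.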
*/ 2326 static void 2327 set_irq_status(struct virtio_net *dev) 2328 { 2329 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2330 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY; 2331 } 2332 2333 /* 2334 * Remove a device from the specific data core linked list and from the main linked list. Synchronization 2335 * occurs through the use of the lcore dev_removal_flag. The device is passed as volatile to prevent re-ordering 2336 * of the dev->remove=1 store, which could otherwise cause an infinite loop in the rte_pause() wait loop. 2337 */ 2338 static void 2339 destroy_device(volatile struct virtio_net *dev) 2340 { 2341 struct virtio_net_data_ll *ll_lcore_dev_cur; 2342 struct virtio_net_data_ll *ll_main_dev_cur; 2343 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2344 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2345 struct vhost_dev *vdev; 2346 int lcore; 2347 2348 dev->flags &= ~VIRTIO_DEV_RUNNING; 2349 2350 vdev = (struct vhost_dev *)dev->priv; 2351 /* Set the remove flag. */ 2352 vdev->remove = 1; 2353 while (vdev->ready != DEVICE_SAFE_REMOVE) { 2354 rte_pause(); 2355 } 2356 2357 /* Search for entry to be removed from lcore ll */ 2358 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2359 while (ll_lcore_dev_cur != NULL) { 2360 if (ll_lcore_dev_cur->vdev == vdev) { 2361 break; 2362 } else { 2363 ll_lcore_dev_last = ll_lcore_dev_cur; 2364 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2365 } 2366 } 2367 2368 if (ll_lcore_dev_cur == NULL) { 2369 RTE_LOG(ERR, VHOST_CONFIG, 2370 "(%"PRIu64") Failed to find the device to be destroyed.\n", 2371 dev->device_fh); 2372 return; 2373 } 2374 2375 /* Search for entry to be removed from main ll */ 2376 ll_main_dev_cur = ll_root_used; 2377 ll_main_dev_last = NULL; 2378 while (ll_main_dev_cur != NULL) { 2379 if (ll_main_dev_cur->vdev == vdev) { 2380 break; 2381 } else { 2382 ll_main_dev_last = ll_main_dev_cur; 2383 ll_main_dev_cur = ll_main_dev_cur->next; 2384 } 2385 } 2386 2387 /* Remove entries from the lcore and main ll. */ 2388 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2389 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2390 2391 /* Set the dev_removal_flag on each lcore. */ 2392 RTE_LCORE_FOREACH_SLAVE(lcore) { 2393 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2394 } 2395 2396 /* 2397 * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL we can be sure that 2398 * it can no longer access the device removed from the linked lists and that the device 2399 * is no longer in use. 2400 */ 2401 RTE_LCORE_FOREACH_SLAVE(lcore) { 2402 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2403 rte_pause(); 2404 } 2405 } 2406 2407 /* Add the entries back to the lcore and main free ll. */ 2408 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2409 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2410 2411 /* Decrement the number of devices on the lcore. */ 2412 lcore_info[vdev->coreid].lcore_ll->device_num--; 2413 2414 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2415 2416 if (zero_copy) { 2417 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2418 2419 /* Stop the RX queue.
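 * For zero copy the RX queue must be quiesced before its mbufs, which
 * point into guest memory, are destroyed and returned to the ring below.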
*/ 2420 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2421 LOG_DEBUG(VHOST_CONFIG, 2422 "(%"PRIu64") In destroy_device: Failed to stop " 2423 "rx queue:%d\n", 2424 dev->device_fh, 2425 vdev->vmdq_rx_q); 2426 } 2427 2428 LOG_DEBUG(VHOST_CONFIG, 2429 "(%"PRIu64") in destroy_device: Start putting mbufs in " 2430 "mempool back to ring for RX queue: %d\n", 2431 dev->device_fh, vdev->vmdq_rx_q); 2432 2433 mbuf_destroy_zcp(vpool); 2434 2435 /* Stop the TX queue. */ 2436 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2437 LOG_DEBUG(VHOST_CONFIG, 2438 "(%"PRIu64") In destroy_device: Failed to " 2439 "stop tx queue:%d\n", 2440 dev->device_fh, vdev->vmdq_rx_q); 2441 } 2442 2443 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2444 2445 LOG_DEBUG(VHOST_CONFIG, 2446 "(%"PRIu64") destroy_device: Start putting mbufs in mempool " 2447 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2448 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2449 dev->device_fh); 2450 2451 mbuf_destroy_zcp(vpool); 2452 rte_free(vdev->regions_hpa); 2453 } 2454 rte_free(vdev); 2455 2456 } 2457 2458 /* 2459 * Calculate the number of physically contiguous sub-regions within one 2460 * particular region whose vhost virtual address range is contiguous. The 2461 * region starts at vva_start and is 'size' bytes long. 2462 */ 2463 static uint32_t 2464 check_hpa_regions(uint64_t vva_start, uint64_t size) 2465 { 2466 uint32_t i, nregions = 0, page_size = getpagesize(); 2467 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2468 if (vva_start % page_size) { 2469 LOG_DEBUG(VHOST_CONFIG, 2470 "in check_continuous: vva start(%p) mod page_size(%d) " 2471 "has remainder\n", 2472 (void *)(uintptr_t)vva_start, page_size); 2473 return 0; 2474 } 2475 if (size % page_size) { 2476 LOG_DEBUG(VHOST_CONFIG, 2477 "in check_continuous: " 2478 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2479 size, page_size); 2480 return 0; 2481 } 2482 for (i = 0; i < size - page_size; i = i + page_size) { 2483 cur_phys_addr 2484 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2485 next_phys_addr = rte_mem_virt2phy( 2486 (void *)(uintptr_t)(vva_start + i + page_size)); 2487 if ((cur_phys_addr + page_size) != next_phys_addr) { 2488 ++nregions; 2489 LOG_DEBUG(VHOST_CONFIG, 2490 "in check_continuous: hva addr:(%p) is not " 2491 "continuous with hva addr:(%p), diff:%d\n", 2492 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2493 (void *)(uintptr_t)(vva_start + (uint64_t)i 2494 + page_size), page_size); 2495 LOG_DEBUG(VHOST_CONFIG, 2496 "in check_continuous: hpa addr:(%p) is not " 2497 "continuous with hpa addr:(%p), " 2498 "diff:(%"PRIu64")\n", 2499 (void *)(uintptr_t)cur_phys_addr, 2500 (void *)(uintptr_t)next_phys_addr, 2501 (next_phys_addr-cur_phys_addr)); 2502 } 2503 } 2504 return nregions; 2505 } 2506 2507 /* 2508 * Divide each region whose vhost virtual address range is contiguous into 2509 * sub-regions within which the host physical addresses are also contiguous, 2510 * and fill the offset (relative to the GPA), size and other information of 2511 * each sub-region into regions_hpa.
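 * Each sub-region entry records host_phys_addr_offset = HPA(start) -
 * GPA(start), so a guest physical address inside that sub-region can be
 * translated with a single addition: hpa = gpa + host_phys_addr_offset.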
2512 */ 2513 static uint32_t 2514 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2515 { 2516 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2517 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2518 2519 if (mem_region_hpa == NULL) 2520 return 0; 2521 2522 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2523 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2524 virtio_memory->regions[regionidx].address_offset; 2525 mem_region_hpa[regionidx_hpa].guest_phys_address 2526 = virtio_memory->regions[regionidx].guest_phys_address; 2527 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2528 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2529 mem_region_hpa[regionidx_hpa].guest_phys_address; 2530 LOG_DEBUG(VHOST_CONFIG, 2531 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2532 regionidx_hpa, 2533 (void *)(uintptr_t) 2534 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2535 LOG_DEBUG(VHOST_CONFIG, 2536 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2537 regionidx_hpa, 2538 (void *)(uintptr_t) 2539 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2540 for (i = 0, k = 0; 2541 i < virtio_memory->regions[regionidx].memory_size - 2542 page_size; 2543 i += page_size) { 2544 cur_phys_addr = rte_mem_virt2phy( 2545 (void *)(uintptr_t)(vva_start + i)); 2546 next_phys_addr = rte_mem_virt2phy( 2547 (void *)(uintptr_t)(vva_start + 2548 i + page_size)); 2549 if ((cur_phys_addr + page_size) != next_phys_addr) { 2550 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2551 mem_region_hpa[regionidx_hpa].guest_phys_address + 2552 k + page_size; 2553 mem_region_hpa[regionidx_hpa].memory_size 2554 = k + page_size; 2555 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2556 "phys addr end [%d]:(%p)\n", 2557 regionidx_hpa, 2558 (void *)(uintptr_t) 2559 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2560 LOG_DEBUG(VHOST_CONFIG, 2561 "in fill_hpa_regions: guest phys addr " 2562 "size [%d]:(%p)\n", 2563 regionidx_hpa, 2564 (void *)(uintptr_t) 2565 (mem_region_hpa[regionidx_hpa].memory_size)); 2566 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2567 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2568 ++regionidx_hpa; 2569 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2570 next_phys_addr - 2571 mem_region_hpa[regionidx_hpa].guest_phys_address; 2572 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2573 " phys addr start[%d]:(%p)\n", 2574 regionidx_hpa, 2575 (void *)(uintptr_t) 2576 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2577 LOG_DEBUG(VHOST_CONFIG, 2578 "in fill_hpa_regions: host phys addr " 2579 "start[%d]:(%p)\n", 2580 regionidx_hpa, 2581 (void *)(uintptr_t) 2582 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2583 k = 0; 2584 } else { 2585 k += page_size; 2586 } 2587 } 2588 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2589 = mem_region_hpa[regionidx_hpa].guest_phys_address 2590 + k + page_size; 2591 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2592 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2593 "[%d]:(%p)\n", regionidx_hpa, 2594 (void *)(uintptr_t) 2595 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2596 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2597 "[%d]:(%p)\n", regionidx_hpa, 2598 (void *)(uintptr_t) 2599 (mem_region_hpa[regionidx_hpa].memory_size)); 2600 ++regionidx_hpa; 2601 } 2602 return 
regionidx_hpa; 2603 } 2604 2605 /* 2606 * A new device is added to a data core. First the device is added to the main linked list 2607 * and then allocated to a specific data core. 2608 */ 2609 static int 2610 new_device(struct virtio_net *dev) 2611 { 2612 struct virtio_net_data_ll *ll_dev; 2613 int lcore, core_add = 0; 2614 uint32_t device_num_min = num_devices; 2615 struct vhost_dev *vdev; 2616 uint32_t regionidx; 2617 2618 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE); 2619 if (vdev == NULL) { 2620 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2621 dev->device_fh); 2622 return -1; 2623 } 2624 vdev->dev = dev; 2625 dev->priv = vdev; 2626 2627 if (zero_copy) { 2628 vdev->nregions_hpa = dev->mem->nregions; 2629 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2630 vdev->nregions_hpa 2631 += check_hpa_regions( 2632 dev->mem->regions[regionidx].guest_phys_address 2633 + dev->mem->regions[regionidx].address_offset, 2634 dev->mem->regions[regionidx].memory_size); 2635 2636 } 2637 2638 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2639 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2640 CACHE_LINE_SIZE); 2641 if (vdev->regions_hpa == NULL) { 2642 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2643 rte_free(vdev); 2644 return -1; 2645 } 2646 2647 2648 if (fill_hpa_memory_regions( 2649 vdev->regions_hpa, dev->mem 2650 ) != vdev->nregions_hpa) { 2651 2652 RTE_LOG(ERR, VHOST_CONFIG, 2653 "hpa memory regions number mismatch: " 2654 "[%d]\n", vdev->nregions_hpa); 2655 rte_free(vdev->regions_hpa); 2656 rte_free(vdev); 2657 return -1; 2658 } 2659 } 2660 2661 2662 /* Add device to main ll */ 2663 ll_dev = get_data_ll_free_entry(&ll_root_free); 2664 if (ll_dev == NULL) { 2665 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2666 "of %d devices per core has been reached\n", 2667 dev->device_fh, num_devices); 2668 if (vdev->regions_hpa) 2669 rte_free(vdev->regions_hpa); 2670 rte_free(vdev); 2671 return -1; 2672 } 2673 ll_dev->vdev = vdev; 2674 add_data_ll_entry(&ll_root_used, ll_dev); 2675 vdev->vmdq_rx_q 2676 = dev->device_fh * (num_queues / num_devices); 2677 2678 if (zero_copy) { 2679 uint32_t index = vdev->vmdq_rx_q; 2680 uint32_t count_in_ring, i; 2681 struct mbuf_table *tx_q; 2682 2683 count_in_ring = rte_ring_count(vpool_array[index].ring); 2684 2685 LOG_DEBUG(VHOST_CONFIG, 2686 "(%"PRIu64") in new_device: mbuf count in mempool " 2687 "before attach is: %d\n", 2688 dev->device_fh, 2689 rte_mempool_count(vpool_array[index].pool)); 2690 LOG_DEBUG(VHOST_CONFIG, 2691 "(%"PRIu64") in new_device: mbuf count in ring " 2692 "before attach is: %d\n", 2693 dev->device_fh, count_in_ring); 2694 2695 /* 2696 * Attach all mbufs in vpool.ring and put back into vpool.pool.
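 * attach_rxmbuf_zcp() binds each mbuf to a free guest RX descriptor so that
 * the NIC can DMA received frames directly into guest memory.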
*/ 2698 for (i = 0; i < count_in_ring; i++) 2699 attach_rxmbuf_zcp(dev); 2700 2701 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2702 "mempool after attach is: %d\n", 2703 dev->device_fh, 2704 rte_mempool_count(vpool_array[index].pool)); 2705 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2706 "ring after attach is: %d\n", 2707 dev->device_fh, 2708 rte_ring_count(vpool_array[index].ring)); 2709 2710 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2711 tx_q->txq_id = vdev->vmdq_rx_q; 2712 2713 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2714 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2715 2716 LOG_DEBUG(VHOST_CONFIG, 2717 "(%"PRIu64") In new_device: Failed to start " 2718 "tx queue:%d\n", 2719 dev->device_fh, vdev->vmdq_rx_q); 2720 2721 mbuf_destroy_zcp(vpool); 2722 rte_free(vdev->regions_hpa); 2723 rte_free(vdev); 2724 return -1; 2725 } 2726 2727 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2728 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2729 2730 LOG_DEBUG(VHOST_CONFIG, 2731 "(%"PRIu64") In new_device: Failed to start " 2732 "rx queue:%d\n", 2733 dev->device_fh, vdev->vmdq_rx_q); 2734 2735 /* Stop the TX queue. */ 2736 if (rte_eth_dev_tx_queue_stop(ports[0], 2737 vdev->vmdq_rx_q) != 0) { 2738 LOG_DEBUG(VHOST_CONFIG, 2739 "(%"PRIu64") In new_device: Failed to " 2740 "stop tx queue:%d\n", 2741 dev->device_fh, vdev->vmdq_rx_q); 2742 } 2743 2744 mbuf_destroy_zcp(vpool); 2745 rte_free(vdev->regions_hpa); 2746 rte_free(vdev); 2747 return -1; 2748 } 2749 2750 } 2751 2752 /* Reset the ready flag. */ 2753 vdev->ready = DEVICE_MAC_LEARNING; 2754 vdev->remove = 0; 2755 2756 /* Find a suitable lcore to add the device. */ 2757 RTE_LCORE_FOREACH_SLAVE(lcore) { 2758 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2759 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2760 core_add = lcore; 2761 } 2762 } 2763 /* Add device to lcore ll */ 2764 ll_dev->dev->coreid = core_add; 2765 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free); 2766 if (ll_dev == NULL) { 2767 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2768 vdev->ready = DEVICE_SAFE_REMOVE; 2769 destroy_device(dev); 2770 if (vdev->regions_hpa) 2771 rte_free(vdev->regions_hpa); 2772 rte_free(vdev); 2773 return -1; 2774 } 2775 ll_dev->vdev = vdev; 2776 vdev->coreid = core_add; 2777 2778 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev); 2779 2780 /* Initialize device stats */ 2781 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2782 2783 /* Disable notifications. */ 2784 set_irq_status(dev); 2785 lcore_info[vdev->coreid].lcore_ll->device_num++; 2786 dev->flags |= VIRTIO_DEV_RUNNING; 2787 2788 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2789 2790 return 0; 2791 } 2792 2793 /* 2794 * These callbacks allow devices to be added to the data core when configuration 2795 * has been fully completed. 2796 */ 2797 static const struct virtio_net_device_ops virtio_net_device_ops = 2798 { 2799 .new_device = new_device, 2800 .destroy_device = destroy_device, 2801 }; 2802 2803 /* 2804 * This is a thread that wakes up periodically to print statistics if the user 2805 * has enabled them.
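 * It sleeps for 'enable_stats' seconds between refreshes and redraws the
 * per-device RX/TX counters on a cleared screen.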
2806 */ 2807 static void 2808 print_stats(void) 2809 { 2810 struct virtio_net_data_ll *dev_ll; 2811 uint64_t tx_dropped, rx_dropped; 2812 uint64_t tx, tx_total, rx, rx_total; 2813 uint32_t device_fh; 2814 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2815 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2816 2817 while(1) { 2818 sleep(enable_stats); 2819 2820 /* Clear screen and move to top left */ 2821 printf("%s%s", clr, top_left); 2822 2823 printf("\nDevice statistics ===================================="); 2824 2825 dev_ll = ll_root_used; 2826 while (dev_ll != NULL) { 2827 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2828 tx_total = dev_statistics[device_fh].tx_total; 2829 tx = dev_statistics[device_fh].tx; 2830 tx_dropped = tx_total - tx; 2831 if (zero_copy == 0) { 2832 rx_total = rte_atomic64_read( 2833 &dev_statistics[device_fh].rx_total_atomic); 2834 rx = rte_atomic64_read( 2835 &dev_statistics[device_fh].rx_atomic); 2836 } else { 2837 rx_total = dev_statistics[device_fh].rx_total; 2838 rx = dev_statistics[device_fh].rx; 2839 } 2840 rx_dropped = rx_total - rx; 2841 2842 printf("\nStatistics for device %"PRIu32" ------------------------------" 2843 "\nTX total: %"PRIu64"" 2844 "\nTX dropped: %"PRIu64"" 2845 "\nTX successful: %"PRIu64"" 2846 "\nRX total: %"PRIu64"" 2847 "\nRX dropped: %"PRIu64"" 2848 "\nRX successful: %"PRIu64"", 2849 device_fh, 2850 tx_total, 2851 tx_dropped, 2852 tx, 2853 rx_total, 2854 rx_dropped, 2855 rx); 2856 2857 dev_ll = dev_ll->next; 2858 } 2859 printf("\n======================================================\n"); 2860 } 2861 } 2862 2863 static void 2864 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2865 char *ring_name, uint32_t nb_mbuf) 2866 { 2867 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2868 vpool_array[index].pool 2869 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2870 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2871 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2872 rte_pktmbuf_init, NULL, socket, 0); 2873 if (vpool_array[index].pool != NULL) { 2874 vpool_array[index].ring 2875 = rte_ring_create(ring_name, 2876 rte_align32pow2(nb_mbuf + 1), 2877 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2878 if (likely(vpool_array[index].ring != NULL)) { 2879 LOG_DEBUG(VHOST_CONFIG, 2880 "in setup_mempool_tbl: mbuf count in " 2881 "mempool is: %d\n", 2882 rte_mempool_count(vpool_array[index].pool)); 2883 LOG_DEBUG(VHOST_CONFIG, 2884 "in setup_mempool_tbl: mbuf count in " 2885 "ring is: %d\n", 2886 rte_ring_count(vpool_array[index].ring)); 2887 } else { 2888 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2889 ring_name); 2890 } 2891 2892 /* Need consider head room. */ 2893 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2894 } else { 2895 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2896 } 2897 } 2898 2899 2900 /* 2901 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2902 * device is also registered here to handle the IOCTLs. 
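 * Order of operations: EAL init, application argument parsing, mbuf
 * pool/ring creation (plain or zero-copy), port initialisation, linked-list
 * and statistics setup, data-core launch, then CUSE registration and the
 * vhost session loop.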
*/ 2904 int 2905 MAIN(int argc, char *argv[]) 2906 { 2907 struct rte_mempool *mbuf_pool = NULL; 2908 unsigned lcore_id, core_id = 0; 2909 unsigned nb_ports, valid_num_ports; 2910 int ret; 2911 uint8_t portid, queue_id = 0; 2912 static pthread_t tid; 2913 2914 /* init EAL */ 2915 ret = rte_eal_init(argc, argv); 2916 if (ret < 0) 2917 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2918 argc -= ret; 2919 argv += ret; 2920 2921 /* parse app arguments */ 2922 ret = us_vhost_parse_args(argc, argv); 2923 if (ret < 0) 2924 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2925 2926 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) 2927 if (rte_lcore_is_enabled(lcore_id)) 2928 lcore_ids[core_id++] = lcore_id; 2929 2930 if (rte_lcore_count() > RTE_MAX_LCORE) 2931 rte_exit(EXIT_FAILURE, "Not enough cores\n"); 2932 2933 /* Set the number of switching cores available. */ 2934 num_switching_cores = rte_lcore_count() - 1; 2935 2936 /* Get the number of physical ports. */ 2937 nb_ports = rte_eth_dev_count(); 2938 if (nb_ports > RTE_MAX_ETHPORTS) 2939 nb_ports = RTE_MAX_ETHPORTS; 2940 2941 /* 2942 * Update the global var NUM_PORTS and global array PORTS, 2943 * and get the value of VALID_NUM_PORTS from the number of ports in the system. 2944 */ 2945 valid_num_ports = check_ports_num(nb_ports); 2946 2947 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2948 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, " 2949 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS); 2950 return -1; 2951 } 2952 2953 if (zero_copy == 0) { 2954 /* Create the mbuf pool. */ 2955 mbuf_pool = rte_mempool_create( 2956 "MBUF_POOL", 2957 NUM_MBUFS_PER_PORT 2958 * valid_num_ports, 2959 MBUF_SIZE, MBUF_CACHE_SIZE, 2960 sizeof(struct rte_pktmbuf_pool_private), 2961 rte_pktmbuf_pool_init, NULL, 2962 rte_pktmbuf_init, NULL, 2963 rte_socket_id(), 0); 2964 if (mbuf_pool == NULL) 2965 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2966 2967 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2968 vpool_array[queue_id].pool = mbuf_pool; 2969 2970 if (vm2vm_mode == VM2VM_HARDWARE) { 2971 /* Enable VT loop back to let L2 switch to do it. */ 2972 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2973 LOG_DEBUG(VHOST_CONFIG, 2974 "Enable loop back for L2 switch in vmdq.\n"); 2975 } 2976 } else { 2977 uint32_t nb_mbuf; 2978 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2979 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2980 2981 /* 2982 * Zero copy defers queue RX/TX start to the time when guest 2983 * finishes its startup and packet buffers from that guest are 2984 * available.
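 * The deferred_start flags set below keep the queues stopped after
 * port_init(); they are started later in new_device(), once the guest's
 * buffers have been attached.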
2985 */ 2986 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy; 2987 rx_conf_default.rx_drop_en = 0; 2988 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy; 2989 nb_mbuf = num_rx_descriptor 2990 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2991 + num_switching_cores * MAX_PKT_BURST; 2992 2993 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2994 snprintf(pool_name, sizeof(pool_name), 2995 "rxmbuf_pool_%u", queue_id); 2996 snprintf(ring_name, sizeof(ring_name), 2997 "rxmbuf_ring_%u", queue_id); 2998 setup_mempool_tbl(rte_socket_id(), queue_id, 2999 pool_name, ring_name, nb_mbuf); 3000 } 3001 3002 nb_mbuf = num_tx_descriptor 3003 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3004 + num_switching_cores * MAX_PKT_BURST; 3005 3006 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3007 snprintf(pool_name, sizeof(pool_name), 3008 "txmbuf_pool_%u", queue_id); 3009 snprintf(ring_name, sizeof(ring_name), 3010 "txmbuf_ring_%u", queue_id); 3011 setup_mempool_tbl(rte_socket_id(), 3012 (queue_id + MAX_QUEUES), 3013 pool_name, ring_name, nb_mbuf); 3014 } 3015 3016 if (vm2vm_mode == VM2VM_HARDWARE) { 3017 /* Enable VT loop back to let L2 switch to do it. */ 3018 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3019 LOG_DEBUG(VHOST_CONFIG, 3020 "Enable loop back for L2 switch in vmdq.\n"); 3021 } 3022 } 3023 /* Set log level. */ 3024 rte_set_log_level(LOG_LEVEL); 3025 3026 /* initialize all ports */ 3027 for (portid = 0; portid < nb_ports; portid++) { 3028 /* skip ports that are not enabled */ 3029 if ((enabled_port_mask & (1 << portid)) == 0) { 3030 RTE_LOG(INFO, VHOST_PORT, 3031 "Skipping disabled port %d\n", portid); 3032 continue; 3033 } 3034 if (port_init(portid) != 0) 3035 rte_exit(EXIT_FAILURE, 3036 "Cannot initialize network ports\n"); 3037 } 3038 3039 /* Initialise all linked lists. */ 3040 if (init_data_ll() == -1) 3041 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3042 3043 /* Initialize device stats */ 3044 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3045 3046 /* Enable stats if the user option is set. */ 3047 if (enable_stats) 3048 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3049 3050 /* Launch all data cores. */ 3051 if (zero_copy == 0) { 3052 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3053 rte_eal_remote_launch(switch_worker, 3054 mbuf_pool, lcore_id); 3055 } 3056 } else { 3057 uint32_t count_in_mempool, index, i; 3058 for (index = 0; index < 2*MAX_QUEUES; index++) { 3059 /* For all RX and TX queues. */ 3060 count_in_mempool 3061 = rte_mempool_count(vpool_array[index].pool); 3062 3063 /* 3064 * Transfer all un-attached mbufs from vpool.pool 3065 * to vpoo.ring. 3066 */ 3067 for (i = 0; i < count_in_mempool; i++) { 3068 struct rte_mbuf *mbuf 3069 = __rte_mbuf_raw_alloc( 3070 vpool_array[index].pool); 3071 rte_ring_sp_enqueue(vpool_array[index].ring, 3072 (void *)mbuf); 3073 } 3074 3075 LOG_DEBUG(VHOST_CONFIG, 3076 "in MAIN: mbuf count in mempool at initial " 3077 "is: %d\n", count_in_mempool); 3078 LOG_DEBUG(VHOST_CONFIG, 3079 "in MAIN: mbuf count in ring at initial is :" 3080 " %d\n", 3081 rte_ring_count(vpool_array[index].ring)); 3082 } 3083 3084 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3085 rte_eal_remote_launch(switch_worker_zcp, NULL, 3086 lcore_id); 3087 } 3088 3089 /* Register CUSE device to handle IOCTLs. 
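 * rte_vhost_driver_register() creates the vhost character device named by
 * dev_basename; the new_device/destroy_device callbacks registered below are
 * invoked from the CUSE session as guests come and go.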
*/ 3090 ret = rte_vhost_driver_register((char *)&dev_basename); 3091 if (ret != 0) 3092 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3093 3094 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3095 3096 /* Start CUSE session. */ 3097 rte_vhost_driver_session_start(); 3098 return 0; 3099 3100 } 3101 3102