/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 512

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +	\
			(num_switching_cores*MAX_PKT_BURST) +		\
			(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
			(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * The zero copy implementation does not require frame data buffers to be
 * allocated by the host: the guest allocates the frame data buffers and
 * vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2
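/*
 * Note (summary added for clarity, not from the original sources): a vhost
 * device moves through these states in order -- DEVICE_MAC_LEARNING until
 * the first guest TX packet lets link_vmdq() learn its MAC and VLAN,
 * DEVICE_RX while traffic is switched, and DEVICE_SAFE_REMOVE once
 * unlink_vmdq() has drained the VMDQ receive queue during removal.
 */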
/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refinement for the legacy and DPDK based front ends:
 * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
 * then adjusted to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32	/* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64	/* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support. */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMAed directly to/from the guest via
 * the hardware descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Number of descriptors to apply. */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
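/*
 * Note (added for clarity): the configuration below is only a template.
 * get_eth_conf() copies it and then overrides nb_queue_pools and the
 * VLAN-to-pool map with values derived from the number of devices the
 * NIC's VMDQ feature can actually support.
 */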
/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * without it, IPv4 forwarding in the guest cannot forward
		 * packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
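/*
 * Note (added for clarity): each vhost device is identified on the wire by
 * vlan_tags[device_fh]. link_vmdq() registers that tag with the NIC and the
 * TX paths request VLAN insertion through the mbuf offload flags; the
 * vlan_ethhdr layout below documents the resulting frame format.
 */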
/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
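/*
 * Illustration (not part of the original code): with num_devices == 8,
 * get_eth_conf() produces eight pool map entries
 *   { .vlan_id = 1000, .pools = 1 << 0 }, ..., { .vlan_id = 1007, .pools = 1 << 7 },
 * so frames tagged with vlan_tags[i] land in VMDQ pool i, the pool that is
 * later bound to virtio device i.
 */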
/*
 * Validate the device count against the max pool number obtained from
 * dev_info. If the device count is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}
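	/*
	 * Note (added for clarity): RX queue q is fed from vpool_array[q].pool,
	 * so on the zero copy path each VMDQ receive queue draws its buffers
	 * from the pool that attach_rxmbuf_zcp() fills with that guest's own
	 * frame buffers.
	 */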
	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject basenames that do not fit in the buffer. */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable RX/TX "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on RX, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on TX, "
			"used only when zero copy is enabled.\n",
	       prgname);
}
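/*
 * Illustrative invocation only (the binary name and EAL flags are not taken
 * from this file and depend on the build and target machine):
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 *
 * would switch on the first physical port, use software VM2VM forwarding and
 * print statistics every two seconds.
 */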
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}
			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable RX/TX zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for rx-desc-num[0-N],"
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for tx-desc-num [0-N],"
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;
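		/*
		 * Note (added for clarity): the zero copy, mergeable and vm2vm
		 * settings gathered above are cross-checked once all options
		 * are parsed -- see the combination checks after the option
		 * loop below.
		 */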
		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm,"
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame,"
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global vars num_ports and ports according to the number of
 * ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
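/*
 * Note (added for clarity): PRINT_PACKET compiles to an empty statement
 * unless DEBUG is defined at build time, so packet dumping adds no cost to
 * the normal data path.
 */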
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
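/*
 * Note (added for clarity): link_vmdq() is driven from the data path -- the
 * first packet a guest transmits supplies the source MAC that is registered
 * with the VMDQ pool, which is why a device stays in DEVICE_MAC_LEARNING
 * until it has sent at least one frame.
 */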
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
					&dev_statistics[tdev->device_fh].rx_total_atomic,
					1);
					rte_atomic64_add(
					&dev_statistics[tdev->device_fh].rx_atomic,
					ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: Source and destination"
				" MAC addresses are the same. Dropping "
				"packet.\n",
				dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip reduces the packet length by the
			 * length of the vlan tag, so the packet length
			 * needs to be restored by adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") TX: pkt to local VM device id:"
			"(%"PRIu64") vlan tag: %d.\n",
			dev->device_fh, dev_ll->vdev->dev->device_fh,
			*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	m->ol_flags = PKT_TX_VLAN_PKT;

	/*
	 * Find the right seg to adjust the data len when offset is
	 * bigger than tail room size.
	 */
	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (likely(offset <= rte_pktmbuf_tailroom(m)))
			m->data_len += offset;
		else {
			struct rte_mbuf *seg = m;

			while ((seg->next != NULL) &&
				(offset > rte_pktmbuf_tailroom(seg)))
				seg = seg->next;

			seg->data_len += offset;
		}
		m->pkt_len += offset;
	}

	m->vlan_tci = vlan_tag;

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}
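/*
 * Note (added for clarity): in VM2VM_SOFTWARE mode guest-to-guest traffic is
 * looped back in software by virtio_tx_local(); in VM2VM_HARDWARE mode the
 * frame is instead handed to the NIC with the destination VM's VLAN tag so
 * the VMDQ hardware delivers it, which is why find_local_dest() only has to
 * supply the tag and the extra VLAN_HLEN of length.
 */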
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						       (struct rte_mbuf **)tx_q->m_table,
						       (uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
						rx_count);
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count--)
							rte_pktmbuf_free(pkts_burst[tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
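/*
 * Note (added for clarity): avail->idx and last_used_idx_res are free-running
 * uint16_t counters, so the subtraction above (and the free_entries math in
 * the helpers below) stays correct across wrap-around as long as the ring
 * holds at most 65536 entries.
 */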
/*
 * This function gets available ring indexes for zero copy RX,
 * retrying 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It
 * needs to adjust the offset of buff_addr and phys_addr according to the
 * PMD implementation, otherwise the frame data may be put in the wrong
 * location in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}
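/*
 * Note (added for clarity): MBUF_HEADROOM_UINT32() stashes the guest
 * descriptor index in the first four bytes of the mbuf headroom, so when the
 * mbuf later comes back from the NIC (txmbuf_clean_zcp, virtio_dev_rx_zcp)
 * the matching descriptor can be returned to the guest's used ring.
 */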
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}
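/*
 * Note (added for clarity): the guest is only kicked through the eventfd when
 * it has not set VRING_AVAIL_F_NO_INTERRUPT, so a guest that is busy polling
 * its rings does not pay for notification syscalls on this path.
 */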
/*
 * This function is called when a virtio device is destroyed. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function enqueues received packets to the guest on the zero copy RX
 * path, updating the used ring and kicking the guest if required.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a VM's vlan tag, such as vlan_tags[dev->device_fh],
		 * for an external packet; otherwise it conflicts during pool
		 * selection: the MAC address identifies it as an external packet
		 * that should go to the network, while the vlan tag identifies it
		 * as a vm2vm packet that should be forwarded to another VM. The
		 * hardware cannot resolve such an ambiguous situation, and the
		 * packet would be lost.
		 */
1799 */ 1800 vlan_tag = external_pkt_default_vlan_tag; 1801 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1802 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1803 __rte_mbuf_raw_free(mbuf); 1804 return; 1805 } 1806 } 1807 1808 mbuf->nb_segs = m->nb_segs; 1809 mbuf->next = m->next; 1810 mbuf->data_len = m->data_len + offset; 1811 mbuf->pkt_len = mbuf->data_len; 1812 if (unlikely(need_copy)) { 1813 /* Copy the packet contents to the mbuf. */ 1814 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1815 rte_pktmbuf_mtod(m, void *), 1816 m->data_len); 1817 } else { 1818 mbuf->data_off = m->data_off; 1819 mbuf->buf_physaddr = m->buf_physaddr; 1820 mbuf->buf_addr = m->buf_addr; 1821 } 1822 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1823 mbuf->vlan_tci = vlan_tag; 1824 mbuf->l2_len = sizeof(struct ether_hdr); 1825 mbuf->l3_len = sizeof(struct ipv4_hdr); 1826 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1827 1828 tx_q->m_table[len] = mbuf; 1829 len++; 1830 1831 LOG_DEBUG(VHOST_DATA, 1832 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1833 dev->device_fh, 1834 mbuf->nb_segs, 1835 (mbuf->next == NULL) ? "null" : "non-null"); 1836 1837 if (enable_stats) { 1838 dev_statistics[dev->device_fh].tx_total++; 1839 dev_statistics[dev->device_fh].tx++; 1840 } 1841 1842 if (unlikely(len == MAX_PKT_BURST)) { 1843 m_table = (struct rte_mbuf **)tx_q->m_table; 1844 ret = rte_eth_tx_burst(ports[0], 1845 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1846 1847 /* 1848 * Free any buffers not handled by TX and update 1849 * the port stats. 1850 */ 1851 if (unlikely(ret < len)) { 1852 do { 1853 rte_pktmbuf_free(m_table[ret]); 1854 } while (++ret < len); 1855 } 1856 1857 len = 0; 1858 txmbuf_clean_zcp(dev, vpool); 1859 } 1860 1861 tx_q->len = len; 1862 1863 return; 1864 } 1865 1866 /* 1867 * This function TX all available packets in virtio TX queue for one 1868 * virtio-net device. If it is first packet, it learns MAC address and 1869 * setup VMDQ. 1870 */ 1871 static inline void __attribute__((always_inline)) 1872 virtio_dev_tx_zcp(struct virtio_net *dev) 1873 { 1874 struct rte_mbuf m; 1875 struct vhost_virtqueue *vq; 1876 struct vring_desc *desc; 1877 uint64_t buff_addr = 0, phys_addr; 1878 uint32_t head[MAX_PKT_BURST]; 1879 uint32_t i; 1880 uint16_t free_entries, packet_success = 0; 1881 uint16_t avail_idx; 1882 uint8_t need_copy = 0; 1883 hpa_type addr_type; 1884 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1885 1886 vq = dev->virtqueue[VIRTIO_TXQ]; 1887 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1888 1889 /* If there are no available buffers then return. */ 1890 if (vq->last_used_idx_res == avail_idx) 1891 return; 1892 1893 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1894 1895 /* Prefetch available ring to retrieve head indexes. */ 1896 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1897 1898 /* Get the number of free entries in the ring */ 1899 free_entries = (avail_idx - vq->last_used_idx_res); 1900 1901 /* Limit to MAX_PKT_BURST. */ 1902 free_entries 1903 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1904 1905 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1906 dev->device_fh, free_entries); 1907 1908 /* Retrieve all of the head indexes first to avoid caching issues. */ 1909 for (i = 0; i < free_entries; i++) 1910 head[i] 1911 = vq->avail->ring[(vq->last_used_idx_res + i) 1912 & (vq->size - 1)]; 1913 1914 vq->last_used_idx_res += free_entries; 1915 1916 /* Prefetch descriptor index. 
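 *
 * (head[] was filled above from vq->avail->ring[idx & (vq->size - 1)]; the
 * ring size is a power of two, so the mask is equivalent to idx % vq->size,
 * e.g. with size 256 an index of 260 wraps to 4.)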
*/ 1917 rte_prefetch0(&vq->desc[head[packet_success]]); 1918 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1919 1920 while (packet_success < free_entries) { 1921 desc = &vq->desc[head[packet_success]]; 1922 1923 /* Discard first buffer as it is the virtio header */ 1924 desc = &vq->desc[desc->next]; 1925 1926 /* Buffer address translation. */ 1927 buff_addr = gpa_to_vva(dev, desc->addr); 1928 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1929 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1930 &addr_type); 1931 1932 if (likely(packet_success < (free_entries - 1))) 1933 /* Prefetch descriptor index. */ 1934 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1935 1936 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1937 RTE_LOG(ERR, VHOST_DATA, 1938 "(%"PRIu64") Invalid frame buffer address found" 1939 "when TX packets!\n", 1940 dev->device_fh); 1941 packet_success++; 1942 continue; 1943 } 1944 1945 /* Prefetch buffer address. */ 1946 rte_prefetch0((void *)(uintptr_t)buff_addr); 1947 1948 /* 1949 * Setup dummy mbuf. This is copied to a real mbuf if 1950 * transmitted out the physical port. 1951 */ 1952 m.data_len = desc->len; 1953 m.nb_segs = 1; 1954 m.next = NULL; 1955 m.data_off = 0; 1956 m.buf_addr = (void *)(uintptr_t)buff_addr; 1957 m.buf_physaddr = phys_addr; 1958 1959 /* 1960 * Check if the frame buffer address from guest crosses 1961 * sub-region or not. 1962 */ 1963 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1964 RTE_LOG(ERR, VHOST_DATA, 1965 "(%"PRIu64") Frame buffer address cross " 1966 "sub-regioin found when attaching TX frame " 1967 "buffer address!\n", 1968 dev->device_fh); 1969 need_copy = 1; 1970 } else 1971 need_copy = 0; 1972 1973 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1974 1975 /* 1976 * If this is the first received packet we need to learn 1977 * the MAC and setup VMDQ 1978 */ 1979 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1980 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1981 /* 1982 * Discard frame if device is scheduled for 1983 * removal or a duplicate MAC address is found. 1984 */ 1985 packet_success += free_entries; 1986 vq->last_used_idx += packet_success; 1987 break; 1988 } 1989 } 1990 1991 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1992 packet_success++; 1993 } 1994 } 1995 1996 /* 1997 * This function is called by each data core. It handles all RX/TX registered 1998 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1999 * addresses are compared with all devices in the main linked list. 
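 *
 * Informal outline of the loop below (sketch only):
 *   while (1):
 *     if BURST_TX_DRAIN_US elapsed:
 *       for each device: flush tx_queue_zcp[] with rte_eth_tx_burst(),
 *       then reclaim mbufs with txmbuf_clean_zcp()
 *     acknowledge a pending REQUEST_DEV_REMOVAL, if any
 *     for each device on this lcore:
 *       attach_rxmbuf_zcp() for the free guest descriptors,
 *       rte_eth_rx_burst() then virtio_dev_rx_zcp()   (guest RX)
 *       virtio_dev_tx_zcp()                           (guest TX)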
2000 */ 2001 static int 2002 switch_worker_zcp(__attribute__((unused)) void *arg) 2003 { 2004 struct virtio_net *dev = NULL; 2005 struct vhost_dev *vdev = NULL; 2006 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2007 struct virtio_net_data_ll *dev_ll; 2008 struct mbuf_table *tx_q; 2009 volatile struct lcore_ll_info *lcore_ll; 2010 const uint64_t drain_tsc 2011 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2012 * BURST_TX_DRAIN_US; 2013 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2014 unsigned ret; 2015 const uint16_t lcore_id = rte_lcore_id(); 2016 uint16_t count_in_ring, rx_count = 0; 2017 2018 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2019 2020 lcore_ll = lcore_info[lcore_id].lcore_ll; 2021 prev_tsc = 0; 2022 2023 while (1) { 2024 cur_tsc = rte_rdtsc(); 2025 2026 /* TX burst queue drain */ 2027 diff_tsc = cur_tsc - prev_tsc; 2028 if (unlikely(diff_tsc > drain_tsc)) { 2029 /* 2030 * Get mbuf from vpool.pool and detach mbuf and 2031 * put back into vpool.ring. 2032 */ 2033 dev_ll = lcore_ll->ll_root_used; 2034 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2035 /* Get virtio device ID */ 2036 vdev = dev_ll->vdev; 2037 dev = vdev->dev; 2038 2039 if (likely(!vdev->remove)) { 2040 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2041 if (tx_q->len) { 2042 LOG_DEBUG(VHOST_DATA, 2043 "TX queue drained after timeout" 2044 " with burst size %u\n", 2045 tx_q->len); 2046 2047 /* 2048 * Tx any packets in the queue 2049 */ 2050 ret = rte_eth_tx_burst( 2051 ports[0], 2052 (uint16_t)tx_q->txq_id, 2053 (struct rte_mbuf **) 2054 tx_q->m_table, 2055 (uint16_t)tx_q->len); 2056 if (unlikely(ret < tx_q->len)) { 2057 do { 2058 rte_pktmbuf_free( 2059 tx_q->m_table[ret]); 2060 } while (++ret < tx_q->len); 2061 } 2062 tx_q->len = 0; 2063 2064 txmbuf_clean_zcp(dev, 2065 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2066 } 2067 } 2068 dev_ll = dev_ll->next; 2069 } 2070 prev_tsc = cur_tsc; 2071 } 2072 2073 rte_prefetch0(lcore_ll->ll_root_used); 2074 2075 /* 2076 * Inform the configuration core that we have exited the linked 2077 * list and that no devices are in use if requested. 2078 */ 2079 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2080 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2081 2082 /* Process devices */ 2083 dev_ll = lcore_ll->ll_root_used; 2084 2085 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2086 vdev = dev_ll->vdev; 2087 dev = vdev->dev; 2088 if (unlikely(vdev->remove)) { 2089 dev_ll = dev_ll->next; 2090 unlink_vmdq(vdev); 2091 vdev->ready = DEVICE_SAFE_REMOVE; 2092 continue; 2093 } 2094 2095 if (likely(vdev->ready == DEVICE_RX)) { 2096 uint32_t index = vdev->vmdq_rx_q; 2097 uint16_t i; 2098 count_in_ring 2099 = rte_ring_count(vpool_array[index].ring); 2100 uint16_t free_entries 2101 = (uint16_t)get_available_ring_num_zcp(dev); 2102 2103 /* 2104 * Attach all mbufs in vpool.ring and put back 2105 * into vpool.pool. 
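 *
 * The attach loop below is bounded by the free guest descriptors, the mbufs
 * parked on the ring and MAX_PKT_BURST; e.g. with 8 free guest descriptors,
 * 100 mbufs in the ring and MAX_PKT_BURST == 32, exactly 8 mbufs are
 * attached (illustrative numbers).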
2106 */ 2107 for (i = 0; 2108 i < RTE_MIN(free_entries, 2109 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2110 i++) 2111 attach_rxmbuf_zcp(dev); 2112 2113 /* Handle guest RX */ 2114 rx_count = rte_eth_rx_burst(ports[0], 2115 vdev->vmdq_rx_q, pkts_burst, 2116 MAX_PKT_BURST); 2117 2118 if (rx_count) { 2119 ret_count = virtio_dev_rx_zcp(dev, 2120 pkts_burst, rx_count); 2121 if (enable_stats) { 2122 dev_statistics[dev->device_fh].rx_total 2123 += rx_count; 2124 dev_statistics[dev->device_fh].rx 2125 += ret_count; 2126 } 2127 while (likely(rx_count)) { 2128 rx_count--; 2129 pktmbuf_detach_zcp( 2130 pkts_burst[rx_count]); 2131 rte_ring_sp_enqueue( 2132 vpool_array[index].ring, 2133 (void *)pkts_burst[rx_count]); 2134 } 2135 } 2136 } 2137 2138 if (likely(!vdev->remove)) 2139 /* Handle guest TX */ 2140 virtio_dev_tx_zcp(dev); 2141 2142 /* Move to the next device in the list */ 2143 dev_ll = dev_ll->next; 2144 } 2145 } 2146 2147 return 0; 2148 } 2149 2150 2151 /* 2152 * Add an entry to a used linked list. A free entry must first be found 2153 * in the free linked list using get_data_ll_free_entry(); 2154 */ 2155 static void 2156 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2157 struct virtio_net_data_ll *ll_dev) 2158 { 2159 struct virtio_net_data_ll *ll = *ll_root_addr; 2160 2161 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2162 ll_dev->next = NULL; 2163 rte_compiler_barrier(); 2164 2165 /* If ll == NULL then this is the first device. */ 2166 if (ll) { 2167 /* Increment to the tail of the linked list. */ 2168 while ((ll->next != NULL) ) 2169 ll = ll->next; 2170 2171 ll->next = ll_dev; 2172 } else { 2173 *ll_root_addr = ll_dev; 2174 } 2175 } 2176 2177 /* 2178 * Remove an entry from a used linked list. The entry must then be added to 2179 * the free linked list using put_data_ll_free_entry(). 2180 */ 2181 static void 2182 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2183 struct virtio_net_data_ll *ll_dev, 2184 struct virtio_net_data_ll *ll_dev_last) 2185 { 2186 struct virtio_net_data_ll *ll = *ll_root_addr; 2187 2188 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2189 return; 2190 2191 if (ll_dev == ll) 2192 *ll_root_addr = ll_dev->next; 2193 else 2194 if (likely(ll_dev_last != NULL)) 2195 ll_dev_last->next = ll_dev->next; 2196 else 2197 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2198 } 2199 2200 /* 2201 * Find and return an entry from the free linked list. 2202 */ 2203 static struct virtio_net_data_ll * 2204 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2205 { 2206 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2207 struct virtio_net_data_ll *ll_dev; 2208 2209 if (ll_free == NULL) 2210 return NULL; 2211 2212 ll_dev = ll_free; 2213 *ll_root_addr = ll_free->next; 2214 2215 return ll_dev; 2216 } 2217 2218 /* 2219 * Place an entry back on to the free linked list. 2220 */ 2221 static void 2222 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2223 struct virtio_net_data_ll *ll_dev) 2224 { 2225 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2226 2227 if (ll_dev == NULL) 2228 return; 2229 2230 ll_dev->next = ll_free; 2231 *ll_root_addr = ll_dev; 2232 } 2233 2234 /* 2235 * Creates a linked list of a given size. 2236 */ 2237 static struct virtio_net_data_ll * 2238 alloc_data_ll(uint32_t size) 2239 { 2240 struct virtio_net_data_ll *ll_new; 2241 uint32_t i; 2242 2243 /* Malloc and then chain the linked list. 
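 *
 * For instance, alloc_data_ll(3) returns three entries chained as
 * ll_new[0] -> ll_new[1] -> ll_new[2] -> NULL, which init_data_ll() then
 * hands out as a per-lcore (or the main) free list.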
*/ 2244 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2245 if (ll_new == NULL) { 2246 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2247 return NULL; 2248 } 2249 2250 for (i = 0; i < size - 1; i++) { 2251 ll_new[i].vdev = NULL; 2252 ll_new[i].next = &ll_new[i+1]; 2253 } 2254 ll_new[i].next = NULL; 2255 2256 return (ll_new); 2257 } 2258 2259 /* 2260 * Create the main linked list along with each individual cores linked list. A used and a free list 2261 * are created to manage entries. 2262 */ 2263 static int 2264 init_data_ll (void) 2265 { 2266 int lcore; 2267 2268 RTE_LCORE_FOREACH_SLAVE(lcore) { 2269 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2270 if (lcore_info[lcore].lcore_ll == NULL) { 2271 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2272 return -1; 2273 } 2274 2275 lcore_info[lcore].lcore_ll->device_num = 0; 2276 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2277 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2278 if (num_devices % num_switching_cores) 2279 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2280 else 2281 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2282 } 2283 2284 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2285 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2286 2287 return 0; 2288 } 2289 2290 /* 2291 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2292 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2293 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2294 */ 2295 static void 2296 destroy_device (volatile struct virtio_net *dev) 2297 { 2298 struct virtio_net_data_ll *ll_lcore_dev_cur; 2299 struct virtio_net_data_ll *ll_main_dev_cur; 2300 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2301 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2302 struct vhost_dev *vdev; 2303 int lcore; 2304 2305 dev->flags &= ~VIRTIO_DEV_RUNNING; 2306 2307 vdev = (struct vhost_dev *)dev->priv; 2308 /*set the remove flag. */ 2309 vdev->remove = 1; 2310 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2311 rte_pause(); 2312 } 2313 2314 /* Search for entry to be removed from lcore ll */ 2315 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2316 while (ll_lcore_dev_cur != NULL) { 2317 if (ll_lcore_dev_cur->vdev == vdev) { 2318 break; 2319 } else { 2320 ll_lcore_dev_last = ll_lcore_dev_cur; 2321 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2322 } 2323 } 2324 2325 if (ll_lcore_dev_cur == NULL) { 2326 RTE_LOG(ERR, VHOST_CONFIG, 2327 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2328 dev->device_fh); 2329 return; 2330 } 2331 2332 /* Search for entry to be removed from main ll */ 2333 ll_main_dev_cur = ll_root_used; 2334 ll_main_dev_last = NULL; 2335 while (ll_main_dev_cur != NULL) { 2336 if (ll_main_dev_cur->vdev == vdev) { 2337 break; 2338 } else { 2339 ll_main_dev_last = ll_main_dev_cur; 2340 ll_main_dev_cur = ll_main_dev_cur->next; 2341 } 2342 } 2343 2344 /* Remove entries from the lcore and main ll. */ 2345 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2346 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2347 2348 /* Set the dev_removal_flag on each lcore. 
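 *
 * Removal handshake, informally: the config core writes REQUEST_DEV_REMOVAL
 * here; each worker notices it at the top of its polling loop, i.e. while it
 * is outside the linked-list walk, and answers with ACK_DEV_REMOVAL; the
 * loop further below then spins on rte_pause() until every lcore has
 * acknowledged, after which the entry can safely be recycled.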
*/ 2349 RTE_LCORE_FOREACH_SLAVE(lcore) { 2350 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2351 } 2352 2353 /* 2354 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2355 * they can no longer access the device removed from the linked lists and that it 2356 * is no longer in use. 2357 */ 2358 RTE_LCORE_FOREACH_SLAVE(lcore) { 2359 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2360 rte_pause(); 2361 } 2362 } 2363 2364 /* Add the entries back to the lcore and main free ll.*/ 2365 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2366 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2367 2368 /* Decrement the number of devices on the lcore. */ 2369 lcore_info[vdev->coreid].lcore_ll->device_num--; 2370 2371 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2372 2373 if (zero_copy) { 2374 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2375 2376 /* Stop the RX queue. */ 2377 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2378 LOG_DEBUG(VHOST_CONFIG, 2379 "(%"PRIu64") In destroy_device: Failed to stop " 2380 "rx queue:%d\n", 2381 dev->device_fh, 2382 vdev->vmdq_rx_q); 2383 } 2384 2385 LOG_DEBUG(VHOST_CONFIG, 2386 "(%"PRIu64") in destroy_device: Start put mbuf in " 2387 "mempool back to ring for RX queue: %d\n", 2388 dev->device_fh, vdev->vmdq_rx_q); 2389 2390 mbuf_destroy_zcp(vpool); 2391 2392 /* Stop the TX queue. */ 2393 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2394 LOG_DEBUG(VHOST_CONFIG, 2395 "(%"PRIu64") In destroy_device: Failed to " 2396 "stop tx queue:%d\n", 2397 dev->device_fh, vdev->vmdq_rx_q); 2398 } 2399 2400 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2401 2402 LOG_DEBUG(VHOST_CONFIG, 2403 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2404 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2405 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2406 dev->device_fh); 2407 2408 mbuf_destroy_zcp(vpool); 2409 rte_free(vdev->regions_hpa); 2410 } 2411 rte_free(vdev); 2412 2413 } 2414 2415 /* 2416 * Calculate the number of physical-address discontinuities within one 2417 * region whose vhost virtual address range is contiguous, i.e. how many 2418 * extra sub-regions are needed. The region starts at vva_start and spans 'size' bytes.
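 *
 * Worked example (illustrative addresses, assuming 4 KB pages): if a 16 KB
 * contiguous VVA range maps to host physical pages 0x1000, 0x2000, 0x8000
 * and 0x9000, there is one break (0x2000 + 4 KB != 0x8000), so the function
 * returns 1 and new_device() will split the region into two HPA sub-regions.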
2419 */ 2420 static uint32_t 2421 check_hpa_regions(uint64_t vva_start, uint64_t size) 2422 { 2423 uint32_t i, nregions = 0, page_size = getpagesize(); 2424 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2425 if (vva_start % page_size) { 2426 LOG_DEBUG(VHOST_CONFIG, 2427 "in check_countinous: vva start(%p) mod page_size(%d) " 2428 "has remainder\n", 2429 (void *)(uintptr_t)vva_start, page_size); 2430 return 0; 2431 } 2432 if (size % page_size) { 2433 LOG_DEBUG(VHOST_CONFIG, 2434 "in check_countinous: " 2435 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2436 size, page_size); 2437 return 0; 2438 } 2439 for (i = 0; i < size - page_size; i = i + page_size) { 2440 cur_phys_addr 2441 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2442 next_phys_addr = rte_mem_virt2phy( 2443 (void *)(uintptr_t)(vva_start + i + page_size)); 2444 if ((cur_phys_addr + page_size) != next_phys_addr) { 2445 ++nregions; 2446 LOG_DEBUG(VHOST_CONFIG, 2447 "in check_continuous: hva addr:(%p) is not " 2448 "continuous with hva addr:(%p), diff:%d\n", 2449 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2450 (void *)(uintptr_t)(vva_start + (uint64_t)i 2451 + page_size), page_size); 2452 LOG_DEBUG(VHOST_CONFIG, 2453 "in check_continuous: hpa addr:(%p) is not " 2454 "continuous with hpa addr:(%p), " 2455 "diff:(%"PRIu64")\n", 2456 (void *)(uintptr_t)cur_phys_addr, 2457 (void *)(uintptr_t)next_phys_addr, 2458 (next_phys_addr-cur_phys_addr)); 2459 } 2460 } 2461 return nregions; 2462 } 2463 2464 /* 2465 * Divide each region whose vhost virtual address is continous into a few 2466 * sub-regions, make sure the physical address within each sub-region are 2467 * continous. And fill offset(to GPA) and size etc. information of each 2468 * sub-region into regions_hpa. 2469 */ 2470 static uint32_t 2471 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2472 { 2473 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2474 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2475 2476 if (mem_region_hpa == NULL) 2477 return 0; 2478 2479 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2480 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2481 virtio_memory->regions[regionidx].address_offset; 2482 mem_region_hpa[regionidx_hpa].guest_phys_address 2483 = virtio_memory->regions[regionidx].guest_phys_address; 2484 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2485 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2486 mem_region_hpa[regionidx_hpa].guest_phys_address; 2487 LOG_DEBUG(VHOST_CONFIG, 2488 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2489 regionidx_hpa, 2490 (void *)(uintptr_t) 2491 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2492 LOG_DEBUG(VHOST_CONFIG, 2493 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2494 regionidx_hpa, 2495 (void *)(uintptr_t) 2496 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2497 for (i = 0, k = 0; 2498 i < virtio_memory->regions[regionidx].memory_size - 2499 page_size; 2500 i += page_size) { 2501 cur_phys_addr = rte_mem_virt2phy( 2502 (void *)(uintptr_t)(vva_start + i)); 2503 next_phys_addr = rte_mem_virt2phy( 2504 (void *)(uintptr_t)(vva_start + 2505 i + page_size)); 2506 if ((cur_phys_addr + page_size) != next_phys_addr) { 2507 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2508 mem_region_hpa[regionidx_hpa].guest_phys_address + 2509 k + page_size; 2510 mem_region_hpa[regionidx_hpa].memory_size 2511 = k + 
page_size; 2512 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2513 "phys addr end [%d]:(%p)\n", 2514 regionidx_hpa, 2515 (void *)(uintptr_t) 2516 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2517 LOG_DEBUG(VHOST_CONFIG, 2518 "in fill_hpa_regions: guest phys addr " 2519 "size [%d]:(%p)\n", 2520 regionidx_hpa, 2521 (void *)(uintptr_t) 2522 (mem_region_hpa[regionidx_hpa].memory_size)); 2523 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2524 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2525 ++regionidx_hpa; 2526 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2527 next_phys_addr - 2528 mem_region_hpa[regionidx_hpa].guest_phys_address; 2529 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2530 " phys addr start[%d]:(%p)\n", 2531 regionidx_hpa, 2532 (void *)(uintptr_t) 2533 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2534 LOG_DEBUG(VHOST_CONFIG, 2535 "in fill_hpa_regions: host phys addr " 2536 "start[%d]:(%p)\n", 2537 regionidx_hpa, 2538 (void *)(uintptr_t) 2539 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2540 k = 0; 2541 } else { 2542 k += page_size; 2543 } 2544 } 2545 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2546 = mem_region_hpa[regionidx_hpa].guest_phys_address 2547 + k + page_size; 2548 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2549 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2550 "[%d]:(%p)\n", regionidx_hpa, 2551 (void *)(uintptr_t) 2552 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2553 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2554 "[%d]:(%p)\n", regionidx_hpa, 2555 (void *)(uintptr_t) 2556 (mem_region_hpa[regionidx_hpa].memory_size)); 2557 ++regionidx_hpa; 2558 } 2559 return regionidx_hpa; 2560 } 2561 2562 /* 2563 * A new device is added to a data core. First the device is added to the main linked list 2564 * and then allocated to a specific data core.
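 *
 * Informal sketch of the flow below: allocate a vhost_dev, pre-compute the
 * GPA-to-HPA sub-regions when zero copy is enabled, take a free entry from
 * ll_root_free, derive the VMDq RX queue from device_fh, start the queues
 * (zero copy only), pick the lcore currently hosting the fewest devices, and
 * finally mark the device VIRTIO_DEV_RUNNING with guest notifications
 * disabled.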
2565 */ 2566 static int 2567 new_device (struct virtio_net *dev) 2568 { 2569 struct virtio_net_data_ll *ll_dev; 2570 int lcore, core_add = 0; 2571 uint32_t device_num_min = num_devices; 2572 struct vhost_dev *vdev; 2573 uint32_t regionidx; 2574 2575 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2576 if (vdev == NULL) { 2577 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2578 dev->device_fh); 2579 return -1; 2580 } 2581 vdev->dev = dev; 2582 dev->priv = vdev; 2583 2584 if (zero_copy) { 2585 vdev->nregions_hpa = dev->mem->nregions; 2586 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2587 vdev->nregions_hpa 2588 += check_hpa_regions( 2589 dev->mem->regions[regionidx].guest_phys_address 2590 + dev->mem->regions[regionidx].address_offset, 2591 dev->mem->regions[regionidx].memory_size); 2592 2593 } 2594 2595 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2596 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2597 RTE_CACHE_LINE_SIZE); 2598 if (vdev->regions_hpa == NULL) { 2599 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2600 rte_free(vdev); 2601 return -1; 2602 } 2603 2604 2605 if (fill_hpa_memory_regions( 2606 vdev->regions_hpa, dev->mem 2607 ) != vdev->nregions_hpa) { 2608 2609 RTE_LOG(ERR, VHOST_CONFIG, 2610 "hpa memory regions number mismatch: " 2611 "[%d]\n", vdev->nregions_hpa); 2612 rte_free(vdev->regions_hpa); 2613 rte_free(vdev); 2614 return -1; 2615 } 2616 } 2617 2618 2619 /* Add device to main ll */ 2620 ll_dev = get_data_ll_free_entry(&ll_root_free); 2621 if (ll_dev == NULL) { 2622 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2623 "of %d devices per core has been reached\n", 2624 dev->device_fh, num_devices); 2625 if (vdev->regions_hpa) 2626 rte_free(vdev->regions_hpa); 2627 rte_free(vdev); 2628 return -1; 2629 } 2630 ll_dev->vdev = vdev; 2631 add_data_ll_entry(&ll_root_used, ll_dev); 2632 vdev->vmdq_rx_q 2633 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2634 2635 if (zero_copy) { 2636 uint32_t index = vdev->vmdq_rx_q; 2637 uint32_t count_in_ring, i; 2638 struct mbuf_table *tx_q; 2639 2640 count_in_ring = rte_ring_count(vpool_array[index].ring); 2641 2642 LOG_DEBUG(VHOST_CONFIG, 2643 "(%"PRIu64") in new_device: mbuf count in mempool " 2644 "before attach is: %d\n", 2645 dev->device_fh, 2646 rte_mempool_count(vpool_array[index].pool)); 2647 LOG_DEBUG(VHOST_CONFIG, 2648 "(%"PRIu64") in new_device: mbuf count in ring " 2649 "before attach is : %d\n", 2650 dev->device_fh, count_in_ring); 2651 2652 /* 2653 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
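 *
 * Each mbuf parked on vpool.ring is re-pointed at a guest RX buffer and
 * returned to vpool.pool, so that the PMD can later DMA received frames
 * straight into guest memory; this is the core of the zero-copy RX setup.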
2654 */ 2655 for (i = 0; i < count_in_ring; i++) 2656 attach_rxmbuf_zcp(dev); 2657 2658 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2659 "mempool after attach is: %d\n", 2660 dev->device_fh, 2661 rte_mempool_count(vpool_array[index].pool)); 2662 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2663 "ring after attach is : %d\n", 2664 dev->device_fh, 2665 rte_ring_count(vpool_array[index].ring)); 2666 2667 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2668 tx_q->txq_id = vdev->vmdq_rx_q; 2669 2670 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2671 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2672 2673 LOG_DEBUG(VHOST_CONFIG, 2674 "(%"PRIu64") In new_device: Failed to start " 2675 "tx queue:%d\n", 2676 dev->device_fh, vdev->vmdq_rx_q); 2677 2678 mbuf_destroy_zcp(vpool); 2679 rte_free(vdev->regions_hpa); 2680 rte_free(vdev); 2681 return -1; 2682 } 2683 2684 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2685 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2686 2687 LOG_DEBUG(VHOST_CONFIG, 2688 "(%"PRIu64") In new_device: Failed to start " 2689 "rx queue:%d\n", 2690 dev->device_fh, vdev->vmdq_rx_q); 2691 2692 /* Stop the TX queue. */ 2693 if (rte_eth_dev_tx_queue_stop(ports[0], 2694 vdev->vmdq_rx_q) != 0) { 2695 LOG_DEBUG(VHOST_CONFIG, 2696 "(%"PRIu64") In new_device: Failed to " 2697 "stop tx queue:%d\n", 2698 dev->device_fh, vdev->vmdq_rx_q); 2699 } 2700 2701 mbuf_destroy_zcp(vpool); 2702 rte_free(vdev->regions_hpa); 2703 rte_free(vdev); 2704 return -1; 2705 } 2706 2707 } 2708 2709 /*reset ready flag*/ 2710 vdev->ready = DEVICE_MAC_LEARNING; 2711 vdev->remove = 0; 2712 2713 /* Find a suitable lcore to add the device. */ 2714 RTE_LCORE_FOREACH_SLAVE(lcore) { 2715 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2716 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2717 core_add = lcore; 2718 } 2719 } 2720 /* Add device to lcore ll */ 2721 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2722 if (ll_dev == NULL) { 2723 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2724 vdev->ready = DEVICE_SAFE_REMOVE; 2725 destroy_device(dev); 2726 if (vdev->regions_hpa) 2727 rte_free(vdev->regions_hpa); 2728 rte_free(vdev); 2729 return -1; 2730 } 2731 ll_dev->vdev = vdev; 2732 vdev->coreid = core_add; 2733 2734 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2735 2736 /* Initialize device stats */ 2737 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2738 2739 /* Disable notifications. */ 2740 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2741 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2742 lcore_info[vdev->coreid].lcore_ll->device_num++; 2743 dev->flags |= VIRTIO_DEV_RUNNING; 2744 2745 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2746 2747 return 0; 2748 } 2749 2750 /* 2751 * These callback allow devices to be added to the data core when configuration 2752 * has been fully complete. 2753 */ 2754 static const struct virtio_net_device_ops virtio_net_device_ops = 2755 { 2756 .new_device = new_device, 2757 .destroy_device = destroy_device, 2758 }; 2759 2760 /* 2761 * This is a thread will wake up after a period to print stats if the user has 2762 * enabled them. 
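 *
 * The sleep interval equals the value stored in enable_stats (the --stats
 * argument as handled by us_vhost_parse_args()), so e.g. an interval of 2
 * redraws the statistics screen every two seconds.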
2763 */ 2764 static void 2765 print_stats(void) 2766 { 2767 struct virtio_net_data_ll *dev_ll; 2768 uint64_t tx_dropped, rx_dropped; 2769 uint64_t tx, tx_total, rx, rx_total; 2770 uint32_t device_fh; 2771 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2772 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2773 2774 while(1) { 2775 sleep(enable_stats); 2776 2777 /* Clear screen and move to top left */ 2778 printf("%s%s", clr, top_left); 2779 2780 printf("\nDevice statistics ===================================="); 2781 2782 dev_ll = ll_root_used; 2783 while (dev_ll != NULL) { 2784 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2785 tx_total = dev_statistics[device_fh].tx_total; 2786 tx = dev_statistics[device_fh].tx; 2787 tx_dropped = tx_total - tx; 2788 if (zero_copy == 0) { 2789 rx_total = rte_atomic64_read( 2790 &dev_statistics[device_fh].rx_total_atomic); 2791 rx = rte_atomic64_read( 2792 &dev_statistics[device_fh].rx_atomic); 2793 } else { 2794 rx_total = dev_statistics[device_fh].rx_total; 2795 rx = dev_statistics[device_fh].rx; 2796 } 2797 rx_dropped = rx_total - rx; 2798 2799 printf("\nStatistics for device %"PRIu32" ------------------------------" 2800 "\nTX total: %"PRIu64"" 2801 "\nTX dropped: %"PRIu64"" 2802 "\nTX successful: %"PRIu64"" 2803 "\nRX total: %"PRIu64"" 2804 "\nRX dropped: %"PRIu64"" 2805 "\nRX successful: %"PRIu64"", 2806 device_fh, 2807 tx_total, 2808 tx_dropped, 2809 tx, 2810 rx_total, 2811 rx_dropped, 2812 rx); 2813 2814 dev_ll = dev_ll->next; 2815 } 2816 printf("\n======================================================\n"); 2817 } 2818 } 2819 2820 static void 2821 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2822 char *ring_name, uint32_t nb_mbuf) 2823 { 2824 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2825 vpool_array[index].pool 2826 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2827 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2828 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2829 rte_pktmbuf_init, NULL, socket, 0); 2830 if (vpool_array[index].pool != NULL) { 2831 vpool_array[index].ring 2832 = rte_ring_create(ring_name, 2833 rte_align32pow2(nb_mbuf + 1), 2834 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2835 if (likely(vpool_array[index].ring != NULL)) { 2836 LOG_DEBUG(VHOST_CONFIG, 2837 "in setup_mempool_tbl: mbuf count in " 2838 "mempool is: %d\n", 2839 rte_mempool_count(vpool_array[index].pool)); 2840 LOG_DEBUG(VHOST_CONFIG, 2841 "in setup_mempool_tbl: mbuf count in " 2842 "ring is: %d\n", 2843 rte_ring_count(vpool_array[index].ring)); 2844 } else { 2845 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2846 ring_name); 2847 } 2848 2849 /* Need consider head room. */ 2850 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2851 } else { 2852 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2853 } 2854 } 2855 2856 2857 /* 2858 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2859 * device is also registered here to handle the IOCTLs. 
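 *
 * Initialisation order, informally: rte_eal_init() -> us_vhost_parse_args()
 * -> mbuf pool creation (per-queue vpool/ring pairs when zero copy is
 * enabled) -> port_init() for each enabled port -> init_data_ll() ->
 * optional stats thread -> launch switch_worker or switch_worker_zcp on
 * every worker lcore -> rte_vhost_driver_register() and
 * rte_vhost_driver_session_start().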
2860 */ 2861 int 2862 main(int argc, char *argv[]) 2863 { 2864 struct rte_mempool *mbuf_pool = NULL; 2865 unsigned lcore_id, core_id = 0; 2866 unsigned nb_ports, valid_num_ports; 2867 int ret; 2868 uint8_t portid; 2869 uint16_t queue_id; 2870 static pthread_t tid; 2871 2872 /* init EAL */ 2873 ret = rte_eal_init(argc, argv); 2874 if (ret < 0) 2875 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2876 argc -= ret; 2877 argv += ret; 2878 2879 /* parse app arguments */ 2880 ret = us_vhost_parse_args(argc, argv); 2881 if (ret < 0) 2882 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2883 2884 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2885 if (rte_lcore_is_enabled(lcore_id)) 2886 lcore_ids[core_id ++] = lcore_id; 2887 2888 if (rte_lcore_count() > RTE_MAX_LCORE) 2889 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2890 2891 /*set the number of swithcing cores available*/ 2892 num_switching_cores = rte_lcore_count()-1; 2893 2894 /* Get the number of physical ports. */ 2895 nb_ports = rte_eth_dev_count(); 2896 if (nb_ports > RTE_MAX_ETHPORTS) 2897 nb_ports = RTE_MAX_ETHPORTS; 2898 2899 /* 2900 * Update the global var NUM_PORTS and global array PORTS 2901 * and get value of var VALID_NUM_PORTS according to system ports number 2902 */ 2903 valid_num_ports = check_ports_num(nb_ports); 2904 2905 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2906 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2907 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2908 return -1; 2909 } 2910 2911 if (zero_copy == 0) { 2912 /* Create the mbuf pool. */ 2913 mbuf_pool = rte_mempool_create( 2914 "MBUF_POOL", 2915 NUM_MBUFS_PER_PORT 2916 * valid_num_ports, 2917 MBUF_SIZE, MBUF_CACHE_SIZE, 2918 sizeof(struct rte_pktmbuf_pool_private), 2919 rte_pktmbuf_pool_init, NULL, 2920 rte_pktmbuf_init, NULL, 2921 rte_socket_id(), 0); 2922 if (mbuf_pool == NULL) 2923 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2924 2925 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2926 vpool_array[queue_id].pool = mbuf_pool; 2927 2928 if (vm2vm_mode == VM2VM_HARDWARE) { 2929 /* Enable VT loop back to let L2 switch to do it. */ 2930 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2931 LOG_DEBUG(VHOST_CONFIG, 2932 "Enable loop back for L2 switch in vmdq.\n"); 2933 } 2934 } else { 2935 uint32_t nb_mbuf; 2936 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2937 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2938 2939 nb_mbuf = num_rx_descriptor 2940 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2941 + num_switching_cores * MAX_PKT_BURST; 2942 2943 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2944 snprintf(pool_name, sizeof(pool_name), 2945 "rxmbuf_pool_%u", queue_id); 2946 snprintf(ring_name, sizeof(ring_name), 2947 "rxmbuf_ring_%u", queue_id); 2948 setup_mempool_tbl(rte_socket_id(), queue_id, 2949 pool_name, ring_name, nb_mbuf); 2950 } 2951 2952 nb_mbuf = num_tx_descriptor 2953 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2954 + num_switching_cores * MAX_PKT_BURST; 2955 2956 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2957 snprintf(pool_name, sizeof(pool_name), 2958 "txmbuf_pool_%u", queue_id); 2959 snprintf(ring_name, sizeof(ring_name), 2960 "txmbuf_ring_%u", queue_id); 2961 setup_mempool_tbl(rte_socket_id(), 2962 (queue_id + MAX_QUEUES), 2963 pool_name, ring_name, nb_mbuf); 2964 } 2965 2966 if (vm2vm_mode == VM2VM_HARDWARE) { 2967 /* Enable VT loop back to let L2 switch to do it. 
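 *
 * With VM2VM_HARDWARE the sample relies on the NIC's embedded switch: a
 * frame destined for another local VM is tagged with that VM's VLAN and
 * transmitted, and the loop-back enabled here returns it to the matching
 * VMDq RX pool instead of putting it on the wire.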
*/ 2968 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2969 LOG_DEBUG(VHOST_CONFIG, 2970 "Enable loop back for L2 switch in vmdq.\n"); 2971 } 2972 } 2973 /* Set log level. */ 2974 rte_set_log_level(LOG_LEVEL); 2975 2976 /* initialize all ports */ 2977 for (portid = 0; portid < nb_ports; portid++) { 2978 /* skip ports that are not enabled */ 2979 if ((enabled_port_mask & (1 << portid)) == 0) { 2980 RTE_LOG(INFO, VHOST_PORT, 2981 "Skipping disabled port %d\n", portid); 2982 continue; 2983 } 2984 if (port_init(portid) != 0) 2985 rte_exit(EXIT_FAILURE, 2986 "Cannot initialize network ports\n"); 2987 } 2988 2989 /* Initialise all linked lists. */ 2990 if (init_data_ll() == -1) 2991 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2992 2993 /* Initialize device stats */ 2994 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2995 2996 /* Enable stats if the user option is set. */ 2997 if (enable_stats) 2998 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2999 3000 /* Launch all data cores. */ 3001 if (zero_copy == 0) { 3002 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3003 rte_eal_remote_launch(switch_worker, 3004 mbuf_pool, lcore_id); 3005 } 3006 } else { 3007 uint32_t count_in_mempool, index, i; 3008 for (index = 0; index < 2*MAX_QUEUES; index++) { 3009 /* For all RX and TX queues. */ 3010 count_in_mempool 3011 = rte_mempool_count(vpool_array[index].pool); 3012 3013 /* 3014 * Transfer all un-attached mbufs from vpool.pool 3015 * to vpoo.ring. 3016 */ 3017 for (i = 0; i < count_in_mempool; i++) { 3018 struct rte_mbuf *mbuf 3019 = __rte_mbuf_raw_alloc( 3020 vpool_array[index].pool); 3021 rte_ring_sp_enqueue(vpool_array[index].ring, 3022 (void *)mbuf); 3023 } 3024 3025 LOG_DEBUG(VHOST_CONFIG, 3026 "in main: mbuf count in mempool at initial " 3027 "is: %d\n", count_in_mempool); 3028 LOG_DEBUG(VHOST_CONFIG, 3029 "in main: mbuf count in ring at initial is :" 3030 " %d\n", 3031 rte_ring_count(vpool_array[index].ring)); 3032 } 3033 3034 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3035 rte_eal_remote_launch(switch_worker_zcp, NULL, 3036 lcore_id); 3037 } 3038 3039 if (mergeable == 0) 3040 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3041 3042 /* Register CUSE device to handle IOCTLs. */ 3043 ret = rte_vhost_driver_register((char *)&dev_basename); 3044 if (ret != 0) 3045 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3046 3047 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3048 3049 /* Start CUSE session. */ 3050 rte_vhost_driver_session_start(); 3051 return 0; 3052 3053 } 3054 3055
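/*
 * Example invocation (illustrative only; the binary path, EAL core mask,
 * port mask and option values depend on the build and system, and the
 * option names are those handled by us_vhost_parse_args() earlier in this
 * file):
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --zero-copy 1 --stats 2
 */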