/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 512

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores * MAX_PKT_BURST) + \
				(num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
				(num_switching_cores * MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2
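/*
 * A device typically moves through these states as follows: it starts in
 * DEVICE_MAC_LEARNING until its first TX packet lets link_vmdq() learn the
 * MAC address and bind the VMDQ queue, after which it becomes DEVICE_RX;
 * unlink_vmdq() returns it to DEVICE_MAC_LEARNING, and the data core sets
 * DEVICE_SAFE_REMOVE once a device flagged for removal has been unlinked.
 */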
/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These 2 macros need refining for the legacy and DPDK based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, then round to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors,
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32	/* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64	/* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMAed directly to/from the hardware
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptor; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty vmdq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * It is necessary for 1G NICs such as I350:
		 * this fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
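/*
 * For illustration: with num_devices = 8 the loop above yields
 * pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }, ...,
 * pool_map[7] = { .vlan_id = 1007, .pools = 0x80 },
 * i.e. VLAN tag vlan_tags[i] steers received packets into VMDQ pool i.
 */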
/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse number string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --dev-basename <name>\n"
	" --nb-devices ND\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on RX are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if retries on RX are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --dev-basename: The basename to be used for the character device.\n"
	" --zero-copy [0|1]: disable(default)/enable rx/tx "
		"zero copy\n"
	" --rx-desc-num [0-N]: the number of descriptors on rx, "
		"used only when zero copy is enabled.\n"
	" --tx-desc-num [0-N]: the number of descriptors on tx, "
		"used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
577 */ 578 static int 579 us_vhost_parse_args(int argc, char **argv) 580 { 581 int opt, ret; 582 int option_index; 583 unsigned i; 584 const char *prgname = argv[0]; 585 static struct option long_option[] = { 586 {"vm2vm", required_argument, NULL, 0}, 587 {"rx-retry", required_argument, NULL, 0}, 588 {"rx-retry-delay", required_argument, NULL, 0}, 589 {"rx-retry-num", required_argument, NULL, 0}, 590 {"mergeable", required_argument, NULL, 0}, 591 {"stats", required_argument, NULL, 0}, 592 {"dev-basename", required_argument, NULL, 0}, 593 {"zero-copy", required_argument, NULL, 0}, 594 {"rx-desc-num", required_argument, NULL, 0}, 595 {"tx-desc-num", required_argument, NULL, 0}, 596 {NULL, 0, 0, 0}, 597 }; 598 599 /* Parse command line */ 600 while ((opt = getopt_long(argc, argv, "p:P", 601 long_option, &option_index)) != EOF) { 602 switch (opt) { 603 /* Portmask */ 604 case 'p': 605 enabled_port_mask = parse_portmask(optarg); 606 if (enabled_port_mask == 0) { 607 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 608 us_vhost_usage(prgname); 609 return -1; 610 } 611 break; 612 613 case 'P': 614 promiscuous = 1; 615 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 616 ETH_VMDQ_ACCEPT_BROADCAST | 617 ETH_VMDQ_ACCEPT_MULTICAST; 618 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 619 620 break; 621 622 case 0: 623 /* Enable/disable vm2vm comms. */ 624 if (!strncmp(long_option[option_index].name, "vm2vm", 625 MAX_LONG_OPT_SZ)) { 626 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 627 if (ret == -1) { 628 RTE_LOG(INFO, VHOST_CONFIG, 629 "Invalid argument for " 630 "vm2vm [0|1|2]\n"); 631 us_vhost_usage(prgname); 632 return -1; 633 } else { 634 vm2vm_mode = (vm2vm_type)ret; 635 } 636 } 637 638 /* Enable/disable retries on RX. */ 639 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 640 ret = parse_num_opt(optarg, 1); 641 if (ret == -1) { 642 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 643 us_vhost_usage(prgname); 644 return -1; 645 } else { 646 enable_retry = ret; 647 } 648 } 649 650 /* Specify the retries delay time (in useconds) on RX. */ 651 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 652 ret = parse_num_opt(optarg, INT32_MAX); 653 if (ret == -1) { 654 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 655 us_vhost_usage(prgname); 656 return -1; 657 } else { 658 burst_rx_delay_time = ret; 659 } 660 } 661 662 /* Specify the retries number on RX. */ 663 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 664 ret = parse_num_opt(optarg, INT32_MAX); 665 if (ret == -1) { 666 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 667 us_vhost_usage(prgname); 668 return -1; 669 } else { 670 burst_rx_retry_num = ret; 671 } 672 } 673 674 /* Enable/disable RX mergeable buffers. */ 675 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 676 ret = parse_num_opt(optarg, 1); 677 if (ret == -1) { 678 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 679 us_vhost_usage(prgname); 680 return -1; 681 } else { 682 mergeable = !!ret; 683 if (ret) { 684 vmdq_conf_default.rxmode.jumbo_frame = 1; 685 vmdq_conf_default.rxmode.max_rx_pkt_len 686 = JUMBO_FRAME_MAX_SIZE; 687 } 688 } 689 } 690 691 /* Enable/disable stats. 
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for rx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for tx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
862 */ 863 static inline uint64_t __attribute__((always_inline)) 864 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 865 uint32_t buf_len, hpa_type *addr_type) 866 { 867 struct virtio_memory_regions_hpa *region; 868 uint32_t regionidx; 869 uint64_t vhost_pa = 0; 870 871 *addr_type = PHYS_ADDR_INVALID; 872 873 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 874 region = &vdev->regions_hpa[regionidx]; 875 if ((guest_pa >= region->guest_phys_address) && 876 (guest_pa <= region->guest_phys_address_end)) { 877 vhost_pa = region->host_phys_addr_offset + guest_pa; 878 if (likely((guest_pa + buf_len - 1) 879 <= region->guest_phys_address_end)) 880 *addr_type = PHYS_ADDR_CONTINUOUS; 881 else 882 *addr_type = PHYS_ADDR_CROSS_SUBREG; 883 break; 884 } 885 } 886 887 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 888 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 889 (void *)(uintptr_t)vhost_pa); 890 891 return vhost_pa; 892 } 893 894 /* 895 * Compares a packet destination MAC address to a device MAC address. 896 */ 897 static inline int __attribute__((always_inline)) 898 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 899 { 900 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 901 } 902 903 /* 904 * This function learns the MAC address of the device and registers this along with a 905 * vlan tag to a VMDQ. 906 */ 907 static int 908 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 909 { 910 struct ether_hdr *pkt_hdr; 911 struct virtio_net_data_ll *dev_ll; 912 struct virtio_net *dev = vdev->dev; 913 int i, ret; 914 915 /* Learn MAC address of guest device from packet */ 916 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 917 918 dev_ll = ll_root_used; 919 920 while (dev_ll != NULL) { 921 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 922 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 923 return -1; 924 } 925 dev_ll = dev_ll->next; 926 } 927 928 for (i = 0; i < ETHER_ADDR_LEN; i++) 929 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 930 931 /* vlan_tag currently uses the device_id. */ 932 vdev->vlan_tag = vlan_tags[dev->device_fh]; 933 934 /* Print out VMDQ registration info. */ 935 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 936 dev->device_fh, 937 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 938 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 939 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 940 vdev->vlan_tag); 941 942 /* Register the MAC address. */ 943 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 944 (uint32_t)dev->device_fh + vmdq_pool_base); 945 if (ret) 946 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 947 dev->device_fh); 948 949 /* Enable stripping of the vlan tag as we handle routing. */ 950 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 951 952 /* Set device as ready for RX. */ 953 vdev->ready = DEVICE_RX; 954 955 return 0; 956 } 957 958 /* 959 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 960 * queue before disabling RX on the device. 
961 */ 962 static inline void 963 unlink_vmdq(struct vhost_dev *vdev) 964 { 965 unsigned i = 0; 966 unsigned rx_count; 967 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 968 969 if (vdev->ready == DEVICE_RX) { 970 /*clear MAC and VLAN settings*/ 971 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 972 for (i = 0; i < 6; i++) 973 vdev->mac_address.addr_bytes[i] = 0; 974 975 vdev->vlan_tag = 0; 976 977 /*Clear out the receive buffers*/ 978 rx_count = rte_eth_rx_burst(ports[0], 979 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 980 981 while (rx_count) { 982 for (i = 0; i < rx_count; i++) 983 rte_pktmbuf_free(pkts_burst[i]); 984 985 rx_count = rte_eth_rx_burst(ports[0], 986 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 987 } 988 989 vdev->ready = DEVICE_MAC_LEARNING; 990 } 991 } 992 993 /* 994 * Check if the packet destination MAC address is for a local device. If so then put 995 * the packet on that devices RX queue. If not then return. 996 */ 997 static inline int __attribute__((always_inline)) 998 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 999 { 1000 struct virtio_net_data_ll *dev_ll; 1001 struct ether_hdr *pkt_hdr; 1002 uint64_t ret = 0; 1003 struct virtio_net *dev = vdev->dev; 1004 struct virtio_net *tdev; /* destination virito device */ 1005 1006 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1007 1008 /*get the used devices list*/ 1009 dev_ll = ll_root_used; 1010 1011 while (dev_ll != NULL) { 1012 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1013 &dev_ll->vdev->mac_address)) { 1014 1015 /* Drop the packet if the TX packet is destined for the TX device. */ 1016 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1017 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1018 dev->device_fh); 1019 return 0; 1020 } 1021 tdev = dev_ll->vdev->dev; 1022 1023 1024 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1025 1026 if (unlikely(dev_ll->vdev->remove)) { 1027 /*drop the packet if the device is marked for removal*/ 1028 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1029 } else { 1030 /*send the packet to the local virtio device*/ 1031 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1032 if (enable_stats) { 1033 rte_atomic64_add( 1034 &dev_statistics[tdev->device_fh].rx_total_atomic, 1035 1); 1036 rte_atomic64_add( 1037 &dev_statistics[tdev->device_fh].rx_atomic, 1038 ret); 1039 dev_statistics[tdev->device_fh].tx_total++; 1040 dev_statistics[tdev->device_fh].tx += ret; 1041 } 1042 } 1043 1044 return 0; 1045 } 1046 dev_ll = dev_ll->next; 1047 } 1048 1049 return -1; 1050 } 1051 1052 /* 1053 * Check if the destination MAC of a packet is one local VM, 1054 * and get its vlan tag, and offset if it is. 1055 */ 1056 static inline int __attribute__((always_inline)) 1057 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1058 uint32_t *offset, uint16_t *vlan_tag) 1059 { 1060 struct virtio_net_data_ll *dev_ll = ll_root_used; 1061 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1062 1063 while (dev_ll != NULL) { 1064 if ((dev_ll->vdev->ready == DEVICE_RX) 1065 && ether_addr_cmp(&(pkt_hdr->d_addr), 1066 &dev_ll->vdev->mac_address)) { 1067 /* 1068 * Drop the packet if the TX packet is 1069 * destined for the TX device. 
1070 */ 1071 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1072 LOG_DEBUG(VHOST_DATA, 1073 "(%"PRIu64") TX: Source and destination" 1074 " MAC addresses are the same. Dropping " 1075 "packet.\n", 1076 dev_ll->vdev->dev->device_fh); 1077 return -1; 1078 } 1079 1080 /* 1081 * HW vlan strip will reduce the packet length 1082 * by minus length of vlan tag, so need restore 1083 * the packet length by plus it. 1084 */ 1085 *offset = VLAN_HLEN; 1086 *vlan_tag = 1087 (uint16_t) 1088 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1089 1090 LOG_DEBUG(VHOST_DATA, 1091 "(%"PRIu64") TX: pkt to local VM device id:" 1092 "(%"PRIu64") vlan tag: %d.\n", 1093 dev->device_fh, dev_ll->vdev->dev->device_fh, 1094 vlan_tag); 1095 1096 break; 1097 } 1098 dev_ll = dev_ll->next; 1099 } 1100 return 0; 1101 } 1102 1103 /* 1104 * This function routes the TX packet to the correct interface. This may be a local device 1105 * or the physical port. 1106 */ 1107 static inline void __attribute__((always_inline)) 1108 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1109 { 1110 struct mbuf_table *tx_q; 1111 struct rte_mbuf **m_table; 1112 unsigned len, ret, offset = 0; 1113 const uint16_t lcore_id = rte_lcore_id(); 1114 struct virtio_net *dev = vdev->dev; 1115 1116 /*check if destination is local VM*/ 1117 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1118 rte_pktmbuf_free(m); 1119 return; 1120 } 1121 1122 if (vm2vm_mode == VM2VM_HARDWARE) { 1123 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 || 1124 offset > rte_pktmbuf_tailroom(m)) { 1125 rte_pktmbuf_free(m); 1126 return; 1127 } 1128 } 1129 1130 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1131 1132 /*Add packet to the port tx queue*/ 1133 tx_q = &lcore_tx_queue[lcore_id]; 1134 len = tx_q->len; 1135 1136 m->ol_flags = PKT_TX_VLAN_PKT; 1137 1138 m->data_len += offset; 1139 m->pkt_len += offset; 1140 1141 m->vlan_tci = vlan_tag; 1142 1143 tx_q->m_table[len] = m; 1144 len++; 1145 if (enable_stats) { 1146 dev_statistics[dev->device_fh].tx_total++; 1147 dev_statistics[dev->device_fh].tx++; 1148 } 1149 1150 if (unlikely(len == MAX_PKT_BURST)) { 1151 m_table = (struct rte_mbuf **)tx_q->m_table; 1152 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1153 /* Free any buffers not handled by TX and update the port stats. */ 1154 if (unlikely(ret < len)) { 1155 do { 1156 rte_pktmbuf_free(m_table[ret]); 1157 } while (++ret < len); 1158 } 1159 1160 len = 0; 1161 } 1162 1163 tx_q->len = len; 1164 return; 1165 } 1166 /* 1167 * This function is called by each data core. It handles all RX/TX registered with the 1168 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1169 * with all devices in the main linked list. 
1170 */ 1171 static int 1172 switch_worker(__attribute__((unused)) void *arg) 1173 { 1174 struct rte_mempool *mbuf_pool = arg; 1175 struct virtio_net *dev = NULL; 1176 struct vhost_dev *vdev = NULL; 1177 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1178 struct virtio_net_data_ll *dev_ll; 1179 struct mbuf_table *tx_q; 1180 volatile struct lcore_ll_info *lcore_ll; 1181 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1182 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1183 unsigned ret, i; 1184 const uint16_t lcore_id = rte_lcore_id(); 1185 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1186 uint16_t rx_count = 0; 1187 uint16_t tx_count; 1188 uint32_t retry = 0; 1189 1190 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1191 lcore_ll = lcore_info[lcore_id].lcore_ll; 1192 prev_tsc = 0; 1193 1194 tx_q = &lcore_tx_queue[lcore_id]; 1195 for (i = 0; i < num_cores; i ++) { 1196 if (lcore_ids[i] == lcore_id) { 1197 tx_q->txq_id = i; 1198 break; 1199 } 1200 } 1201 1202 while(1) { 1203 cur_tsc = rte_rdtsc(); 1204 /* 1205 * TX burst queue drain 1206 */ 1207 diff_tsc = cur_tsc - prev_tsc; 1208 if (unlikely(diff_tsc > drain_tsc)) { 1209 1210 if (tx_q->len) { 1211 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1212 1213 /*Tx any packets in the queue*/ 1214 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1215 (struct rte_mbuf **)tx_q->m_table, 1216 (uint16_t)tx_q->len); 1217 if (unlikely(ret < tx_q->len)) { 1218 do { 1219 rte_pktmbuf_free(tx_q->m_table[ret]); 1220 } while (++ret < tx_q->len); 1221 } 1222 1223 tx_q->len = 0; 1224 } 1225 1226 prev_tsc = cur_tsc; 1227 1228 } 1229 1230 rte_prefetch0(lcore_ll->ll_root_used); 1231 /* 1232 * Inform the configuration core that we have exited the linked list and that no devices are 1233 * in use if requested. 
1234 */ 1235 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1236 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1237 1238 /* 1239 * Process devices 1240 */ 1241 dev_ll = lcore_ll->ll_root_used; 1242 1243 while (dev_ll != NULL) { 1244 /*get virtio device ID*/ 1245 vdev = dev_ll->vdev; 1246 dev = vdev->dev; 1247 1248 if (unlikely(vdev->remove)) { 1249 dev_ll = dev_ll->next; 1250 unlink_vmdq(vdev); 1251 vdev->ready = DEVICE_SAFE_REMOVE; 1252 continue; 1253 } 1254 if (likely(vdev->ready == DEVICE_RX)) { 1255 /*Handle guest RX*/ 1256 rx_count = rte_eth_rx_burst(ports[0], 1257 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1258 1259 if (rx_count) { 1260 /* 1261 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1262 * Here MAX_PKT_BURST must be less than virtio queue size 1263 */ 1264 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1265 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1266 rte_delay_us(burst_rx_delay_time); 1267 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1268 break; 1269 } 1270 } 1271 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1272 if (enable_stats) { 1273 rte_atomic64_add( 1274 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1275 rx_count); 1276 rte_atomic64_add( 1277 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1278 } 1279 while (likely(rx_count)) { 1280 rx_count--; 1281 rte_pktmbuf_free(pkts_burst[rx_count]); 1282 } 1283 1284 } 1285 } 1286 1287 if (likely(!vdev->remove)) { 1288 /* Handle guest TX*/ 1289 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1290 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1291 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1292 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1293 while (tx_count--) 1294 rte_pktmbuf_free(pkts_burst[tx_count]); 1295 } 1296 } 1297 while (tx_count) 1298 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1299 } 1300 1301 /*move to the next device in the list*/ 1302 dev_ll = dev_ll->next; 1303 } 1304 } 1305 1306 return 0; 1307 } 1308 1309 /* 1310 * This function gets available ring number for zero copy rx. 1311 * Only one thread will call this funciton for a paticular virtio device, 1312 * so, it is designed as non-thread-safe function. 1313 */ 1314 static inline uint32_t __attribute__((always_inline)) 1315 get_available_ring_num_zcp(struct virtio_net *dev) 1316 { 1317 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1318 uint16_t avail_idx; 1319 1320 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1321 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1322 } 1323 1324 /* 1325 * This function gets available ring index for zero copy rx, 1326 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1327 * Only one thread will call this funciton for a paticular virtio device, 1328 * so, it is designed as non-thread-safe function. 
1329 */ 1330 static inline uint32_t __attribute__((always_inline)) 1331 get_available_ring_index_zcp(struct virtio_net *dev, 1332 uint16_t *res_base_idx, uint32_t count) 1333 { 1334 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1335 uint16_t avail_idx; 1336 uint32_t retry = 0; 1337 uint16_t free_entries; 1338 1339 *res_base_idx = vq->last_used_idx_res; 1340 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1341 free_entries = (avail_idx - *res_base_idx); 1342 1343 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1344 "avail idx: %d, " 1345 "res base idx:%d, free entries:%d\n", 1346 dev->device_fh, avail_idx, *res_base_idx, 1347 free_entries); 1348 1349 /* 1350 * If retry is enabled and the queue is full then we wait 1351 * and retry to avoid packet loss. 1352 */ 1353 if (enable_retry && unlikely(count > free_entries)) { 1354 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1355 rte_delay_us(burst_rx_delay_time); 1356 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1357 free_entries = (avail_idx - *res_base_idx); 1358 if (count <= free_entries) 1359 break; 1360 } 1361 } 1362 1363 /*check that we have enough buffers*/ 1364 if (unlikely(count > free_entries)) 1365 count = free_entries; 1366 1367 if (unlikely(count == 0)) { 1368 LOG_DEBUG(VHOST_DATA, 1369 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1370 "avail idx: %d, res base idx:%d, free entries:%d\n", 1371 dev->device_fh, avail_idx, 1372 *res_base_idx, free_entries); 1373 return 0; 1374 } 1375 1376 vq->last_used_idx_res = *res_base_idx + count; 1377 1378 return count; 1379 } 1380 1381 /* 1382 * This function put descriptor back to used list. 1383 */ 1384 static inline void __attribute__((always_inline)) 1385 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1386 { 1387 uint16_t res_cur_idx = vq->last_used_idx; 1388 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1389 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1390 rte_compiler_barrier(); 1391 *(volatile uint16_t *)&vq->used->idx += 1; 1392 vq->last_used_idx += 1; 1393 1394 /* Kick the guest if necessary. */ 1395 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1396 eventfd_write((int)vq->kickfd, 1); 1397 } 1398 1399 /* 1400 * This function get available descriptor from vitio vring and un-attached mbuf 1401 * from vpool->ring, and then attach them together. It needs adjust the offset 1402 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1403 * frame data may be put to wrong location in mbuf. 
1404 */ 1405 static inline void __attribute__((always_inline)) 1406 attach_rxmbuf_zcp(struct virtio_net *dev) 1407 { 1408 uint16_t res_base_idx, desc_idx; 1409 uint64_t buff_addr, phys_addr; 1410 struct vhost_virtqueue *vq; 1411 struct vring_desc *desc; 1412 struct rte_mbuf *mbuf = NULL; 1413 struct vpool *vpool; 1414 hpa_type addr_type; 1415 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1416 1417 vpool = &vpool_array[vdev->vmdq_rx_q]; 1418 vq = dev->virtqueue[VIRTIO_RXQ]; 1419 1420 do { 1421 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1422 1) != 1)) 1423 return; 1424 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1425 1426 desc = &vq->desc[desc_idx]; 1427 if (desc->flags & VRING_DESC_F_NEXT) { 1428 desc = &vq->desc[desc->next]; 1429 buff_addr = gpa_to_vva(dev, desc->addr); 1430 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1431 &addr_type); 1432 } else { 1433 buff_addr = gpa_to_vva(dev, 1434 desc->addr + vq->vhost_hlen); 1435 phys_addr = gpa_to_hpa(vdev, 1436 desc->addr + vq->vhost_hlen, 1437 desc->len, &addr_type); 1438 } 1439 1440 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1441 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1442 " address found when attaching RX frame buffer" 1443 " address!\n", dev->device_fh); 1444 put_desc_to_used_list_zcp(vq, desc_idx); 1445 continue; 1446 } 1447 1448 /* 1449 * Check if the frame buffer address from guest crosses 1450 * sub-region or not. 1451 */ 1452 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1453 RTE_LOG(ERR, VHOST_DATA, 1454 "(%"PRIu64") Frame buffer address cross " 1455 "sub-regioin found when attaching RX frame " 1456 "buffer address!\n", 1457 dev->device_fh); 1458 put_desc_to_used_list_zcp(vq, desc_idx); 1459 continue; 1460 } 1461 } while (unlikely(phys_addr == 0)); 1462 1463 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1464 if (unlikely(mbuf == NULL)) { 1465 LOG_DEBUG(VHOST_DATA, 1466 "(%"PRIu64") in attach_rxmbuf_zcp: " 1467 "ring_sc_dequeue fail.\n", 1468 dev->device_fh); 1469 put_desc_to_used_list_zcp(vq, desc_idx); 1470 return; 1471 } 1472 1473 if (unlikely(vpool->buf_size > desc->len)) { 1474 LOG_DEBUG(VHOST_DATA, 1475 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1476 "length(%d) of descriptor idx: %d less than room " 1477 "size required: %d\n", 1478 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1479 put_desc_to_used_list_zcp(vq, desc_idx); 1480 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1481 return; 1482 } 1483 1484 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1485 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1486 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1487 mbuf->data_len = desc->len; 1488 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1489 1490 LOG_DEBUG(VHOST_DATA, 1491 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1492 "descriptor idx:%d\n", 1493 dev->device_fh, res_base_idx, desc_idx); 1494 1495 __rte_mbuf_raw_free(mbuf); 1496 1497 return; 1498 } 1499 1500 /* 1501 * Detach an attched packet mbuf - 1502 * - restore original mbuf address and length values. 1503 * - reset pktmbuf data and data_len to their default values. 1504 * All other fields of the given packet mbuf will be left intact. 1505 * 1506 * @param m 1507 * The attached packet mbuf. 
1508 */ 1509 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1510 { 1511 const struct rte_mempool *mp = m->pool; 1512 void *buf = RTE_MBUF_TO_BADDR(m); 1513 uint32_t buf_ofs; 1514 uint32_t buf_len = mp->elt_size - sizeof(*m); 1515 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1516 1517 m->buf_addr = buf; 1518 m->buf_len = (uint16_t)buf_len; 1519 1520 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1521 RTE_PKTMBUF_HEADROOM : m->buf_len; 1522 m->data_off = buf_ofs; 1523 1524 m->data_len = 0; 1525 } 1526 1527 /* 1528 * This function is called after packets have been transimited. It fetchs mbuf 1529 * from vpool->pool, detached it and put into vpool->ring. It also update the 1530 * used index and kick the guest if necessary. 1531 */ 1532 static inline uint32_t __attribute__((always_inline)) 1533 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1534 { 1535 struct rte_mbuf *mbuf; 1536 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1537 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1538 uint32_t index = 0; 1539 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1540 1541 LOG_DEBUG(VHOST_DATA, 1542 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1543 "clean is: %d\n", 1544 dev->device_fh, mbuf_count); 1545 LOG_DEBUG(VHOST_DATA, 1546 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1547 "clean is : %d\n", 1548 dev->device_fh, rte_ring_count(vpool->ring)); 1549 1550 for (index = 0; index < mbuf_count; index++) { 1551 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1552 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1553 pktmbuf_detach_zcp(mbuf); 1554 rte_ring_sp_enqueue(vpool->ring, mbuf); 1555 1556 /* Update used index buffer information. */ 1557 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1558 vq->used->ring[used_idx].len = 0; 1559 1560 used_idx = (used_idx + 1) & (vq->size - 1); 1561 } 1562 1563 LOG_DEBUG(VHOST_DATA, 1564 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1565 "clean is: %d\n", 1566 dev->device_fh, rte_mempool_count(vpool->pool)); 1567 LOG_DEBUG(VHOST_DATA, 1568 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1569 "clean is : %d\n", 1570 dev->device_fh, rte_ring_count(vpool->ring)); 1571 LOG_DEBUG(VHOST_DATA, 1572 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1573 "vq->last_used_idx:%d\n", 1574 dev->device_fh, vq->last_used_idx); 1575 1576 vq->last_used_idx += mbuf_count; 1577 1578 LOG_DEBUG(VHOST_DATA, 1579 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1580 "vq->last_used_idx:%d\n", 1581 dev->device_fh, vq->last_used_idx); 1582 1583 rte_compiler_barrier(); 1584 1585 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1586 1587 /* Kick guest if required. */ 1588 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1589 eventfd_write((int)vq->kickfd, 1); 1590 1591 return 0; 1592 } 1593 1594 /* 1595 * This function is called when a virtio device is destroy. 1596 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1597 */ 1598 static void mbuf_destroy_zcp(struct vpool *vpool) 1599 { 1600 struct rte_mbuf *mbuf = NULL; 1601 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1602 1603 LOG_DEBUG(VHOST_CONFIG, 1604 "in mbuf_destroy_zcp: mbuf count in mempool before " 1605 "mbuf_destroy_zcp is: %d\n", 1606 mbuf_count); 1607 LOG_DEBUG(VHOST_CONFIG, 1608 "in mbuf_destroy_zcp: mbuf count in ring before " 1609 "mbuf_destroy_zcp is : %d\n", 1610 rte_ring_count(vpool->ring)); 1611 1612 for (index = 0; index < mbuf_count; index++) { 1613 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1614 if (likely(mbuf != NULL)) { 1615 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1616 pktmbuf_detach_zcp(mbuf); 1617 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1618 } 1619 } 1620 1621 LOG_DEBUG(VHOST_CONFIG, 1622 "in mbuf_destroy_zcp: mbuf count in mempool after " 1623 "mbuf_destroy_zcp is: %d\n", 1624 rte_mempool_count(vpool->pool)); 1625 LOG_DEBUG(VHOST_CONFIG, 1626 "in mbuf_destroy_zcp: mbuf count in ring after " 1627 "mbuf_destroy_zcp is : %d\n", 1628 rte_ring_count(vpool->ring)); 1629 } 1630 1631 /* 1632 * This function update the use flag and counter. 1633 */ 1634 static inline uint32_t __attribute__((always_inline)) 1635 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1636 uint32_t count) 1637 { 1638 struct vhost_virtqueue *vq; 1639 struct vring_desc *desc; 1640 struct rte_mbuf *buff; 1641 /* The virtio_hdr is initialised to 0. */ 1642 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1643 = {{0, 0, 0, 0, 0, 0}, 0}; 1644 uint64_t buff_hdr_addr = 0; 1645 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1646 uint32_t head_idx, packet_success = 0; 1647 uint16_t res_cur_idx; 1648 1649 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1650 1651 if (count == 0) 1652 return 0; 1653 1654 vq = dev->virtqueue[VIRTIO_RXQ]; 1655 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1656 1657 res_cur_idx = vq->last_used_idx; 1658 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1659 dev->device_fh, res_cur_idx, res_cur_idx + count); 1660 1661 /* Retrieve all of the head indexes first to avoid caching issues. */ 1662 for (head_idx = 0; head_idx < count; head_idx++) 1663 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1664 1665 /*Prefetch descriptor index. */ 1666 rte_prefetch0(&vq->desc[head[packet_success]]); 1667 1668 while (packet_success != count) { 1669 /* Get descriptor from available ring */ 1670 desc = &vq->desc[head[packet_success]]; 1671 1672 buff = pkts[packet_success]; 1673 LOG_DEBUG(VHOST_DATA, 1674 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1675 "pkt[%d] descriptor idx: %d\n", 1676 dev->device_fh, packet_success, 1677 MBUF_HEADROOM_UINT32(buff)); 1678 1679 PRINT_PACKET(dev, 1680 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1681 + RTE_PKTMBUF_HEADROOM), 1682 rte_pktmbuf_data_len(buff), 0); 1683 1684 /* Buffer address translation for virtio header. */ 1685 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1686 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1687 1688 /* 1689 * If the descriptors are chained the header and data are 1690 * placed in separate buffers. 
1691 */ 1692 if (desc->flags & VRING_DESC_F_NEXT) { 1693 desc->len = vq->vhost_hlen; 1694 desc = &vq->desc[desc->next]; 1695 desc->len = rte_pktmbuf_data_len(buff); 1696 } else { 1697 desc->len = packet_len; 1698 } 1699 1700 /* Update used ring with desc information */ 1701 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1702 = head[packet_success]; 1703 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1704 = packet_len; 1705 res_cur_idx++; 1706 packet_success++; 1707 1708 /* A header is required per buffer. */ 1709 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1710 (const void *)&virtio_hdr, vq->vhost_hlen); 1711 1712 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1713 1714 if (likely(packet_success < count)) { 1715 /* Prefetch descriptor index. */ 1716 rte_prefetch0(&vq->desc[head[packet_success]]); 1717 } 1718 } 1719 1720 rte_compiler_barrier(); 1721 1722 LOG_DEBUG(VHOST_DATA, 1723 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1724 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1725 dev->device_fh, vq->last_used_idx, vq->used->idx); 1726 1727 *(volatile uint16_t *)&vq->used->idx += count; 1728 vq->last_used_idx += count; 1729 1730 LOG_DEBUG(VHOST_DATA, 1731 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1732 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1733 dev->device_fh, vq->last_used_idx, vq->used->idx); 1734 1735 /* Kick the guest if necessary. */ 1736 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1737 eventfd_write((int)vq->kickfd, 1); 1738 1739 return count; 1740 } 1741 1742 /* 1743 * This function routes the TX packet to the correct interface. 1744 * This may be a local device or the physical port. 1745 */ 1746 static inline void __attribute__((always_inline)) 1747 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1748 uint32_t desc_idx, uint8_t need_copy) 1749 { 1750 struct mbuf_table *tx_q; 1751 struct rte_mbuf **m_table; 1752 struct rte_mbuf *mbuf = NULL; 1753 unsigned len, ret, offset = 0; 1754 struct vpool *vpool; 1755 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1756 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1757 1758 /*Add packet to the port tx queue*/ 1759 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1760 len = tx_q->len; 1761 1762 /* Allocate an mbuf and populate the structure. */ 1763 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1764 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1765 if (unlikely(mbuf == NULL)) { 1766 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1767 RTE_LOG(ERR, VHOST_DATA, 1768 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1769 dev->device_fh); 1770 put_desc_to_used_list_zcp(vq, desc_idx); 1771 return; 1772 } 1773 1774 if (vm2vm_mode == VM2VM_HARDWARE) { 1775 /* Avoid using a vlan tag from any vm for external pkt, such as 1776 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1777 * selection, MAC address determines it as an external pkt 1778 * which should go to network, while vlan tag determine it as 1779 * a vm2vm pkt should forward to another vm. Hardware confuse 1780 * such a ambiguous situation, so pkt will lost. 
1781 */ 1782 vlan_tag = external_pkt_default_vlan_tag; 1783 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1784 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1785 __rte_mbuf_raw_free(mbuf); 1786 return; 1787 } 1788 } 1789 1790 mbuf->nb_segs = m->nb_segs; 1791 mbuf->next = m->next; 1792 mbuf->data_len = m->data_len + offset; 1793 mbuf->pkt_len = mbuf->data_len; 1794 if (unlikely(need_copy)) { 1795 /* Copy the packet contents to the mbuf. */ 1796 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1797 rte_pktmbuf_mtod(m, void *), 1798 m->data_len); 1799 } else { 1800 mbuf->data_off = m->data_off; 1801 mbuf->buf_physaddr = m->buf_physaddr; 1802 mbuf->buf_addr = m->buf_addr; 1803 } 1804 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1805 mbuf->vlan_tci = vlan_tag; 1806 mbuf->l2_len = sizeof(struct ether_hdr); 1807 mbuf->l3_len = sizeof(struct ipv4_hdr); 1808 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1809 1810 tx_q->m_table[len] = mbuf; 1811 len++; 1812 1813 LOG_DEBUG(VHOST_DATA, 1814 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1815 dev->device_fh, 1816 mbuf->nb_segs, 1817 (mbuf->next == NULL) ? "null" : "non-null"); 1818 1819 if (enable_stats) { 1820 dev_statistics[dev->device_fh].tx_total++; 1821 dev_statistics[dev->device_fh].tx++; 1822 } 1823 1824 if (unlikely(len == MAX_PKT_BURST)) { 1825 m_table = (struct rte_mbuf **)tx_q->m_table; 1826 ret = rte_eth_tx_burst(ports[0], 1827 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1828 1829 /* 1830 * Free any buffers not handled by TX and update 1831 * the port stats. 1832 */ 1833 if (unlikely(ret < len)) { 1834 do { 1835 rte_pktmbuf_free(m_table[ret]); 1836 } while (++ret < len); 1837 } 1838 1839 len = 0; 1840 txmbuf_clean_zcp(dev, vpool); 1841 } 1842 1843 tx_q->len = len; 1844 1845 return; 1846 } 1847 1848 /* 1849 * This function TX all available packets in virtio TX queue for one 1850 * virtio-net device. If it is first packet, it learns MAC address and 1851 * setup VMDQ. 1852 */ 1853 static inline void __attribute__((always_inline)) 1854 virtio_dev_tx_zcp(struct virtio_net *dev) 1855 { 1856 struct rte_mbuf m; 1857 struct vhost_virtqueue *vq; 1858 struct vring_desc *desc; 1859 uint64_t buff_addr = 0, phys_addr; 1860 uint32_t head[MAX_PKT_BURST]; 1861 uint32_t i; 1862 uint16_t free_entries, packet_success = 0; 1863 uint16_t avail_idx; 1864 uint8_t need_copy = 0; 1865 hpa_type addr_type; 1866 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1867 1868 vq = dev->virtqueue[VIRTIO_TXQ]; 1869 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1870 1871 /* If there are no available buffers then return. */ 1872 if (vq->last_used_idx_res == avail_idx) 1873 return; 1874 1875 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1876 1877 /* Prefetch available ring to retrieve head indexes. */ 1878 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1879 1880 /* Get the number of free entries in the ring */ 1881 free_entries = (avail_idx - vq->last_used_idx_res); 1882 1883 /* Limit to MAX_PKT_BURST. */ 1884 free_entries 1885 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1886 1887 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1888 dev->device_fh, free_entries); 1889 1890 /* Retrieve all of the head indexes first to avoid caching issues. */ 1891 for (i = 0; i < free_entries; i++) 1892 head[i] 1893 = vq->avail->ring[(vq->last_used_idx_res + i) 1894 & (vq->size - 1)]; 1895 1896 vq->last_used_idx_res += free_entries; 1897 1898 /* Prefetch descriptor index. 
*/ 1899 rte_prefetch0(&vq->desc[head[packet_success]]); 1900 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1901 1902 while (packet_success < free_entries) { 1903 desc = &vq->desc[head[packet_success]]; 1904 1905 /* Discard first buffer as it is the virtio header */ 1906 desc = &vq->desc[desc->next]; 1907 1908 /* Buffer address translation. */ 1909 buff_addr = gpa_to_vva(dev, desc->addr); 1910 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1911 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1912 &addr_type); 1913 1914 if (likely(packet_success < (free_entries - 1))) 1915 /* Prefetch descriptor index. */ 1916 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1917 1918 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1919 RTE_LOG(ERR, VHOST_DATA, 1920 "(%"PRIu64") Invalid frame buffer address found" 1921 "when TX packets!\n", 1922 dev->device_fh); 1923 packet_success++; 1924 continue; 1925 } 1926 1927 /* Prefetch buffer address. */ 1928 rte_prefetch0((void *)(uintptr_t)buff_addr); 1929 1930 /* 1931 * Setup dummy mbuf. This is copied to a real mbuf if 1932 * transmitted out the physical port. 1933 */ 1934 m.data_len = desc->len; 1935 m.nb_segs = 1; 1936 m.next = NULL; 1937 m.data_off = 0; 1938 m.buf_addr = (void *)(uintptr_t)buff_addr; 1939 m.buf_physaddr = phys_addr; 1940 1941 /* 1942 * Check if the frame buffer address from guest crosses 1943 * sub-region or not. 1944 */ 1945 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1946 RTE_LOG(ERR, VHOST_DATA, 1947 "(%"PRIu64") Frame buffer address cross " 1948 "sub-regioin found when attaching TX frame " 1949 "buffer address!\n", 1950 dev->device_fh); 1951 need_copy = 1; 1952 } else 1953 need_copy = 0; 1954 1955 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1956 1957 /* 1958 * If this is the first received packet we need to learn 1959 * the MAC and setup VMDQ 1960 */ 1961 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1962 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1963 /* 1964 * Discard frame if device is scheduled for 1965 * removal or a duplicate MAC address is found. 1966 */ 1967 packet_success += free_entries; 1968 vq->last_used_idx += packet_success; 1969 break; 1970 } 1971 } 1972 1973 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1974 packet_success++; 1975 } 1976 } 1977 1978 /* 1979 * This function is called by each data core. It handles all RX/TX registered 1980 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1981 * addresses are compared with all devices in the main linked list. 
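 *
 * Roughly, each pass of the zero-copy loop below:
 *   - drains tx_queue_zcp[] with rte_eth_tx_burst() once BURST_TX_DRAIN_US
 *     has elapsed and reclaims transmitted mbufs via txmbuf_clean_zcp();
 *   - acknowledges a pending device-removal request (REQUEST_DEV_REMOVAL ->
 *     ACK_DEV_REMOVAL);
 *   - for each device on this lcore: refills the VMDq RX queue with guest
 *     buffers (attach_rxmbuf_zcp()), receives a burst (rte_eth_rx_burst()),
 *     delivers it to the guest (virtio_dev_rx_zcp()) and then services the
 *     guest TX ring (virtio_dev_tx_zcp()).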
1982 */ 1983 static int 1984 switch_worker_zcp(__attribute__((unused)) void *arg) 1985 { 1986 struct virtio_net *dev = NULL; 1987 struct vhost_dev *vdev = NULL; 1988 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1989 struct virtio_net_data_ll *dev_ll; 1990 struct mbuf_table *tx_q; 1991 volatile struct lcore_ll_info *lcore_ll; 1992 const uint64_t drain_tsc 1993 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 1994 * BURST_TX_DRAIN_US; 1995 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1996 unsigned ret; 1997 const uint16_t lcore_id = rte_lcore_id(); 1998 uint16_t count_in_ring, rx_count = 0; 1999 2000 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2001 2002 lcore_ll = lcore_info[lcore_id].lcore_ll; 2003 prev_tsc = 0; 2004 2005 while (1) { 2006 cur_tsc = rte_rdtsc(); 2007 2008 /* TX burst queue drain */ 2009 diff_tsc = cur_tsc - prev_tsc; 2010 if (unlikely(diff_tsc > drain_tsc)) { 2011 /* 2012 * Get mbuf from vpool.pool and detach mbuf and 2013 * put back into vpool.ring. 2014 */ 2015 dev_ll = lcore_ll->ll_root_used; 2016 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2017 /* Get virtio device ID */ 2018 vdev = dev_ll->vdev; 2019 dev = vdev->dev; 2020 2021 if (likely(!vdev->remove)) { 2022 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2023 if (tx_q->len) { 2024 LOG_DEBUG(VHOST_DATA, 2025 "TX queue drained after timeout" 2026 " with burst size %u\n", 2027 tx_q->len); 2028 2029 /* 2030 * Tx any packets in the queue 2031 */ 2032 ret = rte_eth_tx_burst( 2033 ports[0], 2034 (uint16_t)tx_q->txq_id, 2035 (struct rte_mbuf **) 2036 tx_q->m_table, 2037 (uint16_t)tx_q->len); 2038 if (unlikely(ret < tx_q->len)) { 2039 do { 2040 rte_pktmbuf_free( 2041 tx_q->m_table[ret]); 2042 } while (++ret < tx_q->len); 2043 } 2044 tx_q->len = 0; 2045 2046 txmbuf_clean_zcp(dev, 2047 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2048 } 2049 } 2050 dev_ll = dev_ll->next; 2051 } 2052 prev_tsc = cur_tsc; 2053 } 2054 2055 rte_prefetch0(lcore_ll->ll_root_used); 2056 2057 /* 2058 * Inform the configuration core that we have exited the linked 2059 * list and that no devices are in use if requested. 2060 */ 2061 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2062 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2063 2064 /* Process devices */ 2065 dev_ll = lcore_ll->ll_root_used; 2066 2067 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2068 vdev = dev_ll->vdev; 2069 dev = vdev->dev; 2070 if (unlikely(vdev->remove)) { 2071 dev_ll = dev_ll->next; 2072 unlink_vmdq(vdev); 2073 vdev->ready = DEVICE_SAFE_REMOVE; 2074 continue; 2075 } 2076 2077 if (likely(vdev->ready == DEVICE_RX)) { 2078 uint32_t index = vdev->vmdq_rx_q; 2079 uint16_t i; 2080 count_in_ring 2081 = rte_ring_count(vpool_array[index].ring); 2082 uint16_t free_entries 2083 = (uint16_t)get_available_ring_num_zcp(dev); 2084 2085 /* 2086 * Attach all mbufs in vpool.ring and put back 2087 * into vpool.pool. 
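 * The refill below is bounded by the minimum of free_entries, count_in_ring
 * and MAX_PKT_BURST: the guest descriptors actually available, the detached
 * mbufs waiting in the ring, and the burst size.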
2088 */ 2089 for (i = 0; 2090 i < RTE_MIN(free_entries, 2091 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2092 i++) 2093 attach_rxmbuf_zcp(dev); 2094 2095 /* Handle guest RX */ 2096 rx_count = rte_eth_rx_burst(ports[0], 2097 vdev->vmdq_rx_q, pkts_burst, 2098 MAX_PKT_BURST); 2099 2100 if (rx_count) { 2101 ret_count = virtio_dev_rx_zcp(dev, 2102 pkts_burst, rx_count); 2103 if (enable_stats) { 2104 dev_statistics[dev->device_fh].rx_total 2105 += rx_count; 2106 dev_statistics[dev->device_fh].rx 2107 += ret_count; 2108 } 2109 while (likely(rx_count)) { 2110 rx_count--; 2111 pktmbuf_detach_zcp( 2112 pkts_burst[rx_count]); 2113 rte_ring_sp_enqueue( 2114 vpool_array[index].ring, 2115 (void *)pkts_burst[rx_count]); 2116 } 2117 } 2118 } 2119 2120 if (likely(!vdev->remove)) 2121 /* Handle guest TX */ 2122 virtio_dev_tx_zcp(dev); 2123 2124 /* Move to the next device in the list */ 2125 dev_ll = dev_ll->next; 2126 } 2127 } 2128 2129 return 0; 2130 } 2131 2132 2133 /* 2134 * Add an entry to a used linked list. A free entry must first be found 2135 * in the free linked list using get_data_ll_free_entry(); 2136 */ 2137 static void 2138 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2139 struct virtio_net_data_ll *ll_dev) 2140 { 2141 struct virtio_net_data_ll *ll = *ll_root_addr; 2142 2143 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2144 ll_dev->next = NULL; 2145 rte_compiler_barrier(); 2146 2147 /* If ll == NULL then this is the first device. */ 2148 if (ll) { 2149 /* Increment to the tail of the linked list. */ 2150 while ((ll->next != NULL) ) 2151 ll = ll->next; 2152 2153 ll->next = ll_dev; 2154 } else { 2155 *ll_root_addr = ll_dev; 2156 } 2157 } 2158 2159 /* 2160 * Remove an entry from a used linked list. The entry must then be added to 2161 * the free linked list using put_data_ll_free_entry(). 2162 */ 2163 static void 2164 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2165 struct virtio_net_data_ll *ll_dev, 2166 struct virtio_net_data_ll *ll_dev_last) 2167 { 2168 struct virtio_net_data_ll *ll = *ll_root_addr; 2169 2170 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2171 return; 2172 2173 if (ll_dev == ll) 2174 *ll_root_addr = ll_dev->next; 2175 else 2176 if (likely(ll_dev_last != NULL)) 2177 ll_dev_last->next = ll_dev->next; 2178 else 2179 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2180 } 2181 2182 /* 2183 * Find and return an entry from the free linked list. 2184 */ 2185 static struct virtio_net_data_ll * 2186 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2187 { 2188 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2189 struct virtio_net_data_ll *ll_dev; 2190 2191 if (ll_free == NULL) 2192 return NULL; 2193 2194 ll_dev = ll_free; 2195 *ll_root_addr = ll_free->next; 2196 2197 return ll_dev; 2198 } 2199 2200 /* 2201 * Place an entry back on to the free linked list. 2202 */ 2203 static void 2204 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2205 struct virtio_net_data_ll *ll_dev) 2206 { 2207 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2208 2209 if (ll_dev == NULL) 2210 return; 2211 2212 ll_dev->next = ll_free; 2213 *ll_root_addr = ll_dev; 2214 } 2215 2216 /* 2217 * Creates a linked list of a given size. 2218 */ 2219 static struct virtio_net_data_ll * 2220 alloc_data_ll(uint32_t size) 2221 { 2222 struct virtio_net_data_ll *ll_new; 2223 uint32_t i; 2224 2225 /* Malloc and then chain the linked list. 
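 * (Each of the first size-1 entries points at the next array element; the
 * last entry's next pointer is left NULL.)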
*/ 2226 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2227 if (ll_new == NULL) { 2228 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2229 return NULL; 2230 } 2231 2232 for (i = 0; i < size - 1; i++) { 2233 ll_new[i].vdev = NULL; 2234 ll_new[i].next = &ll_new[i+1]; 2235 } 2236 ll_new[i].next = NULL; 2237 2238 return (ll_new); 2239 } 2240 2241 /* 2242 * Create the main linked list along with each individual cores linked list. A used and a free list 2243 * are created to manage entries. 2244 */ 2245 static int 2246 init_data_ll (void) 2247 { 2248 int lcore; 2249 2250 RTE_LCORE_FOREACH_SLAVE(lcore) { 2251 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2252 if (lcore_info[lcore].lcore_ll == NULL) { 2253 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2254 return -1; 2255 } 2256 2257 lcore_info[lcore].lcore_ll->device_num = 0; 2258 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2259 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2260 if (num_devices % num_switching_cores) 2261 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2262 else 2263 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2264 } 2265 2266 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2267 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2268 2269 return 0; 2270 } 2271 2272 /* 2273 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2274 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2275 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2276 */ 2277 static void 2278 destroy_device (volatile struct virtio_net *dev) 2279 { 2280 struct virtio_net_data_ll *ll_lcore_dev_cur; 2281 struct virtio_net_data_ll *ll_main_dev_cur; 2282 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2283 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2284 struct vhost_dev *vdev; 2285 int lcore; 2286 2287 dev->flags &= ~VIRTIO_DEV_RUNNING; 2288 2289 vdev = (struct vhost_dev *)dev->priv; 2290 /*set the remove flag. */ 2291 vdev->remove = 1; 2292 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2293 rte_pause(); 2294 } 2295 2296 /* Search for entry to be removed from lcore ll */ 2297 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2298 while (ll_lcore_dev_cur != NULL) { 2299 if (ll_lcore_dev_cur->vdev == vdev) { 2300 break; 2301 } else { 2302 ll_lcore_dev_last = ll_lcore_dev_cur; 2303 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2304 } 2305 } 2306 2307 if (ll_lcore_dev_cur == NULL) { 2308 RTE_LOG(ERR, VHOST_CONFIG, 2309 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2310 dev->device_fh); 2311 return; 2312 } 2313 2314 /* Search for entry to be removed from main ll */ 2315 ll_main_dev_cur = ll_root_used; 2316 ll_main_dev_last = NULL; 2317 while (ll_main_dev_cur != NULL) { 2318 if (ll_main_dev_cur->vdev == vdev) { 2319 break; 2320 } else { 2321 ll_main_dev_last = ll_main_dev_cur; 2322 ll_main_dev_cur = ll_main_dev_cur->next; 2323 } 2324 } 2325 2326 /* Remove entries from the lcore and main ll. */ 2327 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2328 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2329 2330 /* Set the dev_removal_flag on each lcore. 
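 * (REQUEST_DEV_REMOVAL). Each switch_worker loop writes ACK_DEV_REMOVAL back
 * once it is no longer walking the linked list, which is what the wait below
 * relies on.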
*/
2331 RTE_LCORE_FOREACH_SLAVE(lcore) {
2332 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2333 }
2334
2335 /*
2336 * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL we can
2337 * be sure that it can no longer access the device removed from the linked
2338 * lists and that the device is no longer in use.
2339 */
2340 RTE_LCORE_FOREACH_SLAVE(lcore) {
2341 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2342 rte_pause();
2343 }
2344 }
2345
2346 /* Add the entries back to the lcore and main free ll. */
2347 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2348 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2349
2350 /* Decrement the number of devices on the lcore. */
2351 lcore_info[vdev->coreid].lcore_ll->device_num--;
2352
2353 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2354
2355 if (zero_copy) {
2356 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2357
2358 /* Stop the RX queue. */
2359 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2360 LOG_DEBUG(VHOST_CONFIG,
2361 "(%"PRIu64") In destroy_device: Failed to stop "
2362 "rx queue:%d\n",
2363 dev->device_fh,
2364 vdev->vmdq_rx_q);
2365 }
2366
2367 LOG_DEBUG(VHOST_CONFIG,
2368 "(%"PRIu64") in destroy_device: Start putting mbufs from "
2369 "mempool back into ring for RX queue: %d\n",
2370 dev->device_fh, vdev->vmdq_rx_q);
2371
2372 mbuf_destroy_zcp(vpool);
2373
2374 /* Stop the TX queue. */
2375 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2376 LOG_DEBUG(VHOST_CONFIG,
2377 "(%"PRIu64") In destroy_device: Failed to "
2378 "stop tx queue:%d\n",
2379 dev->device_fh, vdev->vmdq_rx_q);
2380 }
2381
2382 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2383
2384 LOG_DEBUG(VHOST_CONFIG,
2385 "(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2386 "back into ring for TX queue: %d, dev:(%"PRIu64")\n",
2387 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2388 dev->device_fh);
2389
2390 mbuf_destroy_zcp(vpool);
2391 rte_free(vdev->regions_hpa);
2392 }
2393 rte_free(vdev);
2394
2395 }
2396
2397 /*
2398 * Calculate the number of physically contiguous sub-regions within one
2399 * region whose vhost virtual address range is contiguous. The region
2400 * starts at vva_start and is 'size' bytes long.
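 * For example (hypothetical addresses): if a four-page VVA range maps to host
 * physical pages 0x1000, 0x2000, 0x5000 and 0x6000, there is one break in
 * physical contiguity, so the function returns 1 and new_device() accounts
 * for one extra sub-region.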
2401 */ 2402 static uint32_t 2403 check_hpa_regions(uint64_t vva_start, uint64_t size) 2404 { 2405 uint32_t i, nregions = 0, page_size = getpagesize(); 2406 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2407 if (vva_start % page_size) { 2408 LOG_DEBUG(VHOST_CONFIG, 2409 "in check_countinous: vva start(%p) mod page_size(%d) " 2410 "has remainder\n", 2411 (void *)(uintptr_t)vva_start, page_size); 2412 return 0; 2413 } 2414 if (size % page_size) { 2415 LOG_DEBUG(VHOST_CONFIG, 2416 "in check_countinous: " 2417 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2418 size, page_size); 2419 return 0; 2420 } 2421 for (i = 0; i < size - page_size; i = i + page_size) { 2422 cur_phys_addr 2423 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2424 next_phys_addr = rte_mem_virt2phy( 2425 (void *)(uintptr_t)(vva_start + i + page_size)); 2426 if ((cur_phys_addr + page_size) != next_phys_addr) { 2427 ++nregions; 2428 LOG_DEBUG(VHOST_CONFIG, 2429 "in check_continuous: hva addr:(%p) is not " 2430 "continuous with hva addr:(%p), diff:%d\n", 2431 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2432 (void *)(uintptr_t)(vva_start + (uint64_t)i 2433 + page_size), page_size); 2434 LOG_DEBUG(VHOST_CONFIG, 2435 "in check_continuous: hpa addr:(%p) is not " 2436 "continuous with hpa addr:(%p), " 2437 "diff:(%"PRIu64")\n", 2438 (void *)(uintptr_t)cur_phys_addr, 2439 (void *)(uintptr_t)next_phys_addr, 2440 (next_phys_addr-cur_phys_addr)); 2441 } 2442 } 2443 return nregions; 2444 } 2445 2446 /* 2447 * Divide each region whose vhost virtual address is continous into a few 2448 * sub-regions, make sure the physical address within each sub-region are 2449 * continous. And fill offset(to GPA) and size etc. information of each 2450 * sub-region into regions_hpa. 2451 */ 2452 static uint32_t 2453 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2454 { 2455 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2456 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2457 2458 if (mem_region_hpa == NULL) 2459 return 0; 2460 2461 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2462 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2463 virtio_memory->regions[regionidx].address_offset; 2464 mem_region_hpa[regionidx_hpa].guest_phys_address 2465 = virtio_memory->regions[regionidx].guest_phys_address; 2466 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2467 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2468 mem_region_hpa[regionidx_hpa].guest_phys_address; 2469 LOG_DEBUG(VHOST_CONFIG, 2470 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2471 regionidx_hpa, 2472 (void *)(uintptr_t) 2473 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2474 LOG_DEBUG(VHOST_CONFIG, 2475 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2476 regionidx_hpa, 2477 (void *)(uintptr_t) 2478 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2479 for (i = 0, k = 0; 2480 i < virtio_memory->regions[regionidx].memory_size - 2481 page_size; 2482 i += page_size) { 2483 cur_phys_addr = rte_mem_virt2phy( 2484 (void *)(uintptr_t)(vva_start + i)); 2485 next_phys_addr = rte_mem_virt2phy( 2486 (void *)(uintptr_t)(vva_start + 2487 i + page_size)); 2488 if ((cur_phys_addr + page_size) != next_phys_addr) { 2489 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2490 mem_region_hpa[regionidx_hpa].guest_phys_address + 2491 k + page_size; 2492 mem_region_hpa[regionidx_hpa].memory_size 2493 = k + 
page_size;
2494 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2495 "phys addr end [%d]:(%p)\n",
2496 regionidx_hpa,
2497 (void *)(uintptr_t)
2498 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2499 LOG_DEBUG(VHOST_CONFIG,
2500 "in fill_hpa_regions: guest phys addr "
2501 "size [%d]:(%p)\n",
2502 regionidx_hpa,
2503 (void *)(uintptr_t)
2504 (mem_region_hpa[regionidx_hpa].memory_size));
2505 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2506 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2507 ++regionidx_hpa;
2508 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2509 next_phys_addr -
2510 mem_region_hpa[regionidx_hpa].guest_phys_address;
2511 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2512 " phys addr start[%d]:(%p)\n",
2513 regionidx_hpa,
2514 (void *)(uintptr_t)
2515 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2516 LOG_DEBUG(VHOST_CONFIG,
2517 "in fill_hpa_regions: host phys addr "
2518 "start[%d]:(%p)\n",
2519 regionidx_hpa,
2520 (void *)(uintptr_t)
2521 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2522 k = 0;
2523 } else {
2524 k += page_size;
2525 }
2526 }
2527 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2528 = mem_region_hpa[regionidx_hpa].guest_phys_address
2529 + k + page_size;
2530 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2531 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2532 "[%d]:(%p)\n", regionidx_hpa,
2533 (void *)(uintptr_t)
2534 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2535 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2536 "[%d]:(%p)\n", regionidx_hpa,
2537 (void *)(uintptr_t)
2538 (mem_region_hpa[regionidx_hpa].memory_size));
2539 ++regionidx_hpa;
2540 }
2541 return regionidx_hpa;
2542 }
2543
2544 /*
2545 * A new device is added to a data core. First the device is added to the main
2546 * linked list and then allocated to a specific data core.
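 * In zero-copy mode the mbufs waiting in the matching vpool ring are attached
 * to guest buffers (attach_rxmbuf_zcp()) and the device's VMDq RX/TX queues
 * are started before traffic begins; the device is then bound to the lcore
 * that currently has the fewest devices.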
2547 */ 2548 static int 2549 new_device (struct virtio_net *dev) 2550 { 2551 struct virtio_net_data_ll *ll_dev; 2552 int lcore, core_add = 0; 2553 uint32_t device_num_min = num_devices; 2554 struct vhost_dev *vdev; 2555 uint32_t regionidx; 2556 2557 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2558 if (vdev == NULL) { 2559 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2560 dev->device_fh); 2561 return -1; 2562 } 2563 vdev->dev = dev; 2564 dev->priv = vdev; 2565 2566 if (zero_copy) { 2567 vdev->nregions_hpa = dev->mem->nregions; 2568 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2569 vdev->nregions_hpa 2570 += check_hpa_regions( 2571 dev->mem->regions[regionidx].guest_phys_address 2572 + dev->mem->regions[regionidx].address_offset, 2573 dev->mem->regions[regionidx].memory_size); 2574 2575 } 2576 2577 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2578 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2579 RTE_CACHE_LINE_SIZE); 2580 if (vdev->regions_hpa == NULL) { 2581 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2582 rte_free(vdev); 2583 return -1; 2584 } 2585 2586 2587 if (fill_hpa_memory_regions( 2588 vdev->regions_hpa, dev->mem 2589 ) != vdev->nregions_hpa) { 2590 2591 RTE_LOG(ERR, VHOST_CONFIG, 2592 "hpa memory regions number mismatch: " 2593 "[%d]\n", vdev->nregions_hpa); 2594 rte_free(vdev->regions_hpa); 2595 rte_free(vdev); 2596 return -1; 2597 } 2598 } 2599 2600 2601 /* Add device to main ll */ 2602 ll_dev = get_data_ll_free_entry(&ll_root_free); 2603 if (ll_dev == NULL) { 2604 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2605 "of %d devices per core has been reached\n", 2606 dev->device_fh, num_devices); 2607 if (vdev->regions_hpa) 2608 rte_free(vdev->regions_hpa); 2609 rte_free(vdev); 2610 return -1; 2611 } 2612 ll_dev->vdev = vdev; 2613 add_data_ll_entry(&ll_root_used, ll_dev); 2614 vdev->vmdq_rx_q 2615 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2616 2617 if (zero_copy) { 2618 uint32_t index = vdev->vmdq_rx_q; 2619 uint32_t count_in_ring, i; 2620 struct mbuf_table *tx_q; 2621 2622 count_in_ring = rte_ring_count(vpool_array[index].ring); 2623 2624 LOG_DEBUG(VHOST_CONFIG, 2625 "(%"PRIu64") in new_device: mbuf count in mempool " 2626 "before attach is: %d\n", 2627 dev->device_fh, 2628 rte_mempool_count(vpool_array[index].pool)); 2629 LOG_DEBUG(VHOST_CONFIG, 2630 "(%"PRIu64") in new_device: mbuf count in ring " 2631 "before attach is : %d\n", 2632 dev->device_fh, count_in_ring); 2633 2634 /* 2635 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
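 * (The attach loop below pairs each detached mbuf with a guest RX descriptor
 * so the newly started RX queue can DMA straight into guest buffers.)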
2636 */
2637 for (i = 0; i < count_in_ring; i++)
2638 attach_rxmbuf_zcp(dev);
2639
2640 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2641 "mempool after attach is: %d\n",
2642 dev->device_fh,
2643 rte_mempool_count(vpool_array[index].pool));
2644 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2645 "ring after attach is: %d\n",
2646 dev->device_fh,
2647 rte_ring_count(vpool_array[index].ring));
2648
2649 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2650 tx_q->txq_id = vdev->vmdq_rx_q;
2651
2652 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2653 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2654
2655 LOG_DEBUG(VHOST_CONFIG,
2656 "(%"PRIu64") In new_device: Failed to start "
2657 "tx queue:%d\n",
2658 dev->device_fh, vdev->vmdq_rx_q);
2659
2660 mbuf_destroy_zcp(vpool);
2661 rte_free(vdev->regions_hpa);
2662 rte_free(vdev);
2663 return -1;
2664 }
2665
2666 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2667 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2668
2669 LOG_DEBUG(VHOST_CONFIG,
2670 "(%"PRIu64") In new_device: Failed to start "
2671 "rx queue:%d\n",
2672 dev->device_fh, vdev->vmdq_rx_q);
2673
2674 /* Stop the TX queue. */
2675 if (rte_eth_dev_tx_queue_stop(ports[0],
2676 vdev->vmdq_rx_q) != 0) {
2677 LOG_DEBUG(VHOST_CONFIG,
2678 "(%"PRIu64") In new_device: Failed to "
2679 "stop tx queue:%d\n",
2680 dev->device_fh, vdev->vmdq_rx_q);
2681 }
2682
2683 mbuf_destroy_zcp(vpool);
2684 rte_free(vdev->regions_hpa);
2685 rte_free(vdev);
2686 return -1;
2687 }
2688
2689 }
2690
2691 /* Reset ready flag. */
2692 vdev->ready = DEVICE_MAC_LEARNING;
2693 vdev->remove = 0;
2694
2695 /* Find a suitable lcore to add the device. */
2696 RTE_LCORE_FOREACH_SLAVE(lcore) {
2697 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2698 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2699 core_add = lcore;
2700 }
2701 }
2702 /* Add device to lcore ll */
2703 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2704 if (ll_dev == NULL) {
2705 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2706 vdev->ready = DEVICE_SAFE_REMOVE;
2707 destroy_device(dev);
2708 if (vdev->regions_hpa)
2709 rte_free(vdev->regions_hpa);
2710 rte_free(vdev);
2711 return -1;
2712 }
2713 ll_dev->vdev = vdev;
2714 vdev->coreid = core_add;
2715
2716 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2717
2718 /* Initialize device stats */
2719 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2720
2721 /* Disable notifications. */
2722 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2723 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2724 lcore_info[vdev->coreid].lcore_ll->device_num++;
2725 dev->flags |= VIRTIO_DEV_RUNNING;
2726
2727 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2728
2729 return 0;
2730 }
2731
2732 /*
2733 * These callbacks allow devices to be added to the data core when
2734 * configuration has fully completed.
2735 */
2736 static const struct virtio_net_device_ops virtio_net_device_ops =
2737 {
2738 .new_device = new_device,
2739 .destroy_device = destroy_device,
2740 };
2741
2742 /*
2743 * This thread wakes up periodically to print statistics if the user has
2744 * enabled them.
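 * It sleeps for enable_stats seconds between refreshes and prints per-device
 * TX/RX totals, drops and successes for every entry in ll_root_used.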
2745 */ 2746 static void 2747 print_stats(void) 2748 { 2749 struct virtio_net_data_ll *dev_ll; 2750 uint64_t tx_dropped, rx_dropped; 2751 uint64_t tx, tx_total, rx, rx_total; 2752 uint32_t device_fh; 2753 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2754 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2755 2756 while(1) { 2757 sleep(enable_stats); 2758 2759 /* Clear screen and move to top left */ 2760 printf("%s%s", clr, top_left); 2761 2762 printf("\nDevice statistics ===================================="); 2763 2764 dev_ll = ll_root_used; 2765 while (dev_ll != NULL) { 2766 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2767 tx_total = dev_statistics[device_fh].tx_total; 2768 tx = dev_statistics[device_fh].tx; 2769 tx_dropped = tx_total - tx; 2770 if (zero_copy == 0) { 2771 rx_total = rte_atomic64_read( 2772 &dev_statistics[device_fh].rx_total_atomic); 2773 rx = rte_atomic64_read( 2774 &dev_statistics[device_fh].rx_atomic); 2775 } else { 2776 rx_total = dev_statistics[device_fh].rx_total; 2777 rx = dev_statistics[device_fh].rx; 2778 } 2779 rx_dropped = rx_total - rx; 2780 2781 printf("\nStatistics for device %"PRIu32" ------------------------------" 2782 "\nTX total: %"PRIu64"" 2783 "\nTX dropped: %"PRIu64"" 2784 "\nTX successful: %"PRIu64"" 2785 "\nRX total: %"PRIu64"" 2786 "\nRX dropped: %"PRIu64"" 2787 "\nRX successful: %"PRIu64"", 2788 device_fh, 2789 tx_total, 2790 tx_dropped, 2791 tx, 2792 rx_total, 2793 rx_dropped, 2794 rx); 2795 2796 dev_ll = dev_ll->next; 2797 } 2798 printf("\n======================================================\n"); 2799 } 2800 } 2801 2802 static void 2803 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2804 char *ring_name, uint32_t nb_mbuf) 2805 { 2806 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2807 vpool_array[index].pool 2808 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2809 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2810 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2811 rte_pktmbuf_init, NULL, socket, 0); 2812 if (vpool_array[index].pool != NULL) { 2813 vpool_array[index].ring 2814 = rte_ring_create(ring_name, 2815 rte_align32pow2(nb_mbuf + 1), 2816 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2817 if (likely(vpool_array[index].ring != NULL)) { 2818 LOG_DEBUG(VHOST_CONFIG, 2819 "in setup_mempool_tbl: mbuf count in " 2820 "mempool is: %d\n", 2821 rte_mempool_count(vpool_array[index].pool)); 2822 LOG_DEBUG(VHOST_CONFIG, 2823 "in setup_mempool_tbl: mbuf count in " 2824 "ring is: %d\n", 2825 rte_ring_count(vpool_array[index].ring)); 2826 } else { 2827 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2828 ring_name); 2829 } 2830 2831 /* Need consider head room. */ 2832 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2833 } else { 2834 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2835 } 2836 } 2837 2838 2839 /* 2840 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2841 * device is also registered here to handle the IOCTLs. 
2842 */ 2843 int 2844 main(int argc, char *argv[]) 2845 { 2846 struct rte_mempool *mbuf_pool = NULL; 2847 unsigned lcore_id, core_id = 0; 2848 unsigned nb_ports, valid_num_ports; 2849 int ret; 2850 uint8_t portid; 2851 uint16_t queue_id; 2852 static pthread_t tid; 2853 2854 /* init EAL */ 2855 ret = rte_eal_init(argc, argv); 2856 if (ret < 0) 2857 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2858 argc -= ret; 2859 argv += ret; 2860 2861 /* parse app arguments */ 2862 ret = us_vhost_parse_args(argc, argv); 2863 if (ret < 0) 2864 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2865 2866 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2867 if (rte_lcore_is_enabled(lcore_id)) 2868 lcore_ids[core_id ++] = lcore_id; 2869 2870 if (rte_lcore_count() > RTE_MAX_LCORE) 2871 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2872 2873 /*set the number of swithcing cores available*/ 2874 num_switching_cores = rte_lcore_count()-1; 2875 2876 /* Get the number of physical ports. */ 2877 nb_ports = rte_eth_dev_count(); 2878 if (nb_ports > RTE_MAX_ETHPORTS) 2879 nb_ports = RTE_MAX_ETHPORTS; 2880 2881 /* 2882 * Update the global var NUM_PORTS and global array PORTS 2883 * and get value of var VALID_NUM_PORTS according to system ports number 2884 */ 2885 valid_num_ports = check_ports_num(nb_ports); 2886 2887 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2888 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2889 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2890 return -1; 2891 } 2892 2893 if (zero_copy == 0) { 2894 /* Create the mbuf pool. */ 2895 mbuf_pool = rte_mempool_create( 2896 "MBUF_POOL", 2897 NUM_MBUFS_PER_PORT 2898 * valid_num_ports, 2899 MBUF_SIZE, MBUF_CACHE_SIZE, 2900 sizeof(struct rte_pktmbuf_pool_private), 2901 rte_pktmbuf_pool_init, NULL, 2902 rte_pktmbuf_init, NULL, 2903 rte_socket_id(), 0); 2904 if (mbuf_pool == NULL) 2905 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2906 2907 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2908 vpool_array[queue_id].pool = mbuf_pool; 2909 2910 if (vm2vm_mode == VM2VM_HARDWARE) { 2911 /* Enable VT loop back to let L2 switch to do it. */ 2912 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2913 LOG_DEBUG(VHOST_CONFIG, 2914 "Enable loop back for L2 switch in vmdq.\n"); 2915 } 2916 } else { 2917 uint32_t nb_mbuf; 2918 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2919 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2920 2921 nb_mbuf = num_rx_descriptor 2922 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2923 + num_switching_cores * MAX_PKT_BURST; 2924 2925 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2926 snprintf(pool_name, sizeof(pool_name), 2927 "rxmbuf_pool_%u", queue_id); 2928 snprintf(ring_name, sizeof(ring_name), 2929 "rxmbuf_ring_%u", queue_id); 2930 setup_mempool_tbl(rte_socket_id(), queue_id, 2931 pool_name, ring_name, nb_mbuf); 2932 } 2933 2934 nb_mbuf = num_tx_descriptor 2935 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2936 + num_switching_cores * MAX_PKT_BURST; 2937 2938 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2939 snprintf(pool_name, sizeof(pool_name), 2940 "txmbuf_pool_%u", queue_id); 2941 snprintf(ring_name, sizeof(ring_name), 2942 "txmbuf_ring_%u", queue_id); 2943 setup_mempool_tbl(rte_socket_id(), 2944 (queue_id + MAX_QUEUES), 2945 pool_name, ring_name, nb_mbuf); 2946 } 2947 2948 if (vm2vm_mode == VM2VM_HARDWARE) { 2949 /* Enable VT loop back to let L2 switch to do it. 
*/ 2950 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2951 LOG_DEBUG(VHOST_CONFIG, 2952 "Enable loop back for L2 switch in vmdq.\n"); 2953 } 2954 } 2955 /* Set log level. */ 2956 rte_set_log_level(LOG_LEVEL); 2957 2958 /* initialize all ports */ 2959 for (portid = 0; portid < nb_ports; portid++) { 2960 /* skip ports that are not enabled */ 2961 if ((enabled_port_mask & (1 << portid)) == 0) { 2962 RTE_LOG(INFO, VHOST_PORT, 2963 "Skipping disabled port %d\n", portid); 2964 continue; 2965 } 2966 if (port_init(portid) != 0) 2967 rte_exit(EXIT_FAILURE, 2968 "Cannot initialize network ports\n"); 2969 } 2970 2971 /* Initialise all linked lists. */ 2972 if (init_data_ll() == -1) 2973 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2974 2975 /* Initialize device stats */ 2976 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2977 2978 /* Enable stats if the user option is set. */ 2979 if (enable_stats) 2980 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2981 2982 /* Launch all data cores. */ 2983 if (zero_copy == 0) { 2984 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 2985 rte_eal_remote_launch(switch_worker, 2986 mbuf_pool, lcore_id); 2987 } 2988 } else { 2989 uint32_t count_in_mempool, index, i; 2990 for (index = 0; index < 2*MAX_QUEUES; index++) { 2991 /* For all RX and TX queues. */ 2992 count_in_mempool 2993 = rte_mempool_count(vpool_array[index].pool); 2994 2995 /* 2996 * Transfer all un-attached mbufs from vpool.pool 2997 * to vpoo.ring. 2998 */ 2999 for (i = 0; i < count_in_mempool; i++) { 3000 struct rte_mbuf *mbuf 3001 = __rte_mbuf_raw_alloc( 3002 vpool_array[index].pool); 3003 rte_ring_sp_enqueue(vpool_array[index].ring, 3004 (void *)mbuf); 3005 } 3006 3007 LOG_DEBUG(VHOST_CONFIG, 3008 "in main: mbuf count in mempool at initial " 3009 "is: %d\n", count_in_mempool); 3010 LOG_DEBUG(VHOST_CONFIG, 3011 "in main: mbuf count in ring at initial is :" 3012 " %d\n", 3013 rte_ring_count(vpool_array[index].ring)); 3014 } 3015 3016 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3017 rte_eal_remote_launch(switch_worker_zcp, NULL, 3018 lcore_id); 3019 } 3020 3021 if (mergeable == 0) 3022 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3023 3024 /* Register CUSE device to handle IOCTLs. */ 3025 ret = rte_vhost_driver_register((char *)&dev_basename); 3026 if (ret != 0) 3027 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3028 3029 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3030 3031 /* Start CUSE session. */ 3032 rte_vhost_driver_session_start(); 3033 return 0; 3034 3035 } 3036 3037