/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 256

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores*MAX_PKT_BURST) + \
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
				(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * For the zero copy implementation no frame data buffers need to be
 * allocated by the host: the guest allocates the frame data buffer and
 * vhost uses it directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2
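/*
 * Illustrative only (not part of the original sample): the sizing macros
 * above are meant to feed a pktmbuf mempool created at runtime, e.g. in
 * main(). A minimal sketch, assuming one valid port and the classic
 * rte_mempool_create() API of this DPDK generation; "valid_num_ports" is
 * a placeholder name:
 *
 *	mbuf_pool = rte_mempool_create("MBUF_POOL",
 *			NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_SIZE,
 *			MBUF_CACHE_SIZE, sizeof(struct rte_pktmbuf_pool_private),
 *			rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, NULL,
 *			rte_socket_id(), 0);
 *
 * The actual pool creation lives outside this excerpt.
 */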
/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then round to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32	/* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64	/* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* number of descriptors to use */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is necessary for 1G NICs such as the I350; it fixes
		 * a bug where IPv4 forwarding in the guest could not forward
		 * packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN	4
#define VLAN_ETH_HLEN	18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}
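/*
 * Example added for clarity (not part of the original sample):
 * parse_portmask() below interprets its argument as a hexadecimal bitmask,
 * so "-p 0x1" enables port 0 only and "-p f" enables ports 0-3; a zero or
 * unparsable mask is rejected with -1.
 */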
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Effective only if retries on RX are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Effective only if retries on RX are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n",
	       prgname);
}
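/*
 * Illustrative invocation (not part of the original sample), combining the
 * options documented in us_vhost_usage() above; the binary name and EAL
 * arguments shown are assumptions:
 *
 *	./vhost-switch -c 0x3 -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 \
 *		--mergeable 0 --stats 2 --dev-basename vhost-net
 */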
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for rx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for tx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var num_ports and array ports according to the number
 * of system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
					dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				          &dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
							dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
					&dev_statistics[tdev->device_fh].rx_total_atomic,
					1);
					rte_atomic64_add(
					&dev_statistics[tdev->device_fh].rx_atomic,
					ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: Source and destination"
				" MAC addresses are the same. Dropping "
				"packet.\n",
				dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW VLAN strip shortens the packet by the length of
			 * the VLAN tag, so restore the packet length by
			 * adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
			offset > rte_pktmbuf_tailroom(m)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	m->ol_flags = PKT_TX_VLAN_PKT;

	m->data_len += offset;
	m->pkt_len += offset;

	m->vlan_tci = vlan_tag;

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}
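/*
 * Worked example added for clarity (not part of the original sample) for the
 * TX drain interval computed in switch_worker() below:
 *
 *	drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
 *			* BURST_TX_DRAIN_US;
 *
 * With an assumed 2 GHz TSC this is roughly 2000 cycles per microsecond,
 * i.e. about 200,000 cycles for BURST_TX_DRAIN_US = 100, so a partially
 * filled TX burst is flushed roughly every 100 us.
 */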
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* TX any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
							   (struct rte_mbuf **)tx_q->m_table,
							   (uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait
					 * and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
						rx_count);
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count--)
							rte_pktmbuf_free(pkts_burst[tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
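/*
 * Note added for clarity (not in the original sample): the avail/used index
 * arithmetic in the zero copy helpers relies on uint16_t wrap-around. For
 * example, with avail->idx == 3 and last_used_idx_res == 65533 the
 * difference (uint16_t)(3 - 65533) == 6 still gives the correct number of
 * new entries, provided the ring size is a power of two no larger than 65536.
 */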
/*
 * This function gets available ring indexes for zero copy RX; it will retry
 * up to 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back on the used ring.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, then attaches them together. The offsets
 * for buff_addr and phys_addr must be adjusted according to the PMD
 * implementation, otherwise the frame data may be placed at the wrong
 * location in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address crossing a "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}

/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed. It fetches
 * mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function updates the used ring and the used index for packets
 * received on the zero copy RX path.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a VLAN tag that belongs to any VM for an
		 * external packet (such as vlan_tags[dev->device_fh]);
		 * otherwise it conflicts with pool selection: the MAC address
		 * says the packet is external and should go to the network,
		 * while the VLAN tag says it is a VM2VM packet that should be
		 * forwarded to another VM. The hardware cannot resolve such
		 * an ambiguity, so the packet would be lost.
		 */
1775 */ 1776 vlan_tag = external_pkt_default_vlan_tag; 1777 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1778 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1779 __rte_mbuf_raw_free(mbuf); 1780 return; 1781 } 1782 } 1783 1784 mbuf->nb_segs = m->nb_segs; 1785 mbuf->next = m->next; 1786 mbuf->data_len = m->data_len + offset; 1787 mbuf->pkt_len = mbuf->data_len; 1788 if (unlikely(need_copy)) { 1789 /* Copy the packet contents to the mbuf. */ 1790 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1791 rte_pktmbuf_mtod(m, void *), 1792 m->data_len); 1793 } else { 1794 mbuf->data_off = m->data_off; 1795 mbuf->buf_physaddr = m->buf_physaddr; 1796 mbuf->buf_addr = m->buf_addr; 1797 } 1798 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1799 mbuf->vlan_tci = vlan_tag; 1800 mbuf->l2_len = sizeof(struct ether_hdr); 1801 mbuf->l3_len = sizeof(struct ipv4_hdr); 1802 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1803 1804 tx_q->m_table[len] = mbuf; 1805 len++; 1806 1807 LOG_DEBUG(VHOST_DATA, 1808 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1809 dev->device_fh, 1810 mbuf->nb_segs, 1811 (mbuf->next == NULL) ? "null" : "non-null"); 1812 1813 if (enable_stats) { 1814 dev_statistics[dev->device_fh].tx_total++; 1815 dev_statistics[dev->device_fh].tx++; 1816 } 1817 1818 if (unlikely(len == MAX_PKT_BURST)) { 1819 m_table = (struct rte_mbuf **)tx_q->m_table; 1820 ret = rte_eth_tx_burst(ports[0], 1821 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1822 1823 /* 1824 * Free any buffers not handled by TX and update 1825 * the port stats. 1826 */ 1827 if (unlikely(ret < len)) { 1828 do { 1829 rte_pktmbuf_free(m_table[ret]); 1830 } while (++ret < len); 1831 } 1832 1833 len = 0; 1834 txmbuf_clean_zcp(dev, vpool); 1835 } 1836 1837 tx_q->len = len; 1838 1839 return; 1840 } 1841 1842 /* 1843 * This function TX all available packets in virtio TX queue for one 1844 * virtio-net device. If it is first packet, it learns MAC address and 1845 * setup VMDQ. 1846 */ 1847 static inline void __attribute__((always_inline)) 1848 virtio_dev_tx_zcp(struct virtio_net *dev) 1849 { 1850 struct rte_mbuf m; 1851 struct vhost_virtqueue *vq; 1852 struct vring_desc *desc; 1853 uint64_t buff_addr = 0, phys_addr; 1854 uint32_t head[MAX_PKT_BURST]; 1855 uint32_t i; 1856 uint16_t free_entries, packet_success = 0; 1857 uint16_t avail_idx; 1858 uint8_t need_copy = 0; 1859 hpa_type addr_type; 1860 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1861 1862 vq = dev->virtqueue[VIRTIO_TXQ]; 1863 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1864 1865 /* If there are no available buffers then return. */ 1866 if (vq->last_used_idx_res == avail_idx) 1867 return; 1868 1869 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1870 1871 /* Prefetch available ring to retrieve head indexes. */ 1872 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1873 1874 /* Get the number of free entries in the ring */ 1875 free_entries = (avail_idx - vq->last_used_idx_res); 1876 1877 /* Limit to MAX_PKT_BURST. */ 1878 free_entries 1879 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1880 1881 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1882 dev->device_fh, free_entries); 1883 1884 /* Retrieve all of the head indexes first to avoid caching issues. */ 1885 for (i = 0; i < free_entries; i++) 1886 head[i] 1887 = vq->avail->ring[(vq->last_used_idx_res + i) 1888 & (vq->size - 1)]; 1889 1890 vq->last_used_idx_res += free_entries; 1891 1892 /* Prefetch descriptor index. 
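 *
 * Note on the index arithmetic used above and below (a worked example,
 * assuming vq->size is a power of two as the masking requires): with
 * vq->size == 256 and last_used_idx_res == 300, the avail ring slot is
 * 300 & (256 - 1) == 44, i.e. the 16-bit indices run freely and are only
 * masked when the ring is actually dereferenced.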
*/ 1893 rte_prefetch0(&vq->desc[head[packet_success]]); 1894 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1895 1896 while (packet_success < free_entries) { 1897 desc = &vq->desc[head[packet_success]]; 1898 1899 /* Discard first buffer as it is the virtio header */ 1900 desc = &vq->desc[desc->next]; 1901 1902 /* Buffer address translation. */ 1903 buff_addr = gpa_to_vva(dev, desc->addr); 1904 /* Need to check an extra VLAN_HLEN bytes for inserting the VLAN tag. */ 1905 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1906 &addr_type); 1907 1908 if (likely(packet_success < (free_entries - 1))) 1909 /* Prefetch descriptor index. */ 1910 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1911 1912 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1913 RTE_LOG(ERR, VHOST_DATA, 1914 "(%"PRIu64") Invalid frame buffer address found " 1915 "when transmitting packets!\n", 1916 dev->device_fh); 1917 packet_success++; 1918 continue; 1919 } 1920 1921 /* Prefetch buffer address. */ 1922 rte_prefetch0((void *)(uintptr_t)buff_addr); 1923 1924 /* 1925 * Set up a dummy mbuf. This is copied to a real mbuf if it is 1926 * transmitted out of the physical port. 1927 */ 1928 m.data_len = desc->len; 1929 m.nb_segs = 1; 1930 m.next = NULL; 1931 m.data_off = 0; 1932 m.buf_addr = (void *)(uintptr_t)buff_addr; 1933 m.buf_physaddr = phys_addr; 1934 1935 /* 1936 * Check whether the frame buffer address from the guest crosses 1937 * a sub-region or not. 1938 */ 1939 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1940 RTE_LOG(ERR, VHOST_DATA, 1941 "(%"PRIu64") Frame buffer address crossing a " 1942 "sub-region found when attaching the TX frame " 1943 "buffer address!\n", 1944 dev->device_fh); 1945 need_copy = 1; 1946 } else 1947 need_copy = 0; 1948 1949 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1950 1951 /* 1952 * If this is the first received packet we need to learn 1953 * the MAC address and set up VMDQ. 1954 */ 1955 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 1956 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 1957 /* 1958 * Discard the frame if the device is scheduled for 1959 * removal or a duplicate MAC address is found. 1960 */ 1961 packet_success += free_entries; 1962 vq->last_used_idx += packet_success; 1963 break; 1964 } 1965 } 1966 1967 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 1968 packet_success++; 1969 } 1970 } 1971 1972 /* 1973 * This function is called by each data core. It handles all RX/TX registered 1974 * with the core. For TX the specific lcore linked list is used. For RX, MAC 1975 addresses are compared with all devices in the main linked list.
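 *
 * Per-iteration outline of the loop below (summary only):
 *   1. if BURST_TX_DRAIN_US has elapsed, flush every non-empty
 *      tx_queue_zcp[] with rte_eth_tx_burst() and clean it with
 *      txmbuf_clean_zcp();
 *   2. acknowledge a pending device-removal request;
 *   3. for each device: attach guest buffers, rte_eth_rx_burst() from the
 *      VMDQ queue, pass packets up with virtio_dev_rx_zcp(), then drain
 *      the guest TX ring with virtio_dev_tx_zcp().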
1976 */ 1977 static int 1978 switch_worker_zcp(__attribute__((unused)) void *arg) 1979 { 1980 struct virtio_net *dev = NULL; 1981 struct vhost_dev *vdev = NULL; 1982 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1983 struct virtio_net_data_ll *dev_ll; 1984 struct mbuf_table *tx_q; 1985 volatile struct lcore_ll_info *lcore_ll; 1986 const uint64_t drain_tsc 1987 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 1988 * BURST_TX_DRAIN_US; 1989 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1990 unsigned ret; 1991 const uint16_t lcore_id = rte_lcore_id(); 1992 uint16_t count_in_ring, rx_count = 0; 1993 1994 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1995 1996 lcore_ll = lcore_info[lcore_id].lcore_ll; 1997 prev_tsc = 0; 1998 1999 while (1) { 2000 cur_tsc = rte_rdtsc(); 2001 2002 /* TX burst queue drain */ 2003 diff_tsc = cur_tsc - prev_tsc; 2004 if (unlikely(diff_tsc > drain_tsc)) { 2005 /* 2006 * Get mbuf from vpool.pool and detach mbuf and 2007 * put back into vpool.ring. 2008 */ 2009 dev_ll = lcore_ll->ll_root_used; 2010 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2011 /* Get virtio device ID */ 2012 vdev = dev_ll->vdev; 2013 dev = vdev->dev; 2014 2015 if (likely(!vdev->remove)) { 2016 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2017 if (tx_q->len) { 2018 LOG_DEBUG(VHOST_DATA, 2019 "TX queue drained after timeout" 2020 " with burst size %u\n", 2021 tx_q->len); 2022 2023 /* 2024 * Tx any packets in the queue 2025 */ 2026 ret = rte_eth_tx_burst( 2027 ports[0], 2028 (uint16_t)tx_q->txq_id, 2029 (struct rte_mbuf **) 2030 tx_q->m_table, 2031 (uint16_t)tx_q->len); 2032 if (unlikely(ret < tx_q->len)) { 2033 do { 2034 rte_pktmbuf_free( 2035 tx_q->m_table[ret]); 2036 } while (++ret < tx_q->len); 2037 } 2038 tx_q->len = 0; 2039 2040 txmbuf_clean_zcp(dev, 2041 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2042 } 2043 } 2044 dev_ll = dev_ll->next; 2045 } 2046 prev_tsc = cur_tsc; 2047 } 2048 2049 rte_prefetch0(lcore_ll->ll_root_used); 2050 2051 /* 2052 * Inform the configuration core that we have exited the linked 2053 * list and that no devices are in use if requested. 2054 */ 2055 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2056 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2057 2058 /* Process devices */ 2059 dev_ll = lcore_ll->ll_root_used; 2060 2061 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2062 vdev = dev_ll->vdev; 2063 dev = vdev->dev; 2064 if (unlikely(vdev->remove)) { 2065 dev_ll = dev_ll->next; 2066 unlink_vmdq(vdev); 2067 vdev->ready = DEVICE_SAFE_REMOVE; 2068 continue; 2069 } 2070 2071 if (likely(vdev->ready == DEVICE_RX)) { 2072 uint32_t index = vdev->vmdq_rx_q; 2073 uint16_t i; 2074 count_in_ring 2075 = rte_ring_count(vpool_array[index].ring); 2076 uint16_t free_entries 2077 = (uint16_t)get_available_ring_num_zcp(dev); 2078 2079 /* 2080 * Attach all mbufs in vpool.ring and put back 2081 * into vpool.pool. 
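 *
 * The number of buffers attached per pass is bounded exactly as in the
 * loop just below, i.e. RTE_MIN(free_entries, RTE_MIN(count_in_ring,
 * MAX_PKT_BURST)), so a single device never takes more than one burst
 * worth of mbufs from its vpool ring in one iteration.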
2082 */ 2083 for (i = 0; 2084 i < RTE_MIN(free_entries, 2085 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2086 i++) 2087 attach_rxmbuf_zcp(dev); 2088 2089 /* Handle guest RX */ 2090 rx_count = rte_eth_rx_burst(ports[0], 2091 vdev->vmdq_rx_q, pkts_burst, 2092 MAX_PKT_BURST); 2093 2094 if (rx_count) { 2095 ret_count = virtio_dev_rx_zcp(dev, 2096 pkts_burst, rx_count); 2097 if (enable_stats) { 2098 dev_statistics[dev->device_fh].rx_total 2099 += rx_count; 2100 dev_statistics[dev->device_fh].rx 2101 += ret_count; 2102 } 2103 while (likely(rx_count)) { 2104 rx_count--; 2105 pktmbuf_detach_zcp( 2106 pkts_burst[rx_count]); 2107 rte_ring_sp_enqueue( 2108 vpool_array[index].ring, 2109 (void *)pkts_burst[rx_count]); 2110 } 2111 } 2112 } 2113 2114 if (likely(!vdev->remove)) 2115 /* Handle guest TX */ 2116 virtio_dev_tx_zcp(dev); 2117 2118 /* Move to the next device in the list */ 2119 dev_ll = dev_ll->next; 2120 } 2121 } 2122 2123 return 0; 2124 } 2125 2126 2127 /* 2128 * Add an entry to a used linked list. A free entry must first be found 2129 * in the free linked list using get_data_ll_free_entry(); 2130 */ 2131 static void 2132 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2133 struct virtio_net_data_ll *ll_dev) 2134 { 2135 struct virtio_net_data_ll *ll = *ll_root_addr; 2136 2137 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2138 ll_dev->next = NULL; 2139 rte_compiler_barrier(); 2140 2141 /* If ll == NULL then this is the first device. */ 2142 if (ll) { 2143 /* Increment to the tail of the linked list. */ 2144 while ((ll->next != NULL) ) 2145 ll = ll->next; 2146 2147 ll->next = ll_dev; 2148 } else { 2149 *ll_root_addr = ll_dev; 2150 } 2151 } 2152 2153 /* 2154 * Remove an entry from a used linked list. The entry must then be added to 2155 * the free linked list using put_data_ll_free_entry(). 2156 */ 2157 static void 2158 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2159 struct virtio_net_data_ll *ll_dev, 2160 struct virtio_net_data_ll *ll_dev_last) 2161 { 2162 struct virtio_net_data_ll *ll = *ll_root_addr; 2163 2164 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2165 return; 2166 2167 if (ll_dev == ll) 2168 *ll_root_addr = ll_dev->next; 2169 else 2170 if (likely(ll_dev_last != NULL)) 2171 ll_dev_last->next = ll_dev->next; 2172 else 2173 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2174 } 2175 2176 /* 2177 * Find and return an entry from the free linked list. 2178 */ 2179 static struct virtio_net_data_ll * 2180 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2181 { 2182 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2183 struct virtio_net_data_ll *ll_dev; 2184 2185 if (ll_free == NULL) 2186 return NULL; 2187 2188 ll_dev = ll_free; 2189 *ll_root_addr = ll_free->next; 2190 2191 return ll_dev; 2192 } 2193 2194 /* 2195 * Place an entry back on to the free linked list. 2196 */ 2197 static void 2198 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2199 struct virtio_net_data_ll *ll_dev) 2200 { 2201 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2202 2203 if (ll_dev == NULL) 2204 return; 2205 2206 ll_dev->next = ll_free; 2207 *ll_root_addr = ll_dev; 2208 } 2209 2210 /* 2211 * Creates a linked list of a given size. 2212 */ 2213 static struct virtio_net_data_ll * 2214 alloc_data_ll(uint32_t size) 2215 { 2216 struct virtio_net_data_ll *ll_new; 2217 uint32_t i; 2218 2219 /* Malloc and then chain the linked list. 
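 *
 * Resulting layout (sketch): a single malloc()ed array whose entries are
 * pre-linked into a free list,
 *
 *   ll_new[0].next -> ll_new[1] -> ... -> ll_new[size-1].next == NULL
 *
 * so get_data_ll_free_entry()/put_data_ll_free_entry() never allocate.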
*/ 2220 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2221 if (ll_new == NULL) { 2222 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2223 return NULL; 2224 } 2225 2226 for (i = 0; i < size - 1; i++) { 2227 ll_new[i].vdev = NULL; 2228 ll_new[i].next = &ll_new[i+1]; 2229 } 2230 ll_new[i].next = NULL; 2231 2232 return (ll_new); 2233 } 2234 2235 /* 2236 * Create the main linked list along with each individual core's linked list. A used and a free list 2237 * are created to manage entries. 2238 */ 2239 static int 2240 init_data_ll (void) 2241 { 2242 int lcore; 2243 2244 RTE_LCORE_FOREACH_SLAVE(lcore) { 2245 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2246 if (lcore_info[lcore].lcore_ll == NULL) { 2247 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2248 return -1; 2249 } 2250 2251 lcore_info[lcore].lcore_ll->device_num = 0; 2252 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2253 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2254 if (num_devices % num_switching_cores) 2255 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2256 else 2257 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2258 } 2259 2260 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2261 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2262 2263 return 0; 2264 } 2265 2266 /* 2267 * Remove a device from the specific data core linked list and from the main linked list. Synchronization 2268 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering 2269 * of dev->remove=1, which could cause an infinite loop in the rte_pause loop. 2270 */ 2271 static void 2272 destroy_device (volatile struct virtio_net *dev) 2273 { 2274 struct virtio_net_data_ll *ll_lcore_dev_cur; 2275 struct virtio_net_data_ll *ll_main_dev_cur; 2276 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2277 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2278 struct vhost_dev *vdev; 2279 int lcore; 2280 2281 dev->flags &= ~VIRTIO_DEV_RUNNING; 2282 2283 vdev = (struct vhost_dev *)dev->priv; 2284 /* Set the remove flag. */ 2285 vdev->remove = 1; 2286 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2287 rte_pause(); 2288 } 2289 2290 /* Search for entry to be removed from lcore ll */ 2291 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2292 while (ll_lcore_dev_cur != NULL) { 2293 if (ll_lcore_dev_cur->vdev == vdev) { 2294 break; 2295 } else { 2296 ll_lcore_dev_last = ll_lcore_dev_cur; 2297 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2298 } 2299 } 2300 2301 if (ll_lcore_dev_cur == NULL) { 2302 RTE_LOG(ERR, VHOST_CONFIG, 2303 "(%"PRIu64") Failed to find the dev to be destroyed.\n", 2304 dev->device_fh); 2305 return; 2306 } 2307 2308 /* Search for entry to be removed from main ll */ 2309 ll_main_dev_cur = ll_root_used; 2310 ll_main_dev_last = NULL; 2311 while (ll_main_dev_cur != NULL) { 2312 if (ll_main_dev_cur->vdev == vdev) { 2313 break; 2314 } else { 2315 ll_main_dev_last = ll_main_dev_cur; 2316 ll_main_dev_cur = ll_main_dev_cur->next; 2317 } 2318 } 2319 2320 /* Remove entries from the lcore and main ll. */ 2321 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2322 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2323 2324 /* Set the dev_removal_flag on each lcore.
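 *
 * Removal handshake (sketch of the two sides): this core writes
 * REQUEST_DEV_REMOVAL below and then spins until every data core, at the
 * top of its switch_worker_zcp() loop, has observed the flag outside the
 * linked list and written ACK_DEV_REMOVAL back; only then are the entries
 * returned to the free lists.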
*/ 2325 RTE_LCORE_FOREACH_SLAVE(lcore) { 2326 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2327 } 2328 2329 /* 2330 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2331 * they can no longer access the device removed from the linked lists and that the devices 2332 * are no longer in use. 2333 */ 2334 RTE_LCORE_FOREACH_SLAVE(lcore) { 2335 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2336 rte_pause(); 2337 } 2338 } 2339 2340 /* Add the entries back to the lcore and main free ll.*/ 2341 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2342 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2343 2344 /* Decrement the number of devices on the lcore. */ 2345 lcore_info[vdev->coreid].lcore_ll->device_num--; 2346 2347 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2348 2349 if (zero_copy) { 2350 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2351 2352 /* Stop the RX queue. */ 2353 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2354 LOG_DEBUG(VHOST_CONFIG, 2355 "(%"PRIu64") In destroy_device: Failed to stop " 2356 "rx queue:%d\n", 2357 dev->device_fh, 2358 vdev->vmdq_rx_q); 2359 } 2360 2361 LOG_DEBUG(VHOST_CONFIG, 2362 "(%"PRIu64") in destroy_device: Start putting mbufs from " 2363 "the mempool back into the ring for RX queue: %d\n", 2364 dev->device_fh, vdev->vmdq_rx_q); 2365 2366 mbuf_destroy_zcp(vpool); 2367 2368 /* Stop the TX queue. */ 2369 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2370 LOG_DEBUG(VHOST_CONFIG, 2371 "(%"PRIu64") In destroy_device: Failed to " 2372 "stop tx queue:%d\n", 2373 dev->device_fh, vdev->vmdq_rx_q); 2374 } 2375 2376 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2377 2378 LOG_DEBUG(VHOST_CONFIG, 2379 "(%"PRIu64") destroy_device: Start putting mbufs from the " 2380 "mempool back into the ring for TX queue: %d, dev:(%"PRIu64")\n", 2381 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2382 dev->device_fh); 2383 2384 mbuf_destroy_zcp(vpool); 2385 rte_free(vdev->regions_hpa); 2386 } 2387 rte_free(vdev); 2388 2389 } 2390 2391 /* 2392 * Calculate the number of physically contiguous sub-regions for one particular 2393 * region whose vhost virtual address range is contiguous. The particular region 2394 starts at vva_start, with a size of 'size' bytes.
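 *
 * Worked example (illustrative numbers only, 4 KB pages assumed): if the
 * three consecutive virtual pages starting at vva_start map to physical
 * pages 0x1000, 0x2000 and 0x8000, the walk below finds one discontinuity
 * (0x2000 + 0x1000 != 0x8000) and therefore reports one extra region.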
2395 */ 2396 static uint32_t 2397 check_hpa_regions(uint64_t vva_start, uint64_t size) 2398 { 2399 uint32_t i, nregions = 0, page_size = getpagesize(); 2400 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2401 if (vva_start % page_size) { 2402 LOG_DEBUG(VHOST_CONFIG, 2403 "in check_continuous: vva start(%p) mod page_size(%d) " 2404 "has remainder\n", 2405 (void *)(uintptr_t)vva_start, page_size); 2406 return 0; 2407 } 2408 if (size % page_size) { 2409 LOG_DEBUG(VHOST_CONFIG, 2410 "in check_continuous: " 2411 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2412 size, page_size); 2413 return 0; 2414 } 2415 for (i = 0; i < size - page_size; i = i + page_size) { 2416 cur_phys_addr 2417 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2418 next_phys_addr = rte_mem_virt2phy( 2419 (void *)(uintptr_t)(vva_start + i + page_size)); 2420 if ((cur_phys_addr + page_size) != next_phys_addr) { 2421 ++nregions; 2422 LOG_DEBUG(VHOST_CONFIG, 2423 "in check_continuous: hva addr:(%p) is not " 2424 "continuous with hva addr:(%p), diff:%d\n", 2425 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2426 (void *)(uintptr_t)(vva_start + (uint64_t)i 2427 + page_size), page_size); 2428 LOG_DEBUG(VHOST_CONFIG, 2429 "in check_continuous: hpa addr:(%p) is not " 2430 "continuous with hpa addr:(%p), " 2431 "diff:(%"PRIu64")\n", 2432 (void *)(uintptr_t)cur_phys_addr, 2433 (void *)(uintptr_t)next_phys_addr, 2434 (next_phys_addr-cur_phys_addr)); 2435 } 2436 } 2437 return nregions; 2438 } 2439 2440 /* 2441 * Divide each region whose vhost virtual address range is contiguous into 2442 * sub-regions within which the physical addresses are also contiguous, and 2443 * fill the offset (to GPA), size and other information of each 2444 * sub-region into regions_hpa. 2445 */ 2446 static uint32_t 2447 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2448 { 2449 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2450 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2451 2452 if (mem_region_hpa == NULL) 2453 return 0; 2454 2455 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2456 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2457 virtio_memory->regions[regionidx].address_offset; 2458 mem_region_hpa[regionidx_hpa].guest_phys_address 2459 = virtio_memory->regions[regionidx].guest_phys_address; 2460 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2461 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2462 mem_region_hpa[regionidx_hpa].guest_phys_address; 2463 LOG_DEBUG(VHOST_CONFIG, 2464 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2465 regionidx_hpa, 2466 (void *)(uintptr_t) 2467 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2468 LOG_DEBUG(VHOST_CONFIG, 2469 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2470 regionidx_hpa, 2471 (void *)(uintptr_t) 2472 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2473 for (i = 0, k = 0; 2474 i < virtio_memory->regions[regionidx].memory_size - 2475 page_size; 2476 i += page_size) { 2477 cur_phys_addr = rte_mem_virt2phy( 2478 (void *)(uintptr_t)(vva_start + i)); 2479 next_phys_addr = rte_mem_virt2phy( 2480 (void *)(uintptr_t)(vva_start + 2481 i + page_size)); 2482 if ((cur_phys_addr + page_size) != next_phys_addr) { 2483 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2484 mem_region_hpa[regionidx_hpa].guest_phys_address + 2485 k + page_size; 2486 mem_region_hpa[regionidx_hpa].memory_size 2487 = k +
page_size; 2488 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2489 "phys addr end [%d]:(%p)\n", 2490 regionidx_hpa, 2491 (void *)(uintptr_t) 2492 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2493 LOG_DEBUG(VHOST_CONFIG, 2494 "in fill_hpa_regions: guest phys addr " 2495 "size [%d]:(%p)\n", 2496 regionidx_hpa, 2497 (void *)(uintptr_t) 2498 (mem_region_hpa[regionidx_hpa].memory_size)); 2499 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2500 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2501 ++regionidx_hpa; 2502 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2503 next_phys_addr - 2504 mem_region_hpa[regionidx_hpa].guest_phys_address; 2505 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2506 " phys addr start[%d]:(%p)\n", 2507 regionidx_hpa, 2508 (void *)(uintptr_t) 2509 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2510 LOG_DEBUG(VHOST_CONFIG, 2511 "in fill_hpa_regions: host phys addr " 2512 "start[%d]:(%p)\n", 2513 regionidx_hpa, 2514 (void *)(uintptr_t) 2515 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2516 k = 0; 2517 } else { 2518 k += page_size; 2519 } 2520 } 2521 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2522 = mem_region_hpa[regionidx_hpa].guest_phys_address 2523 + k + page_size; 2524 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2525 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2526 "[%d]:(%p)\n", regionidx_hpa, 2527 (void *)(uintptr_t) 2528 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2529 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2530 "[%d]:(%p)\n", regionidx_hpa, 2531 (void *)(uintptr_t) 2532 (mem_region_hpa[regionidx_hpa].memory_size)); 2533 ++regionidx_hpa; 2534 } 2535 return regionidx_hpa; 2536 } 2537 2538 /* 2539 * A new device is added to a data core. First the device is added to the main linked list 2540 and then allocated to a specific data core.
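 *
 * Outline of the steps below (summary only): allocate the vhost_dev
 * wrapper, pre-compute the host-physical sub-regions when zero copy is
 * enabled, take an entry from ll_root_free, derive the VMDQ queue as
 * dev->device_fh * queues_per_pool + vmdq_queue_base, and finally hand
 * the device to the least loaded data core.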
2541 */ 2542 static int 2543 new_device (struct virtio_net *dev) 2544 { 2545 struct virtio_net_data_ll *ll_dev; 2546 int lcore, core_add = 0; 2547 uint32_t device_num_min = num_devices; 2548 struct vhost_dev *vdev; 2549 uint32_t regionidx; 2550 2551 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2552 if (vdev == NULL) { 2553 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2554 dev->device_fh); 2555 return -1; 2556 } 2557 vdev->dev = dev; 2558 dev->priv = vdev; 2559 2560 if (zero_copy) { 2561 vdev->nregions_hpa = dev->mem->nregions; 2562 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2563 vdev->nregions_hpa 2564 += check_hpa_regions( 2565 dev->mem->regions[regionidx].guest_phys_address 2566 + dev->mem->regions[regionidx].address_offset, 2567 dev->mem->regions[regionidx].memory_size); 2568 2569 } 2570 2571 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2572 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2573 RTE_CACHE_LINE_SIZE); 2574 if (vdev->regions_hpa == NULL) { 2575 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2576 rte_free(vdev); 2577 return -1; 2578 } 2579 2580 2581 if (fill_hpa_memory_regions( 2582 vdev->regions_hpa, dev->mem 2583 ) != vdev->nregions_hpa) { 2584 2585 RTE_LOG(ERR, VHOST_CONFIG, 2586 "hpa memory regions number mismatch: " 2587 "[%d]\n", vdev->nregions_hpa); 2588 rte_free(vdev->regions_hpa); 2589 rte_free(vdev); 2590 return -1; 2591 } 2592 } 2593 2594 2595 /* Add device to main ll */ 2596 ll_dev = get_data_ll_free_entry(&ll_root_free); 2597 if (ll_dev == NULL) { 2598 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2599 "of %d devices per core has been reached\n", 2600 dev->device_fh, num_devices); 2601 if (vdev->regions_hpa) 2602 rte_free(vdev->regions_hpa); 2603 rte_free(vdev); 2604 return -1; 2605 } 2606 ll_dev->vdev = vdev; 2607 add_data_ll_entry(&ll_root_used, ll_dev); 2608 vdev->vmdq_rx_q 2609 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2610 2611 if (zero_copy) { 2612 uint32_t index = vdev->vmdq_rx_q; 2613 uint32_t count_in_ring, i; 2614 struct mbuf_table *tx_q; 2615 2616 count_in_ring = rte_ring_count(vpool_array[index].ring); 2617 2618 LOG_DEBUG(VHOST_CONFIG, 2619 "(%"PRIu64") in new_device: mbuf count in mempool " 2620 "before attach is: %d\n", 2621 dev->device_fh, 2622 rte_mempool_count(vpool_array[index].pool)); 2623 LOG_DEBUG(VHOST_CONFIG, 2624 "(%"PRIu64") in new_device: mbuf count in ring " 2625 "before attach is : %d\n", 2626 dev->device_fh, count_in_ring); 2627 2628 /* 2629 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
2630 */ 2631 for (i = 0; i < count_in_ring; i++) 2632 attach_rxmbuf_zcp(dev); 2633 2634 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2635 "mempool after attach is: %d\n", 2636 dev->device_fh, 2637 rte_mempool_count(vpool_array[index].pool)); 2638 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2639 "ring after attach is : %d\n", 2640 dev->device_fh, 2641 rte_ring_count(vpool_array[index].ring)); 2642 2643 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2644 tx_q->txq_id = vdev->vmdq_rx_q; 2645 2646 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2647 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2648 2649 LOG_DEBUG(VHOST_CONFIG, 2650 "(%"PRIu64") In new_device: Failed to start " 2651 "tx queue:%d\n", 2652 dev->device_fh, vdev->vmdq_rx_q); 2653 2654 mbuf_destroy_zcp(vpool); 2655 rte_free(vdev->regions_hpa); 2656 rte_free(vdev); 2657 return -1; 2658 } 2659 2660 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2661 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2662 2663 LOG_DEBUG(VHOST_CONFIG, 2664 "(%"PRIu64") In new_device: Failed to start " 2665 "rx queue:%d\n", 2666 dev->device_fh, vdev->vmdq_rx_q); 2667 2668 /* Stop the TX queue. */ 2669 if (rte_eth_dev_tx_queue_stop(ports[0], 2670 vdev->vmdq_rx_q) != 0) { 2671 LOG_DEBUG(VHOST_CONFIG, 2672 "(%"PRIu64") In new_device: Failed to " 2673 "stop tx queue:%d\n", 2674 dev->device_fh, vdev->vmdq_rx_q); 2675 } 2676 2677 mbuf_destroy_zcp(vpool); 2678 rte_free(vdev->regions_hpa); 2679 rte_free(vdev); 2680 return -1; 2681 } 2682 2683 } 2684 2685 /* Reset the ready flag. */ 2686 vdev->ready = DEVICE_MAC_LEARNING; 2687 vdev->remove = 0; 2688 2689 /* Find a suitable lcore to add the device. */ 2690 RTE_LCORE_FOREACH_SLAVE(lcore) { 2691 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2692 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2693 core_add = lcore; 2694 } 2695 } 2696 /* Add device to lcore ll */ 2697 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2698 if (ll_dev == NULL) { 2699 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2700 vdev->ready = DEVICE_SAFE_REMOVE; 2701 destroy_device(dev); 2702 if (vdev->regions_hpa) 2703 rte_free(vdev->regions_hpa); 2704 rte_free(vdev); 2705 return -1; 2706 } 2707 ll_dev->vdev = vdev; 2708 vdev->coreid = core_add; 2709 2710 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2711 2712 /* Initialize device stats */ 2713 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2714 2715 /* Disable notifications. */ 2716 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2717 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2718 lcore_info[vdev->coreid].lcore_ll->device_num++; 2719 dev->flags |= VIRTIO_DEV_RUNNING; 2720 2721 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2722 2723 return 0; 2724 } 2725 2726 /* 2727 * These callbacks allow devices to be added to the data core when configuration 2728 * has fully completed. 2729 */ 2730 static const struct virtio_net_device_ops virtio_net_device_ops = 2731 { 2732 .new_device = new_device, 2733 .destroy_device = destroy_device, 2734 }; 2735 2736 /* 2737 * This thread wakes up after a period to print stats if the user has 2738 enabled them.
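 *
 * The figures printed below are derived per device as
 *   tx_dropped = tx_total - tx   and   rx_dropped = rx_total - rx,
 * with the RX counters read through rte_atomic64_read() in the
 * non-zero-copy case and as plain fields when zero copy is enabled.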
2739 */ 2740 static void 2741 print_stats(void) 2742 { 2743 struct virtio_net_data_ll *dev_ll; 2744 uint64_t tx_dropped, rx_dropped; 2745 uint64_t tx, tx_total, rx, rx_total; 2746 uint32_t device_fh; 2747 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2748 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2749 2750 while(1) { 2751 sleep(enable_stats); 2752 2753 /* Clear screen and move to top left */ 2754 printf("%s%s", clr, top_left); 2755 2756 printf("\nDevice statistics ===================================="); 2757 2758 dev_ll = ll_root_used; 2759 while (dev_ll != NULL) { 2760 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2761 tx_total = dev_statistics[device_fh].tx_total; 2762 tx = dev_statistics[device_fh].tx; 2763 tx_dropped = tx_total - tx; 2764 if (zero_copy == 0) { 2765 rx_total = rte_atomic64_read( 2766 &dev_statistics[device_fh].rx_total_atomic); 2767 rx = rte_atomic64_read( 2768 &dev_statistics[device_fh].rx_atomic); 2769 } else { 2770 rx_total = dev_statistics[device_fh].rx_total; 2771 rx = dev_statistics[device_fh].rx; 2772 } 2773 rx_dropped = rx_total - rx; 2774 2775 printf("\nStatistics for device %"PRIu32" ------------------------------" 2776 "\nTX total: %"PRIu64"" 2777 "\nTX dropped: %"PRIu64"" 2778 "\nTX successful: %"PRIu64"" 2779 "\nRX total: %"PRIu64"" 2780 "\nRX dropped: %"PRIu64"" 2781 "\nRX successful: %"PRIu64"", 2782 device_fh, 2783 tx_total, 2784 tx_dropped, 2785 tx, 2786 rx_total, 2787 rx_dropped, 2788 rx); 2789 2790 dev_ll = dev_ll->next; 2791 } 2792 printf("\n======================================================\n"); 2793 } 2794 } 2795 2796 static void 2797 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2798 char *ring_name, uint32_t nb_mbuf) 2799 { 2800 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2801 vpool_array[index].pool 2802 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2803 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2804 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2805 rte_pktmbuf_init, NULL, socket, 0); 2806 if (vpool_array[index].pool != NULL) { 2807 vpool_array[index].ring 2808 = rte_ring_create(ring_name, 2809 rte_align32pow2(nb_mbuf + 1), 2810 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2811 if (likely(vpool_array[index].ring != NULL)) { 2812 LOG_DEBUG(VHOST_CONFIG, 2813 "in setup_mempool_tbl: mbuf count in " 2814 "mempool is: %d\n", 2815 rte_mempool_count(vpool_array[index].pool)); 2816 LOG_DEBUG(VHOST_CONFIG, 2817 "in setup_mempool_tbl: mbuf count in " 2818 "ring is: %d\n", 2819 rte_ring_count(vpool_array[index].ring)); 2820 } else { 2821 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2822 ring_name); 2823 } 2824 2825 /* Need consider head room. */ 2826 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2827 } else { 2828 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2829 } 2830 } 2831 2832 2833 /* 2834 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2835 * device is also registered here to handle the IOCTLs. 
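 *
 * For reference, the zero-copy branch below sizes each per-queue pool as
 *   nb_mbuf = num_rx_descriptor (or num_tx_descriptor)
 *           + num_switching_cores * MBUF_CACHE_SIZE_ZCP
 *           + num_switching_cores * MAX_PKT_BURST
 * and creates one pool/ring pair per RX queue and one per TX queue via
 * setup_mempool_tbl().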
2836 */ 2837 int 2838 main(int argc, char *argv[]) 2839 { 2840 struct rte_mempool *mbuf_pool = NULL; 2841 unsigned lcore_id, core_id = 0; 2842 unsigned nb_ports, valid_num_ports; 2843 int ret; 2844 uint8_t portid; 2845 uint16_t queue_id; 2846 static pthread_t tid; 2847 2848 /* init EAL */ 2849 ret = rte_eal_init(argc, argv); 2850 if (ret < 0) 2851 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2852 argc -= ret; 2853 argv += ret; 2854 2855 /* parse app arguments */ 2856 ret = us_vhost_parse_args(argc, argv); 2857 if (ret < 0) 2858 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2859 2860 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2861 if (rte_lcore_is_enabled(lcore_id)) 2862 lcore_ids[core_id ++] = lcore_id; 2863 2864 if (rte_lcore_count() > RTE_MAX_LCORE) 2865 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2866 2867 /*set the number of swithcing cores available*/ 2868 num_switching_cores = rte_lcore_count()-1; 2869 2870 /* Get the number of physical ports. */ 2871 nb_ports = rte_eth_dev_count(); 2872 if (nb_ports > RTE_MAX_ETHPORTS) 2873 nb_ports = RTE_MAX_ETHPORTS; 2874 2875 /* 2876 * Update the global var NUM_PORTS and global array PORTS 2877 * and get value of var VALID_NUM_PORTS according to system ports number 2878 */ 2879 valid_num_ports = check_ports_num(nb_ports); 2880 2881 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2882 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2883 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2884 return -1; 2885 } 2886 2887 if (zero_copy == 0) { 2888 /* Create the mbuf pool. */ 2889 mbuf_pool = rte_mempool_create( 2890 "MBUF_POOL", 2891 NUM_MBUFS_PER_PORT 2892 * valid_num_ports, 2893 MBUF_SIZE, MBUF_CACHE_SIZE, 2894 sizeof(struct rte_pktmbuf_pool_private), 2895 rte_pktmbuf_pool_init, NULL, 2896 rte_pktmbuf_init, NULL, 2897 rte_socket_id(), 0); 2898 if (mbuf_pool == NULL) 2899 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2900 2901 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2902 vpool_array[queue_id].pool = mbuf_pool; 2903 2904 if (vm2vm_mode == VM2VM_HARDWARE) { 2905 /* Enable VT loop back to let L2 switch to do it. */ 2906 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2907 LOG_DEBUG(VHOST_CONFIG, 2908 "Enable loop back for L2 switch in vmdq.\n"); 2909 } 2910 } else { 2911 uint32_t nb_mbuf; 2912 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2913 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2914 2915 nb_mbuf = num_rx_descriptor 2916 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2917 + num_switching_cores * MAX_PKT_BURST; 2918 2919 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2920 snprintf(pool_name, sizeof(pool_name), 2921 "rxmbuf_pool_%u", queue_id); 2922 snprintf(ring_name, sizeof(ring_name), 2923 "rxmbuf_ring_%u", queue_id); 2924 setup_mempool_tbl(rte_socket_id(), queue_id, 2925 pool_name, ring_name, nb_mbuf); 2926 } 2927 2928 nb_mbuf = num_tx_descriptor 2929 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2930 + num_switching_cores * MAX_PKT_BURST; 2931 2932 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2933 snprintf(pool_name, sizeof(pool_name), 2934 "txmbuf_pool_%u", queue_id); 2935 snprintf(ring_name, sizeof(ring_name), 2936 "txmbuf_ring_%u", queue_id); 2937 setup_mempool_tbl(rte_socket_id(), 2938 (queue_id + MAX_QUEUES), 2939 pool_name, ring_name, nb_mbuf); 2940 } 2941 2942 if (vm2vm_mode == VM2VM_HARDWARE) { 2943 /* Enable VT loop back to let L2 switch to do it. 
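 *
 * With enable_loop_back set, frames destined for another local VM can be
 * switched back by the NIC's VMDQ pool selection rather than in software,
 * which is presumably why VM2VM_HARDWARE enables it in both the copy and
 * zero-copy setup paths (a hedged reading of the flag; see the identical
 * branch earlier in main()).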
*/ 2944 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2945 LOG_DEBUG(VHOST_CONFIG, 2946 "Enable loop back for L2 switch in vmdq.\n"); 2947 } 2948 } 2949 /* Set log level. */ 2950 rte_set_log_level(LOG_LEVEL); 2951 2952 /* initialize all ports */ 2953 for (portid = 0; portid < nb_ports; portid++) { 2954 /* skip ports that are not enabled */ 2955 if ((enabled_port_mask & (1 << portid)) == 0) { 2956 RTE_LOG(INFO, VHOST_PORT, 2957 "Skipping disabled port %d\n", portid); 2958 continue; 2959 } 2960 if (port_init(portid) != 0) 2961 rte_exit(EXIT_FAILURE, 2962 "Cannot initialize network ports\n"); 2963 } 2964 2965 /* Initialise all linked lists. */ 2966 if (init_data_ll() == -1) 2967 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2968 2969 /* Initialize device stats */ 2970 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2971 2972 /* Enable stats if the user option is set. */ 2973 if (enable_stats) 2974 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2975 2976 /* Launch all data cores. */ 2977 if (zero_copy == 0) { 2978 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 2979 rte_eal_remote_launch(switch_worker, 2980 mbuf_pool, lcore_id); 2981 } 2982 } else { 2983 uint32_t count_in_mempool, index, i; 2984 for (index = 0; index < 2*MAX_QUEUES; index++) { 2985 /* For all RX and TX queues. */ 2986 count_in_mempool 2987 = rte_mempool_count(vpool_array[index].pool); 2988 2989 /* 2990 * Transfer all un-attached mbufs from vpool.pool 2991 * to vpoo.ring. 2992 */ 2993 for (i = 0; i < count_in_mempool; i++) { 2994 struct rte_mbuf *mbuf 2995 = __rte_mbuf_raw_alloc( 2996 vpool_array[index].pool); 2997 rte_ring_sp_enqueue(vpool_array[index].ring, 2998 (void *)mbuf); 2999 } 3000 3001 LOG_DEBUG(VHOST_CONFIG, 3002 "in main: mbuf count in mempool at initial " 3003 "is: %d\n", count_in_mempool); 3004 LOG_DEBUG(VHOST_CONFIG, 3005 "in main: mbuf count in ring at initial is :" 3006 " %d\n", 3007 rte_ring_count(vpool_array[index].ring)); 3008 } 3009 3010 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3011 rte_eal_remote_launch(switch_worker_zcp, NULL, 3012 lcore_id); 3013 } 3014 3015 if (mergeable == 0) 3016 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3017 3018 /* Register CUSE device to handle IOCTLs. */ 3019 ret = rte_vhost_driver_register((char *)&dev_basename); 3020 if (ret != 0) 3021 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3022 3023 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3024 3025 /* Start CUSE session. */ 3026 rte_vhost_driver_session_start(); 3027 return 0; 3028 3029 } 3030 3031