/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 512

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +	\
			(num_switching_cores*MAX_PKT_BURST) +		\
			(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
			(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100		/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15		/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4		/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2
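/*
 * Note on the device states above (as used further down in this file): a
 * device starts in DEVICE_MAC_LEARNING, moves to DEVICE_RX once link_vmdq()
 * has learnt its MAC address from its first transmitted packet, returns to
 * DEVICE_MAC_LEARNING in unlink_vmdq(), and is set to DEVICE_SAFE_REMOVE by
 * the data core once removal has been requested.
 */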
/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL	1
#define ACK_DEV_REMOVAL		0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For the legacy front end: 128 descriptors,
 * half for virtio headers, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64. */

/* Get the first 4 bytes in the mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* True if x is a power of 2. */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

/* Mask of enabled ports. */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support. */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly between the NIC and
 * the guest's buffers; disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Number of descriptors to apply. */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
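/*
 * Summary of the run-time defaults set above (all overridable on the command
 * line): vm2vm in software mode, RX retry enabled with a 15 us delay and 4
 * retries, stats disabled, zero copy disabled, and character device basename
 * "vhost-net".
 */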
/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * Required for 1G NICs such as the I350: it fixes a bug where
		 * IPv4 forwarding in the guest could not forward packets from
		 * one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * Should be overridden separately in code with
		 * appropriate values.
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports. */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
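/*
 * For reference: a virtio device with id N is tagged with vlan_tags[N]
 * (i.e. 1000 + N) when it is registered in link_vmdq() below, while external
 * traffic in hardware vm2vm mode uses external_pkt_default_vlan_tag (2000),
 * see virtio_tx_route_zcp().
 */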
/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char   h_dest[ETH_ALEN];
	unsigned char   h_source[ETH_ALEN];
	__be16          h_vlan_proto;
	__be16          h_vlan_TCI;
	__be16          h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
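/*
 * Example: with num_devices == 8 the loop above produces
 * pool_map[i] = { .vlan_id = vlan_tags[i], .pools = 1 << i }, i.e. VLAN tags
 * 1000..1007 are mapped one-to-one onto VMDQ pools 0..7.
 */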
353 */ 354 static inline int 355 validate_num_devices(uint32_t max_nb_devices) 356 { 357 if (num_devices > max_nb_devices) { 358 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 359 return -1; 360 } 361 return 0; 362 } 363 364 /* 365 * Initialises a given port using global settings and with the rx buffers 366 * coming from the mbuf_pool passed as parameter 367 */ 368 static inline int 369 port_init(uint8_t port) 370 { 371 struct rte_eth_dev_info dev_info; 372 struct rte_eth_conf port_conf; 373 struct rte_eth_rxconf *rxconf; 374 struct rte_eth_txconf *txconf; 375 int16_t rx_rings, tx_rings; 376 uint16_t rx_ring_size, tx_ring_size; 377 int retval; 378 uint16_t q; 379 380 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 381 rte_eth_dev_info_get (port, &dev_info); 382 383 if (dev_info.max_rx_queues > MAX_QUEUES) { 384 rte_exit(EXIT_FAILURE, 385 "please define MAX_QUEUES no less than %u in %s\n", 386 dev_info.max_rx_queues, __FILE__); 387 } 388 389 rxconf = &dev_info.default_rxconf; 390 txconf = &dev_info.default_txconf; 391 rxconf->rx_drop_en = 1; 392 393 /* 394 * Zero copy defers queue RX/TX start to the time when guest 395 * finishes its startup and packet buffers from that guest are 396 * available. 397 */ 398 if (zero_copy) { 399 rxconf->rx_deferred_start = 1; 400 rxconf->rx_drop_en = 0; 401 txconf->tx_deferred_start = 1; 402 } 403 404 /*configure the number of supported virtio devices based on VMDQ limits */ 405 num_devices = dev_info.max_vmdq_pools; 406 407 if (zero_copy) { 408 rx_ring_size = num_rx_descriptor; 409 tx_ring_size = num_tx_descriptor; 410 tx_rings = dev_info.max_tx_queues; 411 } else { 412 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 413 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 414 tx_rings = (uint16_t)rte_lcore_count(); 415 } 416 417 retval = validate_num_devices(MAX_DEVICES); 418 if (retval < 0) 419 return retval; 420 421 /* Get port configuration. */ 422 retval = get_eth_conf(&port_conf, num_devices); 423 if (retval < 0) 424 return retval; 425 /* NIC queues are divided into pf queues and vmdq queues. */ 426 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 427 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 428 num_vmdq_queues = num_devices * queues_per_pool; 429 num_queues = num_pf_queues + num_vmdq_queues; 430 vmdq_queue_base = dev_info.vmdq_queue_base; 431 vmdq_pool_base = dev_info.vmdq_pool_base; 432 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 433 num_pf_queues, num_devices, queues_per_pool); 434 435 if (port >= rte_eth_dev_count()) return -1; 436 437 rx_rings = (uint16_t)dev_info.max_rx_queues; 438 /* Configure ethernet device. */ 439 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 440 if (retval != 0) 441 return retval; 442 443 /* Setup the queues. */ 444 for (q = 0; q < rx_rings; q ++) { 445 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 446 rte_eth_dev_socket_id(port), 447 rxconf, 448 vpool_array[q].pool); 449 if (retval < 0) 450 return retval; 451 } 452 for (q = 0; q < tx_rings; q ++) { 453 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 454 rte_eth_dev_socket_id(port), 455 txconf); 456 if (retval < 0) 457 return retval; 458 } 459 460 /* Start the device. 
	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse the basename string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n",
	       prgname);
}
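/*
 * Example invocation (illustrative only):
 *   <prgname> [EAL options] -- -p 0x1 -P --vm2vm 1 --rx-retry 1 --stats 2
 * enables port 0, promiscuous mode, software vm2vm, RX retries and a
 * statistics print-out every 2 seconds.
 */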
577 */ 578 static int 579 us_vhost_parse_args(int argc, char **argv) 580 { 581 int opt, ret; 582 int option_index; 583 unsigned i; 584 const char *prgname = argv[0]; 585 static struct option long_option[] = { 586 {"vm2vm", required_argument, NULL, 0}, 587 {"rx-retry", required_argument, NULL, 0}, 588 {"rx-retry-delay", required_argument, NULL, 0}, 589 {"rx-retry-num", required_argument, NULL, 0}, 590 {"mergeable", required_argument, NULL, 0}, 591 {"stats", required_argument, NULL, 0}, 592 {"dev-basename", required_argument, NULL, 0}, 593 {"zero-copy", required_argument, NULL, 0}, 594 {"rx-desc-num", required_argument, NULL, 0}, 595 {"tx-desc-num", required_argument, NULL, 0}, 596 {NULL, 0, 0, 0}, 597 }; 598 599 /* Parse command line */ 600 while ((opt = getopt_long(argc, argv, "p:P", 601 long_option, &option_index)) != EOF) { 602 switch (opt) { 603 /* Portmask */ 604 case 'p': 605 enabled_port_mask = parse_portmask(optarg); 606 if (enabled_port_mask == 0) { 607 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 608 us_vhost_usage(prgname); 609 return -1; 610 } 611 break; 612 613 case 'P': 614 promiscuous = 1; 615 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 616 ETH_VMDQ_ACCEPT_BROADCAST | 617 ETH_VMDQ_ACCEPT_MULTICAST; 618 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 619 620 break; 621 622 case 0: 623 /* Enable/disable vm2vm comms. */ 624 if (!strncmp(long_option[option_index].name, "vm2vm", 625 MAX_LONG_OPT_SZ)) { 626 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 627 if (ret == -1) { 628 RTE_LOG(INFO, VHOST_CONFIG, 629 "Invalid argument for " 630 "vm2vm [0|1|2]\n"); 631 us_vhost_usage(prgname); 632 return -1; 633 } else { 634 vm2vm_mode = (vm2vm_type)ret; 635 } 636 } 637 638 /* Enable/disable retries on RX. */ 639 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 640 ret = parse_num_opt(optarg, 1); 641 if (ret == -1) { 642 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 643 us_vhost_usage(prgname); 644 return -1; 645 } else { 646 enable_retry = ret; 647 } 648 } 649 650 /* Specify the retries delay time (in useconds) on RX. */ 651 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 652 ret = parse_num_opt(optarg, INT32_MAX); 653 if (ret == -1) { 654 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 655 us_vhost_usage(prgname); 656 return -1; 657 } else { 658 burst_rx_delay_time = ret; 659 } 660 } 661 662 /* Specify the retries number on RX. */ 663 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 664 ret = parse_num_opt(optarg, INT32_MAX); 665 if (ret == -1) { 666 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 667 us_vhost_usage(prgname); 668 return -1; 669 } else { 670 burst_rx_retry_num = ret; 671 } 672 } 673 674 /* Enable/disable RX mergeable buffers. */ 675 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 676 ret = parse_num_opt(optarg, 1); 677 if (ret == -1) { 678 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 679 us_vhost_usage(prgname); 680 return -1; 681 } else { 682 mergeable = !!ret; 683 if (ret) { 684 vmdq_conf_default.rxmode.jumbo_frame = 1; 685 vmdq_conf_default.rxmode.max_rx_pkt_len 686 = JUMBO_FRAME_MAX_SIZE; 687 } 688 } 689 } 690 691 /* Enable/disable stats. 
			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for rx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for tx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;
		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var num_ports and the array ports according to the number
 * of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
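/*
 * PRINT_PACKET is used in the RX/TX paths below, e.g.
 * PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); it only produces
 * output when this file is built with DEBUG defined.
 */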
862 */ 863 static inline uint64_t __attribute__((always_inline)) 864 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 865 uint32_t buf_len, hpa_type *addr_type) 866 { 867 struct virtio_memory_regions_hpa *region; 868 uint32_t regionidx; 869 uint64_t vhost_pa = 0; 870 871 *addr_type = PHYS_ADDR_INVALID; 872 873 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 874 region = &vdev->regions_hpa[regionidx]; 875 if ((guest_pa >= region->guest_phys_address) && 876 (guest_pa <= region->guest_phys_address_end)) { 877 vhost_pa = region->host_phys_addr_offset + guest_pa; 878 if (likely((guest_pa + buf_len - 1) 879 <= region->guest_phys_address_end)) 880 *addr_type = PHYS_ADDR_CONTINUOUS; 881 else 882 *addr_type = PHYS_ADDR_CROSS_SUBREG; 883 break; 884 } 885 } 886 887 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 888 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 889 (void *)(uintptr_t)vhost_pa); 890 891 return vhost_pa; 892 } 893 894 /* 895 * Compares a packet destination MAC address to a device MAC address. 896 */ 897 static inline int __attribute__((always_inline)) 898 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 899 { 900 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 901 } 902 903 /* 904 * This function learns the MAC address of the device and registers this along with a 905 * vlan tag to a VMDQ. 906 */ 907 static int 908 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 909 { 910 struct ether_hdr *pkt_hdr; 911 struct virtio_net_data_ll *dev_ll; 912 struct virtio_net *dev = vdev->dev; 913 int i, ret; 914 915 /* Learn MAC address of guest device from packet */ 916 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 917 918 dev_ll = ll_root_used; 919 920 while (dev_ll != NULL) { 921 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 922 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 923 return -1; 924 } 925 dev_ll = dev_ll->next; 926 } 927 928 for (i = 0; i < ETHER_ADDR_LEN; i++) 929 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 930 931 /* vlan_tag currently uses the device_id. */ 932 vdev->vlan_tag = vlan_tags[dev->device_fh]; 933 934 /* Print out VMDQ registration info. */ 935 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 936 dev->device_fh, 937 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 938 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 939 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 940 vdev->vlan_tag); 941 942 /* Register the MAC address. */ 943 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 944 (uint32_t)dev->device_fh + vmdq_pool_base); 945 if (ret) 946 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 947 dev->device_fh); 948 949 /* Enable stripping of the vlan tag as we handle routing. */ 950 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 951 952 /* Set device as ready for RX. */ 953 vdev->ready = DEVICE_RX; 954 955 return 0; 956 } 957 958 /* 959 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 960 * queue before disabling RX on the device. 
961 */ 962 static inline void 963 unlink_vmdq(struct vhost_dev *vdev) 964 { 965 unsigned i = 0; 966 unsigned rx_count; 967 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 968 969 if (vdev->ready == DEVICE_RX) { 970 /*clear MAC and VLAN settings*/ 971 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 972 for (i = 0; i < 6; i++) 973 vdev->mac_address.addr_bytes[i] = 0; 974 975 vdev->vlan_tag = 0; 976 977 /*Clear out the receive buffers*/ 978 rx_count = rte_eth_rx_burst(ports[0], 979 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 980 981 while (rx_count) { 982 for (i = 0; i < rx_count; i++) 983 rte_pktmbuf_free(pkts_burst[i]); 984 985 rx_count = rte_eth_rx_burst(ports[0], 986 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 987 } 988 989 vdev->ready = DEVICE_MAC_LEARNING; 990 } 991 } 992 993 /* 994 * Check if the packet destination MAC address is for a local device. If so then put 995 * the packet on that devices RX queue. If not then return. 996 */ 997 static inline int __attribute__((always_inline)) 998 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 999 { 1000 struct virtio_net_data_ll *dev_ll; 1001 struct ether_hdr *pkt_hdr; 1002 uint64_t ret = 0; 1003 struct virtio_net *dev = vdev->dev; 1004 struct virtio_net *tdev; /* destination virito device */ 1005 1006 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1007 1008 /*get the used devices list*/ 1009 dev_ll = ll_root_used; 1010 1011 while (dev_ll != NULL) { 1012 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1013 &dev_ll->vdev->mac_address)) { 1014 1015 /* Drop the packet if the TX packet is destined for the TX device. */ 1016 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1017 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1018 dev->device_fh); 1019 return 0; 1020 } 1021 tdev = dev_ll->vdev->dev; 1022 1023 1024 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1025 1026 if (unlikely(dev_ll->vdev->remove)) { 1027 /*drop the packet if the device is marked for removal*/ 1028 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1029 } else { 1030 /*send the packet to the local virtio device*/ 1031 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1032 if (enable_stats) { 1033 rte_atomic64_add( 1034 &dev_statistics[tdev->device_fh].rx_total_atomic, 1035 1); 1036 rte_atomic64_add( 1037 &dev_statistics[tdev->device_fh].rx_atomic, 1038 ret); 1039 dev_statistics[tdev->device_fh].tx_total++; 1040 dev_statistics[tdev->device_fh].tx += ret; 1041 } 1042 } 1043 1044 return 0; 1045 } 1046 dev_ll = dev_ll->next; 1047 } 1048 1049 return -1; 1050 } 1051 1052 /* 1053 * Check if the destination MAC of a packet is one local VM, 1054 * and get its vlan tag, and offset if it is. 1055 */ 1056 static inline int __attribute__((always_inline)) 1057 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1058 uint32_t *offset, uint16_t *vlan_tag) 1059 { 1060 struct virtio_net_data_ll *dev_ll = ll_root_used; 1061 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1062 1063 while (dev_ll != NULL) { 1064 if ((dev_ll->vdev->ready == DEVICE_RX) 1065 && ether_addr_cmp(&(pkt_hdr->d_addr), 1066 &dev_ll->vdev->mac_address)) { 1067 /* 1068 * Drop the packet if the TX packet is 1069 * destined for the TX device. 
1070 */ 1071 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1072 LOG_DEBUG(VHOST_DATA, 1073 "(%"PRIu64") TX: Source and destination" 1074 " MAC addresses are the same. Dropping " 1075 "packet.\n", 1076 dev_ll->vdev->dev->device_fh); 1077 return -1; 1078 } 1079 1080 /* 1081 * HW vlan strip will reduce the packet length 1082 * by minus length of vlan tag, so need restore 1083 * the packet length by plus it. 1084 */ 1085 *offset = VLAN_HLEN; 1086 *vlan_tag = 1087 (uint16_t) 1088 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1089 1090 LOG_DEBUG(VHOST_DATA, 1091 "(%"PRIu64") TX: pkt to local VM device id:" 1092 "(%"PRIu64") vlan tag: %d.\n", 1093 dev->device_fh, dev_ll->vdev->dev->device_fh, 1094 vlan_tag); 1095 1096 break; 1097 } 1098 dev_ll = dev_ll->next; 1099 } 1100 return 0; 1101 } 1102 1103 /* 1104 * This function routes the TX packet to the correct interface. This may be a local device 1105 * or the physical port. 1106 */ 1107 static inline void __attribute__((always_inline)) 1108 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1109 { 1110 struct mbuf_table *tx_q; 1111 struct rte_mbuf **m_table; 1112 unsigned len, ret, offset = 0; 1113 const uint16_t lcore_id = rte_lcore_id(); 1114 struct virtio_net *dev = vdev->dev; 1115 1116 /*check if destination is local VM*/ 1117 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1118 rte_pktmbuf_free(m); 1119 return; 1120 } 1121 1122 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1123 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1124 rte_pktmbuf_free(m); 1125 return; 1126 } 1127 } 1128 1129 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1130 1131 /*Add packet to the port tx queue*/ 1132 tx_q = &lcore_tx_queue[lcore_id]; 1133 len = tx_q->len; 1134 1135 m->ol_flags = PKT_TX_VLAN_PKT; 1136 1137 /* 1138 * Find the right seg to adjust the data len when offset is 1139 * bigger than tail room size. 1140 */ 1141 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1142 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1143 m->data_len += offset; 1144 else { 1145 struct rte_mbuf *seg = m; 1146 1147 while ((seg->next != NULL) && 1148 (offset > rte_pktmbuf_tailroom(seg))) 1149 seg = seg->next; 1150 1151 seg->data_len += offset; 1152 } 1153 m->pkt_len += offset; 1154 } 1155 1156 m->vlan_tci = vlan_tag; 1157 1158 tx_q->m_table[len] = m; 1159 len++; 1160 if (enable_stats) { 1161 dev_statistics[dev->device_fh].tx_total++; 1162 dev_statistics[dev->device_fh].tx++; 1163 } 1164 1165 if (unlikely(len == MAX_PKT_BURST)) { 1166 m_table = (struct rte_mbuf **)tx_q->m_table; 1167 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1168 /* Free any buffers not handled by TX and update the port stats. */ 1169 if (unlikely(ret < len)) { 1170 do { 1171 rte_pktmbuf_free(m_table[ret]); 1172 } while (++ret < len); 1173 } 1174 1175 len = 0; 1176 } 1177 1178 tx_q->len = len; 1179 return; 1180 } 1181 /* 1182 * This function is called by each data core. It handles all RX/TX registered with the 1183 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1184 * with all devices in the main linked list. 
1185 */ 1186 static int 1187 switch_worker(__attribute__((unused)) void *arg) 1188 { 1189 struct rte_mempool *mbuf_pool = arg; 1190 struct virtio_net *dev = NULL; 1191 struct vhost_dev *vdev = NULL; 1192 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1193 struct virtio_net_data_ll *dev_ll; 1194 struct mbuf_table *tx_q; 1195 volatile struct lcore_ll_info *lcore_ll; 1196 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1197 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1198 unsigned ret, i; 1199 const uint16_t lcore_id = rte_lcore_id(); 1200 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1201 uint16_t rx_count = 0; 1202 uint16_t tx_count; 1203 uint32_t retry = 0; 1204 1205 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1206 lcore_ll = lcore_info[lcore_id].lcore_ll; 1207 prev_tsc = 0; 1208 1209 tx_q = &lcore_tx_queue[lcore_id]; 1210 for (i = 0; i < num_cores; i ++) { 1211 if (lcore_ids[i] == lcore_id) { 1212 tx_q->txq_id = i; 1213 break; 1214 } 1215 } 1216 1217 while(1) { 1218 cur_tsc = rte_rdtsc(); 1219 /* 1220 * TX burst queue drain 1221 */ 1222 diff_tsc = cur_tsc - prev_tsc; 1223 if (unlikely(diff_tsc > drain_tsc)) { 1224 1225 if (tx_q->len) { 1226 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1227 1228 /*Tx any packets in the queue*/ 1229 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1230 (struct rte_mbuf **)tx_q->m_table, 1231 (uint16_t)tx_q->len); 1232 if (unlikely(ret < tx_q->len)) { 1233 do { 1234 rte_pktmbuf_free(tx_q->m_table[ret]); 1235 } while (++ret < tx_q->len); 1236 } 1237 1238 tx_q->len = 0; 1239 } 1240 1241 prev_tsc = cur_tsc; 1242 1243 } 1244 1245 rte_prefetch0(lcore_ll->ll_root_used); 1246 /* 1247 * Inform the configuration core that we have exited the linked list and that no devices are 1248 * in use if requested. 
1249 */ 1250 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1251 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1252 1253 /* 1254 * Process devices 1255 */ 1256 dev_ll = lcore_ll->ll_root_used; 1257 1258 while (dev_ll != NULL) { 1259 /*get virtio device ID*/ 1260 vdev = dev_ll->vdev; 1261 dev = vdev->dev; 1262 1263 if (unlikely(vdev->remove)) { 1264 dev_ll = dev_ll->next; 1265 unlink_vmdq(vdev); 1266 vdev->ready = DEVICE_SAFE_REMOVE; 1267 continue; 1268 } 1269 if (likely(vdev->ready == DEVICE_RX)) { 1270 /*Handle guest RX*/ 1271 rx_count = rte_eth_rx_burst(ports[0], 1272 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1273 1274 if (rx_count) { 1275 /* 1276 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1277 * Here MAX_PKT_BURST must be less than virtio queue size 1278 */ 1279 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1280 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1281 rte_delay_us(burst_rx_delay_time); 1282 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1283 break; 1284 } 1285 } 1286 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1287 if (enable_stats) { 1288 rte_atomic64_add( 1289 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1290 rx_count); 1291 rte_atomic64_add( 1292 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1293 } 1294 while (likely(rx_count)) { 1295 rx_count--; 1296 rte_pktmbuf_free(pkts_burst[rx_count]); 1297 } 1298 1299 } 1300 } 1301 1302 if (likely(!vdev->remove)) { 1303 /* Handle guest TX*/ 1304 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1305 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1306 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1307 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1308 while (tx_count--) 1309 rte_pktmbuf_free(pkts_burst[tx_count]); 1310 } 1311 } 1312 while (tx_count) 1313 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1314 } 1315 1316 /*move to the next device in the list*/ 1317 dev_ll = dev_ll->next; 1318 } 1319 } 1320 1321 return 0; 1322 } 1323 1324 /* 1325 * This function gets available ring number for zero copy rx. 1326 * Only one thread will call this funciton for a paticular virtio device, 1327 * so, it is designed as non-thread-safe function. 1328 */ 1329 static inline uint32_t __attribute__((always_inline)) 1330 get_available_ring_num_zcp(struct virtio_net *dev) 1331 { 1332 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1333 uint16_t avail_idx; 1334 1335 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1336 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1337 } 1338 1339 /* 1340 * This function gets available ring index for zero copy rx, 1341 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1342 * Only one thread will call this funciton for a paticular virtio device, 1343 * so, it is designed as non-thread-safe function. 
1344 */ 1345 static inline uint32_t __attribute__((always_inline)) 1346 get_available_ring_index_zcp(struct virtio_net *dev, 1347 uint16_t *res_base_idx, uint32_t count) 1348 { 1349 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1350 uint16_t avail_idx; 1351 uint32_t retry = 0; 1352 uint16_t free_entries; 1353 1354 *res_base_idx = vq->last_used_idx_res; 1355 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1356 free_entries = (avail_idx - *res_base_idx); 1357 1358 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1359 "avail idx: %d, " 1360 "res base idx:%d, free entries:%d\n", 1361 dev->device_fh, avail_idx, *res_base_idx, 1362 free_entries); 1363 1364 /* 1365 * If retry is enabled and the queue is full then we wait 1366 * and retry to avoid packet loss. 1367 */ 1368 if (enable_retry && unlikely(count > free_entries)) { 1369 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1370 rte_delay_us(burst_rx_delay_time); 1371 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1372 free_entries = (avail_idx - *res_base_idx); 1373 if (count <= free_entries) 1374 break; 1375 } 1376 } 1377 1378 /*check that we have enough buffers*/ 1379 if (unlikely(count > free_entries)) 1380 count = free_entries; 1381 1382 if (unlikely(count == 0)) { 1383 LOG_DEBUG(VHOST_DATA, 1384 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1385 "avail idx: %d, res base idx:%d, free entries:%d\n", 1386 dev->device_fh, avail_idx, 1387 *res_base_idx, free_entries); 1388 return 0; 1389 } 1390 1391 vq->last_used_idx_res = *res_base_idx + count; 1392 1393 return count; 1394 } 1395 1396 /* 1397 * This function put descriptor back to used list. 1398 */ 1399 static inline void __attribute__((always_inline)) 1400 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1401 { 1402 uint16_t res_cur_idx = vq->last_used_idx; 1403 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1404 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1405 rte_compiler_barrier(); 1406 *(volatile uint16_t *)&vq->used->idx += 1; 1407 vq->last_used_idx += 1; 1408 1409 /* Kick the guest if necessary. */ 1410 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1411 eventfd_write((int)vq->kickfd, 1); 1412 } 1413 1414 /* 1415 * This function get available descriptor from vitio vring and un-attached mbuf 1416 * from vpool->ring, and then attach them together. It needs adjust the offset 1417 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1418 * frame data may be put to wrong location in mbuf. 
1419 */ 1420 static inline void __attribute__((always_inline)) 1421 attach_rxmbuf_zcp(struct virtio_net *dev) 1422 { 1423 uint16_t res_base_idx, desc_idx; 1424 uint64_t buff_addr, phys_addr; 1425 struct vhost_virtqueue *vq; 1426 struct vring_desc *desc; 1427 struct rte_mbuf *mbuf = NULL; 1428 struct vpool *vpool; 1429 hpa_type addr_type; 1430 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1431 1432 vpool = &vpool_array[vdev->vmdq_rx_q]; 1433 vq = dev->virtqueue[VIRTIO_RXQ]; 1434 1435 do { 1436 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1437 1) != 1)) 1438 return; 1439 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1440 1441 desc = &vq->desc[desc_idx]; 1442 if (desc->flags & VRING_DESC_F_NEXT) { 1443 desc = &vq->desc[desc->next]; 1444 buff_addr = gpa_to_vva(dev, desc->addr); 1445 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1446 &addr_type); 1447 } else { 1448 buff_addr = gpa_to_vva(dev, 1449 desc->addr + vq->vhost_hlen); 1450 phys_addr = gpa_to_hpa(vdev, 1451 desc->addr + vq->vhost_hlen, 1452 desc->len, &addr_type); 1453 } 1454 1455 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1456 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1457 " address found when attaching RX frame buffer" 1458 " address!\n", dev->device_fh); 1459 put_desc_to_used_list_zcp(vq, desc_idx); 1460 continue; 1461 } 1462 1463 /* 1464 * Check if the frame buffer address from guest crosses 1465 * sub-region or not. 1466 */ 1467 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1468 RTE_LOG(ERR, VHOST_DATA, 1469 "(%"PRIu64") Frame buffer address cross " 1470 "sub-regioin found when attaching RX frame " 1471 "buffer address!\n", 1472 dev->device_fh); 1473 put_desc_to_used_list_zcp(vq, desc_idx); 1474 continue; 1475 } 1476 } while (unlikely(phys_addr == 0)); 1477 1478 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1479 if (unlikely(mbuf == NULL)) { 1480 LOG_DEBUG(VHOST_DATA, 1481 "(%"PRIu64") in attach_rxmbuf_zcp: " 1482 "ring_sc_dequeue fail.\n", 1483 dev->device_fh); 1484 put_desc_to_used_list_zcp(vq, desc_idx); 1485 return; 1486 } 1487 1488 if (unlikely(vpool->buf_size > desc->len)) { 1489 LOG_DEBUG(VHOST_DATA, 1490 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1491 "length(%d) of descriptor idx: %d less than room " 1492 "size required: %d\n", 1493 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1494 put_desc_to_used_list_zcp(vq, desc_idx); 1495 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1496 return; 1497 } 1498 1499 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1500 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1501 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1502 mbuf->data_len = desc->len; 1503 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1504 1505 LOG_DEBUG(VHOST_DATA, 1506 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1507 "descriptor idx:%d\n", 1508 dev->device_fh, res_base_idx, desc_idx); 1509 1510 __rte_mbuf_raw_free(mbuf); 1511 1512 return; 1513 } 1514 1515 /* 1516 * Detach an attched packet mbuf - 1517 * - restore original mbuf address and length values. 1518 * - reset pktmbuf data and data_len to their default values. 1519 * All other fields of the given packet mbuf will be left intact. 1520 * 1521 * @param m 1522 * The attached packet mbuf. 
1523 */ 1524 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1525 { 1526 const struct rte_mempool *mp = m->pool; 1527 void *buf = RTE_MBUF_TO_BADDR(m); 1528 uint32_t buf_ofs; 1529 uint32_t buf_len = mp->elt_size - sizeof(*m); 1530 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1531 1532 m->buf_addr = buf; 1533 m->buf_len = (uint16_t)buf_len; 1534 1535 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1536 RTE_PKTMBUF_HEADROOM : m->buf_len; 1537 m->data_off = buf_ofs; 1538 1539 m->data_len = 0; 1540 } 1541 1542 /* 1543 * This function is called after packets have been transimited. It fetchs mbuf 1544 * from vpool->pool, detached it and put into vpool->ring. It also update the 1545 * used index and kick the guest if necessary. 1546 */ 1547 static inline uint32_t __attribute__((always_inline)) 1548 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1549 { 1550 struct rte_mbuf *mbuf; 1551 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1552 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1553 uint32_t index = 0; 1554 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1555 1556 LOG_DEBUG(VHOST_DATA, 1557 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1558 "clean is: %d\n", 1559 dev->device_fh, mbuf_count); 1560 LOG_DEBUG(VHOST_DATA, 1561 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1562 "clean is : %d\n", 1563 dev->device_fh, rte_ring_count(vpool->ring)); 1564 1565 for (index = 0; index < mbuf_count; index++) { 1566 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1567 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1568 pktmbuf_detach_zcp(mbuf); 1569 rte_ring_sp_enqueue(vpool->ring, mbuf); 1570 1571 /* Update used index buffer information. */ 1572 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1573 vq->used->ring[used_idx].len = 0; 1574 1575 used_idx = (used_idx + 1) & (vq->size - 1); 1576 } 1577 1578 LOG_DEBUG(VHOST_DATA, 1579 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1580 "clean is: %d\n", 1581 dev->device_fh, rte_mempool_count(vpool->pool)); 1582 LOG_DEBUG(VHOST_DATA, 1583 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1584 "clean is : %d\n", 1585 dev->device_fh, rte_ring_count(vpool->ring)); 1586 LOG_DEBUG(VHOST_DATA, 1587 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1588 "vq->last_used_idx:%d\n", 1589 dev->device_fh, vq->last_used_idx); 1590 1591 vq->last_used_idx += mbuf_count; 1592 1593 LOG_DEBUG(VHOST_DATA, 1594 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1595 "vq->last_used_idx:%d\n", 1596 dev->device_fh, vq->last_used_idx); 1597 1598 rte_compiler_barrier(); 1599 1600 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1601 1602 /* Kick guest if required. */ 1603 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1604 eventfd_write((int)vq->kickfd, 1); 1605 1606 return 0; 1607 } 1608 1609 /* 1610 * This function is called when a virtio device is destroy. 1611 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1612 */ 1613 static void mbuf_destroy_zcp(struct vpool *vpool) 1614 { 1615 struct rte_mbuf *mbuf = NULL; 1616 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1617 1618 LOG_DEBUG(VHOST_CONFIG, 1619 "in mbuf_destroy_zcp: mbuf count in mempool before " 1620 "mbuf_destroy_zcp is: %d\n", 1621 mbuf_count); 1622 LOG_DEBUG(VHOST_CONFIG, 1623 "in mbuf_destroy_zcp: mbuf count in ring before " 1624 "mbuf_destroy_zcp is : %d\n", 1625 rte_ring_count(vpool->ring)); 1626 1627 for (index = 0; index < mbuf_count; index++) { 1628 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1629 if (likely(mbuf != NULL)) { 1630 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1631 pktmbuf_detach_zcp(mbuf); 1632 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1633 } 1634 } 1635 1636 LOG_DEBUG(VHOST_CONFIG, 1637 "in mbuf_destroy_zcp: mbuf count in mempool after " 1638 "mbuf_destroy_zcp is: %d\n", 1639 rte_mempool_count(vpool->pool)); 1640 LOG_DEBUG(VHOST_CONFIG, 1641 "in mbuf_destroy_zcp: mbuf count in ring after " 1642 "mbuf_destroy_zcp is : %d\n", 1643 rte_ring_count(vpool->ring)); 1644 } 1645 1646 /* 1647 * This function update the use flag and counter. 1648 */ 1649 static inline uint32_t __attribute__((always_inline)) 1650 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1651 uint32_t count) 1652 { 1653 struct vhost_virtqueue *vq; 1654 struct vring_desc *desc; 1655 struct rte_mbuf *buff; 1656 /* The virtio_hdr is initialised to 0. */ 1657 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1658 = {{0, 0, 0, 0, 0, 0}, 0}; 1659 uint64_t buff_hdr_addr = 0; 1660 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1661 uint32_t head_idx, packet_success = 0; 1662 uint16_t res_cur_idx; 1663 1664 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1665 1666 if (count == 0) 1667 return 0; 1668 1669 vq = dev->virtqueue[VIRTIO_RXQ]; 1670 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1671 1672 res_cur_idx = vq->last_used_idx; 1673 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1674 dev->device_fh, res_cur_idx, res_cur_idx + count); 1675 1676 /* Retrieve all of the head indexes first to avoid caching issues. */ 1677 for (head_idx = 0; head_idx < count; head_idx++) 1678 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1679 1680 /*Prefetch descriptor index. */ 1681 rte_prefetch0(&vq->desc[head[packet_success]]); 1682 1683 while (packet_success != count) { 1684 /* Get descriptor from available ring */ 1685 desc = &vq->desc[head[packet_success]]; 1686 1687 buff = pkts[packet_success]; 1688 LOG_DEBUG(VHOST_DATA, 1689 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1690 "pkt[%d] descriptor idx: %d\n", 1691 dev->device_fh, packet_success, 1692 MBUF_HEADROOM_UINT32(buff)); 1693 1694 PRINT_PACKET(dev, 1695 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1696 + RTE_PKTMBUF_HEADROOM), 1697 rte_pktmbuf_data_len(buff), 0); 1698 1699 /* Buffer address translation for virtio header. */ 1700 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1701 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1702 1703 /* 1704 * If the descriptors are chained the header and data are 1705 * placed in separate buffers. 
1706 */ 1707 if (desc->flags & VRING_DESC_F_NEXT) { 1708 desc->len = vq->vhost_hlen; 1709 desc = &vq->desc[desc->next]; 1710 desc->len = rte_pktmbuf_data_len(buff); 1711 } else { 1712 desc->len = packet_len; 1713 } 1714 1715 /* Update used ring with desc information */ 1716 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1717 = head[packet_success]; 1718 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1719 = packet_len; 1720 res_cur_idx++; 1721 packet_success++; 1722 1723 /* A header is required per buffer. */ 1724 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1725 (const void *)&virtio_hdr, vq->vhost_hlen); 1726 1727 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1728 1729 if (likely(packet_success < count)) { 1730 /* Prefetch descriptor index. */ 1731 rte_prefetch0(&vq->desc[head[packet_success]]); 1732 } 1733 } 1734 1735 rte_compiler_barrier(); 1736 1737 LOG_DEBUG(VHOST_DATA, 1738 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1739 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1740 dev->device_fh, vq->last_used_idx, vq->used->idx); 1741 1742 *(volatile uint16_t *)&vq->used->idx += count; 1743 vq->last_used_idx += count; 1744 1745 LOG_DEBUG(VHOST_DATA, 1746 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1747 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1748 dev->device_fh, vq->last_used_idx, vq->used->idx); 1749 1750 /* Kick the guest if necessary. */ 1751 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1752 eventfd_write((int)vq->kickfd, 1); 1753 1754 return count; 1755 } 1756 1757 /* 1758 * This function routes the TX packet to the correct interface. 1759 * This may be a local device or the physical port. 1760 */ 1761 static inline void __attribute__((always_inline)) 1762 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1763 uint32_t desc_idx, uint8_t need_copy) 1764 { 1765 struct mbuf_table *tx_q; 1766 struct rte_mbuf **m_table; 1767 struct rte_mbuf *mbuf = NULL; 1768 unsigned len, ret, offset = 0; 1769 struct vpool *vpool; 1770 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1771 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1772 1773 /*Add packet to the port tx queue*/ 1774 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1775 len = tx_q->len; 1776 1777 /* Allocate an mbuf and populate the structure. */ 1778 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1779 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1780 if (unlikely(mbuf == NULL)) { 1781 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1782 RTE_LOG(ERR, VHOST_DATA, 1783 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1784 dev->device_fh); 1785 put_desc_to_used_list_zcp(vq, desc_idx); 1786 return; 1787 } 1788 1789 if (vm2vm_mode == VM2VM_HARDWARE) { 1790 /* Avoid using a vlan tag from any vm for external pkt, such as 1791 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1792 * selection, MAC address determines it as an external pkt 1793 * which should go to network, while vlan tag determine it as 1794 * a vm2vm pkt should forward to another vm. Hardware confuse 1795 * such a ambiguous situation, so pkt will lost. 
1796 */ 1797 vlan_tag = external_pkt_default_vlan_tag; 1798 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1799 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1800 __rte_mbuf_raw_free(mbuf); 1801 return; 1802 } 1803 } 1804 1805 mbuf->nb_segs = m->nb_segs; 1806 mbuf->next = m->next; 1807 mbuf->data_len = m->data_len + offset; 1808 mbuf->pkt_len = mbuf->data_len; 1809 if (unlikely(need_copy)) { 1810 /* Copy the packet contents to the mbuf. */ 1811 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1812 rte_pktmbuf_mtod(m, void *), 1813 m->data_len); 1814 } else { 1815 mbuf->data_off = m->data_off; 1816 mbuf->buf_physaddr = m->buf_physaddr; 1817 mbuf->buf_addr = m->buf_addr; 1818 } 1819 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1820 mbuf->vlan_tci = vlan_tag; 1821 mbuf->l2_len = sizeof(struct ether_hdr); 1822 mbuf->l3_len = sizeof(struct ipv4_hdr); 1823 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1824 1825 tx_q->m_table[len] = mbuf; 1826 len++; 1827 1828 LOG_DEBUG(VHOST_DATA, 1829 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1830 dev->device_fh, 1831 mbuf->nb_segs, 1832 (mbuf->next == NULL) ? "null" : "non-null"); 1833 1834 if (enable_stats) { 1835 dev_statistics[dev->device_fh].tx_total++; 1836 dev_statistics[dev->device_fh].tx++; 1837 } 1838 1839 if (unlikely(len == MAX_PKT_BURST)) { 1840 m_table = (struct rte_mbuf **)tx_q->m_table; 1841 ret = rte_eth_tx_burst(ports[0], 1842 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1843 1844 /* 1845 * Free any buffers not handled by TX and update 1846 * the port stats. 1847 */ 1848 if (unlikely(ret < len)) { 1849 do { 1850 rte_pktmbuf_free(m_table[ret]); 1851 } while (++ret < len); 1852 } 1853 1854 len = 0; 1855 txmbuf_clean_zcp(dev, vpool); 1856 } 1857 1858 tx_q->len = len; 1859 1860 return; 1861 } 1862 1863 /* 1864 * This function TX all available packets in virtio TX queue for one 1865 * virtio-net device. If it is first packet, it learns MAC address and 1866 * setup VMDQ. 1867 */ 1868 static inline void __attribute__((always_inline)) 1869 virtio_dev_tx_zcp(struct virtio_net *dev) 1870 { 1871 struct rte_mbuf m; 1872 struct vhost_virtqueue *vq; 1873 struct vring_desc *desc; 1874 uint64_t buff_addr = 0, phys_addr; 1875 uint32_t head[MAX_PKT_BURST]; 1876 uint32_t i; 1877 uint16_t free_entries, packet_success = 0; 1878 uint16_t avail_idx; 1879 uint8_t need_copy = 0; 1880 hpa_type addr_type; 1881 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1882 1883 vq = dev->virtqueue[VIRTIO_TXQ]; 1884 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1885 1886 /* If there are no available buffers then return. */ 1887 if (vq->last_used_idx_res == avail_idx) 1888 return; 1889 1890 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1891 1892 /* Prefetch available ring to retrieve head indexes. */ 1893 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1894 1895 /* Get the number of free entries in the ring */ 1896 free_entries = (avail_idx - vq->last_used_idx_res); 1897 1898 /* Limit to MAX_PKT_BURST. */ 1899 free_entries 1900 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1901 1902 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1903 dev->device_fh, free_entries); 1904 1905 /* Retrieve all of the head indexes first to avoid caching issues. */ 1906 for (i = 0; i < free_entries; i++) 1907 head[i] 1908 = vq->avail->ring[(vq->last_used_idx_res + i) 1909 & (vq->size - 1)]; 1910 1911 vq->last_used_idx_res += free_entries; 1912 1913 /* Prefetch descriptor index. 
	 */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];

		/* Discard first buffer as it is the virtio header */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Need to check extra VLAN_HLEN bytes for inserting the VLAN tag */
		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
			&addr_type);

		if (likely(packet_success < (free_entries - 1)))
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success + 1]]);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Invalid frame buffer address found "
				"when TX packets!\n",
				dev->device_fh);
			packet_success++;
			continue;
		}

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/*
		 * Setup dummy mbuf. This is copied to a real mbuf if
		 * transmitted out the physical port.
		 */
		m.data_len = desc->len;
		m.nb_segs = 1;
		m.next = NULL;
		m.data_off = 0;
		m.buf_addr = (void *)(uintptr_t)buff_addr;
		m.buf_physaddr = phys_addr;

		/*
		 * Check if the frame buffer address from the guest crosses
		 * a sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address crosses a "
				"sub-region boundary when attaching TX frame "
				"buffer address!\n",
				dev->device_fh);
			need_copy = 1;
		} else
			need_copy = 0;

		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

		/*
		 * If this is the first received packet we need to learn
		 * the MAC and setup VMDQ.
		 */
		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
				/*
				 * Discard frame if device is scheduled for
				 * removal or a duplicate MAC address is found.
				 */
				packet_success += free_entries;
				vq->last_used_idx += packet_success;
				break;
			}
		}

		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
		packet_success++;
	}
}

/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
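 * Each iteration of the worker loop: (1) drain the TX burst queue once
 * roughly every BURST_TX_DRAIN_US microseconds, (2) acknowledge a pending
 * device-removal request, then (3) for every device on this core attach
 * guest RX buffers, receive packets from the physical port into the guest
 * and drain the guest TX virtqueue.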
1997 */ 1998 static int 1999 switch_worker_zcp(__attribute__((unused)) void *arg) 2000 { 2001 struct virtio_net *dev = NULL; 2002 struct vhost_dev *vdev = NULL; 2003 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2004 struct virtio_net_data_ll *dev_ll; 2005 struct mbuf_table *tx_q; 2006 volatile struct lcore_ll_info *lcore_ll; 2007 const uint64_t drain_tsc 2008 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2009 * BURST_TX_DRAIN_US; 2010 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2011 unsigned ret; 2012 const uint16_t lcore_id = rte_lcore_id(); 2013 uint16_t count_in_ring, rx_count = 0; 2014 2015 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2016 2017 lcore_ll = lcore_info[lcore_id].lcore_ll; 2018 prev_tsc = 0; 2019 2020 while (1) { 2021 cur_tsc = rte_rdtsc(); 2022 2023 /* TX burst queue drain */ 2024 diff_tsc = cur_tsc - prev_tsc; 2025 if (unlikely(diff_tsc > drain_tsc)) { 2026 /* 2027 * Get mbuf from vpool.pool and detach mbuf and 2028 * put back into vpool.ring. 2029 */ 2030 dev_ll = lcore_ll->ll_root_used; 2031 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2032 /* Get virtio device ID */ 2033 vdev = dev_ll->vdev; 2034 dev = vdev->dev; 2035 2036 if (likely(!vdev->remove)) { 2037 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2038 if (tx_q->len) { 2039 LOG_DEBUG(VHOST_DATA, 2040 "TX queue drained after timeout" 2041 " with burst size %u\n", 2042 tx_q->len); 2043 2044 /* 2045 * Tx any packets in the queue 2046 */ 2047 ret = rte_eth_tx_burst( 2048 ports[0], 2049 (uint16_t)tx_q->txq_id, 2050 (struct rte_mbuf **) 2051 tx_q->m_table, 2052 (uint16_t)tx_q->len); 2053 if (unlikely(ret < tx_q->len)) { 2054 do { 2055 rte_pktmbuf_free( 2056 tx_q->m_table[ret]); 2057 } while (++ret < tx_q->len); 2058 } 2059 tx_q->len = 0; 2060 2061 txmbuf_clean_zcp(dev, 2062 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2063 } 2064 } 2065 dev_ll = dev_ll->next; 2066 } 2067 prev_tsc = cur_tsc; 2068 } 2069 2070 rte_prefetch0(lcore_ll->ll_root_used); 2071 2072 /* 2073 * Inform the configuration core that we have exited the linked 2074 * list and that no devices are in use if requested. 2075 */ 2076 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2077 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2078 2079 /* Process devices */ 2080 dev_ll = lcore_ll->ll_root_used; 2081 2082 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2083 vdev = dev_ll->vdev; 2084 dev = vdev->dev; 2085 if (unlikely(vdev->remove)) { 2086 dev_ll = dev_ll->next; 2087 unlink_vmdq(vdev); 2088 vdev->ready = DEVICE_SAFE_REMOVE; 2089 continue; 2090 } 2091 2092 if (likely(vdev->ready == DEVICE_RX)) { 2093 uint32_t index = vdev->vmdq_rx_q; 2094 uint16_t i; 2095 count_in_ring 2096 = rte_ring_count(vpool_array[index].ring); 2097 uint16_t free_entries 2098 = (uint16_t)get_available_ring_num_zcp(dev); 2099 2100 /* 2101 * Attach all mbufs in vpool.ring and put back 2102 * into vpool.pool. 
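			 * The number attached per pass is bounded by the free
			 * entries in the guest RX ring, the mbufs available in
			 * the ring and MAX_PKT_BURST (see the RTE_MIN() limits
			 * below).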
2103 */ 2104 for (i = 0; 2105 i < RTE_MIN(free_entries, 2106 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2107 i++) 2108 attach_rxmbuf_zcp(dev); 2109 2110 /* Handle guest RX */ 2111 rx_count = rte_eth_rx_burst(ports[0], 2112 vdev->vmdq_rx_q, pkts_burst, 2113 MAX_PKT_BURST); 2114 2115 if (rx_count) { 2116 ret_count = virtio_dev_rx_zcp(dev, 2117 pkts_burst, rx_count); 2118 if (enable_stats) { 2119 dev_statistics[dev->device_fh].rx_total 2120 += rx_count; 2121 dev_statistics[dev->device_fh].rx 2122 += ret_count; 2123 } 2124 while (likely(rx_count)) { 2125 rx_count--; 2126 pktmbuf_detach_zcp( 2127 pkts_burst[rx_count]); 2128 rte_ring_sp_enqueue( 2129 vpool_array[index].ring, 2130 (void *)pkts_burst[rx_count]); 2131 } 2132 } 2133 } 2134 2135 if (likely(!vdev->remove)) 2136 /* Handle guest TX */ 2137 virtio_dev_tx_zcp(dev); 2138 2139 /* Move to the next device in the list */ 2140 dev_ll = dev_ll->next; 2141 } 2142 } 2143 2144 return 0; 2145 } 2146 2147 2148 /* 2149 * Add an entry to a used linked list. A free entry must first be found 2150 * in the free linked list using get_data_ll_free_entry(); 2151 */ 2152 static void 2153 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2154 struct virtio_net_data_ll *ll_dev) 2155 { 2156 struct virtio_net_data_ll *ll = *ll_root_addr; 2157 2158 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2159 ll_dev->next = NULL; 2160 rte_compiler_barrier(); 2161 2162 /* If ll == NULL then this is the first device. */ 2163 if (ll) { 2164 /* Increment to the tail of the linked list. */ 2165 while ((ll->next != NULL) ) 2166 ll = ll->next; 2167 2168 ll->next = ll_dev; 2169 } else { 2170 *ll_root_addr = ll_dev; 2171 } 2172 } 2173 2174 /* 2175 * Remove an entry from a used linked list. The entry must then be added to 2176 * the free linked list using put_data_ll_free_entry(). 2177 */ 2178 static void 2179 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2180 struct virtio_net_data_ll *ll_dev, 2181 struct virtio_net_data_ll *ll_dev_last) 2182 { 2183 struct virtio_net_data_ll *ll = *ll_root_addr; 2184 2185 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2186 return; 2187 2188 if (ll_dev == ll) 2189 *ll_root_addr = ll_dev->next; 2190 else 2191 if (likely(ll_dev_last != NULL)) 2192 ll_dev_last->next = ll_dev->next; 2193 else 2194 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2195 } 2196 2197 /* 2198 * Find and return an entry from the free linked list. 2199 */ 2200 static struct virtio_net_data_ll * 2201 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2202 { 2203 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2204 struct virtio_net_data_ll *ll_dev; 2205 2206 if (ll_free == NULL) 2207 return NULL; 2208 2209 ll_dev = ll_free; 2210 *ll_root_addr = ll_free->next; 2211 2212 return ll_dev; 2213 } 2214 2215 /* 2216 * Place an entry back on to the free linked list. 2217 */ 2218 static void 2219 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2220 struct virtio_net_data_ll *ll_dev) 2221 { 2222 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2223 2224 if (ll_dev == NULL) 2225 return; 2226 2227 ll_dev->next = ll_free; 2228 *ll_root_addr = ll_dev; 2229 } 2230 2231 /* 2232 * Creates a linked list of a given size. 2233 */ 2234 static struct virtio_net_data_ll * 2235 alloc_data_ll(uint32_t size) 2236 { 2237 struct virtio_net_data_ll *ll_new; 2238 uint32_t i; 2239 2240 /* Malloc and then chain the linked list. 
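	 * Entry i points to entry i+1 and the final entry's next pointer is
	 * NULL, so the block can later be handed out one entry at a time.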
*/ 2241 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2242 if (ll_new == NULL) { 2243 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2244 return NULL; 2245 } 2246 2247 for (i = 0; i < size - 1; i++) { 2248 ll_new[i].vdev = NULL; 2249 ll_new[i].next = &ll_new[i+1]; 2250 } 2251 ll_new[i].next = NULL; 2252 2253 return (ll_new); 2254 } 2255 2256 /* 2257 * Create the main linked list along with each individual cores linked list. A used and a free list 2258 * are created to manage entries. 2259 */ 2260 static int 2261 init_data_ll (void) 2262 { 2263 int lcore; 2264 2265 RTE_LCORE_FOREACH_SLAVE(lcore) { 2266 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2267 if (lcore_info[lcore].lcore_ll == NULL) { 2268 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2269 return -1; 2270 } 2271 2272 lcore_info[lcore].lcore_ll->device_num = 0; 2273 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2274 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2275 if (num_devices % num_switching_cores) 2276 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2277 else 2278 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2279 } 2280 2281 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2282 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2283 2284 return 0; 2285 } 2286 2287 /* 2288 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2289 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2290 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2291 */ 2292 static void 2293 destroy_device (volatile struct virtio_net *dev) 2294 { 2295 struct virtio_net_data_ll *ll_lcore_dev_cur; 2296 struct virtio_net_data_ll *ll_main_dev_cur; 2297 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2298 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2299 struct vhost_dev *vdev; 2300 int lcore; 2301 2302 dev->flags &= ~VIRTIO_DEV_RUNNING; 2303 2304 vdev = (struct vhost_dev *)dev->priv; 2305 /*set the remove flag. */ 2306 vdev->remove = 1; 2307 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2308 rte_pause(); 2309 } 2310 2311 /* Search for entry to be removed from lcore ll */ 2312 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2313 while (ll_lcore_dev_cur != NULL) { 2314 if (ll_lcore_dev_cur->vdev == vdev) { 2315 break; 2316 } else { 2317 ll_lcore_dev_last = ll_lcore_dev_cur; 2318 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2319 } 2320 } 2321 2322 if (ll_lcore_dev_cur == NULL) { 2323 RTE_LOG(ERR, VHOST_CONFIG, 2324 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2325 dev->device_fh); 2326 return; 2327 } 2328 2329 /* Search for entry to be removed from main ll */ 2330 ll_main_dev_cur = ll_root_used; 2331 ll_main_dev_last = NULL; 2332 while (ll_main_dev_cur != NULL) { 2333 if (ll_main_dev_cur->vdev == vdev) { 2334 break; 2335 } else { 2336 ll_main_dev_last = ll_main_dev_cur; 2337 ll_main_dev_cur = ll_main_dev_cur->next; 2338 } 2339 } 2340 2341 /* Remove entries from the lcore and main ll. */ 2342 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2343 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2344 2345 /* Set the dev_removal_flag on each lcore. 
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can
	 * be sure that they can no longer access the device removed from the
	 * linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
			rte_pause();
		}
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

	if (zero_copy) {
		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

		/* Stop the RX queue. */
		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to stop "
				"rx queue:%d\n",
				dev->device_fh,
				vdev->vmdq_rx_q);
		}

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in destroy_device: Start put mbuf in "
			"mempool back to ring for RX queue: %d\n",
			dev->device_fh, vdev->vmdq_rx_q);

		mbuf_destroy_zcp(vpool);

		/* Stop the TX queue. */
		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to "
				"stop tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);
		}

		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
			dev->device_fh);

		mbuf_destroy_zcp(vpool);
		rte_free(vdev->regions_hpa);
	}
	rte_free(vdev);

}

/*
 * Count the physical-address discontinuities (i.e. the number of extra
 * sub-regions required) within one region whose vhost virtual address range
 * is contiguous. The region starts at vva_start and spans 'size' bytes.
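 *
 * For example (hypothetical values, 4 KB pages): a 16 KB region whose pages
 * map to host physical addresses 0x1000, 0x2000, 0x8000 and 0x9000 has one
 * discontinuity, so the function returns 1 (one extra sub-region).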
2416 */ 2417 static uint32_t 2418 check_hpa_regions(uint64_t vva_start, uint64_t size) 2419 { 2420 uint32_t i, nregions = 0, page_size = getpagesize(); 2421 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2422 if (vva_start % page_size) { 2423 LOG_DEBUG(VHOST_CONFIG, 2424 "in check_countinous: vva start(%p) mod page_size(%d) " 2425 "has remainder\n", 2426 (void *)(uintptr_t)vva_start, page_size); 2427 return 0; 2428 } 2429 if (size % page_size) { 2430 LOG_DEBUG(VHOST_CONFIG, 2431 "in check_countinous: " 2432 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2433 size, page_size); 2434 return 0; 2435 } 2436 for (i = 0; i < size - page_size; i = i + page_size) { 2437 cur_phys_addr 2438 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2439 next_phys_addr = rte_mem_virt2phy( 2440 (void *)(uintptr_t)(vva_start + i + page_size)); 2441 if ((cur_phys_addr + page_size) != next_phys_addr) { 2442 ++nregions; 2443 LOG_DEBUG(VHOST_CONFIG, 2444 "in check_continuous: hva addr:(%p) is not " 2445 "continuous with hva addr:(%p), diff:%d\n", 2446 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2447 (void *)(uintptr_t)(vva_start + (uint64_t)i 2448 + page_size), page_size); 2449 LOG_DEBUG(VHOST_CONFIG, 2450 "in check_continuous: hpa addr:(%p) is not " 2451 "continuous with hpa addr:(%p), " 2452 "diff:(%"PRIu64")\n", 2453 (void *)(uintptr_t)cur_phys_addr, 2454 (void *)(uintptr_t)next_phys_addr, 2455 (next_phys_addr-cur_phys_addr)); 2456 } 2457 } 2458 return nregions; 2459 } 2460 2461 /* 2462 * Divide each region whose vhost virtual address is continous into a few 2463 * sub-regions, make sure the physical address within each sub-region are 2464 * continous. And fill offset(to GPA) and size etc. information of each 2465 * sub-region into regions_hpa. 2466 */ 2467 static uint32_t 2468 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2469 { 2470 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2471 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2472 2473 if (mem_region_hpa == NULL) 2474 return 0; 2475 2476 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2477 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2478 virtio_memory->regions[regionidx].address_offset; 2479 mem_region_hpa[regionidx_hpa].guest_phys_address 2480 = virtio_memory->regions[regionidx].guest_phys_address; 2481 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2482 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2483 mem_region_hpa[regionidx_hpa].guest_phys_address; 2484 LOG_DEBUG(VHOST_CONFIG, 2485 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2486 regionidx_hpa, 2487 (void *)(uintptr_t) 2488 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2489 LOG_DEBUG(VHOST_CONFIG, 2490 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2491 regionidx_hpa, 2492 (void *)(uintptr_t) 2493 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2494 for (i = 0, k = 0; 2495 i < virtio_memory->regions[regionidx].memory_size - 2496 page_size; 2497 i += page_size) { 2498 cur_phys_addr = rte_mem_virt2phy( 2499 (void *)(uintptr_t)(vva_start + i)); 2500 next_phys_addr = rte_mem_virt2phy( 2501 (void *)(uintptr_t)(vva_start + 2502 i + page_size)); 2503 if ((cur_phys_addr + page_size) != next_phys_addr) { 2504 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2505 mem_region_hpa[regionidx_hpa].guest_phys_address + 2506 k + page_size; 2507 mem_region_hpa[regionidx_hpa].memory_size 2508 = k + 
page_size;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
					"phys addr end [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: guest phys addr "
					"size [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].memory_size));
				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
				++regionidx_hpa;
				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
					next_phys_addr -
					mem_region_hpa[regionidx_hpa].guest_phys_address;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
					" phys addr start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: host phys addr "
					"start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
				k = 0;
			} else {
				k += page_size;
			}
		}
		mem_region_hpa[regionidx_hpa].guest_phys_address_end
			= mem_region_hpa[regionidx_hpa].guest_phys_address
			+ k + page_size;
		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].memory_size));
		++regionidx_hpa;
	}
	return regionidx_hpa;
}

/*
 * A new device is added to a data core. First the device is added to the main
 * linked list and then allocated to a specific data core.
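 * The data core chosen is the one currently serving the fewest devices
 * (the RTE_LCORE_FOREACH_SLAVE scan over device_num below).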
2562 */ 2563 static int 2564 new_device (struct virtio_net *dev) 2565 { 2566 struct virtio_net_data_ll *ll_dev; 2567 int lcore, core_add = 0; 2568 uint32_t device_num_min = num_devices; 2569 struct vhost_dev *vdev; 2570 uint32_t regionidx; 2571 2572 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2573 if (vdev == NULL) { 2574 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2575 dev->device_fh); 2576 return -1; 2577 } 2578 vdev->dev = dev; 2579 dev->priv = vdev; 2580 2581 if (zero_copy) { 2582 vdev->nregions_hpa = dev->mem->nregions; 2583 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2584 vdev->nregions_hpa 2585 += check_hpa_regions( 2586 dev->mem->regions[regionidx].guest_phys_address 2587 + dev->mem->regions[regionidx].address_offset, 2588 dev->mem->regions[regionidx].memory_size); 2589 2590 } 2591 2592 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2593 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2594 RTE_CACHE_LINE_SIZE); 2595 if (vdev->regions_hpa == NULL) { 2596 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2597 rte_free(vdev); 2598 return -1; 2599 } 2600 2601 2602 if (fill_hpa_memory_regions( 2603 vdev->regions_hpa, dev->mem 2604 ) != vdev->nregions_hpa) { 2605 2606 RTE_LOG(ERR, VHOST_CONFIG, 2607 "hpa memory regions number mismatch: " 2608 "[%d]\n", vdev->nregions_hpa); 2609 rte_free(vdev->regions_hpa); 2610 rte_free(vdev); 2611 return -1; 2612 } 2613 } 2614 2615 2616 /* Add device to main ll */ 2617 ll_dev = get_data_ll_free_entry(&ll_root_free); 2618 if (ll_dev == NULL) { 2619 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2620 "of %d devices per core has been reached\n", 2621 dev->device_fh, num_devices); 2622 if (vdev->regions_hpa) 2623 rte_free(vdev->regions_hpa); 2624 rte_free(vdev); 2625 return -1; 2626 } 2627 ll_dev->vdev = vdev; 2628 add_data_ll_entry(&ll_root_used, ll_dev); 2629 vdev->vmdq_rx_q 2630 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2631 2632 if (zero_copy) { 2633 uint32_t index = vdev->vmdq_rx_q; 2634 uint32_t count_in_ring, i; 2635 struct mbuf_table *tx_q; 2636 2637 count_in_ring = rte_ring_count(vpool_array[index].ring); 2638 2639 LOG_DEBUG(VHOST_CONFIG, 2640 "(%"PRIu64") in new_device: mbuf count in mempool " 2641 "before attach is: %d\n", 2642 dev->device_fh, 2643 rte_mempool_count(vpool_array[index].pool)); 2644 LOG_DEBUG(VHOST_CONFIG, 2645 "(%"PRIu64") in new_device: mbuf count in ring " 2646 "before attach is : %d\n", 2647 dev->device_fh, count_in_ring); 2648 2649 /* 2650 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
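		 * Every buffer currently sitting in the ring is attached
		 * before the RX and TX queues are started below.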
2651 */ 2652 for (i = 0; i < count_in_ring; i++) 2653 attach_rxmbuf_zcp(dev); 2654 2655 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2656 "mempool after attach is: %d\n", 2657 dev->device_fh, 2658 rte_mempool_count(vpool_array[index].pool)); 2659 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2660 "ring after attach is : %d\n", 2661 dev->device_fh, 2662 rte_ring_count(vpool_array[index].ring)); 2663 2664 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2665 tx_q->txq_id = vdev->vmdq_rx_q; 2666 2667 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2668 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2669 2670 LOG_DEBUG(VHOST_CONFIG, 2671 "(%"PRIu64") In new_device: Failed to start " 2672 "tx queue:%d\n", 2673 dev->device_fh, vdev->vmdq_rx_q); 2674 2675 mbuf_destroy_zcp(vpool); 2676 rte_free(vdev->regions_hpa); 2677 rte_free(vdev); 2678 return -1; 2679 } 2680 2681 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2682 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2683 2684 LOG_DEBUG(VHOST_CONFIG, 2685 "(%"PRIu64") In new_device: Failed to start " 2686 "rx queue:%d\n", 2687 dev->device_fh, vdev->vmdq_rx_q); 2688 2689 /* Stop the TX queue. */ 2690 if (rte_eth_dev_tx_queue_stop(ports[0], 2691 vdev->vmdq_rx_q) != 0) { 2692 LOG_DEBUG(VHOST_CONFIG, 2693 "(%"PRIu64") In new_device: Failed to " 2694 "stop tx queue:%d\n", 2695 dev->device_fh, vdev->vmdq_rx_q); 2696 } 2697 2698 mbuf_destroy_zcp(vpool); 2699 rte_free(vdev->regions_hpa); 2700 rte_free(vdev); 2701 return -1; 2702 } 2703 2704 } 2705 2706 /*reset ready flag*/ 2707 vdev->ready = DEVICE_MAC_LEARNING; 2708 vdev->remove = 0; 2709 2710 /* Find a suitable lcore to add the device. */ 2711 RTE_LCORE_FOREACH_SLAVE(lcore) { 2712 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2713 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2714 core_add = lcore; 2715 } 2716 } 2717 /* Add device to lcore ll */ 2718 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2719 if (ll_dev == NULL) { 2720 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2721 vdev->ready = DEVICE_SAFE_REMOVE; 2722 destroy_device(dev); 2723 if (vdev->regions_hpa) 2724 rte_free(vdev->regions_hpa); 2725 rte_free(vdev); 2726 return -1; 2727 } 2728 ll_dev->vdev = vdev; 2729 vdev->coreid = core_add; 2730 2731 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2732 2733 /* Initialize device stats */ 2734 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2735 2736 /* Disable notifications. */ 2737 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2738 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2739 lcore_info[vdev->coreid].lcore_ll->device_num++; 2740 dev->flags |= VIRTIO_DEV_RUNNING; 2741 2742 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2743 2744 return 0; 2745 } 2746 2747 /* 2748 * These callback allow devices to be added to the data core when configuration 2749 * has been fully complete. 2750 */ 2751 static const struct virtio_net_device_ops virtio_net_device_ops = 2752 { 2753 .new_device = new_device, 2754 .destroy_device = destroy_device, 2755 }; 2756 2757 /* 2758 * This is a thread will wake up after a period to print stats if the user has 2759 * enabled them. 
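 * It sleeps for 'enable_stats' seconds between refreshes and prints the
 * per-device TX/RX totals, drop counts and success counts.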
2760 */ 2761 static void 2762 print_stats(void) 2763 { 2764 struct virtio_net_data_ll *dev_ll; 2765 uint64_t tx_dropped, rx_dropped; 2766 uint64_t tx, tx_total, rx, rx_total; 2767 uint32_t device_fh; 2768 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2769 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2770 2771 while(1) { 2772 sleep(enable_stats); 2773 2774 /* Clear screen and move to top left */ 2775 printf("%s%s", clr, top_left); 2776 2777 printf("\nDevice statistics ===================================="); 2778 2779 dev_ll = ll_root_used; 2780 while (dev_ll != NULL) { 2781 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2782 tx_total = dev_statistics[device_fh].tx_total; 2783 tx = dev_statistics[device_fh].tx; 2784 tx_dropped = tx_total - tx; 2785 if (zero_copy == 0) { 2786 rx_total = rte_atomic64_read( 2787 &dev_statistics[device_fh].rx_total_atomic); 2788 rx = rte_atomic64_read( 2789 &dev_statistics[device_fh].rx_atomic); 2790 } else { 2791 rx_total = dev_statistics[device_fh].rx_total; 2792 rx = dev_statistics[device_fh].rx; 2793 } 2794 rx_dropped = rx_total - rx; 2795 2796 printf("\nStatistics for device %"PRIu32" ------------------------------" 2797 "\nTX total: %"PRIu64"" 2798 "\nTX dropped: %"PRIu64"" 2799 "\nTX successful: %"PRIu64"" 2800 "\nRX total: %"PRIu64"" 2801 "\nRX dropped: %"PRIu64"" 2802 "\nRX successful: %"PRIu64"", 2803 device_fh, 2804 tx_total, 2805 tx_dropped, 2806 tx, 2807 rx_total, 2808 rx_dropped, 2809 rx); 2810 2811 dev_ll = dev_ll->next; 2812 } 2813 printf("\n======================================================\n"); 2814 } 2815 } 2816 2817 static void 2818 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2819 char *ring_name, uint32_t nb_mbuf) 2820 { 2821 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2822 vpool_array[index].pool 2823 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2824 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2825 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2826 rte_pktmbuf_init, NULL, socket, 0); 2827 if (vpool_array[index].pool != NULL) { 2828 vpool_array[index].ring 2829 = rte_ring_create(ring_name, 2830 rte_align32pow2(nb_mbuf + 1), 2831 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2832 if (likely(vpool_array[index].ring != NULL)) { 2833 LOG_DEBUG(VHOST_CONFIG, 2834 "in setup_mempool_tbl: mbuf count in " 2835 "mempool is: %d\n", 2836 rte_mempool_count(vpool_array[index].pool)); 2837 LOG_DEBUG(VHOST_CONFIG, 2838 "in setup_mempool_tbl: mbuf count in " 2839 "ring is: %d\n", 2840 rte_ring_count(vpool_array[index].ring)); 2841 } else { 2842 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2843 ring_name); 2844 } 2845 2846 /* Need consider head room. */ 2847 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2848 } else { 2849 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2850 } 2851 } 2852 2853 2854 /* 2855 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2856 * device is also registered here to handle the IOCTLs. 
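 * Rough flow: EAL init, argument parsing, mbuf pool/ring creation (one pool
 * and ring per queue in zero-copy mode), port init, linked-list init, an
 * optional stats thread, worker launch on every slave lcore, then CUSE
 * registration and the vhost session loop.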
2857 */ 2858 int 2859 main(int argc, char *argv[]) 2860 { 2861 struct rte_mempool *mbuf_pool = NULL; 2862 unsigned lcore_id, core_id = 0; 2863 unsigned nb_ports, valid_num_ports; 2864 int ret; 2865 uint8_t portid; 2866 uint16_t queue_id; 2867 static pthread_t tid; 2868 2869 /* init EAL */ 2870 ret = rte_eal_init(argc, argv); 2871 if (ret < 0) 2872 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2873 argc -= ret; 2874 argv += ret; 2875 2876 /* parse app arguments */ 2877 ret = us_vhost_parse_args(argc, argv); 2878 if (ret < 0) 2879 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2880 2881 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2882 if (rte_lcore_is_enabled(lcore_id)) 2883 lcore_ids[core_id ++] = lcore_id; 2884 2885 if (rte_lcore_count() > RTE_MAX_LCORE) 2886 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2887 2888 /*set the number of swithcing cores available*/ 2889 num_switching_cores = rte_lcore_count()-1; 2890 2891 /* Get the number of physical ports. */ 2892 nb_ports = rte_eth_dev_count(); 2893 if (nb_ports > RTE_MAX_ETHPORTS) 2894 nb_ports = RTE_MAX_ETHPORTS; 2895 2896 /* 2897 * Update the global var NUM_PORTS and global array PORTS 2898 * and get value of var VALID_NUM_PORTS according to system ports number 2899 */ 2900 valid_num_ports = check_ports_num(nb_ports); 2901 2902 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2903 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2904 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2905 return -1; 2906 } 2907 2908 if (zero_copy == 0) { 2909 /* Create the mbuf pool. */ 2910 mbuf_pool = rte_mempool_create( 2911 "MBUF_POOL", 2912 NUM_MBUFS_PER_PORT 2913 * valid_num_ports, 2914 MBUF_SIZE, MBUF_CACHE_SIZE, 2915 sizeof(struct rte_pktmbuf_pool_private), 2916 rte_pktmbuf_pool_init, NULL, 2917 rte_pktmbuf_init, NULL, 2918 rte_socket_id(), 0); 2919 if (mbuf_pool == NULL) 2920 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2921 2922 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2923 vpool_array[queue_id].pool = mbuf_pool; 2924 2925 if (vm2vm_mode == VM2VM_HARDWARE) { 2926 /* Enable VT loop back to let L2 switch to do it. */ 2927 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2928 LOG_DEBUG(VHOST_CONFIG, 2929 "Enable loop back for L2 switch in vmdq.\n"); 2930 } 2931 } else { 2932 uint32_t nb_mbuf; 2933 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2934 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2935 2936 nb_mbuf = num_rx_descriptor 2937 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2938 + num_switching_cores * MAX_PKT_BURST; 2939 2940 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2941 snprintf(pool_name, sizeof(pool_name), 2942 "rxmbuf_pool_%u", queue_id); 2943 snprintf(ring_name, sizeof(ring_name), 2944 "rxmbuf_ring_%u", queue_id); 2945 setup_mempool_tbl(rte_socket_id(), queue_id, 2946 pool_name, ring_name, nb_mbuf); 2947 } 2948 2949 nb_mbuf = num_tx_descriptor 2950 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2951 + num_switching_cores * MAX_PKT_BURST; 2952 2953 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2954 snprintf(pool_name, sizeof(pool_name), 2955 "txmbuf_pool_%u", queue_id); 2956 snprintf(ring_name, sizeof(ring_name), 2957 "txmbuf_ring_%u", queue_id); 2958 setup_mempool_tbl(rte_socket_id(), 2959 (queue_id + MAX_QUEUES), 2960 pool_name, ring_name, nb_mbuf); 2961 } 2962 2963 if (vm2vm_mode == VM2VM_HARDWARE) { 2964 /* Enable VT loop back to let L2 switch to do it. 
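			 * (The NIC then forwards VM-to-VM traffic itself
			 * instead of it being switched in software.)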
*/ 2965 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2966 LOG_DEBUG(VHOST_CONFIG, 2967 "Enable loop back for L2 switch in vmdq.\n"); 2968 } 2969 } 2970 /* Set log level. */ 2971 rte_set_log_level(LOG_LEVEL); 2972 2973 /* initialize all ports */ 2974 for (portid = 0; portid < nb_ports; portid++) { 2975 /* skip ports that are not enabled */ 2976 if ((enabled_port_mask & (1 << portid)) == 0) { 2977 RTE_LOG(INFO, VHOST_PORT, 2978 "Skipping disabled port %d\n", portid); 2979 continue; 2980 } 2981 if (port_init(portid) != 0) 2982 rte_exit(EXIT_FAILURE, 2983 "Cannot initialize network ports\n"); 2984 } 2985 2986 /* Initialise all linked lists. */ 2987 if (init_data_ll() == -1) 2988 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 2989 2990 /* Initialize device stats */ 2991 memset(&dev_statistics, 0, sizeof(dev_statistics)); 2992 2993 /* Enable stats if the user option is set. */ 2994 if (enable_stats) 2995 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 2996 2997 /* Launch all data cores. */ 2998 if (zero_copy == 0) { 2999 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3000 rte_eal_remote_launch(switch_worker, 3001 mbuf_pool, lcore_id); 3002 } 3003 } else { 3004 uint32_t count_in_mempool, index, i; 3005 for (index = 0; index < 2*MAX_QUEUES; index++) { 3006 /* For all RX and TX queues. */ 3007 count_in_mempool 3008 = rte_mempool_count(vpool_array[index].pool); 3009 3010 /* 3011 * Transfer all un-attached mbufs from vpool.pool 3012 * to vpoo.ring. 3013 */ 3014 for (i = 0; i < count_in_mempool; i++) { 3015 struct rte_mbuf *mbuf 3016 = __rte_mbuf_raw_alloc( 3017 vpool_array[index].pool); 3018 rte_ring_sp_enqueue(vpool_array[index].ring, 3019 (void *)mbuf); 3020 } 3021 3022 LOG_DEBUG(VHOST_CONFIG, 3023 "in main: mbuf count in mempool at initial " 3024 "is: %d\n", count_in_mempool); 3025 LOG_DEBUG(VHOST_CONFIG, 3026 "in main: mbuf count in ring at initial is :" 3027 " %d\n", 3028 rte_ring_count(vpool_array[index].ring)); 3029 } 3030 3031 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3032 rte_eal_remote_launch(switch_worker_zcp, NULL, 3033 lcore_id); 3034 } 3035 3036 if (mergeable == 0) 3037 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3038 3039 /* Register CUSE device to handle IOCTLs. */ 3040 ret = rte_vhost_driver_register((char *)&dev_basename); 3041 if (ret != 0) 3042 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3043 3044 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3045 3046 /* Start CUSE session. */ 3047 rte_vhost_driver_session_start(); 3048 return 0; 3049 3050 } 3051 3052
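/*
 * Illustrative invocation (sketch only; the EAL core/memory options and the
 * application option names shown here are assumptions based on
 * us_vhost_parse_args() and may differ in a given build):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net \
 *       --zero-copy 0 --stats 2
 *
 * i.e. run the switching cores on lcores 0-3, enable physical port 0,
 * register the character device under the "vhost-net" basename and refresh
 * the statistics display every 2 seconds.
 */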