/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores*MAX_PKT_BURST) + \
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
				(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM
#define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100		/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15		/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4		/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/*
 * Config_core_flag status definitions.
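 * REQUEST_DEV_REMOVAL asks a data core to stop using a device's linked-list
 * entry; the data core acknowledges by writing back ACK_DEV_REMOVAL.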
 */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, then round down to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMAed directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Do VLAN strip on the host, enabled by default */
static uint32_t vlan_strip = 1;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/*
 * Character device basename.
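 * Defaults to "vhost-net"; overridden with the --dev-basename option.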
 * Can be set by user.
 */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest cannot
		 * forward packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/*
 * Vlan header struct used to insert vlan tags on TX.
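 * The layout matches an 802.1Q tagged Ethernet header: destination MAC,
 * source MAC, VLAN EtherType, TCI, and the encapsulated EtherType.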
 */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, log an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/*
	 * Start the device.
	 */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject basenames that do not fit in the buffer (including the NUL). */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retries are attempted if the destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective when RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Only effective when RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
		"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
		"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
		"used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
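 * Returns 0 on success; prints the usage and returns -1 on an invalid option
 * or an unsupported combination (e.g. zero copy with software vm2vm).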
587 */ 588 static int 589 us_vhost_parse_args(int argc, char **argv) 590 { 591 int opt, ret; 592 int option_index; 593 unsigned i; 594 const char *prgname = argv[0]; 595 static struct option long_option[] = { 596 {"vm2vm", required_argument, NULL, 0}, 597 {"rx-retry", required_argument, NULL, 0}, 598 {"rx-retry-delay", required_argument, NULL, 0}, 599 {"rx-retry-num", required_argument, NULL, 0}, 600 {"mergeable", required_argument, NULL, 0}, 601 {"vlan-strip", required_argument, NULL, 0}, 602 {"stats", required_argument, NULL, 0}, 603 {"dev-basename", required_argument, NULL, 0}, 604 {"zero-copy", required_argument, NULL, 0}, 605 {"rx-desc-num", required_argument, NULL, 0}, 606 {"tx-desc-num", required_argument, NULL, 0}, 607 {NULL, 0, 0, 0}, 608 }; 609 610 /* Parse command line */ 611 while ((opt = getopt_long(argc, argv, "p:P", 612 long_option, &option_index)) != EOF) { 613 switch (opt) { 614 /* Portmask */ 615 case 'p': 616 enabled_port_mask = parse_portmask(optarg); 617 if (enabled_port_mask == 0) { 618 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 619 us_vhost_usage(prgname); 620 return -1; 621 } 622 break; 623 624 case 'P': 625 promiscuous = 1; 626 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 627 ETH_VMDQ_ACCEPT_BROADCAST | 628 ETH_VMDQ_ACCEPT_MULTICAST; 629 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 630 631 break; 632 633 case 0: 634 /* Enable/disable vm2vm comms. */ 635 if (!strncmp(long_option[option_index].name, "vm2vm", 636 MAX_LONG_OPT_SZ)) { 637 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 638 if (ret == -1) { 639 RTE_LOG(INFO, VHOST_CONFIG, 640 "Invalid argument for " 641 "vm2vm [0|1|2]\n"); 642 us_vhost_usage(prgname); 643 return -1; 644 } else { 645 vm2vm_mode = (vm2vm_type)ret; 646 } 647 } 648 649 /* Enable/disable retries on RX. */ 650 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 651 ret = parse_num_opt(optarg, 1); 652 if (ret == -1) { 653 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 654 us_vhost_usage(prgname); 655 return -1; 656 } else { 657 enable_retry = ret; 658 } 659 } 660 661 /* Specify the retries delay time (in useconds) on RX. */ 662 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 663 ret = parse_num_opt(optarg, INT32_MAX); 664 if (ret == -1) { 665 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 666 us_vhost_usage(prgname); 667 return -1; 668 } else { 669 burst_rx_delay_time = ret; 670 } 671 } 672 673 /* Specify the retries number on RX. */ 674 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 675 ret = parse_num_opt(optarg, INT32_MAX); 676 if (ret == -1) { 677 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 678 us_vhost_usage(prgname); 679 return -1; 680 } else { 681 burst_rx_retry_num = ret; 682 } 683 } 684 685 /* Enable/disable RX mergeable buffers. */ 686 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 687 ret = parse_num_opt(optarg, 1); 688 if (ret == -1) { 689 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 690 us_vhost_usage(prgname); 691 return -1; 692 } else { 693 mergeable = !!ret; 694 if (ret) { 695 vmdq_conf_default.rxmode.jumbo_frame = 1; 696 vmdq_conf_default.rxmode.max_rx_pkt_len 697 = JUMBO_FRAME_MAX_SIZE; 698 } 699 } 700 } 701 702 /* Enable/disable RX VLAN strip on host. 
			 */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/*
		 * Invalid option - print options.
		 */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global variable num_ports and the ports array according to the
 * number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
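 * The returned addr_type reports whether the buffer is physically
 * continuous, crosses a sub-region boundary, or could not be translated.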
876 */ 877 static inline uint64_t __attribute__((always_inline)) 878 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 879 uint32_t buf_len, hpa_type *addr_type) 880 { 881 struct virtio_memory_regions_hpa *region; 882 uint32_t regionidx; 883 uint64_t vhost_pa = 0; 884 885 *addr_type = PHYS_ADDR_INVALID; 886 887 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 888 region = &vdev->regions_hpa[regionidx]; 889 if ((guest_pa >= region->guest_phys_address) && 890 (guest_pa <= region->guest_phys_address_end)) { 891 vhost_pa = region->host_phys_addr_offset + guest_pa; 892 if (likely((guest_pa + buf_len - 1) 893 <= region->guest_phys_address_end)) 894 *addr_type = PHYS_ADDR_CONTINUOUS; 895 else 896 *addr_type = PHYS_ADDR_CROSS_SUBREG; 897 break; 898 } 899 } 900 901 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 902 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 903 (void *)(uintptr_t)vhost_pa); 904 905 return vhost_pa; 906 } 907 908 /* 909 * Compares a packet destination MAC address to a device MAC address. 910 */ 911 static inline int __attribute__((always_inline)) 912 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 913 { 914 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 915 } 916 917 /* 918 * This function learns the MAC address of the device and registers this along with a 919 * vlan tag to a VMDQ. 920 */ 921 static int 922 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 923 { 924 struct ether_hdr *pkt_hdr; 925 struct virtio_net_data_ll *dev_ll; 926 struct virtio_net *dev = vdev->dev; 927 int i, ret; 928 929 /* Learn MAC address of guest device from packet */ 930 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 931 932 dev_ll = ll_root_used; 933 934 while (dev_ll != NULL) { 935 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 936 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 937 return -1; 938 } 939 dev_ll = dev_ll->next; 940 } 941 942 for (i = 0; i < ETHER_ADDR_LEN; i++) 943 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 944 945 /* vlan_tag currently uses the device_id. */ 946 vdev->vlan_tag = vlan_tags[dev->device_fh]; 947 948 /* Print out VMDQ registration info. */ 949 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 950 dev->device_fh, 951 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 952 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 953 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 954 vdev->vlan_tag); 955 956 /* Register the MAC address. */ 957 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 958 (uint32_t)dev->device_fh + vmdq_pool_base); 959 if (ret) 960 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 961 dev->device_fh); 962 963 /* Enable stripping of the vlan tag as we handle routing. */ 964 if (vlan_strip) 965 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 966 (uint16_t)vdev->vmdq_rx_q, 1); 967 968 /* Set device as ready for RX. */ 969 vdev->ready = DEVICE_RX; 970 971 return 0; 972 } 973 974 /* 975 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 976 * queue before disabling RX on the device. 
977 */ 978 static inline void 979 unlink_vmdq(struct vhost_dev *vdev) 980 { 981 unsigned i = 0; 982 unsigned rx_count; 983 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 984 985 if (vdev->ready == DEVICE_RX) { 986 /*clear MAC and VLAN settings*/ 987 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 988 for (i = 0; i < 6; i++) 989 vdev->mac_address.addr_bytes[i] = 0; 990 991 vdev->vlan_tag = 0; 992 993 /*Clear out the receive buffers*/ 994 rx_count = rte_eth_rx_burst(ports[0], 995 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 996 997 while (rx_count) { 998 for (i = 0; i < rx_count; i++) 999 rte_pktmbuf_free(pkts_burst[i]); 1000 1001 rx_count = rte_eth_rx_burst(ports[0], 1002 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1003 } 1004 1005 vdev->ready = DEVICE_MAC_LEARNING; 1006 } 1007 } 1008 1009 /* 1010 * Check if the packet destination MAC address is for a local device. If so then put 1011 * the packet on that devices RX queue. If not then return. 1012 */ 1013 static inline int __attribute__((always_inline)) 1014 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1015 { 1016 struct virtio_net_data_ll *dev_ll; 1017 struct ether_hdr *pkt_hdr; 1018 uint64_t ret = 0; 1019 struct virtio_net *dev = vdev->dev; 1020 struct virtio_net *tdev; /* destination virito device */ 1021 1022 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1023 1024 /*get the used devices list*/ 1025 dev_ll = ll_root_used; 1026 1027 while (dev_ll != NULL) { 1028 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1029 &dev_ll->vdev->mac_address)) { 1030 1031 /* Drop the packet if the TX packet is destined for the TX device. */ 1032 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1033 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1034 dev->device_fh); 1035 return 0; 1036 } 1037 tdev = dev_ll->vdev->dev; 1038 1039 1040 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1041 1042 if (unlikely(dev_ll->vdev->remove)) { 1043 /*drop the packet if the device is marked for removal*/ 1044 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1045 } else { 1046 /*send the packet to the local virtio device*/ 1047 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1048 if (enable_stats) { 1049 rte_atomic64_add( 1050 &dev_statistics[tdev->device_fh].rx_total_atomic, 1051 1); 1052 rte_atomic64_add( 1053 &dev_statistics[tdev->device_fh].rx_atomic, 1054 ret); 1055 dev_statistics[dev->device_fh].tx_total++; 1056 dev_statistics[dev->device_fh].tx += ret; 1057 } 1058 } 1059 1060 return 0; 1061 } 1062 dev_ll = dev_ll->next; 1063 } 1064 1065 return -1; 1066 } 1067 1068 /* 1069 * Check if the destination MAC of a packet is one local VM, 1070 * and get its vlan tag, and offset if it is. 1071 */ 1072 static inline int __attribute__((always_inline)) 1073 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1074 uint32_t *offset, uint16_t *vlan_tag) 1075 { 1076 struct virtio_net_data_ll *dev_ll = ll_root_used; 1077 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1078 1079 while (dev_ll != NULL) { 1080 if ((dev_ll->vdev->ready == DEVICE_RX) 1081 && ether_addr_cmp(&(pkt_hdr->d_addr), 1082 &dev_ll->vdev->mac_address)) { 1083 /* 1084 * Drop the packet if the TX packet is 1085 * destined for the TX device. 
1086 */ 1087 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1088 LOG_DEBUG(VHOST_DATA, 1089 "(%"PRIu64") TX: Source and destination" 1090 " MAC addresses are the same. Dropping " 1091 "packet.\n", 1092 dev_ll->vdev->dev->device_fh); 1093 return -1; 1094 } 1095 1096 /* 1097 * HW vlan strip will reduce the packet length 1098 * by minus length of vlan tag, so need restore 1099 * the packet length by plus it. 1100 */ 1101 *offset = VLAN_HLEN; 1102 *vlan_tag = 1103 (uint16_t) 1104 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1105 1106 LOG_DEBUG(VHOST_DATA, 1107 "(%"PRIu64") TX: pkt to local VM device id:" 1108 "(%"PRIu64") vlan tag: %d.\n", 1109 dev->device_fh, dev_ll->vdev->dev->device_fh, 1110 (int)*vlan_tag); 1111 1112 break; 1113 } 1114 dev_ll = dev_ll->next; 1115 } 1116 return 0; 1117 } 1118 1119 /* 1120 * This function routes the TX packet to the correct interface. This may be a local device 1121 * or the physical port. 1122 */ 1123 static inline void __attribute__((always_inline)) 1124 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1125 { 1126 struct mbuf_table *tx_q; 1127 struct rte_mbuf **m_table; 1128 unsigned len, ret, offset = 0; 1129 const uint16_t lcore_id = rte_lcore_id(); 1130 struct virtio_net *dev = vdev->dev; 1131 struct ether_hdr *nh; 1132 1133 /*check if destination is local VM*/ 1134 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1135 rte_pktmbuf_free(m); 1136 return; 1137 } 1138 1139 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1140 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1141 rte_pktmbuf_free(m); 1142 return; 1143 } 1144 } 1145 1146 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1147 1148 /*Add packet to the port tx queue*/ 1149 tx_q = &lcore_tx_queue[lcore_id]; 1150 len = tx_q->len; 1151 1152 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1153 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1154 /* Guest has inserted the vlan tag. */ 1155 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1156 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1157 if ((vm2vm_mode == VM2VM_HARDWARE) && 1158 (vh->vlan_tci != vlan_tag_be)) 1159 vh->vlan_tci = vlan_tag_be; 1160 } else { 1161 m->ol_flags = PKT_TX_VLAN_PKT; 1162 1163 /* 1164 * Find the right seg to adjust the data len when offset is 1165 * bigger than tail room size. 1166 */ 1167 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1168 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1169 m->data_len += offset; 1170 else { 1171 struct rte_mbuf *seg = m; 1172 1173 while ((seg->next != NULL) && 1174 (offset > rte_pktmbuf_tailroom(seg))) 1175 seg = seg->next; 1176 1177 seg->data_len += offset; 1178 } 1179 m->pkt_len += offset; 1180 } 1181 1182 m->vlan_tci = vlan_tag; 1183 } 1184 1185 tx_q->m_table[len] = m; 1186 len++; 1187 if (enable_stats) { 1188 dev_statistics[dev->device_fh].tx_total++; 1189 dev_statistics[dev->device_fh].tx++; 1190 } 1191 1192 if (unlikely(len == MAX_PKT_BURST)) { 1193 m_table = (struct rte_mbuf **)tx_q->m_table; 1194 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1195 /* Free any buffers not handled by TX and update the port stats. */ 1196 if (unlikely(ret < len)) { 1197 do { 1198 rte_pktmbuf_free(m_table[ret]); 1199 } while (++ret < len); 1200 } 1201 1202 len = 0; 1203 } 1204 1205 tx_q->len = len; 1206 return; 1207 } 1208 /* 1209 * This function is called by each data core. 
 * It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
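		 * (A REQUEST_DEV_REMOVAL flag is answered here with ACK_DEV_REMOVAL.)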
1276 */ 1277 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1278 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1279 1280 /* 1281 * Process devices 1282 */ 1283 dev_ll = lcore_ll->ll_root_used; 1284 1285 while (dev_ll != NULL) { 1286 /*get virtio device ID*/ 1287 vdev = dev_ll->vdev; 1288 dev = vdev->dev; 1289 1290 if (unlikely(vdev->remove)) { 1291 dev_ll = dev_ll->next; 1292 unlink_vmdq(vdev); 1293 vdev->ready = DEVICE_SAFE_REMOVE; 1294 continue; 1295 } 1296 if (likely(vdev->ready == DEVICE_RX)) { 1297 /*Handle guest RX*/ 1298 rx_count = rte_eth_rx_burst(ports[0], 1299 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1300 1301 if (rx_count) { 1302 /* 1303 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1304 * Here MAX_PKT_BURST must be less than virtio queue size 1305 */ 1306 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1307 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1308 rte_delay_us(burst_rx_delay_time); 1309 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1310 break; 1311 } 1312 } 1313 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1314 if (enable_stats) { 1315 rte_atomic64_add( 1316 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1317 rx_count); 1318 rte_atomic64_add( 1319 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1320 } 1321 while (likely(rx_count)) { 1322 rx_count--; 1323 rte_pktmbuf_free(pkts_burst[rx_count]); 1324 } 1325 1326 } 1327 } 1328 1329 if (likely(!vdev->remove)) { 1330 /* Handle guest TX*/ 1331 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1332 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1333 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1334 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1335 while (tx_count) 1336 rte_pktmbuf_free(pkts_burst[--tx_count]); 1337 } 1338 } 1339 while (tx_count) 1340 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1341 } 1342 1343 /*move to the next device in the list*/ 1344 dev_ll = dev_ll->next; 1345 } 1346 } 1347 1348 return 0; 1349 } 1350 1351 /* 1352 * This function gets available ring number for zero copy rx. 1353 * Only one thread will call this funciton for a paticular virtio device, 1354 * so, it is designed as non-thread-safe function. 1355 */ 1356 static inline uint32_t __attribute__((always_inline)) 1357 get_available_ring_num_zcp(struct virtio_net *dev) 1358 { 1359 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1360 uint16_t avail_idx; 1361 1362 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1363 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1364 } 1365 1366 /* 1367 * This function gets available ring index for zero copy rx, 1368 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1369 * Only one thread will call this funciton for a paticular virtio device, 1370 * so, it is designed as non-thread-safe function. 
1371 */ 1372 static inline uint32_t __attribute__((always_inline)) 1373 get_available_ring_index_zcp(struct virtio_net *dev, 1374 uint16_t *res_base_idx, uint32_t count) 1375 { 1376 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1377 uint16_t avail_idx; 1378 uint32_t retry = 0; 1379 uint16_t free_entries; 1380 1381 *res_base_idx = vq->last_used_idx_res; 1382 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1383 free_entries = (avail_idx - *res_base_idx); 1384 1385 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1386 "avail idx: %d, " 1387 "res base idx:%d, free entries:%d\n", 1388 dev->device_fh, avail_idx, *res_base_idx, 1389 free_entries); 1390 1391 /* 1392 * If retry is enabled and the queue is full then we wait 1393 * and retry to avoid packet loss. 1394 */ 1395 if (enable_retry && unlikely(count > free_entries)) { 1396 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1397 rte_delay_us(burst_rx_delay_time); 1398 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1399 free_entries = (avail_idx - *res_base_idx); 1400 if (count <= free_entries) 1401 break; 1402 } 1403 } 1404 1405 /*check that we have enough buffers*/ 1406 if (unlikely(count > free_entries)) 1407 count = free_entries; 1408 1409 if (unlikely(count == 0)) { 1410 LOG_DEBUG(VHOST_DATA, 1411 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1412 "avail idx: %d, res base idx:%d, free entries:%d\n", 1413 dev->device_fh, avail_idx, 1414 *res_base_idx, free_entries); 1415 return 0; 1416 } 1417 1418 vq->last_used_idx_res = *res_base_idx + count; 1419 1420 return count; 1421 } 1422 1423 /* 1424 * This function put descriptor back to used list. 1425 */ 1426 static inline void __attribute__((always_inline)) 1427 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1428 { 1429 uint16_t res_cur_idx = vq->last_used_idx; 1430 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1431 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1432 rte_compiler_barrier(); 1433 *(volatile uint16_t *)&vq->used->idx += 1; 1434 vq->last_used_idx += 1; 1435 1436 /* Kick the guest if necessary. */ 1437 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1438 eventfd_write(vq->callfd, (eventfd_t)1); 1439 } 1440 1441 /* 1442 * This function get available descriptor from vitio vring and un-attached mbuf 1443 * from vpool->ring, and then attach them together. It needs adjust the offset 1444 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1445 * frame data may be put to wrong location in mbuf. 
1446 */ 1447 static inline void __attribute__((always_inline)) 1448 attach_rxmbuf_zcp(struct virtio_net *dev) 1449 { 1450 uint16_t res_base_idx, desc_idx; 1451 uint64_t buff_addr, phys_addr; 1452 struct vhost_virtqueue *vq; 1453 struct vring_desc *desc; 1454 void *obj = NULL; 1455 struct rte_mbuf *mbuf; 1456 struct vpool *vpool; 1457 hpa_type addr_type; 1458 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1459 1460 vpool = &vpool_array[vdev->vmdq_rx_q]; 1461 vq = dev->virtqueue[VIRTIO_RXQ]; 1462 1463 do { 1464 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1465 1) != 1)) 1466 return; 1467 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1468 1469 desc = &vq->desc[desc_idx]; 1470 if (desc->flags & VRING_DESC_F_NEXT) { 1471 desc = &vq->desc[desc->next]; 1472 buff_addr = gpa_to_vva(dev, desc->addr); 1473 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1474 &addr_type); 1475 } else { 1476 buff_addr = gpa_to_vva(dev, 1477 desc->addr + vq->vhost_hlen); 1478 phys_addr = gpa_to_hpa(vdev, 1479 desc->addr + vq->vhost_hlen, 1480 desc->len, &addr_type); 1481 } 1482 1483 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1484 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1485 " address found when attaching RX frame buffer" 1486 " address!\n", dev->device_fh); 1487 put_desc_to_used_list_zcp(vq, desc_idx); 1488 continue; 1489 } 1490 1491 /* 1492 * Check if the frame buffer address from guest crosses 1493 * sub-region or not. 1494 */ 1495 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1496 RTE_LOG(ERR, VHOST_DATA, 1497 "(%"PRIu64") Frame buffer address cross " 1498 "sub-regioin found when attaching RX frame " 1499 "buffer address!\n", 1500 dev->device_fh); 1501 put_desc_to_used_list_zcp(vq, desc_idx); 1502 continue; 1503 } 1504 } while (unlikely(phys_addr == 0)); 1505 1506 rte_ring_sc_dequeue(vpool->ring, &obj); 1507 mbuf = obj; 1508 if (unlikely(mbuf == NULL)) { 1509 LOG_DEBUG(VHOST_DATA, 1510 "(%"PRIu64") in attach_rxmbuf_zcp: " 1511 "ring_sc_dequeue fail.\n", 1512 dev->device_fh); 1513 put_desc_to_used_list_zcp(vq, desc_idx); 1514 return; 1515 } 1516 1517 if (unlikely(vpool->buf_size > desc->len)) { 1518 LOG_DEBUG(VHOST_DATA, 1519 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1520 "length(%d) of descriptor idx: %d less than room " 1521 "size required: %d\n", 1522 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1523 put_desc_to_used_list_zcp(vq, desc_idx); 1524 rte_ring_sp_enqueue(vpool->ring, obj); 1525 return; 1526 } 1527 1528 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1529 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1530 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1531 mbuf->data_len = desc->len; 1532 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1533 1534 LOG_DEBUG(VHOST_DATA, 1535 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1536 "descriptor idx:%d\n", 1537 dev->device_fh, res_base_idx, desc_idx); 1538 1539 __rte_mbuf_raw_free(mbuf); 1540 1541 return; 1542 } 1543 1544 /* 1545 * Detach an attched packet mbuf - 1546 * - restore original mbuf address and length values. 1547 * - reset pktmbuf data and data_len to their default values. 1548 * All other fields of the given packet mbuf will be left intact. 1549 * 1550 * @param m 1551 * The attached packet mbuf. 
1552 */ 1553 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1554 { 1555 const struct rte_mempool *mp = m->pool; 1556 void *buf = rte_mbuf_to_baddr(m); 1557 uint32_t buf_ofs; 1558 uint32_t buf_len = mp->elt_size - sizeof(*m); 1559 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1560 1561 m->buf_addr = buf; 1562 m->buf_len = (uint16_t)buf_len; 1563 1564 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1565 RTE_PKTMBUF_HEADROOM : m->buf_len; 1566 m->data_off = buf_ofs; 1567 1568 m->data_len = 0; 1569 } 1570 1571 /* 1572 * This function is called after packets have been transimited. It fetchs mbuf 1573 * from vpool->pool, detached it and put into vpool->ring. It also update the 1574 * used index and kick the guest if necessary. 1575 */ 1576 static inline uint32_t __attribute__((always_inline)) 1577 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1578 { 1579 struct rte_mbuf *mbuf; 1580 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1581 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1582 uint32_t index = 0; 1583 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1584 1585 LOG_DEBUG(VHOST_DATA, 1586 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1587 "clean is: %d\n", 1588 dev->device_fh, mbuf_count); 1589 LOG_DEBUG(VHOST_DATA, 1590 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1591 "clean is : %d\n", 1592 dev->device_fh, rte_ring_count(vpool->ring)); 1593 1594 for (index = 0; index < mbuf_count; index++) { 1595 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1596 if (likely(MBUF_EXT_MEM(mbuf))) 1597 pktmbuf_detach_zcp(mbuf); 1598 rte_ring_sp_enqueue(vpool->ring, mbuf); 1599 1600 /* Update used index buffer information. */ 1601 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1602 vq->used->ring[used_idx].len = 0; 1603 1604 used_idx = (used_idx + 1) & (vq->size - 1); 1605 } 1606 1607 LOG_DEBUG(VHOST_DATA, 1608 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1609 "clean is: %d\n", 1610 dev->device_fh, rte_mempool_count(vpool->pool)); 1611 LOG_DEBUG(VHOST_DATA, 1612 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1613 "clean is : %d\n", 1614 dev->device_fh, rte_ring_count(vpool->ring)); 1615 LOG_DEBUG(VHOST_DATA, 1616 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1617 "vq->last_used_idx:%d\n", 1618 dev->device_fh, vq->last_used_idx); 1619 1620 vq->last_used_idx += mbuf_count; 1621 1622 LOG_DEBUG(VHOST_DATA, 1623 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1624 "vq->last_used_idx:%d\n", 1625 dev->device_fh, vq->last_used_idx); 1626 1627 rte_compiler_barrier(); 1628 1629 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1630 1631 /* Kick guest if required. */ 1632 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1633 eventfd_write(vq->callfd, (eventfd_t)1); 1634 1635 return 0; 1636 } 1637 1638 /* 1639 * This function is called when a virtio device is destroy. 1640 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1641 */ 1642 static void mbuf_destroy_zcp(struct vpool *vpool) 1643 { 1644 struct rte_mbuf *mbuf = NULL; 1645 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1646 1647 LOG_DEBUG(VHOST_CONFIG, 1648 "in mbuf_destroy_zcp: mbuf count in mempool before " 1649 "mbuf_destroy_zcp is: %d\n", 1650 mbuf_count); 1651 LOG_DEBUG(VHOST_CONFIG, 1652 "in mbuf_destroy_zcp: mbuf count in ring before " 1653 "mbuf_destroy_zcp is : %d\n", 1654 rte_ring_count(vpool->ring)); 1655 1656 for (index = 0; index < mbuf_count; index++) { 1657 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1658 if (likely(mbuf != NULL)) { 1659 if (likely(MBUF_EXT_MEM(mbuf))) 1660 pktmbuf_detach_zcp(mbuf); 1661 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1662 } 1663 } 1664 1665 LOG_DEBUG(VHOST_CONFIG, 1666 "in mbuf_destroy_zcp: mbuf count in mempool after " 1667 "mbuf_destroy_zcp is: %d\n", 1668 rte_mempool_count(vpool->pool)); 1669 LOG_DEBUG(VHOST_CONFIG, 1670 "in mbuf_destroy_zcp: mbuf count in ring after " 1671 "mbuf_destroy_zcp is : %d\n", 1672 rte_ring_count(vpool->ring)); 1673 } 1674 1675 /* 1676 * This function update the use flag and counter. 1677 */ 1678 static inline uint32_t __attribute__((always_inline)) 1679 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1680 uint32_t count) 1681 { 1682 struct vhost_virtqueue *vq; 1683 struct vring_desc *desc; 1684 struct rte_mbuf *buff; 1685 /* The virtio_hdr is initialised to 0. */ 1686 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1687 = {{0, 0, 0, 0, 0, 0}, 0}; 1688 uint64_t buff_hdr_addr = 0; 1689 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1690 uint32_t head_idx, packet_success = 0; 1691 uint16_t res_cur_idx; 1692 1693 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1694 1695 if (count == 0) 1696 return 0; 1697 1698 vq = dev->virtqueue[VIRTIO_RXQ]; 1699 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1700 1701 res_cur_idx = vq->last_used_idx; 1702 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1703 dev->device_fh, res_cur_idx, res_cur_idx + count); 1704 1705 /* Retrieve all of the head indexes first to avoid caching issues. */ 1706 for (head_idx = 0; head_idx < count; head_idx++) 1707 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1708 1709 /*Prefetch descriptor index. */ 1710 rte_prefetch0(&vq->desc[head[packet_success]]); 1711 1712 while (packet_success != count) { 1713 /* Get descriptor from available ring */ 1714 desc = &vq->desc[head[packet_success]]; 1715 1716 buff = pkts[packet_success]; 1717 LOG_DEBUG(VHOST_DATA, 1718 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1719 "pkt[%d] descriptor idx: %d\n", 1720 dev->device_fh, packet_success, 1721 MBUF_HEADROOM_UINT32(buff)); 1722 1723 PRINT_PACKET(dev, 1724 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1725 + RTE_PKTMBUF_HEADROOM), 1726 rte_pktmbuf_data_len(buff), 0); 1727 1728 /* Buffer address translation for virtio header. */ 1729 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1730 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1731 1732 /* 1733 * If the descriptors are chained the header and data are 1734 * placed in separate buffers. 
1735 */ 1736 if (desc->flags & VRING_DESC_F_NEXT) { 1737 desc->len = vq->vhost_hlen; 1738 desc = &vq->desc[desc->next]; 1739 desc->len = rte_pktmbuf_data_len(buff); 1740 } else { 1741 desc->len = packet_len; 1742 } 1743 1744 /* Update used ring with desc information */ 1745 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1746 = head[packet_success]; 1747 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1748 = packet_len; 1749 res_cur_idx++; 1750 packet_success++; 1751 1752 /* A header is required per buffer. */ 1753 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1754 (const void *)&virtio_hdr, vq->vhost_hlen); 1755 1756 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1757 1758 if (likely(packet_success < count)) { 1759 /* Prefetch descriptor index. */ 1760 rte_prefetch0(&vq->desc[head[packet_success]]); 1761 } 1762 } 1763 1764 rte_compiler_barrier(); 1765 1766 LOG_DEBUG(VHOST_DATA, 1767 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1768 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1769 dev->device_fh, vq->last_used_idx, vq->used->idx); 1770 1771 *(volatile uint16_t *)&vq->used->idx += count; 1772 vq->last_used_idx += count; 1773 1774 LOG_DEBUG(VHOST_DATA, 1775 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1776 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1777 dev->device_fh, vq->last_used_idx, vq->used->idx); 1778 1779 /* Kick the guest if necessary. */ 1780 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1781 eventfd_write(vq->callfd, (eventfd_t)1); 1782 1783 return count; 1784 } 1785 1786 /* 1787 * This function routes the TX packet to the correct interface. 1788 * This may be a local device or the physical port. 1789 */ 1790 static inline void __attribute__((always_inline)) 1791 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1792 uint32_t desc_idx, uint8_t need_copy) 1793 { 1794 struct mbuf_table *tx_q; 1795 struct rte_mbuf **m_table; 1796 void *obj = NULL; 1797 struct rte_mbuf *mbuf; 1798 unsigned len, ret, offset = 0; 1799 struct vpool *vpool; 1800 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1801 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1802 1803 /*Add packet to the port tx queue*/ 1804 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1805 len = tx_q->len; 1806 1807 /* Allocate an mbuf and populate the structure. */ 1808 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1809 rte_ring_sc_dequeue(vpool->ring, &obj); 1810 mbuf = obj; 1811 if (unlikely(mbuf == NULL)) { 1812 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1813 RTE_LOG(ERR, VHOST_DATA, 1814 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1815 dev->device_fh); 1816 put_desc_to_used_list_zcp(vq, desc_idx); 1817 return; 1818 } 1819 1820 if (vm2vm_mode == VM2VM_HARDWARE) { 1821 /* Avoid using a vlan tag from any vm for external pkt, such as 1822 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1823 * selection, MAC address determines it as an external pkt 1824 * which should go to network, while vlan tag determine it as 1825 * a vm2vm pkt should forward to another vm. Hardware confuse 1826 * such a ambiguous situation, so pkt will lost. 
1827 */ 1828 vlan_tag = external_pkt_default_vlan_tag; 1829 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1830 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1831 __rte_mbuf_raw_free(mbuf); 1832 return; 1833 } 1834 } 1835 1836 mbuf->nb_segs = m->nb_segs; 1837 mbuf->next = m->next; 1838 mbuf->data_len = m->data_len + offset; 1839 mbuf->pkt_len = mbuf->data_len; 1840 if (unlikely(need_copy)) { 1841 /* Copy the packet contents to the mbuf. */ 1842 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1843 rte_pktmbuf_mtod(m, void *), 1844 m->data_len); 1845 } else { 1846 mbuf->data_off = m->data_off; 1847 mbuf->buf_physaddr = m->buf_physaddr; 1848 mbuf->buf_addr = m->buf_addr; 1849 } 1850 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1851 mbuf->vlan_tci = vlan_tag; 1852 mbuf->l2_len = sizeof(struct ether_hdr); 1853 mbuf->l3_len = sizeof(struct ipv4_hdr); 1854 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1855 1856 tx_q->m_table[len] = mbuf; 1857 len++; 1858 1859 LOG_DEBUG(VHOST_DATA, 1860 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1861 dev->device_fh, 1862 mbuf->nb_segs, 1863 (mbuf->next == NULL) ? "null" : "non-null"); 1864 1865 if (enable_stats) { 1866 dev_statistics[dev->device_fh].tx_total++; 1867 dev_statistics[dev->device_fh].tx++; 1868 } 1869 1870 if (unlikely(len == MAX_PKT_BURST)) { 1871 m_table = (struct rte_mbuf **)tx_q->m_table; 1872 ret = rte_eth_tx_burst(ports[0], 1873 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1874 1875 /* 1876 * Free any buffers not handled by TX and update 1877 * the port stats. 1878 */ 1879 if (unlikely(ret < len)) { 1880 do { 1881 rte_pktmbuf_free(m_table[ret]); 1882 } while (++ret < len); 1883 } 1884 1885 len = 0; 1886 txmbuf_clean_zcp(dev, vpool); 1887 } 1888 1889 tx_q->len = len; 1890 1891 return; 1892 } 1893 1894 /* 1895 * This function TX all available packets in virtio TX queue for one 1896 * virtio-net device. If it is first packet, it learns MAC address and 1897 * setup VMDQ. 1898 */ 1899 static inline void __attribute__((always_inline)) 1900 virtio_dev_tx_zcp(struct virtio_net *dev) 1901 { 1902 struct rte_mbuf m; 1903 struct vhost_virtqueue *vq; 1904 struct vring_desc *desc; 1905 uint64_t buff_addr = 0, phys_addr; 1906 uint32_t head[MAX_PKT_BURST]; 1907 uint32_t i; 1908 uint16_t free_entries, packet_success = 0; 1909 uint16_t avail_idx; 1910 uint8_t need_copy = 0; 1911 hpa_type addr_type; 1912 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1913 1914 vq = dev->virtqueue[VIRTIO_TXQ]; 1915 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1916 1917 /* If there are no available buffers then return. */ 1918 if (vq->last_used_idx_res == avail_idx) 1919 return; 1920 1921 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1922 1923 /* Prefetch available ring to retrieve head indexes. */ 1924 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1925 1926 /* Get the number of free entries in the ring */ 1927 free_entries = (avail_idx - vq->last_used_idx_res); 1928 1929 /* Limit to MAX_PKT_BURST. */ 1930 free_entries 1931 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1932 1933 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1934 dev->device_fh, free_entries); 1935 1936 /* Retrieve all of the head indexes first to avoid caching issues. */ 1937 for (i = 0; i < free_entries; i++) 1938 head[i] 1939 = vq->avail->ring[(vq->last_used_idx_res + i) 1940 & (vq->size - 1)]; 1941 1942 vq->last_used_idx_res += free_entries; 1943 1944 /* Prefetch descriptor index. 
*/ 1945 rte_prefetch0(&vq->desc[head[packet_success]]); 1946 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1947 1948 while (packet_success < free_entries) { 1949 desc = &vq->desc[head[packet_success]]; 1950 1951 /* Discard first buffer as it is the virtio header */ 1952 desc = &vq->desc[desc->next]; 1953 1954 /* Buffer address translation. */ 1955 buff_addr = gpa_to_vva(dev, desc->addr); 1956 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1957 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1958 &addr_type); 1959 1960 if (likely(packet_success < (free_entries - 1))) 1961 /* Prefetch descriptor index. */ 1962 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1963 1964 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1965 RTE_LOG(ERR, VHOST_DATA, 1966 "(%"PRIu64") Invalid frame buffer address found" 1967 "when TX packets!\n", 1968 dev->device_fh); 1969 packet_success++; 1970 continue; 1971 } 1972 1973 /* Prefetch buffer address. */ 1974 rte_prefetch0((void *)(uintptr_t)buff_addr); 1975 1976 /* 1977 * Setup dummy mbuf. This is copied to a real mbuf if 1978 * transmitted out the physical port. 1979 */ 1980 m.data_len = desc->len; 1981 m.nb_segs = 1; 1982 m.next = NULL; 1983 m.data_off = 0; 1984 m.buf_addr = (void *)(uintptr_t)buff_addr; 1985 m.buf_physaddr = phys_addr; 1986 1987 /* 1988 * Check if the frame buffer address from guest crosses 1989 * sub-region or not. 1990 */ 1991 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1992 RTE_LOG(ERR, VHOST_DATA, 1993 "(%"PRIu64") Frame buffer address cross " 1994 "sub-regioin found when attaching TX frame " 1995 "buffer address!\n", 1996 dev->device_fh); 1997 need_copy = 1; 1998 } else 1999 need_copy = 0; 2000 2001 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2002 2003 /* 2004 * If this is the first received packet we need to learn 2005 * the MAC and setup VMDQ 2006 */ 2007 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2008 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2009 /* 2010 * Discard frame if device is scheduled for 2011 * removal or a duplicate MAC address is found. 2012 */ 2013 packet_success += free_entries; 2014 vq->last_used_idx += packet_success; 2015 break; 2016 } 2017 } 2018 2019 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2020 packet_success++; 2021 } 2022 } 2023 2024 /* 2025 * This function is called by each data core. It handles all RX/TX registered 2026 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2027 * addresses are compared with all devices in the main linked list. 
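 */

/*
 * Illustrative sketch (not part of the original sample; the *_sketch name is
 * hypothetical and the helper is never called): how virtio_dev_tx_zcp() above
 * reserves a bounded burst of available-ring entries.  The uint16_t
 * subtraction relies on modulo-65536 wrap-around, so it remains correct when
 * the guest's avail->idx wraps; the fields are the same ones used above.
 */
static inline uint16_t __attribute__((unused))
reserve_tx_burst_sketch(struct vhost_virtqueue *vq, uint32_t *head)
{
	uint16_t avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	uint16_t free_entries = (uint16_t)(avail_idx - vq->last_used_idx_res);
	uint16_t i;

	/* Limit the reservation to one burst. */
	if (free_entries > MAX_PKT_BURST)
		free_entries = MAX_PKT_BURST;

	/* Copy the head indexes out before publishing the reservation. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx_res + i)
			& (vq->size - 1)];

	vq->last_used_idx_res =
		(uint16_t)(vq->last_used_idx_res + free_entries);

	return free_entries;
}

/*
 * switch_worker_zcp(): the per data core loop described above.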
2028 */ 2029 static int 2030 switch_worker_zcp(__attribute__((unused)) void *arg) 2031 { 2032 struct virtio_net *dev = NULL; 2033 struct vhost_dev *vdev = NULL; 2034 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2035 struct virtio_net_data_ll *dev_ll; 2036 struct mbuf_table *tx_q; 2037 volatile struct lcore_ll_info *lcore_ll; 2038 const uint64_t drain_tsc 2039 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2040 * BURST_TX_DRAIN_US; 2041 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2042 unsigned ret; 2043 const uint16_t lcore_id = rte_lcore_id(); 2044 uint16_t count_in_ring, rx_count = 0; 2045 2046 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2047 2048 lcore_ll = lcore_info[lcore_id].lcore_ll; 2049 prev_tsc = 0; 2050 2051 while (1) { 2052 cur_tsc = rte_rdtsc(); 2053 2054 /* TX burst queue drain */ 2055 diff_tsc = cur_tsc - prev_tsc; 2056 if (unlikely(diff_tsc > drain_tsc)) { 2057 /* 2058 * Get mbuf from vpool.pool and detach mbuf and 2059 * put back into vpool.ring. 2060 */ 2061 dev_ll = lcore_ll->ll_root_used; 2062 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2063 /* Get virtio device ID */ 2064 vdev = dev_ll->vdev; 2065 dev = vdev->dev; 2066 2067 if (likely(!vdev->remove)) { 2068 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2069 if (tx_q->len) { 2070 LOG_DEBUG(VHOST_DATA, 2071 "TX queue drained after timeout" 2072 " with burst size %u\n", 2073 tx_q->len); 2074 2075 /* 2076 * Tx any packets in the queue 2077 */ 2078 ret = rte_eth_tx_burst( 2079 ports[0], 2080 (uint16_t)tx_q->txq_id, 2081 (struct rte_mbuf **) 2082 tx_q->m_table, 2083 (uint16_t)tx_q->len); 2084 if (unlikely(ret < tx_q->len)) { 2085 do { 2086 rte_pktmbuf_free( 2087 tx_q->m_table[ret]); 2088 } while (++ret < tx_q->len); 2089 } 2090 tx_q->len = 0; 2091 2092 txmbuf_clean_zcp(dev, 2093 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2094 } 2095 } 2096 dev_ll = dev_ll->next; 2097 } 2098 prev_tsc = cur_tsc; 2099 } 2100 2101 rte_prefetch0(lcore_ll->ll_root_used); 2102 2103 /* 2104 * Inform the configuration core that we have exited the linked 2105 * list and that no devices are in use if requested. 2106 */ 2107 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2108 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2109 2110 /* Process devices */ 2111 dev_ll = lcore_ll->ll_root_used; 2112 2113 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2114 vdev = dev_ll->vdev; 2115 dev = vdev->dev; 2116 if (unlikely(vdev->remove)) { 2117 dev_ll = dev_ll->next; 2118 unlink_vmdq(vdev); 2119 vdev->ready = DEVICE_SAFE_REMOVE; 2120 continue; 2121 } 2122 2123 if (likely(vdev->ready == DEVICE_RX)) { 2124 uint32_t index = vdev->vmdq_rx_q; 2125 uint16_t i; 2126 count_in_ring 2127 = rte_ring_count(vpool_array[index].ring); 2128 uint16_t free_entries 2129 = (uint16_t)get_available_ring_num_zcp(dev); 2130 2131 /* 2132 * Attach all mbufs in vpool.ring and put back 2133 * into vpool.pool. 
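 * The attach count below is capped by the mbufs currently in the ring, by the
 * free entries in the guest RX ring and by MAX_PKT_BURST.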
2134 */ 2135 for (i = 0; 2136 i < RTE_MIN(free_entries, 2137 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2138 i++) 2139 attach_rxmbuf_zcp(dev); 2140 2141 /* Handle guest RX */ 2142 rx_count = rte_eth_rx_burst(ports[0], 2143 vdev->vmdq_rx_q, pkts_burst, 2144 MAX_PKT_BURST); 2145 2146 if (rx_count) { 2147 ret_count = virtio_dev_rx_zcp(dev, 2148 pkts_burst, rx_count); 2149 if (enable_stats) { 2150 dev_statistics[dev->device_fh].rx_total 2151 += rx_count; 2152 dev_statistics[dev->device_fh].rx 2153 += ret_count; 2154 } 2155 while (likely(rx_count)) { 2156 rx_count--; 2157 pktmbuf_detach_zcp( 2158 pkts_burst[rx_count]); 2159 rte_ring_sp_enqueue( 2160 vpool_array[index].ring, 2161 (void *)pkts_burst[rx_count]); 2162 } 2163 } 2164 } 2165 2166 if (likely(!vdev->remove)) 2167 /* Handle guest TX */ 2168 virtio_dev_tx_zcp(dev); 2169 2170 /* Move to the next device in the list */ 2171 dev_ll = dev_ll->next; 2172 } 2173 } 2174 2175 return 0; 2176 } 2177 2178 2179 /* 2180 * Add an entry to a used linked list. A free entry must first be found 2181 * in the free linked list using get_data_ll_free_entry(); 2182 */ 2183 static void 2184 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2185 struct virtio_net_data_ll *ll_dev) 2186 { 2187 struct virtio_net_data_ll *ll = *ll_root_addr; 2188 2189 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2190 ll_dev->next = NULL; 2191 rte_compiler_barrier(); 2192 2193 /* If ll == NULL then this is the first device. */ 2194 if (ll) { 2195 /* Increment to the tail of the linked list. */ 2196 while ((ll->next != NULL) ) 2197 ll = ll->next; 2198 2199 ll->next = ll_dev; 2200 } else { 2201 *ll_root_addr = ll_dev; 2202 } 2203 } 2204 2205 /* 2206 * Remove an entry from a used linked list. The entry must then be added to 2207 * the free linked list using put_data_ll_free_entry(). 2208 */ 2209 static void 2210 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2211 struct virtio_net_data_ll *ll_dev, 2212 struct virtio_net_data_ll *ll_dev_last) 2213 { 2214 struct virtio_net_data_ll *ll = *ll_root_addr; 2215 2216 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2217 return; 2218 2219 if (ll_dev == ll) 2220 *ll_root_addr = ll_dev->next; 2221 else 2222 if (likely(ll_dev_last != NULL)) 2223 ll_dev_last->next = ll_dev->next; 2224 else 2225 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2226 } 2227 2228 /* 2229 * Find and return an entry from the free linked list. 2230 */ 2231 static struct virtio_net_data_ll * 2232 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2233 { 2234 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2235 struct virtio_net_data_ll *ll_dev; 2236 2237 if (ll_free == NULL) 2238 return NULL; 2239 2240 ll_dev = ll_free; 2241 *ll_root_addr = ll_free->next; 2242 2243 return ll_dev; 2244 } 2245 2246 /* 2247 * Place an entry back on to the free linked list. 2248 */ 2249 static void 2250 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2251 struct virtio_net_data_ll *ll_dev) 2252 { 2253 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2254 2255 if (ll_dev == NULL) 2256 return; 2257 2258 ll_dev->next = ll_free; 2259 *ll_root_addr = ll_dev; 2260 } 2261 2262 /* 2263 * Creates a linked list of a given size. 2264 */ 2265 static struct virtio_net_data_ll * 2266 alloc_data_ll(uint32_t size) 2267 { 2268 struct virtio_net_data_ll *ll_new; 2269 uint32_t i; 2270 2271 /* Malloc and then chain the linked list. 
*/ 2272 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2273 if (ll_new == NULL) { 2274 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2275 return NULL; 2276 } 2277 2278 for (i = 0; i < size - 1; i++) { 2279 ll_new[i].vdev = NULL; 2280 ll_new[i].next = &ll_new[i+1]; 2281 } 2282 ll_new[i].next = NULL; 2283 2284 return (ll_new); 2285 } 2286 2287 /* 2288 * Create the main linked list along with each individual cores linked list. A used and a free list 2289 * are created to manage entries. 2290 */ 2291 static int 2292 init_data_ll (void) 2293 { 2294 int lcore; 2295 2296 RTE_LCORE_FOREACH_SLAVE(lcore) { 2297 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2298 if (lcore_info[lcore].lcore_ll == NULL) { 2299 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2300 return -1; 2301 } 2302 2303 lcore_info[lcore].lcore_ll->device_num = 0; 2304 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2305 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2306 if (num_devices % num_switching_cores) 2307 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2308 else 2309 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2310 } 2311 2312 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2313 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2314 2315 return 0; 2316 } 2317 2318 /* 2319 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2320 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2321 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2322 */ 2323 static void 2324 destroy_device (volatile struct virtio_net *dev) 2325 { 2326 struct virtio_net_data_ll *ll_lcore_dev_cur; 2327 struct virtio_net_data_ll *ll_main_dev_cur; 2328 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2329 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2330 struct vhost_dev *vdev; 2331 int lcore; 2332 2333 dev->flags &= ~VIRTIO_DEV_RUNNING; 2334 2335 vdev = (struct vhost_dev *)dev->priv; 2336 /*set the remove flag. */ 2337 vdev->remove = 1; 2338 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2339 rte_pause(); 2340 } 2341 2342 /* Search for entry to be removed from lcore ll */ 2343 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2344 while (ll_lcore_dev_cur != NULL) { 2345 if (ll_lcore_dev_cur->vdev == vdev) { 2346 break; 2347 } else { 2348 ll_lcore_dev_last = ll_lcore_dev_cur; 2349 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2350 } 2351 } 2352 2353 if (ll_lcore_dev_cur == NULL) { 2354 RTE_LOG(ERR, VHOST_CONFIG, 2355 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2356 dev->device_fh); 2357 return; 2358 } 2359 2360 /* Search for entry to be removed from main ll */ 2361 ll_main_dev_cur = ll_root_used; 2362 ll_main_dev_last = NULL; 2363 while (ll_main_dev_cur != NULL) { 2364 if (ll_main_dev_cur->vdev == vdev) { 2365 break; 2366 } else { 2367 ll_main_dev_last = ll_main_dev_cur; 2368 ll_main_dev_cur = ll_main_dev_cur->next; 2369 } 2370 } 2371 2372 /* Remove entries from the lcore and main ll. */ 2373 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2374 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2375 2376 /* Set the dev_removal_flag on each lcore. 
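 * The workers read this flag through a volatile pointer (see
 * switch_worker_zcp), so the request is picked up on their next pass through
 * the polling loop.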
*/ 2377 RTE_LCORE_FOREACH_SLAVE(lcore) { 2378 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2379 } 2380 2381 /* 2382 * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL we can be sure that 2383 * it no longer accesses the device being removed from the linked lists and that the device 2384 * is no longer in use. 2385 */ 2386 RTE_LCORE_FOREACH_SLAVE(lcore) { 2387 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2388 rte_pause(); 2389 } 2390 } 2391 2392 /* Add the entries back to the lcore and main free ll. */ 2393 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2394 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2395 2396 /* Decrement the number of devices on the lcore. */ 2397 lcore_info[vdev->coreid].lcore_ll->device_num--; 2398 2399 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2400 2401 if (zero_copy) { 2402 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2403 2404 /* Stop the RX queue. */ 2405 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2406 LOG_DEBUG(VHOST_CONFIG, 2407 "(%"PRIu64") In destroy_device: Failed to stop " 2408 "rx queue:%d\n", 2409 dev->device_fh, 2410 vdev->vmdq_rx_q); 2411 } 2412 2413 LOG_DEBUG(VHOST_CONFIG, 2414 "(%"PRIu64") in destroy_device: Start putting mbufs from " 2415 "mempool back into ring for RX queue: %d\n", 2416 dev->device_fh, vdev->vmdq_rx_q); 2417 2418 mbuf_destroy_zcp(vpool); 2419 2420 /* Stop the TX queue. */ 2421 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2422 LOG_DEBUG(VHOST_CONFIG, 2423 "(%"PRIu64") In destroy_device: Failed to " 2424 "stop tx queue:%d\n", 2425 dev->device_fh, vdev->vmdq_rx_q); 2426 } 2427 2428 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2429 2430 LOG_DEBUG(VHOST_CONFIG, 2431 "(%"PRIu64") destroy_device: Start putting mbufs from mempool " 2432 "back into ring for TX queue: %d, dev:(%"PRIu64")\n", 2433 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2434 dev->device_fh); 2435 2436 mbuf_destroy_zcp(vpool); 2437 rte_free(vdev->regions_hpa); 2438 } 2439 rte_free(vdev); 2440 2441 } 2442 2443 /* 2444 * Calculate the number of physically contiguous sub-regions within one 2445 * region whose vhost virtual address range is contiguous. The region 2446 * starts at vva_start and is 'size' bytes long.
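 */

/*
 * Illustrative sketch (not part of the original sample; the *_sketch name is
 * hypothetical and the helper is never called): the per-page contiguity test
 * that check_hpa_regions() below applies while walking a region.  Two
 * virtually adjacent pages belong to the same host-physical sub-region only
 * when the physical address of the second page follows the first by exactly
 * one page.
 */
static inline int __attribute__((unused))
pages_phys_contiguous_sketch(uint64_t vva, uint32_t page_size)
{
	uint64_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
	uint64_t next = rte_mem_virt2phy((void *)(uintptr_t)(vva + page_size));

	return (cur + page_size) == next;
}

/*
 * check_hpa_regions(): counts the sub-region boundaries described above.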
2447 */ 2448 static uint32_t 2449 check_hpa_regions(uint64_t vva_start, uint64_t size) 2450 { 2451 uint32_t i, nregions = 0, page_size = getpagesize(); 2452 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2453 if (vva_start % page_size) { 2454 LOG_DEBUG(VHOST_CONFIG, 2455 "in check_countinous: vva start(%p) mod page_size(%d) " 2456 "has remainder\n", 2457 (void *)(uintptr_t)vva_start, page_size); 2458 return 0; 2459 } 2460 if (size % page_size) { 2461 LOG_DEBUG(VHOST_CONFIG, 2462 "in check_countinous: " 2463 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2464 size, page_size); 2465 return 0; 2466 } 2467 for (i = 0; i < size - page_size; i = i + page_size) { 2468 cur_phys_addr 2469 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2470 next_phys_addr = rte_mem_virt2phy( 2471 (void *)(uintptr_t)(vva_start + i + page_size)); 2472 if ((cur_phys_addr + page_size) != next_phys_addr) { 2473 ++nregions; 2474 LOG_DEBUG(VHOST_CONFIG, 2475 "in check_continuous: hva addr:(%p) is not " 2476 "continuous with hva addr:(%p), diff:%d\n", 2477 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2478 (void *)(uintptr_t)(vva_start + (uint64_t)i 2479 + page_size), page_size); 2480 LOG_DEBUG(VHOST_CONFIG, 2481 "in check_continuous: hpa addr:(%p) is not " 2482 "continuous with hpa addr:(%p), " 2483 "diff:(%"PRIu64")\n", 2484 (void *)(uintptr_t)cur_phys_addr, 2485 (void *)(uintptr_t)next_phys_addr, 2486 (next_phys_addr-cur_phys_addr)); 2487 } 2488 } 2489 return nregions; 2490 } 2491 2492 /* 2493 * Divide each region whose vhost virtual address is continous into a few 2494 * sub-regions, make sure the physical address within each sub-region are 2495 * continous. And fill offset(to GPA) and size etc. information of each 2496 * sub-region into regions_hpa. 2497 */ 2498 static uint32_t 2499 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2500 { 2501 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2502 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2503 2504 if (mem_region_hpa == NULL) 2505 return 0; 2506 2507 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2508 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2509 virtio_memory->regions[regionidx].address_offset; 2510 mem_region_hpa[regionidx_hpa].guest_phys_address 2511 = virtio_memory->regions[regionidx].guest_phys_address; 2512 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2513 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2514 mem_region_hpa[regionidx_hpa].guest_phys_address; 2515 LOG_DEBUG(VHOST_CONFIG, 2516 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2517 regionidx_hpa, 2518 (void *)(uintptr_t) 2519 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2520 LOG_DEBUG(VHOST_CONFIG, 2521 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2522 regionidx_hpa, 2523 (void *)(uintptr_t) 2524 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2525 for (i = 0, k = 0; 2526 i < virtio_memory->regions[regionidx].memory_size - 2527 page_size; 2528 i += page_size) { 2529 cur_phys_addr = rte_mem_virt2phy( 2530 (void *)(uintptr_t)(vva_start + i)); 2531 next_phys_addr = rte_mem_virt2phy( 2532 (void *)(uintptr_t)(vva_start + 2533 i + page_size)); 2534 if ((cur_phys_addr + page_size) != next_phys_addr) { 2535 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2536 mem_region_hpa[regionidx_hpa].guest_phys_address + 2537 k + page_size; 2538 mem_region_hpa[regionidx_hpa].memory_size 2539 = k + 
page_size; 2540 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2541 "phys addr end [%d]:(%p)\n", 2542 regionidx_hpa, 2543 (void *)(uintptr_t) 2544 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2545 LOG_DEBUG(VHOST_CONFIG, 2546 "in fill_hpa_regions: guest phys addr " 2547 "size [%d]:(%p)\n", 2548 regionidx_hpa, 2549 (void *)(uintptr_t) 2550 (mem_region_hpa[regionidx_hpa].memory_size)); 2551 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2552 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2553 ++regionidx_hpa; 2554 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2555 next_phys_addr - 2556 mem_region_hpa[regionidx_hpa].guest_phys_address; 2557 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2558 " phys addr start[%d]:(%p)\n", 2559 regionidx_hpa, 2560 (void *)(uintptr_t) 2561 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2562 LOG_DEBUG(VHOST_CONFIG, 2563 "in fill_hpa_regions: host phys addr " 2564 "start[%d]:(%p)\n", 2565 regionidx_hpa, 2566 (void *)(uintptr_t) 2567 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2568 k = 0; 2569 } else { 2570 k += page_size; 2571 } 2572 } 2573 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2574 = mem_region_hpa[regionidx_hpa].guest_phys_address 2575 + k + page_size; 2576 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2577 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2578 "[%d]:(%p)\n", regionidx_hpa, 2579 (void *)(uintptr_t) 2580 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2581 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2582 "[%d]:(%p)\n", regionidx_hpa, 2583 (void *)(uintptr_t) 2584 (mem_region_hpa[regionidx_hpa].memory_size)); 2585 ++regionidx_hpa; 2586 } 2587 return regionidx_hpa; 2588 } 2589 2590 /* 2591 * A new device is added to a data core. First the device is added to the main linked list 2592 * and the allocated to a specific data core. 
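 */

/*
 * Illustrative sketch (not part of the original sample; the *_sketch name is
 * hypothetical and the helper is never called): a simplified version of the
 * translation that the gpa_to_hpa() helper performs with the regions_hpa
 * table built by fill_hpa_memory_regions() above.  Each sub-region stores a
 * host_phys_addr_offset, so the translation is a range lookup plus one
 * addition; the real helper additionally reports whether a buffer crosses a
 * sub-region boundary.
 */
static inline uint64_t __attribute__((unused))
gpa_to_hpa_lookup_sketch(struct vhost_dev *vdev, uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < vdev->nregions_hpa; i++) {
		struct virtio_memory_regions_hpa *reg = &vdev->regions_hpa[i];

		if (guest_pa >= reg->guest_phys_address &&
		    guest_pa < reg->guest_phys_address_end)
			return guest_pa + reg->host_phys_addr_offset;
	}

	return 0;	/* no matching sub-region */
}

/*
 * new_device(): vhost library callback, run once a device is fully configured.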
2593 */ 2594 static int 2595 new_device (struct virtio_net *dev) 2596 { 2597 struct virtio_net_data_ll *ll_dev; 2598 int lcore, core_add = 0; 2599 uint32_t device_num_min = num_devices; 2600 struct vhost_dev *vdev; 2601 uint32_t regionidx; 2602 2603 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2604 if (vdev == NULL) { 2605 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2606 dev->device_fh); 2607 return -1; 2608 } 2609 vdev->dev = dev; 2610 dev->priv = vdev; 2611 2612 if (zero_copy) { 2613 vdev->nregions_hpa = dev->mem->nregions; 2614 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2615 vdev->nregions_hpa 2616 += check_hpa_regions( 2617 dev->mem->regions[regionidx].guest_phys_address 2618 + dev->mem->regions[regionidx].address_offset, 2619 dev->mem->regions[regionidx].memory_size); 2620 2621 } 2622 2623 vdev->regions_hpa = rte_calloc("vhost hpa region", 2624 vdev->nregions_hpa, 2625 sizeof(struct virtio_memory_regions_hpa), 2626 RTE_CACHE_LINE_SIZE); 2627 if (vdev->regions_hpa == NULL) { 2628 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2629 rte_free(vdev); 2630 return -1; 2631 } 2632 2633 2634 if (fill_hpa_memory_regions( 2635 vdev->regions_hpa, dev->mem 2636 ) != vdev->nregions_hpa) { 2637 2638 RTE_LOG(ERR, VHOST_CONFIG, 2639 "hpa memory regions number mismatch: " 2640 "[%d]\n", vdev->nregions_hpa); 2641 rte_free(vdev->regions_hpa); 2642 rte_free(vdev); 2643 return -1; 2644 } 2645 } 2646 2647 2648 /* Add device to main ll */ 2649 ll_dev = get_data_ll_free_entry(&ll_root_free); 2650 if (ll_dev == NULL) { 2651 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2652 "of %d devices per core has been reached\n", 2653 dev->device_fh, num_devices); 2654 if (vdev->regions_hpa) 2655 rte_free(vdev->regions_hpa); 2656 rte_free(vdev); 2657 return -1; 2658 } 2659 ll_dev->vdev = vdev; 2660 add_data_ll_entry(&ll_root_used, ll_dev); 2661 vdev->vmdq_rx_q 2662 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2663 2664 if (zero_copy) { 2665 uint32_t index = vdev->vmdq_rx_q; 2666 uint32_t count_in_ring, i; 2667 struct mbuf_table *tx_q; 2668 2669 count_in_ring = rte_ring_count(vpool_array[index].ring); 2670 2671 LOG_DEBUG(VHOST_CONFIG, 2672 "(%"PRIu64") in new_device: mbuf count in mempool " 2673 "before attach is: %d\n", 2674 dev->device_fh, 2675 rte_mempool_count(vpool_array[index].pool)); 2676 LOG_DEBUG(VHOST_CONFIG, 2677 "(%"PRIu64") in new_device: mbuf count in ring " 2678 "before attach is : %d\n", 2679 dev->device_fh, count_in_ring); 2680 2681 /* 2682 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
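 * This pre-populates the queue's mempool with attached guest buffers before
 * the RX queue is started below.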
2683 */ 2684 for (i = 0; i < count_in_ring; i++) 2685 attach_rxmbuf_zcp(dev); 2686 2687 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2688 "mempool after attach is: %d\n", 2689 dev->device_fh, 2690 rte_mempool_count(vpool_array[index].pool)); 2691 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2692 "ring after attach is : %d\n", 2693 dev->device_fh, 2694 rte_ring_count(vpool_array[index].ring)); 2695 2696 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2697 tx_q->txq_id = vdev->vmdq_rx_q; 2698 2699 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2700 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2701 2702 LOG_DEBUG(VHOST_CONFIG, 2703 "(%"PRIu64") In new_device: Failed to start " 2704 "tx queue:%d\n", 2705 dev->device_fh, vdev->vmdq_rx_q); 2706 2707 mbuf_destroy_zcp(vpool); 2708 rte_free(vdev->regions_hpa); 2709 rte_free(vdev); 2710 return -1; 2711 } 2712 2713 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2714 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2715 2716 LOG_DEBUG(VHOST_CONFIG, 2717 "(%"PRIu64") In new_device: Failed to start " 2718 "rx queue:%d\n", 2719 dev->device_fh, vdev->vmdq_rx_q); 2720 2721 /* Stop the TX queue. */ 2722 if (rte_eth_dev_tx_queue_stop(ports[0], 2723 vdev->vmdq_rx_q) != 0) { 2724 LOG_DEBUG(VHOST_CONFIG, 2725 "(%"PRIu64") In new_device: Failed to " 2726 "stop tx queue:%d\n", 2727 dev->device_fh, vdev->vmdq_rx_q); 2728 } 2729 2730 mbuf_destroy_zcp(vpool); 2731 rte_free(vdev->regions_hpa); 2732 rte_free(vdev); 2733 return -1; 2734 } 2735 2736 } 2737 2738 /*reset ready flag*/ 2739 vdev->ready = DEVICE_MAC_LEARNING; 2740 vdev->remove = 0; 2741 2742 /* Find a suitable lcore to add the device. */ 2743 RTE_LCORE_FOREACH_SLAVE(lcore) { 2744 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2745 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2746 core_add = lcore; 2747 } 2748 } 2749 /* Add device to lcore ll */ 2750 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2751 if (ll_dev == NULL) { 2752 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2753 vdev->ready = DEVICE_SAFE_REMOVE; 2754 destroy_device(dev); 2755 rte_free(vdev->regions_hpa); 2756 rte_free(vdev); 2757 return -1; 2758 } 2759 ll_dev->vdev = vdev; 2760 vdev->coreid = core_add; 2761 2762 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2763 2764 /* Initialize device stats */ 2765 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2766 2767 /* Disable notifications. */ 2768 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2769 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2770 lcore_info[vdev->coreid].lcore_ll->device_num++; 2771 dev->flags |= VIRTIO_DEV_RUNNING; 2772 2773 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2774 2775 return 0; 2776 } 2777 2778 /* 2779 * These callback allow devices to be added to the data core when configuration 2780 * has been fully complete. 2781 */ 2782 static const struct virtio_net_device_ops virtio_net_device_ops = 2783 { 2784 .new_device = new_device, 2785 .destroy_device = destroy_device, 2786 }; 2787 2788 /* 2789 * This is a thread will wake up after a period to print stats if the user has 2790 * enabled them. 
2791 */ 2792 static void 2793 print_stats(void) 2794 { 2795 struct virtio_net_data_ll *dev_ll; 2796 uint64_t tx_dropped, rx_dropped; 2797 uint64_t tx, tx_total, rx, rx_total; 2798 uint32_t device_fh; 2799 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2800 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2801 2802 while(1) { 2803 sleep(enable_stats); 2804 2805 /* Clear screen and move to top left */ 2806 printf("%s%s", clr, top_left); 2807 2808 printf("\nDevice statistics ===================================="); 2809 2810 dev_ll = ll_root_used; 2811 while (dev_ll != NULL) { 2812 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2813 tx_total = dev_statistics[device_fh].tx_total; 2814 tx = dev_statistics[device_fh].tx; 2815 tx_dropped = tx_total - tx; 2816 if (zero_copy == 0) { 2817 rx_total = rte_atomic64_read( 2818 &dev_statistics[device_fh].rx_total_atomic); 2819 rx = rte_atomic64_read( 2820 &dev_statistics[device_fh].rx_atomic); 2821 } else { 2822 rx_total = dev_statistics[device_fh].rx_total; 2823 rx = dev_statistics[device_fh].rx; 2824 } 2825 rx_dropped = rx_total - rx; 2826 2827 printf("\nStatistics for device %"PRIu32" ------------------------------" 2828 "\nTX total: %"PRIu64"" 2829 "\nTX dropped: %"PRIu64"" 2830 "\nTX successful: %"PRIu64"" 2831 "\nRX total: %"PRIu64"" 2832 "\nRX dropped: %"PRIu64"" 2833 "\nRX successful: %"PRIu64"", 2834 device_fh, 2835 tx_total, 2836 tx_dropped, 2837 tx, 2838 rx_total, 2839 rx_dropped, 2840 rx); 2841 2842 dev_ll = dev_ll->next; 2843 } 2844 printf("\n======================================================\n"); 2845 } 2846 } 2847 2848 static void 2849 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2850 char *ring_name, uint32_t nb_mbuf) 2851 { 2852 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2853 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2854 if (vpool_array[index].pool != NULL) { 2855 vpool_array[index].ring 2856 = rte_ring_create(ring_name, 2857 rte_align32pow2(nb_mbuf + 1), 2858 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2859 if (likely(vpool_array[index].ring != NULL)) { 2860 LOG_DEBUG(VHOST_CONFIG, 2861 "in setup_mempool_tbl: mbuf count in " 2862 "mempool is: %d\n", 2863 rte_mempool_count(vpool_array[index].pool)); 2864 LOG_DEBUG(VHOST_CONFIG, 2865 "in setup_mempool_tbl: mbuf count in " 2866 "ring is: %d\n", 2867 rte_ring_count(vpool_array[index].ring)); 2868 } else { 2869 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2870 ring_name); 2871 } 2872 2873 /* Need consider head room. */ 2874 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2875 } else { 2876 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2877 } 2878 } 2879 2880 /* When we receive a INT signal, unregister vhost driver */ 2881 static void 2882 sigint_handler(__rte_unused int signum) 2883 { 2884 /* Unregister vhost driver. */ 2885 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2886 if (ret != 0) 2887 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2888 exit(0); 2889 } 2890 2891 /* 2892 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2893 * device is also registered here to handle the IOCTLs. 
2894 */ 2895 int 2896 main(int argc, char *argv[]) 2897 { 2898 struct rte_mempool *mbuf_pool = NULL; 2899 unsigned lcore_id, core_id = 0; 2900 unsigned nb_ports, valid_num_ports; 2901 int ret; 2902 uint8_t portid; 2903 uint16_t queue_id; 2904 static pthread_t tid; 2905 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2906 2907 signal(SIGINT, sigint_handler); 2908 2909 /* init EAL */ 2910 ret = rte_eal_init(argc, argv); 2911 if (ret < 0) 2912 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2913 argc -= ret; 2914 argv += ret; 2915 2916 /* parse app arguments */ 2917 ret = us_vhost_parse_args(argc, argv); 2918 if (ret < 0) 2919 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2920 2921 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2922 if (rte_lcore_is_enabled(lcore_id)) 2923 lcore_ids[core_id ++] = lcore_id; 2924 2925 if (rte_lcore_count() > RTE_MAX_LCORE) 2926 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2927 2928 /*set the number of swithcing cores available*/ 2929 num_switching_cores = rte_lcore_count()-1; 2930 2931 /* Get the number of physical ports. */ 2932 nb_ports = rte_eth_dev_count(); 2933 if (nb_ports > RTE_MAX_ETHPORTS) 2934 nb_ports = RTE_MAX_ETHPORTS; 2935 2936 /* 2937 * Update the global var NUM_PORTS and global array PORTS 2938 * and get value of var VALID_NUM_PORTS according to system ports number 2939 */ 2940 valid_num_ports = check_ports_num(nb_ports); 2941 2942 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2943 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2944 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2945 return -1; 2946 } 2947 2948 if (zero_copy == 0) { 2949 /* Create the mbuf pool. */ 2950 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 2951 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 2952 0, MBUF_DATA_SIZE, rte_socket_id()); 2953 if (mbuf_pool == NULL) 2954 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2955 2956 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2957 vpool_array[queue_id].pool = mbuf_pool; 2958 2959 if (vm2vm_mode == VM2VM_HARDWARE) { 2960 /* Enable VT loop back to let L2 switch to do it. */ 2961 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2962 LOG_DEBUG(VHOST_CONFIG, 2963 "Enable loop back for L2 switch in vmdq.\n"); 2964 } 2965 } else { 2966 uint32_t nb_mbuf; 2967 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2968 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2969 2970 nb_mbuf = num_rx_descriptor 2971 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2972 + num_switching_cores * MAX_PKT_BURST; 2973 2974 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2975 snprintf(pool_name, sizeof(pool_name), 2976 "rxmbuf_pool_%u", queue_id); 2977 snprintf(ring_name, sizeof(ring_name), 2978 "rxmbuf_ring_%u", queue_id); 2979 setup_mempool_tbl(rte_socket_id(), queue_id, 2980 pool_name, ring_name, nb_mbuf); 2981 } 2982 2983 nb_mbuf = num_tx_descriptor 2984 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2985 + num_switching_cores * MAX_PKT_BURST; 2986 2987 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2988 snprintf(pool_name, sizeof(pool_name), 2989 "txmbuf_pool_%u", queue_id); 2990 snprintf(ring_name, sizeof(ring_name), 2991 "txmbuf_ring_%u", queue_id); 2992 setup_mempool_tbl(rte_socket_id(), 2993 (queue_id + MAX_QUEUES), 2994 pool_name, ring_name, nb_mbuf); 2995 } 2996 2997 if (vm2vm_mode == VM2VM_HARDWARE) { 2998 /* Enable VT loop back to let L2 switch to do it. 
*/ 2999 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3000 LOG_DEBUG(VHOST_CONFIG, 3001 "Enable loop back for L2 switch in vmdq.\n"); 3002 } 3003 } 3004 /* Set log level. */ 3005 rte_set_log_level(LOG_LEVEL); 3006 3007 /* initialize all ports */ 3008 for (portid = 0; portid < nb_ports; portid++) { 3009 /* skip ports that are not enabled */ 3010 if ((enabled_port_mask & (1 << portid)) == 0) { 3011 RTE_LOG(INFO, VHOST_PORT, 3012 "Skipping disabled port %d\n", portid); 3013 continue; 3014 } 3015 if (port_init(portid) != 0) 3016 rte_exit(EXIT_FAILURE, 3017 "Cannot initialize network ports\n"); 3018 } 3019 3020 /* Initialise all linked lists. */ 3021 if (init_data_ll() == -1) 3022 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3023 3024 /* Initialize device stats */ 3025 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3026 3027 /* Enable stats if the user option is set. */ 3028 if (enable_stats) { 3029 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3030 if (ret != 0) 3031 rte_exit(EXIT_FAILURE, 3032 "Cannot create print-stats thread\n"); 3033 3034 /* Set thread_name for aid in debugging. */ 3035 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3036 ret = rte_thread_setname(tid, thread_name); 3037 if (ret != 0) 3038 RTE_LOG(ERR, VHOST_CONFIG, 3039 "Cannot set print-stats name\n"); 3040 } 3041 3042 /* Launch all data cores. */ 3043 if (zero_copy == 0) { 3044 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3045 rte_eal_remote_launch(switch_worker, 3046 mbuf_pool, lcore_id); 3047 } 3048 } else { 3049 uint32_t count_in_mempool, index, i; 3050 for (index = 0; index < 2*MAX_QUEUES; index++) { 3051 /* For all RX and TX queues. */ 3052 count_in_mempool 3053 = rte_mempool_count(vpool_array[index].pool); 3054 3055 /* 3056 * Transfer all un-attached mbufs from vpool.pool 3057 * to vpoo.ring. 3058 */ 3059 for (i = 0; i < count_in_mempool; i++) { 3060 struct rte_mbuf *mbuf 3061 = __rte_mbuf_raw_alloc( 3062 vpool_array[index].pool); 3063 rte_ring_sp_enqueue(vpool_array[index].ring, 3064 (void *)mbuf); 3065 } 3066 3067 LOG_DEBUG(VHOST_CONFIG, 3068 "in main: mbuf count in mempool at initial " 3069 "is: %d\n", count_in_mempool); 3070 LOG_DEBUG(VHOST_CONFIG, 3071 "in main: mbuf count in ring at initial is :" 3072 " %d\n", 3073 rte_ring_count(vpool_array[index].ring)); 3074 } 3075 3076 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3077 rte_eal_remote_launch(switch_worker_zcp, NULL, 3078 lcore_id); 3079 } 3080 3081 if (mergeable == 0) 3082 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3083 3084 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3085 ret = rte_vhost_driver_register((char *)&dev_basename); 3086 if (ret != 0) 3087 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3088 3089 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3090 3091 /* Start CUSE session. */ 3092 rte_vhost_driver_session_start(); 3093 return 0; 3094 3095 } 3096
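
/*
 * Illustrative sketch (not part of the original sample; the *_sketch names are
 * hypothetical and the helpers are never called): the zero-copy paths above
 * stash the guest descriptor index in the mbuf headroom through the
 * MBUF_HEADROOM_UINT32() macro, so the index travels with the buffer from the
 * moment it is attached until the descriptor is handed back to the guest.
 * Reduced to its two halves:
 */
static inline void __attribute__((unused))
stash_desc_idx_sketch(struct rte_mbuf *mbuf, uint32_t desc_idx)
{
	/* Record which guest descriptor backs this mbuf. */
	MBUF_HEADROOM_UINT32(mbuf) = desc_idx;
}

static inline uint32_t __attribute__((unused))
fetch_desc_idx_sketch(struct rte_mbuf *mbuf)
{
	/* Recover the descriptor index when the buffer is completed. */
	return MBUF_HEADROOM_UINT32(mbuf);
}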