1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 512 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 83 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 84 85 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 86 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 87 88 #define JUMBO_FRAME_MAX_SIZE 0x2600 89 90 /* State of virtio device. */ 91 #define DEVICE_MAC_LEARNING 0 92 #define DEVICE_RX 1 93 #define DEVICE_SAFE_REMOVE 2 94 95 /* Config_core_flag status definitions. 
*/ 96 #define REQUEST_DEV_REMOVAL 1 97 #define ACK_DEV_REMOVAL 0 98 99 /* Configurable number of RX/TX ring descriptors */ 100 #define RTE_TEST_RX_DESC_DEFAULT 1024 101 #define RTE_TEST_TX_DESC_DEFAULT 512 102 103 /* 104 * Need refine these 2 macros for legacy and DPDK based front end: 105 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 106 * And then adjust power 2. 107 */ 108 /* 109 * For legacy front end, 128 descriptors, 110 * half for virtio header, another half for mbuf. 111 */ 112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 114 115 /* Get first 4 bytes in mbuf headroom. */ 116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 117 + sizeof(struct rte_mbuf))) 118 119 /* true if x is a power of 2 */ 120 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 121 122 #define INVALID_PORT_ID 0xFF 123 124 /* Max number of devices. Limited by vmdq. */ 125 #define MAX_DEVICES 64 126 127 /* Size of buffers used for snprintfs. */ 128 #define MAX_PRINT_BUFF 6072 129 130 /* Maximum character device basename size. */ 131 #define MAX_BASENAME_SZ 10 132 133 /* Maximum long option length for option parsing. */ 134 #define MAX_LONG_OPT_SZ 64 135 136 /* Used to compare MAC addresses. */ 137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 138 139 /* Number of descriptors per cacheline. */ 140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 141 142 /* mask of enabled ports */ 143 static uint32_t enabled_port_mask = 0; 144 145 /* Promiscuous mode */ 146 static uint32_t promiscuous; 147 148 /*Number of switching cores enabled*/ 149 static uint32_t num_switching_cores = 0; 150 151 /* number of devices/queues to support*/ 152 static uint32_t num_queues = 0; 153 static uint32_t num_devices; 154 155 /* 156 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 157 * disabled on default. 158 */ 159 static uint32_t zero_copy; 160 static int mergeable; 161 162 /* number of descriptors to apply*/ 163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 165 166 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 167 #define MAX_RING_DESC 4096 168 169 struct vpool { 170 struct rte_mempool *pool; 171 struct rte_ring *ring; 172 uint32_t buf_size; 173 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 174 175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 176 typedef enum { 177 VM2VM_DISABLED = 0, 178 VM2VM_SOFTWARE = 1, 179 VM2VM_HARDWARE = 2, 180 VM2VM_LAST 181 } vm2vm_type; 182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 183 184 /* The type of host physical address translated from guest physical address. */ 185 typedef enum { 186 PHYS_ADDR_CONTINUOUS = 0, 187 PHYS_ADDR_CROSS_SUBREG = 1, 188 PHYS_ADDR_INVALID = 2, 189 PHYS_ADDR_LAST 190 } hpa_type; 191 192 /* Enable stats. */ 193 static uint32_t enable_stats = 0; 194 /* Enable retries on RX. */ 195 static uint32_t enable_retry = 1; 196 /* Specify timeout (in useconds) between retries on RX. */ 197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 198 /* Specify the number of retries on RX. */ 199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 200 201 /* Character device basename. Can be set by user. */ 202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; 203 204 /* empty vmdq configuration structure. 
 * Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * Required for 1G NICs such as the I350; it fixes a bug where
		 * IPv4 forwarding in the guest could not forward packets from
		 * one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip = 1, /**< VLAN strip enabled. */
		.jumbo_frame = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/*
 * Array of data core structures containing information on
 * individual core linked lists.
 */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX.
 */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t version_ihl;      /**< version and header length */
	uint8_t type_of_service;  /**< type of service */
	uint16_t total_length;    /**< length of packet */
	uint16_t packet_id;       /**< packet ID */
	uint16_t fragment_offset; /**< fragmentation offset */
	uint8_t time_to_live;     /**< time to live */
	uint8_t next_proto_id;    /**< protocol ID */
	uint16_t hdr_checksum;    /**< header checksum */
	uint32_t src_addr;        /**< source address */
	uint32_t dst_addr;        /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, log an error message and
 * return -1. Each device must have its own pool.
353 */ 354 static inline int 355 validate_num_devices(uint32_t max_nb_devices) 356 { 357 if (num_devices > max_nb_devices) { 358 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 359 return -1; 360 } 361 return 0; 362 } 363 364 /* 365 * Initialises a given port using global settings and with the rx buffers 366 * coming from the mbuf_pool passed as parameter 367 */ 368 static inline int 369 port_init(uint8_t port) 370 { 371 struct rte_eth_dev_info dev_info; 372 struct rte_eth_conf port_conf; 373 struct rte_eth_rxconf *rxconf; 374 struct rte_eth_txconf *txconf; 375 int16_t rx_rings, tx_rings; 376 uint16_t rx_ring_size, tx_ring_size; 377 int retval; 378 uint16_t q; 379 380 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 381 rte_eth_dev_info_get (port, &dev_info); 382 383 if (dev_info.max_rx_queues > MAX_QUEUES) { 384 rte_exit(EXIT_FAILURE, 385 "please define MAX_QUEUES no less than %u in %s\n", 386 dev_info.max_rx_queues, __FILE__); 387 } 388 389 rxconf = &dev_info.default_rxconf; 390 txconf = &dev_info.default_txconf; 391 rxconf->rx_drop_en = 1; 392 393 /* Enable vlan offload */ 394 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL; 395 396 /* 397 * Zero copy defers queue RX/TX start to the time when guest 398 * finishes its startup and packet buffers from that guest are 399 * available. 400 */ 401 if (zero_copy) { 402 rxconf->rx_deferred_start = 1; 403 rxconf->rx_drop_en = 0; 404 txconf->tx_deferred_start = 1; 405 } 406 407 /*configure the number of supported virtio devices based on VMDQ limits */ 408 num_devices = dev_info.max_vmdq_pools; 409 410 if (zero_copy) { 411 rx_ring_size = num_rx_descriptor; 412 tx_ring_size = num_tx_descriptor; 413 tx_rings = dev_info.max_tx_queues; 414 } else { 415 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 416 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 417 tx_rings = (uint16_t)rte_lcore_count(); 418 } 419 420 retval = validate_num_devices(MAX_DEVICES); 421 if (retval < 0) 422 return retval; 423 424 /* Get port configuration. */ 425 retval = get_eth_conf(&port_conf, num_devices); 426 if (retval < 0) 427 return retval; 428 /* NIC queues are divided into pf queues and vmdq queues. */ 429 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 430 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 431 num_vmdq_queues = num_devices * queues_per_pool; 432 num_queues = num_pf_queues + num_vmdq_queues; 433 vmdq_queue_base = dev_info.vmdq_queue_base; 434 vmdq_pool_base = dev_info.vmdq_pool_base; 435 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 436 num_pf_queues, num_devices, queues_per_pool); 437 438 if (port >= rte_eth_dev_count()) return -1; 439 440 rx_rings = (uint16_t)dev_info.max_rx_queues; 441 /* Configure ethernet device. */ 442 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 443 if (retval != 0) 444 return retval; 445 446 /* Setup the queues. */ 447 for (q = 0; q < rx_rings; q ++) { 448 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 449 rte_eth_dev_socket_id(port), 450 rxconf, 451 vpool_array[q].pool); 452 if (retval < 0) 453 return retval; 454 } 455 for (q = 0; q < tx_rings; q ++) { 456 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 457 rte_eth_dev_socket_id(port), 458 txconf); 459 if (retval < 0) 460 return retval; 461 } 462 463 /* Start the device. 
 */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject basenames that do not fit in the buffer, then copy. */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
		" --vm2vm [0|1|2]\n"
		" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
		" --dev-basename <name>\n"
		" --nb-devices ND\n"
		" -p PORTMASK: Set mask for ports to be used by application\n"
		" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
		" --rx-retry [0|1]: disable/enable(default) retries on rx. Retries are attempted if the destination queue is full\n"
		" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
		" --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
		" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
		" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
		" --dev-basename: The basename to be used for the character device.\n"
		" --zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
		" --rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
		" --tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n",
		prgname);
}

/*
 * Parse the arguments given in the command line of the application.
580 */ 581 static int 582 us_vhost_parse_args(int argc, char **argv) 583 { 584 int opt, ret; 585 int option_index; 586 unsigned i; 587 const char *prgname = argv[0]; 588 static struct option long_option[] = { 589 {"vm2vm", required_argument, NULL, 0}, 590 {"rx-retry", required_argument, NULL, 0}, 591 {"rx-retry-delay", required_argument, NULL, 0}, 592 {"rx-retry-num", required_argument, NULL, 0}, 593 {"mergeable", required_argument, NULL, 0}, 594 {"stats", required_argument, NULL, 0}, 595 {"dev-basename", required_argument, NULL, 0}, 596 {"zero-copy", required_argument, NULL, 0}, 597 {"rx-desc-num", required_argument, NULL, 0}, 598 {"tx-desc-num", required_argument, NULL, 0}, 599 {NULL, 0, 0, 0}, 600 }; 601 602 /* Parse command line */ 603 while ((opt = getopt_long(argc, argv, "p:P", 604 long_option, &option_index)) != EOF) { 605 switch (opt) { 606 /* Portmask */ 607 case 'p': 608 enabled_port_mask = parse_portmask(optarg); 609 if (enabled_port_mask == 0) { 610 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 611 us_vhost_usage(prgname); 612 return -1; 613 } 614 break; 615 616 case 'P': 617 promiscuous = 1; 618 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 619 ETH_VMDQ_ACCEPT_BROADCAST | 620 ETH_VMDQ_ACCEPT_MULTICAST; 621 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 622 623 break; 624 625 case 0: 626 /* Enable/disable vm2vm comms. */ 627 if (!strncmp(long_option[option_index].name, "vm2vm", 628 MAX_LONG_OPT_SZ)) { 629 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 630 if (ret == -1) { 631 RTE_LOG(INFO, VHOST_CONFIG, 632 "Invalid argument for " 633 "vm2vm [0|1|2]\n"); 634 us_vhost_usage(prgname); 635 return -1; 636 } else { 637 vm2vm_mode = (vm2vm_type)ret; 638 } 639 } 640 641 /* Enable/disable retries on RX. */ 642 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 643 ret = parse_num_opt(optarg, 1); 644 if (ret == -1) { 645 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 646 us_vhost_usage(prgname); 647 return -1; 648 } else { 649 enable_retry = ret; 650 } 651 } 652 653 /* Specify the retries delay time (in useconds) on RX. */ 654 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 655 ret = parse_num_opt(optarg, INT32_MAX); 656 if (ret == -1) { 657 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 658 us_vhost_usage(prgname); 659 return -1; 660 } else { 661 burst_rx_delay_time = ret; 662 } 663 } 664 665 /* Specify the retries number on RX. */ 666 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 667 ret = parse_num_opt(optarg, INT32_MAX); 668 if (ret == -1) { 669 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 670 us_vhost_usage(prgname); 671 return -1; 672 } else { 673 burst_rx_retry_num = ret; 674 } 675 } 676 677 /* Enable/disable RX mergeable buffers. */ 678 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 679 ret = parse_num_opt(optarg, 1); 680 if (ret == -1) { 681 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 682 us_vhost_usage(prgname); 683 return -1; 684 } else { 685 mergeable = !!ret; 686 if (ret) { 687 vmdq_conf_default.rxmode.jumbo_frame = 1; 688 vmdq_conf_default.rxmode.max_rx_pkt_len 689 = JUMBO_FRAME_MAX_SIZE; 690 } 691 } 692 } 693 694 /* Enable/disable stats. 
*/ 695 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 696 ret = parse_num_opt(optarg, INT32_MAX); 697 if (ret == -1) { 698 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 699 us_vhost_usage(prgname); 700 return -1; 701 } else { 702 enable_stats = ret; 703 } 704 } 705 706 /* Set character device basename. */ 707 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 708 if (us_vhost_parse_basename(optarg) == -1) { 709 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 710 us_vhost_usage(prgname); 711 return -1; 712 } 713 } 714 715 /* Enable/disable rx/tx zero copy. */ 716 if (!strncmp(long_option[option_index].name, 717 "zero-copy", MAX_LONG_OPT_SZ)) { 718 ret = parse_num_opt(optarg, 1); 719 if (ret == -1) { 720 RTE_LOG(INFO, VHOST_CONFIG, 721 "Invalid argument" 722 " for zero-copy [0|1]\n"); 723 us_vhost_usage(prgname); 724 return -1; 725 } else 726 zero_copy = ret; 727 728 if (zero_copy) { 729 #ifdef RTE_MBUF_REFCNT 730 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 731 "zero copy vhost APP, please " 732 "disable RTE_MBUF_REFCNT\n" 733 "in config file and then rebuild DPDK " 734 "core lib!\n" 735 "Otherwise please disable zero copy " 736 "flag in command line!\n"); 737 return -1; 738 #endif 739 } 740 } 741 742 /* Specify the descriptor number on RX. */ 743 if (!strncmp(long_option[option_index].name, 744 "rx-desc-num", MAX_LONG_OPT_SZ)) { 745 ret = parse_num_opt(optarg, MAX_RING_DESC); 746 if ((ret == -1) || (!POWEROF2(ret))) { 747 RTE_LOG(INFO, VHOST_CONFIG, 748 "Invalid argument for rx-desc-num[0-N]," 749 "power of 2 required.\n"); 750 us_vhost_usage(prgname); 751 return -1; 752 } else { 753 num_rx_descriptor = ret; 754 } 755 } 756 757 /* Specify the descriptor number on TX. */ 758 if (!strncmp(long_option[option_index].name, 759 "tx-desc-num", MAX_LONG_OPT_SZ)) { 760 ret = parse_num_opt(optarg, MAX_RING_DESC); 761 if ((ret == -1) || (!POWEROF2(ret))) { 762 RTE_LOG(INFO, VHOST_CONFIG, 763 "Invalid argument for tx-desc-num [0-N]," 764 "power of 2 required.\n"); 765 us_vhost_usage(prgname); 766 return -1; 767 } else { 768 num_tx_descriptor = ret; 769 } 770 } 771 772 break; 773 774 /* Invalid option - print options. 
*/ 775 default: 776 us_vhost_usage(prgname); 777 return -1; 778 } 779 } 780 781 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 782 if (enabled_port_mask & (1 << i)) 783 ports[num_ports++] = (uint8_t)i; 784 } 785 786 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 787 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 788 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 789 return -1; 790 } 791 792 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 793 RTE_LOG(INFO, VHOST_PORT, 794 "Vhost zero copy doesn't support software vm2vm," 795 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 796 return -1; 797 } 798 799 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 800 RTE_LOG(INFO, VHOST_PORT, 801 "Vhost zero copy doesn't support jumbo frame," 802 "please specify '--mergeable 0' to disable the " 803 "mergeable feature.\n"); 804 return -1; 805 } 806 807 return 0; 808 } 809 810 /* 811 * Update the global var NUM_PORTS and array PORTS according to system ports number 812 * and return valid ports number 813 */ 814 static unsigned check_ports_num(unsigned nb_ports) 815 { 816 unsigned valid_num_ports = num_ports; 817 unsigned portid; 818 819 if (num_ports > nb_ports) { 820 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 821 num_ports, nb_ports); 822 num_ports = nb_ports; 823 } 824 825 for (portid = 0; portid < num_ports; portid ++) { 826 if (ports[portid] >= nb_ports) { 827 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 828 ports[portid], (nb_ports - 1)); 829 ports[portid] = INVALID_PORT_ID; 830 valid_num_ports--; 831 } 832 } 833 return valid_num_ports; 834 } 835 836 /* 837 * Macro to print out packet contents. Wrapped in debug define so that the 838 * data path is not effected when debug is disabled. 839 */ 840 #ifdef DEBUG 841 #define PRINT_PACKET(device, addr, size, header) do { \ 842 char *pkt_addr = (char*)(addr); \ 843 unsigned int index; \ 844 char packet[MAX_PRINT_BUFF]; \ 845 \ 846 if ((header)) \ 847 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 848 else \ 849 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 850 for (index = 0; index < (size); index++) { \ 851 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 852 "%02hhx ", pkt_addr[index]); \ 853 } \ 854 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 855 \ 856 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 857 } while(0) 858 #else 859 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 860 #endif 861 862 /* 863 * Function to convert guest physical addresses to vhost physical addresses. 864 * This is used to convert virtio buffer addresses. 
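 * On return, *addr_type reports whether the translated buffer is physically
 * contiguous (PHYS_ADDR_CONTINUOUS), crosses a sub-region boundary
 * (PHYS_ADDR_CROSS_SUBREG), or could not be translated (PHYS_ADDR_INVALID).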
865 */ 866 static inline uint64_t __attribute__((always_inline)) 867 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 868 uint32_t buf_len, hpa_type *addr_type) 869 { 870 struct virtio_memory_regions_hpa *region; 871 uint32_t regionidx; 872 uint64_t vhost_pa = 0; 873 874 *addr_type = PHYS_ADDR_INVALID; 875 876 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 877 region = &vdev->regions_hpa[regionidx]; 878 if ((guest_pa >= region->guest_phys_address) && 879 (guest_pa <= region->guest_phys_address_end)) { 880 vhost_pa = region->host_phys_addr_offset + guest_pa; 881 if (likely((guest_pa + buf_len - 1) 882 <= region->guest_phys_address_end)) 883 *addr_type = PHYS_ADDR_CONTINUOUS; 884 else 885 *addr_type = PHYS_ADDR_CROSS_SUBREG; 886 break; 887 } 888 } 889 890 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 891 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 892 (void *)(uintptr_t)vhost_pa); 893 894 return vhost_pa; 895 } 896 897 /* 898 * Compares a packet destination MAC address to a device MAC address. 899 */ 900 static inline int __attribute__((always_inline)) 901 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 902 { 903 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 904 } 905 906 /* 907 * This function learns the MAC address of the device and registers this along with a 908 * vlan tag to a VMDQ. 909 */ 910 static int 911 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 912 { 913 struct ether_hdr *pkt_hdr; 914 struct virtio_net_data_ll *dev_ll; 915 struct virtio_net *dev = vdev->dev; 916 int i, ret; 917 918 /* Learn MAC address of guest device from packet */ 919 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 920 921 dev_ll = ll_root_used; 922 923 while (dev_ll != NULL) { 924 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 925 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 926 return -1; 927 } 928 dev_ll = dev_ll->next; 929 } 930 931 for (i = 0; i < ETHER_ADDR_LEN; i++) 932 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 933 934 /* vlan_tag currently uses the device_id. */ 935 vdev->vlan_tag = vlan_tags[dev->device_fh]; 936 937 /* Print out VMDQ registration info. */ 938 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 939 dev->device_fh, 940 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 941 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 942 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 943 vdev->vlan_tag); 944 945 /* Register the MAC address. */ 946 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 947 (uint32_t)dev->device_fh + vmdq_pool_base); 948 if (ret) 949 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 950 dev->device_fh); 951 952 /* Enable stripping of the vlan tag as we handle routing. */ 953 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1); 954 955 /* Set device as ready for RX. */ 956 vdev->ready = DEVICE_RX; 957 958 return 0; 959 } 960 961 /* 962 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 963 * queue before disabling RX on the device. 
964 */ 965 static inline void 966 unlink_vmdq(struct vhost_dev *vdev) 967 { 968 unsigned i = 0; 969 unsigned rx_count; 970 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 971 972 if (vdev->ready == DEVICE_RX) { 973 /*clear MAC and VLAN settings*/ 974 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 975 for (i = 0; i < 6; i++) 976 vdev->mac_address.addr_bytes[i] = 0; 977 978 vdev->vlan_tag = 0; 979 980 /*Clear out the receive buffers*/ 981 rx_count = rte_eth_rx_burst(ports[0], 982 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 983 984 while (rx_count) { 985 for (i = 0; i < rx_count; i++) 986 rte_pktmbuf_free(pkts_burst[i]); 987 988 rx_count = rte_eth_rx_burst(ports[0], 989 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 990 } 991 992 vdev->ready = DEVICE_MAC_LEARNING; 993 } 994 } 995 996 /* 997 * Check if the packet destination MAC address is for a local device. If so then put 998 * the packet on that devices RX queue. If not then return. 999 */ 1000 static inline int __attribute__((always_inline)) 1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1002 { 1003 struct virtio_net_data_ll *dev_ll; 1004 struct ether_hdr *pkt_hdr; 1005 uint64_t ret = 0; 1006 struct virtio_net *dev = vdev->dev; 1007 struct virtio_net *tdev; /* destination virito device */ 1008 1009 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1010 1011 /*get the used devices list*/ 1012 dev_ll = ll_root_used; 1013 1014 while (dev_ll != NULL) { 1015 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1016 &dev_ll->vdev->mac_address)) { 1017 1018 /* Drop the packet if the TX packet is destined for the TX device. */ 1019 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1020 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1021 dev->device_fh); 1022 return 0; 1023 } 1024 tdev = dev_ll->vdev->dev; 1025 1026 1027 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1028 1029 if (unlikely(dev_ll->vdev->remove)) { 1030 /*drop the packet if the device is marked for removal*/ 1031 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1032 } else { 1033 /*send the packet to the local virtio device*/ 1034 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1035 if (enable_stats) { 1036 rte_atomic64_add( 1037 &dev_statistics[tdev->device_fh].rx_total_atomic, 1038 1); 1039 rte_atomic64_add( 1040 &dev_statistics[tdev->device_fh].rx_atomic, 1041 ret); 1042 dev_statistics[tdev->device_fh].tx_total++; 1043 dev_statistics[tdev->device_fh].tx += ret; 1044 } 1045 } 1046 1047 return 0; 1048 } 1049 dev_ll = dev_ll->next; 1050 } 1051 1052 return -1; 1053 } 1054 1055 /* 1056 * Check if the destination MAC of a packet is one local VM, 1057 * and get its vlan tag, and offset if it is. 1058 */ 1059 static inline int __attribute__((always_inline)) 1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1061 uint32_t *offset, uint16_t *vlan_tag) 1062 { 1063 struct virtio_net_data_ll *dev_ll = ll_root_used; 1064 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1065 1066 while (dev_ll != NULL) { 1067 if ((dev_ll->vdev->ready == DEVICE_RX) 1068 && ether_addr_cmp(&(pkt_hdr->d_addr), 1069 &dev_ll->vdev->mac_address)) { 1070 /* 1071 * Drop the packet if the TX packet is 1072 * destined for the TX device. 
1073 */ 1074 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1075 LOG_DEBUG(VHOST_DATA, 1076 "(%"PRIu64") TX: Source and destination" 1077 " MAC addresses are the same. Dropping " 1078 "packet.\n", 1079 dev_ll->vdev->dev->device_fh); 1080 return -1; 1081 } 1082 1083 /* 1084 * HW vlan strip will reduce the packet length 1085 * by minus length of vlan tag, so need restore 1086 * the packet length by plus it. 1087 */ 1088 *offset = VLAN_HLEN; 1089 *vlan_tag = 1090 (uint16_t) 1091 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1092 1093 LOG_DEBUG(VHOST_DATA, 1094 "(%"PRIu64") TX: pkt to local VM device id:" 1095 "(%"PRIu64") vlan tag: %d.\n", 1096 dev->device_fh, dev_ll->vdev->dev->device_fh, 1097 vlan_tag); 1098 1099 break; 1100 } 1101 dev_ll = dev_ll->next; 1102 } 1103 return 0; 1104 } 1105 1106 /* 1107 * This function routes the TX packet to the correct interface. This may be a local device 1108 * or the physical port. 1109 */ 1110 static inline void __attribute__((always_inline)) 1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1112 { 1113 struct mbuf_table *tx_q; 1114 struct rte_mbuf **m_table; 1115 unsigned len, ret, offset = 0; 1116 const uint16_t lcore_id = rte_lcore_id(); 1117 struct virtio_net *dev = vdev->dev; 1118 struct ether_hdr *nh; 1119 1120 /*check if destination is local VM*/ 1121 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1122 rte_pktmbuf_free(m); 1123 return; 1124 } 1125 1126 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1127 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1128 rte_pktmbuf_free(m); 1129 return; 1130 } 1131 } 1132 1133 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1134 1135 /*Add packet to the port tx queue*/ 1136 tx_q = &lcore_tx_queue[lcore_id]; 1137 len = tx_q->len; 1138 1139 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1140 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1141 /* Guest has inserted the vlan tag. */ 1142 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1143 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1144 if ((vm2vm_mode == VM2VM_HARDWARE) && 1145 (vh->vlan_tci != vlan_tag_be)) 1146 vh->vlan_tci = vlan_tag_be; 1147 } else { 1148 m->ol_flags = PKT_TX_VLAN_PKT; 1149 1150 /* 1151 * Find the right seg to adjust the data len when offset is 1152 * bigger than tail room size. 1153 */ 1154 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1155 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1156 m->data_len += offset; 1157 else { 1158 struct rte_mbuf *seg = m; 1159 1160 while ((seg->next != NULL) && 1161 (offset > rte_pktmbuf_tailroom(seg))) 1162 seg = seg->next; 1163 1164 seg->data_len += offset; 1165 } 1166 m->pkt_len += offset; 1167 } 1168 1169 m->vlan_tci = vlan_tag; 1170 } 1171 1172 tx_q->m_table[len] = m; 1173 len++; 1174 if (enable_stats) { 1175 dev_statistics[dev->device_fh].tx_total++; 1176 dev_statistics[dev->device_fh].tx++; 1177 } 1178 1179 if (unlikely(len == MAX_PKT_BURST)) { 1180 m_table = (struct rte_mbuf **)tx_q->m_table; 1181 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1182 /* Free any buffers not handled by TX and update the port stats. */ 1183 if (unlikely(ret < len)) { 1184 do { 1185 rte_pktmbuf_free(m_table[ret]); 1186 } while (++ret < len); 1187 } 1188 1189 len = 0; 1190 } 1191 1192 tx_q->len = len; 1193 return; 1194 } 1195 /* 1196 * This function is called by each data core. It handles all RX/TX registered with the 1197 * core. 
For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1198 * with all devices in the main linked list. 1199 */ 1200 static int 1201 switch_worker(__attribute__((unused)) void *arg) 1202 { 1203 struct rte_mempool *mbuf_pool = arg; 1204 struct virtio_net *dev = NULL; 1205 struct vhost_dev *vdev = NULL; 1206 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1207 struct virtio_net_data_ll *dev_ll; 1208 struct mbuf_table *tx_q; 1209 volatile struct lcore_ll_info *lcore_ll; 1210 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1211 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1212 unsigned ret, i; 1213 const uint16_t lcore_id = rte_lcore_id(); 1214 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1215 uint16_t rx_count = 0; 1216 uint16_t tx_count; 1217 uint32_t retry = 0; 1218 1219 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1220 lcore_ll = lcore_info[lcore_id].lcore_ll; 1221 prev_tsc = 0; 1222 1223 tx_q = &lcore_tx_queue[lcore_id]; 1224 for (i = 0; i < num_cores; i ++) { 1225 if (lcore_ids[i] == lcore_id) { 1226 tx_q->txq_id = i; 1227 break; 1228 } 1229 } 1230 1231 while(1) { 1232 cur_tsc = rte_rdtsc(); 1233 /* 1234 * TX burst queue drain 1235 */ 1236 diff_tsc = cur_tsc - prev_tsc; 1237 if (unlikely(diff_tsc > drain_tsc)) { 1238 1239 if (tx_q->len) { 1240 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1241 1242 /*Tx any packets in the queue*/ 1243 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1244 (struct rte_mbuf **)tx_q->m_table, 1245 (uint16_t)tx_q->len); 1246 if (unlikely(ret < tx_q->len)) { 1247 do { 1248 rte_pktmbuf_free(tx_q->m_table[ret]); 1249 } while (++ret < tx_q->len); 1250 } 1251 1252 tx_q->len = 0; 1253 } 1254 1255 prev_tsc = cur_tsc; 1256 1257 } 1258 1259 rte_prefetch0(lcore_ll->ll_root_used); 1260 /* 1261 * Inform the configuration core that we have exited the linked list and that no devices are 1262 * in use if requested. 
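 * The request/acknowledge handshake is done through lcore_ll->dev_removal_flag
 * (REQUEST_DEV_REMOVAL / ACK_DEV_REMOVAL).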
1263 */ 1264 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1265 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1266 1267 /* 1268 * Process devices 1269 */ 1270 dev_ll = lcore_ll->ll_root_used; 1271 1272 while (dev_ll != NULL) { 1273 /*get virtio device ID*/ 1274 vdev = dev_ll->vdev; 1275 dev = vdev->dev; 1276 1277 if (unlikely(vdev->remove)) { 1278 dev_ll = dev_ll->next; 1279 unlink_vmdq(vdev); 1280 vdev->ready = DEVICE_SAFE_REMOVE; 1281 continue; 1282 } 1283 if (likely(vdev->ready == DEVICE_RX)) { 1284 /*Handle guest RX*/ 1285 rx_count = rte_eth_rx_burst(ports[0], 1286 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1287 1288 if (rx_count) { 1289 /* 1290 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1291 * Here MAX_PKT_BURST must be less than virtio queue size 1292 */ 1293 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1294 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1295 rte_delay_us(burst_rx_delay_time); 1296 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1297 break; 1298 } 1299 } 1300 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1301 if (enable_stats) { 1302 rte_atomic64_add( 1303 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1304 rx_count); 1305 rte_atomic64_add( 1306 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1307 } 1308 while (likely(rx_count)) { 1309 rx_count--; 1310 rte_pktmbuf_free(pkts_burst[rx_count]); 1311 } 1312 1313 } 1314 } 1315 1316 if (likely(!vdev->remove)) { 1317 /* Handle guest TX*/ 1318 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1319 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1320 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1321 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1322 while (tx_count) 1323 rte_pktmbuf_free(pkts_burst[--tx_count]); 1324 } 1325 } 1326 while (tx_count) 1327 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1328 } 1329 1330 /*move to the next device in the list*/ 1331 dev_ll = dev_ll->next; 1332 } 1333 } 1334 1335 return 0; 1336 } 1337 1338 /* 1339 * This function gets available ring number for zero copy rx. 1340 * Only one thread will call this funciton for a paticular virtio device, 1341 * so, it is designed as non-thread-safe function. 1342 */ 1343 static inline uint32_t __attribute__((always_inline)) 1344 get_available_ring_num_zcp(struct virtio_net *dev) 1345 { 1346 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1347 uint16_t avail_idx; 1348 1349 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1350 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1351 } 1352 1353 /* 1354 * This function gets available ring index for zero copy rx, 1355 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1356 * Only one thread will call this funciton for a paticular virtio device, 1357 * so, it is designed as non-thread-safe function. 
1358 */ 1359 static inline uint32_t __attribute__((always_inline)) 1360 get_available_ring_index_zcp(struct virtio_net *dev, 1361 uint16_t *res_base_idx, uint32_t count) 1362 { 1363 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1364 uint16_t avail_idx; 1365 uint32_t retry = 0; 1366 uint16_t free_entries; 1367 1368 *res_base_idx = vq->last_used_idx_res; 1369 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1370 free_entries = (avail_idx - *res_base_idx); 1371 1372 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1373 "avail idx: %d, " 1374 "res base idx:%d, free entries:%d\n", 1375 dev->device_fh, avail_idx, *res_base_idx, 1376 free_entries); 1377 1378 /* 1379 * If retry is enabled and the queue is full then we wait 1380 * and retry to avoid packet loss. 1381 */ 1382 if (enable_retry && unlikely(count > free_entries)) { 1383 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1384 rte_delay_us(burst_rx_delay_time); 1385 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1386 free_entries = (avail_idx - *res_base_idx); 1387 if (count <= free_entries) 1388 break; 1389 } 1390 } 1391 1392 /*check that we have enough buffers*/ 1393 if (unlikely(count > free_entries)) 1394 count = free_entries; 1395 1396 if (unlikely(count == 0)) { 1397 LOG_DEBUG(VHOST_DATA, 1398 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1399 "avail idx: %d, res base idx:%d, free entries:%d\n", 1400 dev->device_fh, avail_idx, 1401 *res_base_idx, free_entries); 1402 return 0; 1403 } 1404 1405 vq->last_used_idx_res = *res_base_idx + count; 1406 1407 return count; 1408 } 1409 1410 /* 1411 * This function put descriptor back to used list. 1412 */ 1413 static inline void __attribute__((always_inline)) 1414 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1415 { 1416 uint16_t res_cur_idx = vq->last_used_idx; 1417 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1418 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1419 rte_compiler_barrier(); 1420 *(volatile uint16_t *)&vq->used->idx += 1; 1421 vq->last_used_idx += 1; 1422 1423 /* Kick the guest if necessary. */ 1424 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1425 eventfd_write((int)vq->kickfd, 1); 1426 } 1427 1428 /* 1429 * This function get available descriptor from vitio vring and un-attached mbuf 1430 * from vpool->ring, and then attach them together. It needs adjust the offset 1431 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1432 * frame data may be put to wrong location in mbuf. 
1433 */ 1434 static inline void __attribute__((always_inline)) 1435 attach_rxmbuf_zcp(struct virtio_net *dev) 1436 { 1437 uint16_t res_base_idx, desc_idx; 1438 uint64_t buff_addr, phys_addr; 1439 struct vhost_virtqueue *vq; 1440 struct vring_desc *desc; 1441 struct rte_mbuf *mbuf = NULL; 1442 struct vpool *vpool; 1443 hpa_type addr_type; 1444 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1445 1446 vpool = &vpool_array[vdev->vmdq_rx_q]; 1447 vq = dev->virtqueue[VIRTIO_RXQ]; 1448 1449 do { 1450 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1451 1) != 1)) 1452 return; 1453 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1454 1455 desc = &vq->desc[desc_idx]; 1456 if (desc->flags & VRING_DESC_F_NEXT) { 1457 desc = &vq->desc[desc->next]; 1458 buff_addr = gpa_to_vva(dev, desc->addr); 1459 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1460 &addr_type); 1461 } else { 1462 buff_addr = gpa_to_vva(dev, 1463 desc->addr + vq->vhost_hlen); 1464 phys_addr = gpa_to_hpa(vdev, 1465 desc->addr + vq->vhost_hlen, 1466 desc->len, &addr_type); 1467 } 1468 1469 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1470 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1471 " address found when attaching RX frame buffer" 1472 " address!\n", dev->device_fh); 1473 put_desc_to_used_list_zcp(vq, desc_idx); 1474 continue; 1475 } 1476 1477 /* 1478 * Check if the frame buffer address from guest crosses 1479 * sub-region or not. 1480 */ 1481 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1482 RTE_LOG(ERR, VHOST_DATA, 1483 "(%"PRIu64") Frame buffer address cross " 1484 "sub-regioin found when attaching RX frame " 1485 "buffer address!\n", 1486 dev->device_fh); 1487 put_desc_to_used_list_zcp(vq, desc_idx); 1488 continue; 1489 } 1490 } while (unlikely(phys_addr == 0)); 1491 1492 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1493 if (unlikely(mbuf == NULL)) { 1494 LOG_DEBUG(VHOST_DATA, 1495 "(%"PRIu64") in attach_rxmbuf_zcp: " 1496 "ring_sc_dequeue fail.\n", 1497 dev->device_fh); 1498 put_desc_to_used_list_zcp(vq, desc_idx); 1499 return; 1500 } 1501 1502 if (unlikely(vpool->buf_size > desc->len)) { 1503 LOG_DEBUG(VHOST_DATA, 1504 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1505 "length(%d) of descriptor idx: %d less than room " 1506 "size required: %d\n", 1507 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1508 put_desc_to_used_list_zcp(vq, desc_idx); 1509 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1510 return; 1511 } 1512 1513 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1514 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1515 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1516 mbuf->data_len = desc->len; 1517 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1518 1519 LOG_DEBUG(VHOST_DATA, 1520 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1521 "descriptor idx:%d\n", 1522 dev->device_fh, res_base_idx, desc_idx); 1523 1524 __rte_mbuf_raw_free(mbuf); 1525 1526 return; 1527 } 1528 1529 /* 1530 * Detach an attched packet mbuf - 1531 * - restore original mbuf address and length values. 1532 * - reset pktmbuf data and data_len to their default values. 1533 * All other fields of the given packet mbuf will be left intact. 1534 * 1535 * @param m 1536 * The attached packet mbuf. 
1537 */ 1538 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1539 { 1540 const struct rte_mempool *mp = m->pool; 1541 void *buf = RTE_MBUF_TO_BADDR(m); 1542 uint32_t buf_ofs; 1543 uint32_t buf_len = mp->elt_size - sizeof(*m); 1544 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1545 1546 m->buf_addr = buf; 1547 m->buf_len = (uint16_t)buf_len; 1548 1549 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1550 RTE_PKTMBUF_HEADROOM : m->buf_len; 1551 m->data_off = buf_ofs; 1552 1553 m->data_len = 0; 1554 } 1555 1556 /* 1557 * This function is called after packets have been transimited. It fetchs mbuf 1558 * from vpool->pool, detached it and put into vpool->ring. It also update the 1559 * used index and kick the guest if necessary. 1560 */ 1561 static inline uint32_t __attribute__((always_inline)) 1562 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1563 { 1564 struct rte_mbuf *mbuf; 1565 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1566 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1567 uint32_t index = 0; 1568 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1569 1570 LOG_DEBUG(VHOST_DATA, 1571 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1572 "clean is: %d\n", 1573 dev->device_fh, mbuf_count); 1574 LOG_DEBUG(VHOST_DATA, 1575 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1576 "clean is : %d\n", 1577 dev->device_fh, rte_ring_count(vpool->ring)); 1578 1579 for (index = 0; index < mbuf_count; index++) { 1580 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1581 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1582 pktmbuf_detach_zcp(mbuf); 1583 rte_ring_sp_enqueue(vpool->ring, mbuf); 1584 1585 /* Update used index buffer information. */ 1586 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1587 vq->used->ring[used_idx].len = 0; 1588 1589 used_idx = (used_idx + 1) & (vq->size - 1); 1590 } 1591 1592 LOG_DEBUG(VHOST_DATA, 1593 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1594 "clean is: %d\n", 1595 dev->device_fh, rte_mempool_count(vpool->pool)); 1596 LOG_DEBUG(VHOST_DATA, 1597 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1598 "clean is : %d\n", 1599 dev->device_fh, rte_ring_count(vpool->ring)); 1600 LOG_DEBUG(VHOST_DATA, 1601 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1602 "vq->last_used_idx:%d\n", 1603 dev->device_fh, vq->last_used_idx); 1604 1605 vq->last_used_idx += mbuf_count; 1606 1607 LOG_DEBUG(VHOST_DATA, 1608 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1609 "vq->last_used_idx:%d\n", 1610 dev->device_fh, vq->last_used_idx); 1611 1612 rte_compiler_barrier(); 1613 1614 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1615 1616 /* Kick guest if required. */ 1617 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1618 eventfd_write((int)vq->kickfd, 1); 1619 1620 return 0; 1621 } 1622 1623 /* 1624 * This function is called when a virtio device is destroy. 1625 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1626 */ 1627 static void mbuf_destroy_zcp(struct vpool *vpool) 1628 { 1629 struct rte_mbuf *mbuf = NULL; 1630 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1631 1632 LOG_DEBUG(VHOST_CONFIG, 1633 "in mbuf_destroy_zcp: mbuf count in mempool before " 1634 "mbuf_destroy_zcp is: %d\n", 1635 mbuf_count); 1636 LOG_DEBUG(VHOST_CONFIG, 1637 "in mbuf_destroy_zcp: mbuf count in ring before " 1638 "mbuf_destroy_zcp is : %d\n", 1639 rte_ring_count(vpool->ring)); 1640 1641 for (index = 0; index < mbuf_count; index++) { 1642 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1643 if (likely(mbuf != NULL)) { 1644 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1645 pktmbuf_detach_zcp(mbuf); 1646 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1647 } 1648 } 1649 1650 LOG_DEBUG(VHOST_CONFIG, 1651 "in mbuf_destroy_zcp: mbuf count in mempool after " 1652 "mbuf_destroy_zcp is: %d\n", 1653 rte_mempool_count(vpool->pool)); 1654 LOG_DEBUG(VHOST_CONFIG, 1655 "in mbuf_destroy_zcp: mbuf count in ring after " 1656 "mbuf_destroy_zcp is : %d\n", 1657 rte_ring_count(vpool->ring)); 1658 } 1659 1660 /* 1661 * This function update the use flag and counter. 1662 */ 1663 static inline uint32_t __attribute__((always_inline)) 1664 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1665 uint32_t count) 1666 { 1667 struct vhost_virtqueue *vq; 1668 struct vring_desc *desc; 1669 struct rte_mbuf *buff; 1670 /* The virtio_hdr is initialised to 0. */ 1671 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1672 = {{0, 0, 0, 0, 0, 0}, 0}; 1673 uint64_t buff_hdr_addr = 0; 1674 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1675 uint32_t head_idx, packet_success = 0; 1676 uint16_t res_cur_idx; 1677 1678 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1679 1680 if (count == 0) 1681 return 0; 1682 1683 vq = dev->virtqueue[VIRTIO_RXQ]; 1684 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1685 1686 res_cur_idx = vq->last_used_idx; 1687 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1688 dev->device_fh, res_cur_idx, res_cur_idx + count); 1689 1690 /* Retrieve all of the head indexes first to avoid caching issues. */ 1691 for (head_idx = 0; head_idx < count; head_idx++) 1692 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1693 1694 /*Prefetch descriptor index. */ 1695 rte_prefetch0(&vq->desc[head[packet_success]]); 1696 1697 while (packet_success != count) { 1698 /* Get descriptor from available ring */ 1699 desc = &vq->desc[head[packet_success]]; 1700 1701 buff = pkts[packet_success]; 1702 LOG_DEBUG(VHOST_DATA, 1703 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1704 "pkt[%d] descriptor idx: %d\n", 1705 dev->device_fh, packet_success, 1706 MBUF_HEADROOM_UINT32(buff)); 1707 1708 PRINT_PACKET(dev, 1709 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1710 + RTE_PKTMBUF_HEADROOM), 1711 rte_pktmbuf_data_len(buff), 0); 1712 1713 /* Buffer address translation for virtio header. */ 1714 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1715 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1716 1717 /* 1718 * If the descriptors are chained the header and data are 1719 * placed in separate buffers. 
1720 */ 1721 if (desc->flags & VRING_DESC_F_NEXT) { 1722 desc->len = vq->vhost_hlen; 1723 desc = &vq->desc[desc->next]; 1724 desc->len = rte_pktmbuf_data_len(buff); 1725 } else { 1726 desc->len = packet_len; 1727 } 1728 1729 /* Update used ring with desc information */ 1730 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1731 = head[packet_success]; 1732 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1733 = packet_len; 1734 res_cur_idx++; 1735 packet_success++; 1736 1737 /* A header is required per buffer. */ 1738 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1739 (const void *)&virtio_hdr, vq->vhost_hlen); 1740 1741 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1742 1743 if (likely(packet_success < count)) { 1744 /* Prefetch descriptor index. */ 1745 rte_prefetch0(&vq->desc[head[packet_success]]); 1746 } 1747 } 1748 1749 rte_compiler_barrier(); 1750 1751 LOG_DEBUG(VHOST_DATA, 1752 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1753 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1754 dev->device_fh, vq->last_used_idx, vq->used->idx); 1755 1756 *(volatile uint16_t *)&vq->used->idx += count; 1757 vq->last_used_idx += count; 1758 1759 LOG_DEBUG(VHOST_DATA, 1760 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1761 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1762 dev->device_fh, vq->last_used_idx, vq->used->idx); 1763 1764 /* Kick the guest if necessary. */ 1765 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1766 eventfd_write((int)vq->kickfd, 1); 1767 1768 return count; 1769 } 1770 1771 /* 1772 * This function routes the TX packet to the correct interface. 1773 * This may be a local device or the physical port. 1774 */ 1775 static inline void __attribute__((always_inline)) 1776 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1777 uint32_t desc_idx, uint8_t need_copy) 1778 { 1779 struct mbuf_table *tx_q; 1780 struct rte_mbuf **m_table; 1781 struct rte_mbuf *mbuf = NULL; 1782 unsigned len, ret, offset = 0; 1783 struct vpool *vpool; 1784 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1785 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1786 1787 /*Add packet to the port tx queue*/ 1788 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1789 len = tx_q->len; 1790 1791 /* Allocate an mbuf and populate the structure. */ 1792 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1793 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1794 if (unlikely(mbuf == NULL)) { 1795 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1796 RTE_LOG(ERR, VHOST_DATA, 1797 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1798 dev->device_fh); 1799 put_desc_to_used_list_zcp(vq, desc_idx); 1800 return; 1801 } 1802 1803 if (vm2vm_mode == VM2VM_HARDWARE) { 1804 /* Avoid using a vlan tag from any vm for external pkt, such as 1805 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1806 * selection, MAC address determines it as an external pkt 1807 * which should go to network, while vlan tag determine it as 1808 * a vm2vm pkt should forward to another vm. Hardware confuse 1809 * such a ambiguous situation, so pkt will lost. 
1810 */ 1811 vlan_tag = external_pkt_default_vlan_tag; 1812 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1813 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1814 __rte_mbuf_raw_free(mbuf); 1815 return; 1816 } 1817 } 1818 1819 mbuf->nb_segs = m->nb_segs; 1820 mbuf->next = m->next; 1821 mbuf->data_len = m->data_len + offset; 1822 mbuf->pkt_len = mbuf->data_len; 1823 if (unlikely(need_copy)) { 1824 /* Copy the packet contents to the mbuf. */ 1825 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1826 rte_pktmbuf_mtod(m, void *), 1827 m->data_len); 1828 } else { 1829 mbuf->data_off = m->data_off; 1830 mbuf->buf_physaddr = m->buf_physaddr; 1831 mbuf->buf_addr = m->buf_addr; 1832 } 1833 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1834 mbuf->vlan_tci = vlan_tag; 1835 mbuf->l2_len = sizeof(struct ether_hdr); 1836 mbuf->l3_len = sizeof(struct ipv4_hdr); 1837 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1838 1839 tx_q->m_table[len] = mbuf; 1840 len++; 1841 1842 LOG_DEBUG(VHOST_DATA, 1843 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1844 dev->device_fh, 1845 mbuf->nb_segs, 1846 (mbuf->next == NULL) ? "null" : "non-null"); 1847 1848 if (enable_stats) { 1849 dev_statistics[dev->device_fh].tx_total++; 1850 dev_statistics[dev->device_fh].tx++; 1851 } 1852 1853 if (unlikely(len == MAX_PKT_BURST)) { 1854 m_table = (struct rte_mbuf **)tx_q->m_table; 1855 ret = rte_eth_tx_burst(ports[0], 1856 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1857 1858 /* 1859 * Free any buffers not handled by TX and update 1860 * the port stats. 1861 */ 1862 if (unlikely(ret < len)) { 1863 do { 1864 rte_pktmbuf_free(m_table[ret]); 1865 } while (++ret < len); 1866 } 1867 1868 len = 0; 1869 txmbuf_clean_zcp(dev, vpool); 1870 } 1871 1872 tx_q->len = len; 1873 1874 return; 1875 } 1876 1877 /* 1878 * This function TX all available packets in virtio TX queue for one 1879 * virtio-net device. If it is first packet, it learns MAC address and 1880 * setup VMDQ. 1881 */ 1882 static inline void __attribute__((always_inline)) 1883 virtio_dev_tx_zcp(struct virtio_net *dev) 1884 { 1885 struct rte_mbuf m; 1886 struct vhost_virtqueue *vq; 1887 struct vring_desc *desc; 1888 uint64_t buff_addr = 0, phys_addr; 1889 uint32_t head[MAX_PKT_BURST]; 1890 uint32_t i; 1891 uint16_t free_entries, packet_success = 0; 1892 uint16_t avail_idx; 1893 uint8_t need_copy = 0; 1894 hpa_type addr_type; 1895 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1896 1897 vq = dev->virtqueue[VIRTIO_TXQ]; 1898 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1899 1900 /* If there are no available buffers then return. */ 1901 if (vq->last_used_idx_res == avail_idx) 1902 return; 1903 1904 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1905 1906 /* Prefetch available ring to retrieve head indexes. */ 1907 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1908 1909 /* Get the number of free entries in the ring */ 1910 free_entries = (avail_idx - vq->last_used_idx_res); 1911 1912 /* Limit to MAX_PKT_BURST. */ 1913 free_entries 1914 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1915 1916 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1917 dev->device_fh, free_entries); 1918 1919 /* Retrieve all of the head indexes first to avoid caching issues. */ 1920 for (i = 0; i < free_entries; i++) 1921 head[i] 1922 = vq->avail->ring[(vq->last_used_idx_res + i) 1923 & (vq->size - 1)]; 1924 1925 vq->last_used_idx_res += free_entries; 1926 1927 /* Prefetch descriptor index. 
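 * Each available descriptor is wrapped in a stack-allocated dummy mbuf; the
 * payload is only copied into a real mbuf later, in virtio_tx_route_zcp(),
 * when the guest buffer cannot be attached directly (for example when it
 * crosses a host physical sub-region).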
 */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

    while (packet_success < free_entries) {
        desc = &vq->desc[head[packet_success]];

        /* Discard the first buffer as it holds the virtio header. */
        desc = &vq->desc[desc->next];

        /* Buffer address translation. */
        buff_addr = gpa_to_vva(dev, desc->addr);
        /* Check an extra VLAN_HLEN bytes to allow for VLAN tag insertion. */
        phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
            &addr_type);

        if (likely(packet_success < (free_entries - 1)))
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success + 1]]);

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Invalid frame buffer address found "
                "when transmitting packets!\n",
                dev->device_fh);
            packet_success++;
            continue;
        }

        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        /*
         * Set up a dummy mbuf. It is copied into a real mbuf only if the
         * packet is transmitted out of the physical port.
         */
        m.data_len = desc->len;
        m.nb_segs = 1;
        m.next = NULL;
        m.data_off = 0;
        m.buf_addr = (void *)(uintptr_t)buff_addr;
        m.buf_physaddr = phys_addr;

        /*
         * Check whether the frame buffer address from the guest crosses a
         * sub-region boundary.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Frame buffer address crosses a "
                "sub-region boundary when attaching the TX "
                "frame buffer address!\n",
                dev->device_fh);
            need_copy = 1;
        } else
            need_copy = 0;

        PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

        /*
         * If this is the first received packet we need to learn
         * the MAC address and set up VMDQ.
         */
        if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
            if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
                /*
                 * Discard the frame if the device is scheduled for
                 * removal or a duplicate MAC address is found.
                 */
                packet_success += free_entries;
                vq->last_used_idx += packet_success;
                break;
            }
        }

        virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
        packet_success++;
    }
}

/*
 * This function is called by each data core. It handles all RX/TX queues
 * registered with the core. For TX the core-specific lcore linked list is
 * used. For RX, MAC addresses are compared with all devices in the main
 * linked list.
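 * In outline, each loop iteration: drains the zero-copy TX queues once the
 * drain timeout expires, re-arms guest RX buffers via attach_rxmbuf_zcp(),
 * polls the physical port with rte_eth_rx_burst() and forwards to the guest
 * with virtio_dev_rx_zcp(), then services the guest TX ring with
 * virtio_dev_tx_zcp().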
2011 */ 2012 static int 2013 switch_worker_zcp(__attribute__((unused)) void *arg) 2014 { 2015 struct virtio_net *dev = NULL; 2016 struct vhost_dev *vdev = NULL; 2017 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2018 struct virtio_net_data_ll *dev_ll; 2019 struct mbuf_table *tx_q; 2020 volatile struct lcore_ll_info *lcore_ll; 2021 const uint64_t drain_tsc 2022 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2023 * BURST_TX_DRAIN_US; 2024 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2025 unsigned ret; 2026 const uint16_t lcore_id = rte_lcore_id(); 2027 uint16_t count_in_ring, rx_count = 0; 2028 2029 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2030 2031 lcore_ll = lcore_info[lcore_id].lcore_ll; 2032 prev_tsc = 0; 2033 2034 while (1) { 2035 cur_tsc = rte_rdtsc(); 2036 2037 /* TX burst queue drain */ 2038 diff_tsc = cur_tsc - prev_tsc; 2039 if (unlikely(diff_tsc > drain_tsc)) { 2040 /* 2041 * Get mbuf from vpool.pool and detach mbuf and 2042 * put back into vpool.ring. 2043 */ 2044 dev_ll = lcore_ll->ll_root_used; 2045 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2046 /* Get virtio device ID */ 2047 vdev = dev_ll->vdev; 2048 dev = vdev->dev; 2049 2050 if (likely(!vdev->remove)) { 2051 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2052 if (tx_q->len) { 2053 LOG_DEBUG(VHOST_DATA, 2054 "TX queue drained after timeout" 2055 " with burst size %u\n", 2056 tx_q->len); 2057 2058 /* 2059 * Tx any packets in the queue 2060 */ 2061 ret = rte_eth_tx_burst( 2062 ports[0], 2063 (uint16_t)tx_q->txq_id, 2064 (struct rte_mbuf **) 2065 tx_q->m_table, 2066 (uint16_t)tx_q->len); 2067 if (unlikely(ret < tx_q->len)) { 2068 do { 2069 rte_pktmbuf_free( 2070 tx_q->m_table[ret]); 2071 } while (++ret < tx_q->len); 2072 } 2073 tx_q->len = 0; 2074 2075 txmbuf_clean_zcp(dev, 2076 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2077 } 2078 } 2079 dev_ll = dev_ll->next; 2080 } 2081 prev_tsc = cur_tsc; 2082 } 2083 2084 rte_prefetch0(lcore_ll->ll_root_used); 2085 2086 /* 2087 * Inform the configuration core that we have exited the linked 2088 * list and that no devices are in use if requested. 2089 */ 2090 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2091 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2092 2093 /* Process devices */ 2094 dev_ll = lcore_ll->ll_root_used; 2095 2096 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2097 vdev = dev_ll->vdev; 2098 dev = vdev->dev; 2099 if (unlikely(vdev->remove)) { 2100 dev_ll = dev_ll->next; 2101 unlink_vmdq(vdev); 2102 vdev->ready = DEVICE_SAFE_REMOVE; 2103 continue; 2104 } 2105 2106 if (likely(vdev->ready == DEVICE_RX)) { 2107 uint32_t index = vdev->vmdq_rx_q; 2108 uint16_t i; 2109 count_in_ring 2110 = rte_ring_count(vpool_array[index].ring); 2111 uint16_t free_entries 2112 = (uint16_t)get_available_ring_num_zcp(dev); 2113 2114 /* 2115 * Attach all mbufs in vpool.ring and put back 2116 * into vpool.pool. 
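 * The number attached per pass is bounded by the mbufs currently in the
 * ring, the free entries in the guest avail ring, and MAX_PKT_BURST, which
 * also caps the rte_eth_rx_burst() call that follows.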
2117 */ 2118 for (i = 0; 2119 i < RTE_MIN(free_entries, 2120 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2121 i++) 2122 attach_rxmbuf_zcp(dev); 2123 2124 /* Handle guest RX */ 2125 rx_count = rte_eth_rx_burst(ports[0], 2126 vdev->vmdq_rx_q, pkts_burst, 2127 MAX_PKT_BURST); 2128 2129 if (rx_count) { 2130 ret_count = virtio_dev_rx_zcp(dev, 2131 pkts_burst, rx_count); 2132 if (enable_stats) { 2133 dev_statistics[dev->device_fh].rx_total 2134 += rx_count; 2135 dev_statistics[dev->device_fh].rx 2136 += ret_count; 2137 } 2138 while (likely(rx_count)) { 2139 rx_count--; 2140 pktmbuf_detach_zcp( 2141 pkts_burst[rx_count]); 2142 rte_ring_sp_enqueue( 2143 vpool_array[index].ring, 2144 (void *)pkts_burst[rx_count]); 2145 } 2146 } 2147 } 2148 2149 if (likely(!vdev->remove)) 2150 /* Handle guest TX */ 2151 virtio_dev_tx_zcp(dev); 2152 2153 /* Move to the next device in the list */ 2154 dev_ll = dev_ll->next; 2155 } 2156 } 2157 2158 return 0; 2159 } 2160 2161 2162 /* 2163 * Add an entry to a used linked list. A free entry must first be found 2164 * in the free linked list using get_data_ll_free_entry(); 2165 */ 2166 static void 2167 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2168 struct virtio_net_data_ll *ll_dev) 2169 { 2170 struct virtio_net_data_ll *ll = *ll_root_addr; 2171 2172 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2173 ll_dev->next = NULL; 2174 rte_compiler_barrier(); 2175 2176 /* If ll == NULL then this is the first device. */ 2177 if (ll) { 2178 /* Increment to the tail of the linked list. */ 2179 while ((ll->next != NULL) ) 2180 ll = ll->next; 2181 2182 ll->next = ll_dev; 2183 } else { 2184 *ll_root_addr = ll_dev; 2185 } 2186 } 2187 2188 /* 2189 * Remove an entry from a used linked list. The entry must then be added to 2190 * the free linked list using put_data_ll_free_entry(). 2191 */ 2192 static void 2193 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2194 struct virtio_net_data_ll *ll_dev, 2195 struct virtio_net_data_ll *ll_dev_last) 2196 { 2197 struct virtio_net_data_ll *ll = *ll_root_addr; 2198 2199 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2200 return; 2201 2202 if (ll_dev == ll) 2203 *ll_root_addr = ll_dev->next; 2204 else 2205 if (likely(ll_dev_last != NULL)) 2206 ll_dev_last->next = ll_dev->next; 2207 else 2208 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2209 } 2210 2211 /* 2212 * Find and return an entry from the free linked list. 2213 */ 2214 static struct virtio_net_data_ll * 2215 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2216 { 2217 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2218 struct virtio_net_data_ll *ll_dev; 2219 2220 if (ll_free == NULL) 2221 return NULL; 2222 2223 ll_dev = ll_free; 2224 *ll_root_addr = ll_free->next; 2225 2226 return ll_dev; 2227 } 2228 2229 /* 2230 * Place an entry back on to the free linked list. 2231 */ 2232 static void 2233 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2234 struct virtio_net_data_ll *ll_dev) 2235 { 2236 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2237 2238 if (ll_dev == NULL) 2239 return; 2240 2241 ll_dev->next = ll_free; 2242 *ll_root_addr = ll_dev; 2243 } 2244 2245 /* 2246 * Creates a linked list of a given size. 2247 */ 2248 static struct virtio_net_data_ll * 2249 alloc_data_ll(uint32_t size) 2250 { 2251 struct virtio_net_data_ll *ll_new; 2252 uint32_t i; 2253 2254 /* Malloc and then chain the linked list. 
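 * The result is a single array whose entries are pre-linked, in effect:
 *
 *   ll_new[0] -> ll_new[1] -> ... -> ll_new[size - 1] -> NULL
 *
 * so later get/put operations on the free list never allocate.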
*/ 2255 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2256 if (ll_new == NULL) { 2257 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2258 return NULL; 2259 } 2260 2261 for (i = 0; i < size - 1; i++) { 2262 ll_new[i].vdev = NULL; 2263 ll_new[i].next = &ll_new[i+1]; 2264 } 2265 ll_new[i].next = NULL; 2266 2267 return (ll_new); 2268 } 2269 2270 /* 2271 * Create the main linked list along with each individual cores linked list. A used and a free list 2272 * are created to manage entries. 2273 */ 2274 static int 2275 init_data_ll (void) 2276 { 2277 int lcore; 2278 2279 RTE_LCORE_FOREACH_SLAVE(lcore) { 2280 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2281 if (lcore_info[lcore].lcore_ll == NULL) { 2282 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2283 return -1; 2284 } 2285 2286 lcore_info[lcore].lcore_ll->device_num = 0; 2287 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2288 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2289 if (num_devices % num_switching_cores) 2290 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2291 else 2292 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2293 } 2294 2295 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2296 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2297 2298 return 0; 2299 } 2300 2301 /* 2302 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2303 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2304 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2305 */ 2306 static void 2307 destroy_device (volatile struct virtio_net *dev) 2308 { 2309 struct virtio_net_data_ll *ll_lcore_dev_cur; 2310 struct virtio_net_data_ll *ll_main_dev_cur; 2311 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2312 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2313 struct vhost_dev *vdev; 2314 int lcore; 2315 2316 dev->flags &= ~VIRTIO_DEV_RUNNING; 2317 2318 vdev = (struct vhost_dev *)dev->priv; 2319 /*set the remove flag. */ 2320 vdev->remove = 1; 2321 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2322 rte_pause(); 2323 } 2324 2325 /* Search for entry to be removed from lcore ll */ 2326 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2327 while (ll_lcore_dev_cur != NULL) { 2328 if (ll_lcore_dev_cur->vdev == vdev) { 2329 break; 2330 } else { 2331 ll_lcore_dev_last = ll_lcore_dev_cur; 2332 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2333 } 2334 } 2335 2336 if (ll_lcore_dev_cur == NULL) { 2337 RTE_LOG(ERR, VHOST_CONFIG, 2338 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2339 dev->device_fh); 2340 return; 2341 } 2342 2343 /* Search for entry to be removed from main ll */ 2344 ll_main_dev_cur = ll_root_used; 2345 ll_main_dev_last = NULL; 2346 while (ll_main_dev_cur != NULL) { 2347 if (ll_main_dev_cur->vdev == vdev) { 2348 break; 2349 } else { 2350 ll_main_dev_last = ll_main_dev_cur; 2351 ll_main_dev_cur = ll_main_dev_cur->next; 2352 } 2353 } 2354 2355 /* Remove entries from the lcore and main ll. */ 2356 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2357 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2358 2359 /* Set the dev_removal_flag on each lcore. 
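 * Each data core acknowledges by setting the flag back to ACK_DEV_REMOVAL
 * once it is outside its device-list walk, so the wait loop below guarantees
 * that no core still holds a reference to the entry being removed.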
 */
    RTE_LCORE_FOREACH_SLAVE(lcore) {
        lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
    }

    /*
     * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL
     * we can be sure that it can no longer access the device removed from
     * the linked lists and that the device is no longer in use.
     */
    RTE_LCORE_FOREACH_SLAVE(lcore) {
        while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
            rte_pause();
        }
    }

    /* Add the entries back to the lcore and main free linked lists. */
    put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
    put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

    /* Decrement the number of devices on the lcore. */
    lcore_info[vdev->coreid].lcore_ll->device_num--;

    RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

    if (zero_copy) {
        struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

        /* Stop the RX queue. */
        if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
            LOG_DEBUG(VHOST_CONFIG,
                "(%"PRIu64") In destroy_device: Failed to stop "
                "rx queue:%d\n",
                dev->device_fh,
                vdev->vmdq_rx_q);
        }

        LOG_DEBUG(VHOST_CONFIG,
            "(%"PRIu64") in destroy_device: start putting mbufs from "
            "the mempool back into the ring for RX queue: %d\n",
            dev->device_fh, vdev->vmdq_rx_q);

        mbuf_destroy_zcp(vpool);

        /* Stop the TX queue. */
        if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
            LOG_DEBUG(VHOST_CONFIG,
                "(%"PRIu64") In destroy_device: Failed to "
                "stop tx queue:%d\n",
                dev->device_fh, vdev->vmdq_rx_q);
        }

        vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

        LOG_DEBUG(VHOST_CONFIG,
            "(%"PRIu64") in destroy_device: start putting mbufs from "
            "the mempool back into the ring for TX queue: %d, "
            "dev:(%"PRIu64")\n",
            dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
            dev->device_fh);

        mbuf_destroy_zcp(vpool);
        rte_free(vdev->regions_hpa);
    }
    rte_free(vdev);
}

/*
 * Calculate how many physically contiguous sub-regions make up one region
 * that is contiguous in vhost virtual address space. The region starts at
 * vva_start and is 'size' bytes long.
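 * For example, a region backed by scattered 4 KB pages is charged one extra
 * sub-region at every point where two neighbouring pages are not physically
 * adjacent; a fully contiguous region yields zero.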
2430 */ 2431 static uint32_t 2432 check_hpa_regions(uint64_t vva_start, uint64_t size) 2433 { 2434 uint32_t i, nregions = 0, page_size = getpagesize(); 2435 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2436 if (vva_start % page_size) { 2437 LOG_DEBUG(VHOST_CONFIG, 2438 "in check_countinous: vva start(%p) mod page_size(%d) " 2439 "has remainder\n", 2440 (void *)(uintptr_t)vva_start, page_size); 2441 return 0; 2442 } 2443 if (size % page_size) { 2444 LOG_DEBUG(VHOST_CONFIG, 2445 "in check_countinous: " 2446 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2447 size, page_size); 2448 return 0; 2449 } 2450 for (i = 0; i < size - page_size; i = i + page_size) { 2451 cur_phys_addr 2452 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2453 next_phys_addr = rte_mem_virt2phy( 2454 (void *)(uintptr_t)(vva_start + i + page_size)); 2455 if ((cur_phys_addr + page_size) != next_phys_addr) { 2456 ++nregions; 2457 LOG_DEBUG(VHOST_CONFIG, 2458 "in check_continuous: hva addr:(%p) is not " 2459 "continuous with hva addr:(%p), diff:%d\n", 2460 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2461 (void *)(uintptr_t)(vva_start + (uint64_t)i 2462 + page_size), page_size); 2463 LOG_DEBUG(VHOST_CONFIG, 2464 "in check_continuous: hpa addr:(%p) is not " 2465 "continuous with hpa addr:(%p), " 2466 "diff:(%"PRIu64")\n", 2467 (void *)(uintptr_t)cur_phys_addr, 2468 (void *)(uintptr_t)next_phys_addr, 2469 (next_phys_addr-cur_phys_addr)); 2470 } 2471 } 2472 return nregions; 2473 } 2474 2475 /* 2476 * Divide each region whose vhost virtual address is continous into a few 2477 * sub-regions, make sure the physical address within each sub-region are 2478 * continous. And fill offset(to GPA) and size etc. information of each 2479 * sub-region into regions_hpa. 2480 */ 2481 static uint32_t 2482 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2483 { 2484 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2485 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2486 2487 if (mem_region_hpa == NULL) 2488 return 0; 2489 2490 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2491 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2492 virtio_memory->regions[regionidx].address_offset; 2493 mem_region_hpa[regionidx_hpa].guest_phys_address 2494 = virtio_memory->regions[regionidx].guest_phys_address; 2495 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2496 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2497 mem_region_hpa[regionidx_hpa].guest_phys_address; 2498 LOG_DEBUG(VHOST_CONFIG, 2499 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2500 regionidx_hpa, 2501 (void *)(uintptr_t) 2502 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2503 LOG_DEBUG(VHOST_CONFIG, 2504 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2505 regionidx_hpa, 2506 (void *)(uintptr_t) 2507 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2508 for (i = 0, k = 0; 2509 i < virtio_memory->regions[regionidx].memory_size - 2510 page_size; 2511 i += page_size) { 2512 cur_phys_addr = rte_mem_virt2phy( 2513 (void *)(uintptr_t)(vva_start + i)); 2514 next_phys_addr = rte_mem_virt2phy( 2515 (void *)(uintptr_t)(vva_start + 2516 i + page_size)); 2517 if ((cur_phys_addr + page_size) != next_phys_addr) { 2518 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2519 mem_region_hpa[regionidx_hpa].guest_phys_address + 2520 k + page_size; 2521 mem_region_hpa[regionidx_hpa].memory_size 2522 = k + 
page_size; 2523 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2524 "phys addr end [%d]:(%p)\n", 2525 regionidx_hpa, 2526 (void *)(uintptr_t) 2527 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2528 LOG_DEBUG(VHOST_CONFIG, 2529 "in fill_hpa_regions: guest phys addr " 2530 "size [%d]:(%p)\n", 2531 regionidx_hpa, 2532 (void *)(uintptr_t) 2533 (mem_region_hpa[regionidx_hpa].memory_size)); 2534 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2535 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2536 ++regionidx_hpa; 2537 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2538 next_phys_addr - 2539 mem_region_hpa[regionidx_hpa].guest_phys_address; 2540 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2541 " phys addr start[%d]:(%p)\n", 2542 regionidx_hpa, 2543 (void *)(uintptr_t) 2544 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2545 LOG_DEBUG(VHOST_CONFIG, 2546 "in fill_hpa_regions: host phys addr " 2547 "start[%d]:(%p)\n", 2548 regionidx_hpa, 2549 (void *)(uintptr_t) 2550 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2551 k = 0; 2552 } else { 2553 k += page_size; 2554 } 2555 } 2556 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2557 = mem_region_hpa[regionidx_hpa].guest_phys_address 2558 + k + page_size; 2559 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2560 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2561 "[%d]:(%p)\n", regionidx_hpa, 2562 (void *)(uintptr_t) 2563 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2564 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2565 "[%d]:(%p)\n", regionidx_hpa, 2566 (void *)(uintptr_t) 2567 (mem_region_hpa[regionidx_hpa].memory_size)); 2568 ++regionidx_hpa; 2569 } 2570 return regionidx_hpa; 2571 } 2572 2573 /* 2574 * A new device is added to a data core. First the device is added to the main linked list 2575 * and the allocated to a specific data core. 
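 * The core chosen is the one currently serving the fewest devices. In
 * zero-copy mode the per-queue mbuf ring is primed (attach_rxmbuf_zcp()) and
 * the device's VMDQ RX/TX queues are started before the core picks it up.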
2576 */ 2577 static int 2578 new_device (struct virtio_net *dev) 2579 { 2580 struct virtio_net_data_ll *ll_dev; 2581 int lcore, core_add = 0; 2582 uint32_t device_num_min = num_devices; 2583 struct vhost_dev *vdev; 2584 uint32_t regionidx; 2585 2586 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2587 if (vdev == NULL) { 2588 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2589 dev->device_fh); 2590 return -1; 2591 } 2592 vdev->dev = dev; 2593 dev->priv = vdev; 2594 2595 if (zero_copy) { 2596 vdev->nregions_hpa = dev->mem->nregions; 2597 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2598 vdev->nregions_hpa 2599 += check_hpa_regions( 2600 dev->mem->regions[regionidx].guest_phys_address 2601 + dev->mem->regions[regionidx].address_offset, 2602 dev->mem->regions[regionidx].memory_size); 2603 2604 } 2605 2606 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2607 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2608 RTE_CACHE_LINE_SIZE); 2609 if (vdev->regions_hpa == NULL) { 2610 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2611 rte_free(vdev); 2612 return -1; 2613 } 2614 2615 2616 if (fill_hpa_memory_regions( 2617 vdev->regions_hpa, dev->mem 2618 ) != vdev->nregions_hpa) { 2619 2620 RTE_LOG(ERR, VHOST_CONFIG, 2621 "hpa memory regions number mismatch: " 2622 "[%d]\n", vdev->nregions_hpa); 2623 rte_free(vdev->regions_hpa); 2624 rte_free(vdev); 2625 return -1; 2626 } 2627 } 2628 2629 2630 /* Add device to main ll */ 2631 ll_dev = get_data_ll_free_entry(&ll_root_free); 2632 if (ll_dev == NULL) { 2633 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2634 "of %d devices per core has been reached\n", 2635 dev->device_fh, num_devices); 2636 if (vdev->regions_hpa) 2637 rte_free(vdev->regions_hpa); 2638 rte_free(vdev); 2639 return -1; 2640 } 2641 ll_dev->vdev = vdev; 2642 add_data_ll_entry(&ll_root_used, ll_dev); 2643 vdev->vmdq_rx_q 2644 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2645 2646 if (zero_copy) { 2647 uint32_t index = vdev->vmdq_rx_q; 2648 uint32_t count_in_ring, i; 2649 struct mbuf_table *tx_q; 2650 2651 count_in_ring = rte_ring_count(vpool_array[index].ring); 2652 2653 LOG_DEBUG(VHOST_CONFIG, 2654 "(%"PRIu64") in new_device: mbuf count in mempool " 2655 "before attach is: %d\n", 2656 dev->device_fh, 2657 rte_mempool_count(vpool_array[index].pool)); 2658 LOG_DEBUG(VHOST_CONFIG, 2659 "(%"PRIu64") in new_device: mbuf count in ring " 2660 "before attach is : %d\n", 2661 dev->device_fh, count_in_ring); 2662 2663 /* 2664 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
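 * Unlike the data-path loop, this runs over every mbuf currently in the ring
 * so the RX queue starts out fully provisioned.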
2665 */ 2666 for (i = 0; i < count_in_ring; i++) 2667 attach_rxmbuf_zcp(dev); 2668 2669 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2670 "mempool after attach is: %d\n", 2671 dev->device_fh, 2672 rte_mempool_count(vpool_array[index].pool)); 2673 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2674 "ring after attach is : %d\n", 2675 dev->device_fh, 2676 rte_ring_count(vpool_array[index].ring)); 2677 2678 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2679 tx_q->txq_id = vdev->vmdq_rx_q; 2680 2681 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2682 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2683 2684 LOG_DEBUG(VHOST_CONFIG, 2685 "(%"PRIu64") In new_device: Failed to start " 2686 "tx queue:%d\n", 2687 dev->device_fh, vdev->vmdq_rx_q); 2688 2689 mbuf_destroy_zcp(vpool); 2690 rte_free(vdev->regions_hpa); 2691 rte_free(vdev); 2692 return -1; 2693 } 2694 2695 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2696 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2697 2698 LOG_DEBUG(VHOST_CONFIG, 2699 "(%"PRIu64") In new_device: Failed to start " 2700 "rx queue:%d\n", 2701 dev->device_fh, vdev->vmdq_rx_q); 2702 2703 /* Stop the TX queue. */ 2704 if (rte_eth_dev_tx_queue_stop(ports[0], 2705 vdev->vmdq_rx_q) != 0) { 2706 LOG_DEBUG(VHOST_CONFIG, 2707 "(%"PRIu64") In new_device: Failed to " 2708 "stop tx queue:%d\n", 2709 dev->device_fh, vdev->vmdq_rx_q); 2710 } 2711 2712 mbuf_destroy_zcp(vpool); 2713 rte_free(vdev->regions_hpa); 2714 rte_free(vdev); 2715 return -1; 2716 } 2717 2718 } 2719 2720 /*reset ready flag*/ 2721 vdev->ready = DEVICE_MAC_LEARNING; 2722 vdev->remove = 0; 2723 2724 /* Find a suitable lcore to add the device. */ 2725 RTE_LCORE_FOREACH_SLAVE(lcore) { 2726 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2727 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2728 core_add = lcore; 2729 } 2730 } 2731 /* Add device to lcore ll */ 2732 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2733 if (ll_dev == NULL) { 2734 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2735 vdev->ready = DEVICE_SAFE_REMOVE; 2736 destroy_device(dev); 2737 if (vdev->regions_hpa) 2738 rte_free(vdev->regions_hpa); 2739 rte_free(vdev); 2740 return -1; 2741 } 2742 ll_dev->vdev = vdev; 2743 vdev->coreid = core_add; 2744 2745 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2746 2747 /* Initialize device stats */ 2748 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2749 2750 /* Disable notifications. */ 2751 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2752 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2753 lcore_info[vdev->coreid].lcore_ll->device_num++; 2754 dev->flags |= VIRTIO_DEV_RUNNING; 2755 2756 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2757 2758 return 0; 2759 } 2760 2761 /* 2762 * These callback allow devices to be added to the data core when configuration 2763 * has been fully complete. 2764 */ 2765 static const struct virtio_net_device_ops virtio_net_device_ops = 2766 { 2767 .new_device = new_device, 2768 .destroy_device = destroy_device, 2769 }; 2770 2771 /* 2772 * This is a thread will wake up after a period to print stats if the user has 2773 * enabled them. 
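 * It sleeps for enable_stats seconds between refreshes and reads the
 * per-device counters without any locking, so the figures are best-effort
 * snapshots rather than exact totals.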
2774 */ 2775 static void 2776 print_stats(void) 2777 { 2778 struct virtio_net_data_ll *dev_ll; 2779 uint64_t tx_dropped, rx_dropped; 2780 uint64_t tx, tx_total, rx, rx_total; 2781 uint32_t device_fh; 2782 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2783 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2784 2785 while(1) { 2786 sleep(enable_stats); 2787 2788 /* Clear screen and move to top left */ 2789 printf("%s%s", clr, top_left); 2790 2791 printf("\nDevice statistics ===================================="); 2792 2793 dev_ll = ll_root_used; 2794 while (dev_ll != NULL) { 2795 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2796 tx_total = dev_statistics[device_fh].tx_total; 2797 tx = dev_statistics[device_fh].tx; 2798 tx_dropped = tx_total - tx; 2799 if (zero_copy == 0) { 2800 rx_total = rte_atomic64_read( 2801 &dev_statistics[device_fh].rx_total_atomic); 2802 rx = rte_atomic64_read( 2803 &dev_statistics[device_fh].rx_atomic); 2804 } else { 2805 rx_total = dev_statistics[device_fh].rx_total; 2806 rx = dev_statistics[device_fh].rx; 2807 } 2808 rx_dropped = rx_total - rx; 2809 2810 printf("\nStatistics for device %"PRIu32" ------------------------------" 2811 "\nTX total: %"PRIu64"" 2812 "\nTX dropped: %"PRIu64"" 2813 "\nTX successful: %"PRIu64"" 2814 "\nRX total: %"PRIu64"" 2815 "\nRX dropped: %"PRIu64"" 2816 "\nRX successful: %"PRIu64"", 2817 device_fh, 2818 tx_total, 2819 tx_dropped, 2820 tx, 2821 rx_total, 2822 rx_dropped, 2823 rx); 2824 2825 dev_ll = dev_ll->next; 2826 } 2827 printf("\n======================================================\n"); 2828 } 2829 } 2830 2831 static void 2832 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2833 char *ring_name, uint32_t nb_mbuf) 2834 { 2835 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2836 vpool_array[index].pool 2837 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2838 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2839 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2840 rte_pktmbuf_init, NULL, socket, 0); 2841 if (vpool_array[index].pool != NULL) { 2842 vpool_array[index].ring 2843 = rte_ring_create(ring_name, 2844 rte_align32pow2(nb_mbuf + 1), 2845 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2846 if (likely(vpool_array[index].ring != NULL)) { 2847 LOG_DEBUG(VHOST_CONFIG, 2848 "in setup_mempool_tbl: mbuf count in " 2849 "mempool is: %d\n", 2850 rte_mempool_count(vpool_array[index].pool)); 2851 LOG_DEBUG(VHOST_CONFIG, 2852 "in setup_mempool_tbl: mbuf count in " 2853 "ring is: %d\n", 2854 rte_ring_count(vpool_array[index].ring)); 2855 } else { 2856 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2857 ring_name); 2858 } 2859 2860 /* Need consider head room. */ 2861 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2862 } else { 2863 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2864 } 2865 } 2866 2867 2868 /* 2869 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2870 * device is also registered here to handle the IOCTLs. 
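 * Initialisation order: EAL, application arguments, mbuf pools (one shared
 * pool in copy mode, per-queue pools plus rings for zero copy), port init,
 * the per-core linked lists, the optional stats thread, and finally the data
 * cores and the vhost driver session.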
2871 */ 2872 int 2873 main(int argc, char *argv[]) 2874 { 2875 struct rte_mempool *mbuf_pool = NULL; 2876 unsigned lcore_id, core_id = 0; 2877 unsigned nb_ports, valid_num_ports; 2878 int ret; 2879 uint8_t portid; 2880 uint16_t queue_id; 2881 static pthread_t tid; 2882 2883 /* init EAL */ 2884 ret = rte_eal_init(argc, argv); 2885 if (ret < 0) 2886 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2887 argc -= ret; 2888 argv += ret; 2889 2890 /* parse app arguments */ 2891 ret = us_vhost_parse_args(argc, argv); 2892 if (ret < 0) 2893 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2894 2895 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2896 if (rte_lcore_is_enabled(lcore_id)) 2897 lcore_ids[core_id ++] = lcore_id; 2898 2899 if (rte_lcore_count() > RTE_MAX_LCORE) 2900 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2901 2902 /*set the number of swithcing cores available*/ 2903 num_switching_cores = rte_lcore_count()-1; 2904 2905 /* Get the number of physical ports. */ 2906 nb_ports = rte_eth_dev_count(); 2907 if (nb_ports > RTE_MAX_ETHPORTS) 2908 nb_ports = RTE_MAX_ETHPORTS; 2909 2910 /* 2911 * Update the global var NUM_PORTS and global array PORTS 2912 * and get value of var VALID_NUM_PORTS according to system ports number 2913 */ 2914 valid_num_ports = check_ports_num(nb_ports); 2915 2916 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2917 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2918 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2919 return -1; 2920 } 2921 2922 if (zero_copy == 0) { 2923 /* Create the mbuf pool. */ 2924 mbuf_pool = rte_mempool_create( 2925 "MBUF_POOL", 2926 NUM_MBUFS_PER_PORT 2927 * valid_num_ports, 2928 MBUF_SIZE, MBUF_CACHE_SIZE, 2929 sizeof(struct rte_pktmbuf_pool_private), 2930 rte_pktmbuf_pool_init, NULL, 2931 rte_pktmbuf_init, NULL, 2932 rte_socket_id(), 0); 2933 if (mbuf_pool == NULL) 2934 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2935 2936 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2937 vpool_array[queue_id].pool = mbuf_pool; 2938 2939 if (vm2vm_mode == VM2VM_HARDWARE) { 2940 /* Enable VT loop back to let L2 switch to do it. */ 2941 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2942 LOG_DEBUG(VHOST_CONFIG, 2943 "Enable loop back for L2 switch in vmdq.\n"); 2944 } 2945 } else { 2946 uint32_t nb_mbuf; 2947 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2948 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2949 2950 nb_mbuf = num_rx_descriptor 2951 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2952 + num_switching_cores * MAX_PKT_BURST; 2953 2954 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2955 snprintf(pool_name, sizeof(pool_name), 2956 "rxmbuf_pool_%u", queue_id); 2957 snprintf(ring_name, sizeof(ring_name), 2958 "rxmbuf_ring_%u", queue_id); 2959 setup_mempool_tbl(rte_socket_id(), queue_id, 2960 pool_name, ring_name, nb_mbuf); 2961 } 2962 2963 nb_mbuf = num_tx_descriptor 2964 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2965 + num_switching_cores * MAX_PKT_BURST; 2966 2967 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2968 snprintf(pool_name, sizeof(pool_name), 2969 "txmbuf_pool_%u", queue_id); 2970 snprintf(ring_name, sizeof(ring_name), 2971 "txmbuf_ring_%u", queue_id); 2972 setup_mempool_tbl(rte_socket_id(), 2973 (queue_id + MAX_QUEUES), 2974 pool_name, ring_name, nb_mbuf); 2975 } 2976 2977 if (vm2vm_mode == VM2VM_HARDWARE) { 2978 /* Enable VT loop back to let L2 switch to do it. 
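 * (same setting as the copy-mode branch above: VM-to-VM traffic is then
 * looped back and switched by the NIC rather than in software).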
*/ 2979 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2980 LOG_DEBUG(VHOST_CONFIG, 2981 "Enable loop back for L2 switch in vmdq.\n"); 2982 } 2983 } 2984 /* Set log level. */ 2985 rte_set_log_level(LOG_LEVEL); 2986 2987 /* initialize all ports */ 2988 for (portid = 0; portid < nb_ports; portid++) { 2989 /* skip ports that are not enabled */ 2990 if ((enabled_port_mask & (1 << portid)) == 0) { 2991 RTE_LOG(INFO, VHOST_PORT, 2992 "Skipping disabled port %d\n", portid); 2993 continue; 2994 } 2995 if (port_init(portid) != 0) 2996 rte_exit(EXIT_FAILURE, 2997 "Cannot initialize network ports\n"); 2998 } 2999 3000 /* Initialise all linked lists. */ 3001 if (init_data_ll() == -1) 3002 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3003 3004 /* Initialize device stats */ 3005 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3006 3007 /* Enable stats if the user option is set. */ 3008 if (enable_stats) 3009 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3010 3011 /* Launch all data cores. */ 3012 if (zero_copy == 0) { 3013 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3014 rte_eal_remote_launch(switch_worker, 3015 mbuf_pool, lcore_id); 3016 } 3017 } else { 3018 uint32_t count_in_mempool, index, i; 3019 for (index = 0; index < 2*MAX_QUEUES; index++) { 3020 /* For all RX and TX queues. */ 3021 count_in_mempool 3022 = rte_mempool_count(vpool_array[index].pool); 3023 3024 /* 3025 * Transfer all un-attached mbufs from vpool.pool 3026 * to vpoo.ring. 3027 */ 3028 for (i = 0; i < count_in_mempool; i++) { 3029 struct rte_mbuf *mbuf 3030 = __rte_mbuf_raw_alloc( 3031 vpool_array[index].pool); 3032 rte_ring_sp_enqueue(vpool_array[index].ring, 3033 (void *)mbuf); 3034 } 3035 3036 LOG_DEBUG(VHOST_CONFIG, 3037 "in main: mbuf count in mempool at initial " 3038 "is: %d\n", count_in_mempool); 3039 LOG_DEBUG(VHOST_CONFIG, 3040 "in main: mbuf count in ring at initial is :" 3041 " %d\n", 3042 rte_ring_count(vpool_array[index].ring)); 3043 } 3044 3045 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3046 rte_eal_remote_launch(switch_worker_zcp, NULL, 3047 lcore_id); 3048 } 3049 3050 if (mergeable == 0) 3051 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3052 3053 /* Register CUSE device to handle IOCTLs. */ 3054 ret = rte_vhost_driver_register((char *)&dev_basename); 3055 if (ret != 0) 3056 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3057 3058 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3059 3060 /* Start CUSE session. */ 3061 rte_vhost_driver_session_start(); 3062 return 0; 3063 3064 } 3065 3066