1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 512 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 83 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 84 85 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 86 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 87 88 #define JUMBO_FRAME_MAX_SIZE 0x2600 89 90 /* State of virtio device. */ 91 #define DEVICE_MAC_LEARNING 0 92 #define DEVICE_RX 1 93 #define DEVICE_SAFE_REMOVE 2 94 95 /* Config_core_flag status definitions. 
*/ 96 #define REQUEST_DEV_REMOVAL 1 97 #define ACK_DEV_REMOVAL 0 98 99 /* Configurable number of RX/TX ring descriptors */ 100 #define RTE_TEST_RX_DESC_DEFAULT 1024 101 #define RTE_TEST_TX_DESC_DEFAULT 512 102 103 /* 104 * Need refine these 2 macros for legacy and DPDK based front end: 105 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 106 * And then adjust power 2. 107 */ 108 /* 109 * For legacy front end, 128 descriptors, 110 * half for virtio header, another half for mbuf. 111 */ 112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 114 115 /* Get first 4 bytes in mbuf headroom. */ 116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 117 + sizeof(struct rte_mbuf))) 118 119 /* true if x is a power of 2 */ 120 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 121 122 #define INVALID_PORT_ID 0xFF 123 124 /* Max number of devices. Limited by vmdq. */ 125 #define MAX_DEVICES 64 126 127 /* Size of buffers used for snprintfs. */ 128 #define MAX_PRINT_BUFF 6072 129 130 /* Maximum character device basename size. */ 131 #define MAX_BASENAME_SZ 10 132 133 /* Maximum long option length for option parsing. */ 134 #define MAX_LONG_OPT_SZ 64 135 136 /* Used to compare MAC addresses. */ 137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 138 139 /* Number of descriptors per cacheline. */ 140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 141 142 #define MBUF_EXT_MEM(mb) (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb)) 143 144 /* mask of enabled ports */ 145 static uint32_t enabled_port_mask = 0; 146 147 /* Promiscuous mode */ 148 static uint32_t promiscuous; 149 150 /*Number of switching cores enabled*/ 151 static uint32_t num_switching_cores = 0; 152 153 /* number of devices/queues to support*/ 154 static uint32_t num_queues = 0; 155 static uint32_t num_devices; 156 157 /* 158 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 159 * disabled on default. 160 */ 161 static uint32_t zero_copy; 162 static int mergeable; 163 164 /* Do vlan strip on host, enabled on default */ 165 static uint32_t vlan_strip = 1; 166 167 /* number of descriptors to apply*/ 168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 170 171 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 172 #define MAX_RING_DESC 4096 173 174 struct vpool { 175 struct rte_mempool *pool; 176 struct rte_ring *ring; 177 uint32_t buf_size; 178 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 179 180 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 181 typedef enum { 182 VM2VM_DISABLED = 0, 183 VM2VM_SOFTWARE = 1, 184 VM2VM_HARDWARE = 2, 185 VM2VM_LAST 186 } vm2vm_type; 187 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 188 189 /* The type of host physical address translated from guest physical address. */ 190 typedef enum { 191 PHYS_ADDR_CONTINUOUS = 0, 192 PHYS_ADDR_CROSS_SUBREG = 1, 193 PHYS_ADDR_INVALID = 2, 194 PHYS_ADDR_LAST 195 } hpa_type; 196 197 /* Enable stats. */ 198 static uint32_t enable_stats = 0; 199 /* Enable retries on RX. */ 200 static uint32_t enable_retry = 1; 201 /* Specify timeout (in useconds) between retries on RX. */ 202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 203 /* Specify the number of retries on RX. */ 204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 205 206 /* Character device basename. 
   Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * Required for 1G NICs such as the I350: it fixes a bug where IPv4
         * forwarding in the guest could not forward packets from one virtio
         * device to another.
         */
        .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
    },

    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .rx_adv_conf = {
        /*
         * should be overridden separately in code with
         * appropriate values
         */
        .vmdq_rx_conf = {
            .nb_queue_pools = ETH_8_POOLS,
            .enable_default_pool = 0,
            .default_pool = 0,
            .nb_pool_maps = 0,
            .pool_map = {{0, 0},},
        },
    },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
    1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
    1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
    1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
    1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
    1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
    1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
    unsigned len;
    unsigned txq_id;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/*
 * VLAN header struct used to insert VLAN tags on TX.
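 * Matches the standard 802.1Q layout: destination MAC, source MAC, TPID,
 * TCI, then the encapsulated EtherType.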
 */
struct vlan_ethhdr {
    unsigned char h_dest[ETH_ALEN];
    unsigned char h_source[ETH_ALEN];
    __be16 h_vlan_proto;
    __be16 h_vlan_TCI;
    __be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
    uint8_t  version_ihl;     /**< version and header length */
    uint8_t  type_of_service; /**< type of service */
    uint16_t total_length;    /**< length of packet */
    uint16_t packet_id;       /**< packet ID */
    uint16_t fragment_offset; /**< fragmentation offset */
    uint8_t  time_to_live;    /**< time to live */
    uint8_t  next_proto_id;   /**< protocol ID */
    uint16_t hdr_checksum;    /**< header checksum */
    uint32_t src_addr;        /**< source address */
    uint32_t dst_addr;        /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
    uint64_t tx_total;
    rte_atomic64_t rx_total_atomic;
    uint64_t rx_total;
    uint64_t tx;
    rte_atomic64_t rx_atomic;
    uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    struct rte_eth_vmdq_rx_conf *def_conf =
        &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back = def_conf->enable_loop_back;
    conf.rx_mode = def_conf->rx_mode;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
        sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
    return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
358 */ 359 static inline int 360 validate_num_devices(uint32_t max_nb_devices) 361 { 362 if (num_devices > max_nb_devices) { 363 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 364 return -1; 365 } 366 return 0; 367 } 368 369 /* 370 * Initialises a given port using global settings and with the rx buffers 371 * coming from the mbuf_pool passed as parameter 372 */ 373 static inline int 374 port_init(uint8_t port) 375 { 376 struct rte_eth_dev_info dev_info; 377 struct rte_eth_conf port_conf; 378 struct rte_eth_rxconf *rxconf; 379 struct rte_eth_txconf *txconf; 380 int16_t rx_rings, tx_rings; 381 uint16_t rx_ring_size, tx_ring_size; 382 int retval; 383 uint16_t q; 384 385 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 386 rte_eth_dev_info_get (port, &dev_info); 387 388 if (dev_info.max_rx_queues > MAX_QUEUES) { 389 rte_exit(EXIT_FAILURE, 390 "please define MAX_QUEUES no less than %u in %s\n", 391 dev_info.max_rx_queues, __FILE__); 392 } 393 394 rxconf = &dev_info.default_rxconf; 395 txconf = &dev_info.default_txconf; 396 rxconf->rx_drop_en = 1; 397 398 /* Enable vlan offload */ 399 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL; 400 401 /* 402 * Zero copy defers queue RX/TX start to the time when guest 403 * finishes its startup and packet buffers from that guest are 404 * available. 405 */ 406 if (zero_copy) { 407 rxconf->rx_deferred_start = 1; 408 rxconf->rx_drop_en = 0; 409 txconf->tx_deferred_start = 1; 410 } 411 412 /*configure the number of supported virtio devices based on VMDQ limits */ 413 num_devices = dev_info.max_vmdq_pools; 414 415 if (zero_copy) { 416 rx_ring_size = num_rx_descriptor; 417 tx_ring_size = num_tx_descriptor; 418 tx_rings = dev_info.max_tx_queues; 419 } else { 420 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 421 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 422 tx_rings = (uint16_t)rte_lcore_count(); 423 } 424 425 retval = validate_num_devices(MAX_DEVICES); 426 if (retval < 0) 427 return retval; 428 429 /* Get port configuration. */ 430 retval = get_eth_conf(&port_conf, num_devices); 431 if (retval < 0) 432 return retval; 433 /* NIC queues are divided into pf queues and vmdq queues. */ 434 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 435 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 436 num_vmdq_queues = num_devices * queues_per_pool; 437 num_queues = num_pf_queues + num_vmdq_queues; 438 vmdq_queue_base = dev_info.vmdq_queue_base; 439 vmdq_pool_base = dev_info.vmdq_pool_base; 440 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 441 num_pf_queues, num_devices, queues_per_pool); 442 443 if (port >= rte_eth_dev_count()) return -1; 444 445 rx_rings = (uint16_t)dev_info.max_rx_queues; 446 /* Configure ethernet device. */ 447 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 448 if (retval != 0) 449 return retval; 450 451 /* Setup the queues. */ 452 for (q = 0; q < rx_rings; q ++) { 453 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 454 rte_eth_dev_socket_id(port), 455 rxconf, 456 vpool_array[q].pool); 457 if (retval < 0) 458 return retval; 459 } 460 for (q = 0; q < tx_rings; q ++) { 461 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 462 rte_eth_dev_socket_id(port), 463 txconf); 464 if (retval < 0) 465 return retval; 466 } 467 468 /* Start the device. 
*/ 469 retval = rte_eth_dev_start(port); 470 if (retval < 0) { 471 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 472 return retval; 473 } 474 475 if (promiscuous) 476 rte_eth_promiscuous_enable(port); 477 478 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 479 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 480 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 481 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 482 (unsigned)port, 483 vmdq_ports_eth_addr[port].addr_bytes[0], 484 vmdq_ports_eth_addr[port].addr_bytes[1], 485 vmdq_ports_eth_addr[port].addr_bytes[2], 486 vmdq_ports_eth_addr[port].addr_bytes[3], 487 vmdq_ports_eth_addr[port].addr_bytes[4], 488 vmdq_ports_eth_addr[port].addr_bytes[5]); 489 490 return 0; 491 } 492 493 /* 494 * Set character device basename. 495 */ 496 static int 497 us_vhost_parse_basename(const char *q_arg) 498 { 499 /* parse number string */ 500 501 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 502 return -1; 503 else 504 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 505 506 return 0; 507 } 508 509 /* 510 * Parse the portmask provided at run time. 511 */ 512 static int 513 parse_portmask(const char *portmask) 514 { 515 char *end = NULL; 516 unsigned long pm; 517 518 errno = 0; 519 520 /* parse hexadecimal string */ 521 pm = strtoul(portmask, &end, 16); 522 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 523 return -1; 524 525 if (pm == 0) 526 return -1; 527 528 return pm; 529 530 } 531 532 /* 533 * Parse num options at run time. 534 */ 535 static int 536 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 537 { 538 char *end = NULL; 539 unsigned long num; 540 541 errno = 0; 542 543 /* parse unsigned int string */ 544 num = strtoul(q_arg, &end, 10); 545 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 546 return -1; 547 548 if (num > max_valid_value) 549 return -1; 550 551 return num; 552 553 } 554 555 /* 556 * Display usage 557 */ 558 static void 559 us_vhost_usage(const char *prgname) 560 { 561 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 562 " --vm2vm [0|1|2]\n" 563 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 564 " --dev-basename <name>\n" 565 " --nb-devices ND\n" 566 " -p PORTMASK: Set mask for ports to be used by application\n" 567 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 568 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 569 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 570 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 571 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 572 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n" 573 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 574 " --dev-basename: The basename to be used for the character device.\n" 575 " --zero-copy [0|1]: disable(default)/enable rx/tx " 576 "zero copy\n" 577 " --rx-desc-num [0-N]: the number of descriptors on rx, " 578 "used only when zero copy is enabled.\n" 579 " --tx-desc-num [0-N]: the number of descriptors on tx, " 580 "used only when zero copy is enabled.\n", 581 prgname); 582 } 583 584 /* 585 * Parse the arguments given in the command line of the application. 
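 *
 * Example invocation (illustrative only; the binary path, EAL core mask and
 * memory-channel count are placeholders that depend on the build and target
 * system):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net --stats 2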
586 */ 587 static int 588 us_vhost_parse_args(int argc, char **argv) 589 { 590 int opt, ret; 591 int option_index; 592 unsigned i; 593 const char *prgname = argv[0]; 594 static struct option long_option[] = { 595 {"vm2vm", required_argument, NULL, 0}, 596 {"rx-retry", required_argument, NULL, 0}, 597 {"rx-retry-delay", required_argument, NULL, 0}, 598 {"rx-retry-num", required_argument, NULL, 0}, 599 {"mergeable", required_argument, NULL, 0}, 600 {"vlan-strip", required_argument, NULL, 0}, 601 {"stats", required_argument, NULL, 0}, 602 {"dev-basename", required_argument, NULL, 0}, 603 {"zero-copy", required_argument, NULL, 0}, 604 {"rx-desc-num", required_argument, NULL, 0}, 605 {"tx-desc-num", required_argument, NULL, 0}, 606 {NULL, 0, 0, 0}, 607 }; 608 609 /* Parse command line */ 610 while ((opt = getopt_long(argc, argv, "p:P", 611 long_option, &option_index)) != EOF) { 612 switch (opt) { 613 /* Portmask */ 614 case 'p': 615 enabled_port_mask = parse_portmask(optarg); 616 if (enabled_port_mask == 0) { 617 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 618 us_vhost_usage(prgname); 619 return -1; 620 } 621 break; 622 623 case 'P': 624 promiscuous = 1; 625 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 626 ETH_VMDQ_ACCEPT_BROADCAST | 627 ETH_VMDQ_ACCEPT_MULTICAST; 628 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 629 630 break; 631 632 case 0: 633 /* Enable/disable vm2vm comms. */ 634 if (!strncmp(long_option[option_index].name, "vm2vm", 635 MAX_LONG_OPT_SZ)) { 636 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 637 if (ret == -1) { 638 RTE_LOG(INFO, VHOST_CONFIG, 639 "Invalid argument for " 640 "vm2vm [0|1|2]\n"); 641 us_vhost_usage(prgname); 642 return -1; 643 } else { 644 vm2vm_mode = (vm2vm_type)ret; 645 } 646 } 647 648 /* Enable/disable retries on RX. */ 649 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 650 ret = parse_num_opt(optarg, 1); 651 if (ret == -1) { 652 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 653 us_vhost_usage(prgname); 654 return -1; 655 } else { 656 enable_retry = ret; 657 } 658 } 659 660 /* Specify the retries delay time (in useconds) on RX. */ 661 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 662 ret = parse_num_opt(optarg, INT32_MAX); 663 if (ret == -1) { 664 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 665 us_vhost_usage(prgname); 666 return -1; 667 } else { 668 burst_rx_delay_time = ret; 669 } 670 } 671 672 /* Specify the retries number on RX. */ 673 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 674 ret = parse_num_opt(optarg, INT32_MAX); 675 if (ret == -1) { 676 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 677 us_vhost_usage(prgname); 678 return -1; 679 } else { 680 burst_rx_retry_num = ret; 681 } 682 } 683 684 /* Enable/disable RX mergeable buffers. */ 685 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 686 ret = parse_num_opt(optarg, 1); 687 if (ret == -1) { 688 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 689 us_vhost_usage(prgname); 690 return -1; 691 } else { 692 mergeable = !!ret; 693 if (ret) { 694 vmdq_conf_default.rxmode.jumbo_frame = 1; 695 vmdq_conf_default.rxmode.max_rx_pkt_len 696 = JUMBO_FRAME_MAX_SIZE; 697 } 698 } 699 } 700 701 /* Enable/disable RX VLAN strip on host. 
*/ 702 if (!strncmp(long_option[option_index].name, 703 "vlan-strip", MAX_LONG_OPT_SZ)) { 704 ret = parse_num_opt(optarg, 1); 705 if (ret == -1) { 706 RTE_LOG(INFO, VHOST_CONFIG, 707 "Invalid argument for VLAN strip [0|1]\n"); 708 us_vhost_usage(prgname); 709 return -1; 710 } else { 711 vlan_strip = !!ret; 712 vmdq_conf_default.rxmode.hw_vlan_strip = 713 vlan_strip; 714 } 715 } 716 717 /* Enable/disable stats. */ 718 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 719 ret = parse_num_opt(optarg, INT32_MAX); 720 if (ret == -1) { 721 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 722 us_vhost_usage(prgname); 723 return -1; 724 } else { 725 enable_stats = ret; 726 } 727 } 728 729 /* Set character device basename. */ 730 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 731 if (us_vhost_parse_basename(optarg) == -1) { 732 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 733 us_vhost_usage(prgname); 734 return -1; 735 } 736 } 737 738 /* Enable/disable rx/tx zero copy. */ 739 if (!strncmp(long_option[option_index].name, 740 "zero-copy", MAX_LONG_OPT_SZ)) { 741 ret = parse_num_opt(optarg, 1); 742 if (ret == -1) { 743 RTE_LOG(INFO, VHOST_CONFIG, 744 "Invalid argument" 745 " for zero-copy [0|1]\n"); 746 us_vhost_usage(prgname); 747 return -1; 748 } else 749 zero_copy = ret; 750 } 751 752 /* Specify the descriptor number on RX. */ 753 if (!strncmp(long_option[option_index].name, 754 "rx-desc-num", MAX_LONG_OPT_SZ)) { 755 ret = parse_num_opt(optarg, MAX_RING_DESC); 756 if ((ret == -1) || (!POWEROF2(ret))) { 757 RTE_LOG(INFO, VHOST_CONFIG, 758 "Invalid argument for rx-desc-num[0-N]," 759 "power of 2 required.\n"); 760 us_vhost_usage(prgname); 761 return -1; 762 } else { 763 num_rx_descriptor = ret; 764 } 765 } 766 767 /* Specify the descriptor number on TX. */ 768 if (!strncmp(long_option[option_index].name, 769 "tx-desc-num", MAX_LONG_OPT_SZ)) { 770 ret = parse_num_opt(optarg, MAX_RING_DESC); 771 if ((ret == -1) || (!POWEROF2(ret))) { 772 RTE_LOG(INFO, VHOST_CONFIG, 773 "Invalid argument for tx-desc-num [0-N]," 774 "power of 2 required.\n"); 775 us_vhost_usage(prgname); 776 return -1; 777 } else { 778 num_tx_descriptor = ret; 779 } 780 } 781 782 break; 783 784 /* Invalid option - print options. 
*/ 785 default: 786 us_vhost_usage(prgname); 787 return -1; 788 } 789 } 790 791 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 792 if (enabled_port_mask & (1 << i)) 793 ports[num_ports++] = (uint8_t)i; 794 } 795 796 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 797 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 798 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 799 return -1; 800 } 801 802 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 803 RTE_LOG(INFO, VHOST_PORT, 804 "Vhost zero copy doesn't support software vm2vm," 805 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 806 return -1; 807 } 808 809 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 810 RTE_LOG(INFO, VHOST_PORT, 811 "Vhost zero copy doesn't support jumbo frame," 812 "please specify '--mergeable 0' to disable the " 813 "mergeable feature.\n"); 814 return -1; 815 } 816 817 return 0; 818 } 819 820 /* 821 * Update the global var NUM_PORTS and array PORTS according to system ports number 822 * and return valid ports number 823 */ 824 static unsigned check_ports_num(unsigned nb_ports) 825 { 826 unsigned valid_num_ports = num_ports; 827 unsigned portid; 828 829 if (num_ports > nb_ports) { 830 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 831 num_ports, nb_ports); 832 num_ports = nb_ports; 833 } 834 835 for (portid = 0; portid < num_ports; portid ++) { 836 if (ports[portid] >= nb_ports) { 837 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 838 ports[portid], (nb_ports - 1)); 839 ports[portid] = INVALID_PORT_ID; 840 valid_num_ports--; 841 } 842 } 843 return valid_num_ports; 844 } 845 846 /* 847 * Macro to print out packet contents. Wrapped in debug define so that the 848 * data path is not effected when debug is disabled. 849 */ 850 #ifdef DEBUG 851 #define PRINT_PACKET(device, addr, size, header) do { \ 852 char *pkt_addr = (char*)(addr); \ 853 unsigned int index; \ 854 char packet[MAX_PRINT_BUFF]; \ 855 \ 856 if ((header)) \ 857 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 858 else \ 859 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 860 for (index = 0; index < (size); index++) { \ 861 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 862 "%02hhx ", pkt_addr[index]); \ 863 } \ 864 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 865 \ 866 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 867 } while(0) 868 #else 869 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 870 #endif 871 872 /* 873 * Function to convert guest physical addresses to vhost physical addresses. 874 * This is used to convert virtio buffer addresses. 
875 */ 876 static inline uint64_t __attribute__((always_inline)) 877 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 878 uint32_t buf_len, hpa_type *addr_type) 879 { 880 struct virtio_memory_regions_hpa *region; 881 uint32_t regionidx; 882 uint64_t vhost_pa = 0; 883 884 *addr_type = PHYS_ADDR_INVALID; 885 886 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 887 region = &vdev->regions_hpa[regionidx]; 888 if ((guest_pa >= region->guest_phys_address) && 889 (guest_pa <= region->guest_phys_address_end)) { 890 vhost_pa = region->host_phys_addr_offset + guest_pa; 891 if (likely((guest_pa + buf_len - 1) 892 <= region->guest_phys_address_end)) 893 *addr_type = PHYS_ADDR_CONTINUOUS; 894 else 895 *addr_type = PHYS_ADDR_CROSS_SUBREG; 896 break; 897 } 898 } 899 900 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 901 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 902 (void *)(uintptr_t)vhost_pa); 903 904 return vhost_pa; 905 } 906 907 /* 908 * Compares a packet destination MAC address to a device MAC address. 909 */ 910 static inline int __attribute__((always_inline)) 911 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 912 { 913 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 914 } 915 916 /* 917 * This function learns the MAC address of the device and registers this along with a 918 * vlan tag to a VMDQ. 919 */ 920 static int 921 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 922 { 923 struct ether_hdr *pkt_hdr; 924 struct virtio_net_data_ll *dev_ll; 925 struct virtio_net *dev = vdev->dev; 926 int i, ret; 927 928 /* Learn MAC address of guest device from packet */ 929 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 930 931 dev_ll = ll_root_used; 932 933 while (dev_ll != NULL) { 934 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 935 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 936 return -1; 937 } 938 dev_ll = dev_ll->next; 939 } 940 941 for (i = 0; i < ETHER_ADDR_LEN; i++) 942 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 943 944 /* vlan_tag currently uses the device_id. */ 945 vdev->vlan_tag = vlan_tags[dev->device_fh]; 946 947 /* Print out VMDQ registration info. */ 948 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 949 dev->device_fh, 950 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 951 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 952 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 953 vdev->vlan_tag); 954 955 /* Register the MAC address. */ 956 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 957 (uint32_t)dev->device_fh + vmdq_pool_base); 958 if (ret) 959 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 960 dev->device_fh); 961 962 /* Enable stripping of the vlan tag as we handle routing. */ 963 if (vlan_strip) 964 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 965 (uint16_t)vdev->vmdq_rx_q, 1); 966 967 /* Set device as ready for RX. */ 968 vdev->ready = DEVICE_RX; 969 970 return 0; 971 } 972 973 /* 974 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 975 * queue before disabling RX on the device. 
976 */ 977 static inline void 978 unlink_vmdq(struct vhost_dev *vdev) 979 { 980 unsigned i = 0; 981 unsigned rx_count; 982 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 983 984 if (vdev->ready == DEVICE_RX) { 985 /*clear MAC and VLAN settings*/ 986 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 987 for (i = 0; i < 6; i++) 988 vdev->mac_address.addr_bytes[i] = 0; 989 990 vdev->vlan_tag = 0; 991 992 /*Clear out the receive buffers*/ 993 rx_count = rte_eth_rx_burst(ports[0], 994 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 995 996 while (rx_count) { 997 for (i = 0; i < rx_count; i++) 998 rte_pktmbuf_free(pkts_burst[i]); 999 1000 rx_count = rte_eth_rx_burst(ports[0], 1001 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1002 } 1003 1004 vdev->ready = DEVICE_MAC_LEARNING; 1005 } 1006 } 1007 1008 /* 1009 * Check if the packet destination MAC address is for a local device. If so then put 1010 * the packet on that devices RX queue. If not then return. 1011 */ 1012 static inline int __attribute__((always_inline)) 1013 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1014 { 1015 struct virtio_net_data_ll *dev_ll; 1016 struct ether_hdr *pkt_hdr; 1017 uint64_t ret = 0; 1018 struct virtio_net *dev = vdev->dev; 1019 struct virtio_net *tdev; /* destination virito device */ 1020 1021 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1022 1023 /*get the used devices list*/ 1024 dev_ll = ll_root_used; 1025 1026 while (dev_ll != NULL) { 1027 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1028 &dev_ll->vdev->mac_address)) { 1029 1030 /* Drop the packet if the TX packet is destined for the TX device. */ 1031 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1032 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1033 dev->device_fh); 1034 return 0; 1035 } 1036 tdev = dev_ll->vdev->dev; 1037 1038 1039 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1040 1041 if (unlikely(dev_ll->vdev->remove)) { 1042 /*drop the packet if the device is marked for removal*/ 1043 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1044 } else { 1045 /*send the packet to the local virtio device*/ 1046 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1047 if (enable_stats) { 1048 rte_atomic64_add( 1049 &dev_statistics[tdev->device_fh].rx_total_atomic, 1050 1); 1051 rte_atomic64_add( 1052 &dev_statistics[tdev->device_fh].rx_atomic, 1053 ret); 1054 dev_statistics[tdev->device_fh].tx_total++; 1055 dev_statistics[tdev->device_fh].tx += ret; 1056 } 1057 } 1058 1059 return 0; 1060 } 1061 dev_ll = dev_ll->next; 1062 } 1063 1064 return -1; 1065 } 1066 1067 /* 1068 * Check if the destination MAC of a packet is one local VM, 1069 * and get its vlan tag, and offset if it is. 1070 */ 1071 static inline int __attribute__((always_inline)) 1072 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1073 uint32_t *offset, uint16_t *vlan_tag) 1074 { 1075 struct virtio_net_data_ll *dev_ll = ll_root_used; 1076 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1077 1078 while (dev_ll != NULL) { 1079 if ((dev_ll->vdev->ready == DEVICE_RX) 1080 && ether_addr_cmp(&(pkt_hdr->d_addr), 1081 &dev_ll->vdev->mac_address)) { 1082 /* 1083 * Drop the packet if the TX packet is 1084 * destined for the TX device. 
1085 */ 1086 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1087 LOG_DEBUG(VHOST_DATA, 1088 "(%"PRIu64") TX: Source and destination" 1089 " MAC addresses are the same. Dropping " 1090 "packet.\n", 1091 dev_ll->vdev->dev->device_fh); 1092 return -1; 1093 } 1094 1095 /* 1096 * HW vlan strip will reduce the packet length 1097 * by minus length of vlan tag, so need restore 1098 * the packet length by plus it. 1099 */ 1100 *offset = VLAN_HLEN; 1101 *vlan_tag = 1102 (uint16_t) 1103 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1104 1105 LOG_DEBUG(VHOST_DATA, 1106 "(%"PRIu64") TX: pkt to local VM device id:" 1107 "(%"PRIu64") vlan tag: %d.\n", 1108 dev->device_fh, dev_ll->vdev->dev->device_fh, 1109 vlan_tag); 1110 1111 break; 1112 } 1113 dev_ll = dev_ll->next; 1114 } 1115 return 0; 1116 } 1117 1118 /* 1119 * This function routes the TX packet to the correct interface. This may be a local device 1120 * or the physical port. 1121 */ 1122 static inline void __attribute__((always_inline)) 1123 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1124 { 1125 struct mbuf_table *tx_q; 1126 struct rte_mbuf **m_table; 1127 unsigned len, ret, offset = 0; 1128 const uint16_t lcore_id = rte_lcore_id(); 1129 struct virtio_net *dev = vdev->dev; 1130 struct ether_hdr *nh; 1131 1132 /*check if destination is local VM*/ 1133 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1134 rte_pktmbuf_free(m); 1135 return; 1136 } 1137 1138 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1139 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1140 rte_pktmbuf_free(m); 1141 return; 1142 } 1143 } 1144 1145 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1146 1147 /*Add packet to the port tx queue*/ 1148 tx_q = &lcore_tx_queue[lcore_id]; 1149 len = tx_q->len; 1150 1151 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1152 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1153 /* Guest has inserted the vlan tag. */ 1154 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1155 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1156 if ((vm2vm_mode == VM2VM_HARDWARE) && 1157 (vh->vlan_tci != vlan_tag_be)) 1158 vh->vlan_tci = vlan_tag_be; 1159 } else { 1160 m->ol_flags = PKT_TX_VLAN_PKT; 1161 1162 /* 1163 * Find the right seg to adjust the data len when offset is 1164 * bigger than tail room size. 1165 */ 1166 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1167 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1168 m->data_len += offset; 1169 else { 1170 struct rte_mbuf *seg = m; 1171 1172 while ((seg->next != NULL) && 1173 (offset > rte_pktmbuf_tailroom(seg))) 1174 seg = seg->next; 1175 1176 seg->data_len += offset; 1177 } 1178 m->pkt_len += offset; 1179 } 1180 1181 m->vlan_tci = vlan_tag; 1182 } 1183 1184 tx_q->m_table[len] = m; 1185 len++; 1186 if (enable_stats) { 1187 dev_statistics[dev->device_fh].tx_total++; 1188 dev_statistics[dev->device_fh].tx++; 1189 } 1190 1191 if (unlikely(len == MAX_PKT_BURST)) { 1192 m_table = (struct rte_mbuf **)tx_q->m_table; 1193 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1194 /* Free any buffers not handled by TX and update the port stats. */ 1195 if (unlikely(ret < len)) { 1196 do { 1197 rte_pktmbuf_free(m_table[ret]); 1198 } while (++ret < len); 1199 } 1200 1201 len = 0; 1202 } 1203 1204 tx_q->len = len; 1205 return; 1206 } 1207 /* 1208 * This function is called by each data core. It handles all RX/TX registered with the 1209 * core. 
For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
    struct rte_mempool *mbuf_pool = arg;
    struct virtio_net *dev = NULL;
    struct vhost_dev *vdev = NULL;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct virtio_net_data_ll *dev_ll;
    struct mbuf_table *tx_q;
    volatile struct lcore_ll_info *lcore_ll;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
    uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
    unsigned ret, i;
    const uint16_t lcore_id = rte_lcore_id();
    const uint16_t num_cores = (uint16_t)rte_lcore_count();
    uint16_t rx_count = 0;
    uint16_t tx_count;
    uint32_t retry = 0;

    RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
    lcore_ll = lcore_info[lcore_id].lcore_ll;
    prev_tsc = 0;

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < num_cores; i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {

            if (tx_q->len) {
                LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                /* TX any packets in the queue */
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                    (struct rte_mbuf **)tx_q->m_table,
                    (uint16_t)tx_q->len);
                if (unlikely(ret < tx_q->len)) {
                    do {
                        rte_pktmbuf_free(tx_q->m_table[ret]);
                    } while (++ret < tx_q->len);
                }

                tx_q->len = 0;
            }

            prev_tsc = cur_tsc;

        }

        rte_prefetch0(lcore_ll->ll_root_used);
        /*
         * Inform the configuration core that we have exited the linked list
         * and that no devices are in use, if requested.
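         * (Elsewhere in this example the configuration core sets REQUEST_DEV_REMOVAL
         * and waits for this data core to write ACK_DEV_REMOVAL before it frees the
         * device, so the acknowledgement is made outside of the list walk below.)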
         */
        if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

        /*
         * Process devices
         */
        dev_ll = lcore_ll->ll_root_used;

        while (dev_ll != NULL) {
            /* get virtio device ID */
            vdev = dev_ll->vdev;
            dev = vdev->dev;

            if (unlikely(vdev->remove)) {
                dev_ll = dev_ll->next;
                unlink_vmdq(vdev);
                vdev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }
            if (likely(vdev->ready == DEVICE_RX)) {
                /* Handle guest RX */
                rx_count = rte_eth_rx_burst(ports[0],
                    vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                if (rx_count) {
                    /*
                     * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
                     * Note that MAX_PKT_BURST must be less than the virtio queue size.
                     */
                    if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
                        for (retry = 0; retry < burst_rx_retry_num; retry++) {
                            rte_delay_us(burst_rx_delay_time);
                            if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
                                break;
                        }
                    }
                    ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
                    if (enable_stats) {
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
                            rx_count);
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
                    }
                    while (likely(rx_count)) {
                        rx_count--;
                        rte_pktmbuf_free(pkts_burst[rx_count]);
                    }

                }
            }

            if (likely(!vdev->remove)) {
                /* Handle guest TX */
                tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
                /* If this is the first received packet we need to learn the MAC and setup VMDQ */
                if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
                    if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
                        while (tx_count)
                            rte_pktmbuf_free(pkts_burst[--tx_count]);
                    }
                }
                while (tx_count)
                    virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
            }

            /* move to the next device in the list */
            dev_ll = dev_ll->next;
        }
    }

    return 0;
}

/*
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;

    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets an available ring index for zero copy RX;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
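 * Note: the ring indexes are uint16_t, so the free-entry count computed below
 * stays correct even when the available index wraps around.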
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
    uint16_t *res_base_idx, uint32_t count)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;
    uint32_t retry = 0;
    uint16_t free_entries;

    *res_base_idx = vq->last_used_idx_res;
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    free_entries = (avail_idx - *res_base_idx);

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
        "avail idx: %d, "
        "res base idx:%d, free entries:%d\n",
        dev->device_fh, avail_idx, *res_base_idx,
        free_entries);

    /*
     * If retry is enabled and the queue is full then we wait
     * and retry to avoid packet loss.
     */
    if (enable_retry && unlikely(count > free_entries)) {
        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            avail_idx = *((volatile uint16_t *)&vq->avail->idx);
            free_entries = (avail_idx - *res_base_idx);
            if (count <= free_entries)
                break;
        }
    }

    /* Check that we have enough buffers. */
    if (unlikely(count > free_entries))
        count = free_entries;

    if (unlikely(count == 0)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") Fail in get_available_ring_index_zcp: "
            "avail idx: %d, res base idx:%d, free entries:%d\n",
            dev->device_fh, avail_idx,
            *res_base_idx, free_entries);
        return 0;
    }

    vq->last_used_idx_res = *res_base_idx + count;

    return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
    uint16_t res_cur_idx = vq->last_used_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
    rte_compiler_barrier();
    *(volatile uint16_t *)&vq->used->idx += 1;
    vq->last_used_idx += 1;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->callfd, 1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
1445 */ 1446 static inline void __attribute__((always_inline)) 1447 attach_rxmbuf_zcp(struct virtio_net *dev) 1448 { 1449 uint16_t res_base_idx, desc_idx; 1450 uint64_t buff_addr, phys_addr; 1451 struct vhost_virtqueue *vq; 1452 struct vring_desc *desc; 1453 struct rte_mbuf *mbuf = NULL; 1454 struct vpool *vpool; 1455 hpa_type addr_type; 1456 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1457 1458 vpool = &vpool_array[vdev->vmdq_rx_q]; 1459 vq = dev->virtqueue[VIRTIO_RXQ]; 1460 1461 do { 1462 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1463 1) != 1)) 1464 return; 1465 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1466 1467 desc = &vq->desc[desc_idx]; 1468 if (desc->flags & VRING_DESC_F_NEXT) { 1469 desc = &vq->desc[desc->next]; 1470 buff_addr = gpa_to_vva(dev, desc->addr); 1471 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1472 &addr_type); 1473 } else { 1474 buff_addr = gpa_to_vva(dev, 1475 desc->addr + vq->vhost_hlen); 1476 phys_addr = gpa_to_hpa(vdev, 1477 desc->addr + vq->vhost_hlen, 1478 desc->len, &addr_type); 1479 } 1480 1481 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1482 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1483 " address found when attaching RX frame buffer" 1484 " address!\n", dev->device_fh); 1485 put_desc_to_used_list_zcp(vq, desc_idx); 1486 continue; 1487 } 1488 1489 /* 1490 * Check if the frame buffer address from guest crosses 1491 * sub-region or not. 1492 */ 1493 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1494 RTE_LOG(ERR, VHOST_DATA, 1495 "(%"PRIu64") Frame buffer address cross " 1496 "sub-regioin found when attaching RX frame " 1497 "buffer address!\n", 1498 dev->device_fh); 1499 put_desc_to_used_list_zcp(vq, desc_idx); 1500 continue; 1501 } 1502 } while (unlikely(phys_addr == 0)); 1503 1504 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1505 if (unlikely(mbuf == NULL)) { 1506 LOG_DEBUG(VHOST_DATA, 1507 "(%"PRIu64") in attach_rxmbuf_zcp: " 1508 "ring_sc_dequeue fail.\n", 1509 dev->device_fh); 1510 put_desc_to_used_list_zcp(vq, desc_idx); 1511 return; 1512 } 1513 1514 if (unlikely(vpool->buf_size > desc->len)) { 1515 LOG_DEBUG(VHOST_DATA, 1516 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1517 "length(%d) of descriptor idx: %d less than room " 1518 "size required: %d\n", 1519 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1520 put_desc_to_used_list_zcp(vq, desc_idx); 1521 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1522 return; 1523 } 1524 1525 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1526 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1527 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1528 mbuf->data_len = desc->len; 1529 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1530 1531 LOG_DEBUG(VHOST_DATA, 1532 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1533 "descriptor idx:%d\n", 1534 dev->device_fh, res_base_idx, desc_idx); 1535 1536 __rte_mbuf_raw_free(mbuf); 1537 1538 return; 1539 } 1540 1541 /* 1542 * Detach an attched packet mbuf - 1543 * - restore original mbuf address and length values. 1544 * - reset pktmbuf data and data_len to their default values. 1545 * All other fields of the given packet mbuf will be left intact. 1546 * 1547 * @param m 1548 * The attached packet mbuf. 
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
    const struct rte_mempool *mp = m->pool;
    void *buf = RTE_MBUF_TO_BADDR(m);
    uint32_t buf_ofs;
    uint32_t buf_len = mp->elt_size - sizeof(*m);
    m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

    m->buf_addr = buf;
    m->buf_len = (uint16_t)buf_len;

    buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
        RTE_PKTMBUF_HEADROOM : m->buf_len;
    m->data_off = buf_ofs;

    m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
    struct rte_mbuf *mbuf;
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
    uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
    uint32_t index = 0;
    uint32_t mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
        "clean is: %d\n",
        dev->device_fh, mbuf_count);
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
        "clean is : %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(MBUF_EXT_MEM(mbuf)))
            pktmbuf_detach_zcp(mbuf);
        rte_ring_sp_enqueue(vpool->ring, mbuf);

        /* Update used index buffer information. */
        vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
        vq->used->ring[used_idx].len = 0;

        used_idx = (used_idx + 1) & (vq->size - 1);
    }

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
        "clean is: %d\n",
        dev->device_fh, rte_mempool_count(vpool->pool));
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
        "clean is : %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: before updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    vq->last_used_idx += mbuf_count;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: after updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    rte_compiler_barrier();

    *(volatile uint16_t *)&vq->used->idx += mbuf_count;

    /* Kick guest if required. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->callfd, 1);

    return 0;
}

/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1638 */ 1639 static void mbuf_destroy_zcp(struct vpool *vpool) 1640 { 1641 struct rte_mbuf *mbuf = NULL; 1642 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1643 1644 LOG_DEBUG(VHOST_CONFIG, 1645 "in mbuf_destroy_zcp: mbuf count in mempool before " 1646 "mbuf_destroy_zcp is: %d\n", 1647 mbuf_count); 1648 LOG_DEBUG(VHOST_CONFIG, 1649 "in mbuf_destroy_zcp: mbuf count in ring before " 1650 "mbuf_destroy_zcp is : %d\n", 1651 rte_ring_count(vpool->ring)); 1652 1653 for (index = 0; index < mbuf_count; index++) { 1654 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1655 if (likely(mbuf != NULL)) { 1656 if (likely(MBUF_EXT_MEM(mbuf))) 1657 pktmbuf_detach_zcp(mbuf); 1658 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1659 } 1660 } 1661 1662 LOG_DEBUG(VHOST_CONFIG, 1663 "in mbuf_destroy_zcp: mbuf count in mempool after " 1664 "mbuf_destroy_zcp is: %d\n", 1665 rte_mempool_count(vpool->pool)); 1666 LOG_DEBUG(VHOST_CONFIG, 1667 "in mbuf_destroy_zcp: mbuf count in ring after " 1668 "mbuf_destroy_zcp is : %d\n", 1669 rte_ring_count(vpool->ring)); 1670 } 1671 1672 /* 1673 * This function update the use flag and counter. 1674 */ 1675 static inline uint32_t __attribute__((always_inline)) 1676 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1677 uint32_t count) 1678 { 1679 struct vhost_virtqueue *vq; 1680 struct vring_desc *desc; 1681 struct rte_mbuf *buff; 1682 /* The virtio_hdr is initialised to 0. */ 1683 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1684 = {{0, 0, 0, 0, 0, 0}, 0}; 1685 uint64_t buff_hdr_addr = 0; 1686 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1687 uint32_t head_idx, packet_success = 0; 1688 uint16_t res_cur_idx; 1689 1690 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1691 1692 if (count == 0) 1693 return 0; 1694 1695 vq = dev->virtqueue[VIRTIO_RXQ]; 1696 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1697 1698 res_cur_idx = vq->last_used_idx; 1699 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1700 dev->device_fh, res_cur_idx, res_cur_idx + count); 1701 1702 /* Retrieve all of the head indexes first to avoid caching issues. */ 1703 for (head_idx = 0; head_idx < count; head_idx++) 1704 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1705 1706 /*Prefetch descriptor index. */ 1707 rte_prefetch0(&vq->desc[head[packet_success]]); 1708 1709 while (packet_success != count) { 1710 /* Get descriptor from available ring */ 1711 desc = &vq->desc[head[packet_success]]; 1712 1713 buff = pkts[packet_success]; 1714 LOG_DEBUG(VHOST_DATA, 1715 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1716 "pkt[%d] descriptor idx: %d\n", 1717 dev->device_fh, packet_success, 1718 MBUF_HEADROOM_UINT32(buff)); 1719 1720 PRINT_PACKET(dev, 1721 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1722 + RTE_PKTMBUF_HEADROOM), 1723 rte_pktmbuf_data_len(buff), 0); 1724 1725 /* Buffer address translation for virtio header. */ 1726 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1727 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1728 1729 /* 1730 * If the descriptors are chained the header and data are 1731 * placed in separate buffers. 
         */
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc->len = vq->vhost_hlen;
            desc = &vq->desc[desc->next];
            desc->len = rte_pktmbuf_data_len(buff);
        } else {
            desc->len = packet_len;
        }

        /* Update used ring with desc information */
        vq->used->ring[res_cur_idx & (vq->size - 1)].id
            = head[packet_success];
        vq->used->ring[res_cur_idx & (vq->size - 1)].len
            = packet_len;
        res_cur_idx++;
        packet_success++;

        /* A header is required per buffer. */
        rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
            (const void *)&virtio_hdr, vq->vhost_hlen);

        PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

        if (likely(packet_success < count)) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success]]);
        }
    }

    rte_compiler_barrier();

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in dev_rx_zcp: before update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx += count;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in dev_rx_zcp: after update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->callfd, 1);

    return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
    uint32_t desc_idx, uint8_t need_copy)
{
    struct mbuf_table *tx_q;
    struct rte_mbuf **m_table;
    struct rte_mbuf *mbuf = NULL;
    unsigned len, ret, offset = 0;
    struct vpool *vpool;
    uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
    uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

    /* Add packet to the port tx queue */
    tx_q = &tx_queue_zcp[vmdq_rx_q];
    len = tx_q->len;

    /* Allocate an mbuf and populate the structure. */
    vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
    rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
    if (unlikely(mbuf == NULL)) {
        struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
        RTE_LOG(ERR, VHOST_DATA,
            "(%"PRIu64") Failed to allocate memory for mbuf.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (vm2vm_mode == VM2VM_HARDWARE) {
        /*
         * Do not reuse a VLAN tag from any VM (such as
         * vlan_tags[dev->device_fh]) for an external packet; otherwise it
         * conflicts during pool selection: the MAC address identifies it as
         * an external packet that should go out to the network, while the
         * VLAN tag identifies it as a VM2VM packet that should be forwarded
         * to another VM. The hardware cannot resolve such an ambiguous
         * situation, so the packet would be lost.
1822 */ 1823 vlan_tag = external_pkt_default_vlan_tag; 1824 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1825 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1826 __rte_mbuf_raw_free(mbuf); 1827 return; 1828 } 1829 } 1830 1831 mbuf->nb_segs = m->nb_segs; 1832 mbuf->next = m->next; 1833 mbuf->data_len = m->data_len + offset; 1834 mbuf->pkt_len = mbuf->data_len; 1835 if (unlikely(need_copy)) { 1836 /* Copy the packet contents to the mbuf. */ 1837 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1838 rte_pktmbuf_mtod(m, void *), 1839 m->data_len); 1840 } else { 1841 mbuf->data_off = m->data_off; 1842 mbuf->buf_physaddr = m->buf_physaddr; 1843 mbuf->buf_addr = m->buf_addr; 1844 } 1845 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1846 mbuf->vlan_tci = vlan_tag; 1847 mbuf->l2_len = sizeof(struct ether_hdr); 1848 mbuf->l3_len = sizeof(struct ipv4_hdr); 1849 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1850 1851 tx_q->m_table[len] = mbuf; 1852 len++; 1853 1854 LOG_DEBUG(VHOST_DATA, 1855 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1856 dev->device_fh, 1857 mbuf->nb_segs, 1858 (mbuf->next == NULL) ? "null" : "non-null"); 1859 1860 if (enable_stats) { 1861 dev_statistics[dev->device_fh].tx_total++; 1862 dev_statistics[dev->device_fh].tx++; 1863 } 1864 1865 if (unlikely(len == MAX_PKT_BURST)) { 1866 m_table = (struct rte_mbuf **)tx_q->m_table; 1867 ret = rte_eth_tx_burst(ports[0], 1868 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1869 1870 /* 1871 * Free any buffers not handled by TX and update 1872 * the port stats. 1873 */ 1874 if (unlikely(ret < len)) { 1875 do { 1876 rte_pktmbuf_free(m_table[ret]); 1877 } while (++ret < len); 1878 } 1879 1880 len = 0; 1881 txmbuf_clean_zcp(dev, vpool); 1882 } 1883 1884 tx_q->len = len; 1885 1886 return; 1887 } 1888 1889 /* 1890 * This function TX all available packets in virtio TX queue for one 1891 * virtio-net device. If it is first packet, it learns MAC address and 1892 * setup VMDQ. 1893 */ 1894 static inline void __attribute__((always_inline)) 1895 virtio_dev_tx_zcp(struct virtio_net *dev) 1896 { 1897 struct rte_mbuf m; 1898 struct vhost_virtqueue *vq; 1899 struct vring_desc *desc; 1900 uint64_t buff_addr = 0, phys_addr; 1901 uint32_t head[MAX_PKT_BURST]; 1902 uint32_t i; 1903 uint16_t free_entries, packet_success = 0; 1904 uint16_t avail_idx; 1905 uint8_t need_copy = 0; 1906 hpa_type addr_type; 1907 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1908 1909 vq = dev->virtqueue[VIRTIO_TXQ]; 1910 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1911 1912 /* If there are no available buffers then return. */ 1913 if (vq->last_used_idx_res == avail_idx) 1914 return; 1915 1916 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1917 1918 /* Prefetch available ring to retrieve head indexes. */ 1919 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1920 1921 /* Get the number of free entries in the ring */ 1922 free_entries = (avail_idx - vq->last_used_idx_res); 1923 1924 /* Limit to MAX_PKT_BURST. */ 1925 free_entries 1926 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1927 1928 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1929 dev->device_fh, free_entries); 1930 1931 /* Retrieve all of the head indexes first to avoid caching issues. */ 1932 for (i = 0; i < free_entries; i++) 1933 head[i] 1934 = vq->avail->ring[(vq->last_used_idx_res + i) 1935 & (vq->size - 1)]; 1936 1937 vq->last_used_idx_res += free_entries; 1938 1939 /* Prefetch descriptor index. 
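 * The next used-ring slot is prefetched as well. Ring slots here are
 * addressed with a free-running index masked by (vq->size - 1), which
 * relies on the virtio ring size being a power of two.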
*/ 1940 rte_prefetch0(&vq->desc[head[packet_success]]); 1941 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1942 1943 while (packet_success < free_entries) { 1944 desc = &vq->desc[head[packet_success]]; 1945 1946 /* Discard first buffer as it is the virtio header */ 1947 desc = &vq->desc[desc->next]; 1948 1949 /* Buffer address translation. */ 1950 buff_addr = gpa_to_vva(dev, desc->addr); 1951 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1952 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1953 &addr_type); 1954 1955 if (likely(packet_success < (free_entries - 1))) 1956 /* Prefetch descriptor index. */ 1957 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1958 1959 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1960 RTE_LOG(ERR, VHOST_DATA, 1961 "(%"PRIu64") Invalid frame buffer address found" 1962 "when TX packets!\n", 1963 dev->device_fh); 1964 packet_success++; 1965 continue; 1966 } 1967 1968 /* Prefetch buffer address. */ 1969 rte_prefetch0((void *)(uintptr_t)buff_addr); 1970 1971 /* 1972 * Setup dummy mbuf. This is copied to a real mbuf if 1973 * transmitted out the physical port. 1974 */ 1975 m.data_len = desc->len; 1976 m.nb_segs = 1; 1977 m.next = NULL; 1978 m.data_off = 0; 1979 m.buf_addr = (void *)(uintptr_t)buff_addr; 1980 m.buf_physaddr = phys_addr; 1981 1982 /* 1983 * Check if the frame buffer address from guest crosses 1984 * sub-region or not. 1985 */ 1986 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1987 RTE_LOG(ERR, VHOST_DATA, 1988 "(%"PRIu64") Frame buffer address cross " 1989 "sub-regioin found when attaching TX frame " 1990 "buffer address!\n", 1991 dev->device_fh); 1992 need_copy = 1; 1993 } else 1994 need_copy = 0; 1995 1996 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 1997 1998 /* 1999 * If this is the first received packet we need to learn 2000 * the MAC and setup VMDQ 2001 */ 2002 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2003 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2004 /* 2005 * Discard frame if device is scheduled for 2006 * removal or a duplicate MAC address is found. 2007 */ 2008 packet_success += free_entries; 2009 vq->last_used_idx += packet_success; 2010 break; 2011 } 2012 } 2013 2014 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2015 packet_success++; 2016 } 2017 } 2018 2019 /* 2020 * This function is called by each data core. It handles all RX/TX registered 2021 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2022 * addresses are compared with all devices in the main linked list. 
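 * In zero-copy mode the core additionally drains each device's TX table
 * on a timeout and recycles mbufs between the per-queue vpool mempool
 * and ring (attach before RX, detach once the guest RX is done).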
2023 */ 2024 static int 2025 switch_worker_zcp(__attribute__((unused)) void *arg) 2026 { 2027 struct virtio_net *dev = NULL; 2028 struct vhost_dev *vdev = NULL; 2029 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2030 struct virtio_net_data_ll *dev_ll; 2031 struct mbuf_table *tx_q; 2032 volatile struct lcore_ll_info *lcore_ll; 2033 const uint64_t drain_tsc 2034 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2035 * BURST_TX_DRAIN_US; 2036 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2037 unsigned ret; 2038 const uint16_t lcore_id = rte_lcore_id(); 2039 uint16_t count_in_ring, rx_count = 0; 2040 2041 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2042 2043 lcore_ll = lcore_info[lcore_id].lcore_ll; 2044 prev_tsc = 0; 2045 2046 while (1) { 2047 cur_tsc = rte_rdtsc(); 2048 2049 /* TX burst queue drain */ 2050 diff_tsc = cur_tsc - prev_tsc; 2051 if (unlikely(diff_tsc > drain_tsc)) { 2052 /* 2053 * Get mbuf from vpool.pool and detach mbuf and 2054 * put back into vpool.ring. 2055 */ 2056 dev_ll = lcore_ll->ll_root_used; 2057 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2058 /* Get virtio device ID */ 2059 vdev = dev_ll->vdev; 2060 dev = vdev->dev; 2061 2062 if (likely(!vdev->remove)) { 2063 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2064 if (tx_q->len) { 2065 LOG_DEBUG(VHOST_DATA, 2066 "TX queue drained after timeout" 2067 " with burst size %u\n", 2068 tx_q->len); 2069 2070 /* 2071 * Tx any packets in the queue 2072 */ 2073 ret = rte_eth_tx_burst( 2074 ports[0], 2075 (uint16_t)tx_q->txq_id, 2076 (struct rte_mbuf **) 2077 tx_q->m_table, 2078 (uint16_t)tx_q->len); 2079 if (unlikely(ret < tx_q->len)) { 2080 do { 2081 rte_pktmbuf_free( 2082 tx_q->m_table[ret]); 2083 } while (++ret < tx_q->len); 2084 } 2085 tx_q->len = 0; 2086 2087 txmbuf_clean_zcp(dev, 2088 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2089 } 2090 } 2091 dev_ll = dev_ll->next; 2092 } 2093 prev_tsc = cur_tsc; 2094 } 2095 2096 rte_prefetch0(lcore_ll->ll_root_used); 2097 2098 /* 2099 * Inform the configuration core that we have exited the linked 2100 * list and that no devices are in use if requested. 2101 */ 2102 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2103 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2104 2105 /* Process devices */ 2106 dev_ll = lcore_ll->ll_root_used; 2107 2108 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2109 vdev = dev_ll->vdev; 2110 dev = vdev->dev; 2111 if (unlikely(vdev->remove)) { 2112 dev_ll = dev_ll->next; 2113 unlink_vmdq(vdev); 2114 vdev->ready = DEVICE_SAFE_REMOVE; 2115 continue; 2116 } 2117 2118 if (likely(vdev->ready == DEVICE_RX)) { 2119 uint32_t index = vdev->vmdq_rx_q; 2120 uint16_t i; 2121 count_in_ring 2122 = rte_ring_count(vpool_array[index].ring); 2123 uint16_t free_entries 2124 = (uint16_t)get_available_ring_num_zcp(dev); 2125 2126 /* 2127 * Attach all mbufs in vpool.ring and put back 2128 * into vpool.pool. 
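 * The number of attaches is capped by the mbufs available in the ring,
 * by the free entries in the guest RX ring and by MAX_PKT_BURST,
 * matching the rte_eth_rx_burst() call that follows.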
2129 */ 2130 for (i = 0; 2131 i < RTE_MIN(free_entries, 2132 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2133 i++) 2134 attach_rxmbuf_zcp(dev); 2135 2136 /* Handle guest RX */ 2137 rx_count = rte_eth_rx_burst(ports[0], 2138 vdev->vmdq_rx_q, pkts_burst, 2139 MAX_PKT_BURST); 2140 2141 if (rx_count) { 2142 ret_count = virtio_dev_rx_zcp(dev, 2143 pkts_burst, rx_count); 2144 if (enable_stats) { 2145 dev_statistics[dev->device_fh].rx_total 2146 += rx_count; 2147 dev_statistics[dev->device_fh].rx 2148 += ret_count; 2149 } 2150 while (likely(rx_count)) { 2151 rx_count--; 2152 pktmbuf_detach_zcp( 2153 pkts_burst[rx_count]); 2154 rte_ring_sp_enqueue( 2155 vpool_array[index].ring, 2156 (void *)pkts_burst[rx_count]); 2157 } 2158 } 2159 } 2160 2161 if (likely(!vdev->remove)) 2162 /* Handle guest TX */ 2163 virtio_dev_tx_zcp(dev); 2164 2165 /* Move to the next device in the list */ 2166 dev_ll = dev_ll->next; 2167 } 2168 } 2169 2170 return 0; 2171 } 2172 2173 2174 /* 2175 * Add an entry to a used linked list. A free entry must first be found 2176 * in the free linked list using get_data_ll_free_entry(); 2177 */ 2178 static void 2179 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2180 struct virtio_net_data_ll *ll_dev) 2181 { 2182 struct virtio_net_data_ll *ll = *ll_root_addr; 2183 2184 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2185 ll_dev->next = NULL; 2186 rte_compiler_barrier(); 2187 2188 /* If ll == NULL then this is the first device. */ 2189 if (ll) { 2190 /* Increment to the tail of the linked list. */ 2191 while ((ll->next != NULL) ) 2192 ll = ll->next; 2193 2194 ll->next = ll_dev; 2195 } else { 2196 *ll_root_addr = ll_dev; 2197 } 2198 } 2199 2200 /* 2201 * Remove an entry from a used linked list. The entry must then be added to 2202 * the free linked list using put_data_ll_free_entry(). 2203 */ 2204 static void 2205 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2206 struct virtio_net_data_ll *ll_dev, 2207 struct virtio_net_data_ll *ll_dev_last) 2208 { 2209 struct virtio_net_data_ll *ll = *ll_root_addr; 2210 2211 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2212 return; 2213 2214 if (ll_dev == ll) 2215 *ll_root_addr = ll_dev->next; 2216 else 2217 if (likely(ll_dev_last != NULL)) 2218 ll_dev_last->next = ll_dev->next; 2219 else 2220 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2221 } 2222 2223 /* 2224 * Find and return an entry from the free linked list. 2225 */ 2226 static struct virtio_net_data_ll * 2227 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2228 { 2229 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2230 struct virtio_net_data_ll *ll_dev; 2231 2232 if (ll_free == NULL) 2233 return NULL; 2234 2235 ll_dev = ll_free; 2236 *ll_root_addr = ll_free->next; 2237 2238 return ll_dev; 2239 } 2240 2241 /* 2242 * Place an entry back on to the free linked list. 2243 */ 2244 static void 2245 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2246 struct virtio_net_data_ll *ll_dev) 2247 { 2248 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2249 2250 if (ll_dev == NULL) 2251 return; 2252 2253 ll_dev->next = ll_free; 2254 *ll_root_addr = ll_dev; 2255 } 2256 2257 /* 2258 * Creates a linked list of a given size. 2259 */ 2260 static struct virtio_net_data_ll * 2261 alloc_data_ll(uint32_t size) 2262 { 2263 struct virtio_net_data_ll *ll_new; 2264 uint32_t i; 2265 2266 /* Malloc and then chain the linked list. 
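 * All entries come from a single allocation and are chained through
 * their next pointers; the final entry terminates the list with NULL.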
*/ 2267 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2268 if (ll_new == NULL) { 2269 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2270 return NULL; 2271 } 2272 2273 for (i = 0; i < size - 1; i++) { 2274 ll_new[i].vdev = NULL; 2275 ll_new[i].next = &ll_new[i+1]; 2276 } 2277 ll_new[i].next = NULL; 2278 2279 return (ll_new); 2280 } 2281 2282 /* 2283 * Create the main linked list along with each individual cores linked list. A used and a free list 2284 * are created to manage entries. 2285 */ 2286 static int 2287 init_data_ll (void) 2288 { 2289 int lcore; 2290 2291 RTE_LCORE_FOREACH_SLAVE(lcore) { 2292 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2293 if (lcore_info[lcore].lcore_ll == NULL) { 2294 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2295 return -1; 2296 } 2297 2298 lcore_info[lcore].lcore_ll->device_num = 0; 2299 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2300 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2301 if (num_devices % num_switching_cores) 2302 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2303 else 2304 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2305 } 2306 2307 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2308 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2309 2310 return 0; 2311 } 2312 2313 /* 2314 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2315 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2316 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2317 */ 2318 static void 2319 destroy_device (volatile struct virtio_net *dev) 2320 { 2321 struct virtio_net_data_ll *ll_lcore_dev_cur; 2322 struct virtio_net_data_ll *ll_main_dev_cur; 2323 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2324 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2325 struct vhost_dev *vdev; 2326 int lcore; 2327 2328 dev->flags &= ~VIRTIO_DEV_RUNNING; 2329 2330 vdev = (struct vhost_dev *)dev->priv; 2331 /*set the remove flag. */ 2332 vdev->remove = 1; 2333 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2334 rte_pause(); 2335 } 2336 2337 /* Search for entry to be removed from lcore ll */ 2338 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2339 while (ll_lcore_dev_cur != NULL) { 2340 if (ll_lcore_dev_cur->vdev == vdev) { 2341 break; 2342 } else { 2343 ll_lcore_dev_last = ll_lcore_dev_cur; 2344 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2345 } 2346 } 2347 2348 if (ll_lcore_dev_cur == NULL) { 2349 RTE_LOG(ERR, VHOST_CONFIG, 2350 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2351 dev->device_fh); 2352 return; 2353 } 2354 2355 /* Search for entry to be removed from main ll */ 2356 ll_main_dev_cur = ll_root_used; 2357 ll_main_dev_last = NULL; 2358 while (ll_main_dev_cur != NULL) { 2359 if (ll_main_dev_cur->vdev == vdev) { 2360 break; 2361 } else { 2362 ll_main_dev_last = ll_main_dev_cur; 2363 ll_main_dev_cur = ll_main_dev_cur->next; 2364 } 2365 } 2366 2367 /* Remove entries from the lcore and main ll. */ 2368 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2369 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2370 2371 /* Set the dev_removal_flag on each lcore. 
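 * Each data core writes ACK_DEV_REMOVAL back once it is outside its
 * linked-list walk (see switch_worker_zcp), which is what the wait
 * loop below polls for.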
*/ 2372 RTE_LCORE_FOREACH_SLAVE(lcore) { 2373 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2374 } 2375 2376 /* 2377 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2378 * they can no longer access the device removed from the linked lists and that the devices 2379 * are no longer in use. 2380 */ 2381 RTE_LCORE_FOREACH_SLAVE(lcore) { 2382 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2383 rte_pause(); 2384 } 2385 } 2386 2387 /* Add the entries back to the lcore and main free ll. */ 2388 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2389 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2390 2391 /* Decrement the number of devices on the lcore. */ 2392 lcore_info[vdev->coreid].lcore_ll->device_num--; 2393 2394 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2395 2396 if (zero_copy) { 2397 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2398 2399 /* Stop the RX queue. */ 2400 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2401 LOG_DEBUG(VHOST_CONFIG, 2402 "(%"PRIu64") In destroy_device: Failed to stop " 2403 "rx queue:%d\n", 2404 dev->device_fh, 2405 vdev->vmdq_rx_q); 2406 } 2407 2408 LOG_DEBUG(VHOST_CONFIG, 2409 "(%"PRIu64") in destroy_device: Start put mbuf in " 2410 "mempool back to ring for RX queue: %d\n", 2411 dev->device_fh, vdev->vmdq_rx_q); 2412 2413 mbuf_destroy_zcp(vpool); 2414 2415 /* Stop the TX queue. */ 2416 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2417 LOG_DEBUG(VHOST_CONFIG, 2418 "(%"PRIu64") In destroy_device: Failed to " 2419 "stop tx queue:%d\n", 2420 dev->device_fh, vdev->vmdq_rx_q); 2421 } 2422 2423 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2424 2425 LOG_DEBUG(VHOST_CONFIG, 2426 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2427 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2428 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2429 dev->device_fh); 2430 2431 mbuf_destroy_zcp(vpool); 2432 rte_free(vdev->regions_hpa); 2433 } 2434 rte_free(vdev); 2435 2436 } 2437 2438 /* 2439 * Count the breaks in physical contiguity (the number of extra physically 2440 * contiguous sub-regions) within one region whose vhost virtual address 2441 * range is contiguous. The region starts at vva_start and spans 'size' bytes.
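 * For example, a region of four pages whose physical pages form two
 * contiguous blocks contains one break, so 1 is returned and the caller
 * ends up with two sub-regions for that region.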
2442 */ 2443 static uint32_t 2444 check_hpa_regions(uint64_t vva_start, uint64_t size) 2445 { 2446 uint32_t i, nregions = 0, page_size = getpagesize(); 2447 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2448 if (vva_start % page_size) { 2449 LOG_DEBUG(VHOST_CONFIG, 2450 "in check_countinous: vva start(%p) mod page_size(%d) " 2451 "has remainder\n", 2452 (void *)(uintptr_t)vva_start, page_size); 2453 return 0; 2454 } 2455 if (size % page_size) { 2456 LOG_DEBUG(VHOST_CONFIG, 2457 "in check_countinous: " 2458 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2459 size, page_size); 2460 return 0; 2461 } 2462 for (i = 0; i < size - page_size; i = i + page_size) { 2463 cur_phys_addr 2464 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2465 next_phys_addr = rte_mem_virt2phy( 2466 (void *)(uintptr_t)(vva_start + i + page_size)); 2467 if ((cur_phys_addr + page_size) != next_phys_addr) { 2468 ++nregions; 2469 LOG_DEBUG(VHOST_CONFIG, 2470 "in check_continuous: hva addr:(%p) is not " 2471 "continuous with hva addr:(%p), diff:%d\n", 2472 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2473 (void *)(uintptr_t)(vva_start + (uint64_t)i 2474 + page_size), page_size); 2475 LOG_DEBUG(VHOST_CONFIG, 2476 "in check_continuous: hpa addr:(%p) is not " 2477 "continuous with hpa addr:(%p), " 2478 "diff:(%"PRIu64")\n", 2479 (void *)(uintptr_t)cur_phys_addr, 2480 (void *)(uintptr_t)next_phys_addr, 2481 (next_phys_addr-cur_phys_addr)); 2482 } 2483 } 2484 return nregions; 2485 } 2486 2487 /* 2488 * Divide each region whose vhost virtual address is continous into a few 2489 * sub-regions, make sure the physical address within each sub-region are 2490 * continous. And fill offset(to GPA) and size etc. information of each 2491 * sub-region into regions_hpa. 2492 */ 2493 static uint32_t 2494 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2495 { 2496 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2497 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2498 2499 if (mem_region_hpa == NULL) 2500 return 0; 2501 2502 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2503 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2504 virtio_memory->regions[regionidx].address_offset; 2505 mem_region_hpa[regionidx_hpa].guest_phys_address 2506 = virtio_memory->regions[regionidx].guest_phys_address; 2507 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2508 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2509 mem_region_hpa[regionidx_hpa].guest_phys_address; 2510 LOG_DEBUG(VHOST_CONFIG, 2511 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2512 regionidx_hpa, 2513 (void *)(uintptr_t) 2514 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2515 LOG_DEBUG(VHOST_CONFIG, 2516 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2517 regionidx_hpa, 2518 (void *)(uintptr_t) 2519 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2520 for (i = 0, k = 0; 2521 i < virtio_memory->regions[regionidx].memory_size - 2522 page_size; 2523 i += page_size) { 2524 cur_phys_addr = rte_mem_virt2phy( 2525 (void *)(uintptr_t)(vva_start + i)); 2526 next_phys_addr = rte_mem_virt2phy( 2527 (void *)(uintptr_t)(vva_start + 2528 i + page_size)); 2529 if ((cur_phys_addr + page_size) != next_phys_addr) { 2530 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2531 mem_region_hpa[regionidx_hpa].guest_phys_address + 2532 k + page_size; 2533 mem_region_hpa[regionidx_hpa].memory_size 2534 = k + 
page_size; 2535 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2536 "phys addr end [%d]:(%p)\n", 2537 regionidx_hpa, 2538 (void *)(uintptr_t) 2539 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2540 LOG_DEBUG(VHOST_CONFIG, 2541 "in fill_hpa_regions: guest phys addr " 2542 "size [%d]:(%p)\n", 2543 regionidx_hpa, 2544 (void *)(uintptr_t) 2545 (mem_region_hpa[regionidx_hpa].memory_size)); 2546 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2547 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2548 ++regionidx_hpa; 2549 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2550 next_phys_addr - 2551 mem_region_hpa[regionidx_hpa].guest_phys_address; 2552 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2553 " phys addr start[%d]:(%p)\n", 2554 regionidx_hpa, 2555 (void *)(uintptr_t) 2556 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2557 LOG_DEBUG(VHOST_CONFIG, 2558 "in fill_hpa_regions: host phys addr " 2559 "start[%d]:(%p)\n", 2560 regionidx_hpa, 2561 (void *)(uintptr_t) 2562 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2563 k = 0; 2564 } else { 2565 k += page_size; 2566 } 2567 } 2568 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2569 = mem_region_hpa[regionidx_hpa].guest_phys_address 2570 + k + page_size; 2571 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2572 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2573 "[%d]:(%p)\n", regionidx_hpa, 2574 (void *)(uintptr_t) 2575 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2576 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2577 "[%d]:(%p)\n", regionidx_hpa, 2578 (void *)(uintptr_t) 2579 (mem_region_hpa[regionidx_hpa].memory_size)); 2580 ++regionidx_hpa; 2581 } 2582 return regionidx_hpa; 2583 } 2584 2585 /* 2586 * A new device is added to a data core. First the device is added to the main linked list 2587 * and the allocated to a specific data core. 
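 * The chosen core is the one currently serving the fewest devices, as
 * determined by the device_num_min scan over the slave lcores below.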
2588 */ 2589 static int 2590 new_device (struct virtio_net *dev) 2591 { 2592 struct virtio_net_data_ll *ll_dev; 2593 int lcore, core_add = 0; 2594 uint32_t device_num_min = num_devices; 2595 struct vhost_dev *vdev; 2596 uint32_t regionidx; 2597 2598 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2599 if (vdev == NULL) { 2600 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2601 dev->device_fh); 2602 return -1; 2603 } 2604 vdev->dev = dev; 2605 dev->priv = vdev; 2606 2607 if (zero_copy) { 2608 vdev->nregions_hpa = dev->mem->nregions; 2609 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2610 vdev->nregions_hpa 2611 += check_hpa_regions( 2612 dev->mem->regions[regionidx].guest_phys_address 2613 + dev->mem->regions[regionidx].address_offset, 2614 dev->mem->regions[regionidx].memory_size); 2615 2616 } 2617 2618 vdev->regions_hpa = rte_calloc("vhost hpa region", 2619 vdev->nregions_hpa, 2620 sizeof(struct virtio_memory_regions_hpa), 2621 RTE_CACHE_LINE_SIZE); 2622 if (vdev->regions_hpa == NULL) { 2623 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2624 rte_free(vdev); 2625 return -1; 2626 } 2627 2628 2629 if (fill_hpa_memory_regions( 2630 vdev->regions_hpa, dev->mem 2631 ) != vdev->nregions_hpa) { 2632 2633 RTE_LOG(ERR, VHOST_CONFIG, 2634 "hpa memory regions number mismatch: " 2635 "[%d]\n", vdev->nregions_hpa); 2636 rte_free(vdev->regions_hpa); 2637 rte_free(vdev); 2638 return -1; 2639 } 2640 } 2641 2642 2643 /* Add device to main ll */ 2644 ll_dev = get_data_ll_free_entry(&ll_root_free); 2645 if (ll_dev == NULL) { 2646 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2647 "of %d devices per core has been reached\n", 2648 dev->device_fh, num_devices); 2649 if (vdev->regions_hpa) 2650 rte_free(vdev->regions_hpa); 2651 rte_free(vdev); 2652 return -1; 2653 } 2654 ll_dev->vdev = vdev; 2655 add_data_ll_entry(&ll_root_used, ll_dev); 2656 vdev->vmdq_rx_q 2657 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2658 2659 if (zero_copy) { 2660 uint32_t index = vdev->vmdq_rx_q; 2661 uint32_t count_in_ring, i; 2662 struct mbuf_table *tx_q; 2663 2664 count_in_ring = rte_ring_count(vpool_array[index].ring); 2665 2666 LOG_DEBUG(VHOST_CONFIG, 2667 "(%"PRIu64") in new_device: mbuf count in mempool " 2668 "before attach is: %d\n", 2669 dev->device_fh, 2670 rte_mempool_count(vpool_array[index].pool)); 2671 LOG_DEBUG(VHOST_CONFIG, 2672 "(%"PRIu64") in new_device: mbuf count in ring " 2673 "before attach is : %d\n", 2674 dev->device_fh, count_in_ring); 2675 2676 /* 2677 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
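 * This primes the RX mempool for the newly assigned queue before the
 * queue is started below; count_in_ring was sampled just above, so
 * exactly that many mbufs are attached.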
2678 */ 2679 for (i = 0; i < count_in_ring; i++) 2680 attach_rxmbuf_zcp(dev); 2681 2682 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2683 "mempool after attach is: %d\n", 2684 dev->device_fh, 2685 rte_mempool_count(vpool_array[index].pool)); 2686 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2687 "ring after attach is : %d\n", 2688 dev->device_fh, 2689 rte_ring_count(vpool_array[index].ring)); 2690 2691 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2692 tx_q->txq_id = vdev->vmdq_rx_q; 2693 2694 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2695 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2696 2697 LOG_DEBUG(VHOST_CONFIG, 2698 "(%"PRIu64") In new_device: Failed to start " 2699 "tx queue:%d\n", 2700 dev->device_fh, vdev->vmdq_rx_q); 2701 2702 mbuf_destroy_zcp(vpool); 2703 rte_free(vdev->regions_hpa); 2704 rte_free(vdev); 2705 return -1; 2706 } 2707 2708 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2709 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2710 2711 LOG_DEBUG(VHOST_CONFIG, 2712 "(%"PRIu64") In new_device: Failed to start " 2713 "rx queue:%d\n", 2714 dev->device_fh, vdev->vmdq_rx_q); 2715 2716 /* Stop the TX queue. */ 2717 if (rte_eth_dev_tx_queue_stop(ports[0], 2718 vdev->vmdq_rx_q) != 0) { 2719 LOG_DEBUG(VHOST_CONFIG, 2720 "(%"PRIu64") In new_device: Failed to " 2721 "stop tx queue:%d\n", 2722 dev->device_fh, vdev->vmdq_rx_q); 2723 } 2724 2725 mbuf_destroy_zcp(vpool); 2726 rte_free(vdev->regions_hpa); 2727 rte_free(vdev); 2728 return -1; 2729 } 2730 2731 } 2732 2733 /* Reset the ready flag. */ 2734 vdev->ready = DEVICE_MAC_LEARNING; 2735 vdev->remove = 0; 2736 2737 /* Find a suitable lcore to add the device. */ 2738 RTE_LCORE_FOREACH_SLAVE(lcore) { 2739 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2740 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2741 core_add = lcore; 2742 } 2743 } 2744 /* Add device to lcore ll */ 2745 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2746 if (ll_dev == NULL) { 2747 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2748 vdev->ready = DEVICE_SAFE_REMOVE; 2749 destroy_device(dev); 2750 rte_free(vdev->regions_hpa); 2751 rte_free(vdev); 2752 return -1; 2753 } 2754 ll_dev->vdev = vdev; 2755 vdev->coreid = core_add; 2756 2757 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2758 2759 /* Initialize device stats */ 2760 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2761 2762 /* Disable notifications. */ 2763 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2764 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2765 lcore_info[vdev->coreid].lcore_ll->device_num++; 2766 dev->flags |= VIRTIO_DEV_RUNNING; 2767 2768 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2769 2770 return 0; 2771 } 2772 2773 /* 2774 * These callbacks allow devices to be added to the data core when configuration 2775 * has fully completed. 2776 */ 2777 static const struct virtio_net_device_ops virtio_net_device_ops = 2778 { 2779 .new_device = new_device, 2780 .destroy_device = destroy_device, 2781 }; 2782 2783 /* 2784 * This thread wakes up periodically to print statistics if the user has 2785 * enabled them.
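 * The loop below clears the screen and redraws the per-device counters,
 * sleeping for enable_stats seconds between refreshes.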
2786 */ 2787 static void 2788 print_stats(void) 2789 { 2790 struct virtio_net_data_ll *dev_ll; 2791 uint64_t tx_dropped, rx_dropped; 2792 uint64_t tx, tx_total, rx, rx_total; 2793 uint32_t device_fh; 2794 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2795 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2796 2797 while(1) { 2798 sleep(enable_stats); 2799 2800 /* Clear screen and move to top left */ 2801 printf("%s%s", clr, top_left); 2802 2803 printf("\nDevice statistics ===================================="); 2804 2805 dev_ll = ll_root_used; 2806 while (dev_ll != NULL) { 2807 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2808 tx_total = dev_statistics[device_fh].tx_total; 2809 tx = dev_statistics[device_fh].tx; 2810 tx_dropped = tx_total - tx; 2811 if (zero_copy == 0) { 2812 rx_total = rte_atomic64_read( 2813 &dev_statistics[device_fh].rx_total_atomic); 2814 rx = rte_atomic64_read( 2815 &dev_statistics[device_fh].rx_atomic); 2816 } else { 2817 rx_total = dev_statistics[device_fh].rx_total; 2818 rx = dev_statistics[device_fh].rx; 2819 } 2820 rx_dropped = rx_total - rx; 2821 2822 printf("\nStatistics for device %"PRIu32" ------------------------------" 2823 "\nTX total: %"PRIu64"" 2824 "\nTX dropped: %"PRIu64"" 2825 "\nTX successful: %"PRIu64"" 2826 "\nRX total: %"PRIu64"" 2827 "\nRX dropped: %"PRIu64"" 2828 "\nRX successful: %"PRIu64"", 2829 device_fh, 2830 tx_total, 2831 tx_dropped, 2832 tx, 2833 rx_total, 2834 rx_dropped, 2835 rx); 2836 2837 dev_ll = dev_ll->next; 2838 } 2839 printf("\n======================================================\n"); 2840 } 2841 } 2842 2843 static void 2844 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2845 char *ring_name, uint32_t nb_mbuf) 2846 { 2847 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2848 vpool_array[index].pool 2849 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2850 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2851 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2852 rte_pktmbuf_init, NULL, socket, 0); 2853 if (vpool_array[index].pool != NULL) { 2854 vpool_array[index].ring 2855 = rte_ring_create(ring_name, 2856 rte_align32pow2(nb_mbuf + 1), 2857 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2858 if (likely(vpool_array[index].ring != NULL)) { 2859 LOG_DEBUG(VHOST_CONFIG, 2860 "in setup_mempool_tbl: mbuf count in " 2861 "mempool is: %d\n", 2862 rte_mempool_count(vpool_array[index].pool)); 2863 LOG_DEBUG(VHOST_CONFIG, 2864 "in setup_mempool_tbl: mbuf count in " 2865 "ring is: %d\n", 2866 rte_ring_count(vpool_array[index].ring)); 2867 } else { 2868 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2869 ring_name); 2870 } 2871 2872 /* Need consider head room. */ 2873 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2874 } else { 2875 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2876 } 2877 } 2878 2879 2880 /* 2881 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2882 * device is also registered here to handle the IOCTLs. 
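 * In zero-copy mode a dedicated mempool/ring pair is created for every
 * RX and TX queue via setup_mempool_tbl(); otherwise a single shared
 * mbuf pool is used for all queues.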
2883 */ 2884 int 2885 main(int argc, char *argv[]) 2886 { 2887 struct rte_mempool *mbuf_pool = NULL; 2888 unsigned lcore_id, core_id = 0; 2889 unsigned nb_ports, valid_num_ports; 2890 int ret; 2891 uint8_t portid; 2892 uint16_t queue_id; 2893 static pthread_t tid; 2894 2895 /* init EAL */ 2896 ret = rte_eal_init(argc, argv); 2897 if (ret < 0) 2898 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2899 argc -= ret; 2900 argv += ret; 2901 2902 /* parse app arguments */ 2903 ret = us_vhost_parse_args(argc, argv); 2904 if (ret < 0) 2905 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2906 2907 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2908 if (rte_lcore_is_enabled(lcore_id)) 2909 lcore_ids[core_id ++] = lcore_id; 2910 2911 if (rte_lcore_count() > RTE_MAX_LCORE) 2912 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2913 2914 /*set the number of swithcing cores available*/ 2915 num_switching_cores = rte_lcore_count()-1; 2916 2917 /* Get the number of physical ports. */ 2918 nb_ports = rte_eth_dev_count(); 2919 if (nb_ports > RTE_MAX_ETHPORTS) 2920 nb_ports = RTE_MAX_ETHPORTS; 2921 2922 /* 2923 * Update the global var NUM_PORTS and global array PORTS 2924 * and get value of var VALID_NUM_PORTS according to system ports number 2925 */ 2926 valid_num_ports = check_ports_num(nb_ports); 2927 2928 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2929 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2930 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2931 return -1; 2932 } 2933 2934 if (zero_copy == 0) { 2935 /* Create the mbuf pool. */ 2936 mbuf_pool = rte_mempool_create( 2937 "MBUF_POOL", 2938 NUM_MBUFS_PER_PORT 2939 * valid_num_ports, 2940 MBUF_SIZE, MBUF_CACHE_SIZE, 2941 sizeof(struct rte_pktmbuf_pool_private), 2942 rte_pktmbuf_pool_init, NULL, 2943 rte_pktmbuf_init, NULL, 2944 rte_socket_id(), 0); 2945 if (mbuf_pool == NULL) 2946 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2947 2948 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2949 vpool_array[queue_id].pool = mbuf_pool; 2950 2951 if (vm2vm_mode == VM2VM_HARDWARE) { 2952 /* Enable VT loop back to let L2 switch to do it. */ 2953 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2954 LOG_DEBUG(VHOST_CONFIG, 2955 "Enable loop back for L2 switch in vmdq.\n"); 2956 } 2957 } else { 2958 uint32_t nb_mbuf; 2959 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2960 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2961 2962 nb_mbuf = num_rx_descriptor 2963 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2964 + num_switching_cores * MAX_PKT_BURST; 2965 2966 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2967 snprintf(pool_name, sizeof(pool_name), 2968 "rxmbuf_pool_%u", queue_id); 2969 snprintf(ring_name, sizeof(ring_name), 2970 "rxmbuf_ring_%u", queue_id); 2971 setup_mempool_tbl(rte_socket_id(), queue_id, 2972 pool_name, ring_name, nb_mbuf); 2973 } 2974 2975 nb_mbuf = num_tx_descriptor 2976 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2977 + num_switching_cores * MAX_PKT_BURST; 2978 2979 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2980 snprintf(pool_name, sizeof(pool_name), 2981 "txmbuf_pool_%u", queue_id); 2982 snprintf(ring_name, sizeof(ring_name), 2983 "txmbuf_ring_%u", queue_id); 2984 setup_mempool_tbl(rte_socket_id(), 2985 (queue_id + MAX_QUEUES), 2986 pool_name, ring_name, nb_mbuf); 2987 } 2988 2989 if (vm2vm_mode == VM2VM_HARDWARE) { 2990 /* Enable VT loop back to let L2 switch to do it. 
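 * As in the non zero-copy path above, VM2VM_HARDWARE relies on the NIC
 * VT loop back for local VM-to-VM delivery.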
*/ 2991 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2992 LOG_DEBUG(VHOST_CONFIG, 2993 "Enable loop back for L2 switch in vmdq.\n"); 2994 } 2995 } 2996 /* Set log level. */ 2997 rte_set_log_level(LOG_LEVEL); 2998 2999 /* initialize all ports */ 3000 for (portid = 0; portid < nb_ports; portid++) { 3001 /* skip ports that are not enabled */ 3002 if ((enabled_port_mask & (1 << portid)) == 0) { 3003 RTE_LOG(INFO, VHOST_PORT, 3004 "Skipping disabled port %d\n", portid); 3005 continue; 3006 } 3007 if (port_init(portid) != 0) 3008 rte_exit(EXIT_FAILURE, 3009 "Cannot initialize network ports\n"); 3010 } 3011 3012 /* Initialise all linked lists. */ 3013 if (init_data_ll() == -1) 3014 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3015 3016 /* Initialize device stats */ 3017 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3018 3019 /* Enable stats if the user option is set. */ 3020 if (enable_stats) 3021 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3022 3023 /* Launch all data cores. */ 3024 if (zero_copy == 0) { 3025 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3026 rte_eal_remote_launch(switch_worker, 3027 mbuf_pool, lcore_id); 3028 } 3029 } else { 3030 uint32_t count_in_mempool, index, i; 3031 for (index = 0; index < 2*MAX_QUEUES; index++) { 3032 /* For all RX and TX queues. */ 3033 count_in_mempool 3034 = rte_mempool_count(vpool_array[index].pool); 3035 3036 /* 3037 * Transfer all un-attached mbufs from vpool.pool 3038 * to vpoo.ring. 3039 */ 3040 for (i = 0; i < count_in_mempool; i++) { 3041 struct rte_mbuf *mbuf 3042 = __rte_mbuf_raw_alloc( 3043 vpool_array[index].pool); 3044 rte_ring_sp_enqueue(vpool_array[index].ring, 3045 (void *)mbuf); 3046 } 3047 3048 LOG_DEBUG(VHOST_CONFIG, 3049 "in main: mbuf count in mempool at initial " 3050 "is: %d\n", count_in_mempool); 3051 LOG_DEBUG(VHOST_CONFIG, 3052 "in main: mbuf count in ring at initial is :" 3053 " %d\n", 3054 rte_ring_count(vpool_array[index].ring)); 3055 } 3056 3057 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3058 rte_eal_remote_launch(switch_worker_zcp, NULL, 3059 lcore_id); 3060 } 3061 3062 if (mergeable == 0) 3063 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3064 3065 /* Register CUSE device to handle IOCTLs. */ 3066 ret = rte_vhost_driver_register((char *)&dev_basename); 3067 if (ret != 0) 3068 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3069 3070 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3071 3072 /* Start CUSE session. */ 3073 rte_vhost_driver_session_start(); 3074 return 0; 3075 3076 } 3077 3078