/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 512

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
				(num_switching_cores*MAX_PKT_BURST) + \
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
				(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2
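/*
 * A vhost device moves through the states above as follows: it starts in
 * DEVICE_MAC_LEARNING until its first TX packet reveals its MAC address,
 * becomes DEVICE_RX once the MAC/VLAN pair has been registered with VMDQ,
 * and is marked DEVICE_SAFE_REMOVE only after the data core has unlinked
 * it during removal.
 */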
/* Config core flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
 * then rounded down to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32	/* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64	/* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

#define MBUF_EXT_MEM(mb) (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMAed directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Do vlan strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
/* Character device basename. Can be set by the user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty vmdq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * Required for 1G NICs such as the I350: it fixes a bug where
		 * IPv4 forwarding in the guest cannot forward packets from one
		 * virtio device to another virtio device.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads of the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
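/*
 * An 802.1Q tag adds 4 bytes (TPID + TCI) between the source MAC and the
 * original EtherType, so a tagged Ethernet header is VLAN_ETH_HLEN (18)
 * bytes instead of 14; the struct below mirrors that layout for the
 * software tag-insertion path.
 */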
/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
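/*
 * Example of the resulting mapping (illustrative): with num_devices == 8,
 * pool 0 accepts VLAN 1000 (pools mask 0x01), pool 1 accepts VLAN 1001
 * (mask 0x02), ..., pool 7 accepts VLAN 1007 (mask 0x80). Each guest is
 * therefore isolated in its own VMDQ pool by the VLAN tag assigned to it
 * in vlan_tags[].
 */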
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/*
	 * The max pool number from dev_info will be used to validate the pool
	 * number specified in the command line.
	 */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}
	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse string argument */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Takes effect only if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n",
	       prgname);
}
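/*
 * Illustrative invocation (EAL options first, application options after
 * "--"; adjust the core mask, memory channels and port mask to the target
 * system):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2 \
 *       --dev-basename vhost-net
 *
 * This uses physical port 0, keeps the default software VM2VM switching
 * and prints statistics every two seconds.
 */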
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"vlan-strip", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}
			/* Enable/disable RX VLAN strip on host. */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for rx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for tx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;
		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the system
 * port number and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
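/*
 * Each pre-computed region maps a contiguous range of guest physical
 * addresses onto host physical addresses, so the translation below is a
 * simple offset add:
 *
 *   host_phys = guest_phys + region->host_phys_addr_offset
 *
 * A buffer that starts inside a region but ends beyond it is reported as
 * PHYS_ADDR_CROSS_SUBREG so callers can fall back to copying.
 */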
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along
 * with a vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	if (vlan_strip)
		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
			(uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}
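/*
 * In VM2VM_SOFTWARE mode the function above bypasses the physical port
 * entirely: a frame whose destination MAC belongs to another local guest
 * is enqueued straight onto that guest's RX virtqueue. In VM2VM_HARDWARE
 * mode the lookup below is only used to pick the right VLAN tag; the NIC
 * then loops the frame back into the destination pool.
 */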
/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its vlan tag and the length offset to apply.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip will reduce the packet length by the
			 * length of the vlan tag, so the packet length needs
			 * to be restored by adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a
 * local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;
	struct ether_hdr *nh;

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags = PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}
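/*
 * The data core below also drains partially filled TX bursts on a timer,
 * so low-rate traffic is not held back waiting for a full burst of
 * MAX_PKT_BURST packets. The timeout is converted from microseconds to
 * TSC cycles once; for example (illustrative), with a 2.0 GHz TSC:
 *
 *   drain_tsc = 2000000000 / US_PER_S * BURST_TX_DRAIN_US
 *             = 2000 * 100 = 200000 cycles (~100 us)
 */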
/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						       (struct rte_mbuf **)tx_q->m_table,
						       (uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked
		 * list and that no devices are in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we
					 * wait and retry to avoid packet loss. Here
					 * MAX_PKT_BURST must be less than the virtio queue
					 * size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
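/*
 * Both indexes above are free-running uint16_t counters, so the
 * subtraction is intentionally performed modulo 2^16: the result is the
 * number of descriptors the guest has made available but the host has not
 * yet reserved, and it stays correct across index wrap-around.
 */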
/*
 * This function gets available ring indexes for zero copy RX; it will retry
 * up to 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}
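/*
 * Note the ordering in the helper above: the used ring entry is written
 * first, a compiler barrier prevents reordering, and only then is
 * used->idx published to the guest; the eventfd kick is skipped when the
 * guest has set VRING_AVAIL_F_NO_INTERRUPT.
 */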
/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from the guest crosses
		 * a sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}
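/*
 * While a guest buffer is attached, the descriptor index is stashed in the
 * first four bytes of the mbuf headroom (MBUF_HEADROOM_UINT32), so that
 * virtio_dev_rx_zcp() and txmbuf_clean_zcp() can later return exactly that
 * descriptor to the used ring without a separate lookup table.
 */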
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 * All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(MBUF_EXT_MEM(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}
/*
 * This function is called when a virtio device is destroyed. It fetches mbufs
 * from vpool->pool, detaches them and puts them into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(MBUF_EXT_MEM(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function enqueues received packets to the guest RX virtqueue for the
 * zero copy path and updates the used ring accordingly.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a vlan tag from any vm for an external packet,
		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
		 * during pool selection: the MAC address identifies it as an
		 * external packet that should go to the network, while the
		 * vlan tag identifies it as a vm2vm packet that should be
		 * forwarded to another vm. The hardware cannot resolve such
		 * an ambiguous situation, so the packet would be lost.
		 */
1835 */ 1836 vlan_tag = external_pkt_default_vlan_tag; 1837 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1838 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1839 __rte_mbuf_raw_free(mbuf); 1840 return; 1841 } 1842 } 1843 1844 mbuf->nb_segs = m->nb_segs; 1845 mbuf->next = m->next; 1846 mbuf->data_len = m->data_len + offset; 1847 mbuf->pkt_len = mbuf->data_len; 1848 if (unlikely(need_copy)) { 1849 /* Copy the packet contents to the mbuf. */ 1850 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1851 rte_pktmbuf_mtod(m, void *), 1852 m->data_len); 1853 } else { 1854 mbuf->data_off = m->data_off; 1855 mbuf->buf_physaddr = m->buf_physaddr; 1856 mbuf->buf_addr = m->buf_addr; 1857 } 1858 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1859 mbuf->vlan_tci = vlan_tag; 1860 mbuf->l2_len = sizeof(struct ether_hdr); 1861 mbuf->l3_len = sizeof(struct ipv4_hdr); 1862 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1863 1864 tx_q->m_table[len] = mbuf; 1865 len++; 1866 1867 LOG_DEBUG(VHOST_DATA, 1868 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1869 dev->device_fh, 1870 mbuf->nb_segs, 1871 (mbuf->next == NULL) ? "null" : "non-null"); 1872 1873 if (enable_stats) { 1874 dev_statistics[dev->device_fh].tx_total++; 1875 dev_statistics[dev->device_fh].tx++; 1876 } 1877 1878 if (unlikely(len == MAX_PKT_BURST)) { 1879 m_table = (struct rte_mbuf **)tx_q->m_table; 1880 ret = rte_eth_tx_burst(ports[0], 1881 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1882 1883 /* 1884 * Free any buffers not handled by TX and update 1885 * the port stats. 1886 */ 1887 if (unlikely(ret < len)) { 1888 do { 1889 rte_pktmbuf_free(m_table[ret]); 1890 } while (++ret < len); 1891 } 1892 1893 len = 0; 1894 txmbuf_clean_zcp(dev, vpool); 1895 } 1896 1897 tx_q->len = len; 1898 1899 return; 1900 } 1901 1902 /* 1903 * This function TX all available packets in virtio TX queue for one 1904 * virtio-net device. If it is first packet, it learns MAC address and 1905 * setup VMDQ. 1906 */ 1907 static inline void __attribute__((always_inline)) 1908 virtio_dev_tx_zcp(struct virtio_net *dev) 1909 { 1910 struct rte_mbuf m; 1911 struct vhost_virtqueue *vq; 1912 struct vring_desc *desc; 1913 uint64_t buff_addr = 0, phys_addr; 1914 uint32_t head[MAX_PKT_BURST]; 1915 uint32_t i; 1916 uint16_t free_entries, packet_success = 0; 1917 uint16_t avail_idx; 1918 uint8_t need_copy = 0; 1919 hpa_type addr_type; 1920 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1921 1922 vq = dev->virtqueue[VIRTIO_TXQ]; 1923 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1924 1925 /* If there are no available buffers then return. */ 1926 if (vq->last_used_idx_res == avail_idx) 1927 return; 1928 1929 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1930 1931 /* Prefetch available ring to retrieve head indexes. */ 1932 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1933 1934 /* Get the number of free entries in the ring */ 1935 free_entries = (avail_idx - vq->last_used_idx_res); 1936 1937 /* Limit to MAX_PKT_BURST. */ 1938 free_entries 1939 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1940 1941 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1942 dev->device_fh, free_entries); 1943 1944 /* Retrieve all of the head indexes first to avoid caching issues. */ 1945 for (i = 0; i < free_entries; i++) 1946 head[i] 1947 = vq->avail->ring[(vq->last_used_idx_res + i) 1948 & (vq->size - 1)]; 1949 1950 vq->last_used_idx_res += free_entries; 1951 1952 /* Prefetch descriptor index. 
*/ 1953 rte_prefetch0(&vq->desc[head[packet_success]]); 1954 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1955 1956 while (packet_success < free_entries) { 1957 desc = &vq->desc[head[packet_success]]; 1958 1959 /* Discard first buffer as it is the virtio header */ 1960 desc = &vq->desc[desc->next]; 1961 1962 /* Buffer address translation. */ 1963 buff_addr = gpa_to_vva(dev, desc->addr); 1964 /* Need to check the extra VLAN_HLEN bytes for inserting the VLAN tag */ 1965 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1966 &addr_type); 1967 1968 if (likely(packet_success < (free_entries - 1))) 1969 /* Prefetch descriptor index. */ 1970 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1971 1972 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1973 RTE_LOG(ERR, VHOST_DATA, 1974 "(%"PRIu64") Invalid frame buffer address found " 1975 "when transmitting packets!\n", 1976 dev->device_fh); 1977 packet_success++; 1978 continue; 1979 } 1980 1981 /* Prefetch buffer address. */ 1982 rte_prefetch0((void *)(uintptr_t)buff_addr); 1983 1984 /* 1985 * Setup dummy mbuf. This is copied to a real mbuf if 1986 * transmitted out of the physical port. 1987 */ 1988 m.data_len = desc->len; 1989 m.nb_segs = 1; 1990 m.next = NULL; 1991 m.data_off = 0; 1992 m.buf_addr = (void *)(uintptr_t)buff_addr; 1993 m.buf_physaddr = phys_addr; 1994 1995 /* 1996 * Check if the frame buffer address from the guest crosses 1997 * a sub-region or not. 1998 */ 1999 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2000 RTE_LOG(ERR, VHOST_DATA, 2001 "(%"PRIu64") Frame buffer address crossing a " 2002 "sub-region boundary found when attaching TX frame " 2003 "buffer address!\n", 2004 dev->device_fh); 2005 need_copy = 1; 2006 } else 2007 need_copy = 0; 2008 2009 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2010 2011 /* 2012 * If this is the first received packet we need to learn 2013 * the MAC address and set up VMDQ 2014 */ 2015 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2016 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2017 /* 2018 * Discard frame if device is scheduled for 2019 * removal or a duplicate MAC address is found. 2020 */ 2021 packet_success += free_entries; 2022 vq->last_used_idx += packet_success; 2023 break; 2024 } 2025 } 2026 2027 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2028 packet_success++; 2029 } 2030 } 2031 2032 /* 2033 * This function is called by each data core. It handles all RX/TX registered 2034 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2035 * addresses are compared with all devices in the main linked list.
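 * The worker also drains the zero-copy TX queues: any packets left in a tx_queue_zcp entry are flushed once more than BURST_TX_DRAIN_US microseconds have passed since the previous drain (drain_tsc below is that interval expressed in TSC cycles).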
2036 */ 2037 static int 2038 switch_worker_zcp(__attribute__((unused)) void *arg) 2039 { 2040 struct virtio_net *dev = NULL; 2041 struct vhost_dev *vdev = NULL; 2042 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2043 struct virtio_net_data_ll *dev_ll; 2044 struct mbuf_table *tx_q; 2045 volatile struct lcore_ll_info *lcore_ll; 2046 const uint64_t drain_tsc 2047 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2048 * BURST_TX_DRAIN_US; 2049 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2050 unsigned ret; 2051 const uint16_t lcore_id = rte_lcore_id(); 2052 uint16_t count_in_ring, rx_count = 0; 2053 2054 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2055 2056 lcore_ll = lcore_info[lcore_id].lcore_ll; 2057 prev_tsc = 0; 2058 2059 while (1) { 2060 cur_tsc = rte_rdtsc(); 2061 2062 /* TX burst queue drain */ 2063 diff_tsc = cur_tsc - prev_tsc; 2064 if (unlikely(diff_tsc > drain_tsc)) { 2065 /* 2066 * Get mbuf from vpool.pool and detach mbuf and 2067 * put back into vpool.ring. 2068 */ 2069 dev_ll = lcore_ll->ll_root_used; 2070 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2071 /* Get virtio device ID */ 2072 vdev = dev_ll->vdev; 2073 dev = vdev->dev; 2074 2075 if (likely(!vdev->remove)) { 2076 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2077 if (tx_q->len) { 2078 LOG_DEBUG(VHOST_DATA, 2079 "TX queue drained after timeout" 2080 " with burst size %u\n", 2081 tx_q->len); 2082 2083 /* 2084 * Tx any packets in the queue 2085 */ 2086 ret = rte_eth_tx_burst( 2087 ports[0], 2088 (uint16_t)tx_q->txq_id, 2089 (struct rte_mbuf **) 2090 tx_q->m_table, 2091 (uint16_t)tx_q->len); 2092 if (unlikely(ret < tx_q->len)) { 2093 do { 2094 rte_pktmbuf_free( 2095 tx_q->m_table[ret]); 2096 } while (++ret < tx_q->len); 2097 } 2098 tx_q->len = 0; 2099 2100 txmbuf_clean_zcp(dev, 2101 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2102 } 2103 } 2104 dev_ll = dev_ll->next; 2105 } 2106 prev_tsc = cur_tsc; 2107 } 2108 2109 rte_prefetch0(lcore_ll->ll_root_used); 2110 2111 /* 2112 * Inform the configuration core that we have exited the linked 2113 * list and that no devices are in use if requested. 2114 */ 2115 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2116 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2117 2118 /* Process devices */ 2119 dev_ll = lcore_ll->ll_root_used; 2120 2121 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2122 vdev = dev_ll->vdev; 2123 dev = vdev->dev; 2124 if (unlikely(vdev->remove)) { 2125 dev_ll = dev_ll->next; 2126 unlink_vmdq(vdev); 2127 vdev->ready = DEVICE_SAFE_REMOVE; 2128 continue; 2129 } 2130 2131 if (likely(vdev->ready == DEVICE_RX)) { 2132 uint32_t index = vdev->vmdq_rx_q; 2133 uint16_t i; 2134 count_in_ring 2135 = rte_ring_count(vpool_array[index].ring); 2136 uint16_t free_entries 2137 = (uint16_t)get_available_ring_num_zcp(dev); 2138 2139 /* 2140 * Attach all mbufs in vpool.ring and put back 2141 * into vpool.pool. 
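 * The number attached per pass is capped by the smallest of: the free entries in the guest RX ring, the mbufs currently sitting in vpool.ring, and MAX_PKT_BURST.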
2142 */ 2143 for (i = 0; 2144 i < RTE_MIN(free_entries, 2145 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2146 i++) 2147 attach_rxmbuf_zcp(dev); 2148 2149 /* Handle guest RX */ 2150 rx_count = rte_eth_rx_burst(ports[0], 2151 vdev->vmdq_rx_q, pkts_burst, 2152 MAX_PKT_BURST); 2153 2154 if (rx_count) { 2155 ret_count = virtio_dev_rx_zcp(dev, 2156 pkts_burst, rx_count); 2157 if (enable_stats) { 2158 dev_statistics[dev->device_fh].rx_total 2159 += rx_count; 2160 dev_statistics[dev->device_fh].rx 2161 += ret_count; 2162 } 2163 while (likely(rx_count)) { 2164 rx_count--; 2165 pktmbuf_detach_zcp( 2166 pkts_burst[rx_count]); 2167 rte_ring_sp_enqueue( 2168 vpool_array[index].ring, 2169 (void *)pkts_burst[rx_count]); 2170 } 2171 } 2172 } 2173 2174 if (likely(!vdev->remove)) 2175 /* Handle guest TX */ 2176 virtio_dev_tx_zcp(dev); 2177 2178 /* Move to the next device in the list */ 2179 dev_ll = dev_ll->next; 2180 } 2181 } 2182 2183 return 0; 2184 } 2185 2186 2187 /* 2188 * Add an entry to a used linked list. A free entry must first be found 2189 * in the free linked list using get_data_ll_free_entry(); 2190 */ 2191 static void 2192 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2193 struct virtio_net_data_ll *ll_dev) 2194 { 2195 struct virtio_net_data_ll *ll = *ll_root_addr; 2196 2197 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2198 ll_dev->next = NULL; 2199 rte_compiler_barrier(); 2200 2201 /* If ll == NULL then this is the first device. */ 2202 if (ll) { 2203 /* Increment to the tail of the linked list. */ 2204 while ((ll->next != NULL) ) 2205 ll = ll->next; 2206 2207 ll->next = ll_dev; 2208 } else { 2209 *ll_root_addr = ll_dev; 2210 } 2211 } 2212 2213 /* 2214 * Remove an entry from a used linked list. The entry must then be added to 2215 * the free linked list using put_data_ll_free_entry(). 2216 */ 2217 static void 2218 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2219 struct virtio_net_data_ll *ll_dev, 2220 struct virtio_net_data_ll *ll_dev_last) 2221 { 2222 struct virtio_net_data_ll *ll = *ll_root_addr; 2223 2224 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2225 return; 2226 2227 if (ll_dev == ll) 2228 *ll_root_addr = ll_dev->next; 2229 else 2230 if (likely(ll_dev_last != NULL)) 2231 ll_dev_last->next = ll_dev->next; 2232 else 2233 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2234 } 2235 2236 /* 2237 * Find and return an entry from the free linked list. 2238 */ 2239 static struct virtio_net_data_ll * 2240 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2241 { 2242 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2243 struct virtio_net_data_ll *ll_dev; 2244 2245 if (ll_free == NULL) 2246 return NULL; 2247 2248 ll_dev = ll_free; 2249 *ll_root_addr = ll_free->next; 2250 2251 return ll_dev; 2252 } 2253 2254 /* 2255 * Place an entry back on to the free linked list. 2256 */ 2257 static void 2258 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2259 struct virtio_net_data_ll *ll_dev) 2260 { 2261 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2262 2263 if (ll_dev == NULL) 2264 return; 2265 2266 ll_dev->next = ll_free; 2267 *ll_root_addr = ll_dev; 2268 } 2269 2270 /* 2271 * Creates a linked list of a given size. 2272 */ 2273 static struct virtio_net_data_ll * 2274 alloc_data_ll(uint32_t size) 2275 { 2276 struct virtio_net_data_ll *ll_new; 2277 uint32_t i; 2278 2279 /* Malloc and then chain the linked list. 
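 * The entries are allocated as a single array; each entry's next pointer is set to the following element and the last entry terminates the list with NULL.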
*/ 2280 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2281 if (ll_new == NULL) { 2282 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2283 return NULL; 2284 } 2285 2286 for (i = 0; i < size - 1; i++) { 2287 ll_new[i].vdev = NULL; 2288 ll_new[i].next = &ll_new[i+1]; 2289 } 2290 ll_new[i].next = NULL; 2291 2292 return (ll_new); 2293 } 2294 2295 /* 2296 * Create the main linked list along with each individual cores linked list. A used and a free list 2297 * are created to manage entries. 2298 */ 2299 static int 2300 init_data_ll (void) 2301 { 2302 int lcore; 2303 2304 RTE_LCORE_FOREACH_SLAVE(lcore) { 2305 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2306 if (lcore_info[lcore].lcore_ll == NULL) { 2307 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2308 return -1; 2309 } 2310 2311 lcore_info[lcore].lcore_ll->device_num = 0; 2312 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2313 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2314 if (num_devices % num_switching_cores) 2315 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2316 else 2317 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2318 } 2319 2320 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2321 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2322 2323 return 0; 2324 } 2325 2326 /* 2327 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2328 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2329 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2330 */ 2331 static void 2332 destroy_device (volatile struct virtio_net *dev) 2333 { 2334 struct virtio_net_data_ll *ll_lcore_dev_cur; 2335 struct virtio_net_data_ll *ll_main_dev_cur; 2336 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2337 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2338 struct vhost_dev *vdev; 2339 int lcore; 2340 2341 dev->flags &= ~VIRTIO_DEV_RUNNING; 2342 2343 vdev = (struct vhost_dev *)dev->priv; 2344 /*set the remove flag. */ 2345 vdev->remove = 1; 2346 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2347 rte_pause(); 2348 } 2349 2350 /* Search for entry to be removed from lcore ll */ 2351 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2352 while (ll_lcore_dev_cur != NULL) { 2353 if (ll_lcore_dev_cur->vdev == vdev) { 2354 break; 2355 } else { 2356 ll_lcore_dev_last = ll_lcore_dev_cur; 2357 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2358 } 2359 } 2360 2361 if (ll_lcore_dev_cur == NULL) { 2362 RTE_LOG(ERR, VHOST_CONFIG, 2363 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2364 dev->device_fh); 2365 return; 2366 } 2367 2368 /* Search for entry to be removed from main ll */ 2369 ll_main_dev_cur = ll_root_used; 2370 ll_main_dev_last = NULL; 2371 while (ll_main_dev_cur != NULL) { 2372 if (ll_main_dev_cur->vdev == vdev) { 2373 break; 2374 } else { 2375 ll_main_dev_last = ll_main_dev_cur; 2376 ll_main_dev_cur = ll_main_dev_cur->next; 2377 } 2378 } 2379 2380 /* Remove entries from the lcore and main ll. */ 2381 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2382 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2383 2384 /* Set the dev_removal_flag on each lcore. 
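 * The flag is observed by the worker loop, which acknowledges the request between passes over its device list; the loop below then spins with rte_pause() until every worker has acknowledged.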
*/ 2385 RTE_LCORE_FOREACH_SLAVE(lcore) { 2386 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2387 } 2388 2389 /* 2390 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2391 * they can no longer access the device removed from the linked lists and that the devices 2392 * are no longer in use. 2393 */ 2394 RTE_LCORE_FOREACH_SLAVE(lcore) { 2395 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2396 rte_pause(); 2397 } 2398 } 2399 2400 /* Add the entries back to the lcore and main free ll. */ 2401 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2402 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2403 2404 /* Decrement the number of devices on the lcore. */ 2405 lcore_info[vdev->coreid].lcore_ll->device_num--; 2406 2407 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2408 2409 if (zero_copy) { 2410 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2411 2412 /* Stop the RX queue. */ 2413 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2414 LOG_DEBUG(VHOST_CONFIG, 2415 "(%"PRIu64") In destroy_device: Failed to stop " 2416 "rx queue:%d\n", 2417 dev->device_fh, 2418 vdev->vmdq_rx_q); 2419 } 2420 2421 LOG_DEBUG(VHOST_CONFIG, 2422 "(%"PRIu64") in destroy_device: Start putting mbufs from the " 2423 "mempool back into the ring for RX queue: %d\n", 2424 dev->device_fh, vdev->vmdq_rx_q); 2425 2426 mbuf_destroy_zcp(vpool); 2427 2428 /* Stop the TX queue. */ 2429 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2430 LOG_DEBUG(VHOST_CONFIG, 2431 "(%"PRIu64") In destroy_device: Failed to " 2432 "stop tx queue:%d\n", 2433 dev->device_fh, vdev->vmdq_rx_q); 2434 } 2435 2436 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2437 2438 LOG_DEBUG(VHOST_CONFIG, 2439 "(%"PRIu64") destroy_device: Start putting mbufs from the mempool " 2440 "back into the ring for TX queue: %d, dev:(%"PRIu64")\n", 2441 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2442 dev->device_fh); 2443 2444 mbuf_destroy_zcp(vpool); 2445 rte_free(vdev->regions_hpa); 2446 } 2447 rte_free(vdev); 2448 2449 } 2450 2451 /* 2452 * Calculate the number of physically contiguous sub-regions within one 2453 * particular region whose vhost virtual address range is contiguous. The 2454 * region starts at vva_start and is 'size' bytes long.
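 * The count is obtained by walking the region one page at a time and counting every point where rte_mem_virt2phy() of the next page is not exactly page_size bytes beyond that of the current page.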
2455 */ 2456 static uint32_t 2457 check_hpa_regions(uint64_t vva_start, uint64_t size) 2458 { 2459 uint32_t i, nregions = 0, page_size = getpagesize(); 2460 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2461 if (vva_start % page_size) { 2462 LOG_DEBUG(VHOST_CONFIG, 2463 "in check_hpa_regions: vva start(%p) mod page_size(%d) " 2464 "has remainder\n", 2465 (void *)(uintptr_t)vva_start, page_size); 2466 return 0; 2467 } 2468 if (size % page_size) { 2469 LOG_DEBUG(VHOST_CONFIG, 2470 "in check_hpa_regions: " 2471 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2472 size, page_size); 2473 return 0; 2474 } 2475 for (i = 0; i < size - page_size; i = i + page_size) { 2476 cur_phys_addr 2477 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2478 next_phys_addr = rte_mem_virt2phy( 2479 (void *)(uintptr_t)(vva_start + i + page_size)); 2480 if ((cur_phys_addr + page_size) != next_phys_addr) { 2481 ++nregions; 2482 LOG_DEBUG(VHOST_CONFIG, 2483 "in check_hpa_regions: hva addr:(%p) is not " 2484 "continuous with hva addr:(%p), diff:%d\n", 2485 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2486 (void *)(uintptr_t)(vva_start + (uint64_t)i 2487 + page_size), page_size); 2488 LOG_DEBUG(VHOST_CONFIG, 2489 "in check_hpa_regions: hpa addr:(%p) is not " 2490 "continuous with hpa addr:(%p), " 2491 "diff:(%"PRIu64")\n", 2492 (void *)(uintptr_t)cur_phys_addr, 2493 (void *)(uintptr_t)next_phys_addr, 2494 (next_phys_addr-cur_phys_addr)); 2495 } 2496 } 2497 return nregions; 2498 } 2499 2500 /* 2501 * Divide each region whose vhost virtual address range is contiguous into 2502 * sub-regions within which the physical addresses are also contiguous, and 2503 * fill the offset (to GPA), size and other information of each 2504 * sub-region into regions_hpa. 2505 */ 2506 static uint32_t 2507 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2508 { 2509 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2510 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2511 2512 if (mem_region_hpa == NULL) 2513 return 0; 2514 2515 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2516 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2517 virtio_memory->regions[regionidx].address_offset; 2518 mem_region_hpa[regionidx_hpa].guest_phys_address 2519 = virtio_memory->regions[regionidx].guest_phys_address; 2520 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2521 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2522 mem_region_hpa[regionidx_hpa].guest_phys_address; 2523 LOG_DEBUG(VHOST_CONFIG, 2524 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2525 regionidx_hpa, 2526 (void *)(uintptr_t) 2527 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2528 LOG_DEBUG(VHOST_CONFIG, 2529 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2530 regionidx_hpa, 2531 (void *)(uintptr_t) 2532 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2533 for (i = 0, k = 0; 2534 i < virtio_memory->regions[regionidx].memory_size - 2535 page_size; 2536 i += page_size) { 2537 cur_phys_addr = rte_mem_virt2phy( 2538 (void *)(uintptr_t)(vva_start + i)); 2539 next_phys_addr = rte_mem_virt2phy( 2540 (void *)(uintptr_t)(vva_start + 2541 i + page_size)); 2542 if ((cur_phys_addr + page_size) != next_phys_addr) { 2543 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2544 mem_region_hpa[regionidx_hpa].guest_phys_address + 2545 k + page_size; 2546 mem_region_hpa[regionidx_hpa].memory_size 2547 = k +
page_size; 2548 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2549 "phys addr end [%d]:(%p)\n", 2550 regionidx_hpa, 2551 (void *)(uintptr_t) 2552 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2553 LOG_DEBUG(VHOST_CONFIG, 2554 "in fill_hpa_regions: guest phys addr " 2555 "size [%d]:(%p)\n", 2556 regionidx_hpa, 2557 (void *)(uintptr_t) 2558 (mem_region_hpa[regionidx_hpa].memory_size)); 2559 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2560 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2561 ++regionidx_hpa; 2562 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2563 next_phys_addr - 2564 mem_region_hpa[regionidx_hpa].guest_phys_address; 2565 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2566 " phys addr start[%d]:(%p)\n", 2567 regionidx_hpa, 2568 (void *)(uintptr_t) 2569 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2570 LOG_DEBUG(VHOST_CONFIG, 2571 "in fill_hpa_regions: host phys addr " 2572 "start[%d]:(%p)\n", 2573 regionidx_hpa, 2574 (void *)(uintptr_t) 2575 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2576 k = 0; 2577 } else { 2578 k += page_size; 2579 } 2580 } 2581 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2582 = mem_region_hpa[regionidx_hpa].guest_phys_address 2583 + k + page_size; 2584 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2585 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2586 "[%d]:(%p)\n", regionidx_hpa, 2587 (void *)(uintptr_t) 2588 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2589 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2590 "[%d]:(%p)\n", regionidx_hpa, 2591 (void *)(uintptr_t) 2592 (mem_region_hpa[regionidx_hpa].memory_size)); 2593 ++regionidx_hpa; 2594 } 2595 return regionidx_hpa; 2596 } 2597 2598 /* 2599 * A new device is added to a data core. First the device is added to the main linked list 2600 * and then allocated to a specific data core.
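 * The device's VMDq RX queue is derived from its device_fh (device_fh * queues_per_pool + vmdq_queue_base) and the data core chosen is the one currently hosting the fewest devices.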
2601 */ 2602 static int 2603 new_device (struct virtio_net *dev) 2604 { 2605 struct virtio_net_data_ll *ll_dev; 2606 int lcore, core_add = 0; 2607 uint32_t device_num_min = num_devices; 2608 struct vhost_dev *vdev; 2609 uint32_t regionidx; 2610 2611 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2612 if (vdev == NULL) { 2613 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2614 dev->device_fh); 2615 return -1; 2616 } 2617 vdev->dev = dev; 2618 dev->priv = vdev; 2619 2620 if (zero_copy) { 2621 vdev->nregions_hpa = dev->mem->nregions; 2622 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2623 vdev->nregions_hpa 2624 += check_hpa_regions( 2625 dev->mem->regions[regionidx].guest_phys_address 2626 + dev->mem->regions[regionidx].address_offset, 2627 dev->mem->regions[regionidx].memory_size); 2628 2629 } 2630 2631 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2632 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2633 RTE_CACHE_LINE_SIZE); 2634 if (vdev->regions_hpa == NULL) { 2635 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2636 rte_free(vdev); 2637 return -1; 2638 } 2639 2640 2641 if (fill_hpa_memory_regions( 2642 vdev->regions_hpa, dev->mem 2643 ) != vdev->nregions_hpa) { 2644 2645 RTE_LOG(ERR, VHOST_CONFIG, 2646 "hpa memory regions number mismatch: " 2647 "[%d]\n", vdev->nregions_hpa); 2648 rte_free(vdev->regions_hpa); 2649 rte_free(vdev); 2650 return -1; 2651 } 2652 } 2653 2654 2655 /* Add device to main ll */ 2656 ll_dev = get_data_ll_free_entry(&ll_root_free); 2657 if (ll_dev == NULL) { 2658 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2659 "of %d devices per core has been reached\n", 2660 dev->device_fh, num_devices); 2661 if (vdev->regions_hpa) 2662 rte_free(vdev->regions_hpa); 2663 rte_free(vdev); 2664 return -1; 2665 } 2666 ll_dev->vdev = vdev; 2667 add_data_ll_entry(&ll_root_used, ll_dev); 2668 vdev->vmdq_rx_q 2669 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2670 2671 if (zero_copy) { 2672 uint32_t index = vdev->vmdq_rx_q; 2673 uint32_t count_in_ring, i; 2674 struct mbuf_table *tx_q; 2675 2676 count_in_ring = rte_ring_count(vpool_array[index].ring); 2677 2678 LOG_DEBUG(VHOST_CONFIG, 2679 "(%"PRIu64") in new_device: mbuf count in mempool " 2680 "before attach is: %d\n", 2681 dev->device_fh, 2682 rte_mempool_count(vpool_array[index].pool)); 2683 LOG_DEBUG(VHOST_CONFIG, 2684 "(%"PRIu64") in new_device: mbuf count in ring " 2685 "before attach is : %d\n", 2686 dev->device_fh, count_in_ring); 2687 2688 /* 2689 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
2690 */ 2691 for (i = 0; i < count_in_ring; i++) 2692 attach_rxmbuf_zcp(dev); 2693 2694 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2695 "mempool after attach is: %d\n", 2696 dev->device_fh, 2697 rte_mempool_count(vpool_array[index].pool)); 2698 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2699 "ring after attach is: %d\n", 2700 dev->device_fh, 2701 rte_ring_count(vpool_array[index].ring)); 2702 2703 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2704 tx_q->txq_id = vdev->vmdq_rx_q; 2705 2706 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2707 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2708 2709 LOG_DEBUG(VHOST_CONFIG, 2710 "(%"PRIu64") In new_device: Failed to start " 2711 "tx queue:%d\n", 2712 dev->device_fh, vdev->vmdq_rx_q); 2713 2714 mbuf_destroy_zcp(vpool); 2715 rte_free(vdev->regions_hpa); 2716 rte_free(vdev); 2717 return -1; 2718 } 2719 2720 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2721 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2722 2723 LOG_DEBUG(VHOST_CONFIG, 2724 "(%"PRIu64") In new_device: Failed to start " 2725 "rx queue:%d\n", 2726 dev->device_fh, vdev->vmdq_rx_q); 2727 2728 /* Stop the TX queue. */ 2729 if (rte_eth_dev_tx_queue_stop(ports[0], 2730 vdev->vmdq_rx_q) != 0) { 2731 LOG_DEBUG(VHOST_CONFIG, 2732 "(%"PRIu64") In new_device: Failed to " 2733 "stop tx queue:%d\n", 2734 dev->device_fh, vdev->vmdq_rx_q); 2735 } 2736 2737 mbuf_destroy_zcp(vpool); 2738 rte_free(vdev->regions_hpa); 2739 rte_free(vdev); 2740 return -1; 2741 } 2742 2743 } 2744 2745 /* Reset the ready flag. */ 2746 vdev->ready = DEVICE_MAC_LEARNING; 2747 vdev->remove = 0; 2748 2749 /* Find a suitable lcore to add the device. */ 2750 RTE_LCORE_FOREACH_SLAVE(lcore) { 2751 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2752 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2753 core_add = lcore; 2754 } 2755 } 2756 /* Add device to lcore ll */ 2757 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2758 if (ll_dev == NULL) { 2759 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2760 vdev->ready = DEVICE_SAFE_REMOVE; 2761 destroy_device(dev); 2762 if (vdev->regions_hpa) 2763 rte_free(vdev->regions_hpa); 2764 rte_free(vdev); 2765 return -1; 2766 } 2767 ll_dev->vdev = vdev; 2768 vdev->coreid = core_add; 2769 2770 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2771 2772 /* Initialize device stats */ 2773 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2774 2775 /* Disable notifications. */ 2776 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2777 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2778 lcore_info[vdev->coreid].lcore_ll->device_num++; 2779 dev->flags |= VIRTIO_DEV_RUNNING; 2780 2781 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2782 2783 return 0; 2784 } 2785 2786 /* 2787 * These callbacks allow devices to be added to the data core when configuration 2788 * has fully completed. 2789 */ 2790 static const struct virtio_net_device_ops virtio_net_device_ops = 2791 { 2792 .new_device = new_device, 2793 .destroy_device = destroy_device, 2794 }; 2795 2796 /* 2797 * This thread wakes up periodically to print statistics if the user has 2798 * enabled them.
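 * It sleeps for enable_stats seconds between refreshes; dropped counts are reported as total minus successful, and the RX counters are read atomically when zero copy is disabled.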
2799 */ 2800 static void 2801 print_stats(void) 2802 { 2803 struct virtio_net_data_ll *dev_ll; 2804 uint64_t tx_dropped, rx_dropped; 2805 uint64_t tx, tx_total, rx, rx_total; 2806 uint32_t device_fh; 2807 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2808 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2809 2810 while(1) { 2811 sleep(enable_stats); 2812 2813 /* Clear screen and move to top left */ 2814 printf("%s%s", clr, top_left); 2815 2816 printf("\nDevice statistics ===================================="); 2817 2818 dev_ll = ll_root_used; 2819 while (dev_ll != NULL) { 2820 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2821 tx_total = dev_statistics[device_fh].tx_total; 2822 tx = dev_statistics[device_fh].tx; 2823 tx_dropped = tx_total - tx; 2824 if (zero_copy == 0) { 2825 rx_total = rte_atomic64_read( 2826 &dev_statistics[device_fh].rx_total_atomic); 2827 rx = rte_atomic64_read( 2828 &dev_statistics[device_fh].rx_atomic); 2829 } else { 2830 rx_total = dev_statistics[device_fh].rx_total; 2831 rx = dev_statistics[device_fh].rx; 2832 } 2833 rx_dropped = rx_total - rx; 2834 2835 printf("\nStatistics for device %"PRIu32" ------------------------------" 2836 "\nTX total: %"PRIu64"" 2837 "\nTX dropped: %"PRIu64"" 2838 "\nTX successful: %"PRIu64"" 2839 "\nRX total: %"PRIu64"" 2840 "\nRX dropped: %"PRIu64"" 2841 "\nRX successful: %"PRIu64"", 2842 device_fh, 2843 tx_total, 2844 tx_dropped, 2845 tx, 2846 rx_total, 2847 rx_dropped, 2848 rx); 2849 2850 dev_ll = dev_ll->next; 2851 } 2852 printf("\n======================================================\n"); 2853 } 2854 } 2855 2856 static void 2857 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2858 char *ring_name, uint32_t nb_mbuf) 2859 { 2860 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2861 vpool_array[index].pool 2862 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2863 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2864 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2865 rte_pktmbuf_init, NULL, socket, 0); 2866 if (vpool_array[index].pool != NULL) { 2867 vpool_array[index].ring 2868 = rte_ring_create(ring_name, 2869 rte_align32pow2(nb_mbuf + 1), 2870 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2871 if (likely(vpool_array[index].ring != NULL)) { 2872 LOG_DEBUG(VHOST_CONFIG, 2873 "in setup_mempool_tbl: mbuf count in " 2874 "mempool is: %d\n", 2875 rte_mempool_count(vpool_array[index].pool)); 2876 LOG_DEBUG(VHOST_CONFIG, 2877 "in setup_mempool_tbl: mbuf count in " 2878 "ring is: %d\n", 2879 rte_ring_count(vpool_array[index].ring)); 2880 } else { 2881 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2882 ring_name); 2883 } 2884 2885 /* Need consider head room. */ 2886 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2887 } else { 2888 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2889 } 2890 } 2891 2892 2893 /* 2894 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2895 * device is also registered here to handle the IOCTLs. 
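 * Roughly: initialise EAL, parse the application arguments, create the mbuf pool(s), initialise the enabled ports and the per-core linked lists, launch the data cores, then register the vhost driver and its callbacks and start the session loop.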
2896 */ 2897 int 2898 main(int argc, char *argv[]) 2899 { 2900 struct rte_mempool *mbuf_pool = NULL; 2901 unsigned lcore_id, core_id = 0; 2902 unsigned nb_ports, valid_num_ports; 2903 int ret; 2904 uint8_t portid; 2905 uint16_t queue_id; 2906 static pthread_t tid; 2907 2908 /* init EAL */ 2909 ret = rte_eal_init(argc, argv); 2910 if (ret < 0) 2911 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2912 argc -= ret; 2913 argv += ret; 2914 2915 /* parse app arguments */ 2916 ret = us_vhost_parse_args(argc, argv); 2917 if (ret < 0) 2918 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2919 2920 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2921 if (rte_lcore_is_enabled(lcore_id)) 2922 lcore_ids[core_id ++] = lcore_id; 2923 2924 if (rte_lcore_count() > RTE_MAX_LCORE) 2925 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2926 2927 /*set the number of swithcing cores available*/ 2928 num_switching_cores = rte_lcore_count()-1; 2929 2930 /* Get the number of physical ports. */ 2931 nb_ports = rte_eth_dev_count(); 2932 if (nb_ports > RTE_MAX_ETHPORTS) 2933 nb_ports = RTE_MAX_ETHPORTS; 2934 2935 /* 2936 * Update the global var NUM_PORTS and global array PORTS 2937 * and get value of var VALID_NUM_PORTS according to system ports number 2938 */ 2939 valid_num_ports = check_ports_num(nb_ports); 2940 2941 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2942 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2943 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2944 return -1; 2945 } 2946 2947 if (zero_copy == 0) { 2948 /* Create the mbuf pool. */ 2949 mbuf_pool = rte_mempool_create( 2950 "MBUF_POOL", 2951 NUM_MBUFS_PER_PORT 2952 * valid_num_ports, 2953 MBUF_SIZE, MBUF_CACHE_SIZE, 2954 sizeof(struct rte_pktmbuf_pool_private), 2955 rte_pktmbuf_pool_init, NULL, 2956 rte_pktmbuf_init, NULL, 2957 rte_socket_id(), 0); 2958 if (mbuf_pool == NULL) 2959 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2960 2961 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2962 vpool_array[queue_id].pool = mbuf_pool; 2963 2964 if (vm2vm_mode == VM2VM_HARDWARE) { 2965 /* Enable VT loop back to let L2 switch to do it. */ 2966 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2967 LOG_DEBUG(VHOST_CONFIG, 2968 "Enable loop back for L2 switch in vmdq.\n"); 2969 } 2970 } else { 2971 uint32_t nb_mbuf; 2972 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2973 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2974 2975 nb_mbuf = num_rx_descriptor 2976 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2977 + num_switching_cores * MAX_PKT_BURST; 2978 2979 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2980 snprintf(pool_name, sizeof(pool_name), 2981 "rxmbuf_pool_%u", queue_id); 2982 snprintf(ring_name, sizeof(ring_name), 2983 "rxmbuf_ring_%u", queue_id); 2984 setup_mempool_tbl(rte_socket_id(), queue_id, 2985 pool_name, ring_name, nb_mbuf); 2986 } 2987 2988 nb_mbuf = num_tx_descriptor 2989 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2990 + num_switching_cores * MAX_PKT_BURST; 2991 2992 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2993 snprintf(pool_name, sizeof(pool_name), 2994 "txmbuf_pool_%u", queue_id); 2995 snprintf(ring_name, sizeof(ring_name), 2996 "txmbuf_ring_%u", queue_id); 2997 setup_mempool_tbl(rte_socket_id(), 2998 (queue_id + MAX_QUEUES), 2999 pool_name, ring_name, nb_mbuf); 3000 } 3001 3002 if (vm2vm_mode == VM2VM_HARDWARE) { 3003 /* Enable VT loop back to let L2 switch to do it. 
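 * (i.e. the NIC can forward VM2VM traffic itself, which is what VM2VM_HARDWARE mode relies on; this mirrors the non zero-copy branch above.)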
*/ 3004 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3005 LOG_DEBUG(VHOST_CONFIG, 3006 "Enable loop back for L2 switch in vmdq.\n"); 3007 } 3008 } 3009 /* Set log level. */ 3010 rte_set_log_level(LOG_LEVEL); 3011 3012 /* initialize all ports */ 3013 for (portid = 0; portid < nb_ports; portid++) { 3014 /* skip ports that are not enabled */ 3015 if ((enabled_port_mask & (1 << portid)) == 0) { 3016 RTE_LOG(INFO, VHOST_PORT, 3017 "Skipping disabled port %d\n", portid); 3018 continue; 3019 } 3020 if (port_init(portid) != 0) 3021 rte_exit(EXIT_FAILURE, 3022 "Cannot initialize network ports\n"); 3023 } 3024 3025 /* Initialise all linked lists. */ 3026 if (init_data_ll() == -1) 3027 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3028 3029 /* Initialize device stats */ 3030 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3031 3032 /* Enable stats if the user option is set. */ 3033 if (enable_stats) 3034 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3035 3036 /* Launch all data cores. */ 3037 if (zero_copy == 0) { 3038 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3039 rte_eal_remote_launch(switch_worker, 3040 mbuf_pool, lcore_id); 3041 } 3042 } else { 3043 uint32_t count_in_mempool, index, i; 3044 for (index = 0; index < 2*MAX_QUEUES; index++) { 3045 /* For all RX and TX queues. */ 3046 count_in_mempool 3047 = rte_mempool_count(vpool_array[index].pool); 3048 3049 /* 3050 * Transfer all un-attached mbufs from vpool.pool 3051 * to vpoo.ring. 3052 */ 3053 for (i = 0; i < count_in_mempool; i++) { 3054 struct rte_mbuf *mbuf 3055 = __rte_mbuf_raw_alloc( 3056 vpool_array[index].pool); 3057 rte_ring_sp_enqueue(vpool_array[index].ring, 3058 (void *)mbuf); 3059 } 3060 3061 LOG_DEBUG(VHOST_CONFIG, 3062 "in main: mbuf count in mempool at initial " 3063 "is: %d\n", count_in_mempool); 3064 LOG_DEBUG(VHOST_CONFIG, 3065 "in main: mbuf count in ring at initial is :" 3066 " %d\n", 3067 rte_ring_count(vpool_array[index].ring)); 3068 } 3069 3070 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3071 rte_eal_remote_launch(switch_worker_zcp, NULL, 3072 lcore_id); 3073 } 3074 3075 if (mergeable == 0) 3076 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3077 3078 /* Register CUSE device to handle IOCTLs. */ 3079 ret = rte_vhost_driver_register((char *)&dev_basename); 3080 if (ret != 0) 3081 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3082 3083 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3084 3085 /* Start CUSE session. */ 3086 rte_vhost_driver_session_start(); 3087 return 0; 3088 3089 } 3090 3091