1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 54 #include "main.h" 55 56 #define MAX_QUEUES 512 57 58 /* the maximum number of external ports supported */ 59 #define MAX_SUP_PORTS 1 60 61 /* 62 * Calculate the number of buffers needed per port 63 */ 64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 65 (num_switching_cores*MAX_PKT_BURST) + \ 66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 67 (num_switching_cores*MBUF_CACHE_SIZE)) 68 69 #define MBUF_CACHE_SIZE 128 70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM) 71 72 /* 73 * No frame data buffer allocated from host are required for zero copy 74 * implementation, guest will allocate the frame data buffer, and vhost 75 * directly use it. 76 */ 77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518 78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \ 79 + RTE_PKTMBUF_HEADROOM) 80 #define MBUF_CACHE_SIZE_ZCP 0 81 82 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 83 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 84 85 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 86 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 87 88 #define JUMBO_FRAME_MAX_SIZE 0x2600 89 90 /* State of virtio device. */ 91 #define DEVICE_MAC_LEARNING 0 92 #define DEVICE_RX 1 93 #define DEVICE_SAFE_REMOVE 2 94 95 /* Config_core_flag status definitions. 
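 *
 * The configuration core asks a data core to stop using a device by setting
 * that core's dev_removal_flag to REQUEST_DEV_REMOVAL; the data core answers
 * with ACK_DEV_REMOVAL once it has left the device linked list (see
 * switch_worker() below). A minimal sketch of the requesting side, purely
 * illustrative and not code from this file ("core" is a hypothetical lcore id):
 *
 *   lcore_info[core].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
 *   while (lcore_info[core].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
 *       rte_pause();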
*/ 96 #define REQUEST_DEV_REMOVAL 1 97 #define ACK_DEV_REMOVAL 0 98 99 /* Configurable number of RX/TX ring descriptors */ 100 #define RTE_TEST_RX_DESC_DEFAULT 1024 101 #define RTE_TEST_TX_DESC_DEFAULT 512 102 103 /* 104 * Need refine these 2 macros for legacy and DPDK based front end: 105 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 106 * And then adjust power 2. 107 */ 108 /* 109 * For legacy front end, 128 descriptors, 110 * half for virtio header, another half for mbuf. 111 */ 112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 114 115 /* Get first 4 bytes in mbuf headroom. */ 116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 117 + sizeof(struct rte_mbuf))) 118 119 /* true if x is a power of 2 */ 120 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 121 122 #define INVALID_PORT_ID 0xFF 123 124 /* Max number of devices. Limited by vmdq. */ 125 #define MAX_DEVICES 64 126 127 /* Size of buffers used for snprintfs. */ 128 #define MAX_PRINT_BUFF 6072 129 130 /* Maximum character device basename size. */ 131 #define MAX_BASENAME_SZ 10 132 133 /* Maximum long option length for option parsing. */ 134 #define MAX_LONG_OPT_SZ 64 135 136 /* Used to compare MAC addresses. */ 137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 138 139 /* Number of descriptors per cacheline. */ 140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 141 142 /* mask of enabled ports */ 143 static uint32_t enabled_port_mask = 0; 144 145 /* Promiscuous mode */ 146 static uint32_t promiscuous; 147 148 /*Number of switching cores enabled*/ 149 static uint32_t num_switching_cores = 0; 150 151 /* number of devices/queues to support*/ 152 static uint32_t num_queues = 0; 153 static uint32_t num_devices; 154 155 /* 156 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 157 * disabled on default. 158 */ 159 static uint32_t zero_copy; 160 static int mergeable; 161 162 /* Do vlan strip on host, enabled on default */ 163 static uint32_t vlan_strip = 1; 164 165 /* number of descriptors to apply*/ 166 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 167 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 168 169 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 170 #define MAX_RING_DESC 4096 171 172 struct vpool { 173 struct rte_mempool *pool; 174 struct rte_ring *ring; 175 uint32_t buf_size; 176 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 177 178 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 179 typedef enum { 180 VM2VM_DISABLED = 0, 181 VM2VM_SOFTWARE = 1, 182 VM2VM_HARDWARE = 2, 183 VM2VM_LAST 184 } vm2vm_type; 185 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 186 187 /* The type of host physical address translated from guest physical address. */ 188 typedef enum { 189 PHYS_ADDR_CONTINUOUS = 0, 190 PHYS_ADDR_CROSS_SUBREG = 1, 191 PHYS_ADDR_INVALID = 2, 192 PHYS_ADDR_LAST 193 } hpa_type; 194 195 /* Enable stats. */ 196 static uint32_t enable_stats = 0; 197 /* Enable retries on RX. */ 198 static uint32_t enable_retry = 1; 199 /* Specify timeout (in useconds) between retries on RX. */ 200 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 201 /* Specify the number of retries on RX. */ 202 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 203 204 /* Character device basename. Can be set by user. 
 */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty vmdq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is necessary for 1G NICs such as the I350:
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
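/*
 * Layout modelled by vlan_ethhdr below (standard 802.1Q tagging, shown only
 * as an illustration):
 *
 *   | dest MAC (6) | src MAC (6) | TPID 0x8100 (2) | TCI (2) | ethertype (2) |
 */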
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
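 * In port_init() below, num_devices is set from dev_info.max_vmdq_pools and
 * this check is then applied with MAX_DEVICES as the limit, since per-device
 * arrays such as dev_statistics[] are sized to MAX_DEVICES.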
356 */ 357 static inline int 358 validate_num_devices(uint32_t max_nb_devices) 359 { 360 if (num_devices > max_nb_devices) { 361 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 362 return -1; 363 } 364 return 0; 365 } 366 367 /* 368 * Initialises a given port using global settings and with the rx buffers 369 * coming from the mbuf_pool passed as parameter 370 */ 371 static inline int 372 port_init(uint8_t port) 373 { 374 struct rte_eth_dev_info dev_info; 375 struct rte_eth_conf port_conf; 376 struct rte_eth_rxconf *rxconf; 377 struct rte_eth_txconf *txconf; 378 int16_t rx_rings, tx_rings; 379 uint16_t rx_ring_size, tx_ring_size; 380 int retval; 381 uint16_t q; 382 383 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 384 rte_eth_dev_info_get (port, &dev_info); 385 386 if (dev_info.max_rx_queues > MAX_QUEUES) { 387 rte_exit(EXIT_FAILURE, 388 "please define MAX_QUEUES no less than %u in %s\n", 389 dev_info.max_rx_queues, __FILE__); 390 } 391 392 rxconf = &dev_info.default_rxconf; 393 txconf = &dev_info.default_txconf; 394 rxconf->rx_drop_en = 1; 395 396 /* Enable vlan offload */ 397 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL; 398 399 /* 400 * Zero copy defers queue RX/TX start to the time when guest 401 * finishes its startup and packet buffers from that guest are 402 * available. 403 */ 404 if (zero_copy) { 405 rxconf->rx_deferred_start = 1; 406 rxconf->rx_drop_en = 0; 407 txconf->tx_deferred_start = 1; 408 } 409 410 /*configure the number of supported virtio devices based on VMDQ limits */ 411 num_devices = dev_info.max_vmdq_pools; 412 413 if (zero_copy) { 414 rx_ring_size = num_rx_descriptor; 415 tx_ring_size = num_tx_descriptor; 416 tx_rings = dev_info.max_tx_queues; 417 } else { 418 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 419 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 420 tx_rings = (uint16_t)rte_lcore_count(); 421 } 422 423 retval = validate_num_devices(MAX_DEVICES); 424 if (retval < 0) 425 return retval; 426 427 /* Get port configuration. */ 428 retval = get_eth_conf(&port_conf, num_devices); 429 if (retval < 0) 430 return retval; 431 /* NIC queues are divided into pf queues and vmdq queues. */ 432 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 433 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 434 num_vmdq_queues = num_devices * queues_per_pool; 435 num_queues = num_pf_queues + num_vmdq_queues; 436 vmdq_queue_base = dev_info.vmdq_queue_base; 437 vmdq_pool_base = dev_info.vmdq_pool_base; 438 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 439 num_pf_queues, num_devices, queues_per_pool); 440 441 if (port >= rte_eth_dev_count()) return -1; 442 443 rx_rings = (uint16_t)dev_info.max_rx_queues; 444 /* Configure ethernet device. */ 445 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 446 if (retval != 0) 447 return retval; 448 449 /* Setup the queues. */ 450 for (q = 0; q < rx_rings; q ++) { 451 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 452 rte_eth_dev_socket_id(port), 453 rxconf, 454 vpool_array[q].pool); 455 if (retval < 0) 456 return retval; 457 } 458 for (q = 0; q < tx_rings; q ++) { 459 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 460 rte_eth_dev_socket_id(port), 461 txconf); 462 if (retval < 0) 463 return retval; 464 } 465 466 /* Start the device. 
*/ 467 retval = rte_eth_dev_start(port); 468 if (retval < 0) { 469 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 470 return retval; 471 } 472 473 if (promiscuous) 474 rte_eth_promiscuous_enable(port); 475 476 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 477 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 478 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 479 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 480 (unsigned)port, 481 vmdq_ports_eth_addr[port].addr_bytes[0], 482 vmdq_ports_eth_addr[port].addr_bytes[1], 483 vmdq_ports_eth_addr[port].addr_bytes[2], 484 vmdq_ports_eth_addr[port].addr_bytes[3], 485 vmdq_ports_eth_addr[port].addr_bytes[4], 486 vmdq_ports_eth_addr[port].addr_bytes[5]); 487 488 return 0; 489 } 490 491 /* 492 * Set character device basename. 493 */ 494 static int 495 us_vhost_parse_basename(const char *q_arg) 496 { 497 /* parse number string */ 498 499 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 500 return -1; 501 else 502 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 503 504 return 0; 505 } 506 507 /* 508 * Parse the portmask provided at run time. 509 */ 510 static int 511 parse_portmask(const char *portmask) 512 { 513 char *end = NULL; 514 unsigned long pm; 515 516 errno = 0; 517 518 /* parse hexadecimal string */ 519 pm = strtoul(portmask, &end, 16); 520 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 521 return -1; 522 523 if (pm == 0) 524 return -1; 525 526 return pm; 527 528 } 529 530 /* 531 * Parse num options at run time. 532 */ 533 static int 534 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 535 { 536 char *end = NULL; 537 unsigned long num; 538 539 errno = 0; 540 541 /* parse unsigned int string */ 542 num = strtoul(q_arg, &end, 10); 543 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 544 return -1; 545 546 if (num > max_valid_value) 547 return -1; 548 549 return num; 550 551 } 552 553 /* 554 * Display usage 555 */ 556 static void 557 us_vhost_usage(const char *prgname) 558 { 559 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 560 " --vm2vm [0|1|2]\n" 561 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 562 " --dev-basename <name>\n" 563 " --nb-devices ND\n" 564 " -p PORTMASK: Set mask for ports to be used by application\n" 565 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 566 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 567 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 568 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 569 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 570 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n" 571 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 572 " --dev-basename: The basename to be used for the character device.\n" 573 " --zero-copy [0|1]: disable(default)/enable rx/tx " 574 "zero copy\n" 575 " --rx-desc-num [0-N]: the number of descriptors on rx, " 576 "used only when zero copy is enabled.\n" 577 " --tx-desc-num [0-N]: the number of descriptors on tx, " 578 "used only when zero copy is enabled.\n", 579 prgname); 580 } 581 582 /* 583 * Parse the arguments given in the command line of the application. 
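 * An illustrative invocation (the binary name, EAL core/memory-channel
 * options and the port mask are placeholders, not recommendations):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --rx-retry 1 \
 *        --mergeable 0 --stats 2 --dev-basename vhost-net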
584 */ 585 static int 586 us_vhost_parse_args(int argc, char **argv) 587 { 588 int opt, ret; 589 int option_index; 590 unsigned i; 591 const char *prgname = argv[0]; 592 static struct option long_option[] = { 593 {"vm2vm", required_argument, NULL, 0}, 594 {"rx-retry", required_argument, NULL, 0}, 595 {"rx-retry-delay", required_argument, NULL, 0}, 596 {"rx-retry-num", required_argument, NULL, 0}, 597 {"mergeable", required_argument, NULL, 0}, 598 {"vlan-strip", required_argument, NULL, 0}, 599 {"stats", required_argument, NULL, 0}, 600 {"dev-basename", required_argument, NULL, 0}, 601 {"zero-copy", required_argument, NULL, 0}, 602 {"rx-desc-num", required_argument, NULL, 0}, 603 {"tx-desc-num", required_argument, NULL, 0}, 604 {NULL, 0, 0, 0}, 605 }; 606 607 /* Parse command line */ 608 while ((opt = getopt_long(argc, argv, "p:P", 609 long_option, &option_index)) != EOF) { 610 switch (opt) { 611 /* Portmask */ 612 case 'p': 613 enabled_port_mask = parse_portmask(optarg); 614 if (enabled_port_mask == 0) { 615 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 616 us_vhost_usage(prgname); 617 return -1; 618 } 619 break; 620 621 case 'P': 622 promiscuous = 1; 623 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 624 ETH_VMDQ_ACCEPT_BROADCAST | 625 ETH_VMDQ_ACCEPT_MULTICAST; 626 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 627 628 break; 629 630 case 0: 631 /* Enable/disable vm2vm comms. */ 632 if (!strncmp(long_option[option_index].name, "vm2vm", 633 MAX_LONG_OPT_SZ)) { 634 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 635 if (ret == -1) { 636 RTE_LOG(INFO, VHOST_CONFIG, 637 "Invalid argument for " 638 "vm2vm [0|1|2]\n"); 639 us_vhost_usage(prgname); 640 return -1; 641 } else { 642 vm2vm_mode = (vm2vm_type)ret; 643 } 644 } 645 646 /* Enable/disable retries on RX. */ 647 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 648 ret = parse_num_opt(optarg, 1); 649 if (ret == -1) { 650 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 651 us_vhost_usage(prgname); 652 return -1; 653 } else { 654 enable_retry = ret; 655 } 656 } 657 658 /* Specify the retries delay time (in useconds) on RX. */ 659 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 660 ret = parse_num_opt(optarg, INT32_MAX); 661 if (ret == -1) { 662 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 663 us_vhost_usage(prgname); 664 return -1; 665 } else { 666 burst_rx_delay_time = ret; 667 } 668 } 669 670 /* Specify the retries number on RX. */ 671 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 672 ret = parse_num_opt(optarg, INT32_MAX); 673 if (ret == -1) { 674 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 675 us_vhost_usage(prgname); 676 return -1; 677 } else { 678 burst_rx_retry_num = ret; 679 } 680 } 681 682 /* Enable/disable RX mergeable buffers. */ 683 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 684 ret = parse_num_opt(optarg, 1); 685 if (ret == -1) { 686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 687 us_vhost_usage(prgname); 688 return -1; 689 } else { 690 mergeable = !!ret; 691 if (ret) { 692 vmdq_conf_default.rxmode.jumbo_frame = 1; 693 vmdq_conf_default.rxmode.max_rx_pkt_len 694 = JUMBO_FRAME_MAX_SIZE; 695 } 696 } 697 } 698 699 /* Enable/disable RX VLAN strip on host. 
*/ 700 if (!strncmp(long_option[option_index].name, 701 "vlan-strip", MAX_LONG_OPT_SZ)) { 702 ret = parse_num_opt(optarg, 1); 703 if (ret == -1) { 704 RTE_LOG(INFO, VHOST_CONFIG, 705 "Invalid argument for VLAN strip [0|1]\n"); 706 us_vhost_usage(prgname); 707 return -1; 708 } else { 709 vlan_strip = !!ret; 710 vmdq_conf_default.rxmode.hw_vlan_strip = 711 vlan_strip; 712 } 713 } 714 715 /* Enable/disable stats. */ 716 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 717 ret = parse_num_opt(optarg, INT32_MAX); 718 if (ret == -1) { 719 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 720 us_vhost_usage(prgname); 721 return -1; 722 } else { 723 enable_stats = ret; 724 } 725 } 726 727 /* Set character device basename. */ 728 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 729 if (us_vhost_parse_basename(optarg) == -1) { 730 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 731 us_vhost_usage(prgname); 732 return -1; 733 } 734 } 735 736 /* Enable/disable rx/tx zero copy. */ 737 if (!strncmp(long_option[option_index].name, 738 "zero-copy", MAX_LONG_OPT_SZ)) { 739 ret = parse_num_opt(optarg, 1); 740 if (ret == -1) { 741 RTE_LOG(INFO, VHOST_CONFIG, 742 "Invalid argument" 743 " for zero-copy [0|1]\n"); 744 us_vhost_usage(prgname); 745 return -1; 746 } else 747 zero_copy = ret; 748 749 if (zero_copy) { 750 #ifdef RTE_MBUF_REFCNT 751 RTE_LOG(ERR, VHOST_CONFIG, "Before running " 752 "zero copy vhost APP, please " 753 "disable RTE_MBUF_REFCNT\n" 754 "in config file and then rebuild DPDK " 755 "core lib!\n" 756 "Otherwise please disable zero copy " 757 "flag in command line!\n"); 758 return -1; 759 #endif 760 } 761 } 762 763 /* Specify the descriptor number on RX. */ 764 if (!strncmp(long_option[option_index].name, 765 "rx-desc-num", MAX_LONG_OPT_SZ)) { 766 ret = parse_num_opt(optarg, MAX_RING_DESC); 767 if ((ret == -1) || (!POWEROF2(ret))) { 768 RTE_LOG(INFO, VHOST_CONFIG, 769 "Invalid argument for rx-desc-num[0-N]," 770 "power of 2 required.\n"); 771 us_vhost_usage(prgname); 772 return -1; 773 } else { 774 num_rx_descriptor = ret; 775 } 776 } 777 778 /* Specify the descriptor number on TX. */ 779 if (!strncmp(long_option[option_index].name, 780 "tx-desc-num", MAX_LONG_OPT_SZ)) { 781 ret = parse_num_opt(optarg, MAX_RING_DESC); 782 if ((ret == -1) || (!POWEROF2(ret))) { 783 RTE_LOG(INFO, VHOST_CONFIG, 784 "Invalid argument for tx-desc-num [0-N]," 785 "power of 2 required.\n"); 786 us_vhost_usage(prgname); 787 return -1; 788 } else { 789 num_tx_descriptor = ret; 790 } 791 } 792 793 break; 794 795 /* Invalid option - print options. 
*/ 796 default: 797 us_vhost_usage(prgname); 798 return -1; 799 } 800 } 801 802 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 803 if (enabled_port_mask & (1 << i)) 804 ports[num_ports++] = (uint8_t)i; 805 } 806 807 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 808 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 809 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 810 return -1; 811 } 812 813 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 814 RTE_LOG(INFO, VHOST_PORT, 815 "Vhost zero copy doesn't support software vm2vm," 816 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 817 return -1; 818 } 819 820 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 821 RTE_LOG(INFO, VHOST_PORT, 822 "Vhost zero copy doesn't support jumbo frame," 823 "please specify '--mergeable 0' to disable the " 824 "mergeable feature.\n"); 825 return -1; 826 } 827 828 return 0; 829 } 830 831 /* 832 * Update the global var NUM_PORTS and array PORTS according to system ports number 833 * and return valid ports number 834 */ 835 static unsigned check_ports_num(unsigned nb_ports) 836 { 837 unsigned valid_num_ports = num_ports; 838 unsigned portid; 839 840 if (num_ports > nb_ports) { 841 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 842 num_ports, nb_ports); 843 num_ports = nb_ports; 844 } 845 846 for (portid = 0; portid < num_ports; portid ++) { 847 if (ports[portid] >= nb_ports) { 848 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 849 ports[portid], (nb_ports - 1)); 850 ports[portid] = INVALID_PORT_ID; 851 valid_num_ports--; 852 } 853 } 854 return valid_num_ports; 855 } 856 857 /* 858 * Macro to print out packet contents. Wrapped in debug define so that the 859 * data path is not effected when debug is disabled. 860 */ 861 #ifdef DEBUG 862 #define PRINT_PACKET(device, addr, size, header) do { \ 863 char *pkt_addr = (char*)(addr); \ 864 unsigned int index; \ 865 char packet[MAX_PRINT_BUFF]; \ 866 \ 867 if ((header)) \ 868 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 869 else \ 870 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 871 for (index = 0; index < (size); index++) { \ 872 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 873 "%02hhx ", pkt_addr[index]); \ 874 } \ 875 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 876 \ 877 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 878 } while(0) 879 #else 880 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 881 #endif 882 883 /* 884 * Function to convert guest physical addresses to vhost physical addresses. 885 * This is used to convert virtio buffer addresses. 
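 * Callers hand in the guest physical address and length of a descriptor and
 * must inspect the returned addr_type, as attach_rxmbuf_zcp() and
 * virtio_dev_tx_zcp() do later in this file. A condensed sketch of that
 * pattern (handle_bad_buffer() is a hypothetical helper, shown only to make
 * the control flow explicit):
 *
 *   hpa_type addr_type;
 *   uint64_t hpa = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
 *   if (addr_type != PHYS_ADDR_CONTINUOUS)
 *       handle_bad_buffer();  // invalid address or buffer crossing a sub-region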
886 */ 887 static inline uint64_t __attribute__((always_inline)) 888 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 889 uint32_t buf_len, hpa_type *addr_type) 890 { 891 struct virtio_memory_regions_hpa *region; 892 uint32_t regionidx; 893 uint64_t vhost_pa = 0; 894 895 *addr_type = PHYS_ADDR_INVALID; 896 897 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 898 region = &vdev->regions_hpa[regionidx]; 899 if ((guest_pa >= region->guest_phys_address) && 900 (guest_pa <= region->guest_phys_address_end)) { 901 vhost_pa = region->host_phys_addr_offset + guest_pa; 902 if (likely((guest_pa + buf_len - 1) 903 <= region->guest_phys_address_end)) 904 *addr_type = PHYS_ADDR_CONTINUOUS; 905 else 906 *addr_type = PHYS_ADDR_CROSS_SUBREG; 907 break; 908 } 909 } 910 911 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 912 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 913 (void *)(uintptr_t)vhost_pa); 914 915 return vhost_pa; 916 } 917 918 /* 919 * Compares a packet destination MAC address to a device MAC address. 920 */ 921 static inline int __attribute__((always_inline)) 922 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 923 { 924 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0); 925 } 926 927 /* 928 * This function learns the MAC address of the device and registers this along with a 929 * vlan tag to a VMDQ. 930 */ 931 static int 932 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 933 { 934 struct ether_hdr *pkt_hdr; 935 struct virtio_net_data_ll *dev_ll; 936 struct virtio_net *dev = vdev->dev; 937 int i, ret; 938 939 /* Learn MAC address of guest device from packet */ 940 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 941 942 dev_ll = ll_root_used; 943 944 while (dev_ll != NULL) { 945 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 946 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 947 return -1; 948 } 949 dev_ll = dev_ll->next; 950 } 951 952 for (i = 0; i < ETHER_ADDR_LEN; i++) 953 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 954 955 /* vlan_tag currently uses the device_id. */ 956 vdev->vlan_tag = vlan_tags[dev->device_fh]; 957 958 /* Print out VMDQ registration info. */ 959 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 960 dev->device_fh, 961 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 962 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 963 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 964 vdev->vlan_tag); 965 966 /* Register the MAC address. */ 967 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 968 (uint32_t)dev->device_fh + vmdq_pool_base); 969 if (ret) 970 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 971 dev->device_fh); 972 973 /* Enable stripping of the vlan tag as we handle routing. */ 974 if (vlan_strip) 975 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 976 (uint16_t)vdev->vmdq_rx_q, 1); 977 978 /* Set device as ready for RX. */ 979 vdev->ready = DEVICE_RX; 980 981 return 0; 982 } 983 984 /* 985 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 986 * queue before disabling RX on the device. 
987 */ 988 static inline void 989 unlink_vmdq(struct vhost_dev *vdev) 990 { 991 unsigned i = 0; 992 unsigned rx_count; 993 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 994 995 if (vdev->ready == DEVICE_RX) { 996 /*clear MAC and VLAN settings*/ 997 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 998 for (i = 0; i < 6; i++) 999 vdev->mac_address.addr_bytes[i] = 0; 1000 1001 vdev->vlan_tag = 0; 1002 1003 /*Clear out the receive buffers*/ 1004 rx_count = rte_eth_rx_burst(ports[0], 1005 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1006 1007 while (rx_count) { 1008 for (i = 0; i < rx_count; i++) 1009 rte_pktmbuf_free(pkts_burst[i]); 1010 1011 rx_count = rte_eth_rx_burst(ports[0], 1012 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1013 } 1014 1015 vdev->ready = DEVICE_MAC_LEARNING; 1016 } 1017 } 1018 1019 /* 1020 * Check if the packet destination MAC address is for a local device. If so then put 1021 * the packet on that devices RX queue. If not then return. 1022 */ 1023 static inline int __attribute__((always_inline)) 1024 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1025 { 1026 struct virtio_net_data_ll *dev_ll; 1027 struct ether_hdr *pkt_hdr; 1028 uint64_t ret = 0; 1029 struct virtio_net *dev = vdev->dev; 1030 struct virtio_net *tdev; /* destination virito device */ 1031 1032 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1033 1034 /*get the used devices list*/ 1035 dev_ll = ll_root_used; 1036 1037 while (dev_ll != NULL) { 1038 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1039 &dev_ll->vdev->mac_address)) { 1040 1041 /* Drop the packet if the TX packet is destined for the TX device. */ 1042 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1043 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1044 dev->device_fh); 1045 return 0; 1046 } 1047 tdev = dev_ll->vdev->dev; 1048 1049 1050 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1051 1052 if (unlikely(dev_ll->vdev->remove)) { 1053 /*drop the packet if the device is marked for removal*/ 1054 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1055 } else { 1056 /*send the packet to the local virtio device*/ 1057 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1058 if (enable_stats) { 1059 rte_atomic64_add( 1060 &dev_statistics[tdev->device_fh].rx_total_atomic, 1061 1); 1062 rte_atomic64_add( 1063 &dev_statistics[tdev->device_fh].rx_atomic, 1064 ret); 1065 dev_statistics[tdev->device_fh].tx_total++; 1066 dev_statistics[tdev->device_fh].tx += ret; 1067 } 1068 } 1069 1070 return 0; 1071 } 1072 dev_ll = dev_ll->next; 1073 } 1074 1075 return -1; 1076 } 1077 1078 /* 1079 * Check if the destination MAC of a packet is one local VM, 1080 * and get its vlan tag, and offset if it is. 1081 */ 1082 static inline int __attribute__((always_inline)) 1083 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1084 uint32_t *offset, uint16_t *vlan_tag) 1085 { 1086 struct virtio_net_data_ll *dev_ll = ll_root_used; 1087 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1088 1089 while (dev_ll != NULL) { 1090 if ((dev_ll->vdev->ready == DEVICE_RX) 1091 && ether_addr_cmp(&(pkt_hdr->d_addr), 1092 &dev_ll->vdev->mac_address)) { 1093 /* 1094 * Drop the packet if the TX packet is 1095 * destined for the TX device. 
1096 */ 1097 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1098 LOG_DEBUG(VHOST_DATA, 1099 "(%"PRIu64") TX: Source and destination" 1100 " MAC addresses are the same. Dropping " 1101 "packet.\n", 1102 dev_ll->vdev->dev->device_fh); 1103 return -1; 1104 } 1105 1106 /* 1107 * HW vlan strip will reduce the packet length 1108 * by minus length of vlan tag, so need restore 1109 * the packet length by plus it. 1110 */ 1111 *offset = VLAN_HLEN; 1112 *vlan_tag = 1113 (uint16_t) 1114 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1115 1116 LOG_DEBUG(VHOST_DATA, 1117 "(%"PRIu64") TX: pkt to local VM device id:" 1118 "(%"PRIu64") vlan tag: %d.\n", 1119 dev->device_fh, dev_ll->vdev->dev->device_fh, 1120 vlan_tag); 1121 1122 break; 1123 } 1124 dev_ll = dev_ll->next; 1125 } 1126 return 0; 1127 } 1128 1129 /* 1130 * This function routes the TX packet to the correct interface. This may be a local device 1131 * or the physical port. 1132 */ 1133 static inline void __attribute__((always_inline)) 1134 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1135 { 1136 struct mbuf_table *tx_q; 1137 struct rte_mbuf **m_table; 1138 unsigned len, ret, offset = 0; 1139 const uint16_t lcore_id = rte_lcore_id(); 1140 struct virtio_net *dev = vdev->dev; 1141 struct ether_hdr *nh; 1142 1143 /*check if destination is local VM*/ 1144 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1145 rte_pktmbuf_free(m); 1146 return; 1147 } 1148 1149 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1150 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1151 rte_pktmbuf_free(m); 1152 return; 1153 } 1154 } 1155 1156 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1157 1158 /*Add packet to the port tx queue*/ 1159 tx_q = &lcore_tx_queue[lcore_id]; 1160 len = tx_q->len; 1161 1162 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1163 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1164 /* Guest has inserted the vlan tag. */ 1165 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1166 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1167 if ((vm2vm_mode == VM2VM_HARDWARE) && 1168 (vh->vlan_tci != vlan_tag_be)) 1169 vh->vlan_tci = vlan_tag_be; 1170 } else { 1171 m->ol_flags = PKT_TX_VLAN_PKT; 1172 1173 /* 1174 * Find the right seg to adjust the data len when offset is 1175 * bigger than tail room size. 1176 */ 1177 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1178 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1179 m->data_len += offset; 1180 else { 1181 struct rte_mbuf *seg = m; 1182 1183 while ((seg->next != NULL) && 1184 (offset > rte_pktmbuf_tailroom(seg))) 1185 seg = seg->next; 1186 1187 seg->data_len += offset; 1188 } 1189 m->pkt_len += offset; 1190 } 1191 1192 m->vlan_tci = vlan_tag; 1193 } 1194 1195 tx_q->m_table[len] = m; 1196 len++; 1197 if (enable_stats) { 1198 dev_statistics[dev->device_fh].tx_total++; 1199 dev_statistics[dev->device_fh].tx++; 1200 } 1201 1202 if (unlikely(len == MAX_PKT_BURST)) { 1203 m_table = (struct rte_mbuf **)tx_q->m_table; 1204 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1205 /* Free any buffers not handled by TX and update the port stats. */ 1206 if (unlikely(ret < len)) { 1207 do { 1208 rte_pktmbuf_free(m_table[ret]); 1209 } while (++ret < len); 1210 } 1211 1212 len = 0; 1213 } 1214 1215 tx_q->len = len; 1216 return; 1217 } 1218 /* 1219 * This function is called by each data core. It handles all RX/TX registered with the 1220 * core. 
For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1221 * with all devices in the main linked list. 1222 */ 1223 static int 1224 switch_worker(__attribute__((unused)) void *arg) 1225 { 1226 struct rte_mempool *mbuf_pool = arg; 1227 struct virtio_net *dev = NULL; 1228 struct vhost_dev *vdev = NULL; 1229 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1230 struct virtio_net_data_ll *dev_ll; 1231 struct mbuf_table *tx_q; 1232 volatile struct lcore_ll_info *lcore_ll; 1233 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1234 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1235 unsigned ret, i; 1236 const uint16_t lcore_id = rte_lcore_id(); 1237 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1238 uint16_t rx_count = 0; 1239 uint16_t tx_count; 1240 uint32_t retry = 0; 1241 1242 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1243 lcore_ll = lcore_info[lcore_id].lcore_ll; 1244 prev_tsc = 0; 1245 1246 tx_q = &lcore_tx_queue[lcore_id]; 1247 for (i = 0; i < num_cores; i ++) { 1248 if (lcore_ids[i] == lcore_id) { 1249 tx_q->txq_id = i; 1250 break; 1251 } 1252 } 1253 1254 while(1) { 1255 cur_tsc = rte_rdtsc(); 1256 /* 1257 * TX burst queue drain 1258 */ 1259 diff_tsc = cur_tsc - prev_tsc; 1260 if (unlikely(diff_tsc > drain_tsc)) { 1261 1262 if (tx_q->len) { 1263 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1264 1265 /*Tx any packets in the queue*/ 1266 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1267 (struct rte_mbuf **)tx_q->m_table, 1268 (uint16_t)tx_q->len); 1269 if (unlikely(ret < tx_q->len)) { 1270 do { 1271 rte_pktmbuf_free(tx_q->m_table[ret]); 1272 } while (++ret < tx_q->len); 1273 } 1274 1275 tx_q->len = 0; 1276 } 1277 1278 prev_tsc = cur_tsc; 1279 1280 } 1281 1282 rte_prefetch0(lcore_ll->ll_root_used); 1283 /* 1284 * Inform the configuration core that we have exited the linked list and that no devices are 1285 * in use if requested. 
1286 */ 1287 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1288 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1289 1290 /* 1291 * Process devices 1292 */ 1293 dev_ll = lcore_ll->ll_root_used; 1294 1295 while (dev_ll != NULL) { 1296 /*get virtio device ID*/ 1297 vdev = dev_ll->vdev; 1298 dev = vdev->dev; 1299 1300 if (unlikely(vdev->remove)) { 1301 dev_ll = dev_ll->next; 1302 unlink_vmdq(vdev); 1303 vdev->ready = DEVICE_SAFE_REMOVE; 1304 continue; 1305 } 1306 if (likely(vdev->ready == DEVICE_RX)) { 1307 /*Handle guest RX*/ 1308 rx_count = rte_eth_rx_burst(ports[0], 1309 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1310 1311 if (rx_count) { 1312 /* 1313 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1314 * Here MAX_PKT_BURST must be less than virtio queue size 1315 */ 1316 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1317 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1318 rte_delay_us(burst_rx_delay_time); 1319 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1320 break; 1321 } 1322 } 1323 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1324 if (enable_stats) { 1325 rte_atomic64_add( 1326 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1327 rx_count); 1328 rte_atomic64_add( 1329 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1330 } 1331 while (likely(rx_count)) { 1332 rx_count--; 1333 rte_pktmbuf_free(pkts_burst[rx_count]); 1334 } 1335 1336 } 1337 } 1338 1339 if (likely(!vdev->remove)) { 1340 /* Handle guest TX*/ 1341 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1342 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1343 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1344 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1345 while (tx_count) 1346 rte_pktmbuf_free(pkts_burst[--tx_count]); 1347 } 1348 } 1349 while (tx_count) 1350 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1351 } 1352 1353 /*move to the next device in the list*/ 1354 dev_ll = dev_ll->next; 1355 } 1356 } 1357 1358 return 0; 1359 } 1360 1361 /* 1362 * This function gets available ring number for zero copy rx. 1363 * Only one thread will call this funciton for a paticular virtio device, 1364 * so, it is designed as non-thread-safe function. 1365 */ 1366 static inline uint32_t __attribute__((always_inline)) 1367 get_available_ring_num_zcp(struct virtio_net *dev) 1368 { 1369 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1370 uint16_t avail_idx; 1371 1372 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1373 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1374 } 1375 1376 /* 1377 * This function gets available ring index for zero copy rx, 1378 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1379 * Only one thread will call this funciton for a paticular virtio device, 1380 * so, it is designed as non-thread-safe function. 
1381 */ 1382 static inline uint32_t __attribute__((always_inline)) 1383 get_available_ring_index_zcp(struct virtio_net *dev, 1384 uint16_t *res_base_idx, uint32_t count) 1385 { 1386 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1387 uint16_t avail_idx; 1388 uint32_t retry = 0; 1389 uint16_t free_entries; 1390 1391 *res_base_idx = vq->last_used_idx_res; 1392 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1393 free_entries = (avail_idx - *res_base_idx); 1394 1395 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1396 "avail idx: %d, " 1397 "res base idx:%d, free entries:%d\n", 1398 dev->device_fh, avail_idx, *res_base_idx, 1399 free_entries); 1400 1401 /* 1402 * If retry is enabled and the queue is full then we wait 1403 * and retry to avoid packet loss. 1404 */ 1405 if (enable_retry && unlikely(count > free_entries)) { 1406 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1407 rte_delay_us(burst_rx_delay_time); 1408 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1409 free_entries = (avail_idx - *res_base_idx); 1410 if (count <= free_entries) 1411 break; 1412 } 1413 } 1414 1415 /*check that we have enough buffers*/ 1416 if (unlikely(count > free_entries)) 1417 count = free_entries; 1418 1419 if (unlikely(count == 0)) { 1420 LOG_DEBUG(VHOST_DATA, 1421 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1422 "avail idx: %d, res base idx:%d, free entries:%d\n", 1423 dev->device_fh, avail_idx, 1424 *res_base_idx, free_entries); 1425 return 0; 1426 } 1427 1428 vq->last_used_idx_res = *res_base_idx + count; 1429 1430 return count; 1431 } 1432 1433 /* 1434 * This function put descriptor back to used list. 1435 */ 1436 static inline void __attribute__((always_inline)) 1437 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1438 { 1439 uint16_t res_cur_idx = vq->last_used_idx; 1440 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1441 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1442 rte_compiler_barrier(); 1443 *(volatile uint16_t *)&vq->used->idx += 1; 1444 vq->last_used_idx += 1; 1445 1446 /* Kick the guest if necessary. */ 1447 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1448 eventfd_write((int)vq->kickfd, 1); 1449 } 1450 1451 /* 1452 * This function get available descriptor from vitio vring and un-attached mbuf 1453 * from vpool->ring, and then attach them together. It needs adjust the offset 1454 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1455 * frame data may be put to wrong location in mbuf. 
1456 */ 1457 static inline void __attribute__((always_inline)) 1458 attach_rxmbuf_zcp(struct virtio_net *dev) 1459 { 1460 uint16_t res_base_idx, desc_idx; 1461 uint64_t buff_addr, phys_addr; 1462 struct vhost_virtqueue *vq; 1463 struct vring_desc *desc; 1464 struct rte_mbuf *mbuf = NULL; 1465 struct vpool *vpool; 1466 hpa_type addr_type; 1467 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1468 1469 vpool = &vpool_array[vdev->vmdq_rx_q]; 1470 vq = dev->virtqueue[VIRTIO_RXQ]; 1471 1472 do { 1473 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1474 1) != 1)) 1475 return; 1476 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1477 1478 desc = &vq->desc[desc_idx]; 1479 if (desc->flags & VRING_DESC_F_NEXT) { 1480 desc = &vq->desc[desc->next]; 1481 buff_addr = gpa_to_vva(dev, desc->addr); 1482 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1483 &addr_type); 1484 } else { 1485 buff_addr = gpa_to_vva(dev, 1486 desc->addr + vq->vhost_hlen); 1487 phys_addr = gpa_to_hpa(vdev, 1488 desc->addr + vq->vhost_hlen, 1489 desc->len, &addr_type); 1490 } 1491 1492 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1493 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1494 " address found when attaching RX frame buffer" 1495 " address!\n", dev->device_fh); 1496 put_desc_to_used_list_zcp(vq, desc_idx); 1497 continue; 1498 } 1499 1500 /* 1501 * Check if the frame buffer address from guest crosses 1502 * sub-region or not. 1503 */ 1504 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1505 RTE_LOG(ERR, VHOST_DATA, 1506 "(%"PRIu64") Frame buffer address cross " 1507 "sub-regioin found when attaching RX frame " 1508 "buffer address!\n", 1509 dev->device_fh); 1510 put_desc_to_used_list_zcp(vq, desc_idx); 1511 continue; 1512 } 1513 } while (unlikely(phys_addr == 0)); 1514 1515 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1516 if (unlikely(mbuf == NULL)) { 1517 LOG_DEBUG(VHOST_DATA, 1518 "(%"PRIu64") in attach_rxmbuf_zcp: " 1519 "ring_sc_dequeue fail.\n", 1520 dev->device_fh); 1521 put_desc_to_used_list_zcp(vq, desc_idx); 1522 return; 1523 } 1524 1525 if (unlikely(vpool->buf_size > desc->len)) { 1526 LOG_DEBUG(VHOST_DATA, 1527 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1528 "length(%d) of descriptor idx: %d less than room " 1529 "size required: %d\n", 1530 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1531 put_desc_to_used_list_zcp(vq, desc_idx); 1532 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1533 return; 1534 } 1535 1536 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1537 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1538 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1539 mbuf->data_len = desc->len; 1540 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1541 1542 LOG_DEBUG(VHOST_DATA, 1543 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1544 "descriptor idx:%d\n", 1545 dev->device_fh, res_base_idx, desc_idx); 1546 1547 __rte_mbuf_raw_free(mbuf); 1548 1549 return; 1550 } 1551 1552 /* 1553 * Detach an attched packet mbuf - 1554 * - restore original mbuf address and length values. 1555 * - reset pktmbuf data and data_len to their default values. 1556 * All other fields of the given packet mbuf will be left intact. 1557 * 1558 * @param m 1559 * The attached packet mbuf. 
1560 */ 1561 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1562 { 1563 const struct rte_mempool *mp = m->pool; 1564 void *buf = RTE_MBUF_TO_BADDR(m); 1565 uint32_t buf_ofs; 1566 uint32_t buf_len = mp->elt_size - sizeof(*m); 1567 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1568 1569 m->buf_addr = buf; 1570 m->buf_len = (uint16_t)buf_len; 1571 1572 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1573 RTE_PKTMBUF_HEADROOM : m->buf_len; 1574 m->data_off = buf_ofs; 1575 1576 m->data_len = 0; 1577 } 1578 1579 /* 1580 * This function is called after packets have been transimited. It fetchs mbuf 1581 * from vpool->pool, detached it and put into vpool->ring. It also update the 1582 * used index and kick the guest if necessary. 1583 */ 1584 static inline uint32_t __attribute__((always_inline)) 1585 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1586 { 1587 struct rte_mbuf *mbuf; 1588 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1589 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1590 uint32_t index = 0; 1591 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1592 1593 LOG_DEBUG(VHOST_DATA, 1594 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1595 "clean is: %d\n", 1596 dev->device_fh, mbuf_count); 1597 LOG_DEBUG(VHOST_DATA, 1598 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1599 "clean is : %d\n", 1600 dev->device_fh, rte_ring_count(vpool->ring)); 1601 1602 for (index = 0; index < mbuf_count; index++) { 1603 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1604 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1605 pktmbuf_detach_zcp(mbuf); 1606 rte_ring_sp_enqueue(vpool->ring, mbuf); 1607 1608 /* Update used index buffer information. */ 1609 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1610 vq->used->ring[used_idx].len = 0; 1611 1612 used_idx = (used_idx + 1) & (vq->size - 1); 1613 } 1614 1615 LOG_DEBUG(VHOST_DATA, 1616 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1617 "clean is: %d\n", 1618 dev->device_fh, rte_mempool_count(vpool->pool)); 1619 LOG_DEBUG(VHOST_DATA, 1620 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1621 "clean is : %d\n", 1622 dev->device_fh, rte_ring_count(vpool->ring)); 1623 LOG_DEBUG(VHOST_DATA, 1624 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1625 "vq->last_used_idx:%d\n", 1626 dev->device_fh, vq->last_used_idx); 1627 1628 vq->last_used_idx += mbuf_count; 1629 1630 LOG_DEBUG(VHOST_DATA, 1631 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1632 "vq->last_used_idx:%d\n", 1633 dev->device_fh, vq->last_used_idx); 1634 1635 rte_compiler_barrier(); 1636 1637 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1638 1639 /* Kick guest if required. */ 1640 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1641 eventfd_write((int)vq->kickfd, 1); 1642 1643 return 0; 1644 } 1645 1646 /* 1647 * This function is called when a virtio device is destroy. 1648 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1649 */ 1650 static void mbuf_destroy_zcp(struct vpool *vpool) 1651 { 1652 struct rte_mbuf *mbuf = NULL; 1653 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1654 1655 LOG_DEBUG(VHOST_CONFIG, 1656 "in mbuf_destroy_zcp: mbuf count in mempool before " 1657 "mbuf_destroy_zcp is: %d\n", 1658 mbuf_count); 1659 LOG_DEBUG(VHOST_CONFIG, 1660 "in mbuf_destroy_zcp: mbuf count in ring before " 1661 "mbuf_destroy_zcp is : %d\n", 1662 rte_ring_count(vpool->ring)); 1663 1664 for (index = 0; index < mbuf_count; index++) { 1665 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1666 if (likely(mbuf != NULL)) { 1667 if (likely(RTE_MBUF_INDIRECT(mbuf))) 1668 pktmbuf_detach_zcp(mbuf); 1669 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1670 } 1671 } 1672 1673 LOG_DEBUG(VHOST_CONFIG, 1674 "in mbuf_destroy_zcp: mbuf count in mempool after " 1675 "mbuf_destroy_zcp is: %d\n", 1676 rte_mempool_count(vpool->pool)); 1677 LOG_DEBUG(VHOST_CONFIG, 1678 "in mbuf_destroy_zcp: mbuf count in ring after " 1679 "mbuf_destroy_zcp is : %d\n", 1680 rte_ring_count(vpool->ring)); 1681 } 1682 1683 /* 1684 * This function update the use flag and counter. 1685 */ 1686 static inline uint32_t __attribute__((always_inline)) 1687 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1688 uint32_t count) 1689 { 1690 struct vhost_virtqueue *vq; 1691 struct vring_desc *desc; 1692 struct rte_mbuf *buff; 1693 /* The virtio_hdr is initialised to 0. */ 1694 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1695 = {{0, 0, 0, 0, 0, 0}, 0}; 1696 uint64_t buff_hdr_addr = 0; 1697 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1698 uint32_t head_idx, packet_success = 0; 1699 uint16_t res_cur_idx; 1700 1701 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1702 1703 if (count == 0) 1704 return 0; 1705 1706 vq = dev->virtqueue[VIRTIO_RXQ]; 1707 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1708 1709 res_cur_idx = vq->last_used_idx; 1710 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1711 dev->device_fh, res_cur_idx, res_cur_idx + count); 1712 1713 /* Retrieve all of the head indexes first to avoid caching issues. */ 1714 for (head_idx = 0; head_idx < count; head_idx++) 1715 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1716 1717 /*Prefetch descriptor index. */ 1718 rte_prefetch0(&vq->desc[head[packet_success]]); 1719 1720 while (packet_success != count) { 1721 /* Get descriptor from available ring */ 1722 desc = &vq->desc[head[packet_success]]; 1723 1724 buff = pkts[packet_success]; 1725 LOG_DEBUG(VHOST_DATA, 1726 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1727 "pkt[%d] descriptor idx: %d\n", 1728 dev->device_fh, packet_success, 1729 MBUF_HEADROOM_UINT32(buff)); 1730 1731 PRINT_PACKET(dev, 1732 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1733 + RTE_PKTMBUF_HEADROOM), 1734 rte_pktmbuf_data_len(buff), 0); 1735 1736 /* Buffer address translation for virtio header. */ 1737 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1738 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1739 1740 /* 1741 * If the descriptors are chained the header and data are 1742 * placed in separate buffers. 
1743 */ 1744 if (desc->flags & VRING_DESC_F_NEXT) { 1745 desc->len = vq->vhost_hlen; 1746 desc = &vq->desc[desc->next]; 1747 desc->len = rte_pktmbuf_data_len(buff); 1748 } else { 1749 desc->len = packet_len; 1750 } 1751 1752 /* Update used ring with desc information */ 1753 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1754 = head[packet_success]; 1755 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1756 = packet_len; 1757 res_cur_idx++; 1758 packet_success++; 1759 1760 /* A header is required per buffer. */ 1761 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1762 (const void *)&virtio_hdr, vq->vhost_hlen); 1763 1764 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1765 1766 if (likely(packet_success < count)) { 1767 /* Prefetch descriptor index. */ 1768 rte_prefetch0(&vq->desc[head[packet_success]]); 1769 } 1770 } 1771 1772 rte_compiler_barrier(); 1773 1774 LOG_DEBUG(VHOST_DATA, 1775 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1776 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1777 dev->device_fh, vq->last_used_idx, vq->used->idx); 1778 1779 *(volatile uint16_t *)&vq->used->idx += count; 1780 vq->last_used_idx += count; 1781 1782 LOG_DEBUG(VHOST_DATA, 1783 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1784 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1785 dev->device_fh, vq->last_used_idx, vq->used->idx); 1786 1787 /* Kick the guest if necessary. */ 1788 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1789 eventfd_write((int)vq->kickfd, 1); 1790 1791 return count; 1792 } 1793 1794 /* 1795 * This function routes the TX packet to the correct interface. 1796 * This may be a local device or the physical port. 1797 */ 1798 static inline void __attribute__((always_inline)) 1799 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1800 uint32_t desc_idx, uint8_t need_copy) 1801 { 1802 struct mbuf_table *tx_q; 1803 struct rte_mbuf **m_table; 1804 struct rte_mbuf *mbuf = NULL; 1805 unsigned len, ret, offset = 0; 1806 struct vpool *vpool; 1807 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1808 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1809 1810 /*Add packet to the port tx queue*/ 1811 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1812 len = tx_q->len; 1813 1814 /* Allocate an mbuf and populate the structure. */ 1815 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1816 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf); 1817 if (unlikely(mbuf == NULL)) { 1818 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1819 RTE_LOG(ERR, VHOST_DATA, 1820 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1821 dev->device_fh); 1822 put_desc_to_used_list_zcp(vq, desc_idx); 1823 return; 1824 } 1825 1826 if (vm2vm_mode == VM2VM_HARDWARE) { 1827 /* Avoid using a vlan tag from any vm for external pkt, such as 1828 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1829 * selection, MAC address determines it as an external pkt 1830 * which should go to network, while vlan tag determine it as 1831 * a vm2vm pkt should forward to another vm. Hardware confuse 1832 * such a ambiguous situation, so pkt will lost. 
1833 */ 1834 vlan_tag = external_pkt_default_vlan_tag; 1835 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1836 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1837 __rte_mbuf_raw_free(mbuf); 1838 return; 1839 } 1840 } 1841 1842 mbuf->nb_segs = m->nb_segs; 1843 mbuf->next = m->next; 1844 mbuf->data_len = m->data_len + offset; 1845 mbuf->pkt_len = mbuf->data_len; 1846 if (unlikely(need_copy)) { 1847 /* Copy the packet contents to the mbuf. */ 1848 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1849 rte_pktmbuf_mtod(m, void *), 1850 m->data_len); 1851 } else { 1852 mbuf->data_off = m->data_off; 1853 mbuf->buf_physaddr = m->buf_physaddr; 1854 mbuf->buf_addr = m->buf_addr; 1855 } 1856 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1857 mbuf->vlan_tci = vlan_tag; 1858 mbuf->l2_len = sizeof(struct ether_hdr); 1859 mbuf->l3_len = sizeof(struct ipv4_hdr); 1860 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1861 1862 tx_q->m_table[len] = mbuf; 1863 len++; 1864 1865 LOG_DEBUG(VHOST_DATA, 1866 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1867 dev->device_fh, 1868 mbuf->nb_segs, 1869 (mbuf->next == NULL) ? "null" : "non-null"); 1870 1871 if (enable_stats) { 1872 dev_statistics[dev->device_fh].tx_total++; 1873 dev_statistics[dev->device_fh].tx++; 1874 } 1875 1876 if (unlikely(len == MAX_PKT_BURST)) { 1877 m_table = (struct rte_mbuf **)tx_q->m_table; 1878 ret = rte_eth_tx_burst(ports[0], 1879 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1880 1881 /* 1882 * Free any buffers not handled by TX and update 1883 * the port stats. 1884 */ 1885 if (unlikely(ret < len)) { 1886 do { 1887 rte_pktmbuf_free(m_table[ret]); 1888 } while (++ret < len); 1889 } 1890 1891 len = 0; 1892 txmbuf_clean_zcp(dev, vpool); 1893 } 1894 1895 tx_q->len = len; 1896 1897 return; 1898 } 1899 1900 /* 1901 * This function TX all available packets in virtio TX queue for one 1902 * virtio-net device. If it is first packet, it learns MAC address and 1903 * setup VMDQ. 1904 */ 1905 static inline void __attribute__((always_inline)) 1906 virtio_dev_tx_zcp(struct virtio_net *dev) 1907 { 1908 struct rte_mbuf m; 1909 struct vhost_virtqueue *vq; 1910 struct vring_desc *desc; 1911 uint64_t buff_addr = 0, phys_addr; 1912 uint32_t head[MAX_PKT_BURST]; 1913 uint32_t i; 1914 uint16_t free_entries, packet_success = 0; 1915 uint16_t avail_idx; 1916 uint8_t need_copy = 0; 1917 hpa_type addr_type; 1918 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1919 1920 vq = dev->virtqueue[VIRTIO_TXQ]; 1921 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1922 1923 /* If there are no available buffers then return. */ 1924 if (vq->last_used_idx_res == avail_idx) 1925 return; 1926 1927 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1928 1929 /* Prefetch available ring to retrieve head indexes. */ 1930 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1931 1932 /* Get the number of free entries in the ring */ 1933 free_entries = (avail_idx - vq->last_used_idx_res); 1934 1935 /* Limit to MAX_PKT_BURST. */ 1936 free_entries 1937 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1938 1939 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1940 dev->device_fh, free_entries); 1941 1942 /* Retrieve all of the head indexes first to avoid caching issues. */ 1943 for (i = 0; i < free_entries; i++) 1944 head[i] 1945 = vq->avail->ring[(vq->last_used_idx_res + i) 1946 & (vq->size - 1)]; 1947 1948 vq->last_used_idx_res += free_entries; 1949 1950 /* Prefetch descriptor index. 
*/ 1951 rte_prefetch0(&vq->desc[head[packet_success]]); 1952 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 1953 1954 while (packet_success < free_entries) { 1955 desc = &vq->desc[head[packet_success]]; 1956 1957 /* Discard first buffer as it is the virtio header */ 1958 desc = &vq->desc[desc->next]; 1959 1960 /* Buffer address translation. */ 1961 buff_addr = gpa_to_vva(dev, desc->addr); 1962 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 1963 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 1964 &addr_type); 1965 1966 if (likely(packet_success < (free_entries - 1))) 1967 /* Prefetch descriptor index. */ 1968 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 1969 1970 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1971 RTE_LOG(ERR, VHOST_DATA, 1972 "(%"PRIu64") Invalid frame buffer address found" 1973 "when TX packets!\n", 1974 dev->device_fh); 1975 packet_success++; 1976 continue; 1977 } 1978 1979 /* Prefetch buffer address. */ 1980 rte_prefetch0((void *)(uintptr_t)buff_addr); 1981 1982 /* 1983 * Setup dummy mbuf. This is copied to a real mbuf if 1984 * transmitted out the physical port. 1985 */ 1986 m.data_len = desc->len; 1987 m.nb_segs = 1; 1988 m.next = NULL; 1989 m.data_off = 0; 1990 m.buf_addr = (void *)(uintptr_t)buff_addr; 1991 m.buf_physaddr = phys_addr; 1992 1993 /* 1994 * Check if the frame buffer address from guest crosses 1995 * sub-region or not. 1996 */ 1997 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1998 RTE_LOG(ERR, VHOST_DATA, 1999 "(%"PRIu64") Frame buffer address cross " 2000 "sub-regioin found when attaching TX frame " 2001 "buffer address!\n", 2002 dev->device_fh); 2003 need_copy = 1; 2004 } else 2005 need_copy = 0; 2006 2007 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2008 2009 /* 2010 * If this is the first received packet we need to learn 2011 * the MAC and setup VMDQ 2012 */ 2013 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2014 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2015 /* 2016 * Discard frame if device is scheduled for 2017 * removal or a duplicate MAC address is found. 2018 */ 2019 packet_success += free_entries; 2020 vq->last_used_idx += packet_success; 2021 break; 2022 } 2023 } 2024 2025 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2026 packet_success++; 2027 } 2028 } 2029 2030 /* 2031 * This function is called by each data core. It handles all RX/TX registered 2032 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2033 * addresses are compared with all devices in the main linked list. 
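 * In addition, the physical TX queue for each device is drained roughly
 * every BURST_TX_DRAIN_US microseconds so that partially filled bursts do
 * not sit in the queue indefinitely, and the transmitted mbufs are then
 * recycled back to their vpool via txmbuf_clean_zcp().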
2034 */ 2035 static int 2036 switch_worker_zcp(__attribute__((unused)) void *arg) 2037 { 2038 struct virtio_net *dev = NULL; 2039 struct vhost_dev *vdev = NULL; 2040 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2041 struct virtio_net_data_ll *dev_ll; 2042 struct mbuf_table *tx_q; 2043 volatile struct lcore_ll_info *lcore_ll; 2044 const uint64_t drain_tsc 2045 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2046 * BURST_TX_DRAIN_US; 2047 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2048 unsigned ret; 2049 const uint16_t lcore_id = rte_lcore_id(); 2050 uint16_t count_in_ring, rx_count = 0; 2051 2052 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2053 2054 lcore_ll = lcore_info[lcore_id].lcore_ll; 2055 prev_tsc = 0; 2056 2057 while (1) { 2058 cur_tsc = rte_rdtsc(); 2059 2060 /* TX burst queue drain */ 2061 diff_tsc = cur_tsc - prev_tsc; 2062 if (unlikely(diff_tsc > drain_tsc)) { 2063 /* 2064 * Get mbuf from vpool.pool and detach mbuf and 2065 * put back into vpool.ring. 2066 */ 2067 dev_ll = lcore_ll->ll_root_used; 2068 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2069 /* Get virtio device ID */ 2070 vdev = dev_ll->vdev; 2071 dev = vdev->dev; 2072 2073 if (likely(!vdev->remove)) { 2074 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2075 if (tx_q->len) { 2076 LOG_DEBUG(VHOST_DATA, 2077 "TX queue drained after timeout" 2078 " with burst size %u\n", 2079 tx_q->len); 2080 2081 /* 2082 * Tx any packets in the queue 2083 */ 2084 ret = rte_eth_tx_burst( 2085 ports[0], 2086 (uint16_t)tx_q->txq_id, 2087 (struct rte_mbuf **) 2088 tx_q->m_table, 2089 (uint16_t)tx_q->len); 2090 if (unlikely(ret < tx_q->len)) { 2091 do { 2092 rte_pktmbuf_free( 2093 tx_q->m_table[ret]); 2094 } while (++ret < tx_q->len); 2095 } 2096 tx_q->len = 0; 2097 2098 txmbuf_clean_zcp(dev, 2099 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2100 } 2101 } 2102 dev_ll = dev_ll->next; 2103 } 2104 prev_tsc = cur_tsc; 2105 } 2106 2107 rte_prefetch0(lcore_ll->ll_root_used); 2108 2109 /* 2110 * Inform the configuration core that we have exited the linked 2111 * list and that no devices are in use if requested. 2112 */ 2113 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2114 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2115 2116 /* Process devices */ 2117 dev_ll = lcore_ll->ll_root_used; 2118 2119 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2120 vdev = dev_ll->vdev; 2121 dev = vdev->dev; 2122 if (unlikely(vdev->remove)) { 2123 dev_ll = dev_ll->next; 2124 unlink_vmdq(vdev); 2125 vdev->ready = DEVICE_SAFE_REMOVE; 2126 continue; 2127 } 2128 2129 if (likely(vdev->ready == DEVICE_RX)) { 2130 uint32_t index = vdev->vmdq_rx_q; 2131 uint16_t i; 2132 count_in_ring 2133 = rte_ring_count(vpool_array[index].ring); 2134 uint16_t free_entries 2135 = (uint16_t)get_available_ring_num_zcp(dev); 2136 2137 /* 2138 * Attach all mbufs in vpool.ring and put back 2139 * into vpool.pool. 
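				 * Each attach_rxmbuf_zcp() call binds one free guest RX
				 * descriptor to an mbuf so the NIC can DMA straight into
				 * guest memory; the loop below is bounded by the free
				 * descriptors, the mbufs left in the ring and
				 * MAX_PKT_BURST.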
2140 */ 2141 for (i = 0; 2142 i < RTE_MIN(free_entries, 2143 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2144 i++) 2145 attach_rxmbuf_zcp(dev); 2146 2147 /* Handle guest RX */ 2148 rx_count = rte_eth_rx_burst(ports[0], 2149 vdev->vmdq_rx_q, pkts_burst, 2150 MAX_PKT_BURST); 2151 2152 if (rx_count) { 2153 ret_count = virtio_dev_rx_zcp(dev, 2154 pkts_burst, rx_count); 2155 if (enable_stats) { 2156 dev_statistics[dev->device_fh].rx_total 2157 += rx_count; 2158 dev_statistics[dev->device_fh].rx 2159 += ret_count; 2160 } 2161 while (likely(rx_count)) { 2162 rx_count--; 2163 pktmbuf_detach_zcp( 2164 pkts_burst[rx_count]); 2165 rte_ring_sp_enqueue( 2166 vpool_array[index].ring, 2167 (void *)pkts_burst[rx_count]); 2168 } 2169 } 2170 } 2171 2172 if (likely(!vdev->remove)) 2173 /* Handle guest TX */ 2174 virtio_dev_tx_zcp(dev); 2175 2176 /* Move to the next device in the list */ 2177 dev_ll = dev_ll->next; 2178 } 2179 } 2180 2181 return 0; 2182 } 2183 2184 2185 /* 2186 * Add an entry to a used linked list. A free entry must first be found 2187 * in the free linked list using get_data_ll_free_entry(); 2188 */ 2189 static void 2190 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2191 struct virtio_net_data_ll *ll_dev) 2192 { 2193 struct virtio_net_data_ll *ll = *ll_root_addr; 2194 2195 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2196 ll_dev->next = NULL; 2197 rte_compiler_barrier(); 2198 2199 /* If ll == NULL then this is the first device. */ 2200 if (ll) { 2201 /* Increment to the tail of the linked list. */ 2202 while ((ll->next != NULL) ) 2203 ll = ll->next; 2204 2205 ll->next = ll_dev; 2206 } else { 2207 *ll_root_addr = ll_dev; 2208 } 2209 } 2210 2211 /* 2212 * Remove an entry from a used linked list. The entry must then be added to 2213 * the free linked list using put_data_ll_free_entry(). 2214 */ 2215 static void 2216 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2217 struct virtio_net_data_ll *ll_dev, 2218 struct virtio_net_data_ll *ll_dev_last) 2219 { 2220 struct virtio_net_data_ll *ll = *ll_root_addr; 2221 2222 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2223 return; 2224 2225 if (ll_dev == ll) 2226 *ll_root_addr = ll_dev->next; 2227 else 2228 if (likely(ll_dev_last != NULL)) 2229 ll_dev_last->next = ll_dev->next; 2230 else 2231 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2232 } 2233 2234 /* 2235 * Find and return an entry from the free linked list. 2236 */ 2237 static struct virtio_net_data_ll * 2238 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2239 { 2240 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2241 struct virtio_net_data_ll *ll_dev; 2242 2243 if (ll_free == NULL) 2244 return NULL; 2245 2246 ll_dev = ll_free; 2247 *ll_root_addr = ll_free->next; 2248 2249 return ll_dev; 2250 } 2251 2252 /* 2253 * Place an entry back on to the free linked list. 2254 */ 2255 static void 2256 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2257 struct virtio_net_data_ll *ll_dev) 2258 { 2259 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2260 2261 if (ll_dev == NULL) 2262 return; 2263 2264 ll_dev->next = ll_free; 2265 *ll_root_addr = ll_dev; 2266 } 2267 2268 /* 2269 * Creates a linked list of a given size. 2270 */ 2271 static struct virtio_net_data_ll * 2272 alloc_data_ll(uint32_t size) 2273 { 2274 struct virtio_net_data_ll *ll_new; 2275 uint32_t i; 2276 2277 /* Malloc and then chain the linked list. 
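	 * The entries are allocated as a single contiguous array; the loop
	 * below links each element to the next and terminates the list by
	 * setting the final next pointer to NULL.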
*/ 2278 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2279 if (ll_new == NULL) { 2280 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2281 return NULL; 2282 } 2283 2284 for (i = 0; i < size - 1; i++) { 2285 ll_new[i].vdev = NULL; 2286 ll_new[i].next = &ll_new[i+1]; 2287 } 2288 ll_new[i].next = NULL; 2289 2290 return (ll_new); 2291 } 2292 2293 /* 2294 * Create the main linked list along with each individual cores linked list. A used and a free list 2295 * are created to manage entries. 2296 */ 2297 static int 2298 init_data_ll (void) 2299 { 2300 int lcore; 2301 2302 RTE_LCORE_FOREACH_SLAVE(lcore) { 2303 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2304 if (lcore_info[lcore].lcore_ll == NULL) { 2305 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2306 return -1; 2307 } 2308 2309 lcore_info[lcore].lcore_ll->device_num = 0; 2310 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2311 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2312 if (num_devices % num_switching_cores) 2313 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2314 else 2315 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2316 } 2317 2318 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2319 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2320 2321 return 0; 2322 } 2323 2324 /* 2325 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2326 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2327 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2328 */ 2329 static void 2330 destroy_device (volatile struct virtio_net *dev) 2331 { 2332 struct virtio_net_data_ll *ll_lcore_dev_cur; 2333 struct virtio_net_data_ll *ll_main_dev_cur; 2334 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2335 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2336 struct vhost_dev *vdev; 2337 int lcore; 2338 2339 dev->flags &= ~VIRTIO_DEV_RUNNING; 2340 2341 vdev = (struct vhost_dev *)dev->priv; 2342 /*set the remove flag. */ 2343 vdev->remove = 1; 2344 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2345 rte_pause(); 2346 } 2347 2348 /* Search for entry to be removed from lcore ll */ 2349 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2350 while (ll_lcore_dev_cur != NULL) { 2351 if (ll_lcore_dev_cur->vdev == vdev) { 2352 break; 2353 } else { 2354 ll_lcore_dev_last = ll_lcore_dev_cur; 2355 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2356 } 2357 } 2358 2359 if (ll_lcore_dev_cur == NULL) { 2360 RTE_LOG(ERR, VHOST_CONFIG, 2361 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2362 dev->device_fh); 2363 return; 2364 } 2365 2366 /* Search for entry to be removed from main ll */ 2367 ll_main_dev_cur = ll_root_used; 2368 ll_main_dev_last = NULL; 2369 while (ll_main_dev_cur != NULL) { 2370 if (ll_main_dev_cur->vdev == vdev) { 2371 break; 2372 } else { 2373 ll_main_dev_last = ll_main_dev_cur; 2374 ll_main_dev_cur = ll_main_dev_cur->next; 2375 } 2376 } 2377 2378 /* Remove entries from the lcore and main ll. */ 2379 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2380 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2381 2382 /* Set the dev_removal_flag on each lcore. 
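	 * Each worker core flips its flag back to ACK_DEV_REMOVAL only once it
	 * has exited its walk of the device list, so waiting for the ACK below
	 * guarantees that no core still holds a reference to the removed entry.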
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL
	 * we can be sure that it can no longer access the device removed from
	 * the linked lists and that the device is no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
			rte_pause();
		}
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

	if (zero_copy) {
		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

		/* Stop the RX queue. */
		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to stop "
				"rx queue:%d\n",
				dev->device_fh,
				vdev->vmdq_rx_q);
		}

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in destroy_device: Start put mbuf in "
			"mempool back to ring for RX queue: %d\n",
			dev->device_fh, vdev->vmdq_rx_q);

		mbuf_destroy_zcp(vpool);

		/* Stop the TX queue. */
		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to "
				"stop tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);
		}

		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
			dev->device_fh);

		mbuf_destroy_zcp(vpool);
		rte_free(vdev->regions_hpa);
	}
	rte_free(vdev);

}

/*
 * Calculate how many physically contiguous sub-regions make up one region
 * whose vhost virtual address range is contiguous. The region starts at
 * vva_start and is 'size' bytes long.
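 * The region is walked one page at a time with rte_mem_virt2phy(); every
 * point where two successive pages are not physically adjacent adds one to
 * the returned sub-region count.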
2453 */ 2454 static uint32_t 2455 check_hpa_regions(uint64_t vva_start, uint64_t size) 2456 { 2457 uint32_t i, nregions = 0, page_size = getpagesize(); 2458 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2459 if (vva_start % page_size) { 2460 LOG_DEBUG(VHOST_CONFIG, 2461 "in check_countinous: vva start(%p) mod page_size(%d) " 2462 "has remainder\n", 2463 (void *)(uintptr_t)vva_start, page_size); 2464 return 0; 2465 } 2466 if (size % page_size) { 2467 LOG_DEBUG(VHOST_CONFIG, 2468 "in check_countinous: " 2469 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2470 size, page_size); 2471 return 0; 2472 } 2473 for (i = 0; i < size - page_size; i = i + page_size) { 2474 cur_phys_addr 2475 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2476 next_phys_addr = rte_mem_virt2phy( 2477 (void *)(uintptr_t)(vva_start + i + page_size)); 2478 if ((cur_phys_addr + page_size) != next_phys_addr) { 2479 ++nregions; 2480 LOG_DEBUG(VHOST_CONFIG, 2481 "in check_continuous: hva addr:(%p) is not " 2482 "continuous with hva addr:(%p), diff:%d\n", 2483 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2484 (void *)(uintptr_t)(vva_start + (uint64_t)i 2485 + page_size), page_size); 2486 LOG_DEBUG(VHOST_CONFIG, 2487 "in check_continuous: hpa addr:(%p) is not " 2488 "continuous with hpa addr:(%p), " 2489 "diff:(%"PRIu64")\n", 2490 (void *)(uintptr_t)cur_phys_addr, 2491 (void *)(uintptr_t)next_phys_addr, 2492 (next_phys_addr-cur_phys_addr)); 2493 } 2494 } 2495 return nregions; 2496 } 2497 2498 /* 2499 * Divide each region whose vhost virtual address is continous into a few 2500 * sub-regions, make sure the physical address within each sub-region are 2501 * continous. And fill offset(to GPA) and size etc. information of each 2502 * sub-region into regions_hpa. 2503 */ 2504 static uint32_t 2505 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2506 { 2507 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2508 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2509 2510 if (mem_region_hpa == NULL) 2511 return 0; 2512 2513 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2514 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2515 virtio_memory->regions[regionidx].address_offset; 2516 mem_region_hpa[regionidx_hpa].guest_phys_address 2517 = virtio_memory->regions[regionidx].guest_phys_address; 2518 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2519 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2520 mem_region_hpa[regionidx_hpa].guest_phys_address; 2521 LOG_DEBUG(VHOST_CONFIG, 2522 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2523 regionidx_hpa, 2524 (void *)(uintptr_t) 2525 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2526 LOG_DEBUG(VHOST_CONFIG, 2527 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2528 regionidx_hpa, 2529 (void *)(uintptr_t) 2530 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2531 for (i = 0, k = 0; 2532 i < virtio_memory->regions[regionidx].memory_size - 2533 page_size; 2534 i += page_size) { 2535 cur_phys_addr = rte_mem_virt2phy( 2536 (void *)(uintptr_t)(vva_start + i)); 2537 next_phys_addr = rte_mem_virt2phy( 2538 (void *)(uintptr_t)(vva_start + 2539 i + page_size)); 2540 if ((cur_phys_addr + page_size) != next_phys_addr) { 2541 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2542 mem_region_hpa[regionidx_hpa].guest_phys_address + 2543 k + page_size; 2544 mem_region_hpa[regionidx_hpa].memory_size 2545 = k + 
page_size; 2546 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2547 "phys addr end [%d]:(%p)\n", 2548 regionidx_hpa, 2549 (void *)(uintptr_t) 2550 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2551 LOG_DEBUG(VHOST_CONFIG, 2552 "in fill_hpa_regions: guest phys addr " 2553 "size [%d]:(%p)\n", 2554 regionidx_hpa, 2555 (void *)(uintptr_t) 2556 (mem_region_hpa[regionidx_hpa].memory_size)); 2557 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2558 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2559 ++regionidx_hpa; 2560 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2561 next_phys_addr - 2562 mem_region_hpa[regionidx_hpa].guest_phys_address; 2563 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2564 " phys addr start[%d]:(%p)\n", 2565 regionidx_hpa, 2566 (void *)(uintptr_t) 2567 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2568 LOG_DEBUG(VHOST_CONFIG, 2569 "in fill_hpa_regions: host phys addr " 2570 "start[%d]:(%p)\n", 2571 regionidx_hpa, 2572 (void *)(uintptr_t) 2573 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2574 k = 0; 2575 } else { 2576 k += page_size; 2577 } 2578 } 2579 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2580 = mem_region_hpa[regionidx_hpa].guest_phys_address 2581 + k + page_size; 2582 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2583 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2584 "[%d]:(%p)\n", regionidx_hpa, 2585 (void *)(uintptr_t) 2586 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2587 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2588 "[%d]:(%p)\n", regionidx_hpa, 2589 (void *)(uintptr_t) 2590 (mem_region_hpa[regionidx_hpa].memory_size)); 2591 ++regionidx_hpa; 2592 } 2593 return regionidx_hpa; 2594 } 2595 2596 /* 2597 * A new device is added to a data core. First the device is added to the main linked list 2598 * and the allocated to a specific data core. 
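 * In zero-copy mode the mbufs in the per-queue ring set up in main() are
 * attached to the device's guest buffers and the corresponding RX and TX
 * queues of the physical port are started before the device is added to a
 * data core.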
2599 */ 2600 static int 2601 new_device (struct virtio_net *dev) 2602 { 2603 struct virtio_net_data_ll *ll_dev; 2604 int lcore, core_add = 0; 2605 uint32_t device_num_min = num_devices; 2606 struct vhost_dev *vdev; 2607 uint32_t regionidx; 2608 2609 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2610 if (vdev == NULL) { 2611 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2612 dev->device_fh); 2613 return -1; 2614 } 2615 vdev->dev = dev; 2616 dev->priv = vdev; 2617 2618 if (zero_copy) { 2619 vdev->nregions_hpa = dev->mem->nregions; 2620 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2621 vdev->nregions_hpa 2622 += check_hpa_regions( 2623 dev->mem->regions[regionidx].guest_phys_address 2624 + dev->mem->regions[regionidx].address_offset, 2625 dev->mem->regions[regionidx].memory_size); 2626 2627 } 2628 2629 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region", 2630 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa, 2631 RTE_CACHE_LINE_SIZE); 2632 if (vdev->regions_hpa == NULL) { 2633 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2634 rte_free(vdev); 2635 return -1; 2636 } 2637 2638 2639 if (fill_hpa_memory_regions( 2640 vdev->regions_hpa, dev->mem 2641 ) != vdev->nregions_hpa) { 2642 2643 RTE_LOG(ERR, VHOST_CONFIG, 2644 "hpa memory regions number mismatch: " 2645 "[%d]\n", vdev->nregions_hpa); 2646 rte_free(vdev->regions_hpa); 2647 rte_free(vdev); 2648 return -1; 2649 } 2650 } 2651 2652 2653 /* Add device to main ll */ 2654 ll_dev = get_data_ll_free_entry(&ll_root_free); 2655 if (ll_dev == NULL) { 2656 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2657 "of %d devices per core has been reached\n", 2658 dev->device_fh, num_devices); 2659 if (vdev->regions_hpa) 2660 rte_free(vdev->regions_hpa); 2661 rte_free(vdev); 2662 return -1; 2663 } 2664 ll_dev->vdev = vdev; 2665 add_data_ll_entry(&ll_root_used, ll_dev); 2666 vdev->vmdq_rx_q 2667 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2668 2669 if (zero_copy) { 2670 uint32_t index = vdev->vmdq_rx_q; 2671 uint32_t count_in_ring, i; 2672 struct mbuf_table *tx_q; 2673 2674 count_in_ring = rte_ring_count(vpool_array[index].ring); 2675 2676 LOG_DEBUG(VHOST_CONFIG, 2677 "(%"PRIu64") in new_device: mbuf count in mempool " 2678 "before attach is: %d\n", 2679 dev->device_fh, 2680 rte_mempool_count(vpool_array[index].pool)); 2681 LOG_DEBUG(VHOST_CONFIG, 2682 "(%"PRIu64") in new_device: mbuf count in ring " 2683 "before attach is : %d\n", 2684 dev->device_fh, count_in_ring); 2685 2686 /* 2687 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
2688 */ 2689 for (i = 0; i < count_in_ring; i++) 2690 attach_rxmbuf_zcp(dev); 2691 2692 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2693 "mempool after attach is: %d\n", 2694 dev->device_fh, 2695 rte_mempool_count(vpool_array[index].pool)); 2696 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2697 "ring after attach is : %d\n", 2698 dev->device_fh, 2699 rte_ring_count(vpool_array[index].ring)); 2700 2701 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2702 tx_q->txq_id = vdev->vmdq_rx_q; 2703 2704 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2705 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2706 2707 LOG_DEBUG(VHOST_CONFIG, 2708 "(%"PRIu64") In new_device: Failed to start " 2709 "tx queue:%d\n", 2710 dev->device_fh, vdev->vmdq_rx_q); 2711 2712 mbuf_destroy_zcp(vpool); 2713 rte_free(vdev->regions_hpa); 2714 rte_free(vdev); 2715 return -1; 2716 } 2717 2718 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2719 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2720 2721 LOG_DEBUG(VHOST_CONFIG, 2722 "(%"PRIu64") In new_device: Failed to start " 2723 "rx queue:%d\n", 2724 dev->device_fh, vdev->vmdq_rx_q); 2725 2726 /* Stop the TX queue. */ 2727 if (rte_eth_dev_tx_queue_stop(ports[0], 2728 vdev->vmdq_rx_q) != 0) { 2729 LOG_DEBUG(VHOST_CONFIG, 2730 "(%"PRIu64") In new_device: Failed to " 2731 "stop tx queue:%d\n", 2732 dev->device_fh, vdev->vmdq_rx_q); 2733 } 2734 2735 mbuf_destroy_zcp(vpool); 2736 rte_free(vdev->regions_hpa); 2737 rte_free(vdev); 2738 return -1; 2739 } 2740 2741 } 2742 2743 /*reset ready flag*/ 2744 vdev->ready = DEVICE_MAC_LEARNING; 2745 vdev->remove = 0; 2746 2747 /* Find a suitable lcore to add the device. */ 2748 RTE_LCORE_FOREACH_SLAVE(lcore) { 2749 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2750 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2751 core_add = lcore; 2752 } 2753 } 2754 /* Add device to lcore ll */ 2755 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2756 if (ll_dev == NULL) { 2757 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2758 vdev->ready = DEVICE_SAFE_REMOVE; 2759 destroy_device(dev); 2760 if (vdev->regions_hpa) 2761 rte_free(vdev->regions_hpa); 2762 rte_free(vdev); 2763 return -1; 2764 } 2765 ll_dev->vdev = vdev; 2766 vdev->coreid = core_add; 2767 2768 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2769 2770 /* Initialize device stats */ 2771 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2772 2773 /* Disable notifications. */ 2774 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2775 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2776 lcore_info[vdev->coreid].lcore_ll->device_num++; 2777 dev->flags |= VIRTIO_DEV_RUNNING; 2778 2779 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2780 2781 return 0; 2782 } 2783 2784 /* 2785 * These callback allow devices to be added to the data core when configuration 2786 * has been fully complete. 2787 */ 2788 static const struct virtio_net_device_ops virtio_net_device_ops = 2789 { 2790 .new_device = new_device, 2791 .destroy_device = destroy_device, 2792 }; 2793 2794 /* 2795 * This is a thread will wake up after a period to print stats if the user has 2796 * enabled them. 
2797 */ 2798 static void 2799 print_stats(void) 2800 { 2801 struct virtio_net_data_ll *dev_ll; 2802 uint64_t tx_dropped, rx_dropped; 2803 uint64_t tx, tx_total, rx, rx_total; 2804 uint32_t device_fh; 2805 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2806 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2807 2808 while(1) { 2809 sleep(enable_stats); 2810 2811 /* Clear screen and move to top left */ 2812 printf("%s%s", clr, top_left); 2813 2814 printf("\nDevice statistics ===================================="); 2815 2816 dev_ll = ll_root_used; 2817 while (dev_ll != NULL) { 2818 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2819 tx_total = dev_statistics[device_fh].tx_total; 2820 tx = dev_statistics[device_fh].tx; 2821 tx_dropped = tx_total - tx; 2822 if (zero_copy == 0) { 2823 rx_total = rte_atomic64_read( 2824 &dev_statistics[device_fh].rx_total_atomic); 2825 rx = rte_atomic64_read( 2826 &dev_statistics[device_fh].rx_atomic); 2827 } else { 2828 rx_total = dev_statistics[device_fh].rx_total; 2829 rx = dev_statistics[device_fh].rx; 2830 } 2831 rx_dropped = rx_total - rx; 2832 2833 printf("\nStatistics for device %"PRIu32" ------------------------------" 2834 "\nTX total: %"PRIu64"" 2835 "\nTX dropped: %"PRIu64"" 2836 "\nTX successful: %"PRIu64"" 2837 "\nRX total: %"PRIu64"" 2838 "\nRX dropped: %"PRIu64"" 2839 "\nRX successful: %"PRIu64"", 2840 device_fh, 2841 tx_total, 2842 tx_dropped, 2843 tx, 2844 rx_total, 2845 rx_dropped, 2846 rx); 2847 2848 dev_ll = dev_ll->next; 2849 } 2850 printf("\n======================================================\n"); 2851 } 2852 } 2853 2854 static void 2855 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2856 char *ring_name, uint32_t nb_mbuf) 2857 { 2858 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM; 2859 vpool_array[index].pool 2860 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP, 2861 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private), 2862 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize, 2863 rte_pktmbuf_init, NULL, socket, 0); 2864 if (vpool_array[index].pool != NULL) { 2865 vpool_array[index].ring 2866 = rte_ring_create(ring_name, 2867 rte_align32pow2(nb_mbuf + 1), 2868 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2869 if (likely(vpool_array[index].ring != NULL)) { 2870 LOG_DEBUG(VHOST_CONFIG, 2871 "in setup_mempool_tbl: mbuf count in " 2872 "mempool is: %d\n", 2873 rte_mempool_count(vpool_array[index].pool)); 2874 LOG_DEBUG(VHOST_CONFIG, 2875 "in setup_mempool_tbl: mbuf count in " 2876 "ring is: %d\n", 2877 rte_ring_count(vpool_array[index].ring)); 2878 } else { 2879 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2880 ring_name); 2881 } 2882 2883 /* Need consider head room. */ 2884 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM; 2885 } else { 2886 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2887 } 2888 } 2889 2890 2891 /* 2892 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2893 * device is also registered here to handle the IOCTLs. 
2894 */ 2895 int 2896 main(int argc, char *argv[]) 2897 { 2898 struct rte_mempool *mbuf_pool = NULL; 2899 unsigned lcore_id, core_id = 0; 2900 unsigned nb_ports, valid_num_ports; 2901 int ret; 2902 uint8_t portid; 2903 uint16_t queue_id; 2904 static pthread_t tid; 2905 2906 /* init EAL */ 2907 ret = rte_eal_init(argc, argv); 2908 if (ret < 0) 2909 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2910 argc -= ret; 2911 argv += ret; 2912 2913 /* parse app arguments */ 2914 ret = us_vhost_parse_args(argc, argv); 2915 if (ret < 0) 2916 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2917 2918 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2919 if (rte_lcore_is_enabled(lcore_id)) 2920 lcore_ids[core_id ++] = lcore_id; 2921 2922 if (rte_lcore_count() > RTE_MAX_LCORE) 2923 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2924 2925 /*set the number of swithcing cores available*/ 2926 num_switching_cores = rte_lcore_count()-1; 2927 2928 /* Get the number of physical ports. */ 2929 nb_ports = rte_eth_dev_count(); 2930 if (nb_ports > RTE_MAX_ETHPORTS) 2931 nb_ports = RTE_MAX_ETHPORTS; 2932 2933 /* 2934 * Update the global var NUM_PORTS and global array PORTS 2935 * and get value of var VALID_NUM_PORTS according to system ports number 2936 */ 2937 valid_num_ports = check_ports_num(nb_ports); 2938 2939 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2940 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2941 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2942 return -1; 2943 } 2944 2945 if (zero_copy == 0) { 2946 /* Create the mbuf pool. */ 2947 mbuf_pool = rte_mempool_create( 2948 "MBUF_POOL", 2949 NUM_MBUFS_PER_PORT 2950 * valid_num_ports, 2951 MBUF_SIZE, MBUF_CACHE_SIZE, 2952 sizeof(struct rte_pktmbuf_pool_private), 2953 rte_pktmbuf_pool_init, NULL, 2954 rte_pktmbuf_init, NULL, 2955 rte_socket_id(), 0); 2956 if (mbuf_pool == NULL) 2957 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2958 2959 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2960 vpool_array[queue_id].pool = mbuf_pool; 2961 2962 if (vm2vm_mode == VM2VM_HARDWARE) { 2963 /* Enable VT loop back to let L2 switch to do it. */ 2964 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2965 LOG_DEBUG(VHOST_CONFIG, 2966 "Enable loop back for L2 switch in vmdq.\n"); 2967 } 2968 } else { 2969 uint32_t nb_mbuf; 2970 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2971 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2972 2973 nb_mbuf = num_rx_descriptor 2974 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2975 + num_switching_cores * MAX_PKT_BURST; 2976 2977 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2978 snprintf(pool_name, sizeof(pool_name), 2979 "rxmbuf_pool_%u", queue_id); 2980 snprintf(ring_name, sizeof(ring_name), 2981 "rxmbuf_ring_%u", queue_id); 2982 setup_mempool_tbl(rte_socket_id(), queue_id, 2983 pool_name, ring_name, nb_mbuf); 2984 } 2985 2986 nb_mbuf = num_tx_descriptor 2987 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2988 + num_switching_cores * MAX_PKT_BURST; 2989 2990 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2991 snprintf(pool_name, sizeof(pool_name), 2992 "txmbuf_pool_%u", queue_id); 2993 snprintf(ring_name, sizeof(ring_name), 2994 "txmbuf_ring_%u", queue_id); 2995 setup_mempool_tbl(rte_socket_id(), 2996 (queue_id + MAX_QUEUES), 2997 pool_name, ring_name, nb_mbuf); 2998 } 2999 3000 if (vm2vm_mode == VM2VM_HARDWARE) { 3001 /* Enable VT loop back to let L2 switch to do it. 
*/ 3002 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3003 LOG_DEBUG(VHOST_CONFIG, 3004 "Enable loop back for L2 switch in vmdq.\n"); 3005 } 3006 } 3007 /* Set log level. */ 3008 rte_set_log_level(LOG_LEVEL); 3009 3010 /* initialize all ports */ 3011 for (portid = 0; portid < nb_ports; portid++) { 3012 /* skip ports that are not enabled */ 3013 if ((enabled_port_mask & (1 << portid)) == 0) { 3014 RTE_LOG(INFO, VHOST_PORT, 3015 "Skipping disabled port %d\n", portid); 3016 continue; 3017 } 3018 if (port_init(portid) != 0) 3019 rte_exit(EXIT_FAILURE, 3020 "Cannot initialize network ports\n"); 3021 } 3022 3023 /* Initialise all linked lists. */ 3024 if (init_data_ll() == -1) 3025 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3026 3027 /* Initialize device stats */ 3028 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3029 3030 /* Enable stats if the user option is set. */ 3031 if (enable_stats) 3032 pthread_create(&tid, NULL, (void*)print_stats, NULL ); 3033 3034 /* Launch all data cores. */ 3035 if (zero_copy == 0) { 3036 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3037 rte_eal_remote_launch(switch_worker, 3038 mbuf_pool, lcore_id); 3039 } 3040 } else { 3041 uint32_t count_in_mempool, index, i; 3042 for (index = 0; index < 2*MAX_QUEUES; index++) { 3043 /* For all RX and TX queues. */ 3044 count_in_mempool 3045 = rte_mempool_count(vpool_array[index].pool); 3046 3047 /* 3048 * Transfer all un-attached mbufs from vpool.pool 3049 * to vpoo.ring. 3050 */ 3051 for (i = 0; i < count_in_mempool; i++) { 3052 struct rte_mbuf *mbuf 3053 = __rte_mbuf_raw_alloc( 3054 vpool_array[index].pool); 3055 rte_ring_sp_enqueue(vpool_array[index].ring, 3056 (void *)mbuf); 3057 } 3058 3059 LOG_DEBUG(VHOST_CONFIG, 3060 "in main: mbuf count in mempool at initial " 3061 "is: %d\n", count_in_mempool); 3062 LOG_DEBUG(VHOST_CONFIG, 3063 "in main: mbuf count in ring at initial is :" 3064 " %d\n", 3065 rte_ring_count(vpool_array[index].ring)); 3066 } 3067 3068 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3069 rte_eal_remote_launch(switch_worker_zcp, NULL, 3070 lcore_id); 3071 } 3072 3073 if (mergeable == 0) 3074 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3075 3076 /* Register CUSE device to handle IOCTLs. */ 3077 ret = rte_vhost_driver_register((char *)&dev_basename); 3078 if (ret != 0) 3079 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n"); 3080 3081 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3082 3083 /* Start CUSE session. */ 3084 rte_vhost_driver_session_start(); 3085 return 0; 3086 3087 } 3088 3089