/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 512

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +	\
			(num_switching_cores*MAX_PKT_BURST) +		\
			(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
			(num_switching_cores*MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffer, and vhost
 * uses it directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP  RTE_MBUF_DEFAULT_DATAROOM
#define MBUF_DATA_SIZE_ZCP         RTE_MBUF_DEFAULT_BUF_SIZE
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100		/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15		/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4		/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
 * then adjusted to a power of 2.
 */
/*
 * For legacy front end, 128 descriptors,
 * half for virtio header, another half for mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy, pkts buffer will directly dma to hw descriptor,
 * disabled on default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Do vlan strip on host, enabled on default */
static uint32_t vlan_strip = 1;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * It is necessary for 1G NIC such as I350;
		 * this fixes the bug where IPv4 forwarding in the guest can't
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char   h_dest[ETH_ALEN];
	unsigned char   h_source[ETH_ALEN];
	__be16          h_vlan_proto;
	__be16          h_vlan_TCI;
	__be16          h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

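/*
 * Illustrative note (not part of the original sources): with num_devices
 * equal to 8, the loop in get_eth_conf() above produces
 *   pool_map[0] = { vlan_id 1000, pools 0x01 },
 *   pool_map[1] = { vlan_id 1001, pools 0x02 },
 *   ...
 *   pool_map[7] = { vlan_id 1007, pools 0x80 },
 * i.e. each VMDQ pool accepts exactly one VLAN tag from vlan_tags[], and
 * each virtio device is later bound to its own pool.
 */
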
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* parse number string */

	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"    --vm2vm [0|1|2]\n"
	"    --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"    --dev-basename <name>\n"
	"    --nb-devices ND\n"
	"    -p PORTMASK: Set mask for ports to be used by application\n"
	"    --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"    --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"    --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on RX are enabled\n"
	"    --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if retries on RX are enabled\n"
	"    --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"    --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"    --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"    --dev-basename: The basename to be used for the character device.\n"
	"    --zero-copy [0|1]: disable(default)/enable rx/tx "
		"zero copy\n"
	"    --rx-desc-num [0-N]: the number of descriptors on rx, "
		"used only when zero copy is enabled.\n"
	"    --tx-desc-num [0-N]: the number of descriptors on tx, "
		"used only when zero copy is enabled.\n",
	       prgname);
}

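/*
 * Illustrative example (not part of the original sources): a typical
 * invocation of this sample application might look like
 *
 *   ./vhost-switch -c 0x3 -n 4 -- -p 0x1 --dev-basename vhost-net --stats 2
 *
 * where the EAL arguments appear before "--" and the options handled by
 * us_vhost_parse_args() below appear after it. The exact binary name and
 * EAL flags depend on how the example was built and on the platform.
 */
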
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"vlan-strip", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable RX VLAN strip on host. */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the system
 * port number and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
		(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	if (vlan_strip)
		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
			(uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[dev->device_fh].tx_total++;
					dev_statistics[dev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip will reduce the packet length
			 * by the length of the vlan tag, so we need to
			 * restore the packet length by adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				(int)*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;
	struct ether_hdr *nh;

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags = PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core.
 * It handles all RX/TX registered with the core. For TX the specific lcore
 * linked list is used. For RX, MAC addresses are compared with all devices
 * in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* TX any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

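/*
 * Illustrative note (not part of the original sources): the vring indexes
 * used here (avail->idx, last_used_idx_res) are free-running 16-bit
 * counters, so differences such as the free_entries calculation in
 * get_available_ring_index_zcp() below are taken modulo 65536 when stored
 * back into a uint16_t. For example, avail_idx == 3 with a reserved base
 * index of 65533 yields 6 free entries, which is the intended result as
 * long as the ring size is much smaller than 65536.
 */
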
/*
 * This function gets the available ring index for zero copy rx; it will
 * retry 'burst_rx_retry_num' times till it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
		"avail idx: %d, "
		"res base idx:%d, free entries:%d\n",
		dev->device_fh, avail_idx, *res_base_idx,
		free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts the descriptor back into the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * un-attached mbuf from vpool->ring, and then attaches them together. It
 * needs to adjust the offset for buff_addr and phys_addr according to the
 * PMD implementation, otherwise the frame data may be put in the wrong
 * location in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	void *obj = NULL;
	struct rte_mbuf *mbuf;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, &obj);
	mbuf = obj;
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, obj);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}

/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = rte_mbuf_to_baddr(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(MBUF_EXT_MEM(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed. It fetches
 * mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(MBUF_EXT_MEM(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function updates the use flag and counter.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return count;
}

/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	void *obj = NULL;
	struct rte_mbuf *mbuf;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, &obj);
	mbuf = obj;
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a vlan tag from any vm for an external pkt, such
		 * as vlan_tags[dev->device_fh]; otherwise it conflicts with
		 * pool selection: the MAC address identifies it as an external
		 * pkt which should go to the network, while the vlan tag
		 * identifies it as a vm2vm pkt that should be forwarded to
		 * another vm. The hardware cannot resolve such an ambiguous
		 * situation, so the pkt would be lost.
		 */
1825 */ 1826 vlan_tag = external_pkt_default_vlan_tag; 1827 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1828 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1829 __rte_mbuf_raw_free(mbuf); 1830 return; 1831 } 1832 } 1833 1834 mbuf->nb_segs = m->nb_segs; 1835 mbuf->next = m->next; 1836 mbuf->data_len = m->data_len + offset; 1837 mbuf->pkt_len = mbuf->data_len; 1838 if (unlikely(need_copy)) { 1839 /* Copy the packet contents to the mbuf. */ 1840 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1841 rte_pktmbuf_mtod(m, void *), 1842 m->data_len); 1843 } else { 1844 mbuf->data_off = m->data_off; 1845 mbuf->buf_physaddr = m->buf_physaddr; 1846 mbuf->buf_addr = m->buf_addr; 1847 } 1848 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1849 mbuf->vlan_tci = vlan_tag; 1850 mbuf->l2_len = sizeof(struct ether_hdr); 1851 mbuf->l3_len = sizeof(struct ipv4_hdr); 1852 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1853 1854 tx_q->m_table[len] = mbuf; 1855 len++; 1856 1857 LOG_DEBUG(VHOST_DATA, 1858 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1859 dev->device_fh, 1860 mbuf->nb_segs, 1861 (mbuf->next == NULL) ? "null" : "non-null"); 1862 1863 if (enable_stats) { 1864 dev_statistics[dev->device_fh].tx_total++; 1865 dev_statistics[dev->device_fh].tx++; 1866 } 1867 1868 if (unlikely(len == MAX_PKT_BURST)) { 1869 m_table = (struct rte_mbuf **)tx_q->m_table; 1870 ret = rte_eth_tx_burst(ports[0], 1871 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1872 1873 /* 1874 * Free any buffers not handled by TX and update 1875 * the port stats. 1876 */ 1877 if (unlikely(ret < len)) { 1878 do { 1879 rte_pktmbuf_free(m_table[ret]); 1880 } while (++ret < len); 1881 } 1882 1883 len = 0; 1884 txmbuf_clean_zcp(dev, vpool); 1885 } 1886 1887 tx_q->len = len; 1888 1889 return; 1890 } 1891 1892 /* 1893 * This function TX all available packets in virtio TX queue for one 1894 * virtio-net device. If it is first packet, it learns MAC address and 1895 * setup VMDQ. 1896 */ 1897 static inline void __attribute__((always_inline)) 1898 virtio_dev_tx_zcp(struct virtio_net *dev) 1899 { 1900 struct rte_mbuf m; 1901 struct vhost_virtqueue *vq; 1902 struct vring_desc *desc; 1903 uint64_t buff_addr = 0, phys_addr; 1904 uint32_t head[MAX_PKT_BURST]; 1905 uint32_t i; 1906 uint16_t free_entries, packet_success = 0; 1907 uint16_t avail_idx; 1908 uint8_t need_copy = 0; 1909 hpa_type addr_type; 1910 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1911 1912 vq = dev->virtqueue[VIRTIO_TXQ]; 1913 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1914 1915 /* If there are no available buffers then return. */ 1916 if (vq->last_used_idx_res == avail_idx) 1917 return; 1918 1919 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1920 1921 /* Prefetch available ring to retrieve head indexes. */ 1922 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1923 1924 /* Get the number of free entries in the ring */ 1925 free_entries = (avail_idx - vq->last_used_idx_res); 1926 1927 /* Limit to MAX_PKT_BURST. */ 1928 free_entries 1929 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1930 1931 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1932 dev->device_fh, free_entries); 1933 1934 /* Retrieve all of the head indexes first to avoid caching issues. */ 1935 for (i = 0; i < free_entries; i++) 1936 head[i] 1937 = vq->avail->ring[(vq->last_used_idx_res + i) 1938 & (vq->size - 1)]; 1939 1940 vq->last_used_idx_res += free_entries; 1941 1942 /* Prefetch descriptor index. 
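* The used-ring slot that will be written for the current packet is
* prefetched as well.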
*/
1943 rte_prefetch0(&vq->desc[head[packet_success]]);
1944 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1945
1946 while (packet_success < free_entries) {
1947 desc = &vq->desc[head[packet_success]];
1948
1949 /* Discard first buffer as it is the virtio header */
1950 desc = &vq->desc[desc->next];
1951
1952 /* Buffer address translation. */
1953 buff_addr = gpa_to_vva(dev, desc->addr);
1954 /* Check an extra VLAN_HLEN bytes so a VLAN tag can be inserted. */
1955 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1956 &addr_type);
1957
1958 if (likely(packet_success < (free_entries - 1)))
1959 /* Prefetch descriptor index. */
1960 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1961
1962 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1963 RTE_LOG(ERR, VHOST_DATA,
1964 "(%"PRIu64") Invalid frame buffer address found "
1965 "when transmitting packets!\n",
1966 dev->device_fh);
1967 packet_success++;
1968 continue;
1969 }
1970
1971 /* Prefetch buffer address. */
1972 rte_prefetch0((void *)(uintptr_t)buff_addr);
1973
1974 /*
1975 * Setup dummy mbuf. This is copied to a real mbuf if
1976 * transmitted out of the physical port.
1977 */
1978 m.data_len = desc->len;
1979 m.nb_segs = 1;
1980 m.next = NULL;
1981 m.data_off = 0;
1982 m.buf_addr = (void *)(uintptr_t)buff_addr;
1983 m.buf_physaddr = phys_addr;
1984
1985 /*
1986 * Check whether the frame buffer address from the guest crosses
1987 * a sub-region boundary.
1988 */
1989 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1990 RTE_LOG(ERR, VHOST_DATA,
1991 "(%"PRIu64") Frame buffer address crossing a "
1992 "sub-region found when attaching the TX frame "
1993 "buffer address!\n",
1994 dev->device_fh);
1995 need_copy = 1;
1996 } else
1997 need_copy = 0;
1998
1999 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2000
2001 /*
2002 * If this is the first received packet we need to learn
2003 * the MAC address and set up VMDq.
2004 */
2005 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2006 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2007 /*
2008 * Discard frame if device is scheduled for
2009 * removal or a duplicate MAC address is found.
2010 */
2011 packet_success += free_entries;
2012 vq->last_used_idx += packet_success;
2013 break;
2014 }
2015 }
2016
2017 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2018 packet_success++;
2019 }
2020 }
2021
2022 /*
2023 * This function is called by each data core. It handles all RX/TX registered
2024 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2025 * addresses are compared with all devices in the main linked list.
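* TX queues are drained at least once every BURST_TX_DRAIN_US microseconds;
* drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US
* converts that interval into TSC cycles.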
2026 */ 2027 static int 2028 switch_worker_zcp(__attribute__((unused)) void *arg) 2029 { 2030 struct virtio_net *dev = NULL; 2031 struct vhost_dev *vdev = NULL; 2032 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2033 struct virtio_net_data_ll *dev_ll; 2034 struct mbuf_table *tx_q; 2035 volatile struct lcore_ll_info *lcore_ll; 2036 const uint64_t drain_tsc 2037 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2038 * BURST_TX_DRAIN_US; 2039 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2040 unsigned ret; 2041 const uint16_t lcore_id = rte_lcore_id(); 2042 uint16_t count_in_ring, rx_count = 0; 2043 2044 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2045 2046 lcore_ll = lcore_info[lcore_id].lcore_ll; 2047 prev_tsc = 0; 2048 2049 while (1) { 2050 cur_tsc = rte_rdtsc(); 2051 2052 /* TX burst queue drain */ 2053 diff_tsc = cur_tsc - prev_tsc; 2054 if (unlikely(diff_tsc > drain_tsc)) { 2055 /* 2056 * Get mbuf from vpool.pool and detach mbuf and 2057 * put back into vpool.ring. 2058 */ 2059 dev_ll = lcore_ll->ll_root_used; 2060 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2061 /* Get virtio device ID */ 2062 vdev = dev_ll->vdev; 2063 dev = vdev->dev; 2064 2065 if (likely(!vdev->remove)) { 2066 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2067 if (tx_q->len) { 2068 LOG_DEBUG(VHOST_DATA, 2069 "TX queue drained after timeout" 2070 " with burst size %u\n", 2071 tx_q->len); 2072 2073 /* 2074 * Tx any packets in the queue 2075 */ 2076 ret = rte_eth_tx_burst( 2077 ports[0], 2078 (uint16_t)tx_q->txq_id, 2079 (struct rte_mbuf **) 2080 tx_q->m_table, 2081 (uint16_t)tx_q->len); 2082 if (unlikely(ret < tx_q->len)) { 2083 do { 2084 rte_pktmbuf_free( 2085 tx_q->m_table[ret]); 2086 } while (++ret < tx_q->len); 2087 } 2088 tx_q->len = 0; 2089 2090 txmbuf_clean_zcp(dev, 2091 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2092 } 2093 } 2094 dev_ll = dev_ll->next; 2095 } 2096 prev_tsc = cur_tsc; 2097 } 2098 2099 rte_prefetch0(lcore_ll->ll_root_used); 2100 2101 /* 2102 * Inform the configuration core that we have exited the linked 2103 * list and that no devices are in use if requested. 2104 */ 2105 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2106 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2107 2108 /* Process devices */ 2109 dev_ll = lcore_ll->ll_root_used; 2110 2111 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2112 vdev = dev_ll->vdev; 2113 dev = vdev->dev; 2114 if (unlikely(vdev->remove)) { 2115 dev_ll = dev_ll->next; 2116 unlink_vmdq(vdev); 2117 vdev->ready = DEVICE_SAFE_REMOVE; 2118 continue; 2119 } 2120 2121 if (likely(vdev->ready == DEVICE_RX)) { 2122 uint32_t index = vdev->vmdq_rx_q; 2123 uint16_t i; 2124 count_in_ring 2125 = rte_ring_count(vpool_array[index].ring); 2126 uint16_t free_entries 2127 = (uint16_t)get_available_ring_num_zcp(dev); 2128 2129 /* 2130 * Attach all mbufs in vpool.ring and put back 2131 * into vpool.pool. 
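* The number of mbufs attached per pass is bounded by the free entries in
* the guest RX ring, the mbufs currently in vpool.ring, and MAX_PKT_BURST.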
2132 */ 2133 for (i = 0; 2134 i < RTE_MIN(free_entries, 2135 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2136 i++) 2137 attach_rxmbuf_zcp(dev); 2138 2139 /* Handle guest RX */ 2140 rx_count = rte_eth_rx_burst(ports[0], 2141 vdev->vmdq_rx_q, pkts_burst, 2142 MAX_PKT_BURST); 2143 2144 if (rx_count) { 2145 ret_count = virtio_dev_rx_zcp(dev, 2146 pkts_burst, rx_count); 2147 if (enable_stats) { 2148 dev_statistics[dev->device_fh].rx_total 2149 += rx_count; 2150 dev_statistics[dev->device_fh].rx 2151 += ret_count; 2152 } 2153 while (likely(rx_count)) { 2154 rx_count--; 2155 pktmbuf_detach_zcp( 2156 pkts_burst[rx_count]); 2157 rte_ring_sp_enqueue( 2158 vpool_array[index].ring, 2159 (void *)pkts_burst[rx_count]); 2160 } 2161 } 2162 } 2163 2164 if (likely(!vdev->remove)) 2165 /* Handle guest TX */ 2166 virtio_dev_tx_zcp(dev); 2167 2168 /* Move to the next device in the list */ 2169 dev_ll = dev_ll->next; 2170 } 2171 } 2172 2173 return 0; 2174 } 2175 2176 2177 /* 2178 * Add an entry to a used linked list. A free entry must first be found 2179 * in the free linked list using get_data_ll_free_entry(); 2180 */ 2181 static void 2182 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2183 struct virtio_net_data_ll *ll_dev) 2184 { 2185 struct virtio_net_data_ll *ll = *ll_root_addr; 2186 2187 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2188 ll_dev->next = NULL; 2189 rte_compiler_barrier(); 2190 2191 /* If ll == NULL then this is the first device. */ 2192 if (ll) { 2193 /* Increment to the tail of the linked list. */ 2194 while ((ll->next != NULL) ) 2195 ll = ll->next; 2196 2197 ll->next = ll_dev; 2198 } else { 2199 *ll_root_addr = ll_dev; 2200 } 2201 } 2202 2203 /* 2204 * Remove an entry from a used linked list. The entry must then be added to 2205 * the free linked list using put_data_ll_free_entry(). 2206 */ 2207 static void 2208 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2209 struct virtio_net_data_ll *ll_dev, 2210 struct virtio_net_data_ll *ll_dev_last) 2211 { 2212 struct virtio_net_data_ll *ll = *ll_root_addr; 2213 2214 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2215 return; 2216 2217 if (ll_dev == ll) 2218 *ll_root_addr = ll_dev->next; 2219 else 2220 if (likely(ll_dev_last != NULL)) 2221 ll_dev_last->next = ll_dev->next; 2222 else 2223 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2224 } 2225 2226 /* 2227 * Find and return an entry from the free linked list. 2228 */ 2229 static struct virtio_net_data_ll * 2230 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2231 { 2232 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2233 struct virtio_net_data_ll *ll_dev; 2234 2235 if (ll_free == NULL) 2236 return NULL; 2237 2238 ll_dev = ll_free; 2239 *ll_root_addr = ll_free->next; 2240 2241 return ll_dev; 2242 } 2243 2244 /* 2245 * Place an entry back on to the free linked list. 2246 */ 2247 static void 2248 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2249 struct virtio_net_data_ll *ll_dev) 2250 { 2251 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2252 2253 if (ll_dev == NULL) 2254 return; 2255 2256 ll_dev->next = ll_free; 2257 *ll_root_addr = ll_dev; 2258 } 2259 2260 /* 2261 * Creates a linked list of a given size. 2262 */ 2263 static struct virtio_net_data_ll * 2264 alloc_data_ll(uint32_t size) 2265 { 2266 struct virtio_net_data_ll *ll_new; 2267 uint32_t i; 2268 2269 /* Malloc and then chain the linked list. 
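* Entries are chained in array order and the last entry's next pointer is
* left NULL to terminate the list.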
*/ 2270 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2271 if (ll_new == NULL) { 2272 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2273 return NULL; 2274 } 2275 2276 for (i = 0; i < size - 1; i++) { 2277 ll_new[i].vdev = NULL; 2278 ll_new[i].next = &ll_new[i+1]; 2279 } 2280 ll_new[i].next = NULL; 2281 2282 return (ll_new); 2283 } 2284 2285 /* 2286 * Create the main linked list along with each individual cores linked list. A used and a free list 2287 * are created to manage entries. 2288 */ 2289 static int 2290 init_data_ll (void) 2291 { 2292 int lcore; 2293 2294 RTE_LCORE_FOREACH_SLAVE(lcore) { 2295 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2296 if (lcore_info[lcore].lcore_ll == NULL) { 2297 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2298 return -1; 2299 } 2300 2301 lcore_info[lcore].lcore_ll->device_num = 0; 2302 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2303 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2304 if (num_devices % num_switching_cores) 2305 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2306 else 2307 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2308 } 2309 2310 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2311 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2312 2313 return 0; 2314 } 2315 2316 /* 2317 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2318 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2319 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2320 */ 2321 static void 2322 destroy_device (volatile struct virtio_net *dev) 2323 { 2324 struct virtio_net_data_ll *ll_lcore_dev_cur; 2325 struct virtio_net_data_ll *ll_main_dev_cur; 2326 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2327 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2328 struct vhost_dev *vdev; 2329 int lcore; 2330 2331 dev->flags &= ~VIRTIO_DEV_RUNNING; 2332 2333 vdev = (struct vhost_dev *)dev->priv; 2334 /*set the remove flag. */ 2335 vdev->remove = 1; 2336 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2337 rte_pause(); 2338 } 2339 2340 /* Search for entry to be removed from lcore ll */ 2341 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2342 while (ll_lcore_dev_cur != NULL) { 2343 if (ll_lcore_dev_cur->vdev == vdev) { 2344 break; 2345 } else { 2346 ll_lcore_dev_last = ll_lcore_dev_cur; 2347 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2348 } 2349 } 2350 2351 if (ll_lcore_dev_cur == NULL) { 2352 RTE_LOG(ERR, VHOST_CONFIG, 2353 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2354 dev->device_fh); 2355 return; 2356 } 2357 2358 /* Search for entry to be removed from main ll */ 2359 ll_main_dev_cur = ll_root_used; 2360 ll_main_dev_last = NULL; 2361 while (ll_main_dev_cur != NULL) { 2362 if (ll_main_dev_cur->vdev == vdev) { 2363 break; 2364 } else { 2365 ll_main_dev_last = ll_main_dev_cur; 2366 ll_main_dev_cur = ll_main_dev_cur->next; 2367 } 2368 } 2369 2370 /* Remove entries from the lcore and main ll. */ 2371 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2372 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2373 2374 /* Set the dev_removal_flag on each lcore. 
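* Each worker core acknowledges the request by writing ACK_DEV_REMOVAL once
* it is no longer walking its linked list.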
*/
2375 RTE_LCORE_FOREACH_SLAVE(lcore) {
2376 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2377 }
2378
2379 /*
2380 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2381 * they can no longer access the device removed from the linked lists and that the device
2382 * is no longer in use.
2383 */
2384 RTE_LCORE_FOREACH_SLAVE(lcore) {
2385 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2386 rte_pause();
2387 }
2388 }
2389
2390 /* Add the entries back to the lcore and main free ll. */
2391 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2392 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2393
2394 /* Decrement the number of devices on the lcore. */
2395 lcore_info[vdev->coreid].lcore_ll->device_num--;
2396
2397 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2398
2399 if (zero_copy) {
2400 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2401
2402 /* Stop the RX queue. */
2403 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2404 LOG_DEBUG(VHOST_CONFIG,
2405 "(%"PRIu64") In destroy_device: Failed to stop "
2406 "rx queue:%d\n",
2407 dev->device_fh,
2408 vdev->vmdq_rx_q);
2409 }
2410
2411 LOG_DEBUG(VHOST_CONFIG,
2412 "(%"PRIu64") in destroy_device: start returning mbufs "
2413 "from mempool to ring for RX queue: %d\n",
2414 dev->device_fh, vdev->vmdq_rx_q);
2415
2416 mbuf_destroy_zcp(vpool);
2417
2418 /* Stop the TX queue. */
2419 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2420 LOG_DEBUG(VHOST_CONFIG,
2421 "(%"PRIu64") In destroy_device: Failed to "
2422 "stop tx queue:%d\n",
2423 dev->device_fh, vdev->vmdq_rx_q);
2424 }
2425
2426 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2427
2428 LOG_DEBUG(VHOST_CONFIG,
2429 "(%"PRIu64") destroy_device: start returning mbufs from "
2430 "mempool to ring for TX queue: %d, dev:(%"PRIu64")\n",
2431 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2432 dev->device_fh);
2433
2434 mbuf_destroy_zcp(vpool);
2435 rte_free(vdev->regions_hpa);
2436 }
2437 rte_free(vdev);
2438
2439 }
2440
2441 /*
2442 * Calculate the number of physically contiguous sub-regions within one
2443 * region whose vhost virtual address range is contiguous. The region
2444 * starts at vva_start and spans 'size' bytes.
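* For example, assuming a 4 KB page size, a 16 KB range backed by host
* pages P, P+4K, Q and Q+4K (with Q not adjacent to P+4K) contains one
* break in physical continuity, so the function returns 1.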
2445 */
2446 static uint32_t
2447 check_hpa_regions(uint64_t vva_start, uint64_t size)
2448 {
2449 uint32_t i, nregions = 0, page_size = getpagesize();
2450 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2451 if (vva_start % page_size) {
2452 LOG_DEBUG(VHOST_CONFIG,
2453 "in check_continuous: vva start(%p) mod page_size(%d) "
2454 "has remainder\n",
2455 (void *)(uintptr_t)vva_start, page_size);
2456 return 0;
2457 }
2458 if (size % page_size) {
2459 LOG_DEBUG(VHOST_CONFIG,
2460 "in check_continuous: "
2461 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2462 size, page_size);
2463 return 0;
2464 }
2465 for (i = 0; i < size - page_size; i = i + page_size) {
2466 cur_phys_addr
2467 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2468 next_phys_addr = rte_mem_virt2phy(
2469 (void *)(uintptr_t)(vva_start + i + page_size));
2470 if ((cur_phys_addr + page_size) != next_phys_addr) {
2471 ++nregions;
2472 LOG_DEBUG(VHOST_CONFIG,
2473 "in check_continuous: hva addr:(%p) is not "
2474 "continuous with hva addr:(%p), diff:%d\n",
2475 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2476 (void *)(uintptr_t)(vva_start + (uint64_t)i
2477 + page_size), page_size);
2478 LOG_DEBUG(VHOST_CONFIG,
2479 "in check_continuous: hpa addr:(%p) is not "
2480 "continuous with hpa addr:(%p), "
2481 "diff:(%"PRIu64")\n",
2482 (void *)(uintptr_t)cur_phys_addr,
2483 (void *)(uintptr_t)next_phys_addr,
2484 (next_phys_addr-cur_phys_addr));
2485 }
2486 }
2487 return nregions;
2488 }
2489
2490 /*
2491 * Divide each region whose vhost virtual address range is contiguous into
2492 * sub-regions within which the host physical addresses are also contiguous,
2493 * and fill the offset (relative to the GPA), size and other information of
2494 * each sub-region into regions_hpa.
2495 */
2496 static uint32_t
2497 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2498 {
2499 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2500 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2501
2502 if (mem_region_hpa == NULL)
2503 return 0;
2504
2505 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2506 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2507 virtio_memory->regions[regionidx].address_offset;
2508 mem_region_hpa[regionidx_hpa].guest_phys_address
2509 = virtio_memory->regions[regionidx].guest_phys_address;
2510 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2511 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2512 mem_region_hpa[regionidx_hpa].guest_phys_address;
2513 LOG_DEBUG(VHOST_CONFIG,
2514 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2515 regionidx_hpa,
2516 (void *)(uintptr_t)
2517 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2518 LOG_DEBUG(VHOST_CONFIG,
2519 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2520 regionidx_hpa,
2521 (void *)(uintptr_t)
2522 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2523 for (i = 0, k = 0;
2524 i < virtio_memory->regions[regionidx].memory_size -
2525 page_size;
2526 i += page_size) {
2527 cur_phys_addr = rte_mem_virt2phy(
2528 (void *)(uintptr_t)(vva_start + i));
2529 next_phys_addr = rte_mem_virt2phy(
2530 (void *)(uintptr_t)(vva_start +
2531 i + page_size));
2532 if ((cur_phys_addr + page_size) != next_phys_addr) {
2533 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2534 mem_region_hpa[regionidx_hpa].guest_phys_address +
2535 k + page_size;
2536 mem_region_hpa[regionidx_hpa].memory_size
2537 = k + page_size;
2538 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2539 "phys addr end [%d]:(%p)\n",
2540 regionidx_hpa,
2541 (void *)(uintptr_t)
2542 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2543 LOG_DEBUG(VHOST_CONFIG,
2544 "in fill_hpa_regions: guest phys addr "
2545 "size [%d]:(%p)\n",
2546 regionidx_hpa,
2547 (void *)(uintptr_t)
2548 (mem_region_hpa[regionidx_hpa].memory_size));
2549 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2550 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2551 ++regionidx_hpa;
2552 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2553 next_phys_addr -
2554 mem_region_hpa[regionidx_hpa].guest_phys_address;
2555 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2556 " phys addr start[%d]:(%p)\n",
2557 regionidx_hpa,
2558 (void *)(uintptr_t)
2559 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2560 LOG_DEBUG(VHOST_CONFIG,
2561 "in fill_hpa_regions: host phys addr "
2562 "start[%d]:(%p)\n",
2563 regionidx_hpa,
2564 (void *)(uintptr_t)
2565 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2566 k = 0;
2567 } else {
2568 k += page_size;
2569 }
2570 }
2571 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2572 = mem_region_hpa[regionidx_hpa].guest_phys_address
2573 + k + page_size;
2574 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2575 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2576 "[%d]:(%p)\n", regionidx_hpa,
2577 (void *)(uintptr_t)
2578 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2579 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2580 "[%d]:(%p)\n", regionidx_hpa,
2581 (void *)(uintptr_t)
2582 (mem_region_hpa[regionidx_hpa].memory_size));
2583 ++regionidx_hpa;
2584 }
2585 return regionidx_hpa;
2586 }
2587
2588 /*
2589 * A new device is added to a data core. First the device is added to the main
2590 * linked list and then allocated to a specific data core.
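* The device is assigned VMDq RX queue (device_fh * queues_per_pool +
* vmdq_queue_base); in zero-copy mode the corresponding RX/TX hardware
* queues are also started here.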
2591 */ 2592 static int 2593 new_device (struct virtio_net *dev) 2594 { 2595 struct virtio_net_data_ll *ll_dev; 2596 int lcore, core_add = 0; 2597 uint32_t device_num_min = num_devices; 2598 struct vhost_dev *vdev; 2599 uint32_t regionidx; 2600 2601 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2602 if (vdev == NULL) { 2603 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2604 dev->device_fh); 2605 return -1; 2606 } 2607 vdev->dev = dev; 2608 dev->priv = vdev; 2609 2610 if (zero_copy) { 2611 vdev->nregions_hpa = dev->mem->nregions; 2612 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2613 vdev->nregions_hpa 2614 += check_hpa_regions( 2615 dev->mem->regions[regionidx].guest_phys_address 2616 + dev->mem->regions[regionidx].address_offset, 2617 dev->mem->regions[regionidx].memory_size); 2618 2619 } 2620 2621 vdev->regions_hpa = rte_calloc("vhost hpa region", 2622 vdev->nregions_hpa, 2623 sizeof(struct virtio_memory_regions_hpa), 2624 RTE_CACHE_LINE_SIZE); 2625 if (vdev->regions_hpa == NULL) { 2626 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2627 rte_free(vdev); 2628 return -1; 2629 } 2630 2631 2632 if (fill_hpa_memory_regions( 2633 vdev->regions_hpa, dev->mem 2634 ) != vdev->nregions_hpa) { 2635 2636 RTE_LOG(ERR, VHOST_CONFIG, 2637 "hpa memory regions number mismatch: " 2638 "[%d]\n", vdev->nregions_hpa); 2639 rte_free(vdev->regions_hpa); 2640 rte_free(vdev); 2641 return -1; 2642 } 2643 } 2644 2645 2646 /* Add device to main ll */ 2647 ll_dev = get_data_ll_free_entry(&ll_root_free); 2648 if (ll_dev == NULL) { 2649 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2650 "of %d devices per core has been reached\n", 2651 dev->device_fh, num_devices); 2652 if (vdev->regions_hpa) 2653 rte_free(vdev->regions_hpa); 2654 rte_free(vdev); 2655 return -1; 2656 } 2657 ll_dev->vdev = vdev; 2658 add_data_ll_entry(&ll_root_used, ll_dev); 2659 vdev->vmdq_rx_q 2660 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2661 2662 if (zero_copy) { 2663 uint32_t index = vdev->vmdq_rx_q; 2664 uint32_t count_in_ring, i; 2665 struct mbuf_table *tx_q; 2666 2667 count_in_ring = rte_ring_count(vpool_array[index].ring); 2668 2669 LOG_DEBUG(VHOST_CONFIG, 2670 "(%"PRIu64") in new_device: mbuf count in mempool " 2671 "before attach is: %d\n", 2672 dev->device_fh, 2673 rte_mempool_count(vpool_array[index].pool)); 2674 LOG_DEBUG(VHOST_CONFIG, 2675 "(%"PRIu64") in new_device: mbuf count in ring " 2676 "before attach is : %d\n", 2677 dev->device_fh, count_in_ring); 2678 2679 /* 2680 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
2681 */
2682 for (i = 0; i < count_in_ring; i++)
2683 attach_rxmbuf_zcp(dev);
2684
2685 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2686 "mempool after attach is: %d\n",
2687 dev->device_fh,
2688 rte_mempool_count(vpool_array[index].pool));
2689 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2690 "ring after attach is: %d\n",
2691 dev->device_fh,
2692 rte_ring_count(vpool_array[index].ring));
2693
2694 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2695 tx_q->txq_id = vdev->vmdq_rx_q;
2696
2697 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2698 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2699
2700 LOG_DEBUG(VHOST_CONFIG,
2701 "(%"PRIu64") In new_device: Failed to start "
2702 "tx queue:%d\n",
2703 dev->device_fh, vdev->vmdq_rx_q);
2704
2705 mbuf_destroy_zcp(vpool);
2706 rte_free(vdev->regions_hpa);
2707 rte_free(vdev);
2708 return -1;
2709 }
2710
2711 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2712 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2713
2714 LOG_DEBUG(VHOST_CONFIG,
2715 "(%"PRIu64") In new_device: Failed to start "
2716 "rx queue:%d\n",
2717 dev->device_fh, vdev->vmdq_rx_q);
2718
2719 /* Stop the TX queue. */
2720 if (rte_eth_dev_tx_queue_stop(ports[0],
2721 vdev->vmdq_rx_q) != 0) {
2722 LOG_DEBUG(VHOST_CONFIG,
2723 "(%"PRIu64") In new_device: Failed to "
2724 "stop tx queue:%d\n",
2725 dev->device_fh, vdev->vmdq_rx_q);
2726 }
2727
2728 mbuf_destroy_zcp(vpool);
2729 rte_free(vdev->regions_hpa);
2730 rte_free(vdev);
2731 return -1;
2732 }
2733
2734 }
2735
2736 /* Reset the ready flag. */
2737 vdev->ready = DEVICE_MAC_LEARNING;
2738 vdev->remove = 0;
2739
2740 /* Find a suitable lcore to add the device. */
2741 RTE_LCORE_FOREACH_SLAVE(lcore) {
2742 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2743 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2744 core_add = lcore;
2745 }
2746 }
2747 /* Add device to lcore ll */
2748 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2749 if (ll_dev == NULL) {
2750 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2751 vdev->ready = DEVICE_SAFE_REMOVE;
2752 destroy_device(dev);
2753 rte_free(vdev->regions_hpa);
2754 rte_free(vdev);
2755 return -1;
2756 }
2757 ll_dev->vdev = vdev;
2758 vdev->coreid = core_add;
2759
2760 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2761
2762 /* Initialize device stats */
2763 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2764
2765 /* Disable notifications. */
2766 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2767 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2768 lcore_info[vdev->coreid].lcore_ll->device_num++;
2769 dev->flags |= VIRTIO_DEV_RUNNING;
2770
2771 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2772
2773 return 0;
2774 }
2775
2776 /*
2777 * These callbacks allow devices to be added to the data core once their
2778 * configuration has fully completed.
2779 */
2780 static const struct virtio_net_device_ops virtio_net_device_ops =
2781 {
2782 .new_device = new_device,
2783 .destroy_device = destroy_device,
2784 };
2785
2786 /*
2787 * This thread wakes up periodically to print statistics if the user has
2788 * enabled them.
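* It sleeps for 'enable_stats' seconds between refreshes and, on the
* non-zero-copy path, reads the RX counters atomically.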
2789 */ 2790 static void 2791 print_stats(void) 2792 { 2793 struct virtio_net_data_ll *dev_ll; 2794 uint64_t tx_dropped, rx_dropped; 2795 uint64_t tx, tx_total, rx, rx_total; 2796 uint32_t device_fh; 2797 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2798 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2799 2800 while(1) { 2801 sleep(enable_stats); 2802 2803 /* Clear screen and move to top left */ 2804 printf("%s%s", clr, top_left); 2805 2806 printf("\nDevice statistics ===================================="); 2807 2808 dev_ll = ll_root_used; 2809 while (dev_ll != NULL) { 2810 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2811 tx_total = dev_statistics[device_fh].tx_total; 2812 tx = dev_statistics[device_fh].tx; 2813 tx_dropped = tx_total - tx; 2814 if (zero_copy == 0) { 2815 rx_total = rte_atomic64_read( 2816 &dev_statistics[device_fh].rx_total_atomic); 2817 rx = rte_atomic64_read( 2818 &dev_statistics[device_fh].rx_atomic); 2819 } else { 2820 rx_total = dev_statistics[device_fh].rx_total; 2821 rx = dev_statistics[device_fh].rx; 2822 } 2823 rx_dropped = rx_total - rx; 2824 2825 printf("\nStatistics for device %"PRIu32" ------------------------------" 2826 "\nTX total: %"PRIu64"" 2827 "\nTX dropped: %"PRIu64"" 2828 "\nTX successful: %"PRIu64"" 2829 "\nRX total: %"PRIu64"" 2830 "\nRX dropped: %"PRIu64"" 2831 "\nRX successful: %"PRIu64"", 2832 device_fh, 2833 tx_total, 2834 tx_dropped, 2835 tx, 2836 rx_total, 2837 rx_dropped, 2838 rx); 2839 2840 dev_ll = dev_ll->next; 2841 } 2842 printf("\n======================================================\n"); 2843 } 2844 } 2845 2846 static void 2847 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2848 char *ring_name, uint32_t nb_mbuf) 2849 { 2850 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2851 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2852 if (vpool_array[index].pool != NULL) { 2853 vpool_array[index].ring 2854 = rte_ring_create(ring_name, 2855 rte_align32pow2(nb_mbuf + 1), 2856 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2857 if (likely(vpool_array[index].ring != NULL)) { 2858 LOG_DEBUG(VHOST_CONFIG, 2859 "in setup_mempool_tbl: mbuf count in " 2860 "mempool is: %d\n", 2861 rte_mempool_count(vpool_array[index].pool)); 2862 LOG_DEBUG(VHOST_CONFIG, 2863 "in setup_mempool_tbl: mbuf count in " 2864 "ring is: %d\n", 2865 rte_ring_count(vpool_array[index].ring)); 2866 } else { 2867 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2868 ring_name); 2869 } 2870 2871 /* Need consider head room. */ 2872 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2873 } else { 2874 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2875 } 2876 } 2877 2878 /* When we receive a INT signal, unregister vhost driver */ 2879 static void 2880 sigint_handler(__rte_unused int signum) 2881 { 2882 /* Unregister vhost driver. */ 2883 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2884 if (ret != 0) 2885 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2886 exit(0); 2887 } 2888 2889 /* 2890 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2891 * device is also registered here to handle the IOCTLs. 
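* Rough order of operations: EAL init, argument parsing, mempool/ring
* setup, port init, linked-list init, optional stats thread, data-core
* launch, then vhost driver registration and session start.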
2892 */ 2893 int 2894 main(int argc, char *argv[]) 2895 { 2896 struct rte_mempool *mbuf_pool = NULL; 2897 unsigned lcore_id, core_id = 0; 2898 unsigned nb_ports, valid_num_ports; 2899 int ret; 2900 uint8_t portid; 2901 uint16_t queue_id; 2902 static pthread_t tid; 2903 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2904 2905 signal(SIGINT, sigint_handler); 2906 2907 /* init EAL */ 2908 ret = rte_eal_init(argc, argv); 2909 if (ret < 0) 2910 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2911 argc -= ret; 2912 argv += ret; 2913 2914 /* parse app arguments */ 2915 ret = us_vhost_parse_args(argc, argv); 2916 if (ret < 0) 2917 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2918 2919 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2920 if (rte_lcore_is_enabled(lcore_id)) 2921 lcore_ids[core_id ++] = lcore_id; 2922 2923 if (rte_lcore_count() > RTE_MAX_LCORE) 2924 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2925 2926 /*set the number of swithcing cores available*/ 2927 num_switching_cores = rte_lcore_count()-1; 2928 2929 /* Get the number of physical ports. */ 2930 nb_ports = rte_eth_dev_count(); 2931 if (nb_ports > RTE_MAX_ETHPORTS) 2932 nb_ports = RTE_MAX_ETHPORTS; 2933 2934 /* 2935 * Update the global var NUM_PORTS and global array PORTS 2936 * and get value of var VALID_NUM_PORTS according to system ports number 2937 */ 2938 valid_num_ports = check_ports_num(nb_ports); 2939 2940 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2941 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2942 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2943 return -1; 2944 } 2945 2946 if (zero_copy == 0) { 2947 /* Create the mbuf pool. */ 2948 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 2949 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 2950 0, MBUF_DATA_SIZE, rte_socket_id()); 2951 if (mbuf_pool == NULL) 2952 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2953 2954 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2955 vpool_array[queue_id].pool = mbuf_pool; 2956 2957 if (vm2vm_mode == VM2VM_HARDWARE) { 2958 /* Enable VT loop back to let L2 switch to do it. */ 2959 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2960 LOG_DEBUG(VHOST_CONFIG, 2961 "Enable loop back for L2 switch in vmdq.\n"); 2962 } 2963 } else { 2964 uint32_t nb_mbuf; 2965 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2966 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2967 2968 nb_mbuf = num_rx_descriptor 2969 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2970 + num_switching_cores * MAX_PKT_BURST; 2971 2972 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2973 snprintf(pool_name, sizeof(pool_name), 2974 "rxmbuf_pool_%u", queue_id); 2975 snprintf(ring_name, sizeof(ring_name), 2976 "rxmbuf_ring_%u", queue_id); 2977 setup_mempool_tbl(rte_socket_id(), queue_id, 2978 pool_name, ring_name, nb_mbuf); 2979 } 2980 2981 nb_mbuf = num_tx_descriptor 2982 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2983 + num_switching_cores * MAX_PKT_BURST; 2984 2985 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2986 snprintf(pool_name, sizeof(pool_name), 2987 "txmbuf_pool_%u", queue_id); 2988 snprintf(ring_name, sizeof(ring_name), 2989 "txmbuf_ring_%u", queue_id); 2990 setup_mempool_tbl(rte_socket_id(), 2991 (queue_id + MAX_QUEUES), 2992 pool_name, ring_name, nb_mbuf); 2993 } 2994 2995 if (vm2vm_mode == VM2VM_HARDWARE) { 2996 /* Enable VT loop back to let L2 switch to do it. 
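* With loop back enabled, packets destined for another VM are switched
* back by the NIC's VMDq L2 switch rather than being forwarded in software.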
*/ 2997 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2998 LOG_DEBUG(VHOST_CONFIG, 2999 "Enable loop back for L2 switch in vmdq.\n"); 3000 } 3001 } 3002 /* Set log level. */ 3003 rte_set_log_level(LOG_LEVEL); 3004 3005 /* initialize all ports */ 3006 for (portid = 0; portid < nb_ports; portid++) { 3007 /* skip ports that are not enabled */ 3008 if ((enabled_port_mask & (1 << portid)) == 0) { 3009 RTE_LOG(INFO, VHOST_PORT, 3010 "Skipping disabled port %d\n", portid); 3011 continue; 3012 } 3013 if (port_init(portid) != 0) 3014 rte_exit(EXIT_FAILURE, 3015 "Cannot initialize network ports\n"); 3016 } 3017 3018 /* Initialise all linked lists. */ 3019 if (init_data_ll() == -1) 3020 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3021 3022 /* Initialize device stats */ 3023 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3024 3025 /* Enable stats if the user option is set. */ 3026 if (enable_stats) { 3027 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3028 if (ret != 0) 3029 rte_exit(EXIT_FAILURE, 3030 "Cannot create print-stats thread\n"); 3031 3032 /* Set thread_name for aid in debugging. */ 3033 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3034 ret = rte_thread_setname(tid, thread_name); 3035 if (ret != 0) 3036 RTE_LOG(ERR, VHOST_CONFIG, 3037 "Cannot set print-stats name\n"); 3038 } 3039 3040 /* Launch all data cores. */ 3041 if (zero_copy == 0) { 3042 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3043 rte_eal_remote_launch(switch_worker, 3044 mbuf_pool, lcore_id); 3045 } 3046 } else { 3047 uint32_t count_in_mempool, index, i; 3048 for (index = 0; index < 2*MAX_QUEUES; index++) { 3049 /* For all RX and TX queues. */ 3050 count_in_mempool 3051 = rte_mempool_count(vpool_array[index].pool); 3052 3053 /* 3054 * Transfer all un-attached mbufs from vpool.pool 3055 * to vpoo.ring. 3056 */ 3057 for (i = 0; i < count_in_mempool; i++) { 3058 struct rte_mbuf *mbuf 3059 = __rte_mbuf_raw_alloc( 3060 vpool_array[index].pool); 3061 rte_ring_sp_enqueue(vpool_array[index].ring, 3062 (void *)mbuf); 3063 } 3064 3065 LOG_DEBUG(VHOST_CONFIG, 3066 "in main: mbuf count in mempool at initial " 3067 "is: %d\n", count_in_mempool); 3068 LOG_DEBUG(VHOST_CONFIG, 3069 "in main: mbuf count in ring at initial is :" 3070 " %d\n", 3071 rte_ring_count(vpool_array[index].ring)); 3072 } 3073 3074 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3075 rte_eal_remote_launch(switch_worker_zcp, NULL, 3076 lcore_id); 3077 } 3078 3079 if (mergeable == 0) 3080 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3081 3082 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3083 ret = rte_vhost_driver_register((char *)&dev_basename); 3084 if (ret != 0) 3085 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3086 3087 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3088 3089 /* Start CUSE session. */ 3090 rte_vhost_driver_session_start(); 3091 return 0; 3092 3093 } 3094