1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 #include <rte_ip.h> 54 55 #include "main.h" 56 57 #ifndef MAX_QUEUES 58 #define MAX_QUEUES 128 59 #endif 60 61 /* the maximum number of external ports supported */ 62 #define MAX_SUP_PORTS 1 63 64 /* 65 * Calculate the number of buffers needed per port 66 */ 67 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 68 (num_switching_cores*MAX_PKT_BURST) + \ 69 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 70 (num_switching_cores*MBUF_CACHE_SIZE)) 71 72 #define MBUF_CACHE_SIZE 128 73 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 74 75 /* 76 * No frame data buffer allocated from host are required for zero copy 77 * implementation, guest will allocate the frame data buffer, and vhost 78 * directly use it. 79 */ 80 #define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM 81 #define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE 82 #define MBUF_CACHE_SIZE_ZCP 0 83 84 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 85 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 86 87 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 88 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 89 90 #define JUMBO_FRAME_MAX_SIZE 0x2600 91 92 /* State of virtio device. */ 93 #define DEVICE_MAC_LEARNING 0 94 #define DEVICE_RX 1 95 #define DEVICE_SAFE_REMOVE 2 96 97 /* Config_core_flag status definitions. 
*/ 98 #define REQUEST_DEV_REMOVAL 1 99 #define ACK_DEV_REMOVAL 0 100 101 /* Configurable number of RX/TX ring descriptors */ 102 #define RTE_TEST_RX_DESC_DEFAULT 1024 103 #define RTE_TEST_TX_DESC_DEFAULT 512 104 105 /* 106 * Need refine these 2 macros for legacy and DPDK based front end: 107 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 108 * And then adjust power 2. 109 */ 110 /* 111 * For legacy front end, 128 descriptors, 112 * half for virtio header, another half for mbuf. 113 */ 114 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 115 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 116 117 /* Get first 4 bytes in mbuf headroom. */ 118 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 119 + sizeof(struct rte_mbuf))) 120 121 /* true if x is a power of 2 */ 122 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 123 124 #define INVALID_PORT_ID 0xFF 125 126 /* Max number of devices. Limited by vmdq. */ 127 #define MAX_DEVICES 64 128 129 /* Size of buffers used for snprintfs. */ 130 #define MAX_PRINT_BUFF 6072 131 132 /* Maximum character device basename size. */ 133 #define MAX_BASENAME_SZ 10 134 135 /* Maximum long option length for option parsing. */ 136 #define MAX_LONG_OPT_SZ 64 137 138 /* Used to compare MAC addresses. */ 139 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 140 141 /* Number of descriptors per cacheline. */ 142 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 143 144 #define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb)) 145 146 /* mask of enabled ports */ 147 static uint32_t enabled_port_mask = 0; 148 149 /* Promiscuous mode */ 150 static uint32_t promiscuous; 151 152 /*Number of switching cores enabled*/ 153 static uint32_t num_switching_cores = 0; 154 155 /* number of devices/queues to support*/ 156 static uint32_t num_queues = 0; 157 static uint32_t num_devices; 158 159 /* 160 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 161 * disabled on default. 162 */ 163 static uint32_t zero_copy; 164 static int mergeable; 165 166 /* Do vlan strip on host, enabled on default */ 167 static uint32_t vlan_strip = 1; 168 169 /* number of descriptors to apply*/ 170 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 171 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 172 173 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 174 #define MAX_RING_DESC 4096 175 176 struct vpool { 177 struct rte_mempool *pool; 178 struct rte_ring *ring; 179 uint32_t buf_size; 180 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 181 182 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 183 typedef enum { 184 VM2VM_DISABLED = 0, 185 VM2VM_SOFTWARE = 1, 186 VM2VM_HARDWARE = 2, 187 VM2VM_LAST 188 } vm2vm_type; 189 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 190 191 /* The type of host physical address translated from guest physical address. */ 192 typedef enum { 193 PHYS_ADDR_CONTINUOUS = 0, 194 PHYS_ADDR_CROSS_SUBREG = 1, 195 PHYS_ADDR_INVALID = 2, 196 PHYS_ADDR_LAST 197 } hpa_type; 198 199 /* Enable stats. */ 200 static uint32_t enable_stats = 0; 201 /* Enable retries on RX. */ 202 static uint32_t enable_retry = 1; 203 /* Specify timeout (in useconds) between retries on RX. */ 204 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 205 /* Specify the number of retries on RX. */ 206 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 207 208 /* Character device basename. 
Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio device to another.
		 */
		.hw_vlan_strip = 1, /**< VLAN strip enabled. */
		.jumbo_frame = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * Overridden separately in code with appropriate values.
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads of the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};
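/*
 * Illustrative sketch (not used by the sample's data path, and the function
 * name is made up for the example): how an 802.1Q tag could be inserted in
 * software using the vlan_ethhdr layout above. The data path normally lets
 * the NIC insert the tag by setting PKT_TX_VLAN_PKT and m->vlan_tci; this
 * helper only shows how the header fields line up.
 */
static inline int
vlan_insert_sw_example(struct rte_mbuf *m, uint16_t vlan_tci)
{
	struct ether_hdr orig;
	struct vlan_ethhdr *vh;

	/* Save the original Ethernet header before making room in front. */
	orig = *rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* Grow the frame by the 4 byte VLAN tag. */
	vh = (struct vlan_ethhdr *)rte_pktmbuf_prepend(m,
			sizeof(struct vlan_hdr));
	if (vh == NULL)
		return -1;

	rte_memcpy(vh->h_dest, &orig.d_addr, ETH_ALEN);
	rte_memcpy(vh->h_source, &orig.s_addr, ETH_ALEN);
	vh->h_vlan_proto = rte_cpu_to_be_16(ETH_P_8021Q);
	vh->h_vlan_TCI = rte_cpu_to_be_16(vlan_tci);
	/* The original ether type is already in network byte order. */
	vh->h_vlan_encapsulated_proto = orig.ether_type;
	return 0;
}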
/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable VLAN offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
393 */ 394 if (zero_copy) { 395 rxconf->rx_deferred_start = 1; 396 rxconf->rx_drop_en = 0; 397 txconf->tx_deferred_start = 1; 398 } 399 400 /*configure the number of supported virtio devices based on VMDQ limits */ 401 num_devices = dev_info.max_vmdq_pools; 402 403 if (zero_copy) { 404 rx_ring_size = num_rx_descriptor; 405 tx_ring_size = num_tx_descriptor; 406 tx_rings = dev_info.max_tx_queues; 407 } else { 408 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 409 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 410 tx_rings = (uint16_t)rte_lcore_count(); 411 } 412 413 retval = validate_num_devices(MAX_DEVICES); 414 if (retval < 0) 415 return retval; 416 417 /* Get port configuration. */ 418 retval = get_eth_conf(&port_conf, num_devices); 419 if (retval < 0) 420 return retval; 421 /* NIC queues are divided into pf queues and vmdq queues. */ 422 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 423 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 424 num_vmdq_queues = num_devices * queues_per_pool; 425 num_queues = num_pf_queues + num_vmdq_queues; 426 vmdq_queue_base = dev_info.vmdq_queue_base; 427 vmdq_pool_base = dev_info.vmdq_pool_base; 428 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 429 num_pf_queues, num_devices, queues_per_pool); 430 431 if (port >= rte_eth_dev_count()) return -1; 432 433 rx_rings = (uint16_t)dev_info.max_rx_queues; 434 /* Configure ethernet device. */ 435 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 436 if (retval != 0) 437 return retval; 438 439 /* Setup the queues. */ 440 for (q = 0; q < rx_rings; q ++) { 441 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 442 rte_eth_dev_socket_id(port), 443 rxconf, 444 vpool_array[q].pool); 445 if (retval < 0) 446 return retval; 447 } 448 for (q = 0; q < tx_rings; q ++) { 449 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 450 rte_eth_dev_socket_id(port), 451 txconf); 452 if (retval < 0) 453 return retval; 454 } 455 456 /* Start the device. */ 457 retval = rte_eth_dev_start(port); 458 if (retval < 0) { 459 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 460 return retval; 461 } 462 463 if (promiscuous) 464 rte_eth_promiscuous_enable(port); 465 466 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 467 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 468 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 469 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 470 (unsigned)port, 471 vmdq_ports_eth_addr[port].addr_bytes[0], 472 vmdq_ports_eth_addr[port].addr_bytes[1], 473 vmdq_ports_eth_addr[port].addr_bytes[2], 474 vmdq_ports_eth_addr[port].addr_bytes[3], 475 vmdq_ports_eth_addr[port].addr_bytes[4], 476 vmdq_ports_eth_addr[port].addr_bytes[5]); 477 478 return 0; 479 } 480 481 /* 482 * Set character device basename. 483 */ 484 static int 485 us_vhost_parse_basename(const char *q_arg) 486 { 487 /* parse number string */ 488 489 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 490 return -1; 491 else 492 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 493 494 return 0; 495 } 496 497 /* 498 * Parse the portmask provided at run time. 
499 */ 500 static int 501 parse_portmask(const char *portmask) 502 { 503 char *end = NULL; 504 unsigned long pm; 505 506 errno = 0; 507 508 /* parse hexadecimal string */ 509 pm = strtoul(portmask, &end, 16); 510 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 511 return -1; 512 513 if (pm == 0) 514 return -1; 515 516 return pm; 517 518 } 519 520 /* 521 * Parse num options at run time. 522 */ 523 static int 524 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 525 { 526 char *end = NULL; 527 unsigned long num; 528 529 errno = 0; 530 531 /* parse unsigned int string */ 532 num = strtoul(q_arg, &end, 10); 533 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 534 return -1; 535 536 if (num > max_valid_value) 537 return -1; 538 539 return num; 540 541 } 542 543 /* 544 * Display usage 545 */ 546 static void 547 us_vhost_usage(const char *prgname) 548 { 549 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 550 " --vm2vm [0|1|2]\n" 551 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 552 " --dev-basename <name>\n" 553 " --nb-devices ND\n" 554 " -p PORTMASK: Set mask for ports to be used by application\n" 555 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 556 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 557 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 558 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 559 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 560 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n" 561 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 562 " --dev-basename: The basename to be used for the character device.\n" 563 " --zero-copy [0|1]: disable(default)/enable rx/tx " 564 "zero copy\n" 565 " --rx-desc-num [0-N]: the number of descriptors on rx, " 566 "used only when zero copy is enabled.\n" 567 " --tx-desc-num [0-N]: the number of descriptors on tx, " 568 "used only when zero copy is enabled.\n", 569 prgname); 570 } 571 572 /* 573 * Parse the arguments given in the command line of the application. 
574 */ 575 static int 576 us_vhost_parse_args(int argc, char **argv) 577 { 578 int opt, ret; 579 int option_index; 580 unsigned i; 581 const char *prgname = argv[0]; 582 static struct option long_option[] = { 583 {"vm2vm", required_argument, NULL, 0}, 584 {"rx-retry", required_argument, NULL, 0}, 585 {"rx-retry-delay", required_argument, NULL, 0}, 586 {"rx-retry-num", required_argument, NULL, 0}, 587 {"mergeable", required_argument, NULL, 0}, 588 {"vlan-strip", required_argument, NULL, 0}, 589 {"stats", required_argument, NULL, 0}, 590 {"dev-basename", required_argument, NULL, 0}, 591 {"zero-copy", required_argument, NULL, 0}, 592 {"rx-desc-num", required_argument, NULL, 0}, 593 {"tx-desc-num", required_argument, NULL, 0}, 594 {NULL, 0, 0, 0}, 595 }; 596 597 /* Parse command line */ 598 while ((opt = getopt_long(argc, argv, "p:P", 599 long_option, &option_index)) != EOF) { 600 switch (opt) { 601 /* Portmask */ 602 case 'p': 603 enabled_port_mask = parse_portmask(optarg); 604 if (enabled_port_mask == 0) { 605 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 606 us_vhost_usage(prgname); 607 return -1; 608 } 609 break; 610 611 case 'P': 612 promiscuous = 1; 613 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 614 ETH_VMDQ_ACCEPT_BROADCAST | 615 ETH_VMDQ_ACCEPT_MULTICAST; 616 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 617 618 break; 619 620 case 0: 621 /* Enable/disable vm2vm comms. */ 622 if (!strncmp(long_option[option_index].name, "vm2vm", 623 MAX_LONG_OPT_SZ)) { 624 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 625 if (ret == -1) { 626 RTE_LOG(INFO, VHOST_CONFIG, 627 "Invalid argument for " 628 "vm2vm [0|1|2]\n"); 629 us_vhost_usage(prgname); 630 return -1; 631 } else { 632 vm2vm_mode = (vm2vm_type)ret; 633 } 634 } 635 636 /* Enable/disable retries on RX. */ 637 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 638 ret = parse_num_opt(optarg, 1); 639 if (ret == -1) { 640 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 641 us_vhost_usage(prgname); 642 return -1; 643 } else { 644 enable_retry = ret; 645 } 646 } 647 648 /* Specify the retries delay time (in useconds) on RX. */ 649 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 650 ret = parse_num_opt(optarg, INT32_MAX); 651 if (ret == -1) { 652 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 653 us_vhost_usage(prgname); 654 return -1; 655 } else { 656 burst_rx_delay_time = ret; 657 } 658 } 659 660 /* Specify the retries number on RX. */ 661 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 662 ret = parse_num_opt(optarg, INT32_MAX); 663 if (ret == -1) { 664 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 665 us_vhost_usage(prgname); 666 return -1; 667 } else { 668 burst_rx_retry_num = ret; 669 } 670 } 671 672 /* Enable/disable RX mergeable buffers. */ 673 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 674 ret = parse_num_opt(optarg, 1); 675 if (ret == -1) { 676 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 677 us_vhost_usage(prgname); 678 return -1; 679 } else { 680 mergeable = !!ret; 681 if (ret) { 682 vmdq_conf_default.rxmode.jumbo_frame = 1; 683 vmdq_conf_default.rxmode.max_rx_pkt_len 684 = JUMBO_FRAME_MAX_SIZE; 685 } 686 } 687 } 688 689 /* Enable/disable RX VLAN strip on host. 
*/ 690 if (!strncmp(long_option[option_index].name, 691 "vlan-strip", MAX_LONG_OPT_SZ)) { 692 ret = parse_num_opt(optarg, 1); 693 if (ret == -1) { 694 RTE_LOG(INFO, VHOST_CONFIG, 695 "Invalid argument for VLAN strip [0|1]\n"); 696 us_vhost_usage(prgname); 697 return -1; 698 } else { 699 vlan_strip = !!ret; 700 vmdq_conf_default.rxmode.hw_vlan_strip = 701 vlan_strip; 702 } 703 } 704 705 /* Enable/disable stats. */ 706 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 707 ret = parse_num_opt(optarg, INT32_MAX); 708 if (ret == -1) { 709 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 710 us_vhost_usage(prgname); 711 return -1; 712 } else { 713 enable_stats = ret; 714 } 715 } 716 717 /* Set character device basename. */ 718 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 719 if (us_vhost_parse_basename(optarg) == -1) { 720 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 721 us_vhost_usage(prgname); 722 return -1; 723 } 724 } 725 726 /* Enable/disable rx/tx zero copy. */ 727 if (!strncmp(long_option[option_index].name, 728 "zero-copy", MAX_LONG_OPT_SZ)) { 729 ret = parse_num_opt(optarg, 1); 730 if (ret == -1) { 731 RTE_LOG(INFO, VHOST_CONFIG, 732 "Invalid argument" 733 " for zero-copy [0|1]\n"); 734 us_vhost_usage(prgname); 735 return -1; 736 } else 737 zero_copy = ret; 738 } 739 740 /* Specify the descriptor number on RX. */ 741 if (!strncmp(long_option[option_index].name, 742 "rx-desc-num", MAX_LONG_OPT_SZ)) { 743 ret = parse_num_opt(optarg, MAX_RING_DESC); 744 if ((ret == -1) || (!POWEROF2(ret))) { 745 RTE_LOG(INFO, VHOST_CONFIG, 746 "Invalid argument for rx-desc-num[0-N]," 747 "power of 2 required.\n"); 748 us_vhost_usage(prgname); 749 return -1; 750 } else { 751 num_rx_descriptor = ret; 752 } 753 } 754 755 /* Specify the descriptor number on TX. */ 756 if (!strncmp(long_option[option_index].name, 757 "tx-desc-num", MAX_LONG_OPT_SZ)) { 758 ret = parse_num_opt(optarg, MAX_RING_DESC); 759 if ((ret == -1) || (!POWEROF2(ret))) { 760 RTE_LOG(INFO, VHOST_CONFIG, 761 "Invalid argument for tx-desc-num [0-N]," 762 "power of 2 required.\n"); 763 us_vhost_usage(prgname); 764 return -1; 765 } else { 766 num_tx_descriptor = ret; 767 } 768 } 769 770 break; 771 772 /* Invalid option - print options. 
*/ 773 default: 774 us_vhost_usage(prgname); 775 return -1; 776 } 777 } 778 779 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 780 if (enabled_port_mask & (1 << i)) 781 ports[num_ports++] = (uint8_t)i; 782 } 783 784 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 785 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 786 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 787 return -1; 788 } 789 790 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 791 RTE_LOG(INFO, VHOST_PORT, 792 "Vhost zero copy doesn't support software vm2vm," 793 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 794 return -1; 795 } 796 797 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 798 RTE_LOG(INFO, VHOST_PORT, 799 "Vhost zero copy doesn't support jumbo frame," 800 "please specify '--mergeable 0' to disable the " 801 "mergeable feature.\n"); 802 return -1; 803 } 804 805 return 0; 806 } 807 808 /* 809 * Update the global var NUM_PORTS and array PORTS according to system ports number 810 * and return valid ports number 811 */ 812 static unsigned check_ports_num(unsigned nb_ports) 813 { 814 unsigned valid_num_ports = num_ports; 815 unsigned portid; 816 817 if (num_ports > nb_ports) { 818 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 819 num_ports, nb_ports); 820 num_ports = nb_ports; 821 } 822 823 for (portid = 0; portid < num_ports; portid ++) { 824 if (ports[portid] >= nb_ports) { 825 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 826 ports[portid], (nb_ports - 1)); 827 ports[portid] = INVALID_PORT_ID; 828 valid_num_ports--; 829 } 830 } 831 return valid_num_ports; 832 } 833 834 /* 835 * Macro to print out packet contents. Wrapped in debug define so that the 836 * data path is not effected when debug is disabled. 837 */ 838 #ifdef DEBUG 839 #define PRINT_PACKET(device, addr, size, header) do { \ 840 char *pkt_addr = (char*)(addr); \ 841 unsigned int index; \ 842 char packet[MAX_PRINT_BUFF]; \ 843 \ 844 if ((header)) \ 845 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 846 else \ 847 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 848 for (index = 0; index < (size); index++) { \ 849 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 850 "%02hhx ", pkt_addr[index]); \ 851 } \ 852 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 853 \ 854 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 855 } while(0) 856 #else 857 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 858 #endif 859 860 /* 861 * Function to convert guest physical addresses to vhost physical addresses. 862 * This is used to convert virtio buffer addresses. 
863 */ 864 static inline uint64_t __attribute__((always_inline)) 865 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 866 uint32_t buf_len, hpa_type *addr_type) 867 { 868 struct virtio_memory_regions_hpa *region; 869 uint32_t regionidx; 870 uint64_t vhost_pa = 0; 871 872 *addr_type = PHYS_ADDR_INVALID; 873 874 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 875 region = &vdev->regions_hpa[regionidx]; 876 if ((guest_pa >= region->guest_phys_address) && 877 (guest_pa <= region->guest_phys_address_end)) { 878 vhost_pa = region->host_phys_addr_offset + guest_pa; 879 if (likely((guest_pa + buf_len - 1) 880 <= region->guest_phys_address_end)) 881 *addr_type = PHYS_ADDR_CONTINUOUS; 882 else 883 *addr_type = PHYS_ADDR_CROSS_SUBREG; 884 break; 885 } 886 } 887 888 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 889 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 890 (void *)(uintptr_t)vhost_pa); 891 892 return vhost_pa; 893 } 894 895 /* 896 * Compares a packet destination MAC address to a device MAC address. 897 */ 898 static inline int __attribute__((always_inline)) 899 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 900 { 901 return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0; 902 } 903 904 /* 905 * This function learns the MAC address of the device and registers this along with a 906 * vlan tag to a VMDQ. 907 */ 908 static int 909 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 910 { 911 struct ether_hdr *pkt_hdr; 912 struct virtio_net_data_ll *dev_ll; 913 struct virtio_net *dev = vdev->dev; 914 int i, ret; 915 916 /* Learn MAC address of guest device from packet */ 917 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 918 919 dev_ll = ll_root_used; 920 921 while (dev_ll != NULL) { 922 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 923 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 924 return -1; 925 } 926 dev_ll = dev_ll->next; 927 } 928 929 for (i = 0; i < ETHER_ADDR_LEN; i++) 930 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 931 932 /* vlan_tag currently uses the device_id. */ 933 vdev->vlan_tag = vlan_tags[dev->device_fh]; 934 935 /* Print out VMDQ registration info. */ 936 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 937 dev->device_fh, 938 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 939 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 940 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 941 vdev->vlan_tag); 942 943 /* Register the MAC address. */ 944 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 945 (uint32_t)dev->device_fh + vmdq_pool_base); 946 if (ret) 947 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 948 dev->device_fh); 949 950 /* Enable stripping of the vlan tag as we handle routing. */ 951 if (vlan_strip) 952 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 953 (uint16_t)vdev->vmdq_rx_q, 1); 954 955 /* Set device as ready for RX. */ 956 vdev->ready = DEVICE_RX; 957 958 return 0; 959 } 960 961 /* 962 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 963 * queue before disabling RX on the device. 
964 */ 965 static inline void 966 unlink_vmdq(struct vhost_dev *vdev) 967 { 968 unsigned i = 0; 969 unsigned rx_count; 970 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 971 972 if (vdev->ready == DEVICE_RX) { 973 /*clear MAC and VLAN settings*/ 974 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 975 for (i = 0; i < 6; i++) 976 vdev->mac_address.addr_bytes[i] = 0; 977 978 vdev->vlan_tag = 0; 979 980 /*Clear out the receive buffers*/ 981 rx_count = rte_eth_rx_burst(ports[0], 982 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 983 984 while (rx_count) { 985 for (i = 0; i < rx_count; i++) 986 rte_pktmbuf_free(pkts_burst[i]); 987 988 rx_count = rte_eth_rx_burst(ports[0], 989 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 990 } 991 992 vdev->ready = DEVICE_MAC_LEARNING; 993 } 994 } 995 996 /* 997 * Check if the packet destination MAC address is for a local device. If so then put 998 * the packet on that devices RX queue. If not then return. 999 */ 1000 static inline int __attribute__((always_inline)) 1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1002 { 1003 struct virtio_net_data_ll *dev_ll; 1004 struct ether_hdr *pkt_hdr; 1005 uint64_t ret = 0; 1006 struct virtio_net *dev = vdev->dev; 1007 struct virtio_net *tdev; /* destination virito device */ 1008 1009 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1010 1011 /*get the used devices list*/ 1012 dev_ll = ll_root_used; 1013 1014 while (dev_ll != NULL) { 1015 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1016 &dev_ll->vdev->mac_address)) { 1017 1018 /* Drop the packet if the TX packet is destined for the TX device. */ 1019 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1020 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1021 dev->device_fh); 1022 return 0; 1023 } 1024 tdev = dev_ll->vdev->dev; 1025 1026 1027 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1028 1029 if (unlikely(dev_ll->vdev->remove)) { 1030 /*drop the packet if the device is marked for removal*/ 1031 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1032 } else { 1033 /*send the packet to the local virtio device*/ 1034 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1035 if (enable_stats) { 1036 rte_atomic64_add( 1037 &dev_statistics[tdev->device_fh].rx_total_atomic, 1038 1); 1039 rte_atomic64_add( 1040 &dev_statistics[tdev->device_fh].rx_atomic, 1041 ret); 1042 dev_statistics[dev->device_fh].tx_total++; 1043 dev_statistics[dev->device_fh].tx += ret; 1044 } 1045 } 1046 1047 return 0; 1048 } 1049 dev_ll = dev_ll->next; 1050 } 1051 1052 return -1; 1053 } 1054 1055 /* 1056 * Check if the destination MAC of a packet is one local VM, 1057 * and get its vlan tag, and offset if it is. 1058 */ 1059 static inline int __attribute__((always_inline)) 1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1061 uint32_t *offset, uint16_t *vlan_tag) 1062 { 1063 struct virtio_net_data_ll *dev_ll = ll_root_used; 1064 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1065 1066 while (dev_ll != NULL) { 1067 if ((dev_ll->vdev->ready == DEVICE_RX) 1068 && ether_addr_cmp(&(pkt_hdr->d_addr), 1069 &dev_ll->vdev->mac_address)) { 1070 /* 1071 * Drop the packet if the TX packet is 1072 * destined for the TX device. 
1073 */ 1074 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1075 LOG_DEBUG(VHOST_DATA, 1076 "(%"PRIu64") TX: Source and destination" 1077 " MAC addresses are the same. Dropping " 1078 "packet.\n", 1079 dev_ll->vdev->dev->device_fh); 1080 return -1; 1081 } 1082 1083 /* 1084 * HW vlan strip will reduce the packet length 1085 * by minus length of vlan tag, so need restore 1086 * the packet length by plus it. 1087 */ 1088 *offset = VLAN_HLEN; 1089 *vlan_tag = 1090 (uint16_t) 1091 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1092 1093 LOG_DEBUG(VHOST_DATA, 1094 "(%"PRIu64") TX: pkt to local VM device id:" 1095 "(%"PRIu64") vlan tag: %d.\n", 1096 dev->device_fh, dev_ll->vdev->dev->device_fh, 1097 (int)*vlan_tag); 1098 1099 break; 1100 } 1101 dev_ll = dev_ll->next; 1102 } 1103 return 0; 1104 } 1105 1106 /* 1107 * This function routes the TX packet to the correct interface. This may be a local device 1108 * or the physical port. 1109 */ 1110 static inline void __attribute__((always_inline)) 1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1112 { 1113 struct mbuf_table *tx_q; 1114 struct rte_mbuf **m_table; 1115 unsigned len, ret, offset = 0; 1116 const uint16_t lcore_id = rte_lcore_id(); 1117 struct virtio_net *dev = vdev->dev; 1118 struct ether_hdr *nh; 1119 1120 /*check if destination is local VM*/ 1121 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1122 rte_pktmbuf_free(m); 1123 return; 1124 } 1125 1126 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1127 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1128 rte_pktmbuf_free(m); 1129 return; 1130 } 1131 } 1132 1133 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1134 1135 /*Add packet to the port tx queue*/ 1136 tx_q = &lcore_tx_queue[lcore_id]; 1137 len = tx_q->len; 1138 1139 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1140 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1141 /* Guest has inserted the vlan tag. */ 1142 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1143 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1144 if ((vm2vm_mode == VM2VM_HARDWARE) && 1145 (vh->vlan_tci != vlan_tag_be)) 1146 vh->vlan_tci = vlan_tag_be; 1147 } else { 1148 m->ol_flags = PKT_TX_VLAN_PKT; 1149 1150 /* 1151 * Find the right seg to adjust the data len when offset is 1152 * bigger than tail room size. 1153 */ 1154 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1155 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1156 m->data_len += offset; 1157 else { 1158 struct rte_mbuf *seg = m; 1159 1160 while ((seg->next != NULL) && 1161 (offset > rte_pktmbuf_tailroom(seg))) 1162 seg = seg->next; 1163 1164 seg->data_len += offset; 1165 } 1166 m->pkt_len += offset; 1167 } 1168 1169 m->vlan_tci = vlan_tag; 1170 } 1171 1172 tx_q->m_table[len] = m; 1173 len++; 1174 if (enable_stats) { 1175 dev_statistics[dev->device_fh].tx_total++; 1176 dev_statistics[dev->device_fh].tx++; 1177 } 1178 1179 if (unlikely(len == MAX_PKT_BURST)) { 1180 m_table = (struct rte_mbuf **)tx_q->m_table; 1181 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1182 /* Free any buffers not handled by TX and update the port stats. */ 1183 if (unlikely(ret < len)) { 1184 do { 1185 rte_pktmbuf_free(m_table[ret]); 1186 } while (++ret < len); 1187 } 1188 1189 len = 0; 1190 } 1191 1192 tx_q->len = len; 1193 return; 1194 } 1195 /* 1196 * This function is called by each data core. 
 * It handles all RX/TX registered with the core. For TX the specific lcore
 * linked list is used. For RX, MAC addresses are compared with all devices
 * in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* TX any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);
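		/*
		 * Each pass of this loop does three things for the core: the
		 * TX drain above flushes packets that have waited longer than
		 * BURST_TX_DRAIN_US in the port TX queue, the check below
		 * acknowledges a pending device removal request from the
		 * configuration core, and the remainder walks this core's
		 * device list handling guest RX and then guest TX.
		 */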
1263 */ 1264 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1265 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1266 1267 /* 1268 * Process devices 1269 */ 1270 dev_ll = lcore_ll->ll_root_used; 1271 1272 while (dev_ll != NULL) { 1273 /*get virtio device ID*/ 1274 vdev = dev_ll->vdev; 1275 dev = vdev->dev; 1276 1277 if (unlikely(vdev->remove)) { 1278 dev_ll = dev_ll->next; 1279 unlink_vmdq(vdev); 1280 vdev->ready = DEVICE_SAFE_REMOVE; 1281 continue; 1282 } 1283 if (likely(vdev->ready == DEVICE_RX)) { 1284 /*Handle guest RX*/ 1285 rx_count = rte_eth_rx_burst(ports[0], 1286 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1287 1288 if (rx_count) { 1289 /* 1290 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1291 * Here MAX_PKT_BURST must be less than virtio queue size 1292 */ 1293 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1294 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1295 rte_delay_us(burst_rx_delay_time); 1296 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1297 break; 1298 } 1299 } 1300 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1301 if (enable_stats) { 1302 rte_atomic64_add( 1303 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1304 rx_count); 1305 rte_atomic64_add( 1306 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1307 } 1308 while (likely(rx_count)) { 1309 rx_count--; 1310 rte_pktmbuf_free(pkts_burst[rx_count]); 1311 } 1312 1313 } 1314 } 1315 1316 if (likely(!vdev->remove)) { 1317 /* Handle guest TX*/ 1318 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1319 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1320 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1321 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1322 while (tx_count) 1323 rte_pktmbuf_free(pkts_burst[--tx_count]); 1324 } 1325 } 1326 while (tx_count) 1327 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh); 1328 } 1329 1330 /*move to the next device in the list*/ 1331 dev_ll = dev_ll->next; 1332 } 1333 } 1334 1335 return 0; 1336 } 1337 1338 /* 1339 * This function gets available ring number for zero copy rx. 1340 * Only one thread will call this funciton for a paticular virtio device, 1341 * so, it is designed as non-thread-safe function. 1342 */ 1343 static inline uint32_t __attribute__((always_inline)) 1344 get_available_ring_num_zcp(struct virtio_net *dev) 1345 { 1346 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1347 uint16_t avail_idx; 1348 1349 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1350 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1351 } 1352 1353 /* 1354 * This function gets available ring index for zero copy rx, 1355 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1356 * Only one thread will call this funciton for a paticular virtio device, 1357 * so, it is designed as non-thread-safe function. 
1358 */ 1359 static inline uint32_t __attribute__((always_inline)) 1360 get_available_ring_index_zcp(struct virtio_net *dev, 1361 uint16_t *res_base_idx, uint32_t count) 1362 { 1363 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1364 uint16_t avail_idx; 1365 uint32_t retry = 0; 1366 uint16_t free_entries; 1367 1368 *res_base_idx = vq->last_used_idx_res; 1369 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1370 free_entries = (avail_idx - *res_base_idx); 1371 1372 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1373 "avail idx: %d, " 1374 "res base idx:%d, free entries:%d\n", 1375 dev->device_fh, avail_idx, *res_base_idx, 1376 free_entries); 1377 1378 /* 1379 * If retry is enabled and the queue is full then we wait 1380 * and retry to avoid packet loss. 1381 */ 1382 if (enable_retry && unlikely(count > free_entries)) { 1383 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1384 rte_delay_us(burst_rx_delay_time); 1385 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1386 free_entries = (avail_idx - *res_base_idx); 1387 if (count <= free_entries) 1388 break; 1389 } 1390 } 1391 1392 /*check that we have enough buffers*/ 1393 if (unlikely(count > free_entries)) 1394 count = free_entries; 1395 1396 if (unlikely(count == 0)) { 1397 LOG_DEBUG(VHOST_DATA, 1398 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1399 "avail idx: %d, res base idx:%d, free entries:%d\n", 1400 dev->device_fh, avail_idx, 1401 *res_base_idx, free_entries); 1402 return 0; 1403 } 1404 1405 vq->last_used_idx_res = *res_base_idx + count; 1406 1407 return count; 1408 } 1409 1410 /* 1411 * This function put descriptor back to used list. 1412 */ 1413 static inline void __attribute__((always_inline)) 1414 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1415 { 1416 uint16_t res_cur_idx = vq->last_used_idx; 1417 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1418 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1419 rte_compiler_barrier(); 1420 *(volatile uint16_t *)&vq->used->idx += 1; 1421 vq->last_used_idx += 1; 1422 1423 /* Kick the guest if necessary. */ 1424 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1425 eventfd_write(vq->callfd, (eventfd_t)1); 1426 } 1427 1428 /* 1429 * This function get available descriptor from vitio vring and un-attached mbuf 1430 * from vpool->ring, and then attach them together. It needs adjust the offset 1431 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1432 * frame data may be put to wrong location in mbuf. 
1433 */ 1434 static inline void __attribute__((always_inline)) 1435 attach_rxmbuf_zcp(struct virtio_net *dev) 1436 { 1437 uint16_t res_base_idx, desc_idx; 1438 uint64_t buff_addr, phys_addr; 1439 struct vhost_virtqueue *vq; 1440 struct vring_desc *desc; 1441 void *obj = NULL; 1442 struct rte_mbuf *mbuf; 1443 struct vpool *vpool; 1444 hpa_type addr_type; 1445 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1446 1447 vpool = &vpool_array[vdev->vmdq_rx_q]; 1448 vq = dev->virtqueue[VIRTIO_RXQ]; 1449 1450 do { 1451 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1452 1) != 1)) 1453 return; 1454 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1455 1456 desc = &vq->desc[desc_idx]; 1457 if (desc->flags & VRING_DESC_F_NEXT) { 1458 desc = &vq->desc[desc->next]; 1459 buff_addr = gpa_to_vva(dev, desc->addr); 1460 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1461 &addr_type); 1462 } else { 1463 buff_addr = gpa_to_vva(dev, 1464 desc->addr + vq->vhost_hlen); 1465 phys_addr = gpa_to_hpa(vdev, 1466 desc->addr + vq->vhost_hlen, 1467 desc->len, &addr_type); 1468 } 1469 1470 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1471 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1472 " address found when attaching RX frame buffer" 1473 " address!\n", dev->device_fh); 1474 put_desc_to_used_list_zcp(vq, desc_idx); 1475 continue; 1476 } 1477 1478 /* 1479 * Check if the frame buffer address from guest crosses 1480 * sub-region or not. 1481 */ 1482 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1483 RTE_LOG(ERR, VHOST_DATA, 1484 "(%"PRIu64") Frame buffer address cross " 1485 "sub-regioin found when attaching RX frame " 1486 "buffer address!\n", 1487 dev->device_fh); 1488 put_desc_to_used_list_zcp(vq, desc_idx); 1489 continue; 1490 } 1491 } while (unlikely(phys_addr == 0)); 1492 1493 rte_ring_sc_dequeue(vpool->ring, &obj); 1494 mbuf = obj; 1495 if (unlikely(mbuf == NULL)) { 1496 LOG_DEBUG(VHOST_DATA, 1497 "(%"PRIu64") in attach_rxmbuf_zcp: " 1498 "ring_sc_dequeue fail.\n", 1499 dev->device_fh); 1500 put_desc_to_used_list_zcp(vq, desc_idx); 1501 return; 1502 } 1503 1504 if (unlikely(vpool->buf_size > desc->len)) { 1505 LOG_DEBUG(VHOST_DATA, 1506 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1507 "length(%d) of descriptor idx: %d less than room " 1508 "size required: %d\n", 1509 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1510 put_desc_to_used_list_zcp(vq, desc_idx); 1511 rte_ring_sp_enqueue(vpool->ring, obj); 1512 return; 1513 } 1514 1515 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1516 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1517 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1518 mbuf->data_len = desc->len; 1519 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1520 1521 LOG_DEBUG(VHOST_DATA, 1522 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1523 "descriptor idx:%d\n", 1524 dev->device_fh, res_base_idx, desc_idx); 1525 1526 __rte_mbuf_raw_free(mbuf); 1527 1528 return; 1529 } 1530 1531 /* 1532 * Detach an attched packet mbuf - 1533 * - restore original mbuf address and length values. 1534 * - reset pktmbuf data and data_len to their default values. 1535 * All other fields of the given packet mbuf will be left intact. 1536 * 1537 * @param m 1538 * The attached packet mbuf. 
1539 */ 1540 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1541 { 1542 const struct rte_mempool *mp = m->pool; 1543 void *buf = rte_mbuf_to_baddr(m); 1544 uint32_t buf_ofs; 1545 uint32_t buf_len = mp->elt_size - sizeof(*m); 1546 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1547 1548 m->buf_addr = buf; 1549 m->buf_len = (uint16_t)buf_len; 1550 1551 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1552 RTE_PKTMBUF_HEADROOM : m->buf_len; 1553 m->data_off = buf_ofs; 1554 1555 m->data_len = 0; 1556 } 1557 1558 /* 1559 * This function is called after packets have been transimited. It fetchs mbuf 1560 * from vpool->pool, detached it and put into vpool->ring. It also update the 1561 * used index and kick the guest if necessary. 1562 */ 1563 static inline uint32_t __attribute__((always_inline)) 1564 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1565 { 1566 struct rte_mbuf *mbuf; 1567 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1568 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1569 uint32_t index = 0; 1570 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1571 1572 LOG_DEBUG(VHOST_DATA, 1573 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1574 "clean is: %d\n", 1575 dev->device_fh, mbuf_count); 1576 LOG_DEBUG(VHOST_DATA, 1577 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1578 "clean is : %d\n", 1579 dev->device_fh, rte_ring_count(vpool->ring)); 1580 1581 for (index = 0; index < mbuf_count; index++) { 1582 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1583 if (likely(MBUF_EXT_MEM(mbuf))) 1584 pktmbuf_detach_zcp(mbuf); 1585 rte_ring_sp_enqueue(vpool->ring, mbuf); 1586 1587 /* Update used index buffer information. */ 1588 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1589 vq->used->ring[used_idx].len = 0; 1590 1591 used_idx = (used_idx + 1) & (vq->size - 1); 1592 } 1593 1594 LOG_DEBUG(VHOST_DATA, 1595 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1596 "clean is: %d\n", 1597 dev->device_fh, rte_mempool_count(vpool->pool)); 1598 LOG_DEBUG(VHOST_DATA, 1599 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1600 "clean is : %d\n", 1601 dev->device_fh, rte_ring_count(vpool->ring)); 1602 LOG_DEBUG(VHOST_DATA, 1603 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1604 "vq->last_used_idx:%d\n", 1605 dev->device_fh, vq->last_used_idx); 1606 1607 vq->last_used_idx += mbuf_count; 1608 1609 LOG_DEBUG(VHOST_DATA, 1610 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1611 "vq->last_used_idx:%d\n", 1612 dev->device_fh, vq->last_used_idx); 1613 1614 rte_compiler_barrier(); 1615 1616 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1617 1618 /* Kick guest if required. */ 1619 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1620 eventfd_write(vq->callfd, (eventfd_t)1); 1621 1622 return 0; 1623 } 1624 1625 /* 1626 * This function is called when a virtio device is destroy. 1627 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1628 */ 1629 static void mbuf_destroy_zcp(struct vpool *vpool) 1630 { 1631 struct rte_mbuf *mbuf = NULL; 1632 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1633 1634 LOG_DEBUG(VHOST_CONFIG, 1635 "in mbuf_destroy_zcp: mbuf count in mempool before " 1636 "mbuf_destroy_zcp is: %d\n", 1637 mbuf_count); 1638 LOG_DEBUG(VHOST_CONFIG, 1639 "in mbuf_destroy_zcp: mbuf count in ring before " 1640 "mbuf_destroy_zcp is : %d\n", 1641 rte_ring_count(vpool->ring)); 1642 1643 for (index = 0; index < mbuf_count; index++) { 1644 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1645 if (likely(mbuf != NULL)) { 1646 if (likely(MBUF_EXT_MEM(mbuf))) 1647 pktmbuf_detach_zcp(mbuf); 1648 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1649 } 1650 } 1651 1652 LOG_DEBUG(VHOST_CONFIG, 1653 "in mbuf_destroy_zcp: mbuf count in mempool after " 1654 "mbuf_destroy_zcp is: %d\n", 1655 rte_mempool_count(vpool->pool)); 1656 LOG_DEBUG(VHOST_CONFIG, 1657 "in mbuf_destroy_zcp: mbuf count in ring after " 1658 "mbuf_destroy_zcp is : %d\n", 1659 rte_ring_count(vpool->ring)); 1660 } 1661 1662 /* 1663 * This function update the use flag and counter. 1664 */ 1665 static inline uint32_t __attribute__((always_inline)) 1666 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1667 uint32_t count) 1668 { 1669 struct vhost_virtqueue *vq; 1670 struct vring_desc *desc; 1671 struct rte_mbuf *buff; 1672 /* The virtio_hdr is initialised to 0. */ 1673 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1674 = {{0, 0, 0, 0, 0, 0}, 0}; 1675 uint64_t buff_hdr_addr = 0; 1676 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1677 uint32_t head_idx, packet_success = 0; 1678 uint16_t res_cur_idx; 1679 1680 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1681 1682 if (count == 0) 1683 return 0; 1684 1685 vq = dev->virtqueue[VIRTIO_RXQ]; 1686 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1687 1688 res_cur_idx = vq->last_used_idx; 1689 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1690 dev->device_fh, res_cur_idx, res_cur_idx + count); 1691 1692 /* Retrieve all of the head indexes first to avoid caching issues. */ 1693 for (head_idx = 0; head_idx < count; head_idx++) 1694 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1695 1696 /*Prefetch descriptor index. */ 1697 rte_prefetch0(&vq->desc[head[packet_success]]); 1698 1699 while (packet_success != count) { 1700 /* Get descriptor from available ring */ 1701 desc = &vq->desc[head[packet_success]]; 1702 1703 buff = pkts[packet_success]; 1704 LOG_DEBUG(VHOST_DATA, 1705 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1706 "pkt[%d] descriptor idx: %d\n", 1707 dev->device_fh, packet_success, 1708 MBUF_HEADROOM_UINT32(buff)); 1709 1710 PRINT_PACKET(dev, 1711 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1712 + RTE_PKTMBUF_HEADROOM), 1713 rte_pktmbuf_data_len(buff), 0); 1714 1715 /* Buffer address translation for virtio header. */ 1716 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1717 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1718 1719 /* 1720 * If the descriptors are chained the header and data are 1721 * placed in separate buffers. 
1722 */ 1723 if (desc->flags & VRING_DESC_F_NEXT) { 1724 desc->len = vq->vhost_hlen; 1725 desc = &vq->desc[desc->next]; 1726 desc->len = rte_pktmbuf_data_len(buff); 1727 } else { 1728 desc->len = packet_len; 1729 } 1730 1731 /* Update used ring with desc information */ 1732 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1733 = head[packet_success]; 1734 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1735 = packet_len; 1736 res_cur_idx++; 1737 packet_success++; 1738 1739 /* A header is required per buffer. */ 1740 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1741 (const void *)&virtio_hdr, vq->vhost_hlen); 1742 1743 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1744 1745 if (likely(packet_success < count)) { 1746 /* Prefetch descriptor index. */ 1747 rte_prefetch0(&vq->desc[head[packet_success]]); 1748 } 1749 } 1750 1751 rte_compiler_barrier(); 1752 1753 LOG_DEBUG(VHOST_DATA, 1754 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1755 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1756 dev->device_fh, vq->last_used_idx, vq->used->idx); 1757 1758 *(volatile uint16_t *)&vq->used->idx += count; 1759 vq->last_used_idx += count; 1760 1761 LOG_DEBUG(VHOST_DATA, 1762 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1763 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1764 dev->device_fh, vq->last_used_idx, vq->used->idx); 1765 1766 /* Kick the guest if necessary. */ 1767 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1768 eventfd_write(vq->callfd, (eventfd_t)1); 1769 1770 return count; 1771 } 1772 1773 /* 1774 * This function routes the TX packet to the correct interface. 1775 * This may be a local device or the physical port. 1776 */ 1777 static inline void __attribute__((always_inline)) 1778 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1779 uint32_t desc_idx, uint8_t need_copy) 1780 { 1781 struct mbuf_table *tx_q; 1782 struct rte_mbuf **m_table; 1783 void *obj = NULL; 1784 struct rte_mbuf *mbuf; 1785 unsigned len, ret, offset = 0; 1786 struct vpool *vpool; 1787 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1788 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1789 1790 /*Add packet to the port tx queue*/ 1791 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1792 len = tx_q->len; 1793 1794 /* Allocate an mbuf and populate the structure. */ 1795 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1796 rte_ring_sc_dequeue(vpool->ring, &obj); 1797 mbuf = obj; 1798 if (unlikely(mbuf == NULL)) { 1799 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1800 RTE_LOG(ERR, VHOST_DATA, 1801 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1802 dev->device_fh); 1803 put_desc_to_used_list_zcp(vq, desc_idx); 1804 return; 1805 } 1806 1807 if (vm2vm_mode == VM2VM_HARDWARE) { 1808 /* Avoid using a vlan tag from any vm for external pkt, such as 1809 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1810 * selection, MAC address determines it as an external pkt 1811 * which should go to network, while vlan tag determine it as 1812 * a vm2vm pkt should forward to another vm. Hardware confuse 1813 * such a ambiguous situation, so pkt will lost. 
1814 */ 1815 vlan_tag = external_pkt_default_vlan_tag; 1816 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1817 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1818 __rte_mbuf_raw_free(mbuf); 1819 return; 1820 } 1821 } 1822 1823 mbuf->nb_segs = m->nb_segs; 1824 mbuf->next = m->next; 1825 mbuf->data_len = m->data_len + offset; 1826 mbuf->pkt_len = mbuf->data_len; 1827 if (unlikely(need_copy)) { 1828 /* Copy the packet contents to the mbuf. */ 1829 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1830 rte_pktmbuf_mtod(m, void *), 1831 m->data_len); 1832 } else { 1833 mbuf->data_off = m->data_off; 1834 mbuf->buf_physaddr = m->buf_physaddr; 1835 mbuf->buf_addr = m->buf_addr; 1836 } 1837 mbuf->ol_flags = PKT_TX_VLAN_PKT; 1838 mbuf->vlan_tci = vlan_tag; 1839 mbuf->l2_len = sizeof(struct ether_hdr); 1840 mbuf->l3_len = sizeof(struct ipv4_hdr); 1841 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1842 1843 tx_q->m_table[len] = mbuf; 1844 len++; 1845 1846 LOG_DEBUG(VHOST_DATA, 1847 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1848 dev->device_fh, 1849 mbuf->nb_segs, 1850 (mbuf->next == NULL) ? "null" : "non-null"); 1851 1852 if (enable_stats) { 1853 dev_statistics[dev->device_fh].tx_total++; 1854 dev_statistics[dev->device_fh].tx++; 1855 } 1856 1857 if (unlikely(len == MAX_PKT_BURST)) { 1858 m_table = (struct rte_mbuf **)tx_q->m_table; 1859 ret = rte_eth_tx_burst(ports[0], 1860 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1861 1862 /* 1863 * Free any buffers not handled by TX and update 1864 * the port stats. 1865 */ 1866 if (unlikely(ret < len)) { 1867 do { 1868 rte_pktmbuf_free(m_table[ret]); 1869 } while (++ret < len); 1870 } 1871 1872 len = 0; 1873 txmbuf_clean_zcp(dev, vpool); 1874 } 1875 1876 tx_q->len = len; 1877 1878 return; 1879 } 1880 1881 /* 1882 * This function TX all available packets in virtio TX queue for one 1883 * virtio-net device. If it is first packet, it learns MAC address and 1884 * setup VMDQ. 1885 */ 1886 static inline void __attribute__((always_inline)) 1887 virtio_dev_tx_zcp(struct virtio_net *dev) 1888 { 1889 struct rte_mbuf m; 1890 struct vhost_virtqueue *vq; 1891 struct vring_desc *desc; 1892 uint64_t buff_addr = 0, phys_addr; 1893 uint32_t head[MAX_PKT_BURST]; 1894 uint32_t i; 1895 uint16_t free_entries, packet_success = 0; 1896 uint16_t avail_idx; 1897 uint8_t need_copy = 0; 1898 hpa_type addr_type; 1899 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1900 1901 vq = dev->virtqueue[VIRTIO_TXQ]; 1902 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1903 1904 /* If there are no available buffers then return. */ 1905 if (vq->last_used_idx_res == avail_idx) 1906 return; 1907 1908 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1909 1910 /* Prefetch available ring to retrieve head indexes. */ 1911 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1912 1913 /* Get the number of free entries in the ring */ 1914 free_entries = (avail_idx - vq->last_used_idx_res); 1915 1916 /* Limit to MAX_PKT_BURST. */ 1917 free_entries 1918 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1919 1920 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1921 dev->device_fh, free_entries); 1922 1923 /* Retrieve all of the head indexes first to avoid caching issues. */ 1924 for (i = 0; i < free_entries; i++) 1925 head[i] 1926 = vq->avail->ring[(vq->last_used_idx_res + i) 1927 & (vq->size - 1)]; 1928 1929 vq->last_used_idx_res += free_entries; 1930 1931 /* Prefetch descriptor index. 
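	 * The used-ring slot that will be written first is prefetched as well.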
 */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];

		/* Discard the first buffer as it is the virtio header. */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Account for an extra VLAN_HLEN bytes for VLAN tag insertion. */
		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
			&addr_type);

		if (likely(packet_success < (free_entries - 1)))
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success + 1]]);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Invalid frame buffer address found "
				"when TX packets!\n",
				dev->device_fh);
			packet_success++;
			continue;
		}

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/*
		 * Setup dummy mbuf. This is copied to a real mbuf if
		 * transmitted out the physical port.
		 */
		m.data_len = desc->len;
		m.nb_segs = 1;
		m.next = NULL;
		m.data_off = 0;
		m.buf_addr = (void *)(uintptr_t)buff_addr;
		m.buf_physaddr = phys_addr;

		/*
		 * Check whether the frame buffer address from the guest
		 * crosses a sub-region boundary or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching TX frame "
				"buffer address!\n",
				dev->device_fh);
			need_copy = 1;
		} else
			need_copy = 0;

		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

		/*
		 * If this is the first received packet we need to learn
		 * the MAC and setup VMDQ.
		 */
		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
				/*
				 * Discard frame if device is scheduled for
				 * removal or a duplicate MAC address is found.
				 */
				packet_success += free_entries;
				vq->last_used_idx += packet_success;
				break;
			}
		}

		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
		packet_success++;
	}
}

/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
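 * In the zero-copy path the worker also drains each per-queue TX table on a
 * timeout, recycles mbufs between every vpool's mempool and ring, and polls
 * the physical port for packets destined for the guests.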
2015 */ 2016 static int 2017 switch_worker_zcp(__attribute__((unused)) void *arg) 2018 { 2019 struct virtio_net *dev = NULL; 2020 struct vhost_dev *vdev = NULL; 2021 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2022 struct virtio_net_data_ll *dev_ll; 2023 struct mbuf_table *tx_q; 2024 volatile struct lcore_ll_info *lcore_ll; 2025 const uint64_t drain_tsc 2026 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2027 * BURST_TX_DRAIN_US; 2028 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2029 unsigned ret; 2030 const uint16_t lcore_id = rte_lcore_id(); 2031 uint16_t count_in_ring, rx_count = 0; 2032 2033 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2034 2035 lcore_ll = lcore_info[lcore_id].lcore_ll; 2036 prev_tsc = 0; 2037 2038 while (1) { 2039 cur_tsc = rte_rdtsc(); 2040 2041 /* TX burst queue drain */ 2042 diff_tsc = cur_tsc - prev_tsc; 2043 if (unlikely(diff_tsc > drain_tsc)) { 2044 /* 2045 * Get mbuf from vpool.pool and detach mbuf and 2046 * put back into vpool.ring. 2047 */ 2048 dev_ll = lcore_ll->ll_root_used; 2049 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2050 /* Get virtio device ID */ 2051 vdev = dev_ll->vdev; 2052 dev = vdev->dev; 2053 2054 if (likely(!vdev->remove)) { 2055 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2056 if (tx_q->len) { 2057 LOG_DEBUG(VHOST_DATA, 2058 "TX queue drained after timeout" 2059 " with burst size %u\n", 2060 tx_q->len); 2061 2062 /* 2063 * Tx any packets in the queue 2064 */ 2065 ret = rte_eth_tx_burst( 2066 ports[0], 2067 (uint16_t)tx_q->txq_id, 2068 (struct rte_mbuf **) 2069 tx_q->m_table, 2070 (uint16_t)tx_q->len); 2071 if (unlikely(ret < tx_q->len)) { 2072 do { 2073 rte_pktmbuf_free( 2074 tx_q->m_table[ret]); 2075 } while (++ret < tx_q->len); 2076 } 2077 tx_q->len = 0; 2078 2079 txmbuf_clean_zcp(dev, 2080 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2081 } 2082 } 2083 dev_ll = dev_ll->next; 2084 } 2085 prev_tsc = cur_tsc; 2086 } 2087 2088 rte_prefetch0(lcore_ll->ll_root_used); 2089 2090 /* 2091 * Inform the configuration core that we have exited the linked 2092 * list and that no devices are in use if requested. 2093 */ 2094 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2095 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2096 2097 /* Process devices */ 2098 dev_ll = lcore_ll->ll_root_used; 2099 2100 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2101 vdev = dev_ll->vdev; 2102 dev = vdev->dev; 2103 if (unlikely(vdev->remove)) { 2104 dev_ll = dev_ll->next; 2105 unlink_vmdq(vdev); 2106 vdev->ready = DEVICE_SAFE_REMOVE; 2107 continue; 2108 } 2109 2110 if (likely(vdev->ready == DEVICE_RX)) { 2111 uint32_t index = vdev->vmdq_rx_q; 2112 uint16_t i; 2113 count_in_ring 2114 = rte_ring_count(vpool_array[index].ring); 2115 uint16_t free_entries 2116 = (uint16_t)get_available_ring_num_zcp(dev); 2117 2118 /* 2119 * Attach all mbufs in vpool.ring and put back 2120 * into vpool.pool. 
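				 * The number attached per pass is capped by
				 * the free entries in the guest RX ring, the
				 * mbufs currently in the vpool ring, and
				 * MAX_PKT_BURST.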
2121 */ 2122 for (i = 0; 2123 i < RTE_MIN(free_entries, 2124 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2125 i++) 2126 attach_rxmbuf_zcp(dev); 2127 2128 /* Handle guest RX */ 2129 rx_count = rte_eth_rx_burst(ports[0], 2130 vdev->vmdq_rx_q, pkts_burst, 2131 MAX_PKT_BURST); 2132 2133 if (rx_count) { 2134 ret_count = virtio_dev_rx_zcp(dev, 2135 pkts_burst, rx_count); 2136 if (enable_stats) { 2137 dev_statistics[dev->device_fh].rx_total 2138 += rx_count; 2139 dev_statistics[dev->device_fh].rx 2140 += ret_count; 2141 } 2142 while (likely(rx_count)) { 2143 rx_count--; 2144 pktmbuf_detach_zcp( 2145 pkts_burst[rx_count]); 2146 rte_ring_sp_enqueue( 2147 vpool_array[index].ring, 2148 (void *)pkts_burst[rx_count]); 2149 } 2150 } 2151 } 2152 2153 if (likely(!vdev->remove)) 2154 /* Handle guest TX */ 2155 virtio_dev_tx_zcp(dev); 2156 2157 /* Move to the next device in the list */ 2158 dev_ll = dev_ll->next; 2159 } 2160 } 2161 2162 return 0; 2163 } 2164 2165 2166 /* 2167 * Add an entry to a used linked list. A free entry must first be found 2168 * in the free linked list using get_data_ll_free_entry(); 2169 */ 2170 static void 2171 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2172 struct virtio_net_data_ll *ll_dev) 2173 { 2174 struct virtio_net_data_ll *ll = *ll_root_addr; 2175 2176 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2177 ll_dev->next = NULL; 2178 rte_compiler_barrier(); 2179 2180 /* If ll == NULL then this is the first device. */ 2181 if (ll) { 2182 /* Increment to the tail of the linked list. */ 2183 while ((ll->next != NULL) ) 2184 ll = ll->next; 2185 2186 ll->next = ll_dev; 2187 } else { 2188 *ll_root_addr = ll_dev; 2189 } 2190 } 2191 2192 /* 2193 * Remove an entry from a used linked list. The entry must then be added to 2194 * the free linked list using put_data_ll_free_entry(). 2195 */ 2196 static void 2197 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2198 struct virtio_net_data_ll *ll_dev, 2199 struct virtio_net_data_ll *ll_dev_last) 2200 { 2201 struct virtio_net_data_ll *ll = *ll_root_addr; 2202 2203 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2204 return; 2205 2206 if (ll_dev == ll) 2207 *ll_root_addr = ll_dev->next; 2208 else 2209 if (likely(ll_dev_last != NULL)) 2210 ll_dev_last->next = ll_dev->next; 2211 else 2212 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2213 } 2214 2215 /* 2216 * Find and return an entry from the free linked list. 2217 */ 2218 static struct virtio_net_data_ll * 2219 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2220 { 2221 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2222 struct virtio_net_data_ll *ll_dev; 2223 2224 if (ll_free == NULL) 2225 return NULL; 2226 2227 ll_dev = ll_free; 2228 *ll_root_addr = ll_free->next; 2229 2230 return ll_dev; 2231 } 2232 2233 /* 2234 * Place an entry back on to the free linked list. 2235 */ 2236 static void 2237 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2238 struct virtio_net_data_ll *ll_dev) 2239 { 2240 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2241 2242 if (ll_dev == NULL) 2243 return; 2244 2245 ll_dev->next = ll_free; 2246 *ll_root_addr = ll_dev; 2247 } 2248 2249 /* 2250 * Creates a linked list of a given size. 2251 */ 2252 static struct virtio_net_data_ll * 2253 alloc_data_ll(uint32_t size) 2254 { 2255 struct virtio_net_data_ll *ll_new; 2256 uint32_t i; 2257 2258 /* Malloc and then chain the linked list. 
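	 * Each entry's next pointer is wired to the following array element so
	 * entries can be handed out one at a time as a free list; the last
	 * entry terminates the chain with NULL.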
*/ 2259 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2260 if (ll_new == NULL) { 2261 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2262 return NULL; 2263 } 2264 2265 for (i = 0; i < size - 1; i++) { 2266 ll_new[i].vdev = NULL; 2267 ll_new[i].next = &ll_new[i+1]; 2268 } 2269 ll_new[i].next = NULL; 2270 2271 return ll_new; 2272 } 2273 2274 /* 2275 * Create the main linked list along with each individual cores linked list. A used and a free list 2276 * are created to manage entries. 2277 */ 2278 static int 2279 init_data_ll (void) 2280 { 2281 int lcore; 2282 2283 RTE_LCORE_FOREACH_SLAVE(lcore) { 2284 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2285 if (lcore_info[lcore].lcore_ll == NULL) { 2286 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2287 return -1; 2288 } 2289 2290 lcore_info[lcore].lcore_ll->device_num = 0; 2291 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2292 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2293 if (num_devices % num_switching_cores) 2294 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2295 else 2296 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2297 } 2298 2299 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2300 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2301 2302 return 0; 2303 } 2304 2305 /* 2306 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2307 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2308 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2309 */ 2310 static void 2311 destroy_device (volatile struct virtio_net *dev) 2312 { 2313 struct virtio_net_data_ll *ll_lcore_dev_cur; 2314 struct virtio_net_data_ll *ll_main_dev_cur; 2315 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2316 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2317 struct vhost_dev *vdev; 2318 int lcore; 2319 2320 dev->flags &= ~VIRTIO_DEV_RUNNING; 2321 2322 vdev = (struct vhost_dev *)dev->priv; 2323 /*set the remove flag. */ 2324 vdev->remove = 1; 2325 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2326 rte_pause(); 2327 } 2328 2329 /* Search for entry to be removed from lcore ll */ 2330 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2331 while (ll_lcore_dev_cur != NULL) { 2332 if (ll_lcore_dev_cur->vdev == vdev) { 2333 break; 2334 } else { 2335 ll_lcore_dev_last = ll_lcore_dev_cur; 2336 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2337 } 2338 } 2339 2340 if (ll_lcore_dev_cur == NULL) { 2341 RTE_LOG(ERR, VHOST_CONFIG, 2342 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2343 dev->device_fh); 2344 return; 2345 } 2346 2347 /* Search for entry to be removed from main ll */ 2348 ll_main_dev_cur = ll_root_used; 2349 ll_main_dev_last = NULL; 2350 while (ll_main_dev_cur != NULL) { 2351 if (ll_main_dev_cur->vdev == vdev) { 2352 break; 2353 } else { 2354 ll_main_dev_last = ll_main_dev_cur; 2355 ll_main_dev_cur = ll_main_dev_cur->next; 2356 } 2357 } 2358 2359 /* Remove entries from the lcore and main ll. */ 2360 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2361 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2362 2363 /* Set the dev_removal_flag on each lcore. 
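	 * Each worker acknowledges by writing ACK_DEV_REMOVAL once it has left
	 * its linked-list walk; the wait loop below blocks until every core
	 * has done so.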
 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
	 * they can no longer access the device removed from the linked lists and that the devices
	 * are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
			rte_pause();
		}
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

	if (zero_copy) {
		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

		/* Stop the RX queue. */
		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to stop "
				"rx queue:%d\n",
				dev->device_fh,
				vdev->vmdq_rx_q);
		}

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in destroy_device: Start put mbuf in "
			"mempool back to ring for RX queue: %d\n",
			dev->device_fh, vdev->vmdq_rx_q);

		mbuf_destroy_zcp(vpool);

		/* Stop the TX queue. */
		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to "
				"stop tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);
		}

		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
			dev->device_fh);

		mbuf_destroy_zcp(vpool);
		rte_free(vdev->regions_hpa);
	}
	rte_free(vdev);

}

/*
 * Count the number of physically contiguous sub-regions that make up one
 * memory region whose vhost virtual address range is contiguous. The region
 * starts at vva_start and is 'size' bytes long.
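 * A new sub-region is counted every time two adjacent pages in the virtual
 * range do not map to adjacent host physical pages.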
 */
static uint32_t
check_hpa_regions(uint64_t vva_start, uint64_t size)
{
	uint32_t i, nregions = 0, page_size = getpagesize();
	uint64_t cur_phys_addr = 0, next_phys_addr = 0;
	if (vva_start % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
			"has remainder\n",
			(void *)(uintptr_t)vva_start, page_size);
		return 0;
	}
	if (size % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_hpa_regions: "
			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
			size, page_size);
		return 0;
	}
	for (i = 0; i < size - page_size; i = i + page_size) {
		cur_phys_addr
			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
		next_phys_addr = rte_mem_virt2phy(
			(void *)(uintptr_t)(vva_start + i + page_size));
		if ((cur_phys_addr + page_size) != next_phys_addr) {
			++nregions;
			LOG_DEBUG(VHOST_CONFIG,
				"in check_hpa_regions: hva addr:(%p) is not "
				"continuous with hva addr:(%p), diff:%d\n",
				(void *)(uintptr_t)(vva_start + (uint64_t)i),
				(void *)(uintptr_t)(vva_start + (uint64_t)i
				+ page_size), page_size);
			LOG_DEBUG(VHOST_CONFIG,
				"in check_hpa_regions: hpa addr:(%p) is not "
				"continuous with hpa addr:(%p), "
				"diff:(%"PRIu64")\n",
				(void *)(uintptr_t)cur_phys_addr,
				(void *)(uintptr_t)next_phys_addr,
				(next_phys_addr-cur_phys_addr));
		}
	}
	return nregions;
}

/*
 * Divide each region whose vhost virtual address range is contiguous into
 * sub-regions within which the host physical addresses are also contiguous,
 * and fill the offset (to GPA), size and related information of each
 * sub-region into regions_hpa.
 */
static uint32_t
fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
{
	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;

	if (mem_region_hpa == NULL)
		return 0;

	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
			virtio_memory->regions[regionidx].address_offset;
		mem_region_hpa[regionidx_hpa].guest_phys_address
			= virtio_memory->regions[regionidx].guest_phys_address;
		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
			mem_region_hpa[regionidx_hpa].guest_phys_address;
		LOG_DEBUG(VHOST_CONFIG,
			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
			regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address));
		LOG_DEBUG(VHOST_CONFIG,
			"in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
			regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
		for (i = 0, k = 0;
			i < virtio_memory->regions[regionidx].memory_size -
				page_size;
			i += page_size) {
			cur_phys_addr = rte_mem_virt2phy(
					(void *)(uintptr_t)(vva_start + i));
			next_phys_addr = rte_mem_virt2phy(
					(void *)(uintptr_t)(vva_start +
					i + page_size));
			if ((cur_phys_addr + page_size) != next_phys_addr) {
				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
					mem_region_hpa[regionidx_hpa].guest_phys_address +
					k + page_size;
				mem_region_hpa[regionidx_hpa].memory_size
					= k + page_size;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
					"phys addr end [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: guest phys addr "
					"size [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].memory_size));
				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
				++regionidx_hpa;
				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
					next_phys_addr -
					mem_region_hpa[regionidx_hpa].guest_phys_address;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
					" phys addr start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: host phys addr "
					"start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
				k = 0;
			} else {
				k += page_size;
			}
		}
		mem_region_hpa[regionidx_hpa].guest_phys_address_end
			= mem_region_hpa[regionidx_hpa].guest_phys_address
			+ k + page_size;
		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].memory_size));
		++regionidx_hpa;
	}
	return regionidx_hpa;
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
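 * In zero-copy mode the device is also bound to a dedicated VMDQ RX queue:
 * the queue's vpool mbufs are attached to guest buffers and the RX/TX queue
 * pair is started before the device is marked ready.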
2580 */ 2581 static int 2582 new_device (struct virtio_net *dev) 2583 { 2584 struct virtio_net_data_ll *ll_dev; 2585 int lcore, core_add = 0; 2586 uint32_t device_num_min = num_devices; 2587 struct vhost_dev *vdev; 2588 uint32_t regionidx; 2589 2590 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2591 if (vdev == NULL) { 2592 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2593 dev->device_fh); 2594 return -1; 2595 } 2596 vdev->dev = dev; 2597 dev->priv = vdev; 2598 2599 if (zero_copy) { 2600 vdev->nregions_hpa = dev->mem->nregions; 2601 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2602 vdev->nregions_hpa 2603 += check_hpa_regions( 2604 dev->mem->regions[regionidx].guest_phys_address 2605 + dev->mem->regions[regionidx].address_offset, 2606 dev->mem->regions[regionidx].memory_size); 2607 2608 } 2609 2610 vdev->regions_hpa = rte_calloc("vhost hpa region", 2611 vdev->nregions_hpa, 2612 sizeof(struct virtio_memory_regions_hpa), 2613 RTE_CACHE_LINE_SIZE); 2614 if (vdev->regions_hpa == NULL) { 2615 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2616 rte_free(vdev); 2617 return -1; 2618 } 2619 2620 2621 if (fill_hpa_memory_regions( 2622 vdev->regions_hpa, dev->mem 2623 ) != vdev->nregions_hpa) { 2624 2625 RTE_LOG(ERR, VHOST_CONFIG, 2626 "hpa memory regions number mismatch: " 2627 "[%d]\n", vdev->nregions_hpa); 2628 rte_free(vdev->regions_hpa); 2629 rte_free(vdev); 2630 return -1; 2631 } 2632 } 2633 2634 2635 /* Add device to main ll */ 2636 ll_dev = get_data_ll_free_entry(&ll_root_free); 2637 if (ll_dev == NULL) { 2638 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2639 "of %d devices per core has been reached\n", 2640 dev->device_fh, num_devices); 2641 if (vdev->regions_hpa) 2642 rte_free(vdev->regions_hpa); 2643 rte_free(vdev); 2644 return -1; 2645 } 2646 ll_dev->vdev = vdev; 2647 add_data_ll_entry(&ll_root_used, ll_dev); 2648 vdev->vmdq_rx_q 2649 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2650 2651 if (zero_copy) { 2652 uint32_t index = vdev->vmdq_rx_q; 2653 uint32_t count_in_ring, i; 2654 struct mbuf_table *tx_q; 2655 2656 count_in_ring = rte_ring_count(vpool_array[index].ring); 2657 2658 LOG_DEBUG(VHOST_CONFIG, 2659 "(%"PRIu64") in new_device: mbuf count in mempool " 2660 "before attach is: %d\n", 2661 dev->device_fh, 2662 rte_mempool_count(vpool_array[index].pool)); 2663 LOG_DEBUG(VHOST_CONFIG, 2664 "(%"PRIu64") in new_device: mbuf count in ring " 2665 "before attach is : %d\n", 2666 dev->device_fh, count_in_ring); 2667 2668 /* 2669 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
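		 * Attaching binds each mbuf to a guest-supplied buffer so that
		 * received frames can be placed directly into guest memory.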
2670 */ 2671 for (i = 0; i < count_in_ring; i++) 2672 attach_rxmbuf_zcp(dev); 2673 2674 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2675 "mempool after attach is: %d\n", 2676 dev->device_fh, 2677 rte_mempool_count(vpool_array[index].pool)); 2678 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2679 "ring after attach is : %d\n", 2680 dev->device_fh, 2681 rte_ring_count(vpool_array[index].ring)); 2682 2683 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2684 tx_q->txq_id = vdev->vmdq_rx_q; 2685 2686 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2687 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2688 2689 LOG_DEBUG(VHOST_CONFIG, 2690 "(%"PRIu64") In new_device: Failed to start " 2691 "tx queue:%d\n", 2692 dev->device_fh, vdev->vmdq_rx_q); 2693 2694 mbuf_destroy_zcp(vpool); 2695 rte_free(vdev->regions_hpa); 2696 rte_free(vdev); 2697 return -1; 2698 } 2699 2700 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2701 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2702 2703 LOG_DEBUG(VHOST_CONFIG, 2704 "(%"PRIu64") In new_device: Failed to start " 2705 "rx queue:%d\n", 2706 dev->device_fh, vdev->vmdq_rx_q); 2707 2708 /* Stop the TX queue. */ 2709 if (rte_eth_dev_tx_queue_stop(ports[0], 2710 vdev->vmdq_rx_q) != 0) { 2711 LOG_DEBUG(VHOST_CONFIG, 2712 "(%"PRIu64") In new_device: Failed to " 2713 "stop tx queue:%d\n", 2714 dev->device_fh, vdev->vmdq_rx_q); 2715 } 2716 2717 mbuf_destroy_zcp(vpool); 2718 rte_free(vdev->regions_hpa); 2719 rte_free(vdev); 2720 return -1; 2721 } 2722 2723 } 2724 2725 /*reset ready flag*/ 2726 vdev->ready = DEVICE_MAC_LEARNING; 2727 vdev->remove = 0; 2728 2729 /* Find a suitable lcore to add the device. */ 2730 RTE_LCORE_FOREACH_SLAVE(lcore) { 2731 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2732 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2733 core_add = lcore; 2734 } 2735 } 2736 /* Add device to lcore ll */ 2737 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2738 if (ll_dev == NULL) { 2739 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2740 vdev->ready = DEVICE_SAFE_REMOVE; 2741 destroy_device(dev); 2742 rte_free(vdev->regions_hpa); 2743 rte_free(vdev); 2744 return -1; 2745 } 2746 ll_dev->vdev = vdev; 2747 vdev->coreid = core_add; 2748 2749 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2750 2751 /* Initialize device stats */ 2752 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2753 2754 /* Disable notifications. */ 2755 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2756 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2757 lcore_info[vdev->coreid].lcore_ll->device_num++; 2758 dev->flags |= VIRTIO_DEV_RUNNING; 2759 2760 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2761 2762 return 0; 2763 } 2764 2765 /* 2766 * These callback allow devices to be added to the data core when configuration 2767 * has been fully complete. 2768 */ 2769 static const struct virtio_net_device_ops virtio_net_device_ops = 2770 { 2771 .new_device = new_device, 2772 .destroy_device = destroy_device, 2773 }; 2774 2775 /* 2776 * This is a thread will wake up after a period to print stats if the user has 2777 * enabled them. 
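 * It sleeps for enable_stats seconds between refreshes and redraws the
 * per-device RX/TX counters each time.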
2778 */ 2779 static void 2780 print_stats(void) 2781 { 2782 struct virtio_net_data_ll *dev_ll; 2783 uint64_t tx_dropped, rx_dropped; 2784 uint64_t tx, tx_total, rx, rx_total; 2785 uint32_t device_fh; 2786 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2787 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2788 2789 while(1) { 2790 sleep(enable_stats); 2791 2792 /* Clear screen and move to top left */ 2793 printf("%s%s", clr, top_left); 2794 2795 printf("\nDevice statistics ===================================="); 2796 2797 dev_ll = ll_root_used; 2798 while (dev_ll != NULL) { 2799 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2800 tx_total = dev_statistics[device_fh].tx_total; 2801 tx = dev_statistics[device_fh].tx; 2802 tx_dropped = tx_total - tx; 2803 if (zero_copy == 0) { 2804 rx_total = rte_atomic64_read( 2805 &dev_statistics[device_fh].rx_total_atomic); 2806 rx = rte_atomic64_read( 2807 &dev_statistics[device_fh].rx_atomic); 2808 } else { 2809 rx_total = dev_statistics[device_fh].rx_total; 2810 rx = dev_statistics[device_fh].rx; 2811 } 2812 rx_dropped = rx_total - rx; 2813 2814 printf("\nStatistics for device %"PRIu32" ------------------------------" 2815 "\nTX total: %"PRIu64"" 2816 "\nTX dropped: %"PRIu64"" 2817 "\nTX successful: %"PRIu64"" 2818 "\nRX total: %"PRIu64"" 2819 "\nRX dropped: %"PRIu64"" 2820 "\nRX successful: %"PRIu64"", 2821 device_fh, 2822 tx_total, 2823 tx_dropped, 2824 tx, 2825 rx_total, 2826 rx_dropped, 2827 rx); 2828 2829 dev_ll = dev_ll->next; 2830 } 2831 printf("\n======================================================\n"); 2832 } 2833 } 2834 2835 static void 2836 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2837 char *ring_name, uint32_t nb_mbuf) 2838 { 2839 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2840 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2841 if (vpool_array[index].pool != NULL) { 2842 vpool_array[index].ring 2843 = rte_ring_create(ring_name, 2844 rte_align32pow2(nb_mbuf + 1), 2845 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2846 if (likely(vpool_array[index].ring != NULL)) { 2847 LOG_DEBUG(VHOST_CONFIG, 2848 "in setup_mempool_tbl: mbuf count in " 2849 "mempool is: %d\n", 2850 rte_mempool_count(vpool_array[index].pool)); 2851 LOG_DEBUG(VHOST_CONFIG, 2852 "in setup_mempool_tbl: mbuf count in " 2853 "ring is: %d\n", 2854 rte_ring_count(vpool_array[index].ring)); 2855 } else { 2856 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2857 ring_name); 2858 } 2859 2860 /* Need consider head room. */ 2861 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2862 } else { 2863 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2864 } 2865 } 2866 2867 /* When we receive a INT signal, unregister vhost driver */ 2868 static void 2869 sigint_handler(__rte_unused int signum) 2870 { 2871 /* Unregister vhost driver. */ 2872 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2873 if (ret != 0) 2874 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2875 exit(0); 2876 } 2877 2878 /* 2879 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2880 * device is also registered here to handle the IOCTLs. 
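 * Initialisation order: EAL, application arguments, mbuf pools (per-queue
 * pools and rings in zero-copy mode), the physical ports, the per-core
 * linked lists, the optional stats thread, the worker lcores and finally
 * the vhost driver and its session loop.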
2881 */ 2882 int 2883 main(int argc, char *argv[]) 2884 { 2885 struct rte_mempool *mbuf_pool = NULL; 2886 unsigned lcore_id, core_id = 0; 2887 unsigned nb_ports, valid_num_ports; 2888 int ret; 2889 uint8_t portid; 2890 uint16_t queue_id; 2891 static pthread_t tid; 2892 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2893 2894 signal(SIGINT, sigint_handler); 2895 2896 /* init EAL */ 2897 ret = rte_eal_init(argc, argv); 2898 if (ret < 0) 2899 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2900 argc -= ret; 2901 argv += ret; 2902 2903 /* parse app arguments */ 2904 ret = us_vhost_parse_args(argc, argv); 2905 if (ret < 0) 2906 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2907 2908 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2909 if (rte_lcore_is_enabled(lcore_id)) 2910 lcore_ids[core_id ++] = lcore_id; 2911 2912 if (rte_lcore_count() > RTE_MAX_LCORE) 2913 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2914 2915 /*set the number of swithcing cores available*/ 2916 num_switching_cores = rte_lcore_count()-1; 2917 2918 /* Get the number of physical ports. */ 2919 nb_ports = rte_eth_dev_count(); 2920 if (nb_ports > RTE_MAX_ETHPORTS) 2921 nb_ports = RTE_MAX_ETHPORTS; 2922 2923 /* 2924 * Update the global var NUM_PORTS and global array PORTS 2925 * and get value of var VALID_NUM_PORTS according to system ports number 2926 */ 2927 valid_num_ports = check_ports_num(nb_ports); 2928 2929 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 2930 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 2931 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 2932 return -1; 2933 } 2934 2935 if (zero_copy == 0) { 2936 /* Create the mbuf pool. */ 2937 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 2938 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 2939 0, MBUF_DATA_SIZE, rte_socket_id()); 2940 if (mbuf_pool == NULL) 2941 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 2942 2943 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 2944 vpool_array[queue_id].pool = mbuf_pool; 2945 2946 if (vm2vm_mode == VM2VM_HARDWARE) { 2947 /* Enable VT loop back to let L2 switch to do it. */ 2948 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2949 LOG_DEBUG(VHOST_CONFIG, 2950 "Enable loop back for L2 switch in vmdq.\n"); 2951 } 2952 } else { 2953 uint32_t nb_mbuf; 2954 char pool_name[RTE_MEMPOOL_NAMESIZE]; 2955 char ring_name[RTE_MEMPOOL_NAMESIZE]; 2956 2957 nb_mbuf = num_rx_descriptor 2958 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2959 + num_switching_cores * MAX_PKT_BURST; 2960 2961 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2962 snprintf(pool_name, sizeof(pool_name), 2963 "rxmbuf_pool_%u", queue_id); 2964 snprintf(ring_name, sizeof(ring_name), 2965 "rxmbuf_ring_%u", queue_id); 2966 setup_mempool_tbl(rte_socket_id(), queue_id, 2967 pool_name, ring_name, nb_mbuf); 2968 } 2969 2970 nb_mbuf = num_tx_descriptor 2971 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 2972 + num_switching_cores * MAX_PKT_BURST; 2973 2974 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 2975 snprintf(pool_name, sizeof(pool_name), 2976 "txmbuf_pool_%u", queue_id); 2977 snprintf(ring_name, sizeof(ring_name), 2978 "txmbuf_ring_%u", queue_id); 2979 setup_mempool_tbl(rte_socket_id(), 2980 (queue_id + MAX_QUEUES), 2981 pool_name, ring_name, nb_mbuf); 2982 } 2983 2984 if (vm2vm_mode == VM2VM_HARDWARE) { 2985 /* Enable VT loop back to let L2 switch to do it. 
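			 * With loop back enabled the NIC can switch traffic
			 * sent on the port back to another VMDQ pool, so
			 * VM-to-VM forwarding is handled in hardware.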
*/ 2986 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 2987 LOG_DEBUG(VHOST_CONFIG, 2988 "Enable loop back for L2 switch in vmdq.\n"); 2989 } 2990 } 2991 /* Set log level. */ 2992 rte_set_log_level(LOG_LEVEL); 2993 2994 /* initialize all ports */ 2995 for (portid = 0; portid < nb_ports; portid++) { 2996 /* skip ports that are not enabled */ 2997 if ((enabled_port_mask & (1 << portid)) == 0) { 2998 RTE_LOG(INFO, VHOST_PORT, 2999 "Skipping disabled port %d\n", portid); 3000 continue; 3001 } 3002 if (port_init(portid) != 0) 3003 rte_exit(EXIT_FAILURE, 3004 "Cannot initialize network ports\n"); 3005 } 3006 3007 /* Initialise all linked lists. */ 3008 if (init_data_ll() == -1) 3009 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3010 3011 /* Initialize device stats */ 3012 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3013 3014 /* Enable stats if the user option is set. */ 3015 if (enable_stats) { 3016 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3017 if (ret != 0) 3018 rte_exit(EXIT_FAILURE, 3019 "Cannot create print-stats thread\n"); 3020 3021 /* Set thread_name for aid in debugging. */ 3022 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3023 ret = rte_thread_setname(tid, thread_name); 3024 if (ret != 0) 3025 RTE_LOG(ERR, VHOST_CONFIG, 3026 "Cannot set print-stats name\n"); 3027 } 3028 3029 /* Launch all data cores. */ 3030 if (zero_copy == 0) { 3031 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3032 rte_eal_remote_launch(switch_worker, 3033 mbuf_pool, lcore_id); 3034 } 3035 } else { 3036 uint32_t count_in_mempool, index, i; 3037 for (index = 0; index < 2*MAX_QUEUES; index++) { 3038 /* For all RX and TX queues. */ 3039 count_in_mempool 3040 = rte_mempool_count(vpool_array[index].pool); 3041 3042 /* 3043 * Transfer all un-attached mbufs from vpool.pool 3044 * to vpoo.ring. 3045 */ 3046 for (i = 0; i < count_in_mempool; i++) { 3047 struct rte_mbuf *mbuf 3048 = __rte_mbuf_raw_alloc( 3049 vpool_array[index].pool); 3050 rte_ring_sp_enqueue(vpool_array[index].ring, 3051 (void *)mbuf); 3052 } 3053 3054 LOG_DEBUG(VHOST_CONFIG, 3055 "in main: mbuf count in mempool at initial " 3056 "is: %d\n", count_in_mempool); 3057 LOG_DEBUG(VHOST_CONFIG, 3058 "in main: mbuf count in ring at initial is :" 3059 " %d\n", 3060 rte_ring_count(vpool_array[index].ring)); 3061 } 3062 3063 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3064 rte_eal_remote_launch(switch_worker_zcp, NULL, 3065 lcore_id); 3066 } 3067 3068 if (mergeable == 0) 3069 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3070 3071 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3072 ret = rte_vhost_driver_register((char *)&dev_basename); 3073 if (ret != 0) 3074 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3075 3076 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3077 3078 /* Start CUSE session. */ 3079 rte_vhost_driver_session_start(); 3080 return 0; 3081 3082 } 3083