1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 #include <rte_ip.h> 54 #include <rte_tcp.h> 55 56 #include "main.h" 57 58 #ifndef MAX_QUEUES 59 #define MAX_QUEUES 128 60 #endif 61 62 /* the maximum number of external ports supported */ 63 #define MAX_SUP_PORTS 1 64 65 /* 66 * Calculate the number of buffers needed per port 67 */ 68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 69 (num_switching_cores*MAX_PKT_BURST) + \ 70 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 71 ((num_switching_cores+1)*MBUF_CACHE_SIZE)) 72 73 #define MBUF_CACHE_SIZE 128 74 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 75 76 /* 77 * No frame data buffer allocated from host are required for zero copy 78 * implementation, guest will allocate the frame data buffer, and vhost 79 * directly use it. 80 */ 81 #define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM 82 #define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE 83 #define MBUF_CACHE_SIZE_ZCP 0 84 85 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 86 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 87 88 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 89 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 90 91 #define JUMBO_FRAME_MAX_SIZE 0x2600 92 93 /* State of virtio device. 
*/ 94 #define DEVICE_MAC_LEARNING 0 95 #define DEVICE_RX 1 96 #define DEVICE_SAFE_REMOVE 2 97 98 /* Config_core_flag status definitions. */ 99 #define REQUEST_DEV_REMOVAL 1 100 #define ACK_DEV_REMOVAL 0 101 102 /* Configurable number of RX/TX ring descriptors */ 103 #define RTE_TEST_RX_DESC_DEFAULT 1024 104 #define RTE_TEST_TX_DESC_DEFAULT 512 105 106 /* 107 * Need refine these 2 macros for legacy and DPDK based front end: 108 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 109 * And then adjust power 2. 110 */ 111 /* 112 * For legacy front end, 128 descriptors, 113 * half for virtio header, another half for mbuf. 114 */ 115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 117 118 /* Get first 4 bytes in mbuf headroom. */ 119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 120 + sizeof(struct rte_mbuf))) 121 122 /* true if x is a power of 2 */ 123 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 124 125 #define INVALID_PORT_ID 0xFF 126 127 /* Max number of devices. Limited by vmdq. */ 128 #define MAX_DEVICES 64 129 130 /* Size of buffers used for snprintfs. */ 131 #define MAX_PRINT_BUFF 6072 132 133 /* Maximum character device basename size. */ 134 #define MAX_BASENAME_SZ 10 135 136 /* Maximum long option length for option parsing. */ 137 #define MAX_LONG_OPT_SZ 64 138 139 /* Used to compare MAC addresses. */ 140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 141 142 /* Number of descriptors per cacheline. */ 143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 144 145 #define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb)) 146 147 /* mask of enabled ports */ 148 static uint32_t enabled_port_mask = 0; 149 150 /* Promiscuous mode */ 151 static uint32_t promiscuous; 152 153 /*Number of switching cores enabled*/ 154 static uint32_t num_switching_cores = 0; 155 156 /* number of devices/queues to support*/ 157 static uint32_t num_queues = 0; 158 static uint32_t num_devices; 159 160 /* 161 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 162 * disabled on default. 163 */ 164 static uint32_t zero_copy; 165 static int mergeable; 166 167 /* Do vlan strip on host, enabled on default */ 168 static uint32_t vlan_strip = 1; 169 170 /* number of descriptors to apply*/ 171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 173 174 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 175 #define MAX_RING_DESC 4096 176 177 struct vpool { 178 struct rte_mempool *pool; 179 struct rte_ring *ring; 180 uint32_t buf_size; 181 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 182 183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 184 typedef enum { 185 VM2VM_DISABLED = 0, 186 VM2VM_SOFTWARE = 1, 187 VM2VM_HARDWARE = 2, 188 VM2VM_LAST 189 } vm2vm_type; 190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 191 192 /* The type of host physical address translated from guest physical address. */ 193 typedef enum { 194 PHYS_ADDR_CONTINUOUS = 0, 195 PHYS_ADDR_CROSS_SUBREG = 1, 196 PHYS_ADDR_INVALID = 2, 197 PHYS_ADDR_LAST 198 } hpa_type; 199 200 /* Enable stats. */ 201 static uint32_t enable_stats = 0; 202 /* Enable retries on RX. 
*/ 203 static uint32_t enable_retry = 1; 204 205 /* Disable TX checksum offload */ 206 static uint32_t enable_tx_csum; 207 208 /* Disable TSO offload */ 209 static uint32_t enable_tso; 210 211 /* Specify timeout (in useconds) between retries on RX. */ 212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 213 /* Specify the number of retries on RX. */ 214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 215 216 /* Character device basename. Can be set by user. */ 217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; 218 219 /* empty vmdq configuration structure. Filled in programatically */ 220 static struct rte_eth_conf vmdq_conf_default = { 221 .rxmode = { 222 .mq_mode = ETH_MQ_RX_VMDQ_ONLY, 223 .split_hdr_size = 0, 224 .header_split = 0, /**< Header Split disabled */ 225 .hw_ip_checksum = 0, /**< IP checksum offload disabled */ 226 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 227 /* 228 * It is necessary for 1G NIC such as I350, 229 * this fixes bug of ipv4 forwarding in guest can't 230 * forward pakets from one virtio dev to another virtio dev. 231 */ 232 .hw_vlan_strip = 1, /**< VLAN strip enabled. */ 233 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 234 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 235 }, 236 237 .txmode = { 238 .mq_mode = ETH_MQ_TX_NONE, 239 }, 240 .rx_adv_conf = { 241 /* 242 * should be overridden separately in code with 243 * appropriate values 244 */ 245 .vmdq_rx_conf = { 246 .nb_queue_pools = ETH_8_POOLS, 247 .enable_default_pool = 0, 248 .default_pool = 0, 249 .nb_pool_maps = 0, 250 .pool_map = {{0, 0},}, 251 }, 252 }, 253 }; 254 255 static unsigned lcore_ids[RTE_MAX_LCORE]; 256 static uint8_t ports[RTE_MAX_ETHPORTS]; 257 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 258 static uint16_t num_pf_queues, num_vmdq_queues; 259 static uint16_t vmdq_pool_base, vmdq_queue_base; 260 static uint16_t queues_per_pool; 261 262 static const uint16_t external_pkt_default_vlan_tag = 2000; 263 const uint16_t vlan_tags[] = { 264 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 265 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 266 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 267 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 268 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 269 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 270 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 271 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 272 }; 273 274 /* ethernet addresses of ports */ 275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 276 277 /* heads for the main used and free linked lists for the data path. */ 278 static struct virtio_net_data_ll *ll_root_used = NULL; 279 static struct virtio_net_data_ll *ll_root_free = NULL; 280 281 /* Array of data core structures containing information on individual core linked lists. */ 282 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 283 284 /* Used for queueing bursts of TX packets. */ 285 struct mbuf_table { 286 unsigned len; 287 unsigned txq_id; 288 struct rte_mbuf *m_table[MAX_PKT_BURST]; 289 }; 290 291 /* TX queue for each data core. */ 292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 293 294 /* TX queue fori each virtio device for zero copy. */ 295 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 296 297 /* Vlan header struct used to insert vlan tags on TX. 
*/ 298 struct vlan_ethhdr { 299 unsigned char h_dest[ETH_ALEN]; 300 unsigned char h_source[ETH_ALEN]; 301 __be16 h_vlan_proto; 302 __be16 h_vlan_TCI; 303 __be16 h_vlan_encapsulated_proto; 304 }; 305 306 /* Header lengths. */ 307 #define VLAN_HLEN 4 308 #define VLAN_ETH_HLEN 18 309 310 /* Per-device statistics struct */ 311 struct device_statistics { 312 uint64_t tx_total; 313 rte_atomic64_t rx_total_atomic; 314 uint64_t rx_total; 315 uint64_t tx; 316 rte_atomic64_t rx_atomic; 317 uint64_t rx; 318 } __rte_cache_aligned; 319 struct device_statistics dev_statistics[MAX_DEVICES]; 320 321 /* 322 * Builds up the correct configuration for VMDQ VLAN pool map 323 * according to the pool & queue limits. 324 */ 325 static inline int 326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 327 { 328 struct rte_eth_vmdq_rx_conf conf; 329 struct rte_eth_vmdq_rx_conf *def_conf = 330 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 331 unsigned i; 332 333 memset(&conf, 0, sizeof(conf)); 334 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 335 conf.nb_pool_maps = num_devices; 336 conf.enable_loop_back = def_conf->enable_loop_back; 337 conf.rx_mode = def_conf->rx_mode; 338 339 for (i = 0; i < conf.nb_pool_maps; i++) { 340 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 341 conf.pool_map[i].pools = (1UL << i); 342 } 343 344 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 345 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, 346 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 347 return 0; 348 } 349 350 /* 351 * Validate the device number according to the max pool number gotten form 352 * dev_info. If the device number is invalid, give the error message and 353 * return -1. Each device must have its own pool. 354 */ 355 static inline int 356 validate_num_devices(uint32_t max_nb_devices) 357 { 358 if (num_devices > max_nb_devices) { 359 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 360 return -1; 361 } 362 return 0; 363 } 364 365 /* 366 * Initialises a given port using global settings and with the rx buffers 367 * coming from the mbuf_pool passed as parameter 368 */ 369 static inline int 370 port_init(uint8_t port) 371 { 372 struct rte_eth_dev_info dev_info; 373 struct rte_eth_conf port_conf; 374 struct rte_eth_rxconf *rxconf; 375 struct rte_eth_txconf *txconf; 376 int16_t rx_rings, tx_rings; 377 uint16_t rx_ring_size, tx_ring_size; 378 int retval; 379 uint16_t q; 380 381 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 382 rte_eth_dev_info_get (port, &dev_info); 383 384 if (dev_info.max_rx_queues > MAX_QUEUES) { 385 rte_exit(EXIT_FAILURE, 386 "please define MAX_QUEUES no less than %u in %s\n", 387 dev_info.max_rx_queues, __FILE__); 388 } 389 390 rxconf = &dev_info.default_rxconf; 391 txconf = &dev_info.default_txconf; 392 rxconf->rx_drop_en = 1; 393 394 /* Enable vlan offload */ 395 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL; 396 397 /* 398 * Zero copy defers queue RX/TX start to the time when guest 399 * finishes its startup and packet buffers from that guest are 400 * available. 
401 */ 402 if (zero_copy) { 403 rxconf->rx_deferred_start = 1; 404 rxconf->rx_drop_en = 0; 405 txconf->tx_deferred_start = 1; 406 } 407 408 /*configure the number of supported virtio devices based on VMDQ limits */ 409 num_devices = dev_info.max_vmdq_pools; 410 411 if (zero_copy) { 412 rx_ring_size = num_rx_descriptor; 413 tx_ring_size = num_tx_descriptor; 414 tx_rings = dev_info.max_tx_queues; 415 } else { 416 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 417 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 418 tx_rings = (uint16_t)rte_lcore_count(); 419 } 420 421 retval = validate_num_devices(MAX_DEVICES); 422 if (retval < 0) 423 return retval; 424 425 /* Get port configuration. */ 426 retval = get_eth_conf(&port_conf, num_devices); 427 if (retval < 0) 428 return retval; 429 /* NIC queues are divided into pf queues and vmdq queues. */ 430 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 431 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 432 num_vmdq_queues = num_devices * queues_per_pool; 433 num_queues = num_pf_queues + num_vmdq_queues; 434 vmdq_queue_base = dev_info.vmdq_queue_base; 435 vmdq_pool_base = dev_info.vmdq_pool_base; 436 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 437 num_pf_queues, num_devices, queues_per_pool); 438 439 if (port >= rte_eth_dev_count()) return -1; 440 441 if (enable_tx_csum == 0) 442 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM); 443 444 if (enable_tso == 0) { 445 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4); 446 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6); 447 } 448 449 rx_rings = (uint16_t)dev_info.max_rx_queues; 450 /* Configure ethernet device. */ 451 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 452 if (retval != 0) 453 return retval; 454 455 /* Setup the queues. */ 456 for (q = 0; q < rx_rings; q ++) { 457 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 458 rte_eth_dev_socket_id(port), 459 rxconf, 460 vpool_array[q].pool); 461 if (retval < 0) 462 return retval; 463 } 464 for (q = 0; q < tx_rings; q ++) { 465 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 466 rte_eth_dev_socket_id(port), 467 txconf); 468 if (retval < 0) 469 return retval; 470 } 471 472 /* Start the device. */ 473 retval = rte_eth_dev_start(port); 474 if (retval < 0) { 475 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 476 return retval; 477 } 478 479 if (promiscuous) 480 rte_eth_promiscuous_enable(port); 481 482 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 483 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 484 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 485 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 486 (unsigned)port, 487 vmdq_ports_eth_addr[port].addr_bytes[0], 488 vmdq_ports_eth_addr[port].addr_bytes[1], 489 vmdq_ports_eth_addr[port].addr_bytes[2], 490 vmdq_ports_eth_addr[port].addr_bytes[3], 491 vmdq_ports_eth_addr[port].addr_bytes[4], 492 vmdq_ports_eth_addr[port].addr_bytes[5]); 493 494 return 0; 495 } 496 497 /* 498 * Set character device basename. 499 */ 500 static int 501 us_vhost_parse_basename(const char *q_arg) 502 { 503 /* parse number string */ 504 505 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 506 return -1; 507 else 508 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 509 510 return 0; 511 } 512 513 /* 514 * Parse the portmask provided at run time. 
515 */ 516 static int 517 parse_portmask(const char *portmask) 518 { 519 char *end = NULL; 520 unsigned long pm; 521 522 errno = 0; 523 524 /* parse hexadecimal string */ 525 pm = strtoul(portmask, &end, 16); 526 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 527 return -1; 528 529 if (pm == 0) 530 return -1; 531 532 return pm; 533 534 } 535 536 /* 537 * Parse num options at run time. 538 */ 539 static int 540 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 541 { 542 char *end = NULL; 543 unsigned long num; 544 545 errno = 0; 546 547 /* parse unsigned int string */ 548 num = strtoul(q_arg, &end, 10); 549 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 550 return -1; 551 552 if (num > max_valid_value) 553 return -1; 554 555 return num; 556 557 } 558 559 /* 560 * Display usage 561 */ 562 static void 563 us_vhost_usage(const char *prgname) 564 { 565 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 566 " --vm2vm [0|1|2]\n" 567 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 568 " --dev-basename <name>\n" 569 " --nb-devices ND\n" 570 " -p PORTMASK: Set mask for ports to be used by application\n" 571 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 572 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 573 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 574 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 575 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 576 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n" 577 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 578 " --dev-basename: The basename to be used for the character device.\n" 579 " --zero-copy [0|1]: disable(default)/enable rx/tx " 580 "zero copy\n" 581 " --rx-desc-num [0-N]: the number of descriptors on rx, " 582 "used only when zero copy is enabled.\n" 583 " --tx-desc-num [0-N]: the number of descriptors on tx, " 584 "used only when zero copy is enabled.\n" 585 " --tx-csum [0|1] disable/enable TX checksum offload.\n" 586 " --tso [0|1] disable/enable TCP segment offload.\n", 587 prgname); 588 } 589 590 /* 591 * Parse the arguments given in the command line of the application. 
592 */ 593 static int 594 us_vhost_parse_args(int argc, char **argv) 595 { 596 int opt, ret; 597 int option_index; 598 unsigned i; 599 const char *prgname = argv[0]; 600 static struct option long_option[] = { 601 {"vm2vm", required_argument, NULL, 0}, 602 {"rx-retry", required_argument, NULL, 0}, 603 {"rx-retry-delay", required_argument, NULL, 0}, 604 {"rx-retry-num", required_argument, NULL, 0}, 605 {"mergeable", required_argument, NULL, 0}, 606 {"vlan-strip", required_argument, NULL, 0}, 607 {"stats", required_argument, NULL, 0}, 608 {"dev-basename", required_argument, NULL, 0}, 609 {"zero-copy", required_argument, NULL, 0}, 610 {"rx-desc-num", required_argument, NULL, 0}, 611 {"tx-desc-num", required_argument, NULL, 0}, 612 {"tx-csum", required_argument, NULL, 0}, 613 {"tso", required_argument, NULL, 0}, 614 {NULL, 0, 0, 0}, 615 }; 616 617 /* Parse command line */ 618 while ((opt = getopt_long(argc, argv, "p:P", 619 long_option, &option_index)) != EOF) { 620 switch (opt) { 621 /* Portmask */ 622 case 'p': 623 enabled_port_mask = parse_portmask(optarg); 624 if (enabled_port_mask == 0) { 625 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 626 us_vhost_usage(prgname); 627 return -1; 628 } 629 break; 630 631 case 'P': 632 promiscuous = 1; 633 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 634 ETH_VMDQ_ACCEPT_BROADCAST | 635 ETH_VMDQ_ACCEPT_MULTICAST; 636 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 637 638 break; 639 640 case 0: 641 /* Enable/disable vm2vm comms. */ 642 if (!strncmp(long_option[option_index].name, "vm2vm", 643 MAX_LONG_OPT_SZ)) { 644 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 645 if (ret == -1) { 646 RTE_LOG(INFO, VHOST_CONFIG, 647 "Invalid argument for " 648 "vm2vm [0|1|2]\n"); 649 us_vhost_usage(prgname); 650 return -1; 651 } else { 652 vm2vm_mode = (vm2vm_type)ret; 653 } 654 } 655 656 /* Enable/disable retries on RX. */ 657 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 658 ret = parse_num_opt(optarg, 1); 659 if (ret == -1) { 660 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 661 us_vhost_usage(prgname); 662 return -1; 663 } else { 664 enable_retry = ret; 665 } 666 } 667 668 /* Enable/disable TX checksum offload. */ 669 if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) { 670 ret = parse_num_opt(optarg, 1); 671 if (ret == -1) { 672 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } else 676 enable_tx_csum = ret; 677 } 678 679 /* Enable/disable TSO offload. */ 680 if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) { 681 ret = parse_num_opt(optarg, 1); 682 if (ret == -1) { 683 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 684 us_vhost_usage(prgname); 685 return -1; 686 } else 687 enable_tso = ret; 688 } 689 690 /* Specify the retries delay time (in useconds) on RX. */ 691 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 692 ret = parse_num_opt(optarg, INT32_MAX); 693 if (ret == -1) { 694 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 695 us_vhost_usage(prgname); 696 return -1; 697 } else { 698 burst_rx_delay_time = ret; 699 } 700 } 701 702 /* Specify the retries number on RX. 
*/ 703 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 704 ret = parse_num_opt(optarg, INT32_MAX); 705 if (ret == -1) { 706 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 707 us_vhost_usage(prgname); 708 return -1; 709 } else { 710 burst_rx_retry_num = ret; 711 } 712 } 713 714 /* Enable/disable RX mergeable buffers. */ 715 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 716 ret = parse_num_opt(optarg, 1); 717 if (ret == -1) { 718 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 719 us_vhost_usage(prgname); 720 return -1; 721 } else { 722 mergeable = !!ret; 723 if (ret) { 724 vmdq_conf_default.rxmode.jumbo_frame = 1; 725 vmdq_conf_default.rxmode.max_rx_pkt_len 726 = JUMBO_FRAME_MAX_SIZE; 727 } 728 } 729 } 730 731 /* Enable/disable RX VLAN strip on host. */ 732 if (!strncmp(long_option[option_index].name, 733 "vlan-strip", MAX_LONG_OPT_SZ)) { 734 ret = parse_num_opt(optarg, 1); 735 if (ret == -1) { 736 RTE_LOG(INFO, VHOST_CONFIG, 737 "Invalid argument for VLAN strip [0|1]\n"); 738 us_vhost_usage(prgname); 739 return -1; 740 } else { 741 vlan_strip = !!ret; 742 vmdq_conf_default.rxmode.hw_vlan_strip = 743 vlan_strip; 744 } 745 } 746 747 /* Enable/disable stats. */ 748 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 749 ret = parse_num_opt(optarg, INT32_MAX); 750 if (ret == -1) { 751 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 752 us_vhost_usage(prgname); 753 return -1; 754 } else { 755 enable_stats = ret; 756 } 757 } 758 759 /* Set character device basename. */ 760 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 761 if (us_vhost_parse_basename(optarg) == -1) { 762 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 763 us_vhost_usage(prgname); 764 return -1; 765 } 766 } 767 768 /* Enable/disable rx/tx zero copy. */ 769 if (!strncmp(long_option[option_index].name, 770 "zero-copy", MAX_LONG_OPT_SZ)) { 771 ret = parse_num_opt(optarg, 1); 772 if (ret == -1) { 773 RTE_LOG(INFO, VHOST_CONFIG, 774 "Invalid argument" 775 " for zero-copy [0|1]\n"); 776 us_vhost_usage(prgname); 777 return -1; 778 } else 779 zero_copy = ret; 780 } 781 782 /* Specify the descriptor number on RX. */ 783 if (!strncmp(long_option[option_index].name, 784 "rx-desc-num", MAX_LONG_OPT_SZ)) { 785 ret = parse_num_opt(optarg, MAX_RING_DESC); 786 if ((ret == -1) || (!POWEROF2(ret))) { 787 RTE_LOG(INFO, VHOST_CONFIG, 788 "Invalid argument for rx-desc-num[0-N]," 789 "power of 2 required.\n"); 790 us_vhost_usage(prgname); 791 return -1; 792 } else { 793 num_rx_descriptor = ret; 794 } 795 } 796 797 /* Specify the descriptor number on TX. */ 798 if (!strncmp(long_option[option_index].name, 799 "tx-desc-num", MAX_LONG_OPT_SZ)) { 800 ret = parse_num_opt(optarg, MAX_RING_DESC); 801 if ((ret == -1) || (!POWEROF2(ret))) { 802 RTE_LOG(INFO, VHOST_CONFIG, 803 "Invalid argument for tx-desc-num [0-N]," 804 "power of 2 required.\n"); 805 us_vhost_usage(prgname); 806 return -1; 807 } else { 808 num_tx_descriptor = ret; 809 } 810 } 811 812 break; 813 814 /* Invalid option - print options. 
*/ 815 default: 816 us_vhost_usage(prgname); 817 return -1; 818 } 819 } 820 821 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 822 if (enabled_port_mask & (1 << i)) 823 ports[num_ports++] = (uint8_t)i; 824 } 825 826 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 827 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 828 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 829 return -1; 830 } 831 832 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 833 RTE_LOG(INFO, VHOST_PORT, 834 "Vhost zero copy doesn't support software vm2vm," 835 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 836 return -1; 837 } 838 839 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 840 RTE_LOG(INFO, VHOST_PORT, 841 "Vhost zero copy doesn't support jumbo frame," 842 "please specify '--mergeable 0' to disable the " 843 "mergeable feature.\n"); 844 return -1; 845 } 846 847 return 0; 848 } 849 850 /* 851 * Update the global var NUM_PORTS and array PORTS according to system ports number 852 * and return valid ports number 853 */ 854 static unsigned check_ports_num(unsigned nb_ports) 855 { 856 unsigned valid_num_ports = num_ports; 857 unsigned portid; 858 859 if (num_ports > nb_ports) { 860 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 861 num_ports, nb_ports); 862 num_ports = nb_ports; 863 } 864 865 for (portid = 0; portid < num_ports; portid ++) { 866 if (ports[portid] >= nb_ports) { 867 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 868 ports[portid], (nb_ports - 1)); 869 ports[portid] = INVALID_PORT_ID; 870 valid_num_ports--; 871 } 872 } 873 return valid_num_ports; 874 } 875 876 /* 877 * Macro to print out packet contents. Wrapped in debug define so that the 878 * data path is not effected when debug is disabled. 879 */ 880 #ifdef DEBUG 881 #define PRINT_PACKET(device, addr, size, header) do { \ 882 char *pkt_addr = (char*)(addr); \ 883 unsigned int index; \ 884 char packet[MAX_PRINT_BUFF]; \ 885 \ 886 if ((header)) \ 887 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 888 else \ 889 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 890 for (index = 0; index < (size); index++) { \ 891 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 892 "%02hhx ", pkt_addr[index]); \ 893 } \ 894 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 895 \ 896 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 897 } while(0) 898 #else 899 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 900 #endif 901 902 /* 903 * Function to convert guest physical addresses to vhost physical addresses. 904 * This is used to convert virtio buffer addresses. 
905 */ 906 static inline uint64_t __attribute__((always_inline)) 907 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 908 uint32_t buf_len, hpa_type *addr_type) 909 { 910 struct virtio_memory_regions_hpa *region; 911 uint32_t regionidx; 912 uint64_t vhost_pa = 0; 913 914 *addr_type = PHYS_ADDR_INVALID; 915 916 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 917 region = &vdev->regions_hpa[regionidx]; 918 if ((guest_pa >= region->guest_phys_address) && 919 (guest_pa <= region->guest_phys_address_end)) { 920 vhost_pa = region->host_phys_addr_offset + guest_pa; 921 if (likely((guest_pa + buf_len - 1) 922 <= region->guest_phys_address_end)) 923 *addr_type = PHYS_ADDR_CONTINUOUS; 924 else 925 *addr_type = PHYS_ADDR_CROSS_SUBREG; 926 break; 927 } 928 } 929 930 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 931 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 932 (void *)(uintptr_t)vhost_pa); 933 934 return vhost_pa; 935 } 936 937 /* 938 * Compares a packet destination MAC address to a device MAC address. 939 */ 940 static inline int __attribute__((always_inline)) 941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 942 { 943 return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0; 944 } 945 946 /* 947 * This function learns the MAC address of the device and registers this along with a 948 * vlan tag to a VMDQ. 949 */ 950 static int 951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 952 { 953 struct ether_hdr *pkt_hdr; 954 struct virtio_net_data_ll *dev_ll; 955 struct virtio_net *dev = vdev->dev; 956 int i, ret; 957 958 /* Learn MAC address of guest device from packet */ 959 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 960 961 dev_ll = ll_root_used; 962 963 while (dev_ll != NULL) { 964 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 965 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 966 return -1; 967 } 968 dev_ll = dev_ll->next; 969 } 970 971 for (i = 0; i < ETHER_ADDR_LEN; i++) 972 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 973 974 /* vlan_tag currently uses the device_id. */ 975 vdev->vlan_tag = vlan_tags[dev->device_fh]; 976 977 /* Print out VMDQ registration info. */ 978 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 979 dev->device_fh, 980 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 981 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 982 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 983 vdev->vlan_tag); 984 985 /* Register the MAC address. */ 986 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 987 (uint32_t)dev->device_fh + vmdq_pool_base); 988 if (ret) 989 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 990 dev->device_fh); 991 992 /* Enable stripping of the vlan tag as we handle routing. */ 993 if (vlan_strip) 994 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 995 (uint16_t)vdev->vmdq_rx_q, 1); 996 997 /* Set device as ready for RX. */ 998 vdev->ready = DEVICE_RX; 999 1000 return 0; 1001 } 1002 1003 /* 1004 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 1005 * queue before disabling RX on the device. 
1006 */ 1007 static inline void 1008 unlink_vmdq(struct vhost_dev *vdev) 1009 { 1010 unsigned i = 0; 1011 unsigned rx_count; 1012 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1013 1014 if (vdev->ready == DEVICE_RX) { 1015 /*clear MAC and VLAN settings*/ 1016 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 1017 for (i = 0; i < 6; i++) 1018 vdev->mac_address.addr_bytes[i] = 0; 1019 1020 vdev->vlan_tag = 0; 1021 1022 /*Clear out the receive buffers*/ 1023 rx_count = rte_eth_rx_burst(ports[0], 1024 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1025 1026 while (rx_count) { 1027 for (i = 0; i < rx_count; i++) 1028 rte_pktmbuf_free(pkts_burst[i]); 1029 1030 rx_count = rte_eth_rx_burst(ports[0], 1031 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1032 } 1033 1034 vdev->ready = DEVICE_MAC_LEARNING; 1035 } 1036 } 1037 1038 /* 1039 * Check if the packet destination MAC address is for a local device. If so then put 1040 * the packet on that devices RX queue. If not then return. 1041 */ 1042 static inline int __attribute__((always_inline)) 1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1044 { 1045 struct virtio_net_data_ll *dev_ll; 1046 struct ether_hdr *pkt_hdr; 1047 uint64_t ret = 0; 1048 struct virtio_net *dev = vdev->dev; 1049 struct virtio_net *tdev; /* destination virito device */ 1050 1051 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1052 1053 /*get the used devices list*/ 1054 dev_ll = ll_root_used; 1055 1056 while (dev_ll != NULL) { 1057 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1058 &dev_ll->vdev->mac_address)) { 1059 1060 /* Drop the packet if the TX packet is destined for the TX device. */ 1061 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1062 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1063 dev->device_fh); 1064 return 0; 1065 } 1066 tdev = dev_ll->vdev->dev; 1067 1068 1069 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1070 1071 if (unlikely(dev_ll->vdev->remove)) { 1072 /*drop the packet if the device is marked for removal*/ 1073 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1074 } else { 1075 /*send the packet to the local virtio device*/ 1076 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1077 if (enable_stats) { 1078 rte_atomic64_add( 1079 &dev_statistics[tdev->device_fh].rx_total_atomic, 1080 1); 1081 rte_atomic64_add( 1082 &dev_statistics[tdev->device_fh].rx_atomic, 1083 ret); 1084 dev_statistics[dev->device_fh].tx_total++; 1085 dev_statistics[dev->device_fh].tx += ret; 1086 } 1087 } 1088 1089 return 0; 1090 } 1091 dev_ll = dev_ll->next; 1092 } 1093 1094 return -1; 1095 } 1096 1097 /* 1098 * Check if the destination MAC of a packet is one local VM, 1099 * and get its vlan tag, and offset if it is. 1100 */ 1101 static inline int __attribute__((always_inline)) 1102 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1103 uint32_t *offset, uint16_t *vlan_tag) 1104 { 1105 struct virtio_net_data_ll *dev_ll = ll_root_used; 1106 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1107 1108 while (dev_ll != NULL) { 1109 if ((dev_ll->vdev->ready == DEVICE_RX) 1110 && ether_addr_cmp(&(pkt_hdr->d_addr), 1111 &dev_ll->vdev->mac_address)) { 1112 /* 1113 * Drop the packet if the TX packet is 1114 * destined for the TX device. 
1115 */ 1116 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1117 LOG_DEBUG(VHOST_DATA, 1118 "(%"PRIu64") TX: Source and destination" 1119 " MAC addresses are the same. Dropping " 1120 "packet.\n", 1121 dev_ll->vdev->dev->device_fh); 1122 return -1; 1123 } 1124 1125 /* 1126 * HW vlan strip will reduce the packet length 1127 * by minus length of vlan tag, so need restore 1128 * the packet length by plus it. 1129 */ 1130 *offset = VLAN_HLEN; 1131 *vlan_tag = 1132 (uint16_t) 1133 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1134 1135 LOG_DEBUG(VHOST_DATA, 1136 "(%"PRIu64") TX: pkt to local VM device id:" 1137 "(%"PRIu64") vlan tag: %d.\n", 1138 dev->device_fh, dev_ll->vdev->dev->device_fh, 1139 (int)*vlan_tag); 1140 1141 break; 1142 } 1143 dev_ll = dev_ll->next; 1144 } 1145 return 0; 1146 } 1147 1148 static uint16_t 1149 get_psd_sum(void *l3_hdr, uint64_t ol_flags) 1150 { 1151 if (ol_flags & PKT_TX_IPV4) 1152 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags); 1153 else /* assume ethertype == ETHER_TYPE_IPv6 */ 1154 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags); 1155 } 1156 1157 static void virtio_tx_offload(struct rte_mbuf *m) 1158 { 1159 void *l3_hdr; 1160 struct ipv4_hdr *ipv4_hdr = NULL; 1161 struct tcp_hdr *tcp_hdr = NULL; 1162 struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1163 1164 l3_hdr = (char *)eth_hdr + m->l2_len; 1165 1166 ipv4_hdr = (struct ipv4_hdr *)l3_hdr; 1167 tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len); 1168 m->ol_flags |= PKT_TX_IP_CKSUM; 1169 ipv4_hdr->hdr_checksum = 0; 1170 tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags); 1171 } 1172 1173 /* 1174 * This function routes the TX packet to the correct interface. This may be a local device 1175 * or the physical port. 1176 */ 1177 static inline void __attribute__((always_inline)) 1178 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1179 { 1180 struct mbuf_table *tx_q; 1181 struct rte_mbuf **m_table; 1182 unsigned len, ret, offset = 0; 1183 const uint16_t lcore_id = rte_lcore_id(); 1184 struct virtio_net *dev = vdev->dev; 1185 struct ether_hdr *nh; 1186 1187 /*check if destination is local VM*/ 1188 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1189 rte_pktmbuf_free(m); 1190 return; 1191 } 1192 1193 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1194 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1195 rte_pktmbuf_free(m); 1196 return; 1197 } 1198 } 1199 1200 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1201 1202 /*Add packet to the port tx queue*/ 1203 tx_q = &lcore_tx_queue[lcore_id]; 1204 len = tx_q->len; 1205 1206 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1207 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1208 /* Guest has inserted the vlan tag. */ 1209 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1210 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1211 if ((vm2vm_mode == VM2VM_HARDWARE) && 1212 (vh->vlan_tci != vlan_tag_be)) 1213 vh->vlan_tci = vlan_tag_be; 1214 } else { 1215 m->ol_flags |= PKT_TX_VLAN_PKT; 1216 1217 /* 1218 * Find the right seg to adjust the data len when offset is 1219 * bigger than tail room size. 
1220 */ 1221 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1222 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1223 m->data_len += offset; 1224 else { 1225 struct rte_mbuf *seg = m; 1226 1227 while ((seg->next != NULL) && 1228 (offset > rte_pktmbuf_tailroom(seg))) 1229 seg = seg->next; 1230 1231 seg->data_len += offset; 1232 } 1233 m->pkt_len += offset; 1234 } 1235 1236 m->vlan_tci = vlan_tag; 1237 } 1238 1239 if (m->ol_flags & PKT_TX_TCP_SEG) 1240 virtio_tx_offload(m); 1241 1242 tx_q->m_table[len] = m; 1243 len++; 1244 if (enable_stats) { 1245 dev_statistics[dev->device_fh].tx_total++; 1246 dev_statistics[dev->device_fh].tx++; 1247 } 1248 1249 if (unlikely(len == MAX_PKT_BURST)) { 1250 m_table = (struct rte_mbuf **)tx_q->m_table; 1251 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1252 /* Free any buffers not handled by TX and update the port stats. */ 1253 if (unlikely(ret < len)) { 1254 do { 1255 rte_pktmbuf_free(m_table[ret]); 1256 } while (++ret < len); 1257 } 1258 1259 len = 0; 1260 } 1261 1262 tx_q->len = len; 1263 return; 1264 } 1265 /* 1266 * This function is called by each data core. It handles all RX/TX registered with the 1267 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1268 * with all devices in the main linked list. 1269 */ 1270 static int 1271 switch_worker(__attribute__((unused)) void *arg) 1272 { 1273 struct rte_mempool *mbuf_pool = arg; 1274 struct virtio_net *dev = NULL; 1275 struct vhost_dev *vdev = NULL; 1276 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1277 struct virtio_net_data_ll *dev_ll; 1278 struct mbuf_table *tx_q; 1279 volatile struct lcore_ll_info *lcore_ll; 1280 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1281 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1282 unsigned ret, i; 1283 const uint16_t lcore_id = rte_lcore_id(); 1284 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1285 uint16_t rx_count = 0; 1286 uint16_t tx_count; 1287 uint32_t retry = 0; 1288 1289 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1290 lcore_ll = lcore_info[lcore_id].lcore_ll; 1291 prev_tsc = 0; 1292 1293 tx_q = &lcore_tx_queue[lcore_id]; 1294 for (i = 0; i < num_cores; i ++) { 1295 if (lcore_ids[i] == lcore_id) { 1296 tx_q->txq_id = i; 1297 break; 1298 } 1299 } 1300 1301 while(1) { 1302 cur_tsc = rte_rdtsc(); 1303 /* 1304 * TX burst queue drain 1305 */ 1306 diff_tsc = cur_tsc - prev_tsc; 1307 if (unlikely(diff_tsc > drain_tsc)) { 1308 1309 if (tx_q->len) { 1310 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1311 1312 /*Tx any packets in the queue*/ 1313 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1314 (struct rte_mbuf **)tx_q->m_table, 1315 (uint16_t)tx_q->len); 1316 if (unlikely(ret < tx_q->len)) { 1317 do { 1318 rte_pktmbuf_free(tx_q->m_table[ret]); 1319 } while (++ret < tx_q->len); 1320 } 1321 1322 tx_q->len = 0; 1323 } 1324 1325 prev_tsc = cur_tsc; 1326 1327 } 1328 1329 rte_prefetch0(lcore_ll->ll_root_used); 1330 /* 1331 * Inform the configuration core that we have exited the linked list and that no devices are 1332 * in use if requested. 
1333 */ 1334 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1335 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1336 1337 /* 1338 * Process devices 1339 */ 1340 dev_ll = lcore_ll->ll_root_used; 1341 1342 while (dev_ll != NULL) { 1343 /*get virtio device ID*/ 1344 vdev = dev_ll->vdev; 1345 dev = vdev->dev; 1346 1347 if (unlikely(vdev->remove)) { 1348 dev_ll = dev_ll->next; 1349 unlink_vmdq(vdev); 1350 vdev->ready = DEVICE_SAFE_REMOVE; 1351 continue; 1352 } 1353 if (likely(vdev->ready == DEVICE_RX)) { 1354 /*Handle guest RX*/ 1355 rx_count = rte_eth_rx_burst(ports[0], 1356 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1357 1358 if (rx_count) { 1359 /* 1360 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1361 * Here MAX_PKT_BURST must be less than virtio queue size 1362 */ 1363 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1364 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1365 rte_delay_us(burst_rx_delay_time); 1366 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1367 break; 1368 } 1369 } 1370 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1371 if (enable_stats) { 1372 rte_atomic64_add( 1373 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1374 rx_count); 1375 rte_atomic64_add( 1376 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1377 } 1378 while (likely(rx_count)) { 1379 rx_count--; 1380 rte_pktmbuf_free(pkts_burst[rx_count]); 1381 } 1382 1383 } 1384 } 1385 1386 if (likely(!vdev->remove)) { 1387 /* Handle guest TX*/ 1388 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1389 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1390 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1391 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1392 while (tx_count) 1393 rte_pktmbuf_free(pkts_burst[--tx_count]); 1394 } 1395 } 1396 for (i = 0; i < tx_count; ++i) 1397 virtio_tx_route(vdev, pkts_burst[i], (uint16_t)dev->device_fh); 1398 } 1399 1400 /*move to the next device in the list*/ 1401 dev_ll = dev_ll->next; 1402 } 1403 } 1404 1405 return 0; 1406 } 1407 1408 /* 1409 * This function gets available ring number for zero copy rx. 1410 * Only one thread will call this funciton for a paticular virtio device, 1411 * so, it is designed as non-thread-safe function. 1412 */ 1413 static inline uint32_t __attribute__((always_inline)) 1414 get_available_ring_num_zcp(struct virtio_net *dev) 1415 { 1416 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1417 uint16_t avail_idx; 1418 1419 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1420 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1421 } 1422 1423 /* 1424 * This function gets available ring index for zero copy rx, 1425 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1426 * Only one thread will call this funciton for a paticular virtio device, 1427 * so, it is designed as non-thread-safe function. 
1428 */ 1429 static inline uint32_t __attribute__((always_inline)) 1430 get_available_ring_index_zcp(struct virtio_net *dev, 1431 uint16_t *res_base_idx, uint32_t count) 1432 { 1433 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1434 uint16_t avail_idx; 1435 uint32_t retry = 0; 1436 uint16_t free_entries; 1437 1438 *res_base_idx = vq->last_used_idx_res; 1439 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1440 free_entries = (avail_idx - *res_base_idx); 1441 1442 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: " 1443 "avail idx: %d, " 1444 "res base idx:%d, free entries:%d\n", 1445 dev->device_fh, avail_idx, *res_base_idx, 1446 free_entries); 1447 1448 /* 1449 * If retry is enabled and the queue is full then we wait 1450 * and retry to avoid packet loss. 1451 */ 1452 if (enable_retry && unlikely(count > free_entries)) { 1453 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1454 rte_delay_us(burst_rx_delay_time); 1455 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1456 free_entries = (avail_idx - *res_base_idx); 1457 if (count <= free_entries) 1458 break; 1459 } 1460 } 1461 1462 /*check that we have enough buffers*/ 1463 if (unlikely(count > free_entries)) 1464 count = free_entries; 1465 1466 if (unlikely(count == 0)) { 1467 LOG_DEBUG(VHOST_DATA, 1468 "(%"PRIu64") Fail in get_available_ring_index_zcp: " 1469 "avail idx: %d, res base idx:%d, free entries:%d\n", 1470 dev->device_fh, avail_idx, 1471 *res_base_idx, free_entries); 1472 return 0; 1473 } 1474 1475 vq->last_used_idx_res = *res_base_idx + count; 1476 1477 return count; 1478 } 1479 1480 /* 1481 * This function put descriptor back to used list. 1482 */ 1483 static inline void __attribute__((always_inline)) 1484 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx) 1485 { 1486 uint16_t res_cur_idx = vq->last_used_idx; 1487 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx; 1488 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0; 1489 rte_compiler_barrier(); 1490 *(volatile uint16_t *)&vq->used->idx += 1; 1491 vq->last_used_idx += 1; 1492 1493 /* Kick the guest if necessary. */ 1494 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1495 eventfd_write(vq->callfd, (eventfd_t)1); 1496 } 1497 1498 /* 1499 * This function get available descriptor from vitio vring and un-attached mbuf 1500 * from vpool->ring, and then attach them together. It needs adjust the offset 1501 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the 1502 * frame data may be put to wrong location in mbuf. 
1503 */ 1504 static inline void __attribute__((always_inline)) 1505 attach_rxmbuf_zcp(struct virtio_net *dev) 1506 { 1507 uint16_t res_base_idx, desc_idx; 1508 uint64_t buff_addr, phys_addr; 1509 struct vhost_virtqueue *vq; 1510 struct vring_desc *desc; 1511 void *obj = NULL; 1512 struct rte_mbuf *mbuf; 1513 struct vpool *vpool; 1514 hpa_type addr_type; 1515 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1516 1517 vpool = &vpool_array[vdev->vmdq_rx_q]; 1518 vq = dev->virtqueue[VIRTIO_RXQ]; 1519 1520 do { 1521 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1522 1) != 1)) 1523 return; 1524 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1525 1526 desc = &vq->desc[desc_idx]; 1527 if (desc->flags & VRING_DESC_F_NEXT) { 1528 desc = &vq->desc[desc->next]; 1529 buff_addr = gpa_to_vva(dev, desc->addr); 1530 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1531 &addr_type); 1532 } else { 1533 buff_addr = gpa_to_vva(dev, 1534 desc->addr + vq->vhost_hlen); 1535 phys_addr = gpa_to_hpa(vdev, 1536 desc->addr + vq->vhost_hlen, 1537 desc->len, &addr_type); 1538 } 1539 1540 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1541 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1542 " address found when attaching RX frame buffer" 1543 " address!\n", dev->device_fh); 1544 put_desc_to_used_list_zcp(vq, desc_idx); 1545 continue; 1546 } 1547 1548 /* 1549 * Check if the frame buffer address from guest crosses 1550 * sub-region or not. 1551 */ 1552 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1553 RTE_LOG(ERR, VHOST_DATA, 1554 "(%"PRIu64") Frame buffer address cross " 1555 "sub-regioin found when attaching RX frame " 1556 "buffer address!\n", 1557 dev->device_fh); 1558 put_desc_to_used_list_zcp(vq, desc_idx); 1559 continue; 1560 } 1561 } while (unlikely(phys_addr == 0)); 1562 1563 rte_ring_sc_dequeue(vpool->ring, &obj); 1564 mbuf = obj; 1565 if (unlikely(mbuf == NULL)) { 1566 LOG_DEBUG(VHOST_DATA, 1567 "(%"PRIu64") in attach_rxmbuf_zcp: " 1568 "ring_sc_dequeue fail.\n", 1569 dev->device_fh); 1570 put_desc_to_used_list_zcp(vq, desc_idx); 1571 return; 1572 } 1573 1574 if (unlikely(vpool->buf_size > desc->len)) { 1575 LOG_DEBUG(VHOST_DATA, 1576 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1577 "length(%d) of descriptor idx: %d less than room " 1578 "size required: %d\n", 1579 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1580 put_desc_to_used_list_zcp(vq, desc_idx); 1581 rte_ring_sp_enqueue(vpool->ring, obj); 1582 return; 1583 } 1584 1585 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1586 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1587 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1588 mbuf->data_len = desc->len; 1589 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1590 1591 LOG_DEBUG(VHOST_DATA, 1592 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1593 "descriptor idx:%d\n", 1594 dev->device_fh, res_base_idx, desc_idx); 1595 1596 __rte_mbuf_raw_free(mbuf); 1597 1598 return; 1599 } 1600 1601 /* 1602 * Detach an attched packet mbuf - 1603 * - restore original mbuf address and length values. 1604 * - reset pktmbuf data and data_len to their default values. 1605 * All other fields of the given packet mbuf will be left intact. 1606 * 1607 * @param m 1608 * The attached packet mbuf. 
1609 */ 1610 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) 1611 { 1612 const struct rte_mempool *mp = m->pool; 1613 void *buf = rte_mbuf_to_baddr(m); 1614 uint32_t buf_ofs; 1615 uint32_t buf_len = mp->elt_size - sizeof(*m); 1616 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m); 1617 1618 m->buf_addr = buf; 1619 m->buf_len = (uint16_t)buf_len; 1620 1621 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 1622 RTE_PKTMBUF_HEADROOM : m->buf_len; 1623 m->data_off = buf_ofs; 1624 1625 m->data_len = 0; 1626 } 1627 1628 /* 1629 * This function is called after packets have been transimited. It fetchs mbuf 1630 * from vpool->pool, detached it and put into vpool->ring. It also update the 1631 * used index and kick the guest if necessary. 1632 */ 1633 static inline uint32_t __attribute__((always_inline)) 1634 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool) 1635 { 1636 struct rte_mbuf *mbuf; 1637 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1638 uint32_t used_idx = vq->last_used_idx & (vq->size - 1); 1639 uint32_t index = 0; 1640 uint32_t mbuf_count = rte_mempool_count(vpool->pool); 1641 1642 LOG_DEBUG(VHOST_DATA, 1643 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before " 1644 "clean is: %d\n", 1645 dev->device_fh, mbuf_count); 1646 LOG_DEBUG(VHOST_DATA, 1647 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before " 1648 "clean is : %d\n", 1649 dev->device_fh, rte_ring_count(vpool->ring)); 1650 1651 for (index = 0; index < mbuf_count; index++) { 1652 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1653 if (likely(MBUF_EXT_MEM(mbuf))) 1654 pktmbuf_detach_zcp(mbuf); 1655 rte_ring_sp_enqueue(vpool->ring, mbuf); 1656 1657 /* Update used index buffer information. */ 1658 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); 1659 vq->used->ring[used_idx].len = 0; 1660 1661 used_idx = (used_idx + 1) & (vq->size - 1); 1662 } 1663 1664 LOG_DEBUG(VHOST_DATA, 1665 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after " 1666 "clean is: %d\n", 1667 dev->device_fh, rte_mempool_count(vpool->pool)); 1668 LOG_DEBUG(VHOST_DATA, 1669 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after " 1670 "clean is : %d\n", 1671 dev->device_fh, rte_ring_count(vpool->ring)); 1672 LOG_DEBUG(VHOST_DATA, 1673 "(%"PRIu64") in txmbuf_clean_zcp: before updated " 1674 "vq->last_used_idx:%d\n", 1675 dev->device_fh, vq->last_used_idx); 1676 1677 vq->last_used_idx += mbuf_count; 1678 1679 LOG_DEBUG(VHOST_DATA, 1680 "(%"PRIu64") in txmbuf_clean_zcp: after updated " 1681 "vq->last_used_idx:%d\n", 1682 dev->device_fh, vq->last_used_idx); 1683 1684 rte_compiler_barrier(); 1685 1686 *(volatile uint16_t *)&vq->used->idx += mbuf_count; 1687 1688 /* Kick guest if required. */ 1689 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1690 eventfd_write(vq->callfd, (eventfd_t)1); 1691 1692 return 0; 1693 } 1694 1695 /* 1696 * This function is called when a virtio device is destroy. 1697 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring. 
1698 */ 1699 static void mbuf_destroy_zcp(struct vpool *vpool) 1700 { 1701 struct rte_mbuf *mbuf = NULL; 1702 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1703 1704 LOG_DEBUG(VHOST_CONFIG, 1705 "in mbuf_destroy_zcp: mbuf count in mempool before " 1706 "mbuf_destroy_zcp is: %d\n", 1707 mbuf_count); 1708 LOG_DEBUG(VHOST_CONFIG, 1709 "in mbuf_destroy_zcp: mbuf count in ring before " 1710 "mbuf_destroy_zcp is : %d\n", 1711 rte_ring_count(vpool->ring)); 1712 1713 for (index = 0; index < mbuf_count; index++) { 1714 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1715 if (likely(mbuf != NULL)) { 1716 if (likely(MBUF_EXT_MEM(mbuf))) 1717 pktmbuf_detach_zcp(mbuf); 1718 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1719 } 1720 } 1721 1722 LOG_DEBUG(VHOST_CONFIG, 1723 "in mbuf_destroy_zcp: mbuf count in mempool after " 1724 "mbuf_destroy_zcp is: %d\n", 1725 rte_mempool_count(vpool->pool)); 1726 LOG_DEBUG(VHOST_CONFIG, 1727 "in mbuf_destroy_zcp: mbuf count in ring after " 1728 "mbuf_destroy_zcp is : %d\n", 1729 rte_ring_count(vpool->ring)); 1730 } 1731 1732 /* 1733 * This function update the use flag and counter. 1734 */ 1735 static inline uint32_t __attribute__((always_inline)) 1736 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1737 uint32_t count) 1738 { 1739 struct vhost_virtqueue *vq; 1740 struct vring_desc *desc; 1741 struct rte_mbuf *buff; 1742 /* The virtio_hdr is initialised to 0. */ 1743 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1744 = {{0, 0, 0, 0, 0, 0}, 0}; 1745 uint64_t buff_hdr_addr = 0; 1746 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1747 uint32_t head_idx, packet_success = 0; 1748 uint16_t res_cur_idx; 1749 1750 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1751 1752 if (count == 0) 1753 return 0; 1754 1755 vq = dev->virtqueue[VIRTIO_RXQ]; 1756 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1757 1758 res_cur_idx = vq->last_used_idx; 1759 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1760 dev->device_fh, res_cur_idx, res_cur_idx + count); 1761 1762 /* Retrieve all of the head indexes first to avoid caching issues. */ 1763 for (head_idx = 0; head_idx < count; head_idx++) 1764 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1765 1766 /*Prefetch descriptor index. */ 1767 rte_prefetch0(&vq->desc[head[packet_success]]); 1768 1769 while (packet_success != count) { 1770 /* Get descriptor from available ring */ 1771 desc = &vq->desc[head[packet_success]]; 1772 1773 buff = pkts[packet_success]; 1774 LOG_DEBUG(VHOST_DATA, 1775 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1776 "pkt[%d] descriptor idx: %d\n", 1777 dev->device_fh, packet_success, 1778 MBUF_HEADROOM_UINT32(buff)); 1779 1780 PRINT_PACKET(dev, 1781 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1782 + RTE_PKTMBUF_HEADROOM), 1783 rte_pktmbuf_data_len(buff), 0); 1784 1785 /* Buffer address translation for virtio header. */ 1786 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1787 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1788 1789 /* 1790 * If the descriptors are chained the header and data are 1791 * placed in separate buffers. 
1792 */ 1793 if (desc->flags & VRING_DESC_F_NEXT) { 1794 desc->len = vq->vhost_hlen; 1795 desc = &vq->desc[desc->next]; 1796 desc->len = rte_pktmbuf_data_len(buff); 1797 } else { 1798 desc->len = packet_len; 1799 } 1800 1801 /* Update used ring with desc information */ 1802 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1803 = head[packet_success]; 1804 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1805 = packet_len; 1806 res_cur_idx++; 1807 packet_success++; 1808 1809 /* A header is required per buffer. */ 1810 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1811 (const void *)&virtio_hdr, vq->vhost_hlen); 1812 1813 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1814 1815 if (likely(packet_success < count)) { 1816 /* Prefetch descriptor index. */ 1817 rte_prefetch0(&vq->desc[head[packet_success]]); 1818 } 1819 } 1820 1821 rte_compiler_barrier(); 1822 1823 LOG_DEBUG(VHOST_DATA, 1824 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1825 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1826 dev->device_fh, vq->last_used_idx, vq->used->idx); 1827 1828 *(volatile uint16_t *)&vq->used->idx += count; 1829 vq->last_used_idx += count; 1830 1831 LOG_DEBUG(VHOST_DATA, 1832 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1833 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1834 dev->device_fh, vq->last_used_idx, vq->used->idx); 1835 1836 /* Kick the guest if necessary. */ 1837 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1838 eventfd_write(vq->callfd, (eventfd_t)1); 1839 1840 return count; 1841 } 1842 1843 /* 1844 * This function routes the TX packet to the correct interface. 1845 * This may be a local device or the physical port. 1846 */ 1847 static inline void __attribute__((always_inline)) 1848 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1849 uint32_t desc_idx, uint8_t need_copy) 1850 { 1851 struct mbuf_table *tx_q; 1852 struct rte_mbuf **m_table; 1853 void *obj = NULL; 1854 struct rte_mbuf *mbuf; 1855 unsigned len, ret, offset = 0; 1856 struct vpool *vpool; 1857 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1858 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1859 1860 /*Add packet to the port tx queue*/ 1861 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1862 len = tx_q->len; 1863 1864 /* Allocate an mbuf and populate the structure. */ 1865 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1866 rte_ring_sc_dequeue(vpool->ring, &obj); 1867 mbuf = obj; 1868 if (unlikely(mbuf == NULL)) { 1869 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1870 RTE_LOG(ERR, VHOST_DATA, 1871 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1872 dev->device_fh); 1873 put_desc_to_used_list_zcp(vq, desc_idx); 1874 return; 1875 } 1876 1877 if (vm2vm_mode == VM2VM_HARDWARE) { 1878 /* Avoid using a vlan tag from any vm for external pkt, such as 1879 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1880 * selection, MAC address determines it as an external pkt 1881 * which should go to network, while vlan tag determine it as 1882 * a vm2vm pkt should forward to another vm. Hardware confuse 1883 * such a ambiguous situation, so pkt will lost. 
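* External traffic therefore always carries external_pkt_default_vlan_tag,
* and find_local_dest() is expected to override vlan_tag (and the extra
* offset) only when the destination MAC belongs to another local VM; if it
* reports failure the copy mbuf is released and the packet is dropped.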
1884 */ 1885 vlan_tag = external_pkt_default_vlan_tag; 1886 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1887 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1888 __rte_mbuf_raw_free(mbuf); 1889 return; 1890 } 1891 } 1892 1893 mbuf->nb_segs = m->nb_segs; 1894 mbuf->next = m->next; 1895 mbuf->data_len = m->data_len + offset; 1896 mbuf->pkt_len = mbuf->data_len; 1897 if (unlikely(need_copy)) { 1898 /* Copy the packet contents to the mbuf. */ 1899 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1900 rte_pktmbuf_mtod(m, void *), 1901 m->data_len); 1902 } else { 1903 mbuf->data_off = m->data_off; 1904 mbuf->buf_physaddr = m->buf_physaddr; 1905 mbuf->buf_addr = m->buf_addr; 1906 } 1907 mbuf->ol_flags |= PKT_TX_VLAN_PKT; 1908 mbuf->vlan_tci = vlan_tag; 1909 mbuf->l2_len = sizeof(struct ether_hdr); 1910 mbuf->l3_len = sizeof(struct ipv4_hdr); 1911 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1912 1913 tx_q->m_table[len] = mbuf; 1914 len++; 1915 1916 LOG_DEBUG(VHOST_DATA, 1917 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1918 dev->device_fh, 1919 mbuf->nb_segs, 1920 (mbuf->next == NULL) ? "null" : "non-null"); 1921 1922 if (enable_stats) { 1923 dev_statistics[dev->device_fh].tx_total++; 1924 dev_statistics[dev->device_fh].tx++; 1925 } 1926 1927 if (unlikely(len == MAX_PKT_BURST)) { 1928 m_table = (struct rte_mbuf **)tx_q->m_table; 1929 ret = rte_eth_tx_burst(ports[0], 1930 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1931 1932 /* 1933 * Free any buffers not handled by TX and update 1934 * the port stats. 1935 */ 1936 if (unlikely(ret < len)) { 1937 do { 1938 rte_pktmbuf_free(m_table[ret]); 1939 } while (++ret < len); 1940 } 1941 1942 len = 0; 1943 txmbuf_clean_zcp(dev, vpool); 1944 } 1945 1946 tx_q->len = len; 1947 1948 return; 1949 } 1950 1951 /* 1952 * This function TX all available packets in virtio TX queue for one 1953 * virtio-net device. If it is first packet, it learns MAC address and 1954 * setup VMDQ. 1955 */ 1956 static inline void __attribute__((always_inline)) 1957 virtio_dev_tx_zcp(struct virtio_net *dev) 1958 { 1959 struct rte_mbuf m; 1960 struct vhost_virtqueue *vq; 1961 struct vring_desc *desc; 1962 uint64_t buff_addr = 0, phys_addr; 1963 uint32_t head[MAX_PKT_BURST]; 1964 uint32_t i; 1965 uint16_t free_entries, packet_success = 0; 1966 uint16_t avail_idx; 1967 uint8_t need_copy = 0; 1968 hpa_type addr_type; 1969 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1970 1971 vq = dev->virtqueue[VIRTIO_TXQ]; 1972 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1973 1974 /* If there are no available buffers then return. */ 1975 if (vq->last_used_idx_res == avail_idx) 1976 return; 1977 1978 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1979 1980 /* Prefetch available ring to retrieve head indexes. */ 1981 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1982 1983 /* Get the number of free entries in the ring */ 1984 free_entries = (avail_idx - vq->last_used_idx_res); 1985 1986 /* Limit to MAX_PKT_BURST. */ 1987 free_entries 1988 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1989 1990 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1991 dev->device_fh, free_entries); 1992 1993 /* Retrieve all of the head indexes first to avoid caching issues. */ 1994 for (i = 0; i < free_entries; i++) 1995 head[i] 1996 = vq->avail->ring[(vq->last_used_idx_res + i) 1997 & (vq->size - 1)]; 1998 1999 vq->last_used_idx_res += free_entries; 2000 2001 /* Prefetch descriptor index. 
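* The head indexes were already copied out of the avail ring above, so the
* first descriptor entry and the matching used-ring slot are prefetched here
* to hide cache misses on the guest-shared vring before the TX loop starts.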
*/ 2002 rte_prefetch0(&vq->desc[head[packet_success]]); 2003 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 2004 2005 while (packet_success < free_entries) { 2006 desc = &vq->desc[head[packet_success]]; 2007 2008 /* Discard first buffer as it is the virtio header */ 2009 desc = &vq->desc[desc->next]; 2010 2011 /* Buffer address translation. */ 2012 buff_addr = gpa_to_vva(dev, desc->addr); 2013 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 2014 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 2015 &addr_type); 2016 2017 if (likely(packet_success < (free_entries - 1))) 2018 /* Prefetch descriptor index. */ 2019 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 2020 2021 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 2022 RTE_LOG(ERR, VHOST_DATA, 2023 "(%"PRIu64") Invalid frame buffer address found" 2024 "when TX packets!\n", 2025 dev->device_fh); 2026 packet_success++; 2027 continue; 2028 } 2029 2030 /* Prefetch buffer address. */ 2031 rte_prefetch0((void *)(uintptr_t)buff_addr); 2032 2033 /* 2034 * Setup dummy mbuf. This is copied to a real mbuf if 2035 * transmitted out the physical port. 2036 */ 2037 m.data_len = desc->len; 2038 m.nb_segs = 1; 2039 m.next = NULL; 2040 m.data_off = 0; 2041 m.buf_addr = (void *)(uintptr_t)buff_addr; 2042 m.buf_physaddr = phys_addr; 2043 2044 /* 2045 * Check if the frame buffer address from guest crosses 2046 * sub-region or not. 2047 */ 2048 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2049 RTE_LOG(ERR, VHOST_DATA, 2050 "(%"PRIu64") Frame buffer address cross " 2051 "sub-regioin found when attaching TX frame " 2052 "buffer address!\n", 2053 dev->device_fh); 2054 need_copy = 1; 2055 } else 2056 need_copy = 0; 2057 2058 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2059 2060 /* 2061 * If this is the first received packet we need to learn 2062 * the MAC and setup VMDQ 2063 */ 2064 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2065 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2066 /* 2067 * Discard frame if device is scheduled for 2068 * removal or a duplicate MAC address is found. 2069 */ 2070 packet_success += free_entries; 2071 vq->last_used_idx += packet_success; 2072 break; 2073 } 2074 } 2075 2076 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2077 packet_success++; 2078 } 2079 } 2080 2081 /* 2082 * This function is called by each data core. It handles all RX/TX registered 2083 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2084 * addresses are compared with all devices in the main linked list. 
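* Each iteration first drains any per-device TX queue that has been pending
* longer than BURST_TX_DRAIN_US, then, per device, re-attaches RX mbufs to
* guest buffers, polls the device's VMDQ RX queue, and finally services the
* guest TX ring through virtio_dev_tx_zcp().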
2085 */ 2086 static int 2087 switch_worker_zcp(__attribute__((unused)) void *arg) 2088 { 2089 struct virtio_net *dev = NULL; 2090 struct vhost_dev *vdev = NULL; 2091 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2092 struct virtio_net_data_ll *dev_ll; 2093 struct mbuf_table *tx_q; 2094 volatile struct lcore_ll_info *lcore_ll; 2095 const uint64_t drain_tsc 2096 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2097 * BURST_TX_DRAIN_US; 2098 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2099 unsigned ret; 2100 const uint16_t lcore_id = rte_lcore_id(); 2101 uint16_t count_in_ring, rx_count = 0; 2102 2103 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2104 2105 lcore_ll = lcore_info[lcore_id].lcore_ll; 2106 prev_tsc = 0; 2107 2108 while (1) { 2109 cur_tsc = rte_rdtsc(); 2110 2111 /* TX burst queue drain */ 2112 diff_tsc = cur_tsc - prev_tsc; 2113 if (unlikely(diff_tsc > drain_tsc)) { 2114 /* 2115 * Get mbuf from vpool.pool and detach mbuf and 2116 * put back into vpool.ring. 2117 */ 2118 dev_ll = lcore_ll->ll_root_used; 2119 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2120 /* Get virtio device ID */ 2121 vdev = dev_ll->vdev; 2122 dev = vdev->dev; 2123 2124 if (likely(!vdev->remove)) { 2125 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2126 if (tx_q->len) { 2127 LOG_DEBUG(VHOST_DATA, 2128 "TX queue drained after timeout" 2129 " with burst size %u\n", 2130 tx_q->len); 2131 2132 /* 2133 * Tx any packets in the queue 2134 */ 2135 ret = rte_eth_tx_burst( 2136 ports[0], 2137 (uint16_t)tx_q->txq_id, 2138 (struct rte_mbuf **) 2139 tx_q->m_table, 2140 (uint16_t)tx_q->len); 2141 if (unlikely(ret < tx_q->len)) { 2142 do { 2143 rte_pktmbuf_free( 2144 tx_q->m_table[ret]); 2145 } while (++ret < tx_q->len); 2146 } 2147 tx_q->len = 0; 2148 2149 txmbuf_clean_zcp(dev, 2150 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2151 } 2152 } 2153 dev_ll = dev_ll->next; 2154 } 2155 prev_tsc = cur_tsc; 2156 } 2157 2158 rte_prefetch0(lcore_ll->ll_root_used); 2159 2160 /* 2161 * Inform the configuration core that we have exited the linked 2162 * list and that no devices are in use if requested. 2163 */ 2164 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2165 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2166 2167 /* Process devices */ 2168 dev_ll = lcore_ll->ll_root_used; 2169 2170 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2171 vdev = dev_ll->vdev; 2172 dev = vdev->dev; 2173 if (unlikely(vdev->remove)) { 2174 dev_ll = dev_ll->next; 2175 unlink_vmdq(vdev); 2176 vdev->ready = DEVICE_SAFE_REMOVE; 2177 continue; 2178 } 2179 2180 if (likely(vdev->ready == DEVICE_RX)) { 2181 uint32_t index = vdev->vmdq_rx_q; 2182 uint16_t i; 2183 count_in_ring 2184 = rte_ring_count(vpool_array[index].ring); 2185 uint16_t free_entries 2186 = (uint16_t)get_available_ring_num_zcp(dev); 2187 2188 /* 2189 * Attach all mbufs in vpool.ring and put back 2190 * into vpool.pool. 
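* The attach count is capped by the smallest of the free avail entries, the
* mbufs currently parked in vpool.ring and MAX_PKT_BURST, so the NIC never
* holds more RX buffers than the guest has actually posted.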
2191 */ 2192 for (i = 0; 2193 i < RTE_MIN(free_entries, 2194 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2195 i++) 2196 attach_rxmbuf_zcp(dev); 2197 2198 /* Handle guest RX */ 2199 rx_count = rte_eth_rx_burst(ports[0], 2200 vdev->vmdq_rx_q, pkts_burst, 2201 MAX_PKT_BURST); 2202 2203 if (rx_count) { 2204 ret_count = virtio_dev_rx_zcp(dev, 2205 pkts_burst, rx_count); 2206 if (enable_stats) { 2207 dev_statistics[dev->device_fh].rx_total 2208 += rx_count; 2209 dev_statistics[dev->device_fh].rx 2210 += ret_count; 2211 } 2212 while (likely(rx_count)) { 2213 rx_count--; 2214 pktmbuf_detach_zcp( 2215 pkts_burst[rx_count]); 2216 rte_ring_sp_enqueue( 2217 vpool_array[index].ring, 2218 (void *)pkts_burst[rx_count]); 2219 } 2220 } 2221 } 2222 2223 if (likely(!vdev->remove)) 2224 /* Handle guest TX */ 2225 virtio_dev_tx_zcp(dev); 2226 2227 /* Move to the next device in the list */ 2228 dev_ll = dev_ll->next; 2229 } 2230 } 2231 2232 return 0; 2233 } 2234 2235 2236 /* 2237 * Add an entry to a used linked list. A free entry must first be found 2238 * in the free linked list using get_data_ll_free_entry(); 2239 */ 2240 static void 2241 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2242 struct virtio_net_data_ll *ll_dev) 2243 { 2244 struct virtio_net_data_ll *ll = *ll_root_addr; 2245 2246 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2247 ll_dev->next = NULL; 2248 rte_compiler_barrier(); 2249 2250 /* If ll == NULL then this is the first device. */ 2251 if (ll) { 2252 /* Increment to the tail of the linked list. */ 2253 while ((ll->next != NULL) ) 2254 ll = ll->next; 2255 2256 ll->next = ll_dev; 2257 } else { 2258 *ll_root_addr = ll_dev; 2259 } 2260 } 2261 2262 /* 2263 * Remove an entry from a used linked list. The entry must then be added to 2264 * the free linked list using put_data_ll_free_entry(). 2265 */ 2266 static void 2267 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2268 struct virtio_net_data_ll *ll_dev, 2269 struct virtio_net_data_ll *ll_dev_last) 2270 { 2271 struct virtio_net_data_ll *ll = *ll_root_addr; 2272 2273 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2274 return; 2275 2276 if (ll_dev == ll) 2277 *ll_root_addr = ll_dev->next; 2278 else 2279 if (likely(ll_dev_last != NULL)) 2280 ll_dev_last->next = ll_dev->next; 2281 else 2282 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2283 } 2284 2285 /* 2286 * Find and return an entry from the free linked list. 2287 */ 2288 static struct virtio_net_data_ll * 2289 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2290 { 2291 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2292 struct virtio_net_data_ll *ll_dev; 2293 2294 if (ll_free == NULL) 2295 return NULL; 2296 2297 ll_dev = ll_free; 2298 *ll_root_addr = ll_free->next; 2299 2300 return ll_dev; 2301 } 2302 2303 /* 2304 * Place an entry back on to the free linked list. 2305 */ 2306 static void 2307 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2308 struct virtio_net_data_ll *ll_dev) 2309 { 2310 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2311 2312 if (ll_dev == NULL) 2313 return; 2314 2315 ll_dev->next = ll_free; 2316 *ll_root_addr = ll_dev; 2317 } 2318 2319 /* 2320 * Creates a linked list of a given size. 2321 */ 2322 static struct virtio_net_data_ll * 2323 alloc_data_ll(uint32_t size) 2324 { 2325 struct virtio_net_data_ll *ll_new; 2326 uint32_t i; 2327 2328 /* Malloc and then chain the linked list. 
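* All entries come from one contiguous allocation and are chained into a
* free list whose last next pointer is NULL; callers take entries with
* get_data_ll_free_entry() and hand them back with put_data_ll_free_entry().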
*/ 2329 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2330 if (ll_new == NULL) { 2331 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2332 return NULL; 2333 } 2334 2335 for (i = 0; i < size - 1; i++) { 2336 ll_new[i].vdev = NULL; 2337 ll_new[i].next = &ll_new[i+1]; 2338 } 2339 ll_new[i].next = NULL; 2340 2341 return ll_new; 2342 } 2343 2344 /* 2345 * Create the main linked list along with each individual cores linked list. A used and a free list 2346 * are created to manage entries. 2347 */ 2348 static int 2349 init_data_ll (void) 2350 { 2351 int lcore; 2352 2353 RTE_LCORE_FOREACH_SLAVE(lcore) { 2354 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2355 if (lcore_info[lcore].lcore_ll == NULL) { 2356 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2357 return -1; 2358 } 2359 2360 lcore_info[lcore].lcore_ll->device_num = 0; 2361 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2362 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2363 if (num_devices % num_switching_cores) 2364 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2365 else 2366 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2367 } 2368 2369 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2370 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2371 2372 return 0; 2373 } 2374 2375 /* 2376 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2377 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2378 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2379 */ 2380 static void 2381 destroy_device (volatile struct virtio_net *dev) 2382 { 2383 struct virtio_net_data_ll *ll_lcore_dev_cur; 2384 struct virtio_net_data_ll *ll_main_dev_cur; 2385 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2386 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2387 struct vhost_dev *vdev; 2388 int lcore; 2389 2390 dev->flags &= ~VIRTIO_DEV_RUNNING; 2391 2392 vdev = (struct vhost_dev *)dev->priv; 2393 /*set the remove flag. */ 2394 vdev->remove = 1; 2395 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2396 rte_pause(); 2397 } 2398 2399 /* Search for entry to be removed from lcore ll */ 2400 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2401 while (ll_lcore_dev_cur != NULL) { 2402 if (ll_lcore_dev_cur->vdev == vdev) { 2403 break; 2404 } else { 2405 ll_lcore_dev_last = ll_lcore_dev_cur; 2406 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2407 } 2408 } 2409 2410 if (ll_lcore_dev_cur == NULL) { 2411 RTE_LOG(ERR, VHOST_CONFIG, 2412 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2413 dev->device_fh); 2414 return; 2415 } 2416 2417 /* Search for entry to be removed from main ll */ 2418 ll_main_dev_cur = ll_root_used; 2419 ll_main_dev_last = NULL; 2420 while (ll_main_dev_cur != NULL) { 2421 if (ll_main_dev_cur->vdev == vdev) { 2422 break; 2423 } else { 2424 ll_main_dev_last = ll_main_dev_cur; 2425 ll_main_dev_cur = ll_main_dev_cur->next; 2426 } 2427 } 2428 2429 /* Remove entries from the lcore and main ll. */ 2430 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2431 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2432 2433 /* Set the dev_removal_flag on each lcore. 
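* Each worker turns REQUEST_DEV_REMOVAL back into ACK_DEV_REMOVAL at the top
* of its polling loop, after it has finished walking the linked list, so the
* wait below guarantees no data core still references the removed entries.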
*/ 2434 RTE_LCORE_FOREACH_SLAVE(lcore) { 2435 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2436 } 2437 2438 /* 2439 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2440 * they can no longer access the device removed from the linked lists and that the devices 2441 * are no longer in use. 2442 */ 2443 RTE_LCORE_FOREACH_SLAVE(lcore) { 2444 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2445 rte_pause(); 2446 } 2447 } 2448 2449 /* Add the entries back to the lcore and main free ll.*/ 2450 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2451 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2452 2453 /* Decrement number of device on the lcore. */ 2454 lcore_info[vdev->coreid].lcore_ll->device_num--; 2455 2456 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2457 2458 if (zero_copy) { 2459 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2460 2461 /* Stop the RX queue. */ 2462 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2463 LOG_DEBUG(VHOST_CONFIG, 2464 "(%"PRIu64") In destroy_device: Failed to stop " 2465 "rx queue:%d\n", 2466 dev->device_fh, 2467 vdev->vmdq_rx_q); 2468 } 2469 2470 LOG_DEBUG(VHOST_CONFIG, 2471 "(%"PRIu64") in destroy_device: Start put mbuf in " 2472 "mempool back to ring for RX queue: %d\n", 2473 dev->device_fh, vdev->vmdq_rx_q); 2474 2475 mbuf_destroy_zcp(vpool); 2476 2477 /* Stop the TX queue. */ 2478 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2479 LOG_DEBUG(VHOST_CONFIG, 2480 "(%"PRIu64") In destroy_device: Failed to " 2481 "stop tx queue:%d\n", 2482 dev->device_fh, vdev->vmdq_rx_q); 2483 } 2484 2485 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2486 2487 LOG_DEBUG(VHOST_CONFIG, 2488 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2489 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2490 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2491 dev->device_fh); 2492 2493 mbuf_destroy_zcp(vpool); 2494 rte_free(vdev->regions_hpa); 2495 } 2496 rte_free(vdev); 2497 2498 } 2499 2500 /* 2501 * Calculate the region count of physical continous regions for one particular 2502 * region of whose vhost virtual address is continous. The particular region 2503 * start from vva_start, with size of 'size' in argument. 
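* As an illustration (hypothetical addresses): with 4 KB pages, a 16 KB
* region whose pages map to host physical 0x1000, 0x2000, 0x8000 and 0x9000
* has a single discontinuity at the 0x2000 -> 0x8000 jump, so one extra
* sub-region is counted; an unaligned vva_start or size yields 0.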
2504 */ 2505 static uint32_t 2506 check_hpa_regions(uint64_t vva_start, uint64_t size) 2507 { 2508 uint32_t i, nregions = 0, page_size = getpagesize(); 2509 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2510 if (vva_start % page_size) { 2511 LOG_DEBUG(VHOST_CONFIG, 2512 "in check_countinous: vva start(%p) mod page_size(%d) " 2513 "has remainder\n", 2514 (void *)(uintptr_t)vva_start, page_size); 2515 return 0; 2516 } 2517 if (size % page_size) { 2518 LOG_DEBUG(VHOST_CONFIG, 2519 "in check_countinous: " 2520 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2521 size, page_size); 2522 return 0; 2523 } 2524 for (i = 0; i < size - page_size; i = i + page_size) { 2525 cur_phys_addr 2526 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2527 next_phys_addr = rte_mem_virt2phy( 2528 (void *)(uintptr_t)(vva_start + i + page_size)); 2529 if ((cur_phys_addr + page_size) != next_phys_addr) { 2530 ++nregions; 2531 LOG_DEBUG(VHOST_CONFIG, 2532 "in check_continuous: hva addr:(%p) is not " 2533 "continuous with hva addr:(%p), diff:%d\n", 2534 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2535 (void *)(uintptr_t)(vva_start + (uint64_t)i 2536 + page_size), page_size); 2537 LOG_DEBUG(VHOST_CONFIG, 2538 "in check_continuous: hpa addr:(%p) is not " 2539 "continuous with hpa addr:(%p), " 2540 "diff:(%"PRIu64")\n", 2541 (void *)(uintptr_t)cur_phys_addr, 2542 (void *)(uintptr_t)next_phys_addr, 2543 (next_phys_addr-cur_phys_addr)); 2544 } 2545 } 2546 return nregions; 2547 } 2548 2549 /* 2550 * Divide each region whose vhost virtual address is continous into a few 2551 * sub-regions, make sure the physical address within each sub-region are 2552 * continous. And fill offset(to GPA) and size etc. information of each 2553 * sub-region into regions_hpa. 2554 */ 2555 static uint32_t 2556 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2557 { 2558 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2559 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2560 2561 if (mem_region_hpa == NULL) 2562 return 0; 2563 2564 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2565 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2566 virtio_memory->regions[regionidx].address_offset; 2567 mem_region_hpa[regionidx_hpa].guest_phys_address 2568 = virtio_memory->regions[regionidx].guest_phys_address; 2569 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2570 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2571 mem_region_hpa[regionidx_hpa].guest_phys_address; 2572 LOG_DEBUG(VHOST_CONFIG, 2573 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2574 regionidx_hpa, 2575 (void *)(uintptr_t) 2576 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2577 LOG_DEBUG(VHOST_CONFIG, 2578 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2579 regionidx_hpa, 2580 (void *)(uintptr_t) 2581 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2582 for (i = 0, k = 0; 2583 i < virtio_memory->regions[regionidx].memory_size - 2584 page_size; 2585 i += page_size) { 2586 cur_phys_addr = rte_mem_virt2phy( 2587 (void *)(uintptr_t)(vva_start + i)); 2588 next_phys_addr = rte_mem_virt2phy( 2589 (void *)(uintptr_t)(vva_start + 2590 i + page_size)); 2591 if ((cur_phys_addr + page_size) != next_phys_addr) { 2592 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2593 mem_region_hpa[regionidx_hpa].guest_phys_address + 2594 k + page_size; 2595 mem_region_hpa[regionidx_hpa].memory_size 2596 = k + 
page_size; 2597 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2598 "phys addr end [%d]:(%p)\n", 2599 regionidx_hpa, 2600 (void *)(uintptr_t) 2601 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2602 LOG_DEBUG(VHOST_CONFIG, 2603 "in fill_hpa_regions: guest phys addr " 2604 "size [%d]:(%p)\n", 2605 regionidx_hpa, 2606 (void *)(uintptr_t) 2607 (mem_region_hpa[regionidx_hpa].memory_size)); 2608 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2609 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2610 ++regionidx_hpa; 2611 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2612 next_phys_addr - 2613 mem_region_hpa[regionidx_hpa].guest_phys_address; 2614 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2615 " phys addr start[%d]:(%p)\n", 2616 regionidx_hpa, 2617 (void *)(uintptr_t) 2618 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2619 LOG_DEBUG(VHOST_CONFIG, 2620 "in fill_hpa_regions: host phys addr " 2621 "start[%d]:(%p)\n", 2622 regionidx_hpa, 2623 (void *)(uintptr_t) 2624 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2625 k = 0; 2626 } else { 2627 k += page_size; 2628 } 2629 } 2630 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2631 = mem_region_hpa[regionidx_hpa].guest_phys_address 2632 + k + page_size; 2633 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2634 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2635 "[%d]:(%p)\n", regionidx_hpa, 2636 (void *)(uintptr_t) 2637 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2638 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2639 "[%d]:(%p)\n", regionidx_hpa, 2640 (void *)(uintptr_t) 2641 (mem_region_hpa[regionidx_hpa].memory_size)); 2642 ++regionidx_hpa; 2643 } 2644 return regionidx_hpa; 2645 } 2646 2647 /* 2648 * A new device is added to a data core. First the device is added to the main linked list 2649 * and the allocated to a specific data core. 
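* A vhost_dev wrapper is allocated first; in zero copy mode the guest memory
* table is additionally split into host-physically contiguous sub-regions via
* check_hpa_regions()/fill_hpa_memory_regions(), the VMDQ RX queue is derived
* from device_fh, and the device lands on the lcore currently carrying the
* fewest devices.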
2650 */ 2651 static int 2652 new_device (struct virtio_net *dev) 2653 { 2654 struct virtio_net_data_ll *ll_dev; 2655 int lcore, core_add = 0; 2656 uint32_t device_num_min = num_devices; 2657 struct vhost_dev *vdev; 2658 uint32_t regionidx; 2659 2660 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2661 if (vdev == NULL) { 2662 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2663 dev->device_fh); 2664 return -1; 2665 } 2666 vdev->dev = dev; 2667 dev->priv = vdev; 2668 2669 if (zero_copy) { 2670 vdev->nregions_hpa = dev->mem->nregions; 2671 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2672 vdev->nregions_hpa 2673 += check_hpa_regions( 2674 dev->mem->regions[regionidx].guest_phys_address 2675 + dev->mem->regions[regionidx].address_offset, 2676 dev->mem->regions[regionidx].memory_size); 2677 2678 } 2679 2680 vdev->regions_hpa = rte_calloc("vhost hpa region", 2681 vdev->nregions_hpa, 2682 sizeof(struct virtio_memory_regions_hpa), 2683 RTE_CACHE_LINE_SIZE); 2684 if (vdev->regions_hpa == NULL) { 2685 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2686 rte_free(vdev); 2687 return -1; 2688 } 2689 2690 2691 if (fill_hpa_memory_regions( 2692 vdev->regions_hpa, dev->mem 2693 ) != vdev->nregions_hpa) { 2694 2695 RTE_LOG(ERR, VHOST_CONFIG, 2696 "hpa memory regions number mismatch: " 2697 "[%d]\n", vdev->nregions_hpa); 2698 rte_free(vdev->regions_hpa); 2699 rte_free(vdev); 2700 return -1; 2701 } 2702 } 2703 2704 2705 /* Add device to main ll */ 2706 ll_dev = get_data_ll_free_entry(&ll_root_free); 2707 if (ll_dev == NULL) { 2708 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2709 "of %d devices per core has been reached\n", 2710 dev->device_fh, num_devices); 2711 if (vdev->regions_hpa) 2712 rte_free(vdev->regions_hpa); 2713 rte_free(vdev); 2714 return -1; 2715 } 2716 ll_dev->vdev = vdev; 2717 add_data_ll_entry(&ll_root_used, ll_dev); 2718 vdev->vmdq_rx_q 2719 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2720 2721 if (zero_copy) { 2722 uint32_t index = vdev->vmdq_rx_q; 2723 uint32_t count_in_ring, i; 2724 struct mbuf_table *tx_q; 2725 2726 count_in_ring = rte_ring_count(vpool_array[index].ring); 2727 2728 LOG_DEBUG(VHOST_CONFIG, 2729 "(%"PRIu64") in new_device: mbuf count in mempool " 2730 "before attach is: %d\n", 2731 dev->device_fh, 2732 rte_mempool_count(vpool_array[index].pool)); 2733 LOG_DEBUG(VHOST_CONFIG, 2734 "(%"PRIu64") in new_device: mbuf count in ring " 2735 "before attach is : %d\n", 2736 dev->device_fh, count_in_ring); 2737 2738 /* 2739 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
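* Pre-attaching every mbuf in the ring binds each RX descriptor to a guest
* buffer up front; the device's TX and RX hardware queues are only started
* afterwards, and on failure the buffers are detached again through
* mbuf_destroy_zcp() before the device is rejected.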
2740 */ 2741 for (i = 0; i < count_in_ring; i++) 2742 attach_rxmbuf_zcp(dev); 2743 2744 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2745 "mempool after attach is: %d\n", 2746 dev->device_fh, 2747 rte_mempool_count(vpool_array[index].pool)); 2748 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2749 "ring after attach is : %d\n", 2750 dev->device_fh, 2751 rte_ring_count(vpool_array[index].ring)); 2752 2753 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2754 tx_q->txq_id = vdev->vmdq_rx_q; 2755 2756 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2757 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2758 2759 LOG_DEBUG(VHOST_CONFIG, 2760 "(%"PRIu64") In new_device: Failed to start " 2761 "tx queue:%d\n", 2762 dev->device_fh, vdev->vmdq_rx_q); 2763 2764 mbuf_destroy_zcp(vpool); 2765 rte_free(vdev->regions_hpa); 2766 rte_free(vdev); 2767 return -1; 2768 } 2769 2770 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2771 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2772 2773 LOG_DEBUG(VHOST_CONFIG, 2774 "(%"PRIu64") In new_device: Failed to start " 2775 "rx queue:%d\n", 2776 dev->device_fh, vdev->vmdq_rx_q); 2777 2778 /* Stop the TX queue. */ 2779 if (rte_eth_dev_tx_queue_stop(ports[0], 2780 vdev->vmdq_rx_q) != 0) { 2781 LOG_DEBUG(VHOST_CONFIG, 2782 "(%"PRIu64") In new_device: Failed to " 2783 "stop tx queue:%d\n", 2784 dev->device_fh, vdev->vmdq_rx_q); 2785 } 2786 2787 mbuf_destroy_zcp(vpool); 2788 rte_free(vdev->regions_hpa); 2789 rte_free(vdev); 2790 return -1; 2791 } 2792 2793 } 2794 2795 /*reset ready flag*/ 2796 vdev->ready = DEVICE_MAC_LEARNING; 2797 vdev->remove = 0; 2798 2799 /* Find a suitable lcore to add the device. */ 2800 RTE_LCORE_FOREACH_SLAVE(lcore) { 2801 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2802 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2803 core_add = lcore; 2804 } 2805 } 2806 /* Add device to lcore ll */ 2807 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2808 if (ll_dev == NULL) { 2809 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2810 vdev->ready = DEVICE_SAFE_REMOVE; 2811 destroy_device(dev); 2812 rte_free(vdev->regions_hpa); 2813 rte_free(vdev); 2814 return -1; 2815 } 2816 ll_dev->vdev = vdev; 2817 vdev->coreid = core_add; 2818 2819 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2820 2821 /* Initialize device stats */ 2822 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2823 2824 /* Disable notifications. */ 2825 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2826 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2827 lcore_info[vdev->coreid].lcore_ll->device_num++; 2828 dev->flags |= VIRTIO_DEV_RUNNING; 2829 2830 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2831 2832 return 0; 2833 } 2834 2835 /* 2836 * These callback allow devices to be added to the data core when configuration 2837 * has been fully complete. 2838 */ 2839 static const struct virtio_net_device_ops virtio_net_device_ops = 2840 { 2841 .new_device = new_device, 2842 .destroy_device = destroy_device, 2843 }; 2844 2845 /* 2846 * This is a thread will wake up after a period to print stats if the user has 2847 * enabled them. 
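* It sleeps for enable_stats seconds per pass and walks the main used list;
* without zero copy the RX counters are read through their rte_atomic64
* variants, presumably because they are updated outside this thread, while
* the zero copy path reads the plain counters.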
2848 */ 2849 static void 2850 print_stats(void) 2851 { 2852 struct virtio_net_data_ll *dev_ll; 2853 uint64_t tx_dropped, rx_dropped; 2854 uint64_t tx, tx_total, rx, rx_total; 2855 uint32_t device_fh; 2856 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2857 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2858 2859 while(1) { 2860 sleep(enable_stats); 2861 2862 /* Clear screen and move to top left */ 2863 printf("%s%s", clr, top_left); 2864 2865 printf("\nDevice statistics ===================================="); 2866 2867 dev_ll = ll_root_used; 2868 while (dev_ll != NULL) { 2869 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2870 tx_total = dev_statistics[device_fh].tx_total; 2871 tx = dev_statistics[device_fh].tx; 2872 tx_dropped = tx_total - tx; 2873 if (zero_copy == 0) { 2874 rx_total = rte_atomic64_read( 2875 &dev_statistics[device_fh].rx_total_atomic); 2876 rx = rte_atomic64_read( 2877 &dev_statistics[device_fh].rx_atomic); 2878 } else { 2879 rx_total = dev_statistics[device_fh].rx_total; 2880 rx = dev_statistics[device_fh].rx; 2881 } 2882 rx_dropped = rx_total - rx; 2883 2884 printf("\nStatistics for device %"PRIu32" ------------------------------" 2885 "\nTX total: %"PRIu64"" 2886 "\nTX dropped: %"PRIu64"" 2887 "\nTX successful: %"PRIu64"" 2888 "\nRX total: %"PRIu64"" 2889 "\nRX dropped: %"PRIu64"" 2890 "\nRX successful: %"PRIu64"", 2891 device_fh, 2892 tx_total, 2893 tx_dropped, 2894 tx, 2895 rx_total, 2896 rx_dropped, 2897 rx); 2898 2899 dev_ll = dev_ll->next; 2900 } 2901 printf("\n======================================================\n"); 2902 } 2903 } 2904 2905 static void 2906 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2907 char *ring_name, uint32_t nb_mbuf) 2908 { 2909 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2910 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2911 if (vpool_array[index].pool != NULL) { 2912 vpool_array[index].ring 2913 = rte_ring_create(ring_name, 2914 rte_align32pow2(nb_mbuf + 1), 2915 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2916 if (likely(vpool_array[index].ring != NULL)) { 2917 LOG_DEBUG(VHOST_CONFIG, 2918 "in setup_mempool_tbl: mbuf count in " 2919 "mempool is: %d\n", 2920 rte_mempool_count(vpool_array[index].pool)); 2921 LOG_DEBUG(VHOST_CONFIG, 2922 "in setup_mempool_tbl: mbuf count in " 2923 "ring is: %d\n", 2924 rte_ring_count(vpool_array[index].ring)); 2925 } else { 2926 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2927 ring_name); 2928 } 2929 2930 /* Need consider head room. */ 2931 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2932 } else { 2933 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2934 } 2935 } 2936 2937 /* When we receive a INT signal, unregister vhost driver */ 2938 static void 2939 sigint_handler(__rte_unused int signum) 2940 { 2941 /* Unregister vhost driver. */ 2942 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2943 if (ret != 0) 2944 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2945 exit(0); 2946 } 2947 2948 /* 2949 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2950 * device is also registered here to handle the IOCTLs. 
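* Rough flow: EAL and application argument parsing, mbuf pool/ring setup
* (one shared pool without zero copy, a pool plus ring per queue with it),
* port initialisation, linked list and statistics setup, an optional
* print-stats thread, launching switch_worker or switch_worker_zcp on every
* slave lcore, and finally vhost driver registration, callback registration
* and rte_vhost_driver_session_start().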
2951 */ 2952 int 2953 main(int argc, char *argv[]) 2954 { 2955 struct rte_mempool *mbuf_pool = NULL; 2956 unsigned lcore_id, core_id = 0; 2957 unsigned nb_ports, valid_num_ports; 2958 int ret; 2959 uint8_t portid; 2960 uint16_t queue_id; 2961 static pthread_t tid; 2962 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2963 2964 signal(SIGINT, sigint_handler); 2965 2966 /* init EAL */ 2967 ret = rte_eal_init(argc, argv); 2968 if (ret < 0) 2969 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2970 argc -= ret; 2971 argv += ret; 2972 2973 /* parse app arguments */ 2974 ret = us_vhost_parse_args(argc, argv); 2975 if (ret < 0) 2976 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2977 2978 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2979 if (rte_lcore_is_enabled(lcore_id)) 2980 lcore_ids[core_id ++] = lcore_id; 2981 2982 if (rte_lcore_count() > RTE_MAX_LCORE) 2983 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2984 2985 /*set the number of swithcing cores available*/ 2986 num_switching_cores = rte_lcore_count()-1; 2987 2988 /* Get the number of physical ports. */ 2989 nb_ports = rte_eth_dev_count(); 2990 if (nb_ports > RTE_MAX_ETHPORTS) 2991 nb_ports = RTE_MAX_ETHPORTS; 2992 2993 /* 2994 * Update the global var NUM_PORTS and global array PORTS 2995 * and get value of var VALID_NUM_PORTS according to system ports number 2996 */ 2997 valid_num_ports = check_ports_num(nb_ports); 2998 2999 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 3000 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 3001 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 3002 return -1; 3003 } 3004 3005 if (zero_copy == 0) { 3006 /* Create the mbuf pool. */ 3007 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 3008 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 3009 0, MBUF_DATA_SIZE, rte_socket_id()); 3010 if (mbuf_pool == NULL) 3011 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 3012 3013 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 3014 vpool_array[queue_id].pool = mbuf_pool; 3015 3016 if (vm2vm_mode == VM2VM_HARDWARE) { 3017 /* Enable VT loop back to let L2 switch to do it. */ 3018 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3019 LOG_DEBUG(VHOST_CONFIG, 3020 "Enable loop back for L2 switch in vmdq.\n"); 3021 } 3022 } else { 3023 uint32_t nb_mbuf; 3024 char pool_name[RTE_MEMPOOL_NAMESIZE]; 3025 char ring_name[RTE_MEMPOOL_NAMESIZE]; 3026 3027 nb_mbuf = num_rx_descriptor 3028 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3029 + num_switching_cores * MAX_PKT_BURST; 3030 3031 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3032 snprintf(pool_name, sizeof(pool_name), 3033 "rxmbuf_pool_%u", queue_id); 3034 snprintf(ring_name, sizeof(ring_name), 3035 "rxmbuf_ring_%u", queue_id); 3036 setup_mempool_tbl(rte_socket_id(), queue_id, 3037 pool_name, ring_name, nb_mbuf); 3038 } 3039 3040 nb_mbuf = num_tx_descriptor 3041 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3042 + num_switching_cores * MAX_PKT_BURST; 3043 3044 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3045 snprintf(pool_name, sizeof(pool_name), 3046 "txmbuf_pool_%u", queue_id); 3047 snprintf(ring_name, sizeof(ring_name), 3048 "txmbuf_ring_%u", queue_id); 3049 setup_mempool_tbl(rte_socket_id(), 3050 (queue_id + MAX_QUEUES), 3051 pool_name, ring_name, nb_mbuf); 3052 } 3053 3054 if (vm2vm_mode == VM2VM_HARDWARE) { 3055 /* Enable VT loop back to let L2 switch to do it. 
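* With loop back enabled the NIC's own L2 switch is expected to forward
* VM-to-VM frames between VMDQ pools, which is what lets VM2VM_HARDWARE mode
* avoid the software forwarding path.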
*/ 3056 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3057 LOG_DEBUG(VHOST_CONFIG, 3058 "Enable loop back for L2 switch in vmdq.\n"); 3059 } 3060 } 3061 /* Set log level. */ 3062 rte_set_log_level(LOG_LEVEL); 3063 3064 /* initialize all ports */ 3065 for (portid = 0; portid < nb_ports; portid++) { 3066 /* skip ports that are not enabled */ 3067 if ((enabled_port_mask & (1 << portid)) == 0) { 3068 RTE_LOG(INFO, VHOST_PORT, 3069 "Skipping disabled port %d\n", portid); 3070 continue; 3071 } 3072 if (port_init(portid) != 0) 3073 rte_exit(EXIT_FAILURE, 3074 "Cannot initialize network ports\n"); 3075 } 3076 3077 /* Initialise all linked lists. */ 3078 if (init_data_ll() == -1) 3079 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3080 3081 /* Initialize device stats */ 3082 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3083 3084 /* Enable stats if the user option is set. */ 3085 if (enable_stats) { 3086 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3087 if (ret != 0) 3088 rte_exit(EXIT_FAILURE, 3089 "Cannot create print-stats thread\n"); 3090 3091 /* Set thread_name for aid in debugging. */ 3092 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3093 ret = rte_thread_setname(tid, thread_name); 3094 if (ret != 0) 3095 RTE_LOG(ERR, VHOST_CONFIG, 3096 "Cannot set print-stats name\n"); 3097 } 3098 3099 /* Launch all data cores. */ 3100 if (zero_copy == 0) { 3101 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3102 rte_eal_remote_launch(switch_worker, 3103 mbuf_pool, lcore_id); 3104 } 3105 } else { 3106 uint32_t count_in_mempool, index, i; 3107 for (index = 0; index < 2*MAX_QUEUES; index++) { 3108 /* For all RX and TX queues. */ 3109 count_in_mempool 3110 = rte_mempool_count(vpool_array[index].pool); 3111 3112 /* 3113 * Transfer all un-attached mbufs from vpool.pool 3114 * to vpoo.ring. 3115 */ 3116 for (i = 0; i < count_in_mempool; i++) { 3117 struct rte_mbuf *mbuf 3118 = __rte_mbuf_raw_alloc( 3119 vpool_array[index].pool); 3120 rte_ring_sp_enqueue(vpool_array[index].ring, 3121 (void *)mbuf); 3122 } 3123 3124 LOG_DEBUG(VHOST_CONFIG, 3125 "in main: mbuf count in mempool at initial " 3126 "is: %d\n", count_in_mempool); 3127 LOG_DEBUG(VHOST_CONFIG, 3128 "in main: mbuf count in ring at initial is :" 3129 " %d\n", 3130 rte_ring_count(vpool_array[index].ring)); 3131 } 3132 3133 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3134 rte_eal_remote_launch(switch_worker_zcp, NULL, 3135 lcore_id); 3136 } 3137 3138 if (mergeable == 0) 3139 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3140 3141 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3142 ret = rte_vhost_driver_register((char *)&dev_basename); 3143 if (ret != 0) 3144 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3145 3146 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3147 3148 /* Start CUSE session. */ 3149 rte_vhost_driver_session_start(); 3150 return 0; 3151 3152 } 3153