1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 #include <rte_ip.h> 54 #include <rte_tcp.h> 55 56 #include "main.h" 57 58 #ifndef MAX_QUEUES 59 #define MAX_QUEUES 128 60 #endif 61 62 /* the maximum number of external ports supported */ 63 #define MAX_SUP_PORTS 1 64 65 /* 66 * Calculate the number of buffers needed per port 67 */ 68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 69 (num_switching_cores*MAX_PKT_BURST) + \ 70 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 71 ((num_switching_cores+1)*MBUF_CACHE_SIZE)) 72 73 #define MBUF_CACHE_SIZE 128 74 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 75 76 /* 77 * No frame data buffer allocated from host are required for zero copy 78 * implementation, guest will allocate the frame data buffer, and vhost 79 * directly use it. 80 */ 81 #define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM 82 #define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE 83 #define MBUF_CACHE_SIZE_ZCP 0 84 85 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 86 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 87 88 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 89 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 90 91 #define JUMBO_FRAME_MAX_SIZE 0x2600 92 93 /* State of virtio device. 
*/ 94 #define DEVICE_MAC_LEARNING 0 95 #define DEVICE_RX 1 96 #define DEVICE_SAFE_REMOVE 2 97 98 /* Config_core_flag status definitions. */ 99 #define REQUEST_DEV_REMOVAL 1 100 #define ACK_DEV_REMOVAL 0 101 102 /* Configurable number of RX/TX ring descriptors */ 103 #define RTE_TEST_RX_DESC_DEFAULT 1024 104 #define RTE_TEST_TX_DESC_DEFAULT 512 105 106 /* 107 * Need refine these 2 macros for legacy and DPDK based front end: 108 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 109 * And then adjust power 2. 110 */ 111 /* 112 * For legacy front end, 128 descriptors, 113 * half for virtio header, another half for mbuf. 114 */ 115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 117 118 /* Get first 4 bytes in mbuf headroom. */ 119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 120 + sizeof(struct rte_mbuf))) 121 122 /* true if x is a power of 2 */ 123 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 124 125 #define INVALID_PORT_ID 0xFF 126 127 /* Max number of devices. Limited by vmdq. */ 128 #define MAX_DEVICES 64 129 130 /* Size of buffers used for snprintfs. */ 131 #define MAX_PRINT_BUFF 6072 132 133 /* Maximum character device basename size. */ 134 #define MAX_BASENAME_SZ 10 135 136 /* Maximum long option length for option parsing. */ 137 #define MAX_LONG_OPT_SZ 64 138 139 /* Used to compare MAC addresses. */ 140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 141 142 /* Number of descriptors per cacheline. */ 143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 144 145 #define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb)) 146 147 /* mask of enabled ports */ 148 static uint32_t enabled_port_mask = 0; 149 150 /* Promiscuous mode */ 151 static uint32_t promiscuous; 152 153 /*Number of switching cores enabled*/ 154 static uint32_t num_switching_cores = 0; 155 156 /* number of devices/queues to support*/ 157 static uint32_t num_queues = 0; 158 static uint32_t num_devices; 159 160 /* 161 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 162 * disabled on default. 163 */ 164 static uint32_t zero_copy; 165 static int mergeable; 166 167 /* Do vlan strip on host, enabled on default */ 168 static uint32_t vlan_strip = 1; 169 170 /* number of descriptors to apply*/ 171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 173 174 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 175 #define MAX_RING_DESC 4096 176 177 struct vpool { 178 struct rte_mempool *pool; 179 struct rte_ring *ring; 180 uint32_t buf_size; 181 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 182 183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 184 typedef enum { 185 VM2VM_DISABLED = 0, 186 VM2VM_SOFTWARE = 1, 187 VM2VM_HARDWARE = 2, 188 VM2VM_LAST 189 } vm2vm_type; 190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 191 192 /* The type of host physical address translated from guest physical address. */ 193 typedef enum { 194 PHYS_ADDR_CONTINUOUS = 0, 195 PHYS_ADDR_CROSS_SUBREG = 1, 196 PHYS_ADDR_INVALID = 2, 197 PHYS_ADDR_LAST 198 } hpa_type; 199 200 /* Enable stats. */ 201 static uint32_t enable_stats = 0; 202 /* Enable retries on RX. 
 */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * without it, IPv4 forwarding in the guest cannot forward
		 * packets from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX.
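 *
 * For reference, the struct below mirrors the 802.1Q on-wire layout: a
 * 6-byte destination, a 6-byte source, the 0x8100 TPID, the 2-byte TCI and
 * the encapsulated EtherType, which is why VLAN_ETH_HLEN is 18 while a plain
 * Ethernet header is 14 bytes. A minimal, illustrative-only sketch of
 * filling it in (not taken from this file, which normally relies on the
 * PKT_TX_VLAN_PKT hardware offload instead):
 *
 *	struct vlan_ethhdr *vh = rte_pktmbuf_mtod(m, struct vlan_ethhdr *);
 *	vh->h_vlan_proto = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
 *	vh->h_vlan_TCI = rte_cpu_to_be_16(vlan_tag);
 *	vh->h_vlan_encapsulated_proto = rte_cpu_to_be_16(ETHER_TYPE_IPv4);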
 */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line. */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable VLAN offload. */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
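	 *
	 * With deferred start, rte_eth_dev_start() leaves these queues
	 * stopped; they are expected to be started individually later, once
	 * the guest's buffers are attached, roughly as follows (illustrative
	 * sketch only; the actual call site is outside this section):
	 *
	 *	rte_eth_dev_rx_queue_start(port, rx_queue_id);
	 *	rte_eth_dev_tx_queue_start(port, tx_queue_id);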
401 */ 402 if (zero_copy) { 403 rxconf->rx_deferred_start = 1; 404 rxconf->rx_drop_en = 0; 405 txconf->tx_deferred_start = 1; 406 } 407 408 /*configure the number of supported virtio devices based on VMDQ limits */ 409 num_devices = dev_info.max_vmdq_pools; 410 411 if (zero_copy) { 412 rx_ring_size = num_rx_descriptor; 413 tx_ring_size = num_tx_descriptor; 414 tx_rings = dev_info.max_tx_queues; 415 } else { 416 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 417 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 418 tx_rings = (uint16_t)rte_lcore_count(); 419 } 420 421 retval = validate_num_devices(MAX_DEVICES); 422 if (retval < 0) 423 return retval; 424 425 /* Get port configuration. */ 426 retval = get_eth_conf(&port_conf, num_devices); 427 if (retval < 0) 428 return retval; 429 /* NIC queues are divided into pf queues and vmdq queues. */ 430 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 431 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 432 num_vmdq_queues = num_devices * queues_per_pool; 433 num_queues = num_pf_queues + num_vmdq_queues; 434 vmdq_queue_base = dev_info.vmdq_queue_base; 435 vmdq_pool_base = dev_info.vmdq_pool_base; 436 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 437 num_pf_queues, num_devices, queues_per_pool); 438 439 if (port >= rte_eth_dev_count()) return -1; 440 441 if (enable_tx_csum == 0) 442 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM); 443 444 if (enable_tso == 0) { 445 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4); 446 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6); 447 } 448 449 rx_rings = (uint16_t)dev_info.max_rx_queues; 450 /* Configure ethernet device. */ 451 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 452 if (retval != 0) 453 return retval; 454 455 /* Setup the queues. */ 456 for (q = 0; q < rx_rings; q ++) { 457 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 458 rte_eth_dev_socket_id(port), 459 rxconf, 460 vpool_array[q].pool); 461 if (retval < 0) 462 return retval; 463 } 464 for (q = 0; q < tx_rings; q ++) { 465 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 466 rte_eth_dev_socket_id(port), 467 txconf); 468 if (retval < 0) 469 return retval; 470 } 471 472 /* Start the device. */ 473 retval = rte_eth_dev_start(port); 474 if (retval < 0) { 475 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 476 return retval; 477 } 478 479 if (promiscuous) 480 rte_eth_promiscuous_enable(port); 481 482 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 483 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 484 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 485 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 486 (unsigned)port, 487 vmdq_ports_eth_addr[port].addr_bytes[0], 488 vmdq_ports_eth_addr[port].addr_bytes[1], 489 vmdq_ports_eth_addr[port].addr_bytes[2], 490 vmdq_ports_eth_addr[port].addr_bytes[3], 491 vmdq_ports_eth_addr[port].addr_bytes[4], 492 vmdq_ports_eth_addr[port].addr_bytes[5]); 493 494 return 0; 495 } 496 497 /* 498 * Set character device basename. 499 */ 500 static int 501 us_vhost_parse_basename(const char *q_arg) 502 { 503 /* parse number string */ 504 505 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 506 return -1; 507 else 508 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 509 510 return 0; 511 } 512 513 /* 514 * Parse the portmask provided at run time. 
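 *
 * For example, "-p 0x1" selects only port 0 and "-p 0x3" selects ports 0
 * and 1; the mask is later consumed bit by bit, as in this sketch of the
 * loop used further down in us_vhost_parse_args():
 *
 *	if (enabled_port_mask & (1 << portid))
 *		ports[num_ports++] = (uint8_t)portid;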
515 */ 516 static int 517 parse_portmask(const char *portmask) 518 { 519 char *end = NULL; 520 unsigned long pm; 521 522 errno = 0; 523 524 /* parse hexadecimal string */ 525 pm = strtoul(portmask, &end, 16); 526 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 527 return -1; 528 529 if (pm == 0) 530 return -1; 531 532 return pm; 533 534 } 535 536 /* 537 * Parse num options at run time. 538 */ 539 static int 540 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 541 { 542 char *end = NULL; 543 unsigned long num; 544 545 errno = 0; 546 547 /* parse unsigned int string */ 548 num = strtoul(q_arg, &end, 10); 549 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 550 return -1; 551 552 if (num > max_valid_value) 553 return -1; 554 555 return num; 556 557 } 558 559 /* 560 * Display usage 561 */ 562 static void 563 us_vhost_usage(const char *prgname) 564 { 565 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 566 " --vm2vm [0|1|2]\n" 567 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n" 568 " --dev-basename <name>\n" 569 " --nb-devices ND\n" 570 " -p PORTMASK: Set mask for ports to be used by application\n" 571 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 572 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n" 573 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 574 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n" 575 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 576 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n" 577 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 578 " --dev-basename: The basename to be used for the character device.\n" 579 " --zero-copy [0|1]: disable(default)/enable rx/tx " 580 "zero copy\n" 581 " --rx-desc-num [0-N]: the number of descriptors on rx, " 582 "used only when zero copy is enabled.\n" 583 " --tx-desc-num [0-N]: the number of descriptors on tx, " 584 "used only when zero copy is enabled.\n" 585 " --tx-csum [0|1] disable/enable TX checksum offload.\n" 586 " --tso [0|1] disable/enable TCP segment offload.\n", 587 prgname); 588 } 589 590 /* 591 * Parse the arguments given in the command line of the application. 
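 *
 * A typical invocation looks roughly like the following (illustrative only;
 * the binary name and the EAL core, memory and hugepage options depend on
 * the build and the target system):
 *
 *	./vhost-switch -c f -n 4 --socket-mem 1024 -- \
 *		-p 0x1 --dev-basename vhost-net --stats 2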
592 */ 593 static int 594 us_vhost_parse_args(int argc, char **argv) 595 { 596 int opt, ret; 597 int option_index; 598 unsigned i; 599 const char *prgname = argv[0]; 600 static struct option long_option[] = { 601 {"vm2vm", required_argument, NULL, 0}, 602 {"rx-retry", required_argument, NULL, 0}, 603 {"rx-retry-delay", required_argument, NULL, 0}, 604 {"rx-retry-num", required_argument, NULL, 0}, 605 {"mergeable", required_argument, NULL, 0}, 606 {"vlan-strip", required_argument, NULL, 0}, 607 {"stats", required_argument, NULL, 0}, 608 {"dev-basename", required_argument, NULL, 0}, 609 {"zero-copy", required_argument, NULL, 0}, 610 {"rx-desc-num", required_argument, NULL, 0}, 611 {"tx-desc-num", required_argument, NULL, 0}, 612 {"tx-csum", required_argument, NULL, 0}, 613 {"tso", required_argument, NULL, 0}, 614 {NULL, 0, 0, 0}, 615 }; 616 617 /* Parse command line */ 618 while ((opt = getopt_long(argc, argv, "p:P", 619 long_option, &option_index)) != EOF) { 620 switch (opt) { 621 /* Portmask */ 622 case 'p': 623 enabled_port_mask = parse_portmask(optarg); 624 if (enabled_port_mask == 0) { 625 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 626 us_vhost_usage(prgname); 627 return -1; 628 } 629 break; 630 631 case 'P': 632 promiscuous = 1; 633 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 634 ETH_VMDQ_ACCEPT_BROADCAST | 635 ETH_VMDQ_ACCEPT_MULTICAST; 636 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 637 638 break; 639 640 case 0: 641 /* Enable/disable vm2vm comms. */ 642 if (!strncmp(long_option[option_index].name, "vm2vm", 643 MAX_LONG_OPT_SZ)) { 644 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 645 if (ret == -1) { 646 RTE_LOG(INFO, VHOST_CONFIG, 647 "Invalid argument for " 648 "vm2vm [0|1|2]\n"); 649 us_vhost_usage(prgname); 650 return -1; 651 } else { 652 vm2vm_mode = (vm2vm_type)ret; 653 } 654 } 655 656 /* Enable/disable retries on RX. */ 657 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 658 ret = parse_num_opt(optarg, 1); 659 if (ret == -1) { 660 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 661 us_vhost_usage(prgname); 662 return -1; 663 } else { 664 enable_retry = ret; 665 } 666 } 667 668 /* Enable/disable TX checksum offload. */ 669 if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) { 670 ret = parse_num_opt(optarg, 1); 671 if (ret == -1) { 672 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } else 676 enable_tx_csum = ret; 677 } 678 679 /* Enable/disable TSO offload. */ 680 if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) { 681 ret = parse_num_opt(optarg, 1); 682 if (ret == -1) { 683 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 684 us_vhost_usage(prgname); 685 return -1; 686 } else 687 enable_tso = ret; 688 } 689 690 /* Specify the retries delay time (in useconds) on RX. */ 691 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 692 ret = parse_num_opt(optarg, INT32_MAX); 693 if (ret == -1) { 694 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 695 us_vhost_usage(prgname); 696 return -1; 697 } else { 698 burst_rx_delay_time = ret; 699 } 700 } 701 702 /* Specify the retries number on RX. 
*/ 703 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 704 ret = parse_num_opt(optarg, INT32_MAX); 705 if (ret == -1) { 706 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 707 us_vhost_usage(prgname); 708 return -1; 709 } else { 710 burst_rx_retry_num = ret; 711 } 712 } 713 714 /* Enable/disable RX mergeable buffers. */ 715 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 716 ret = parse_num_opt(optarg, 1); 717 if (ret == -1) { 718 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 719 us_vhost_usage(prgname); 720 return -1; 721 } else { 722 mergeable = !!ret; 723 if (ret) { 724 vmdq_conf_default.rxmode.jumbo_frame = 1; 725 vmdq_conf_default.rxmode.max_rx_pkt_len 726 = JUMBO_FRAME_MAX_SIZE; 727 } 728 } 729 } 730 731 /* Enable/disable RX VLAN strip on host. */ 732 if (!strncmp(long_option[option_index].name, 733 "vlan-strip", MAX_LONG_OPT_SZ)) { 734 ret = parse_num_opt(optarg, 1); 735 if (ret == -1) { 736 RTE_LOG(INFO, VHOST_CONFIG, 737 "Invalid argument for VLAN strip [0|1]\n"); 738 us_vhost_usage(prgname); 739 return -1; 740 } else { 741 vlan_strip = !!ret; 742 vmdq_conf_default.rxmode.hw_vlan_strip = 743 vlan_strip; 744 } 745 } 746 747 /* Enable/disable stats. */ 748 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 749 ret = parse_num_opt(optarg, INT32_MAX); 750 if (ret == -1) { 751 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 752 us_vhost_usage(prgname); 753 return -1; 754 } else { 755 enable_stats = ret; 756 } 757 } 758 759 /* Set character device basename. */ 760 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 761 if (us_vhost_parse_basename(optarg) == -1) { 762 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 763 us_vhost_usage(prgname); 764 return -1; 765 } 766 } 767 768 /* Enable/disable rx/tx zero copy. */ 769 if (!strncmp(long_option[option_index].name, 770 "zero-copy", MAX_LONG_OPT_SZ)) { 771 ret = parse_num_opt(optarg, 1); 772 if (ret == -1) { 773 RTE_LOG(INFO, VHOST_CONFIG, 774 "Invalid argument" 775 " for zero-copy [0|1]\n"); 776 us_vhost_usage(prgname); 777 return -1; 778 } else 779 zero_copy = ret; 780 } 781 782 /* Specify the descriptor number on RX. */ 783 if (!strncmp(long_option[option_index].name, 784 "rx-desc-num", MAX_LONG_OPT_SZ)) { 785 ret = parse_num_opt(optarg, MAX_RING_DESC); 786 if ((ret == -1) || (!POWEROF2(ret))) { 787 RTE_LOG(INFO, VHOST_CONFIG, 788 "Invalid argument for rx-desc-num[0-N]," 789 "power of 2 required.\n"); 790 us_vhost_usage(prgname); 791 return -1; 792 } else { 793 num_rx_descriptor = ret; 794 } 795 } 796 797 /* Specify the descriptor number on TX. */ 798 if (!strncmp(long_option[option_index].name, 799 "tx-desc-num", MAX_LONG_OPT_SZ)) { 800 ret = parse_num_opt(optarg, MAX_RING_DESC); 801 if ((ret == -1) || (!POWEROF2(ret))) { 802 RTE_LOG(INFO, VHOST_CONFIG, 803 "Invalid argument for tx-desc-num [0-N]," 804 "power of 2 required.\n"); 805 us_vhost_usage(prgname); 806 return -1; 807 } else { 808 num_tx_descriptor = ret; 809 } 810 } 811 812 break; 813 814 /* Invalid option - print options. 
*/ 815 default: 816 us_vhost_usage(prgname); 817 return -1; 818 } 819 } 820 821 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 822 if (enabled_port_mask & (1 << i)) 823 ports[num_ports++] = (uint8_t)i; 824 } 825 826 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 827 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 828 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 829 return -1; 830 } 831 832 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 833 RTE_LOG(INFO, VHOST_PORT, 834 "Vhost zero copy doesn't support software vm2vm," 835 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 836 return -1; 837 } 838 839 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 840 RTE_LOG(INFO, VHOST_PORT, 841 "Vhost zero copy doesn't support jumbo frame," 842 "please specify '--mergeable 0' to disable the " 843 "mergeable feature.\n"); 844 return -1; 845 } 846 847 return 0; 848 } 849 850 /* 851 * Update the global var NUM_PORTS and array PORTS according to system ports number 852 * and return valid ports number 853 */ 854 static unsigned check_ports_num(unsigned nb_ports) 855 { 856 unsigned valid_num_ports = num_ports; 857 unsigned portid; 858 859 if (num_ports > nb_ports) { 860 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 861 num_ports, nb_ports); 862 num_ports = nb_ports; 863 } 864 865 for (portid = 0; portid < num_ports; portid ++) { 866 if (ports[portid] >= nb_ports) { 867 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 868 ports[portid], (nb_ports - 1)); 869 ports[portid] = INVALID_PORT_ID; 870 valid_num_ports--; 871 } 872 } 873 return valid_num_ports; 874 } 875 876 /* 877 * Macro to print out packet contents. Wrapped in debug define so that the 878 * data path is not effected when debug is disabled. 879 */ 880 #if RTE_LOG_LEVEL >= RTE_LOG_DEBUG 881 #define PRINT_PACKET(device, addr, size, header) do { \ 882 char *pkt_addr = (char*)(addr); \ 883 unsigned int index; \ 884 char packet[MAX_PRINT_BUFF]; \ 885 \ 886 if ((header)) \ 887 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 888 else \ 889 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 890 for (index = 0; index < (size); index++) { \ 891 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 892 "%02hhx ", pkt_addr[index]); \ 893 } \ 894 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 895 \ 896 RTE_LOG(DEBUG, VHOST_DATA, "%s", packet); \ 897 } while(0) 898 #else 899 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 900 #endif 901 902 /* 903 * Function to convert guest physical addresses to vhost physical addresses. 904 * This is used to convert virtio buffer addresses. 
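 *
 * Callers are expected to check the returned address type; the zero-copy
 * paths below use a pattern along these lines:
 *
 *	hpa_type addr_type;
 *	uint64_t hpa = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
 *	if (unlikely(addr_type == PHYS_ADDR_INVALID))
 *		put_desc_to_used_list_zcp(vq, desc_idx);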
905 */ 906 static inline uint64_t __attribute__((always_inline)) 907 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 908 uint32_t buf_len, hpa_type *addr_type) 909 { 910 struct virtio_memory_regions_hpa *region; 911 uint32_t regionidx; 912 uint64_t vhost_pa = 0; 913 914 *addr_type = PHYS_ADDR_INVALID; 915 916 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 917 region = &vdev->regions_hpa[regionidx]; 918 if ((guest_pa >= region->guest_phys_address) && 919 (guest_pa <= region->guest_phys_address_end)) { 920 vhost_pa = region->host_phys_addr_offset + guest_pa; 921 if (likely((guest_pa + buf_len - 1) 922 <= region->guest_phys_address_end)) 923 *addr_type = PHYS_ADDR_CONTINUOUS; 924 else 925 *addr_type = PHYS_ADDR_CROSS_SUBREG; 926 break; 927 } 928 } 929 930 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") GPA %p| HPA %p\n", 931 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 932 (void *)(uintptr_t)vhost_pa); 933 934 return vhost_pa; 935 } 936 937 /* 938 * Compares a packet destination MAC address to a device MAC address. 939 */ 940 static inline int __attribute__((always_inline)) 941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 942 { 943 return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0; 944 } 945 946 /* 947 * This function learns the MAC address of the device and registers this along with a 948 * vlan tag to a VMDQ. 949 */ 950 static int 951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 952 { 953 struct ether_hdr *pkt_hdr; 954 struct virtio_net_data_ll *dev_ll; 955 struct virtio_net *dev = vdev->dev; 956 int i, ret; 957 958 /* Learn MAC address of guest device from packet */ 959 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 960 961 dev_ll = ll_root_used; 962 963 while (dev_ll != NULL) { 964 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 965 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 966 return -1; 967 } 968 dev_ll = dev_ll->next; 969 } 970 971 for (i = 0; i < ETHER_ADDR_LEN; i++) 972 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 973 974 /* vlan_tag currently uses the device_id. */ 975 vdev->vlan_tag = vlan_tags[dev->device_fh]; 976 977 /* Print out VMDQ registration info. */ 978 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 979 dev->device_fh, 980 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 981 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 982 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 983 vdev->vlan_tag); 984 985 /* Register the MAC address. */ 986 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 987 (uint32_t)dev->device_fh + vmdq_pool_base); 988 if (ret) 989 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 990 dev->device_fh); 991 992 /* Enable stripping of the vlan tag as we handle routing. */ 993 if (vlan_strip) 994 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 995 (uint16_t)vdev->vmdq_rx_q, 1); 996 997 /* Set device as ready for RX. */ 998 vdev->ready = DEVICE_RX; 999 1000 return 0; 1001 } 1002 1003 /* 1004 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 1005 * queue before disabling RX on the device. 
1006 */ 1007 static inline void 1008 unlink_vmdq(struct vhost_dev *vdev) 1009 { 1010 unsigned i = 0; 1011 unsigned rx_count; 1012 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1013 1014 if (vdev->ready == DEVICE_RX) { 1015 /*clear MAC and VLAN settings*/ 1016 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 1017 for (i = 0; i < 6; i++) 1018 vdev->mac_address.addr_bytes[i] = 0; 1019 1020 vdev->vlan_tag = 0; 1021 1022 /*Clear out the receive buffers*/ 1023 rx_count = rte_eth_rx_burst(ports[0], 1024 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1025 1026 while (rx_count) { 1027 for (i = 0; i < rx_count; i++) 1028 rte_pktmbuf_free(pkts_burst[i]); 1029 1030 rx_count = rte_eth_rx_burst(ports[0], 1031 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1032 } 1033 1034 vdev->ready = DEVICE_MAC_LEARNING; 1035 } 1036 } 1037 1038 /* 1039 * Check if the packet destination MAC address is for a local device. If so then put 1040 * the packet on that devices RX queue. If not then return. 1041 */ 1042 static inline int __attribute__((always_inline)) 1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1044 { 1045 struct virtio_net_data_ll *dev_ll; 1046 struct ether_hdr *pkt_hdr; 1047 uint64_t ret = 0; 1048 struct virtio_net *dev = vdev->dev; 1049 struct virtio_net *tdev; /* destination virito device */ 1050 1051 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1052 1053 /*get the used devices list*/ 1054 dev_ll = ll_root_used; 1055 1056 while (dev_ll != NULL) { 1057 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1058 &dev_ll->vdev->mac_address)) { 1059 1060 /* Drop the packet if the TX packet is destined for the TX device. */ 1061 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1062 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: " 1063 "Source and destination MAC addresses are the same. " 1064 "Dropping packet.\n", 1065 dev->device_fh); 1066 return 0; 1067 } 1068 tdev = dev_ll->vdev->dev; 1069 1070 1071 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: " 1072 "MAC address is local\n", tdev->device_fh); 1073 1074 if (unlikely(dev_ll->vdev->remove)) { 1075 /*drop the packet if the device is marked for removal*/ 1076 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") " 1077 "Device is marked for removal\n", tdev->device_fh); 1078 } else { 1079 /*send the packet to the local virtio device*/ 1080 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1081 if (enable_stats) { 1082 rte_atomic64_add( 1083 &dev_statistics[tdev->device_fh].rx_total_atomic, 1084 1); 1085 rte_atomic64_add( 1086 &dev_statistics[tdev->device_fh].rx_atomic, 1087 ret); 1088 dev_statistics[dev->device_fh].tx_total++; 1089 dev_statistics[dev->device_fh].tx += ret; 1090 } 1091 } 1092 1093 return 0; 1094 } 1095 dev_ll = dev_ll->next; 1096 } 1097 1098 return -1; 1099 } 1100 1101 /* 1102 * Check if the destination MAC of a packet is one local VM, 1103 * and get its vlan tag, and offset if it is. 1104 */ 1105 static inline int __attribute__((always_inline)) 1106 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1107 uint32_t *offset, uint16_t *vlan_tag) 1108 { 1109 struct virtio_net_data_ll *dev_ll = ll_root_used; 1110 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1111 1112 while (dev_ll != NULL) { 1113 if ((dev_ll->vdev->ready == DEVICE_RX) 1114 && ether_addr_cmp(&(pkt_hdr->d_addr), 1115 &dev_ll->vdev->mac_address)) { 1116 /* 1117 * Drop the packet if the TX packet is 1118 * destined for the TX device. 
1119 */ 1120 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1121 RTE_LOG(DEBUG, VHOST_DATA, 1122 "(%"PRIu64") TX: Source and destination" 1123 " MAC addresses are the same. Dropping " 1124 "packet.\n", 1125 dev_ll->vdev->dev->device_fh); 1126 return -1; 1127 } 1128 1129 /* 1130 * HW vlan strip will reduce the packet length 1131 * by minus length of vlan tag, so need restore 1132 * the packet length by plus it. 1133 */ 1134 *offset = VLAN_HLEN; 1135 *vlan_tag = 1136 (uint16_t) 1137 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1138 1139 RTE_LOG(DEBUG, VHOST_DATA, 1140 "(%"PRIu64") TX: pkt to local VM device id:" 1141 "(%"PRIu64") vlan tag: %d.\n", 1142 dev->device_fh, dev_ll->vdev->dev->device_fh, 1143 (int)*vlan_tag); 1144 1145 break; 1146 } 1147 dev_ll = dev_ll->next; 1148 } 1149 return 0; 1150 } 1151 1152 static uint16_t 1153 get_psd_sum(void *l3_hdr, uint64_t ol_flags) 1154 { 1155 if (ol_flags & PKT_TX_IPV4) 1156 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags); 1157 else /* assume ethertype == ETHER_TYPE_IPv6 */ 1158 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags); 1159 } 1160 1161 static void virtio_tx_offload(struct rte_mbuf *m) 1162 { 1163 void *l3_hdr; 1164 struct ipv4_hdr *ipv4_hdr = NULL; 1165 struct tcp_hdr *tcp_hdr = NULL; 1166 struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1167 1168 l3_hdr = (char *)eth_hdr + m->l2_len; 1169 1170 if (m->ol_flags & PKT_TX_IPV4) { 1171 ipv4_hdr = l3_hdr; 1172 ipv4_hdr->hdr_checksum = 0; 1173 m->ol_flags |= PKT_TX_IP_CKSUM; 1174 } 1175 1176 tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len); 1177 tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags); 1178 } 1179 1180 /* 1181 * This function routes the TX packet to the correct interface. This may be a local device 1182 * or the physical port. 1183 */ 1184 static inline void __attribute__((always_inline)) 1185 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1186 { 1187 struct mbuf_table *tx_q; 1188 struct rte_mbuf **m_table; 1189 unsigned len, ret, offset = 0; 1190 const uint16_t lcore_id = rte_lcore_id(); 1191 struct virtio_net *dev = vdev->dev; 1192 struct ether_hdr *nh; 1193 1194 /*check if destination is local VM*/ 1195 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1196 rte_pktmbuf_free(m); 1197 return; 1198 } 1199 1200 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1201 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1202 rte_pktmbuf_free(m); 1203 return; 1204 } 1205 } 1206 1207 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: " 1208 "MAC address is external\n", dev->device_fh); 1209 1210 /*Add packet to the port tx queue*/ 1211 tx_q = &lcore_tx_queue[lcore_id]; 1212 len = tx_q->len; 1213 1214 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1215 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1216 /* Guest has inserted the vlan tag. */ 1217 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1218 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1219 if ((vm2vm_mode == VM2VM_HARDWARE) && 1220 (vh->vlan_tci != vlan_tag_be)) 1221 vh->vlan_tci = vlan_tag_be; 1222 } else { 1223 m->ol_flags |= PKT_TX_VLAN_PKT; 1224 1225 /* 1226 * Find the right seg to adjust the data len when offset is 1227 * bigger than tail room size. 
1228 */ 1229 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1230 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1231 m->data_len += offset; 1232 else { 1233 struct rte_mbuf *seg = m; 1234 1235 while ((seg->next != NULL) && 1236 (offset > rte_pktmbuf_tailroom(seg))) 1237 seg = seg->next; 1238 1239 seg->data_len += offset; 1240 } 1241 m->pkt_len += offset; 1242 } 1243 1244 m->vlan_tci = vlan_tag; 1245 } 1246 1247 if (m->ol_flags & PKT_TX_TCP_SEG) 1248 virtio_tx_offload(m); 1249 1250 tx_q->m_table[len] = m; 1251 len++; 1252 if (enable_stats) { 1253 dev_statistics[dev->device_fh].tx_total++; 1254 dev_statistics[dev->device_fh].tx++; 1255 } 1256 1257 if (unlikely(len == MAX_PKT_BURST)) { 1258 m_table = (struct rte_mbuf **)tx_q->m_table; 1259 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1260 /* Free any buffers not handled by TX and update the port stats. */ 1261 if (unlikely(ret < len)) { 1262 do { 1263 rte_pktmbuf_free(m_table[ret]); 1264 } while (++ret < len); 1265 } 1266 1267 len = 0; 1268 } 1269 1270 tx_q->len = len; 1271 return; 1272 } 1273 /* 1274 * This function is called by each data core. It handles all RX/TX registered with the 1275 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1276 * with all devices in the main linked list. 1277 */ 1278 static int 1279 switch_worker(__attribute__((unused)) void *arg) 1280 { 1281 struct rte_mempool *mbuf_pool = arg; 1282 struct virtio_net *dev = NULL; 1283 struct vhost_dev *vdev = NULL; 1284 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1285 struct virtio_net_data_ll *dev_ll; 1286 struct mbuf_table *tx_q; 1287 volatile struct lcore_ll_info *lcore_ll; 1288 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1289 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1290 unsigned ret, i; 1291 const uint16_t lcore_id = rte_lcore_id(); 1292 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1293 uint16_t rx_count = 0; 1294 uint16_t tx_count; 1295 uint32_t retry = 0; 1296 1297 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1298 lcore_ll = lcore_info[lcore_id].lcore_ll; 1299 prev_tsc = 0; 1300 1301 tx_q = &lcore_tx_queue[lcore_id]; 1302 for (i = 0; i < num_cores; i ++) { 1303 if (lcore_ids[i] == lcore_id) { 1304 tx_q->txq_id = i; 1305 break; 1306 } 1307 } 1308 1309 while(1) { 1310 cur_tsc = rte_rdtsc(); 1311 /* 1312 * TX burst queue drain 1313 */ 1314 diff_tsc = cur_tsc - prev_tsc; 1315 if (unlikely(diff_tsc > drain_tsc)) { 1316 1317 if (tx_q->len) { 1318 RTE_LOG(DEBUG, VHOST_DATA, 1319 "TX queue drained after timeout with burst size %u\n", 1320 tx_q->len); 1321 1322 /*Tx any packets in the queue*/ 1323 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1324 (struct rte_mbuf **)tx_q->m_table, 1325 (uint16_t)tx_q->len); 1326 if (unlikely(ret < tx_q->len)) { 1327 do { 1328 rte_pktmbuf_free(tx_q->m_table[ret]); 1329 } while (++ret < tx_q->len); 1330 } 1331 1332 tx_q->len = 0; 1333 } 1334 1335 prev_tsc = cur_tsc; 1336 1337 } 1338 1339 rte_prefetch0(lcore_ll->ll_root_used); 1340 /* 1341 * Inform the configuration core that we have exited the linked list and that no devices are 1342 * in use if requested. 
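		 *
		 * The handshake: the configuration core sets dev_removal_flag to
		 * REQUEST_DEV_REMOVAL and waits until this data core answers with
		 * ACK_DEV_REMOVAL below. A sketch of the waiting side, which lives
		 * outside this section:
		 *
		 *	while (lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
		 *		rte_pause();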
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* Get virtio device ID. */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX. */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait
					 * and retry to avoid packet loss. Here MAX_PKT_BURST must
					 * be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX. */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ. */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				for (i = 0; i < tx_count; ++i) {
					virtio_tx_route(vdev, pkts_burst[i],
						vlan_tags[(uint16_t)dev->device_fh]);
				}
			}

			/* Move to the next device in the list. */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets available ring indexes for zero copy RX;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
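 *
 * The caller reserves entries here and later fills in vq->used for each
 * one; the zero-copy RX path below uses it one descriptor at a time, e.g.:
 *
 *	uint16_t res_base_idx;
 *	if (get_available_ring_index_zcp(dev, &res_base_idx, 1) != 1)
 *		return;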
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") in get_available_ring_index_zcp: "
		"avail idx: %d, "
		"res base idx:%d, free entries:%d\n",
		dev->device_fh, avail_idx, *res_base_idx,
		free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* Check that we have enough buffers. */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back onto the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * un-attached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offsets of buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
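 *
 * Concretely, the guest buffer address is rewound by RTE_PKTMBUF_HEADROOM so
 * that, once the PMD applies data_off, the received frame lands exactly on
 * the guest buffer (this mirrors the assignments in the function body below):
 *
 *	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
 *	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
 *	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;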
1515 */ 1516 static inline void __attribute__((always_inline)) 1517 attach_rxmbuf_zcp(struct virtio_net *dev) 1518 { 1519 uint16_t res_base_idx, desc_idx; 1520 uint64_t buff_addr, phys_addr; 1521 struct vhost_virtqueue *vq; 1522 struct vring_desc *desc; 1523 void *obj = NULL; 1524 struct rte_mbuf *mbuf; 1525 struct vpool *vpool; 1526 hpa_type addr_type; 1527 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1528 1529 vpool = &vpool_array[vdev->vmdq_rx_q]; 1530 vq = dev->virtqueue[VIRTIO_RXQ]; 1531 1532 do { 1533 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1534 1) != 1)) 1535 return; 1536 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1537 1538 desc = &vq->desc[desc_idx]; 1539 if (desc->flags & VRING_DESC_F_NEXT) { 1540 desc = &vq->desc[desc->next]; 1541 buff_addr = gpa_to_vva(dev, desc->addr); 1542 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1543 &addr_type); 1544 } else { 1545 buff_addr = gpa_to_vva(dev, 1546 desc->addr + vq->vhost_hlen); 1547 phys_addr = gpa_to_hpa(vdev, 1548 desc->addr + vq->vhost_hlen, 1549 desc->len, &addr_type); 1550 } 1551 1552 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1553 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1554 " address found when attaching RX frame buffer" 1555 " address!\n", dev->device_fh); 1556 put_desc_to_used_list_zcp(vq, desc_idx); 1557 continue; 1558 } 1559 1560 /* 1561 * Check if the frame buffer address from guest crosses 1562 * sub-region or not. 1563 */ 1564 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1565 RTE_LOG(ERR, VHOST_DATA, 1566 "(%"PRIu64") Frame buffer address cross " 1567 "sub-regioin found when attaching RX frame " 1568 "buffer address!\n", 1569 dev->device_fh); 1570 put_desc_to_used_list_zcp(vq, desc_idx); 1571 continue; 1572 } 1573 } while (unlikely(phys_addr == 0)); 1574 1575 rte_ring_sc_dequeue(vpool->ring, &obj); 1576 mbuf = obj; 1577 if (unlikely(mbuf == NULL)) { 1578 RTE_LOG(DEBUG, VHOST_DATA, 1579 "(%"PRIu64") in attach_rxmbuf_zcp: " 1580 "ring_sc_dequeue fail.\n", 1581 dev->device_fh); 1582 put_desc_to_used_list_zcp(vq, desc_idx); 1583 return; 1584 } 1585 1586 if (unlikely(vpool->buf_size > desc->len)) { 1587 RTE_LOG(DEBUG, VHOST_DATA, 1588 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1589 "length(%d) of descriptor idx: %d less than room " 1590 "size required: %d\n", 1591 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1592 put_desc_to_used_list_zcp(vq, desc_idx); 1593 rte_ring_sp_enqueue(vpool->ring, obj); 1594 return; 1595 } 1596 1597 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1598 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1599 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1600 mbuf->data_len = desc->len; 1601 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1602 1603 RTE_LOG(DEBUG, VHOST_DATA, 1604 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1605 "descriptor idx:%d\n", 1606 dev->device_fh, res_base_idx, desc_idx); 1607 1608 __rte_mbuf_raw_free(mbuf); 1609 1610 return; 1611 } 1612 1613 /* 1614 * Detach an attched packet mbuf - 1615 * - restore original mbuf address and length values. 1616 * - reset pktmbuf data and data_len to their default values. 1617 * All other fields of the given packet mbuf will be left intact. 1618 * 1619 * @param m 1620 * The attached packet mbuf. 
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = rte_mbuf_to_baddr(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	RTE_LOG(DEBUG, VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(MBUF_EXT_MEM(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	RTE_LOG(DEBUG, VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	RTE_LOG(DEBUG, VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed.
 * It fetches each mbuf from vpool->pool, detaches it, and puts it back into
 * vpool->ring.
1710 */ 1711 static void mbuf_destroy_zcp(struct vpool *vpool) 1712 { 1713 struct rte_mbuf *mbuf = NULL; 1714 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1715 1716 RTE_LOG(DEBUG, VHOST_CONFIG, 1717 "in mbuf_destroy_zcp: mbuf count in mempool before " 1718 "mbuf_destroy_zcp is: %d\n", 1719 mbuf_count); 1720 RTE_LOG(DEBUG, VHOST_CONFIG, 1721 "in mbuf_destroy_zcp: mbuf count in ring before " 1722 "mbuf_destroy_zcp is : %d\n", 1723 rte_ring_count(vpool->ring)); 1724 1725 for (index = 0; index < mbuf_count; index++) { 1726 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1727 if (likely(mbuf != NULL)) { 1728 if (likely(MBUF_EXT_MEM(mbuf))) 1729 pktmbuf_detach_zcp(mbuf); 1730 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1731 } 1732 } 1733 1734 RTE_LOG(DEBUG, VHOST_CONFIG, 1735 "in mbuf_destroy_zcp: mbuf count in mempool after " 1736 "mbuf_destroy_zcp is: %d\n", 1737 rte_mempool_count(vpool->pool)); 1738 RTE_LOG(DEBUG, VHOST_CONFIG, 1739 "in mbuf_destroy_zcp: mbuf count in ring after " 1740 "mbuf_destroy_zcp is : %d\n", 1741 rte_ring_count(vpool->ring)); 1742 } 1743 1744 /* 1745 * This function update the use flag and counter. 1746 */ 1747 static inline uint32_t __attribute__((always_inline)) 1748 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1749 uint32_t count) 1750 { 1751 struct vhost_virtqueue *vq; 1752 struct vring_desc *desc; 1753 struct rte_mbuf *buff; 1754 /* The virtio_hdr is initialised to 0. */ 1755 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1756 = {{0, 0, 0, 0, 0, 0}, 0}; 1757 uint64_t buff_hdr_addr = 0; 1758 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1759 uint32_t head_idx, packet_success = 0; 1760 uint16_t res_cur_idx; 1761 1762 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_rx()\n", 1763 dev->device_fh); 1764 1765 if (count == 0) 1766 return 0; 1767 1768 vq = dev->virtqueue[VIRTIO_RXQ]; 1769 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1770 1771 res_cur_idx = vq->last_used_idx; 1772 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Current Index %d| End Index %d\n", 1773 dev->device_fh, res_cur_idx, res_cur_idx + count); 1774 1775 /* Retrieve all of the head indexes first to avoid caching issues. */ 1776 for (head_idx = 0; head_idx < count; head_idx++) 1777 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1778 1779 /*Prefetch descriptor index. */ 1780 rte_prefetch0(&vq->desc[head[packet_success]]); 1781 1782 while (packet_success != count) { 1783 /* Get descriptor from available ring */ 1784 desc = &vq->desc[head[packet_success]]; 1785 1786 buff = pkts[packet_success]; 1787 RTE_LOG(DEBUG, VHOST_DATA, 1788 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1789 "pkt[%d] descriptor idx: %d\n", 1790 dev->device_fh, packet_success, 1791 MBUF_HEADROOM_UINT32(buff)); 1792 1793 PRINT_PACKET(dev, 1794 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1795 + RTE_PKTMBUF_HEADROOM), 1796 rte_pktmbuf_data_len(buff), 0); 1797 1798 /* Buffer address translation for virtio header. */ 1799 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1800 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1801 1802 /* 1803 * If the descriptors are chained the header and data are 1804 * placed in separate buffers. 
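		 *
		 * Chained case (two descriptors), for reference:
		 *	desc[head]      carries the virtio_net_hdr (vq->vhost_hlen bytes)
		 *	desc[head].next carries the frame data (rte_pktmbuf_data_len(buff))
		 * Otherwise a single descriptor holds both header and data.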
1805 */ 1806 if (desc->flags & VRING_DESC_F_NEXT) { 1807 desc->len = vq->vhost_hlen; 1808 desc = &vq->desc[desc->next]; 1809 desc->len = rte_pktmbuf_data_len(buff); 1810 } else { 1811 desc->len = packet_len; 1812 } 1813 1814 /* Update used ring with desc information */ 1815 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1816 = head[packet_success]; 1817 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1818 = packet_len; 1819 res_cur_idx++; 1820 packet_success++; 1821 1822 /* A header is required per buffer. */ 1823 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1824 (const void *)&virtio_hdr, vq->vhost_hlen); 1825 1826 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1827 1828 if (likely(packet_success < count)) { 1829 /* Prefetch descriptor index. */ 1830 rte_prefetch0(&vq->desc[head[packet_success]]); 1831 } 1832 } 1833 1834 rte_compiler_barrier(); 1835 1836 RTE_LOG(DEBUG, VHOST_DATA, 1837 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1838 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1839 dev->device_fh, vq->last_used_idx, vq->used->idx); 1840 1841 *(volatile uint16_t *)&vq->used->idx += count; 1842 vq->last_used_idx += count; 1843 1844 RTE_LOG(DEBUG, VHOST_DATA, 1845 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1846 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1847 dev->device_fh, vq->last_used_idx, vq->used->idx); 1848 1849 /* Kick the guest if necessary. */ 1850 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1851 eventfd_write(vq->callfd, (eventfd_t)1); 1852 1853 return count; 1854 } 1855 1856 /* 1857 * This function routes the TX packet to the correct interface. 1858 * This may be a local device or the physical port. 1859 */ 1860 static inline void __attribute__((always_inline)) 1861 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1862 uint32_t desc_idx, uint8_t need_copy) 1863 { 1864 struct mbuf_table *tx_q; 1865 struct rte_mbuf **m_table; 1866 void *obj = NULL; 1867 struct rte_mbuf *mbuf; 1868 unsigned len, ret, offset = 0; 1869 struct vpool *vpool; 1870 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1871 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1872 1873 /* Add packet to the port tx queue */ 1874 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1875 len = tx_q->len; 1876 1877 /* Allocate an mbuf and populate the structure. */ 1878 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1879 rte_ring_sc_dequeue(vpool->ring, &obj); 1880 mbuf = obj; 1881 if (unlikely(mbuf == NULL)) { 1882 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1883 RTE_LOG(ERR, VHOST_DATA, 1884 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1885 dev->device_fh); 1886 put_desc_to_used_list_zcp(vq, desc_idx); 1887 return; 1888 } 1889 1890 if (vm2vm_mode == VM2VM_HARDWARE) { 1891 /* Avoid using a vlan tag from any vm for an external pkt, such as 1892 * vlan_tags[dev->device_fh]; otherwise it conflicts with pool 1893 * selection: the MAC address identifies it as an external pkt 1894 * that should go to the network, while the vlan tag identifies it 1895 * as a vm2vm pkt that should be forwarded to another vm. The 1896 * hardware cannot resolve such an ambiguity, so the pkt would be lost.
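	 * For example (hypothetical tags): if VM 0 owns vlan_tags[0] == 1000,
	 * tagging an outbound external frame with 1000 would make the NIC's
	 * pool selection treat it as vm2vm traffic, so external frames are
	 * tagged with external_pkt_default_vlan_tag instead, unless
	 * find_local_dest() below resolves a local destination.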
1897 */ 1898 vlan_tag = external_pkt_default_vlan_tag; 1899 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1900 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1901 __rte_mbuf_raw_free(mbuf); 1902 return; 1903 } 1904 } 1905 1906 mbuf->nb_segs = m->nb_segs; 1907 mbuf->next = m->next; 1908 mbuf->data_len = m->data_len + offset; 1909 mbuf->pkt_len = mbuf->data_len; 1910 if (unlikely(need_copy)) { 1911 /* Copy the packet contents to the mbuf. */ 1912 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1913 rte_pktmbuf_mtod(m, void *), 1914 m->data_len); 1915 } else { 1916 mbuf->data_off = m->data_off; 1917 mbuf->buf_physaddr = m->buf_physaddr; 1918 mbuf->buf_addr = m->buf_addr; 1919 } 1920 mbuf->ol_flags |= PKT_TX_VLAN_PKT; 1921 mbuf->vlan_tci = vlan_tag; 1922 mbuf->l2_len = sizeof(struct ether_hdr); 1923 mbuf->l3_len = sizeof(struct ipv4_hdr); 1924 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1925 1926 tx_q->m_table[len] = mbuf; 1927 len++; 1928 1929 RTE_LOG(DEBUG, VHOST_DATA, 1930 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1931 dev->device_fh, 1932 mbuf->nb_segs, 1933 (mbuf->next == NULL) ? "null" : "non-null"); 1934 1935 if (enable_stats) { 1936 dev_statistics[dev->device_fh].tx_total++; 1937 dev_statistics[dev->device_fh].tx++; 1938 } 1939 1940 if (unlikely(len == MAX_PKT_BURST)) { 1941 m_table = (struct rte_mbuf **)tx_q->m_table; 1942 ret = rte_eth_tx_burst(ports[0], 1943 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1944 1945 /* 1946 * Free any buffers not handled by TX and update 1947 * the port stats. 1948 */ 1949 if (unlikely(ret < len)) { 1950 do { 1951 rte_pktmbuf_free(m_table[ret]); 1952 } while (++ret < len); 1953 } 1954 1955 len = 0; 1956 txmbuf_clean_zcp(dev, vpool); 1957 } 1958 1959 tx_q->len = len; 1960 1961 return; 1962 } 1963 1964 /* 1965 * This function TX all available packets in virtio TX queue for one 1966 * virtio-net device. If it is first packet, it learns MAC address and 1967 * setup VMDQ. 1968 */ 1969 static inline void __attribute__((always_inline)) 1970 virtio_dev_tx_zcp(struct virtio_net *dev) 1971 { 1972 struct rte_mbuf m; 1973 struct vhost_virtqueue *vq; 1974 struct vring_desc *desc; 1975 uint64_t buff_addr = 0, phys_addr; 1976 uint32_t head[MAX_PKT_BURST]; 1977 uint32_t i; 1978 uint16_t free_entries, packet_success = 0; 1979 uint16_t avail_idx; 1980 uint8_t need_copy = 0; 1981 hpa_type addr_type; 1982 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1983 1984 vq = dev->virtqueue[VIRTIO_TXQ]; 1985 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1986 1987 /* If there are no available buffers then return. */ 1988 if (vq->last_used_idx_res == avail_idx) 1989 return; 1990 1991 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_tx()\n", 1992 dev->device_fh); 1993 1994 /* Prefetch available ring to retrieve head indexes. */ 1995 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1996 1997 /* Get the number of free entries in the ring */ 1998 free_entries = (avail_idx - vq->last_used_idx_res); 1999 2000 /* Limit to MAX_PKT_BURST. */ 2001 free_entries 2002 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 2003 2004 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Buffers available %d\n", 2005 dev->device_fh, free_entries); 2006 2007 /* Retrieve all of the head indexes first to avoid caching issues. 
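	 * Copying the head indexes into a local array up front means the avail
	 * ring, which the guest updates concurrently, is read once per entry
	 * instead of being re-read while each descriptor is processed.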
*/ 2008 for (i = 0; i < free_entries; i++) 2009 head[i] 2010 = vq->avail->ring[(vq->last_used_idx_res + i) 2011 & (vq->size - 1)]; 2012 2013 vq->last_used_idx_res += free_entries; 2014 2015 /* Prefetch descriptor index. */ 2016 rte_prefetch0(&vq->desc[head[packet_success]]); 2017 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 2018 2019 while (packet_success < free_entries) { 2020 desc = &vq->desc[head[packet_success]]; 2021 2022 /* Discard first buffer as it is the virtio header */ 2023 desc = &vq->desc[desc->next]; 2024 2025 /* Buffer address translation. */ 2026 buff_addr = gpa_to_vva(dev, desc->addr); 2027 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 2028 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 2029 &addr_type); 2030 2031 if (likely(packet_success < (free_entries - 1))) 2032 /* Prefetch descriptor index. */ 2033 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 2034 2035 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 2036 RTE_LOG(ERR, VHOST_DATA, 2037 "(%"PRIu64") Invalid frame buffer address found" 2038 "when TX packets!\n", 2039 dev->device_fh); 2040 packet_success++; 2041 continue; 2042 } 2043 2044 /* Prefetch buffer address. */ 2045 rte_prefetch0((void *)(uintptr_t)buff_addr); 2046 2047 /* 2048 * Setup dummy mbuf. This is copied to a real mbuf if 2049 * transmitted out the physical port. 2050 */ 2051 m.data_len = desc->len; 2052 m.nb_segs = 1; 2053 m.next = NULL; 2054 m.data_off = 0; 2055 m.buf_addr = (void *)(uintptr_t)buff_addr; 2056 m.buf_physaddr = phys_addr; 2057 2058 /* 2059 * Check if the frame buffer address from guest crosses 2060 * sub-region or not. 2061 */ 2062 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2063 RTE_LOG(ERR, VHOST_DATA, 2064 "(%"PRIu64") Frame buffer address cross " 2065 "sub-regioin found when attaching TX frame " 2066 "buffer address!\n", 2067 dev->device_fh); 2068 need_copy = 1; 2069 } else 2070 need_copy = 0; 2071 2072 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2073 2074 /* 2075 * If this is the first received packet we need to learn 2076 * the MAC and setup VMDQ 2077 */ 2078 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2079 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2080 /* 2081 * Discard frame if device is scheduled for 2082 * removal or a duplicate MAC address is found. 2083 */ 2084 packet_success += free_entries; 2085 vq->last_used_idx += packet_success; 2086 break; 2087 } 2088 } 2089 2090 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2091 packet_success++; 2092 } 2093 } 2094 2095 /* 2096 * This function is called by each data core. It handles all RX/TX registered 2097 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2098 * addresses are compared with all devices in the main linked list. 
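 *
 * Each pass of the loop below roughly does (sketch):
 *
 *   if (cur_tsc - prev_tsc > drain_tsc)
 *           drain every device's tx_q and recycle mbufs (txmbuf_clean_zcp)
 *   for each device on this lcore:
 *           attach free mbufs, rte_eth_rx_burst() --> virtio_dev_rx_zcp()
 *           virtio_dev_tx_zcp()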
2099 */ 2100 static int 2101 switch_worker_zcp(__attribute__((unused)) void *arg) 2102 { 2103 struct virtio_net *dev = NULL; 2104 struct vhost_dev *vdev = NULL; 2105 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2106 struct virtio_net_data_ll *dev_ll; 2107 struct mbuf_table *tx_q; 2108 volatile struct lcore_ll_info *lcore_ll; 2109 const uint64_t drain_tsc 2110 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2111 * BURST_TX_DRAIN_US; 2112 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2113 unsigned ret; 2114 const uint16_t lcore_id = rte_lcore_id(); 2115 uint16_t count_in_ring, rx_count = 0; 2116 2117 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2118 2119 lcore_ll = lcore_info[lcore_id].lcore_ll; 2120 prev_tsc = 0; 2121 2122 while (1) { 2123 cur_tsc = rte_rdtsc(); 2124 2125 /* TX burst queue drain */ 2126 diff_tsc = cur_tsc - prev_tsc; 2127 if (unlikely(diff_tsc > drain_tsc)) { 2128 /* 2129 * Get mbuf from vpool.pool and detach mbuf and 2130 * put back into vpool.ring. 2131 */ 2132 dev_ll = lcore_ll->ll_root_used; 2133 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2134 /* Get virtio device ID */ 2135 vdev = dev_ll->vdev; 2136 dev = vdev->dev; 2137 2138 if (likely(!vdev->remove)) { 2139 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2140 if (tx_q->len) { 2141 RTE_LOG(DEBUG, VHOST_DATA, 2142 "TX queue drained after timeout" 2143 " with burst size %u\n", 2144 tx_q->len); 2145 2146 /* 2147 * Tx any packets in the queue 2148 */ 2149 ret = rte_eth_tx_burst( 2150 ports[0], 2151 (uint16_t)tx_q->txq_id, 2152 (struct rte_mbuf **) 2153 tx_q->m_table, 2154 (uint16_t)tx_q->len); 2155 if (unlikely(ret < tx_q->len)) { 2156 do { 2157 rte_pktmbuf_free( 2158 tx_q->m_table[ret]); 2159 } while (++ret < tx_q->len); 2160 } 2161 tx_q->len = 0; 2162 2163 txmbuf_clean_zcp(dev, 2164 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2165 } 2166 } 2167 dev_ll = dev_ll->next; 2168 } 2169 prev_tsc = cur_tsc; 2170 } 2171 2172 rte_prefetch0(lcore_ll->ll_root_used); 2173 2174 /* 2175 * Inform the configuration core that we have exited the linked 2176 * list and that no devices are in use if requested. 2177 */ 2178 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2179 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2180 2181 /* Process devices */ 2182 dev_ll = lcore_ll->ll_root_used; 2183 2184 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2185 vdev = dev_ll->vdev; 2186 dev = vdev->dev; 2187 if (unlikely(vdev->remove)) { 2188 dev_ll = dev_ll->next; 2189 unlink_vmdq(vdev); 2190 vdev->ready = DEVICE_SAFE_REMOVE; 2191 continue; 2192 } 2193 2194 if (likely(vdev->ready == DEVICE_RX)) { 2195 uint32_t index = vdev->vmdq_rx_q; 2196 uint16_t i; 2197 count_in_ring 2198 = rte_ring_count(vpool_array[index].ring); 2199 uint16_t free_entries 2200 = (uint16_t)get_available_ring_num_zcp(dev); 2201 2202 /* 2203 * Attach all mbufs in vpool.ring and put back 2204 * into vpool.pool. 
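			 * The number of mbufs attached per pass is bounded by
			 * RTE_MIN(free_entries, RTE_MIN(count_in_ring, MAX_PKT_BURST)),
			 * so RX never posts more buffers than the guest has made
			 * available.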
2205 */ 2206 for (i = 0; 2207 i < RTE_MIN(free_entries, 2208 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2209 i++) 2210 attach_rxmbuf_zcp(dev); 2211 2212 /* Handle guest RX */ 2213 rx_count = rte_eth_rx_burst(ports[0], 2214 vdev->vmdq_rx_q, pkts_burst, 2215 MAX_PKT_BURST); 2216 2217 if (rx_count) { 2218 ret_count = virtio_dev_rx_zcp(dev, 2219 pkts_burst, rx_count); 2220 if (enable_stats) { 2221 dev_statistics[dev->device_fh].rx_total 2222 += rx_count; 2223 dev_statistics[dev->device_fh].rx 2224 += ret_count; 2225 } 2226 while (likely(rx_count)) { 2227 rx_count--; 2228 pktmbuf_detach_zcp( 2229 pkts_burst[rx_count]); 2230 rte_ring_sp_enqueue( 2231 vpool_array[index].ring, 2232 (void *)pkts_burst[rx_count]); 2233 } 2234 } 2235 } 2236 2237 if (likely(!vdev->remove)) 2238 /* Handle guest TX */ 2239 virtio_dev_tx_zcp(dev); 2240 2241 /* Move to the next device in the list */ 2242 dev_ll = dev_ll->next; 2243 } 2244 } 2245 2246 return 0; 2247 } 2248 2249 2250 /* 2251 * Add an entry to a used linked list. A free entry must first be found 2252 * in the free linked list using get_data_ll_free_entry(); 2253 */ 2254 static void 2255 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2256 struct virtio_net_data_ll *ll_dev) 2257 { 2258 struct virtio_net_data_ll *ll = *ll_root_addr; 2259 2260 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2261 ll_dev->next = NULL; 2262 rte_compiler_barrier(); 2263 2264 /* If ll == NULL then this is the first device. */ 2265 if (ll) { 2266 /* Increment to the tail of the linked list. */ 2267 while ((ll->next != NULL) ) 2268 ll = ll->next; 2269 2270 ll->next = ll_dev; 2271 } else { 2272 *ll_root_addr = ll_dev; 2273 } 2274 } 2275 2276 /* 2277 * Remove an entry from a used linked list. The entry must then be added to 2278 * the free linked list using put_data_ll_free_entry(). 2279 */ 2280 static void 2281 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2282 struct virtio_net_data_ll *ll_dev, 2283 struct virtio_net_data_ll *ll_dev_last) 2284 { 2285 struct virtio_net_data_ll *ll = *ll_root_addr; 2286 2287 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2288 return; 2289 2290 if (ll_dev == ll) 2291 *ll_root_addr = ll_dev->next; 2292 else 2293 if (likely(ll_dev_last != NULL)) 2294 ll_dev_last->next = ll_dev->next; 2295 else 2296 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2297 } 2298 2299 /* 2300 * Find and return an entry from the free linked list. 2301 */ 2302 static struct virtio_net_data_ll * 2303 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2304 { 2305 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2306 struct virtio_net_data_ll *ll_dev; 2307 2308 if (ll_free == NULL) 2309 return NULL; 2310 2311 ll_dev = ll_free; 2312 *ll_root_addr = ll_free->next; 2313 2314 return ll_dev; 2315 } 2316 2317 /* 2318 * Place an entry back on to the free linked list. 2319 */ 2320 static void 2321 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2322 struct virtio_net_data_ll *ll_dev) 2323 { 2324 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2325 2326 if (ll_dev == NULL) 2327 return; 2328 2329 ll_dev->next = ll_free; 2330 *ll_root_addr = ll_dev; 2331 } 2332 2333 /* 2334 * Creates a linked list of a given size. 2335 */ 2336 static struct virtio_net_data_ll * 2337 alloc_data_ll(uint32_t size) 2338 { 2339 struct virtio_net_data_ll *ll_new; 2340 uint32_t i; 2341 2342 /* Malloc and then chain the linked list. 
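 * The result is a simple singly linked free list (illustrative):
 *
 *   ll_new[0] -> ll_new[1] -> ... -> ll_new[size-1] -> NULL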
*/ 2343 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2344 if (ll_new == NULL) { 2345 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2346 return NULL; 2347 } 2348 2349 for (i = 0; i < size - 1; i++) { 2350 ll_new[i].vdev = NULL; 2351 ll_new[i].next = &ll_new[i+1]; 2352 } 2353 ll_new[i].next = NULL; 2354 2355 return ll_new; 2356 } 2357 2358 /* 2359 * Create the main linked list along with each individual core's linked list. A used and a free list 2360 * are created to manage entries. 2361 */ 2362 static int 2363 init_data_ll (void) 2364 { 2365 int lcore; 2366 2367 RTE_LCORE_FOREACH_SLAVE(lcore) { 2368 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2369 if (lcore_info[lcore].lcore_ll == NULL) { 2370 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2371 return -1; 2372 } 2373 2374 lcore_info[lcore].lcore_ll->device_num = 0; 2375 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2376 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2377 if (num_devices % num_switching_cores) 2378 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2379 else 2380 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2381 } 2382 2383 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2384 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2385 2386 return 0; 2387 } 2388 2389 /* 2390 * Remove a device from the specific data core linked list and from the main linked list. Synchronization 2391 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering 2392 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2393 */ 2394 static void 2395 destroy_device (volatile struct virtio_net *dev) 2396 { 2397 struct virtio_net_data_ll *ll_lcore_dev_cur; 2398 struct virtio_net_data_ll *ll_main_dev_cur; 2399 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2400 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2401 struct vhost_dev *vdev; 2402 int lcore; 2403 2404 dev->flags &= ~VIRTIO_DEV_RUNNING; 2405 2406 vdev = (struct vhost_dev *)dev->priv; 2407 /* Set the remove flag. */ 2408 vdev->remove = 1; 2409 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2410 rte_pause(); 2411 } 2412 2413 /* Search for entry to be removed from lcore ll */ 2414 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2415 while (ll_lcore_dev_cur != NULL) { 2416 if (ll_lcore_dev_cur->vdev == vdev) { 2417 break; 2418 } else { 2419 ll_lcore_dev_last = ll_lcore_dev_cur; 2420 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2421 } 2422 } 2423 2424 if (ll_lcore_dev_cur == NULL) { 2425 RTE_LOG(ERR, VHOST_CONFIG, 2426 "(%"PRIu64") Failed to find the dev to be destroyed.\n", 2427 dev->device_fh); 2428 return; 2429 } 2430 2431 /* Search for entry to be removed from main ll */ 2432 ll_main_dev_cur = ll_root_used; 2433 ll_main_dev_last = NULL; 2434 while (ll_main_dev_cur != NULL) { 2435 if (ll_main_dev_cur->vdev == vdev) { 2436 break; 2437 } else { 2438 ll_main_dev_last = ll_main_dev_cur; 2439 ll_main_dev_cur = ll_main_dev_cur->next; 2440 } 2441 } 2442 2443 /* Remove entries from the lcore and main ll. */ 2444 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2445 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2446 2447 /* Set the dev_removal_flag on each lcore.
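	 * REQUEST_DEV_REMOVAL / ACK_DEV_REMOVAL form a simple handshake with the
	 * workers: each worker writes ACK_DEV_REMOVAL back at the top of its
	 * polling loop once it is no longer walking the device list, and the
	 * wait loop below blocks until every core has done so.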
*/ 2448 RTE_LCORE_FOREACH_SLAVE(lcore) { 2449 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2450 } 2451 2452 /* 2453 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure 2454 * that it can no longer access the device removed from the linked lists and that 2455 * the device is no longer in use. 2456 */ 2457 RTE_LCORE_FOREACH_SLAVE(lcore) { 2458 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2459 rte_pause(); 2460 } 2461 } 2462 2463 /* Add the entries back to the lcore and main free ll.*/ 2464 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2465 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2466 2467 /* Decrement the number of devices on the lcore. */ 2468 lcore_info[vdev->coreid].lcore_ll->device_num--; 2469 2470 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2471 2472 if (zero_copy) { 2473 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2474 2475 /* Stop the RX queue. */ 2476 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2477 RTE_LOG(DEBUG, VHOST_CONFIG, 2478 "(%"PRIu64") In destroy_device: Failed to stop " 2479 "rx queue:%d\n", 2480 dev->device_fh, 2481 vdev->vmdq_rx_q); 2482 } 2483 2484 RTE_LOG(DEBUG, VHOST_CONFIG, 2485 "(%"PRIu64") in destroy_device: Start put mbuf in " 2486 "mempool back to ring for RX queue: %d\n", 2487 dev->device_fh, vdev->vmdq_rx_q); 2488 2489 mbuf_destroy_zcp(vpool); 2490 2491 /* Stop the TX queue. */ 2492 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2493 RTE_LOG(DEBUG, VHOST_CONFIG, 2494 "(%"PRIu64") In destroy_device: Failed to " 2495 "stop tx queue:%d\n", 2496 dev->device_fh, vdev->vmdq_rx_q); 2497 } 2498 2499 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2500 2501 RTE_LOG(DEBUG, VHOST_CONFIG, 2502 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2503 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2504 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2505 dev->device_fh); 2506 2507 mbuf_destroy_zcp(vpool); 2508 rte_free(vdev->regions_hpa); 2509 } 2510 rte_free(vdev); 2511 2512 } 2513 2514 /* 2515 * Calculate the number of physically contiguous sub-regions within one 2516 * particular region whose vhost virtual address range is contiguous. The 2517 * region starts at vva_start and spans 'size' bytes.
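 *
 * For example (hypothetical 4 KB pages): a 16 KB virtually contiguous region
 * whose pages map to physical frames {P, P+4K, Q, Q+4K} has one physical
 * discontinuity, so the function returns 1 (one extra sub-region).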
2518 */ 2519 static uint32_t 2520 check_hpa_regions(uint64_t vva_start, uint64_t size) 2521 { 2522 uint32_t i, nregions = 0, page_size = getpagesize(); 2523 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2524 if (vva_start % page_size) { 2525 RTE_LOG(DEBUG, VHOST_CONFIG, 2526 "in check_hpa_regions: vva start(%p) mod page_size(%d) " 2527 "has remainder\n", 2528 (void *)(uintptr_t)vva_start, page_size); 2529 return 0; 2530 } 2531 if (size % page_size) { 2532 RTE_LOG(DEBUG, VHOST_CONFIG, 2533 "in check_hpa_regions: " 2534 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2535 size, page_size); 2536 return 0; 2537 } 2538 for (i = 0; i < size - page_size; i = i + page_size) { 2539 cur_phys_addr 2540 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2541 next_phys_addr = rte_mem_virt2phy( 2542 (void *)(uintptr_t)(vva_start + i + page_size)); 2543 if ((cur_phys_addr + page_size) != next_phys_addr) { 2544 ++nregions; 2545 RTE_LOG(DEBUG, VHOST_CONFIG, 2546 "in check_hpa_regions: hva addr:(%p) is not " 2547 "continuous with hva addr:(%p), diff:%d\n", 2548 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2549 (void *)(uintptr_t)(vva_start + (uint64_t)i 2550 + page_size), page_size); 2551 RTE_LOG(DEBUG, VHOST_CONFIG, 2552 "in check_hpa_regions: hpa addr:(%p) is not " 2553 "continuous with hpa addr:(%p), " 2554 "diff:(%"PRIu64")\n", 2555 (void *)(uintptr_t)cur_phys_addr, 2556 (void *)(uintptr_t)next_phys_addr, 2557 (next_phys_addr-cur_phys_addr)); 2558 } 2559 } 2560 return nregions; 2561 } 2562 2563 /* 2564 * Divide each region whose vhost virtual address range is contiguous into 2565 * sub-regions such that the physical addresses within each sub-region are 2566 * contiguous, and fill the offset (to GPA), size and other information of 2567 * each sub-region into regions_hpa.
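 *
 * For example (hypothetical 4 KB pages): a guest region whose pages map to
 * physical frames {P, P+4K, Q, Q+4K} is split into two regions_hpa entries,
 * each with its own guest_phys_address, guest_phys_address_end, memory_size
 * and host_phys_addr_offset.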
2568 */ 2569 static uint32_t 2570 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2571 { 2572 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2573 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2574 2575 if (mem_region_hpa == NULL) 2576 return 0; 2577 2578 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2579 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2580 virtio_memory->regions[regionidx].address_offset; 2581 mem_region_hpa[regionidx_hpa].guest_phys_address 2582 = virtio_memory->regions[regionidx].guest_phys_address; 2583 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2584 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2585 mem_region_hpa[regionidx_hpa].guest_phys_address; 2586 RTE_LOG(DEBUG, VHOST_CONFIG, 2587 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2588 regionidx_hpa, 2589 (void *)(uintptr_t) 2590 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2591 RTE_LOG(DEBUG, VHOST_CONFIG, 2592 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2593 regionidx_hpa, 2594 (void *)(uintptr_t) 2595 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2596 for (i = 0, k = 0; 2597 i < virtio_memory->regions[regionidx].memory_size - 2598 page_size; 2599 i += page_size) { 2600 cur_phys_addr = rte_mem_virt2phy( 2601 (void *)(uintptr_t)(vva_start + i)); 2602 next_phys_addr = rte_mem_virt2phy( 2603 (void *)(uintptr_t)(vva_start + 2604 i + page_size)); 2605 if ((cur_phys_addr + page_size) != next_phys_addr) { 2606 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2607 mem_region_hpa[regionidx_hpa].guest_phys_address + 2608 k + page_size; 2609 mem_region_hpa[regionidx_hpa].memory_size 2610 = k + page_size; 2611 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest " 2612 "phys addr end [%d]:(%p)\n", 2613 regionidx_hpa, 2614 (void *)(uintptr_t) 2615 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2616 RTE_LOG(DEBUG, VHOST_CONFIG, 2617 "in fill_hpa_regions: guest phys addr " 2618 "size [%d]:(%p)\n", 2619 regionidx_hpa, 2620 (void *)(uintptr_t) 2621 (mem_region_hpa[regionidx_hpa].memory_size)); 2622 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2623 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2624 ++regionidx_hpa; 2625 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2626 next_phys_addr - 2627 mem_region_hpa[regionidx_hpa].guest_phys_address; 2628 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest" 2629 " phys addr start[%d]:(%p)\n", 2630 regionidx_hpa, 2631 (void *)(uintptr_t) 2632 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2633 RTE_LOG(DEBUG, VHOST_CONFIG, 2634 "in fill_hpa_regions: host phys addr " 2635 "start[%d]:(%p)\n", 2636 regionidx_hpa, 2637 (void *)(uintptr_t) 2638 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2639 k = 0; 2640 } else { 2641 k += page_size; 2642 } 2643 } 2644 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2645 = mem_region_hpa[regionidx_hpa].guest_phys_address 2646 + k + page_size; 2647 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2648 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2649 "[%d]:(%p)\n", regionidx_hpa, 2650 (void *)(uintptr_t) 2651 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2652 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2653 "[%d]:(%p)\n", regionidx_hpa, 2654 (void *)(uintptr_t) 2655 (mem_region_hpa[regionidx_hpa].memory_size)); 2656 
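		/* Advance to a fresh regions_hpa slot for the next guest region. */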
++regionidx_hpa; 2657 } 2658 return regionidx_hpa; 2659 } 2660 2661 /* 2662 * A new device is added to a data core. First the device is added to the main linked list 2663 * and the allocated to a specific data core. 2664 */ 2665 static int 2666 new_device (struct virtio_net *dev) 2667 { 2668 struct virtio_net_data_ll *ll_dev; 2669 int lcore, core_add = 0; 2670 uint32_t device_num_min = num_devices; 2671 struct vhost_dev *vdev; 2672 uint32_t regionidx; 2673 2674 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2675 if (vdev == NULL) { 2676 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2677 dev->device_fh); 2678 return -1; 2679 } 2680 vdev->dev = dev; 2681 dev->priv = vdev; 2682 2683 if (zero_copy) { 2684 vdev->nregions_hpa = dev->mem->nregions; 2685 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2686 vdev->nregions_hpa 2687 += check_hpa_regions( 2688 dev->mem->regions[regionidx].guest_phys_address 2689 + dev->mem->regions[regionidx].address_offset, 2690 dev->mem->regions[regionidx].memory_size); 2691 2692 } 2693 2694 vdev->regions_hpa = rte_calloc("vhost hpa region", 2695 vdev->nregions_hpa, 2696 sizeof(struct virtio_memory_regions_hpa), 2697 RTE_CACHE_LINE_SIZE); 2698 if (vdev->regions_hpa == NULL) { 2699 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2700 rte_free(vdev); 2701 return -1; 2702 } 2703 2704 2705 if (fill_hpa_memory_regions( 2706 vdev->regions_hpa, dev->mem 2707 ) != vdev->nregions_hpa) { 2708 2709 RTE_LOG(ERR, VHOST_CONFIG, 2710 "hpa memory regions number mismatch: " 2711 "[%d]\n", vdev->nregions_hpa); 2712 rte_free(vdev->regions_hpa); 2713 rte_free(vdev); 2714 return -1; 2715 } 2716 } 2717 2718 2719 /* Add device to main ll */ 2720 ll_dev = get_data_ll_free_entry(&ll_root_free); 2721 if (ll_dev == NULL) { 2722 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2723 "of %d devices per core has been reached\n", 2724 dev->device_fh, num_devices); 2725 if (vdev->regions_hpa) 2726 rte_free(vdev->regions_hpa); 2727 rte_free(vdev); 2728 return -1; 2729 } 2730 ll_dev->vdev = vdev; 2731 add_data_ll_entry(&ll_root_used, ll_dev); 2732 vdev->vmdq_rx_q 2733 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2734 2735 if (zero_copy) { 2736 uint32_t index = vdev->vmdq_rx_q; 2737 uint32_t count_in_ring, i; 2738 struct mbuf_table *tx_q; 2739 2740 count_in_ring = rte_ring_count(vpool_array[index].ring); 2741 2742 RTE_LOG(DEBUG, VHOST_CONFIG, 2743 "(%"PRIu64") in new_device: mbuf count in mempool " 2744 "before attach is: %d\n", 2745 dev->device_fh, 2746 rte_mempool_count(vpool_array[index].pool)); 2747 RTE_LOG(DEBUG, VHOST_CONFIG, 2748 "(%"PRIu64") in new_device: mbuf count in ring " 2749 "before attach is : %d\n", 2750 dev->device_fh, count_in_ring); 2751 2752 /* 2753 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
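		 * Re-attaching every parked mbuf here pre-populates the new
		 * device's VMDq RX queue with guest buffers before the RX/TX
		 * queues are started below.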
2754 */ 2755 for (i = 0; i < count_in_ring; i++) 2756 attach_rxmbuf_zcp(dev); 2757 2758 RTE_LOG(DEBUG, VHOST_CONFIG, "(%" PRIu64 ") in new_device: " 2759 "mbuf count in mempool after attach is: %d\n", 2760 dev->device_fh, 2761 rte_mempool_count(vpool_array[index].pool)); 2762 RTE_LOG(DEBUG, VHOST_CONFIG, "(%" PRIu64 ") in new_device: " 2763 "mbuf count in ring after attach is : %d\n", 2764 dev->device_fh, 2765 rte_ring_count(vpool_array[index].ring)); 2766 2767 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2768 tx_q->txq_id = vdev->vmdq_rx_q; 2769 2770 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2771 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2772 2773 RTE_LOG(DEBUG, VHOST_CONFIG, 2774 "(%"PRIu64") In new_device: Failed to start " 2775 "tx queue:%d\n", 2776 dev->device_fh, vdev->vmdq_rx_q); 2777 2778 mbuf_destroy_zcp(vpool); 2779 rte_free(vdev->regions_hpa); 2780 rte_free(vdev); 2781 return -1; 2782 } 2783 2784 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2785 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2786 2787 RTE_LOG(DEBUG, VHOST_CONFIG, 2788 "(%"PRIu64") In new_device: Failed to start " 2789 "rx queue:%d\n", 2790 dev->device_fh, vdev->vmdq_rx_q); 2791 2792 /* Stop the TX queue. */ 2793 if (rte_eth_dev_tx_queue_stop(ports[0], 2794 vdev->vmdq_rx_q) != 0) { 2795 RTE_LOG(DEBUG, VHOST_CONFIG, 2796 "(%"PRIu64") In new_device: Failed to " 2797 "stop tx queue:%d\n", 2798 dev->device_fh, vdev->vmdq_rx_q); 2799 } 2800 2801 mbuf_destroy_zcp(vpool); 2802 rte_free(vdev->regions_hpa); 2803 rte_free(vdev); 2804 return -1; 2805 } 2806 2807 } 2808 2809 /*reset ready flag*/ 2810 vdev->ready = DEVICE_MAC_LEARNING; 2811 vdev->remove = 0; 2812 2813 /* Find a suitable lcore to add the device. */ 2814 RTE_LCORE_FOREACH_SLAVE(lcore) { 2815 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2816 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2817 core_add = lcore; 2818 } 2819 } 2820 /* Add device to lcore ll */ 2821 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2822 if (ll_dev == NULL) { 2823 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2824 vdev->ready = DEVICE_SAFE_REMOVE; 2825 destroy_device(dev); 2826 rte_free(vdev->regions_hpa); 2827 rte_free(vdev); 2828 return -1; 2829 } 2830 ll_dev->vdev = vdev; 2831 vdev->coreid = core_add; 2832 2833 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2834 2835 /* Initialize device stats */ 2836 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2837 2838 /* Disable notifications. */ 2839 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2840 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2841 lcore_info[vdev->coreid].lcore_ll->device_num++; 2842 dev->flags |= VIRTIO_DEV_RUNNING; 2843 2844 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2845 2846 return 0; 2847 } 2848 2849 /* 2850 * These callback allow devices to be added to the data core when configuration 2851 * has been fully complete. 2852 */ 2853 static const struct virtio_net_device_ops virtio_net_device_ops = 2854 { 2855 .new_device = new_device, 2856 .destroy_device = destroy_device, 2857 }; 2858 2859 /* 2860 * This is a thread will wake up after a period to print stats if the user has 2861 * enabled them. 
2862 */ 2863 static void 2864 print_stats(void) 2865 { 2866 struct virtio_net_data_ll *dev_ll; 2867 uint64_t tx_dropped, rx_dropped; 2868 uint64_t tx, tx_total, rx, rx_total; 2869 uint32_t device_fh; 2870 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2871 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2872 2873 while(1) { 2874 sleep(enable_stats); 2875 2876 /* Clear screen and move to top left */ 2877 printf("%s%s", clr, top_left); 2878 2879 printf("\nDevice statistics ===================================="); 2880 2881 dev_ll = ll_root_used; 2882 while (dev_ll != NULL) { 2883 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2884 tx_total = dev_statistics[device_fh].tx_total; 2885 tx = dev_statistics[device_fh].tx; 2886 tx_dropped = tx_total - tx; 2887 if (zero_copy == 0) { 2888 rx_total = rte_atomic64_read( 2889 &dev_statistics[device_fh].rx_total_atomic); 2890 rx = rte_atomic64_read( 2891 &dev_statistics[device_fh].rx_atomic); 2892 } else { 2893 rx_total = dev_statistics[device_fh].rx_total; 2894 rx = dev_statistics[device_fh].rx; 2895 } 2896 rx_dropped = rx_total - rx; 2897 2898 printf("\nStatistics for device %"PRIu32" ------------------------------" 2899 "\nTX total: %"PRIu64"" 2900 "\nTX dropped: %"PRIu64"" 2901 "\nTX successful: %"PRIu64"" 2902 "\nRX total: %"PRIu64"" 2903 "\nRX dropped: %"PRIu64"" 2904 "\nRX successful: %"PRIu64"", 2905 device_fh, 2906 tx_total, 2907 tx_dropped, 2908 tx, 2909 rx_total, 2910 rx_dropped, 2911 rx); 2912 2913 dev_ll = dev_ll->next; 2914 } 2915 printf("\n======================================================\n"); 2916 } 2917 } 2918 2919 static void 2920 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2921 char *ring_name, uint32_t nb_mbuf) 2922 { 2923 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2924 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2925 if (vpool_array[index].pool != NULL) { 2926 vpool_array[index].ring 2927 = rte_ring_create(ring_name, 2928 rte_align32pow2(nb_mbuf + 1), 2929 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2930 if (likely(vpool_array[index].ring != NULL)) { 2931 RTE_LOG(DEBUG, VHOST_CONFIG, 2932 "in setup_mempool_tbl: mbuf count in " 2933 "mempool is: %d\n", 2934 rte_mempool_count(vpool_array[index].pool)); 2935 RTE_LOG(DEBUG, VHOST_CONFIG, 2936 "in setup_mempool_tbl: mbuf count in " 2937 "ring is: %d\n", 2938 rte_ring_count(vpool_array[index].ring)); 2939 } else { 2940 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2941 ring_name); 2942 } 2943 2944 /* Need consider head room. */ 2945 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2946 } else { 2947 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2948 } 2949 } 2950 2951 /* When we receive a INT signal, unregister vhost driver */ 2952 static void 2953 sigint_handler(__rte_unused int signum) 2954 { 2955 /* Unregister vhost driver. */ 2956 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2957 if (ret != 0) 2958 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2959 exit(0); 2960 } 2961 2962 /* 2963 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2964 * device is also registered here to handle the IOCTLs. 
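 *
 * Initialisation order (sketch of the body below):
 *
 *   rte_eal_init() -> us_vhost_parse_args() -> mbuf pool / ring setup
 *   -> port_init() for each enabled port -> init_data_ll()
 *   -> launch switch_worker or switch_worker_zcp on every slave lcore
 *   -> rte_vhost_driver_register() -> rte_vhost_driver_session_start()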
2965 */ 2966 int 2967 main(int argc, char *argv[]) 2968 { 2969 struct rte_mempool *mbuf_pool = NULL; 2970 unsigned lcore_id, core_id = 0; 2971 unsigned nb_ports, valid_num_ports; 2972 int ret; 2973 uint8_t portid; 2974 uint16_t queue_id; 2975 static pthread_t tid; 2976 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2977 2978 signal(SIGINT, sigint_handler); 2979 2980 /* init EAL */ 2981 ret = rte_eal_init(argc, argv); 2982 if (ret < 0) 2983 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2984 argc -= ret; 2985 argv += ret; 2986 2987 /* parse app arguments */ 2988 ret = us_vhost_parse_args(argc, argv); 2989 if (ret < 0) 2990 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2991 2992 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2993 if (rte_lcore_is_enabled(lcore_id)) 2994 lcore_ids[core_id ++] = lcore_id; 2995 2996 if (rte_lcore_count() > RTE_MAX_LCORE) 2997 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2998 2999 /*set the number of swithcing cores available*/ 3000 num_switching_cores = rte_lcore_count()-1; 3001 3002 /* Get the number of physical ports. */ 3003 nb_ports = rte_eth_dev_count(); 3004 if (nb_ports > RTE_MAX_ETHPORTS) 3005 nb_ports = RTE_MAX_ETHPORTS; 3006 3007 /* 3008 * Update the global var NUM_PORTS and global array PORTS 3009 * and get value of var VALID_NUM_PORTS according to system ports number 3010 */ 3011 valid_num_ports = check_ports_num(nb_ports); 3012 3013 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 3014 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 3015 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 3016 return -1; 3017 } 3018 3019 if (zero_copy == 0) { 3020 /* Create the mbuf pool. */ 3021 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 3022 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 3023 0, MBUF_DATA_SIZE, rte_socket_id()); 3024 if (mbuf_pool == NULL) 3025 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 3026 3027 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 3028 vpool_array[queue_id].pool = mbuf_pool; 3029 3030 if (vm2vm_mode == VM2VM_HARDWARE) { 3031 /* Enable VT loop back to let L2 switch to do it. */ 3032 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3033 RTE_LOG(DEBUG, VHOST_CONFIG, 3034 "Enable loop back for L2 switch in vmdq.\n"); 3035 } 3036 } else { 3037 uint32_t nb_mbuf; 3038 char pool_name[RTE_MEMPOOL_NAMESIZE]; 3039 char ring_name[RTE_MEMPOOL_NAMESIZE]; 3040 3041 nb_mbuf = num_rx_descriptor 3042 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3043 + num_switching_cores * MAX_PKT_BURST; 3044 3045 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3046 snprintf(pool_name, sizeof(pool_name), 3047 "rxmbuf_pool_%u", queue_id); 3048 snprintf(ring_name, sizeof(ring_name), 3049 "rxmbuf_ring_%u", queue_id); 3050 setup_mempool_tbl(rte_socket_id(), queue_id, 3051 pool_name, ring_name, nb_mbuf); 3052 } 3053 3054 nb_mbuf = num_tx_descriptor 3055 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3056 + num_switching_cores * MAX_PKT_BURST; 3057 3058 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3059 snprintf(pool_name, sizeof(pool_name), 3060 "txmbuf_pool_%u", queue_id); 3061 snprintf(ring_name, sizeof(ring_name), 3062 "txmbuf_ring_%u", queue_id); 3063 setup_mempool_tbl(rte_socket_id(), 3064 (queue_id + MAX_QUEUES), 3065 pool_name, ring_name, nb_mbuf); 3066 } 3067 3068 if (vm2vm_mode == VM2VM_HARDWARE) { 3069 /* Enable VT loop back to let L2 switch to do it. 
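		 * (With VM2VM_HARDWARE, VM-to-VM traffic is then reflected by
		 * the NIC's internal switch instead of being forwarded in
		 * software.)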
*/ 3070 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3071 RTE_LOG(DEBUG, VHOST_CONFIG, 3072 "Enable loop back for L2 switch in vmdq.\n"); 3073 } 3074 } 3075 3076 /* initialize all ports */ 3077 for (portid = 0; portid < nb_ports; portid++) { 3078 /* skip ports that are not enabled */ 3079 if ((enabled_port_mask & (1 << portid)) == 0) { 3080 RTE_LOG(INFO, VHOST_PORT, 3081 "Skipping disabled port %d\n", portid); 3082 continue; 3083 } 3084 if (port_init(portid) != 0) 3085 rte_exit(EXIT_FAILURE, 3086 "Cannot initialize network ports\n"); 3087 } 3088 3089 /* Initialise all linked lists. */ 3090 if (init_data_ll() == -1) 3091 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3092 3093 /* Initialize device stats */ 3094 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3095 3096 /* Enable stats if the user option is set. */ 3097 if (enable_stats) { 3098 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3099 if (ret != 0) 3100 rte_exit(EXIT_FAILURE, 3101 "Cannot create print-stats thread\n"); 3102 3103 /* Set thread_name for aid in debugging. */ 3104 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3105 ret = rte_thread_setname(tid, thread_name); 3106 if (ret != 0) 3107 RTE_LOG(ERR, VHOST_CONFIG, 3108 "Cannot set print-stats name\n"); 3109 } 3110 3111 /* Launch all data cores. */ 3112 if (zero_copy == 0) { 3113 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3114 rte_eal_remote_launch(switch_worker, 3115 mbuf_pool, lcore_id); 3116 } 3117 } else { 3118 uint32_t count_in_mempool, index, i; 3119 for (index = 0; index < 2*MAX_QUEUES; index++) { 3120 /* For all RX and TX queues. */ 3121 count_in_mempool 3122 = rte_mempool_count(vpool_array[index].pool); 3123 3124 /* 3125 * Transfer all un-attached mbufs from vpool.pool 3126 * to vpoo.ring. 3127 */ 3128 for (i = 0; i < count_in_mempool; i++) { 3129 struct rte_mbuf *mbuf 3130 = __rte_mbuf_raw_alloc( 3131 vpool_array[index].pool); 3132 rte_ring_sp_enqueue(vpool_array[index].ring, 3133 (void *)mbuf); 3134 } 3135 3136 RTE_LOG(DEBUG, VHOST_CONFIG, 3137 "in main: mbuf count in mempool at initial " 3138 "is: %d\n", count_in_mempool); 3139 RTE_LOG(DEBUG, VHOST_CONFIG, 3140 "in main: mbuf count in ring at initial is :" 3141 " %d\n", 3142 rte_ring_count(vpool_array[index].ring)); 3143 } 3144 3145 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3146 rte_eal_remote_launch(switch_worker_zcp, NULL, 3147 lcore_id); 3148 } 3149 3150 if (mergeable == 0) 3151 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3152 3153 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3154 ret = rte_vhost_driver_register((char *)&dev_basename); 3155 if (ret != 0) 3156 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3157 3158 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3159 3160 /* Start CUSE session. */ 3161 rte_vhost_driver_session_start(); 3162 return 0; 3163 3164 } 3165