/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
		(num_switching_cores * MAX_PKT_BURST) + \
		(num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
		(num_switching_cores * MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP	RTE_MBUF_DEFAULT_DATAROOM
#define MBUF_DATA_SIZE_ZCP		RTE_MBUF_DEFAULT_BUF_SIZE
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
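/*
 * A device starts in DEVICE_MAC_LEARNING, moves to DEVICE_RX once its MAC
 * address has been learned and registered with VMDQ (see link_vmdq()), and
 * is marked DEVICE_SAFE_REMOVE after unlink_vmdq() during teardown.
 */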
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For legacy front end, 128 descriptors,
 * half for virtio header, another half for mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))

#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Do vlan strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * It is necessary for a 1G NIC such as the I350; this fixes
		 * a bug where ipv4 forwarding in the guest cannot forward
		 * packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
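/*
 * Layout of the 802.1Q header built below: destination MAC (6 bytes),
 * source MAC (6 bytes), VLAN ethertype (h_vlan_proto), 16-bit tag control
 * information (h_vlan_TCI) and the encapsulated ethertype, giving
 * VLAN_ETH_HLEN (18) bytes in total.
 */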
struct vlan_ethhdr {
	unsigned char   h_dest[ETH_ALEN];
	unsigned char   h_source[ETH_ALEN];
	__be16          h_vlan_proto;
	__be16          h_vlan_TCI;
	__be16          h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/*
	 * Zero copy defers queue RX/TX start to the time when the guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	if (enable_tx_csum == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

	if (enable_tso == 0) {
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
	}

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject basenames that do not fit (with NUL) in dev_basename. */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
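/*
 * For example, "-p 0x1" selects port 0 only; since MAX_SUP_PORTS is 1,
 * exactly one bit of the mask may be set.
 */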
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
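/*
 * Illustrative invocation (binary name, core mask and memory channels are
 * examples only):
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename usvhost --stats 2
 * Everything after "--" is handled by this parser; the EAL consumes the rest.
 */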
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"vlan-strip", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable RX VLAN strip on host. */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
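/*
 * The lookup walks the per-device table of host-physical regions built when
 * the guest memory was mapped. addr_type reports whether the whole buffer of
 * buf_len bytes fits inside one region (PHYS_ADDR_CONTINUOUS) or runs past
 * its end (PHYS_ADDR_CROSS_SUBREG); the zero-copy path drops descriptors of
 * the latter kind.
 */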
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	if (vlan_strip)
		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
			(uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[dev->device_fh].tx_total++;
					dev_statistics[dev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * Check if the destination MAC of a packet is a local VM,
 * and if so get its vlan tag and offset.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip reduces the packet length by the
			 * length of the vlan tag, so the packet length needs
			 * to be restored by adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				(uint16_t)
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				(int)*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct udp_hdr *udp_hdr = NULL;
	struct sctp_hdr *sctp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->tso_segsz != 0) {
		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
		tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
		m->ol_flags |= PKT_TX_IP_CKSUM;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
		return;
	}

	if (m->ol_flags & PKT_TX_L4_MASK) {
		switch (m->ol_flags & PKT_TX_L4_MASK) {
		case PKT_TX_TCP_CKSUM:
			tcp_hdr = (struct tcp_hdr *)
				((char *)l3_hdr + m->l3_len);
			tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
			break;
		case PKT_TX_UDP_CKSUM:
			udp_hdr = (struct udp_hdr *)
				((char *)l3_hdr + m->l3_len);
			udp_hdr->dgram_cksum = get_psd_sum(l3_hdr, m->ol_flags);
			break;
		case PKT_TX_SCTP_CKSUM:
			sctp_hdr = (struct sctp_hdr *)
				((char *)l3_hdr + m->l3_len);
			sctp_hdr->cksum = 0;
			break;
		default:
			break;
		}
	}
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
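/*
 * Note on checksums: virtio_tx_offload() only fills in the pseudo-header
 * checksum; the NIC computes the final L4 checksum (and performs TSO
 * segmentation) according to the PKT_TX_* flags carried by the mbuf.
 */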
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;
	struct ether_hdr *nh;

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if ((m->ol_flags & PKT_TX_L4_MASK) || (m->ol_flags & PKT_TX_TCP_SEG))
		virtio_tx_offload(m);

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
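/*
 * drain_tsc below converts BURST_TX_DRAIN_US into TSC cycles; with a 2 GHz
 * TSC (illustrative value) a partially filled TX burst is flushed roughly
 * every 200,000 cycles (~100 us).
 */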
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}

/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}

/*
 * This function gets available ring indexes for zero copy rx;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
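/*
 * The reservation works on free-running 16-bit indexes: the caller is given
 * [*res_base_idx, *res_base_idx + count) and last_used_idx_res is advanced
 * so the next reservation starts after it; wrap-around is absorbed by the
 * unsigned subtraction used to compute free_entries.
 */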
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * un-attached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put in the wrong location
 * in the mbuf.
 */
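/*
 * The guest descriptor index is stashed in the first four bytes of the mbuf
 * headroom via MBUF_HEADROOM_UINT32(), so when the mbuf comes back from the
 * NIC the matching vring entry can be returned to the used ring.
 */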
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	void *obj = NULL;
	struct rte_mbuf *mbuf;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, &obj);
	mbuf = obj;
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, obj);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}

/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
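/*
 * This is a trimmed-down, zero-copy variant of rte_pktmbuf_detach(): it only
 * restores the buffer address/length fields that attach_rxmbuf_zcp() and
 * virtio_tx_route_zcp() overwrite, leaving the rest of the mbuf intact.
 */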
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = rte_mbuf_to_baddr(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(MBUF_EXT_MEM(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
 */
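/*
 * Draining the whole mempool back onto vpool->ring here ensures that a
 * device later re-using this queue starts with every zero-copy mbuf
 * detached and available again.
 */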
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(MBUF_EXT_MEM(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}

/*
 * This function updates the used flag and counter.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
1821 */ 1822 if (desc->flags & VRING_DESC_F_NEXT) { 1823 desc->len = vq->vhost_hlen; 1824 desc = &vq->desc[desc->next]; 1825 desc->len = rte_pktmbuf_data_len(buff); 1826 } else { 1827 desc->len = packet_len; 1828 } 1829 1830 /* Update used ring with desc information */ 1831 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1832 = head[packet_success]; 1833 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1834 = packet_len; 1835 res_cur_idx++; 1836 packet_success++; 1837 1838 /* A header is required per buffer. */ 1839 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1840 (const void *)&virtio_hdr, vq->vhost_hlen); 1841 1842 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1843 1844 if (likely(packet_success < count)) { 1845 /* Prefetch descriptor index. */ 1846 rte_prefetch0(&vq->desc[head[packet_success]]); 1847 } 1848 } 1849 1850 rte_compiler_barrier(); 1851 1852 LOG_DEBUG(VHOST_DATA, 1853 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1854 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1855 dev->device_fh, vq->last_used_idx, vq->used->idx); 1856 1857 *(volatile uint16_t *)&vq->used->idx += count; 1858 vq->last_used_idx += count; 1859 1860 LOG_DEBUG(VHOST_DATA, 1861 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1862 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1863 dev->device_fh, vq->last_used_idx, vq->used->idx); 1864 1865 /* Kick the guest if necessary. */ 1866 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1867 eventfd_write(vq->callfd, (eventfd_t)1); 1868 1869 return count; 1870 } 1871 1872 /* 1873 * This function routes the TX packet to the correct interface. 1874 * This may be a local device or the physical port. 1875 */ 1876 static inline void __attribute__((always_inline)) 1877 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1878 uint32_t desc_idx, uint8_t need_copy) 1879 { 1880 struct mbuf_table *tx_q; 1881 struct rte_mbuf **m_table; 1882 void *obj = NULL; 1883 struct rte_mbuf *mbuf; 1884 unsigned len, ret, offset = 0; 1885 struct vpool *vpool; 1886 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1887 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1888 1889 /*Add packet to the port tx queue*/ 1890 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1891 len = tx_q->len; 1892 1893 /* Allocate an mbuf and populate the structure. */ 1894 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1895 rte_ring_sc_dequeue(vpool->ring, &obj); 1896 mbuf = obj; 1897 if (unlikely(mbuf == NULL)) { 1898 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1899 RTE_LOG(ERR, VHOST_DATA, 1900 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1901 dev->device_fh); 1902 put_desc_to_used_list_zcp(vq, desc_idx); 1903 return; 1904 } 1905 1906 if (vm2vm_mode == VM2VM_HARDWARE) { 1907 /* Avoid using a vlan tag from any vm for external pkt, such as 1908 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1909 * selection, MAC address determines it as an external pkt 1910 * which should go to network, while vlan tag determine it as 1911 * a vm2vm pkt should forward to another vm. Hardware confuse 1912 * such a ambiguous situation, so pkt will lost. 
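 *
 * In short: with hardware VM2VM forwarding an outbound frame must not carry
 * a guest's VMDq VLAN tag, or the NIC pool-selection logic would treat it
 * as VM-to-VM traffic while its MAC address says it is external, and the
 * frame would be dropped. The tag therefore defaults to
 * external_pkt_default_vlan_tag; find_local_dest() below may adjust the
 * tag and offset when the destination is another local VM, and a non-zero
 * return drops the frame.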
1913 */ 1914 vlan_tag = external_pkt_default_vlan_tag; 1915 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1916 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1917 __rte_mbuf_raw_free(mbuf); 1918 return; 1919 } 1920 } 1921 1922 mbuf->nb_segs = m->nb_segs; 1923 mbuf->next = m->next; 1924 mbuf->data_len = m->data_len + offset; 1925 mbuf->pkt_len = mbuf->data_len; 1926 if (unlikely(need_copy)) { 1927 /* Copy the packet contents to the mbuf. */ 1928 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1929 rte_pktmbuf_mtod(m, void *), 1930 m->data_len); 1931 } else { 1932 mbuf->data_off = m->data_off; 1933 mbuf->buf_physaddr = m->buf_physaddr; 1934 mbuf->buf_addr = m->buf_addr; 1935 } 1936 mbuf->ol_flags |= PKT_TX_VLAN_PKT; 1937 mbuf->vlan_tci = vlan_tag; 1938 mbuf->l2_len = sizeof(struct ether_hdr); 1939 mbuf->l3_len = sizeof(struct ipv4_hdr); 1940 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1941 1942 tx_q->m_table[len] = mbuf; 1943 len++; 1944 1945 LOG_DEBUG(VHOST_DATA, 1946 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1947 dev->device_fh, 1948 mbuf->nb_segs, 1949 (mbuf->next == NULL) ? "null" : "non-null"); 1950 1951 if (enable_stats) { 1952 dev_statistics[dev->device_fh].tx_total++; 1953 dev_statistics[dev->device_fh].tx++; 1954 } 1955 1956 if (unlikely(len == MAX_PKT_BURST)) { 1957 m_table = (struct rte_mbuf **)tx_q->m_table; 1958 ret = rte_eth_tx_burst(ports[0], 1959 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1960 1961 /* 1962 * Free any buffers not handled by TX and update 1963 * the port stats. 1964 */ 1965 if (unlikely(ret < len)) { 1966 do { 1967 rte_pktmbuf_free(m_table[ret]); 1968 } while (++ret < len); 1969 } 1970 1971 len = 0; 1972 txmbuf_clean_zcp(dev, vpool); 1973 } 1974 1975 tx_q->len = len; 1976 1977 return; 1978 } 1979 1980 /* 1981 * This function TX all available packets in virtio TX queue for one 1982 * virtio-net device. If it is first packet, it learns MAC address and 1983 * setup VMDQ. 1984 */ 1985 static inline void __attribute__((always_inline)) 1986 virtio_dev_tx_zcp(struct virtio_net *dev) 1987 { 1988 struct rte_mbuf m; 1989 struct vhost_virtqueue *vq; 1990 struct vring_desc *desc; 1991 uint64_t buff_addr = 0, phys_addr; 1992 uint32_t head[MAX_PKT_BURST]; 1993 uint32_t i; 1994 uint16_t free_entries, packet_success = 0; 1995 uint16_t avail_idx; 1996 uint8_t need_copy = 0; 1997 hpa_type addr_type; 1998 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1999 2000 vq = dev->virtqueue[VIRTIO_TXQ]; 2001 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 2002 2003 /* If there are no available buffers then return. */ 2004 if (vq->last_used_idx_res == avail_idx) 2005 return; 2006 2007 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 2008 2009 /* Prefetch available ring to retrieve head indexes. */ 2010 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 2011 2012 /* Get the number of free entries in the ring */ 2013 free_entries = (avail_idx - vq->last_used_idx_res); 2014 2015 /* Limit to MAX_PKT_BURST. */ 2016 free_entries 2017 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 2018 2019 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 2020 dev->device_fh, free_entries); 2021 2022 /* Retrieve all of the head indexes first to avoid caching issues. */ 2023 for (i = 0; i < free_entries; i++) 2024 head[i] 2025 = vq->avail->ring[(vq->last_used_idx_res + i) 2026 & (vq->size - 1)]; 2027 2028 vq->last_used_idx_res += free_entries; 2029 2030 /* Prefetch descriptor index. 
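 * The loop below walks each harvested avail entry: the first descriptor of
 * a chain only holds the virtio_net header and is skipped, and the data
 * descriptor is wrapped in a stack-allocated dummy mbuf so it can be routed
 * without a copy; need_copy is set only when the guest buffer crosses a
 * physically discontiguous boundary, and descriptors whose address cannot
 * be translated are skipped altogether.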
*/ 2031 rte_prefetch0(&vq->desc[head[packet_success]]); 2032 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 2033 2034 while (packet_success < free_entries) { 2035 desc = &vq->desc[head[packet_success]]; 2036 2037 /* Discard first buffer as it is the virtio header */ 2038 desc = &vq->desc[desc->next]; 2039 2040 /* Buffer address translation. */ 2041 buff_addr = gpa_to_vva(dev, desc->addr); 2042 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 2043 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 2044 &addr_type); 2045 2046 if (likely(packet_success < (free_entries - 1))) 2047 /* Prefetch descriptor index. */ 2048 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 2049 2050 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 2051 RTE_LOG(ERR, VHOST_DATA, 2052 "(%"PRIu64") Invalid frame buffer address found" 2053 "when TX packets!\n", 2054 dev->device_fh); 2055 packet_success++; 2056 continue; 2057 } 2058 2059 /* Prefetch buffer address. */ 2060 rte_prefetch0((void *)(uintptr_t)buff_addr); 2061 2062 /* 2063 * Setup dummy mbuf. This is copied to a real mbuf if 2064 * transmitted out the physical port. 2065 */ 2066 m.data_len = desc->len; 2067 m.nb_segs = 1; 2068 m.next = NULL; 2069 m.data_off = 0; 2070 m.buf_addr = (void *)(uintptr_t)buff_addr; 2071 m.buf_physaddr = phys_addr; 2072 2073 /* 2074 * Check if the frame buffer address from guest crosses 2075 * sub-region or not. 2076 */ 2077 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2078 RTE_LOG(ERR, VHOST_DATA, 2079 "(%"PRIu64") Frame buffer address cross " 2080 "sub-regioin found when attaching TX frame " 2081 "buffer address!\n", 2082 dev->device_fh); 2083 need_copy = 1; 2084 } else 2085 need_copy = 0; 2086 2087 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2088 2089 /* 2090 * If this is the first received packet we need to learn 2091 * the MAC and setup VMDQ 2092 */ 2093 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2094 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2095 /* 2096 * Discard frame if device is scheduled for 2097 * removal or a duplicate MAC address is found. 2098 */ 2099 packet_success += free_entries; 2100 vq->last_used_idx += packet_success; 2101 break; 2102 } 2103 } 2104 2105 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2106 packet_success++; 2107 } 2108 } 2109 2110 /* 2111 * This function is called by each data core. It handles all RX/TX registered 2112 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2113 * addresses are compared with all devices in the main linked list. 
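 *
 * TX bursts that never fill up are flushed on a timer; the threshold test
 * can be sketched as the hypothetical helper below (switch_worker_zcp()
 * computes drain_tsc once and does the comparison inline):
 */
static inline int
tx_drain_due(uint64_t cur_tsc, uint64_t prev_tsc)
{
	/* BURST_TX_DRAIN_US microseconds, expressed in TSC cycles. */
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
		/ US_PER_S * BURST_TX_DRAIN_US;

	return (cur_tsc - prev_tsc) > drain_tsc;
}

/*
 * Data-core main loop for the zero-copy path.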
2114 */ 2115 static int 2116 switch_worker_zcp(__attribute__((unused)) void *arg) 2117 { 2118 struct virtio_net *dev = NULL; 2119 struct vhost_dev *vdev = NULL; 2120 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2121 struct virtio_net_data_ll *dev_ll; 2122 struct mbuf_table *tx_q; 2123 volatile struct lcore_ll_info *lcore_ll; 2124 const uint64_t drain_tsc 2125 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2126 * BURST_TX_DRAIN_US; 2127 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2128 unsigned ret; 2129 const uint16_t lcore_id = rte_lcore_id(); 2130 uint16_t count_in_ring, rx_count = 0; 2131 2132 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2133 2134 lcore_ll = lcore_info[lcore_id].lcore_ll; 2135 prev_tsc = 0; 2136 2137 while (1) { 2138 cur_tsc = rte_rdtsc(); 2139 2140 /* TX burst queue drain */ 2141 diff_tsc = cur_tsc - prev_tsc; 2142 if (unlikely(diff_tsc > drain_tsc)) { 2143 /* 2144 * Get mbuf from vpool.pool and detach mbuf and 2145 * put back into vpool.ring. 2146 */ 2147 dev_ll = lcore_ll->ll_root_used; 2148 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2149 /* Get virtio device ID */ 2150 vdev = dev_ll->vdev; 2151 dev = vdev->dev; 2152 2153 if (likely(!vdev->remove)) { 2154 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2155 if (tx_q->len) { 2156 LOG_DEBUG(VHOST_DATA, 2157 "TX queue drained after timeout" 2158 " with burst size %u\n", 2159 tx_q->len); 2160 2161 /* 2162 * Tx any packets in the queue 2163 */ 2164 ret = rte_eth_tx_burst( 2165 ports[0], 2166 (uint16_t)tx_q->txq_id, 2167 (struct rte_mbuf **) 2168 tx_q->m_table, 2169 (uint16_t)tx_q->len); 2170 if (unlikely(ret < tx_q->len)) { 2171 do { 2172 rte_pktmbuf_free( 2173 tx_q->m_table[ret]); 2174 } while (++ret < tx_q->len); 2175 } 2176 tx_q->len = 0; 2177 2178 txmbuf_clean_zcp(dev, 2179 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2180 } 2181 } 2182 dev_ll = dev_ll->next; 2183 } 2184 prev_tsc = cur_tsc; 2185 } 2186 2187 rte_prefetch0(lcore_ll->ll_root_used); 2188 2189 /* 2190 * Inform the configuration core that we have exited the linked 2191 * list and that no devices are in use if requested. 2192 */ 2193 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2194 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2195 2196 /* Process devices */ 2197 dev_ll = lcore_ll->ll_root_used; 2198 2199 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2200 vdev = dev_ll->vdev; 2201 dev = vdev->dev; 2202 if (unlikely(vdev->remove)) { 2203 dev_ll = dev_ll->next; 2204 unlink_vmdq(vdev); 2205 vdev->ready = DEVICE_SAFE_REMOVE; 2206 continue; 2207 } 2208 2209 if (likely(vdev->ready == DEVICE_RX)) { 2210 uint32_t index = vdev->vmdq_rx_q; 2211 uint16_t i; 2212 count_in_ring 2213 = rte_ring_count(vpool_array[index].ring); 2214 uint16_t free_entries 2215 = (uint16_t)get_available_ring_num_zcp(dev); 2216 2217 /* 2218 * Attach all mbufs in vpool.ring and put back 2219 * into vpool.pool. 
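 * The attach count below is bounded by the free entries the guest
 * advertises, by the mbufs currently recycled into the ring, and by
 * MAX_PKT_BURST, so attach_rxmbuf_zcp() never runs without both a guest
 * descriptor and a host mbuf to pair up.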
2220 */ 2221 for (i = 0; 2222 i < RTE_MIN(free_entries, 2223 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2224 i++) 2225 attach_rxmbuf_zcp(dev); 2226 2227 /* Handle guest RX */ 2228 rx_count = rte_eth_rx_burst(ports[0], 2229 vdev->vmdq_rx_q, pkts_burst, 2230 MAX_PKT_BURST); 2231 2232 if (rx_count) { 2233 ret_count = virtio_dev_rx_zcp(dev, 2234 pkts_burst, rx_count); 2235 if (enable_stats) { 2236 dev_statistics[dev->device_fh].rx_total 2237 += rx_count; 2238 dev_statistics[dev->device_fh].rx 2239 += ret_count; 2240 } 2241 while (likely(rx_count)) { 2242 rx_count--; 2243 pktmbuf_detach_zcp( 2244 pkts_burst[rx_count]); 2245 rte_ring_sp_enqueue( 2246 vpool_array[index].ring, 2247 (void *)pkts_burst[rx_count]); 2248 } 2249 } 2250 } 2251 2252 if (likely(!vdev->remove)) 2253 /* Handle guest TX */ 2254 virtio_dev_tx_zcp(dev); 2255 2256 /* Move to the next device in the list */ 2257 dev_ll = dev_ll->next; 2258 } 2259 } 2260 2261 return 0; 2262 } 2263 2264 2265 /* 2266 * Add an entry to a used linked list. A free entry must first be found 2267 * in the free linked list using get_data_ll_free_entry(); 2268 */ 2269 static void 2270 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2271 struct virtio_net_data_ll *ll_dev) 2272 { 2273 struct virtio_net_data_ll *ll = *ll_root_addr; 2274 2275 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2276 ll_dev->next = NULL; 2277 rte_compiler_barrier(); 2278 2279 /* If ll == NULL then this is the first device. */ 2280 if (ll) { 2281 /* Increment to the tail of the linked list. */ 2282 while ((ll->next != NULL) ) 2283 ll = ll->next; 2284 2285 ll->next = ll_dev; 2286 } else { 2287 *ll_root_addr = ll_dev; 2288 } 2289 } 2290 2291 /* 2292 * Remove an entry from a used linked list. The entry must then be added to 2293 * the free linked list using put_data_ll_free_entry(). 2294 */ 2295 static void 2296 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2297 struct virtio_net_data_ll *ll_dev, 2298 struct virtio_net_data_ll *ll_dev_last) 2299 { 2300 struct virtio_net_data_ll *ll = *ll_root_addr; 2301 2302 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2303 return; 2304 2305 if (ll_dev == ll) 2306 *ll_root_addr = ll_dev->next; 2307 else 2308 if (likely(ll_dev_last != NULL)) 2309 ll_dev_last->next = ll_dev->next; 2310 else 2311 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2312 } 2313 2314 /* 2315 * Find and return an entry from the free linked list. 2316 */ 2317 static struct virtio_net_data_ll * 2318 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2319 { 2320 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2321 struct virtio_net_data_ll *ll_dev; 2322 2323 if (ll_free == NULL) 2324 return NULL; 2325 2326 ll_dev = ll_free; 2327 *ll_root_addr = ll_free->next; 2328 2329 return ll_dev; 2330 } 2331 2332 /* 2333 * Place an entry back on to the free linked list. 2334 */ 2335 static void 2336 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2337 struct virtio_net_data_ll *ll_dev) 2338 { 2339 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2340 2341 if (ll_dev == NULL) 2342 return; 2343 2344 ll_dev->next = ll_free; 2345 *ll_root_addr = ll_dev; 2346 } 2347 2348 /* 2349 * Creates a linked list of a given size. 2350 */ 2351 static struct virtio_net_data_ll * 2352 alloc_data_ll(uint32_t size) 2353 { 2354 struct virtio_net_data_ll *ll_new; 2355 uint32_t i; 2356 2357 /* Malloc and then chain the linked list. 
*/ 2358 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2359 if (ll_new == NULL) { 2360 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2361 return NULL; 2362 } 2363 2364 for (i = 0; i < size - 1; i++) { 2365 ll_new[i].vdev = NULL; 2366 ll_new[i].next = &ll_new[i+1]; 2367 } 2368 ll_new[i].next = NULL; 2369 2370 return ll_new; 2371 } 2372 2373 /* 2374 * Create the main linked list along with each individual cores linked list. A used and a free list 2375 * are created to manage entries. 2376 */ 2377 static int 2378 init_data_ll (void) 2379 { 2380 int lcore; 2381 2382 RTE_LCORE_FOREACH_SLAVE(lcore) { 2383 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2384 if (lcore_info[lcore].lcore_ll == NULL) { 2385 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2386 return -1; 2387 } 2388 2389 lcore_info[lcore].lcore_ll->device_num = 0; 2390 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2391 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2392 if (num_devices % num_switching_cores) 2393 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2394 else 2395 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2396 } 2397 2398 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2399 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2400 2401 return 0; 2402 } 2403 2404 /* 2405 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2406 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2407 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2408 */ 2409 static void 2410 destroy_device (volatile struct virtio_net *dev) 2411 { 2412 struct virtio_net_data_ll *ll_lcore_dev_cur; 2413 struct virtio_net_data_ll *ll_main_dev_cur; 2414 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2415 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2416 struct vhost_dev *vdev; 2417 int lcore; 2418 2419 dev->flags &= ~VIRTIO_DEV_RUNNING; 2420 2421 vdev = (struct vhost_dev *)dev->priv; 2422 /*set the remove flag. */ 2423 vdev->remove = 1; 2424 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2425 rte_pause(); 2426 } 2427 2428 /* Search for entry to be removed from lcore ll */ 2429 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2430 while (ll_lcore_dev_cur != NULL) { 2431 if (ll_lcore_dev_cur->vdev == vdev) { 2432 break; 2433 } else { 2434 ll_lcore_dev_last = ll_lcore_dev_cur; 2435 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2436 } 2437 } 2438 2439 if (ll_lcore_dev_cur == NULL) { 2440 RTE_LOG(ERR, VHOST_CONFIG, 2441 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2442 dev->device_fh); 2443 return; 2444 } 2445 2446 /* Search for entry to be removed from main ll */ 2447 ll_main_dev_cur = ll_root_used; 2448 ll_main_dev_last = NULL; 2449 while (ll_main_dev_cur != NULL) { 2450 if (ll_main_dev_cur->vdev == vdev) { 2451 break; 2452 } else { 2453 ll_main_dev_last = ll_main_dev_cur; 2454 ll_main_dev_cur = ll_main_dev_cur->next; 2455 } 2456 } 2457 2458 /* Remove entries from the lcore and main ll. */ 2459 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2460 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2461 2462 /* Set the dev_removal_flag on each lcore. 
*/ 2463 RTE_LCORE_FOREACH_SLAVE(lcore) { 2464 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2465 } 2466 2467 /* 2468 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2469 * they can no longer access the device removed from the linked lists and that the devices 2470 * are no longer in use. 2471 */ 2472 RTE_LCORE_FOREACH_SLAVE(lcore) { 2473 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2474 rte_pause(); 2475 } 2476 } 2477 2478 /* Add the entries back to the lcore and main free ll.*/ 2479 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2480 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2481 2482 /* Decrement number of device on the lcore. */ 2483 lcore_info[vdev->coreid].lcore_ll->device_num--; 2484 2485 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2486 2487 if (zero_copy) { 2488 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2489 2490 /* Stop the RX queue. */ 2491 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2492 LOG_DEBUG(VHOST_CONFIG, 2493 "(%"PRIu64") In destroy_device: Failed to stop " 2494 "rx queue:%d\n", 2495 dev->device_fh, 2496 vdev->vmdq_rx_q); 2497 } 2498 2499 LOG_DEBUG(VHOST_CONFIG, 2500 "(%"PRIu64") in destroy_device: Start put mbuf in " 2501 "mempool back to ring for RX queue: %d\n", 2502 dev->device_fh, vdev->vmdq_rx_q); 2503 2504 mbuf_destroy_zcp(vpool); 2505 2506 /* Stop the TX queue. */ 2507 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2508 LOG_DEBUG(VHOST_CONFIG, 2509 "(%"PRIu64") In destroy_device: Failed to " 2510 "stop tx queue:%d\n", 2511 dev->device_fh, vdev->vmdq_rx_q); 2512 } 2513 2514 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2515 2516 LOG_DEBUG(VHOST_CONFIG, 2517 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2518 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2519 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2520 dev->device_fh); 2521 2522 mbuf_destroy_zcp(vpool); 2523 rte_free(vdev->regions_hpa); 2524 } 2525 rte_free(vdev); 2526 2527 } 2528 2529 /* 2530 * Calculate the region count of physical continous regions for one particular 2531 * region of whose vhost virtual address is continous. The particular region 2532 * start from vva_start, with size of 'size' in argument. 
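 *
 * The per-page test it applies can be sketched as the hypothetical helper
 * below (illustrative only; the real loop also logs each break point):
 */
static inline int
pages_phys_contiguous(uint64_t vva, uint32_t page_size)
{
	uint64_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
	uint64_t next = rte_mem_virt2phy((void *)(uintptr_t)(vva + page_size));

	/*
	 * Two adjacent virtual pages stay in one sub-region only if their
	 * physical addresses are adjacent as well.
	 */
	return (cur + page_size) == next;
}

/*
 * Walk one virtually contiguous region page by page and count the
 * physical break points.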
2533 */ 2534 static uint32_t 2535 check_hpa_regions(uint64_t vva_start, uint64_t size) 2536 { 2537 uint32_t i, nregions = 0, page_size = getpagesize(); 2538 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2539 if (vva_start % page_size) { 2540 LOG_DEBUG(VHOST_CONFIG, 2541 "in check_countinous: vva start(%p) mod page_size(%d) " 2542 "has remainder\n", 2543 (void *)(uintptr_t)vva_start, page_size); 2544 return 0; 2545 } 2546 if (size % page_size) { 2547 LOG_DEBUG(VHOST_CONFIG, 2548 "in check_countinous: " 2549 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2550 size, page_size); 2551 return 0; 2552 } 2553 for (i = 0; i < size - page_size; i = i + page_size) { 2554 cur_phys_addr 2555 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2556 next_phys_addr = rte_mem_virt2phy( 2557 (void *)(uintptr_t)(vva_start + i + page_size)); 2558 if ((cur_phys_addr + page_size) != next_phys_addr) { 2559 ++nregions; 2560 LOG_DEBUG(VHOST_CONFIG, 2561 "in check_continuous: hva addr:(%p) is not " 2562 "continuous with hva addr:(%p), diff:%d\n", 2563 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2564 (void *)(uintptr_t)(vva_start + (uint64_t)i 2565 + page_size), page_size); 2566 LOG_DEBUG(VHOST_CONFIG, 2567 "in check_continuous: hpa addr:(%p) is not " 2568 "continuous with hpa addr:(%p), " 2569 "diff:(%"PRIu64")\n", 2570 (void *)(uintptr_t)cur_phys_addr, 2571 (void *)(uintptr_t)next_phys_addr, 2572 (next_phys_addr-cur_phys_addr)); 2573 } 2574 } 2575 return nregions; 2576 } 2577 2578 /* 2579 * Divide each region whose vhost virtual address is continous into a few 2580 * sub-regions, make sure the physical address within each sub-region are 2581 * continous. And fill offset(to GPA) and size etc. information of each 2582 * sub-region into regions_hpa. 2583 */ 2584 static uint32_t 2585 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2586 { 2587 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2588 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2589 2590 if (mem_region_hpa == NULL) 2591 return 0; 2592 2593 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2594 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2595 virtio_memory->regions[regionidx].address_offset; 2596 mem_region_hpa[regionidx_hpa].guest_phys_address 2597 = virtio_memory->regions[regionidx].guest_phys_address; 2598 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2599 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2600 mem_region_hpa[regionidx_hpa].guest_phys_address; 2601 LOG_DEBUG(VHOST_CONFIG, 2602 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2603 regionidx_hpa, 2604 (void *)(uintptr_t) 2605 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2606 LOG_DEBUG(VHOST_CONFIG, 2607 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2608 regionidx_hpa, 2609 (void *)(uintptr_t) 2610 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2611 for (i = 0, k = 0; 2612 i < virtio_memory->regions[regionidx].memory_size - 2613 page_size; 2614 i += page_size) { 2615 cur_phys_addr = rte_mem_virt2phy( 2616 (void *)(uintptr_t)(vva_start + i)); 2617 next_phys_addr = rte_mem_virt2phy( 2618 (void *)(uintptr_t)(vva_start + 2619 i + page_size)); 2620 if ((cur_phys_addr + page_size) != next_phys_addr) { 2621 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2622 mem_region_hpa[regionidx_hpa].guest_phys_address + 2623 k + page_size; 2624 mem_region_hpa[regionidx_hpa].memory_size 2625 = k + 
page_size; 2626 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2627 "phys addr end [%d]:(%p)\n", 2628 regionidx_hpa, 2629 (void *)(uintptr_t) 2630 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2631 LOG_DEBUG(VHOST_CONFIG, 2632 "in fill_hpa_regions: guest phys addr " 2633 "size [%d]:(%p)\n", 2634 regionidx_hpa, 2635 (void *)(uintptr_t) 2636 (mem_region_hpa[regionidx_hpa].memory_size)); 2637 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2638 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2639 ++regionidx_hpa; 2640 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2641 next_phys_addr - 2642 mem_region_hpa[regionidx_hpa].guest_phys_address; 2643 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2644 " phys addr start[%d]:(%p)\n", 2645 regionidx_hpa, 2646 (void *)(uintptr_t) 2647 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2648 LOG_DEBUG(VHOST_CONFIG, 2649 "in fill_hpa_regions: host phys addr " 2650 "start[%d]:(%p)\n", 2651 regionidx_hpa, 2652 (void *)(uintptr_t) 2653 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2654 k = 0; 2655 } else { 2656 k += page_size; 2657 } 2658 } 2659 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2660 = mem_region_hpa[regionidx_hpa].guest_phys_address 2661 + k + page_size; 2662 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2663 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2664 "[%d]:(%p)\n", regionidx_hpa, 2665 (void *)(uintptr_t) 2666 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2667 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2668 "[%d]:(%p)\n", regionidx_hpa, 2669 (void *)(uintptr_t) 2670 (mem_region_hpa[regionidx_hpa].memory_size)); 2671 ++regionidx_hpa; 2672 } 2673 return regionidx_hpa; 2674 } 2675 2676 /* 2677 * A new device is added to a data core. First the device is added to the main linked list 2678 * and the allocated to a specific data core. 
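 *
 * The "specific data core" is simply the enabled worker currently serving
 * the fewest devices; that selection can be sketched as the hypothetical
 * helper below (new_device() performs the same scan inline):
 */
static inline int
least_loaded_core(void)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;

	/* Pick the slave lcore with the smallest device count. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}
	return core_add;
}

/*
 * Register a newly created vhost device and hand it to a data core.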
2679 */ 2680 static int 2681 new_device (struct virtio_net *dev) 2682 { 2683 struct virtio_net_data_ll *ll_dev; 2684 int lcore, core_add = 0; 2685 uint32_t device_num_min = num_devices; 2686 struct vhost_dev *vdev; 2687 uint32_t regionidx; 2688 2689 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2690 if (vdev == NULL) { 2691 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2692 dev->device_fh); 2693 return -1; 2694 } 2695 vdev->dev = dev; 2696 dev->priv = vdev; 2697 2698 if (zero_copy) { 2699 vdev->nregions_hpa = dev->mem->nregions; 2700 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2701 vdev->nregions_hpa 2702 += check_hpa_regions( 2703 dev->mem->regions[regionidx].guest_phys_address 2704 + dev->mem->regions[regionidx].address_offset, 2705 dev->mem->regions[regionidx].memory_size); 2706 2707 } 2708 2709 vdev->regions_hpa = rte_calloc("vhost hpa region", 2710 vdev->nregions_hpa, 2711 sizeof(struct virtio_memory_regions_hpa), 2712 RTE_CACHE_LINE_SIZE); 2713 if (vdev->regions_hpa == NULL) { 2714 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2715 rte_free(vdev); 2716 return -1; 2717 } 2718 2719 2720 if (fill_hpa_memory_regions( 2721 vdev->regions_hpa, dev->mem 2722 ) != vdev->nregions_hpa) { 2723 2724 RTE_LOG(ERR, VHOST_CONFIG, 2725 "hpa memory regions number mismatch: " 2726 "[%d]\n", vdev->nregions_hpa); 2727 rte_free(vdev->regions_hpa); 2728 rte_free(vdev); 2729 return -1; 2730 } 2731 } 2732 2733 2734 /* Add device to main ll */ 2735 ll_dev = get_data_ll_free_entry(&ll_root_free); 2736 if (ll_dev == NULL) { 2737 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2738 "of %d devices per core has been reached\n", 2739 dev->device_fh, num_devices); 2740 if (vdev->regions_hpa) 2741 rte_free(vdev->regions_hpa); 2742 rte_free(vdev); 2743 return -1; 2744 } 2745 ll_dev->vdev = vdev; 2746 add_data_ll_entry(&ll_root_used, ll_dev); 2747 vdev->vmdq_rx_q 2748 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2749 2750 if (zero_copy) { 2751 uint32_t index = vdev->vmdq_rx_q; 2752 uint32_t count_in_ring, i; 2753 struct mbuf_table *tx_q; 2754 2755 count_in_ring = rte_ring_count(vpool_array[index].ring); 2756 2757 LOG_DEBUG(VHOST_CONFIG, 2758 "(%"PRIu64") in new_device: mbuf count in mempool " 2759 "before attach is: %d\n", 2760 dev->device_fh, 2761 rte_mempool_count(vpool_array[index].pool)); 2762 LOG_DEBUG(VHOST_CONFIG, 2763 "(%"PRIu64") in new_device: mbuf count in ring " 2764 "before attach is : %d\n", 2765 dev->device_fh, count_in_ring); 2766 2767 /* 2768 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
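 * Pre-attaching every recycled mbuf here hands the guest's RX buffers to
 * the NIC before the RX queue is started further down, so the hardware can
 * DMA straight into guest memory from the first received frame onwards.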
2769 */ 2770 for (i = 0; i < count_in_ring; i++) 2771 attach_rxmbuf_zcp(dev); 2772 2773 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2774 "mempool after attach is: %d\n", 2775 dev->device_fh, 2776 rte_mempool_count(vpool_array[index].pool)); 2777 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2778 "ring after attach is : %d\n", 2779 dev->device_fh, 2780 rte_ring_count(vpool_array[index].ring)); 2781 2782 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2783 tx_q->txq_id = vdev->vmdq_rx_q; 2784 2785 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2786 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2787 2788 LOG_DEBUG(VHOST_CONFIG, 2789 "(%"PRIu64") In new_device: Failed to start " 2790 "tx queue:%d\n", 2791 dev->device_fh, vdev->vmdq_rx_q); 2792 2793 mbuf_destroy_zcp(vpool); 2794 rte_free(vdev->regions_hpa); 2795 rte_free(vdev); 2796 return -1; 2797 } 2798 2799 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2800 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2801 2802 LOG_DEBUG(VHOST_CONFIG, 2803 "(%"PRIu64") In new_device: Failed to start " 2804 "rx queue:%d\n", 2805 dev->device_fh, vdev->vmdq_rx_q); 2806 2807 /* Stop the TX queue. */ 2808 if (rte_eth_dev_tx_queue_stop(ports[0], 2809 vdev->vmdq_rx_q) != 0) { 2810 LOG_DEBUG(VHOST_CONFIG, 2811 "(%"PRIu64") In new_device: Failed to " 2812 "stop tx queue:%d\n", 2813 dev->device_fh, vdev->vmdq_rx_q); 2814 } 2815 2816 mbuf_destroy_zcp(vpool); 2817 rte_free(vdev->regions_hpa); 2818 rte_free(vdev); 2819 return -1; 2820 } 2821 2822 } 2823 2824 /*reset ready flag*/ 2825 vdev->ready = DEVICE_MAC_LEARNING; 2826 vdev->remove = 0; 2827 2828 /* Find a suitable lcore to add the device. */ 2829 RTE_LCORE_FOREACH_SLAVE(lcore) { 2830 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2831 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2832 core_add = lcore; 2833 } 2834 } 2835 /* Add device to lcore ll */ 2836 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2837 if (ll_dev == NULL) { 2838 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2839 vdev->ready = DEVICE_SAFE_REMOVE; 2840 destroy_device(dev); 2841 rte_free(vdev->regions_hpa); 2842 rte_free(vdev); 2843 return -1; 2844 } 2845 ll_dev->vdev = vdev; 2846 vdev->coreid = core_add; 2847 2848 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2849 2850 /* Initialize device stats */ 2851 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2852 2853 /* Disable notifications. */ 2854 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2855 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2856 lcore_info[vdev->coreid].lcore_ll->device_num++; 2857 dev->flags |= VIRTIO_DEV_RUNNING; 2858 2859 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2860 2861 return 0; 2862 } 2863 2864 /* 2865 * These callback allow devices to be added to the data core when configuration 2866 * has been fully complete. 2867 */ 2868 static const struct virtio_net_device_ops virtio_net_device_ops = 2869 { 2870 .new_device = new_device, 2871 .destroy_device = destroy_device, 2872 }; 2873 2874 /* 2875 * This is a thread will wake up after a period to print stats if the user has 2876 * enabled them. 
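 *
 * RX counters are updated atomically on the non-zero-copy data path, so how
 * they are read depends on the mode; a hypothetical helper (print_stats()
 * below open-codes the same logic) would look like:
 */
static inline void
read_rx_stats(uint32_t device_fh, uint64_t *rx_total, uint64_t *rx)
{
	if (zero_copy == 0) {
		/* Non-zero-copy workers update these counters atomically. */
		*rx_total = rte_atomic64_read(
			&dev_statistics[device_fh].rx_total_atomic);
		*rx = rte_atomic64_read(&dev_statistics[device_fh].rx_atomic);
	} else {
		/* The zero-copy path uses plain counters. */
		*rx_total = dev_statistics[device_fh].rx_total;
		*rx = dev_statistics[device_fh].rx;
	}
}

/*
 * Periodically clear the screen and print the per-device counters.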
2877 */ 2878 static void 2879 print_stats(void) 2880 { 2881 struct virtio_net_data_ll *dev_ll; 2882 uint64_t tx_dropped, rx_dropped; 2883 uint64_t tx, tx_total, rx, rx_total; 2884 uint32_t device_fh; 2885 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2886 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2887 2888 while(1) { 2889 sleep(enable_stats); 2890 2891 /* Clear screen and move to top left */ 2892 printf("%s%s", clr, top_left); 2893 2894 printf("\nDevice statistics ===================================="); 2895 2896 dev_ll = ll_root_used; 2897 while (dev_ll != NULL) { 2898 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2899 tx_total = dev_statistics[device_fh].tx_total; 2900 tx = dev_statistics[device_fh].tx; 2901 tx_dropped = tx_total - tx; 2902 if (zero_copy == 0) { 2903 rx_total = rte_atomic64_read( 2904 &dev_statistics[device_fh].rx_total_atomic); 2905 rx = rte_atomic64_read( 2906 &dev_statistics[device_fh].rx_atomic); 2907 } else { 2908 rx_total = dev_statistics[device_fh].rx_total; 2909 rx = dev_statistics[device_fh].rx; 2910 } 2911 rx_dropped = rx_total - rx; 2912 2913 printf("\nStatistics for device %"PRIu32" ------------------------------" 2914 "\nTX total: %"PRIu64"" 2915 "\nTX dropped: %"PRIu64"" 2916 "\nTX successful: %"PRIu64"" 2917 "\nRX total: %"PRIu64"" 2918 "\nRX dropped: %"PRIu64"" 2919 "\nRX successful: %"PRIu64"", 2920 device_fh, 2921 tx_total, 2922 tx_dropped, 2923 tx, 2924 rx_total, 2925 rx_dropped, 2926 rx); 2927 2928 dev_ll = dev_ll->next; 2929 } 2930 printf("\n======================================================\n"); 2931 } 2932 } 2933 2934 static void 2935 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2936 char *ring_name, uint32_t nb_mbuf) 2937 { 2938 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2939 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2940 if (vpool_array[index].pool != NULL) { 2941 vpool_array[index].ring 2942 = rte_ring_create(ring_name, 2943 rte_align32pow2(nb_mbuf + 1), 2944 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2945 if (likely(vpool_array[index].ring != NULL)) { 2946 LOG_DEBUG(VHOST_CONFIG, 2947 "in setup_mempool_tbl: mbuf count in " 2948 "mempool is: %d\n", 2949 rte_mempool_count(vpool_array[index].pool)); 2950 LOG_DEBUG(VHOST_CONFIG, 2951 "in setup_mempool_tbl: mbuf count in " 2952 "ring is: %d\n", 2953 rte_ring_count(vpool_array[index].ring)); 2954 } else { 2955 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2956 ring_name); 2957 } 2958 2959 /* Need consider head room. */ 2960 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2961 } else { 2962 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2963 } 2964 } 2965 2966 /* When we receive a INT signal, unregister vhost driver */ 2967 static void 2968 sigint_handler(__rte_unused int signum) 2969 { 2970 /* Unregister vhost driver. */ 2971 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2972 if (ret != 0) 2973 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2974 exit(0); 2975 } 2976 2977 /* 2978 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2979 * device is also registered here to handle the IOCTLs. 
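 *
 * For the zero-copy path, each per-queue mempool created in main() is sized
 * with the same simple formula, sketched here as a hypothetical helper
 * (main() below writes the expression out twice, once with the RX and once
 * with the TX descriptor count):
 */
static inline uint32_t
zcp_pool_size(uint32_t nb_desc)
{
	/* One mbuf per ring descriptor, plus per-core burst and cache slack. */
	return nb_desc
		+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
		+ num_switching_cores * MAX_PKT_BURST;
}

/*
 * Application entry point.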
2980 */ 2981 int 2982 main(int argc, char *argv[]) 2983 { 2984 struct rte_mempool *mbuf_pool = NULL; 2985 unsigned lcore_id, core_id = 0; 2986 unsigned nb_ports, valid_num_ports; 2987 int ret; 2988 uint8_t portid; 2989 uint16_t queue_id; 2990 static pthread_t tid; 2991 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2992 2993 signal(SIGINT, sigint_handler); 2994 2995 /* init EAL */ 2996 ret = rte_eal_init(argc, argv); 2997 if (ret < 0) 2998 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2999 argc -= ret; 3000 argv += ret; 3001 3002 /* parse app arguments */ 3003 ret = us_vhost_parse_args(argc, argv); 3004 if (ret < 0) 3005 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 3006 3007 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 3008 if (rte_lcore_is_enabled(lcore_id)) 3009 lcore_ids[core_id ++] = lcore_id; 3010 3011 if (rte_lcore_count() > RTE_MAX_LCORE) 3012 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 3013 3014 /*set the number of swithcing cores available*/ 3015 num_switching_cores = rte_lcore_count()-1; 3016 3017 /* Get the number of physical ports. */ 3018 nb_ports = rte_eth_dev_count(); 3019 if (nb_ports > RTE_MAX_ETHPORTS) 3020 nb_ports = RTE_MAX_ETHPORTS; 3021 3022 /* 3023 * Update the global var NUM_PORTS and global array PORTS 3024 * and get value of var VALID_NUM_PORTS according to system ports number 3025 */ 3026 valid_num_ports = check_ports_num(nb_ports); 3027 3028 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 3029 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 3030 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 3031 return -1; 3032 } 3033 3034 if (zero_copy == 0) { 3035 /* Create the mbuf pool. */ 3036 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 3037 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 3038 0, MBUF_DATA_SIZE, rte_socket_id()); 3039 if (mbuf_pool == NULL) 3040 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 3041 3042 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 3043 vpool_array[queue_id].pool = mbuf_pool; 3044 3045 if (vm2vm_mode == VM2VM_HARDWARE) { 3046 /* Enable VT loop back to let L2 switch to do it. */ 3047 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3048 LOG_DEBUG(VHOST_CONFIG, 3049 "Enable loop back for L2 switch in vmdq.\n"); 3050 } 3051 } else { 3052 uint32_t nb_mbuf; 3053 char pool_name[RTE_MEMPOOL_NAMESIZE]; 3054 char ring_name[RTE_MEMPOOL_NAMESIZE]; 3055 3056 nb_mbuf = num_rx_descriptor 3057 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3058 + num_switching_cores * MAX_PKT_BURST; 3059 3060 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3061 snprintf(pool_name, sizeof(pool_name), 3062 "rxmbuf_pool_%u", queue_id); 3063 snprintf(ring_name, sizeof(ring_name), 3064 "rxmbuf_ring_%u", queue_id); 3065 setup_mempool_tbl(rte_socket_id(), queue_id, 3066 pool_name, ring_name, nb_mbuf); 3067 } 3068 3069 nb_mbuf = num_tx_descriptor 3070 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3071 + num_switching_cores * MAX_PKT_BURST; 3072 3073 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3074 snprintf(pool_name, sizeof(pool_name), 3075 "txmbuf_pool_%u", queue_id); 3076 snprintf(ring_name, sizeof(ring_name), 3077 "txmbuf_ring_%u", queue_id); 3078 setup_mempool_tbl(rte_socket_id(), 3079 (queue_id + MAX_QUEUES), 3080 pool_name, ring_name, nb_mbuf); 3081 } 3082 3083 if (vm2vm_mode == VM2VM_HARDWARE) { 3084 /* Enable VT loop back to let L2 switch to do it. 
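 * With hardware VM2VM forwarding the NIC's internal L2 switch delivers
 * VM-to-VM frames itself, which is why VMDq loopback is enabled in the
 * port configuration here; in the software and disabled modes the flag is
 * left at its default.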
*/ 3085 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3086 LOG_DEBUG(VHOST_CONFIG, 3087 "Enable loop back for L2 switch in vmdq.\n"); 3088 } 3089 } 3090 /* Set log level. */ 3091 rte_set_log_level(LOG_LEVEL); 3092 3093 /* initialize all ports */ 3094 for (portid = 0; portid < nb_ports; portid++) { 3095 /* skip ports that are not enabled */ 3096 if ((enabled_port_mask & (1 << portid)) == 0) { 3097 RTE_LOG(INFO, VHOST_PORT, 3098 "Skipping disabled port %d\n", portid); 3099 continue; 3100 } 3101 if (port_init(portid) != 0) 3102 rte_exit(EXIT_FAILURE, 3103 "Cannot initialize network ports\n"); 3104 } 3105 3106 /* Initialise all linked lists. */ 3107 if (init_data_ll() == -1) 3108 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3109 3110 /* Initialize device stats */ 3111 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3112 3113 /* Enable stats if the user option is set. */ 3114 if (enable_stats) { 3115 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3116 if (ret != 0) 3117 rte_exit(EXIT_FAILURE, 3118 "Cannot create print-stats thread\n"); 3119 3120 /* Set thread_name for aid in debugging. */ 3121 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3122 ret = rte_thread_setname(tid, thread_name); 3123 if (ret != 0) 3124 RTE_LOG(ERR, VHOST_CONFIG, 3125 "Cannot set print-stats name\n"); 3126 } 3127 3128 /* Launch all data cores. */ 3129 if (zero_copy == 0) { 3130 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3131 rte_eal_remote_launch(switch_worker, 3132 mbuf_pool, lcore_id); 3133 } 3134 } else { 3135 uint32_t count_in_mempool, index, i; 3136 for (index = 0; index < 2*MAX_QUEUES; index++) { 3137 /* For all RX and TX queues. */ 3138 count_in_mempool 3139 = rte_mempool_count(vpool_array[index].pool); 3140 3141 /* 3142 * Transfer all un-attached mbufs from vpool.pool 3143 * to vpoo.ring. 3144 */ 3145 for (i = 0; i < count_in_mempool; i++) { 3146 struct rte_mbuf *mbuf 3147 = __rte_mbuf_raw_alloc( 3148 vpool_array[index].pool); 3149 rte_ring_sp_enqueue(vpool_array[index].ring, 3150 (void *)mbuf); 3151 } 3152 3153 LOG_DEBUG(VHOST_CONFIG, 3154 "in main: mbuf count in mempool at initial " 3155 "is: %d\n", count_in_mempool); 3156 LOG_DEBUG(VHOST_CONFIG, 3157 "in main: mbuf count in ring at initial is :" 3158 " %d\n", 3159 rte_ring_count(vpool_array[index].ring)); 3160 } 3161 3162 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3163 rte_eal_remote_launch(switch_worker_zcp, NULL, 3164 lcore_id); 3165 } 3166 3167 if (mergeable == 0) 3168 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3169 3170 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3171 ret = rte_vhost_driver_register((char *)&dev_basename); 3172 if (ret != 0) 3173 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3174 3175 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3176 3177 /* Start CUSE session. */ 3178 rte_vhost_driver_session_start(); 3179 return 0; 3180 3181 } 3182