1 /*- 2 * BSD LICENSE 3 * 4 * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <arpa/inet.h> 35 #include <getopt.h> 36 #include <linux/if_ether.h> 37 #include <linux/if_vlan.h> 38 #include <linux/virtio_net.h> 39 #include <linux/virtio_ring.h> 40 #include <signal.h> 41 #include <stdint.h> 42 #include <sys/eventfd.h> 43 #include <sys/param.h> 44 #include <unistd.h> 45 46 #include <rte_atomic.h> 47 #include <rte_cycles.h> 48 #include <rte_ethdev.h> 49 #include <rte_log.h> 50 #include <rte_string_fns.h> 51 #include <rte_malloc.h> 52 #include <rte_virtio_net.h> 53 #include <rte_ip.h> 54 #include <rte_tcp.h> 55 56 #include "main.h" 57 58 #ifndef MAX_QUEUES 59 #define MAX_QUEUES 128 60 #endif 61 62 /* the maximum number of external ports supported */ 63 #define MAX_SUP_PORTS 1 64 65 /* 66 * Calculate the number of buffers needed per port 67 */ 68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \ 69 (num_switching_cores*MAX_PKT_BURST) + \ 70 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\ 71 ((num_switching_cores+1)*MBUF_CACHE_SIZE)) 72 73 #define MBUF_CACHE_SIZE 128 74 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 75 76 /* 77 * No frame data buffer allocated from host are required for zero copy 78 * implementation, guest will allocate the frame data buffer, and vhost 79 * directly use it. 80 */ 81 #define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM 82 #define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE 83 #define MBUF_CACHE_SIZE_ZCP 0 84 85 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */ 86 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 87 88 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 89 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 90 91 #define JUMBO_FRAME_MAX_SIZE 0x2600 92 93 /* State of virtio device. 
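 * A device starts in DEVICE_MAC_LEARNING, is promoted to DEVICE_RX by
 * link_vmdq() once its MAC address has been learned from its first TX packet,
 * is dropped back to DEVICE_MAC_LEARNING by unlink_vmdq(), and is marked
 * DEVICE_SAFE_REMOVE by the data core once removal has been requested.
 * A typical data-path check therefore looks like:
 *
 *     if (likely(vdev->ready == DEVICE_RX))
 *             rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
 *                             pkts_burst, MAX_PKT_BURST);
 *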
*/ 94 #define DEVICE_MAC_LEARNING 0 95 #define DEVICE_RX 1 96 #define DEVICE_SAFE_REMOVE 2 97 98 /* Config_core_flag status definitions. */ 99 #define REQUEST_DEV_REMOVAL 1 100 #define ACK_DEV_REMOVAL 0 101 102 /* Configurable number of RX/TX ring descriptors */ 103 #define RTE_TEST_RX_DESC_DEFAULT 1024 104 #define RTE_TEST_TX_DESC_DEFAULT 512 105 106 /* 107 * Need refine these 2 macros for legacy and DPDK based front end: 108 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST 109 * And then adjust power 2. 110 */ 111 /* 112 * For legacy front end, 128 descriptors, 113 * half for virtio header, another half for mbuf. 114 */ 115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */ 116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */ 117 118 /* Get first 4 bytes in mbuf headroom. */ 119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \ 120 + sizeof(struct rte_mbuf))) 121 122 /* true if x is a power of 2 */ 123 #define POWEROF2(x) ((((x)-1) & (x)) == 0) 124 125 #define INVALID_PORT_ID 0xFF 126 127 /* Max number of devices. Limited by vmdq. */ 128 #define MAX_DEVICES 64 129 130 /* Size of buffers used for snprintfs. */ 131 #define MAX_PRINT_BUFF 6072 132 133 /* Maximum character device basename size. */ 134 #define MAX_BASENAME_SZ 10 135 136 /* Maximum long option length for option parsing. */ 137 #define MAX_LONG_OPT_SZ 64 138 139 /* Used to compare MAC addresses. */ 140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL 141 142 /* Number of descriptors per cacheline. */ 143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) 144 145 #define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb)) 146 147 /* mask of enabled ports */ 148 static uint32_t enabled_port_mask = 0; 149 150 /* Promiscuous mode */ 151 static uint32_t promiscuous; 152 153 /*Number of switching cores enabled*/ 154 static uint32_t num_switching_cores = 0; 155 156 /* number of devices/queues to support*/ 157 static uint32_t num_queues = 0; 158 static uint32_t num_devices; 159 160 /* 161 * Enable zero copy, pkts buffer will directly dma to hw descriptor, 162 * disabled on default. 163 */ 164 static uint32_t zero_copy; 165 static int mergeable; 166 167 /* Do vlan strip on host, enabled on default */ 168 static uint32_t vlan_strip = 1; 169 170 /* number of descriptors to apply*/ 171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP; 172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP; 173 174 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */ 175 #define MAX_RING_DESC 4096 176 177 struct vpool { 178 struct rte_mempool *pool; 179 struct rte_ring *ring; 180 uint32_t buf_size; 181 } vpool_array[MAX_QUEUES+MAX_QUEUES]; 182 183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 184 typedef enum { 185 VM2VM_DISABLED = 0, 186 VM2VM_SOFTWARE = 1, 187 VM2VM_HARDWARE = 2, 188 VM2VM_LAST 189 } vm2vm_type; 190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 191 192 /* The type of host physical address translated from guest physical address. */ 193 typedef enum { 194 PHYS_ADDR_CONTINUOUS = 0, 195 PHYS_ADDR_CROSS_SUBREG = 1, 196 PHYS_ADDR_INVALID = 2, 197 PHYS_ADDR_LAST 198 } hpa_type; 199 200 /* Enable stats. */ 201 static uint32_t enable_stats = 0; 202 /* Enable retries on RX. 
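 * When retries are enabled and a received burst does not fit into the guest's
 * RX virtqueue, the data core waits burst_rx_delay_time microseconds (default
 * BURST_RX_WAIT_US) and checks the ring again, up to burst_rx_retry_num times
 * (default BURST_RX_RETRIES), before enqueueing whatever fits:
 *
 *     for (retry = 0; retry < burst_rx_retry_num; retry++) {
 *             rte_delay_us(burst_rx_delay_time);
 *             if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
 *                     break;
 *     }
 *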
*/ 203 static uint32_t enable_retry = 1; 204 205 /* Disable TX checksum offload */ 206 static uint32_t enable_tx_csum; 207 208 /* Disable TSO offload */ 209 static uint32_t enable_tso; 210 211 /* Specify timeout (in useconds) between retries on RX. */ 212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 213 /* Specify the number of retries on RX. */ 214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 215 216 /* Character device basename. Can be set by user. */ 217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net"; 218 219 /* empty vmdq configuration structure. Filled in programatically */ 220 static struct rte_eth_conf vmdq_conf_default = { 221 .rxmode = { 222 .mq_mode = ETH_MQ_RX_VMDQ_ONLY, 223 .split_hdr_size = 0, 224 .header_split = 0, /**< Header Split disabled */ 225 .hw_ip_checksum = 0, /**< IP checksum offload disabled */ 226 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 227 /* 228 * It is necessary for 1G NIC such as I350, 229 * this fixes bug of ipv4 forwarding in guest can't 230 * forward pakets from one virtio dev to another virtio dev. 231 */ 232 .hw_vlan_strip = 1, /**< VLAN strip enabled. */ 233 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 234 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 235 }, 236 237 .txmode = { 238 .mq_mode = ETH_MQ_TX_NONE, 239 }, 240 .rx_adv_conf = { 241 /* 242 * should be overridden separately in code with 243 * appropriate values 244 */ 245 .vmdq_rx_conf = { 246 .nb_queue_pools = ETH_8_POOLS, 247 .enable_default_pool = 0, 248 .default_pool = 0, 249 .nb_pool_maps = 0, 250 .pool_map = {{0, 0},}, 251 }, 252 }, 253 }; 254 255 static unsigned lcore_ids[RTE_MAX_LCORE]; 256 static uint8_t ports[RTE_MAX_ETHPORTS]; 257 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 258 static uint16_t num_pf_queues, num_vmdq_queues; 259 static uint16_t vmdq_pool_base, vmdq_queue_base; 260 static uint16_t queues_per_pool; 261 262 static const uint16_t external_pkt_default_vlan_tag = 2000; 263 const uint16_t vlan_tags[] = { 264 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 265 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 266 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 267 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 268 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 269 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 270 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 271 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 272 }; 273 274 /* ethernet addresses of ports */ 275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 276 277 /* heads for the main used and free linked lists for the data path. */ 278 static struct virtio_net_data_ll *ll_root_used = NULL; 279 static struct virtio_net_data_ll *ll_root_free = NULL; 280 281 /* Array of data core structures containing information on individual core linked lists. */ 282 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 283 284 /* Used for queueing bursts of TX packets. */ 285 struct mbuf_table { 286 unsigned len; 287 unsigned txq_id; 288 struct rte_mbuf *m_table[MAX_PKT_BURST]; 289 }; 290 291 /* TX queue for each data core. */ 292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 293 294 /* TX queue fori each virtio device for zero copy. */ 295 struct mbuf_table tx_queue_zcp[MAX_QUEUES]; 296 297 /* Vlan header struct used to insert vlan tags on TX. 
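 * The layout matches an 802.1Q tagged Ethernet header: 6 bytes destination
 * MAC, 6 bytes source MAC, 2 bytes TPID (h_vlan_proto), 2 bytes tag control
 * information (h_vlan_TCI, which carries the VLAN ID) and 2 bytes inner
 * EtherType (h_vlan_encapsulated_proto). Hence VLAN_ETH_HLEN below is 18
 * bytes, i.e. VLAN_HLEN (4) more than a plain Ethernet header.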
*/ 298 struct vlan_ethhdr { 299 unsigned char h_dest[ETH_ALEN]; 300 unsigned char h_source[ETH_ALEN]; 301 __be16 h_vlan_proto; 302 __be16 h_vlan_TCI; 303 __be16 h_vlan_encapsulated_proto; 304 }; 305 306 /* Header lengths. */ 307 #define VLAN_HLEN 4 308 #define VLAN_ETH_HLEN 18 309 310 /* Per-device statistics struct */ 311 struct device_statistics { 312 uint64_t tx_total; 313 rte_atomic64_t rx_total_atomic; 314 uint64_t rx_total; 315 uint64_t tx; 316 rte_atomic64_t rx_atomic; 317 uint64_t rx; 318 } __rte_cache_aligned; 319 struct device_statistics dev_statistics[MAX_DEVICES]; 320 321 /* 322 * Builds up the correct configuration for VMDQ VLAN pool map 323 * according to the pool & queue limits. 324 */ 325 static inline int 326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 327 { 328 struct rte_eth_vmdq_rx_conf conf; 329 struct rte_eth_vmdq_rx_conf *def_conf = 330 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 331 unsigned i; 332 333 memset(&conf, 0, sizeof(conf)); 334 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 335 conf.nb_pool_maps = num_devices; 336 conf.enable_loop_back = def_conf->enable_loop_back; 337 conf.rx_mode = def_conf->rx_mode; 338 339 for (i = 0; i < conf.nb_pool_maps; i++) { 340 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 341 conf.pool_map[i].pools = (1UL << i); 342 } 343 344 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 345 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, 346 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 347 return 0; 348 } 349 350 /* 351 * Validate the device number according to the max pool number gotten form 352 * dev_info. If the device number is invalid, give the error message and 353 * return -1. Each device must have its own pool. 354 */ 355 static inline int 356 validate_num_devices(uint32_t max_nb_devices) 357 { 358 if (num_devices > max_nb_devices) { 359 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n"); 360 return -1; 361 } 362 return 0; 363 } 364 365 /* 366 * Initialises a given port using global settings and with the rx buffers 367 * coming from the mbuf_pool passed as parameter 368 */ 369 static inline int 370 port_init(uint8_t port) 371 { 372 struct rte_eth_dev_info dev_info; 373 struct rte_eth_conf port_conf; 374 struct rte_eth_rxconf *rxconf; 375 struct rte_eth_txconf *txconf; 376 int16_t rx_rings, tx_rings; 377 uint16_t rx_ring_size, tx_ring_size; 378 int retval; 379 uint16_t q; 380 381 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 382 rte_eth_dev_info_get (port, &dev_info); 383 384 if (dev_info.max_rx_queues > MAX_QUEUES) { 385 rte_exit(EXIT_FAILURE, 386 "please define MAX_QUEUES no less than %u in %s\n", 387 dev_info.max_rx_queues, __FILE__); 388 } 389 390 rxconf = &dev_info.default_rxconf; 391 txconf = &dev_info.default_txconf; 392 rxconf->rx_drop_en = 1; 393 394 /* Enable vlan offload */ 395 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL; 396 397 /* 398 * Zero copy defers queue RX/TX start to the time when guest 399 * finishes its startup and packet buffers from that guest are 400 * available. 
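 * With rx_deferred_start/tx_deferred_start set, rte_eth_dev_start() leaves
 * these queues stopped; they are presumably started per queue later (for
 * instance with rte_eth_dev_rx_queue_start()/rte_eth_dev_tx_queue_start())
 * once the guest memory backing the descriptors has been mapped.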
401 */ 402 if (zero_copy) { 403 rxconf->rx_deferred_start = 1; 404 rxconf->rx_drop_en = 0; 405 txconf->tx_deferred_start = 1; 406 } 407 408 /*configure the number of supported virtio devices based on VMDQ limits */ 409 num_devices = dev_info.max_vmdq_pools; 410 411 if (zero_copy) { 412 rx_ring_size = num_rx_descriptor; 413 tx_ring_size = num_tx_descriptor; 414 tx_rings = dev_info.max_tx_queues; 415 } else { 416 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT; 417 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT; 418 tx_rings = (uint16_t)rte_lcore_count(); 419 } 420 421 retval = validate_num_devices(MAX_DEVICES); 422 if (retval < 0) 423 return retval; 424 425 /* Get port configuration. */ 426 retval = get_eth_conf(&port_conf, num_devices); 427 if (retval < 0) 428 return retval; 429 /* NIC queues are divided into pf queues and vmdq queues. */ 430 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 431 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 432 num_vmdq_queues = num_devices * queues_per_pool; 433 num_queues = num_pf_queues + num_vmdq_queues; 434 vmdq_queue_base = dev_info.vmdq_queue_base; 435 vmdq_pool_base = dev_info.vmdq_pool_base; 436 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 437 num_pf_queues, num_devices, queues_per_pool); 438 439 if (port >= rte_eth_dev_count()) return -1; 440 441 if (enable_tx_csum == 0) 442 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM); 443 444 if (enable_tso == 0) { 445 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4); 446 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6); 447 } 448 449 rx_rings = (uint16_t)dev_info.max_rx_queues; 450 /* Configure ethernet device. */ 451 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 452 if (retval != 0) 453 return retval; 454 455 /* Setup the queues. */ 456 for (q = 0; q < rx_rings; q ++) { 457 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 458 rte_eth_dev_socket_id(port), 459 rxconf, 460 vpool_array[q].pool); 461 if (retval < 0) 462 return retval; 463 } 464 for (q = 0; q < tx_rings; q ++) { 465 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 466 rte_eth_dev_socket_id(port), 467 txconf); 468 if (retval < 0) 469 return retval; 470 } 471 472 /* Start the device. */ 473 retval = rte_eth_dev_start(port); 474 if (retval < 0) { 475 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n"); 476 return retval; 477 } 478 479 if (promiscuous) 480 rte_eth_promiscuous_enable(port); 481 482 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 483 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 484 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 485 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 486 (unsigned)port, 487 vmdq_ports_eth_addr[port].addr_bytes[0], 488 vmdq_ports_eth_addr[port].addr_bytes[1], 489 vmdq_ports_eth_addr[port].addr_bytes[2], 490 vmdq_ports_eth_addr[port].addr_bytes[3], 491 vmdq_ports_eth_addr[port].addr_bytes[4], 492 vmdq_ports_eth_addr[port].addr_bytes[5]); 493 494 return 0; 495 } 496 497 /* 498 * Set character device basename. 499 */ 500 static int 501 us_vhost_parse_basename(const char *q_arg) 502 { 503 /* parse number string */ 504 505 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) 506 return -1; 507 else 508 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); 509 510 return 0; 511 } 512 513 /* 514 * Parse the portmask provided at run time. 
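 * The mask is hexadecimal and bit i selects physical port i, so for example
 * "-p 0x1" enables only port 0. Note that this application accepts at most
 * MAX_SUP_PORTS (currently 1) enabled port.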
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
			"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
			"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
			"used only when zero copy is enabled.\n"
	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"		--tso [0|1]: disable/enable TCP segmentation offload.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
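 * For example (EAL options elided), an invocation could look like:
 *
 *     <prgname> [EAL options] -- -p 0x1 --dev-basename vhost-net \
 *             --vm2vm 1 --mergeable 0 --stats 2
 *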
592 */ 593 static int 594 us_vhost_parse_args(int argc, char **argv) 595 { 596 int opt, ret; 597 int option_index; 598 unsigned i; 599 const char *prgname = argv[0]; 600 static struct option long_option[] = { 601 {"vm2vm", required_argument, NULL, 0}, 602 {"rx-retry", required_argument, NULL, 0}, 603 {"rx-retry-delay", required_argument, NULL, 0}, 604 {"rx-retry-num", required_argument, NULL, 0}, 605 {"mergeable", required_argument, NULL, 0}, 606 {"vlan-strip", required_argument, NULL, 0}, 607 {"stats", required_argument, NULL, 0}, 608 {"dev-basename", required_argument, NULL, 0}, 609 {"zero-copy", required_argument, NULL, 0}, 610 {"rx-desc-num", required_argument, NULL, 0}, 611 {"tx-desc-num", required_argument, NULL, 0}, 612 {"tx-csum", required_argument, NULL, 0}, 613 {"tso", required_argument, NULL, 0}, 614 {NULL, 0, 0, 0}, 615 }; 616 617 /* Parse command line */ 618 while ((opt = getopt_long(argc, argv, "p:P", 619 long_option, &option_index)) != EOF) { 620 switch (opt) { 621 /* Portmask */ 622 case 'p': 623 enabled_port_mask = parse_portmask(optarg); 624 if (enabled_port_mask == 0) { 625 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 626 us_vhost_usage(prgname); 627 return -1; 628 } 629 break; 630 631 case 'P': 632 promiscuous = 1; 633 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 634 ETH_VMDQ_ACCEPT_BROADCAST | 635 ETH_VMDQ_ACCEPT_MULTICAST; 636 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX); 637 638 break; 639 640 case 0: 641 /* Enable/disable vm2vm comms. */ 642 if (!strncmp(long_option[option_index].name, "vm2vm", 643 MAX_LONG_OPT_SZ)) { 644 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 645 if (ret == -1) { 646 RTE_LOG(INFO, VHOST_CONFIG, 647 "Invalid argument for " 648 "vm2vm [0|1|2]\n"); 649 us_vhost_usage(prgname); 650 return -1; 651 } else { 652 vm2vm_mode = (vm2vm_type)ret; 653 } 654 } 655 656 /* Enable/disable retries on RX. */ 657 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) { 658 ret = parse_num_opt(optarg, 1); 659 if (ret == -1) { 660 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 661 us_vhost_usage(prgname); 662 return -1; 663 } else { 664 enable_retry = ret; 665 } 666 } 667 668 /* Enable/disable TX checksum offload. */ 669 if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) { 670 ret = parse_num_opt(optarg, 1); 671 if (ret == -1) { 672 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 673 us_vhost_usage(prgname); 674 return -1; 675 } else 676 enable_tx_csum = ret; 677 } 678 679 /* Enable/disable TSO offload. */ 680 if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) { 681 ret = parse_num_opt(optarg, 1); 682 if (ret == -1) { 683 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 684 us_vhost_usage(prgname); 685 return -1; 686 } else 687 enable_tso = ret; 688 } 689 690 /* Specify the retries delay time (in useconds) on RX. */ 691 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) { 692 ret = parse_num_opt(optarg, INT32_MAX); 693 if (ret == -1) { 694 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 695 us_vhost_usage(prgname); 696 return -1; 697 } else { 698 burst_rx_delay_time = ret; 699 } 700 } 701 702 /* Specify the retries number on RX. 
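 * For example, "--rx-retry 1 --rx-retry-num 4 --rx-retry-delay 15" reproduces
 * the built-in defaults (BURST_RX_RETRIES retries, BURST_RX_WAIT_US
 * microseconds between retries).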
*/ 703 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) { 704 ret = parse_num_opt(optarg, INT32_MAX); 705 if (ret == -1) { 706 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 707 us_vhost_usage(prgname); 708 return -1; 709 } else { 710 burst_rx_retry_num = ret; 711 } 712 } 713 714 /* Enable/disable RX mergeable buffers. */ 715 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) { 716 ret = parse_num_opt(optarg, 1); 717 if (ret == -1) { 718 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 719 us_vhost_usage(prgname); 720 return -1; 721 } else { 722 mergeable = !!ret; 723 if (ret) { 724 vmdq_conf_default.rxmode.jumbo_frame = 1; 725 vmdq_conf_default.rxmode.max_rx_pkt_len 726 = JUMBO_FRAME_MAX_SIZE; 727 } 728 } 729 } 730 731 /* Enable/disable RX VLAN strip on host. */ 732 if (!strncmp(long_option[option_index].name, 733 "vlan-strip", MAX_LONG_OPT_SZ)) { 734 ret = parse_num_opt(optarg, 1); 735 if (ret == -1) { 736 RTE_LOG(INFO, VHOST_CONFIG, 737 "Invalid argument for VLAN strip [0|1]\n"); 738 us_vhost_usage(prgname); 739 return -1; 740 } else { 741 vlan_strip = !!ret; 742 vmdq_conf_default.rxmode.hw_vlan_strip = 743 vlan_strip; 744 } 745 } 746 747 /* Enable/disable stats. */ 748 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) { 749 ret = parse_num_opt(optarg, INT32_MAX); 750 if (ret == -1) { 751 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n"); 752 us_vhost_usage(prgname); 753 return -1; 754 } else { 755 enable_stats = ret; 756 } 757 } 758 759 /* Set character device basename. */ 760 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) { 761 if (us_vhost_parse_basename(optarg) == -1) { 762 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ); 763 us_vhost_usage(prgname); 764 return -1; 765 } 766 } 767 768 /* Enable/disable rx/tx zero copy. */ 769 if (!strncmp(long_option[option_index].name, 770 "zero-copy", MAX_LONG_OPT_SZ)) { 771 ret = parse_num_opt(optarg, 1); 772 if (ret == -1) { 773 RTE_LOG(INFO, VHOST_CONFIG, 774 "Invalid argument" 775 " for zero-copy [0|1]\n"); 776 us_vhost_usage(prgname); 777 return -1; 778 } else 779 zero_copy = ret; 780 } 781 782 /* Specify the descriptor number on RX. */ 783 if (!strncmp(long_option[option_index].name, 784 "rx-desc-num", MAX_LONG_OPT_SZ)) { 785 ret = parse_num_opt(optarg, MAX_RING_DESC); 786 if ((ret == -1) || (!POWEROF2(ret))) { 787 RTE_LOG(INFO, VHOST_CONFIG, 788 "Invalid argument for rx-desc-num[0-N]," 789 "power of 2 required.\n"); 790 us_vhost_usage(prgname); 791 return -1; 792 } else { 793 num_rx_descriptor = ret; 794 } 795 } 796 797 /* Specify the descriptor number on TX. */ 798 if (!strncmp(long_option[option_index].name, 799 "tx-desc-num", MAX_LONG_OPT_SZ)) { 800 ret = parse_num_opt(optarg, MAX_RING_DESC); 801 if ((ret == -1) || (!POWEROF2(ret))) { 802 RTE_LOG(INFO, VHOST_CONFIG, 803 "Invalid argument for tx-desc-num [0-N]," 804 "power of 2 required.\n"); 805 us_vhost_usage(prgname); 806 return -1; 807 } else { 808 num_tx_descriptor = ret; 809 } 810 } 811 812 break; 813 814 /* Invalid option - print options. 
*/ 815 default: 816 us_vhost_usage(prgname); 817 return -1; 818 } 819 } 820 821 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 822 if (enabled_port_mask & (1 << i)) 823 ports[num_ports++] = (uint8_t)i; 824 } 825 826 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 827 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 828 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 829 return -1; 830 } 831 832 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) { 833 RTE_LOG(INFO, VHOST_PORT, 834 "Vhost zero copy doesn't support software vm2vm," 835 "please specify 'vm2vm 2' to use hardware vm2vm.\n"); 836 return -1; 837 } 838 839 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { 840 RTE_LOG(INFO, VHOST_PORT, 841 "Vhost zero copy doesn't support jumbo frame," 842 "please specify '--mergeable 0' to disable the " 843 "mergeable feature.\n"); 844 return -1; 845 } 846 847 return 0; 848 } 849 850 /* 851 * Update the global var NUM_PORTS and array PORTS according to system ports number 852 * and return valid ports number 853 */ 854 static unsigned check_ports_num(unsigned nb_ports) 855 { 856 unsigned valid_num_ports = num_ports; 857 unsigned portid; 858 859 if (num_ports > nb_ports) { 860 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 861 num_ports, nb_ports); 862 num_ports = nb_ports; 863 } 864 865 for (portid = 0; portid < num_ports; portid ++) { 866 if (ports[portid] >= nb_ports) { 867 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n", 868 ports[portid], (nb_ports - 1)); 869 ports[portid] = INVALID_PORT_ID; 870 valid_num_ports--; 871 } 872 } 873 return valid_num_ports; 874 } 875 876 /* 877 * Macro to print out packet contents. Wrapped in debug define so that the 878 * data path is not effected when debug is disabled. 879 */ 880 #ifdef DEBUG 881 #define PRINT_PACKET(device, addr, size, header) do { \ 882 char *pkt_addr = (char*)(addr); \ 883 unsigned int index; \ 884 char packet[MAX_PRINT_BUFF]; \ 885 \ 886 if ((header)) \ 887 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ 888 else \ 889 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ 890 for (index = 0; index < (size); index++) { \ 891 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ 892 "%02hhx ", pkt_addr[index]); \ 893 } \ 894 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ 895 \ 896 LOG_DEBUG(VHOST_DATA, "%s", packet); \ 897 } while(0) 898 #else 899 #define PRINT_PACKET(device, addr, size, header) do{} while(0) 900 #endif 901 902 /* 903 * Function to convert guest physical addresses to vhost physical addresses. 904 * This is used to convert virtio buffer addresses. 
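 * Besides the translated address, the caller learns via *addr_type whether
 * the buffer is physically contiguous (PHYS_ADDR_CONTINUOUS), crosses a
 * sub-region boundary (PHYS_ADDR_CROSS_SUBREG) or could not be translated
 * (PHYS_ADDR_INVALID). Typical zero-copy usage:
 *
 *     hpa_type addr_type;
 *     uint64_t phys_addr;
 *
 *     phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
 *     if (unlikely(addr_type != PHYS_ADDR_CONTINUOUS))
 *             put_desc_to_used_list_zcp(vq, desc_idx);
 *
 * which is essentially what attach_rxmbuf_zcp() does with descriptors it
 * cannot use.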
905 */ 906 static inline uint64_t __attribute__((always_inline)) 907 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa, 908 uint32_t buf_len, hpa_type *addr_type) 909 { 910 struct virtio_memory_regions_hpa *region; 911 uint32_t regionidx; 912 uint64_t vhost_pa = 0; 913 914 *addr_type = PHYS_ADDR_INVALID; 915 916 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) { 917 region = &vdev->regions_hpa[regionidx]; 918 if ((guest_pa >= region->guest_phys_address) && 919 (guest_pa <= region->guest_phys_address_end)) { 920 vhost_pa = region->host_phys_addr_offset + guest_pa; 921 if (likely((guest_pa + buf_len - 1) 922 <= region->guest_phys_address_end)) 923 *addr_type = PHYS_ADDR_CONTINUOUS; 924 else 925 *addr_type = PHYS_ADDR_CROSS_SUBREG; 926 break; 927 } 928 } 929 930 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n", 931 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa, 932 (void *)(uintptr_t)vhost_pa); 933 934 return vhost_pa; 935 } 936 937 /* 938 * Compares a packet destination MAC address to a device MAC address. 939 */ 940 static inline int __attribute__((always_inline)) 941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) 942 { 943 return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0; 944 } 945 946 /* 947 * This function learns the MAC address of the device and registers this along with a 948 * vlan tag to a VMDQ. 949 */ 950 static int 951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 952 { 953 struct ether_hdr *pkt_hdr; 954 struct virtio_net_data_ll *dev_ll; 955 struct virtio_net *dev = vdev->dev; 956 int i, ret; 957 958 /* Learn MAC address of guest device from packet */ 959 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 960 961 dev_ll = ll_root_used; 962 963 while (dev_ll != NULL) { 964 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) { 965 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh); 966 return -1; 967 } 968 dev_ll = dev_ll->next; 969 } 970 971 for (i = 0; i < ETHER_ADDR_LEN; i++) 972 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i]; 973 974 /* vlan_tag currently uses the device_id. */ 975 vdev->vlan_tag = vlan_tags[dev->device_fh]; 976 977 /* Print out VMDQ registration info. */ 978 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n", 979 dev->device_fh, 980 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1], 981 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3], 982 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5], 983 vdev->vlan_tag); 984 985 /* Register the MAC address. */ 986 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 987 (uint32_t)dev->device_fh + vmdq_pool_base); 988 if (ret) 989 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n", 990 dev->device_fh); 991 992 /* Enable stripping of the vlan tag as we handle routing. */ 993 if (vlan_strip) 994 rte_eth_dev_set_vlan_strip_on_queue(ports[0], 995 (uint16_t)vdev->vmdq_rx_q, 1); 996 997 /* Set device as ready for RX. */ 998 vdev->ready = DEVICE_RX; 999 1000 return 0; 1001 } 1002 1003 /* 1004 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 1005 * queue before disabling RX on the device. 
1006 */ 1007 static inline void 1008 unlink_vmdq(struct vhost_dev *vdev) 1009 { 1010 unsigned i = 0; 1011 unsigned rx_count; 1012 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1013 1014 if (vdev->ready == DEVICE_RX) { 1015 /*clear MAC and VLAN settings*/ 1016 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 1017 for (i = 0; i < 6; i++) 1018 vdev->mac_address.addr_bytes[i] = 0; 1019 1020 vdev->vlan_tag = 0; 1021 1022 /*Clear out the receive buffers*/ 1023 rx_count = rte_eth_rx_burst(ports[0], 1024 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1025 1026 while (rx_count) { 1027 for (i = 0; i < rx_count; i++) 1028 rte_pktmbuf_free(pkts_burst[i]); 1029 1030 rx_count = rte_eth_rx_burst(ports[0], 1031 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1032 } 1033 1034 vdev->ready = DEVICE_MAC_LEARNING; 1035 } 1036 } 1037 1038 /* 1039 * Check if the packet destination MAC address is for a local device. If so then put 1040 * the packet on that devices RX queue. If not then return. 1041 */ 1042 static inline int __attribute__((always_inline)) 1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1044 { 1045 struct virtio_net_data_ll *dev_ll; 1046 struct ether_hdr *pkt_hdr; 1047 uint64_t ret = 0; 1048 struct virtio_net *dev = vdev->dev; 1049 struct virtio_net *tdev; /* destination virito device */ 1050 1051 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1052 1053 /*get the used devices list*/ 1054 dev_ll = ll_root_used; 1055 1056 while (dev_ll != NULL) { 1057 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr), 1058 &dev_ll->vdev->mac_address)) { 1059 1060 /* Drop the packet if the TX packet is destined for the TX device. */ 1061 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1062 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n", 1063 dev->device_fh); 1064 return 0; 1065 } 1066 tdev = dev_ll->vdev->dev; 1067 1068 1069 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh); 1070 1071 if (unlikely(dev_ll->vdev->remove)) { 1072 /*drop the packet if the device is marked for removal*/ 1073 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh); 1074 } else { 1075 /*send the packet to the local virtio device*/ 1076 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1); 1077 if (enable_stats) { 1078 rte_atomic64_add( 1079 &dev_statistics[tdev->device_fh].rx_total_atomic, 1080 1); 1081 rte_atomic64_add( 1082 &dev_statistics[tdev->device_fh].rx_atomic, 1083 ret); 1084 dev_statistics[dev->device_fh].tx_total++; 1085 dev_statistics[dev->device_fh].tx += ret; 1086 } 1087 } 1088 1089 return 0; 1090 } 1091 dev_ll = dev_ll->next; 1092 } 1093 1094 return -1; 1095 } 1096 1097 /* 1098 * Check if the destination MAC of a packet is one local VM, 1099 * and get its vlan tag, and offset if it is. 1100 */ 1101 static inline int __attribute__((always_inline)) 1102 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m, 1103 uint32_t *offset, uint16_t *vlan_tag) 1104 { 1105 struct virtio_net_data_ll *dev_ll = ll_root_used; 1106 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1107 1108 while (dev_ll != NULL) { 1109 if ((dev_ll->vdev->ready == DEVICE_RX) 1110 && ether_addr_cmp(&(pkt_hdr->d_addr), 1111 &dev_ll->vdev->mac_address)) { 1112 /* 1113 * Drop the packet if the TX packet is 1114 * destined for the TX device. 
1115 */ 1116 if (dev_ll->vdev->dev->device_fh == dev->device_fh) { 1117 LOG_DEBUG(VHOST_DATA, 1118 "(%"PRIu64") TX: Source and destination" 1119 " MAC addresses are the same. Dropping " 1120 "packet.\n", 1121 dev_ll->vdev->dev->device_fh); 1122 return -1; 1123 } 1124 1125 /* 1126 * HW vlan strip will reduce the packet length 1127 * by minus length of vlan tag, so need restore 1128 * the packet length by plus it. 1129 */ 1130 *offset = VLAN_HLEN; 1131 *vlan_tag = 1132 (uint16_t) 1133 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh]; 1134 1135 LOG_DEBUG(VHOST_DATA, 1136 "(%"PRIu64") TX: pkt to local VM device id:" 1137 "(%"PRIu64") vlan tag: %d.\n", 1138 dev->device_fh, dev_ll->vdev->dev->device_fh, 1139 (int)*vlan_tag); 1140 1141 break; 1142 } 1143 dev_ll = dev_ll->next; 1144 } 1145 return 0; 1146 } 1147 1148 static uint16_t 1149 get_psd_sum(void *l3_hdr, uint64_t ol_flags) 1150 { 1151 if (ol_flags & PKT_TX_IPV4) 1152 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags); 1153 else /* assume ethertype == ETHER_TYPE_IPv6 */ 1154 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags); 1155 } 1156 1157 static void virtio_tx_offload(struct rte_mbuf *m) 1158 { 1159 void *l3_hdr; 1160 struct ipv4_hdr *ipv4_hdr = NULL; 1161 struct tcp_hdr *tcp_hdr = NULL; 1162 struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); 1163 1164 l3_hdr = (char *)eth_hdr + m->l2_len; 1165 1166 if (m->ol_flags & PKT_TX_IPV4) { 1167 ipv4_hdr = l3_hdr; 1168 ipv4_hdr->hdr_checksum = 0; 1169 m->ol_flags |= PKT_TX_IP_CKSUM; 1170 } 1171 1172 tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len); 1173 tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags); 1174 } 1175 1176 /* 1177 * This function routes the TX packet to the correct interface. This may be a local device 1178 * or the physical port. 1179 */ 1180 static inline void __attribute__((always_inline)) 1181 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1182 { 1183 struct mbuf_table *tx_q; 1184 struct rte_mbuf **m_table; 1185 unsigned len, ret, offset = 0; 1186 const uint16_t lcore_id = rte_lcore_id(); 1187 struct virtio_net *dev = vdev->dev; 1188 struct ether_hdr *nh; 1189 1190 /*check if destination is local VM*/ 1191 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) { 1192 rte_pktmbuf_free(m); 1193 return; 1194 } 1195 1196 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1197 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) { 1198 rte_pktmbuf_free(m); 1199 return; 1200 } 1201 } 1202 1203 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh); 1204 1205 /*Add packet to the port tx queue*/ 1206 tx_q = &lcore_tx_queue[lcore_id]; 1207 len = tx_q->len; 1208 1209 nh = rte_pktmbuf_mtod(m, struct ether_hdr *); 1210 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) { 1211 /* Guest has inserted the vlan tag. */ 1212 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1); 1213 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1214 if ((vm2vm_mode == VM2VM_HARDWARE) && 1215 (vh->vlan_tci != vlan_tag_be)) 1216 vh->vlan_tci = vlan_tag_be; 1217 } else { 1218 m->ol_flags |= PKT_TX_VLAN_PKT; 1219 1220 /* 1221 * Find the right seg to adjust the data len when offset is 1222 * bigger than tail room size. 
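 * In practice offset is either 0 or VLAN_HLEN (4 bytes): find_local_dest()
 * sets it so that the bytes removed by hardware VLAN stripping are added back
 * before the frame is handed to the destination VM. The loop below adds the
 * extra length to the first segment in the chain that still has enough tail
 * room (or to the last segment), and pkt_len is grown by the same amount.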
1223 */ 1224 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1225 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1226 m->data_len += offset; 1227 else { 1228 struct rte_mbuf *seg = m; 1229 1230 while ((seg->next != NULL) && 1231 (offset > rte_pktmbuf_tailroom(seg))) 1232 seg = seg->next; 1233 1234 seg->data_len += offset; 1235 } 1236 m->pkt_len += offset; 1237 } 1238 1239 m->vlan_tci = vlan_tag; 1240 } 1241 1242 if (m->ol_flags & PKT_TX_TCP_SEG) 1243 virtio_tx_offload(m); 1244 1245 tx_q->m_table[len] = m; 1246 len++; 1247 if (enable_stats) { 1248 dev_statistics[dev->device_fh].tx_total++; 1249 dev_statistics[dev->device_fh].tx++; 1250 } 1251 1252 if (unlikely(len == MAX_PKT_BURST)) { 1253 m_table = (struct rte_mbuf **)tx_q->m_table; 1254 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1255 /* Free any buffers not handled by TX and update the port stats. */ 1256 if (unlikely(ret < len)) { 1257 do { 1258 rte_pktmbuf_free(m_table[ret]); 1259 } while (++ret < len); 1260 } 1261 1262 len = 0; 1263 } 1264 1265 tx_q->len = len; 1266 return; 1267 } 1268 /* 1269 * This function is called by each data core. It handles all RX/TX registered with the 1270 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared 1271 * with all devices in the main linked list. 1272 */ 1273 static int 1274 switch_worker(__attribute__((unused)) void *arg) 1275 { 1276 struct rte_mempool *mbuf_pool = arg; 1277 struct virtio_net *dev = NULL; 1278 struct vhost_dev *vdev = NULL; 1279 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1280 struct virtio_net_data_ll *dev_ll; 1281 struct mbuf_table *tx_q; 1282 volatile struct lcore_ll_info *lcore_ll; 1283 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1284 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 1285 unsigned ret, i; 1286 const uint16_t lcore_id = rte_lcore_id(); 1287 const uint16_t num_cores = (uint16_t)rte_lcore_count(); 1288 uint16_t rx_count = 0; 1289 uint16_t tx_count; 1290 uint32_t retry = 0; 1291 1292 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 1293 lcore_ll = lcore_info[lcore_id].lcore_ll; 1294 prev_tsc = 0; 1295 1296 tx_q = &lcore_tx_queue[lcore_id]; 1297 for (i = 0; i < num_cores; i ++) { 1298 if (lcore_ids[i] == lcore_id) { 1299 tx_q->txq_id = i; 1300 break; 1301 } 1302 } 1303 1304 while(1) { 1305 cur_tsc = rte_rdtsc(); 1306 /* 1307 * TX burst queue drain 1308 */ 1309 diff_tsc = cur_tsc - prev_tsc; 1310 if (unlikely(diff_tsc > drain_tsc)) { 1311 1312 if (tx_q->len) { 1313 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len); 1314 1315 /*Tx any packets in the queue*/ 1316 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, 1317 (struct rte_mbuf **)tx_q->m_table, 1318 (uint16_t)tx_q->len); 1319 if (unlikely(ret < tx_q->len)) { 1320 do { 1321 rte_pktmbuf_free(tx_q->m_table[ret]); 1322 } while (++ret < tx_q->len); 1323 } 1324 1325 tx_q->len = 0; 1326 } 1327 1328 prev_tsc = cur_tsc; 1329 1330 } 1331 1332 rte_prefetch0(lcore_ll->ll_root_used); 1333 /* 1334 * Inform the configuration core that we have exited the linked list and that no devices are 1335 * in use if requested. 
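 * The configuration core sets dev_removal_flag to REQUEST_DEV_REMOVAL when it
 * wants to tear a device down; writing ACK_DEV_REMOVAL here signals that this
 * data core is no longer traversing its linked list, which the removal path
 * can wait for before the list entry is unlinked and freed.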
1336 */ 1337 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 1338 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 1339 1340 /* 1341 * Process devices 1342 */ 1343 dev_ll = lcore_ll->ll_root_used; 1344 1345 while (dev_ll != NULL) { 1346 /*get virtio device ID*/ 1347 vdev = dev_ll->vdev; 1348 dev = vdev->dev; 1349 1350 if (unlikely(vdev->remove)) { 1351 dev_ll = dev_ll->next; 1352 unlink_vmdq(vdev); 1353 vdev->ready = DEVICE_SAFE_REMOVE; 1354 continue; 1355 } 1356 if (likely(vdev->ready == DEVICE_RX)) { 1357 /*Handle guest RX*/ 1358 rx_count = rte_eth_rx_burst(ports[0], 1359 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1360 1361 if (rx_count) { 1362 /* 1363 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss 1364 * Here MAX_PKT_BURST must be less than virtio queue size 1365 */ 1366 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) { 1367 for (retry = 0; retry < burst_rx_retry_num; retry++) { 1368 rte_delay_us(burst_rx_delay_time); 1369 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ)) 1370 break; 1371 } 1372 } 1373 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count); 1374 if (enable_stats) { 1375 rte_atomic64_add( 1376 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic, 1377 rx_count); 1378 rte_atomic64_add( 1379 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count); 1380 } 1381 while (likely(rx_count)) { 1382 rx_count--; 1383 rte_pktmbuf_free(pkts_burst[rx_count]); 1384 } 1385 1386 } 1387 } 1388 1389 if (likely(!vdev->remove)) { 1390 /* Handle guest TX*/ 1391 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST); 1392 /* If this is the first received packet we need to learn the MAC and setup VMDQ */ 1393 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) { 1394 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) { 1395 while (tx_count) 1396 rte_pktmbuf_free(pkts_burst[--tx_count]); 1397 } 1398 } 1399 for (i = 0; i < tx_count; ++i) { 1400 virtio_tx_route(vdev, pkts_burst[i], 1401 vlan_tags[(uint16_t)dev->device_fh]); 1402 } 1403 } 1404 1405 /*move to the next device in the list*/ 1406 dev_ll = dev_ll->next; 1407 } 1408 } 1409 1410 return 0; 1411 } 1412 1413 /* 1414 * This function gets available ring number for zero copy rx. 1415 * Only one thread will call this funciton for a paticular virtio device, 1416 * so, it is designed as non-thread-safe function. 1417 */ 1418 static inline uint32_t __attribute__((always_inline)) 1419 get_available_ring_num_zcp(struct virtio_net *dev) 1420 { 1421 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ]; 1422 uint16_t avail_idx; 1423 1424 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1425 return (uint32_t)(avail_idx - vq->last_used_idx_res); 1426 } 1427 1428 /* 1429 * This function gets available ring index for zero copy rx, 1430 * it will retry 'burst_rx_retry_num' times till it get enough ring index. 1431 * Only one thread will call this funciton for a paticular virtio device, 1432 * so, it is designed as non-thread-safe function. 
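 * Note that avail_idx and last_used_idx_res are free-running uint16_t
 * indexes, so the difference (avail_idx - *res_base_idx) still gives the
 * number of available entries after the indexes wrap; the actual ring slot is
 * always taken modulo the ring size, e.g.
 *
 *     desc_idx = vq->avail->ring[res_base_idx & (vq->size - 1)];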
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/*check that we have enough buffers*/
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}

/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
}

/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and attaches them together. The offsets
 * for buff_addr and phys_addr need to be adjusted according to the PMD
 * implementation, otherwise the frame data may be placed at the wrong
 * location in the mbuf.
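 * The chosen descriptor index is stashed in the first four bytes of the mbuf
 * headroom via MBUF_HEADROOM_UINT32(), so that virtio_dev_rx_zcp() can later
 * recover which vring descriptor each received mbuf belongs to:
 *
 *     MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
 *     ...
 *     head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);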
1508 */ 1509 static inline void __attribute__((always_inline)) 1510 attach_rxmbuf_zcp(struct virtio_net *dev) 1511 { 1512 uint16_t res_base_idx, desc_idx; 1513 uint64_t buff_addr, phys_addr; 1514 struct vhost_virtqueue *vq; 1515 struct vring_desc *desc; 1516 void *obj = NULL; 1517 struct rte_mbuf *mbuf; 1518 struct vpool *vpool; 1519 hpa_type addr_type; 1520 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1521 1522 vpool = &vpool_array[vdev->vmdq_rx_q]; 1523 vq = dev->virtqueue[VIRTIO_RXQ]; 1524 1525 do { 1526 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx, 1527 1) != 1)) 1528 return; 1529 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)]; 1530 1531 desc = &vq->desc[desc_idx]; 1532 if (desc->flags & VRING_DESC_F_NEXT) { 1533 desc = &vq->desc[desc->next]; 1534 buff_addr = gpa_to_vva(dev, desc->addr); 1535 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, 1536 &addr_type); 1537 } else { 1538 buff_addr = gpa_to_vva(dev, 1539 desc->addr + vq->vhost_hlen); 1540 phys_addr = gpa_to_hpa(vdev, 1541 desc->addr + vq->vhost_hlen, 1542 desc->len, &addr_type); 1543 } 1544 1545 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 1546 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer" 1547 " address found when attaching RX frame buffer" 1548 " address!\n", dev->device_fh); 1549 put_desc_to_used_list_zcp(vq, desc_idx); 1550 continue; 1551 } 1552 1553 /* 1554 * Check if the frame buffer address from guest crosses 1555 * sub-region or not. 1556 */ 1557 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 1558 RTE_LOG(ERR, VHOST_DATA, 1559 "(%"PRIu64") Frame buffer address cross " 1560 "sub-regioin found when attaching RX frame " 1561 "buffer address!\n", 1562 dev->device_fh); 1563 put_desc_to_used_list_zcp(vq, desc_idx); 1564 continue; 1565 } 1566 } while (unlikely(phys_addr == 0)); 1567 1568 rte_ring_sc_dequeue(vpool->ring, &obj); 1569 mbuf = obj; 1570 if (unlikely(mbuf == NULL)) { 1571 LOG_DEBUG(VHOST_DATA, 1572 "(%"PRIu64") in attach_rxmbuf_zcp: " 1573 "ring_sc_dequeue fail.\n", 1574 dev->device_fh); 1575 put_desc_to_used_list_zcp(vq, desc_idx); 1576 return; 1577 } 1578 1579 if (unlikely(vpool->buf_size > desc->len)) { 1580 LOG_DEBUG(VHOST_DATA, 1581 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer " 1582 "length(%d) of descriptor idx: %d less than room " 1583 "size required: %d\n", 1584 dev->device_fh, desc->len, desc_idx, vpool->buf_size); 1585 put_desc_to_used_list_zcp(vq, desc_idx); 1586 rte_ring_sp_enqueue(vpool->ring, obj); 1587 return; 1588 } 1589 1590 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); 1591 mbuf->data_off = RTE_PKTMBUF_HEADROOM; 1592 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; 1593 mbuf->data_len = desc->len; 1594 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1595 1596 LOG_DEBUG(VHOST_DATA, 1597 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, " 1598 "descriptor idx:%d\n", 1599 dev->device_fh, res_base_idx, desc_idx); 1600 1601 __rte_mbuf_raw_free(mbuf); 1602 1603 return; 1604 } 1605 1606 /* 1607 * Detach an attched packet mbuf - 1608 * - restore original mbuf address and length values. 1609 * - reset pktmbuf data and data_len to their default values. 1610 * All other fields of the given packet mbuf will be left intact. 1611 * 1612 * @param m 1613 * The attached packet mbuf. 
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = rte_mbuf_to_baddr(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}

/*
 * This function is called after packets have been transmitted. It fetches
 * each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(MBUF_EXT_MEM(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is : %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return 0;
}

/*
 * This function is called when a virtio device is destroyed. It fetches each
 * mbuf from vpool->pool, detaches it, and puts it back into vpool->ring.
1703 */ 1704 static void mbuf_destroy_zcp(struct vpool *vpool) 1705 { 1706 struct rte_mbuf *mbuf = NULL; 1707 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool); 1708 1709 LOG_DEBUG(VHOST_CONFIG, 1710 "in mbuf_destroy_zcp: mbuf count in mempool before " 1711 "mbuf_destroy_zcp is: %d\n", 1712 mbuf_count); 1713 LOG_DEBUG(VHOST_CONFIG, 1714 "in mbuf_destroy_zcp: mbuf count in ring before " 1715 "mbuf_destroy_zcp is : %d\n", 1716 rte_ring_count(vpool->ring)); 1717 1718 for (index = 0; index < mbuf_count; index++) { 1719 mbuf = __rte_mbuf_raw_alloc(vpool->pool); 1720 if (likely(mbuf != NULL)) { 1721 if (likely(MBUF_EXT_MEM(mbuf))) 1722 pktmbuf_detach_zcp(mbuf); 1723 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf); 1724 } 1725 } 1726 1727 LOG_DEBUG(VHOST_CONFIG, 1728 "in mbuf_destroy_zcp: mbuf count in mempool after " 1729 "mbuf_destroy_zcp is: %d\n", 1730 rte_mempool_count(vpool->pool)); 1731 LOG_DEBUG(VHOST_CONFIG, 1732 "in mbuf_destroy_zcp: mbuf count in ring after " 1733 "mbuf_destroy_zcp is : %d\n", 1734 rte_ring_count(vpool->ring)); 1735 } 1736 1737 /* 1738 * This function update the use flag and counter. 1739 */ 1740 static inline uint32_t __attribute__((always_inline)) 1741 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts, 1742 uint32_t count) 1743 { 1744 struct vhost_virtqueue *vq; 1745 struct vring_desc *desc; 1746 struct rte_mbuf *buff; 1747 /* The virtio_hdr is initialised to 0. */ 1748 struct virtio_net_hdr_mrg_rxbuf virtio_hdr 1749 = {{0, 0, 0, 0, 0, 0}, 0}; 1750 uint64_t buff_hdr_addr = 0; 1751 uint32_t head[MAX_PKT_BURST], packet_len = 0; 1752 uint32_t head_idx, packet_success = 0; 1753 uint16_t res_cur_idx; 1754 1755 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); 1756 1757 if (count == 0) 1758 return 0; 1759 1760 vq = dev->virtqueue[VIRTIO_RXQ]; 1761 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; 1762 1763 res_cur_idx = vq->last_used_idx; 1764 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", 1765 dev->device_fh, res_cur_idx, res_cur_idx + count); 1766 1767 /* Retrieve all of the head indexes first to avoid caching issues. */ 1768 for (head_idx = 0; head_idx < count; head_idx++) 1769 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]); 1770 1771 /*Prefetch descriptor index. */ 1772 rte_prefetch0(&vq->desc[head[packet_success]]); 1773 1774 while (packet_success != count) { 1775 /* Get descriptor from available ring */ 1776 desc = &vq->desc[head[packet_success]]; 1777 1778 buff = pkts[packet_success]; 1779 LOG_DEBUG(VHOST_DATA, 1780 "(%"PRIu64") in dev_rx_zcp: update the used idx for " 1781 "pkt[%d] descriptor idx: %d\n", 1782 dev->device_fh, packet_success, 1783 MBUF_HEADROOM_UINT32(buff)); 1784 1785 PRINT_PACKET(dev, 1786 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr) 1787 + RTE_PKTMBUF_HEADROOM), 1788 rte_pktmbuf_data_len(buff), 0); 1789 1790 /* Buffer address translation for virtio header. */ 1791 buff_hdr_addr = gpa_to_vva(dev, desc->addr); 1792 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; 1793 1794 /* 1795 * If the descriptors are chained the header and data are 1796 * placed in separate buffers. 
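 * That is, when VRING_DESC_F_NEXT is set the first descriptor carries only
 * the vq->vhost_hlen bytes of virtio header and the chained descriptor
 * carries the frame data, so their lengths are set to vhost_hlen and
 * rte_pktmbuf_data_len(buff) respectively; otherwise a single descriptor
 * holds header plus data and its length is the combined packet_len.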
1797 */ 1798 if (desc->flags & VRING_DESC_F_NEXT) { 1799 desc->len = vq->vhost_hlen; 1800 desc = &vq->desc[desc->next]; 1801 desc->len = rte_pktmbuf_data_len(buff); 1802 } else { 1803 desc->len = packet_len; 1804 } 1805 1806 /* Update used ring with desc information */ 1807 vq->used->ring[res_cur_idx & (vq->size - 1)].id 1808 = head[packet_success]; 1809 vq->used->ring[res_cur_idx & (vq->size - 1)].len 1810 = packet_len; 1811 res_cur_idx++; 1812 packet_success++; 1813 1814 /* A header is required per buffer. */ 1815 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, 1816 (const void *)&virtio_hdr, vq->vhost_hlen); 1817 1818 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); 1819 1820 if (likely(packet_success < count)) { 1821 /* Prefetch descriptor index. */ 1822 rte_prefetch0(&vq->desc[head[packet_success]]); 1823 } 1824 } 1825 1826 rte_compiler_barrier(); 1827 1828 LOG_DEBUG(VHOST_DATA, 1829 "(%"PRIu64") in dev_rx_zcp: before update used idx: " 1830 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1831 dev->device_fh, vq->last_used_idx, vq->used->idx); 1832 1833 *(volatile uint16_t *)&vq->used->idx += count; 1834 vq->last_used_idx += count; 1835 1836 LOG_DEBUG(VHOST_DATA, 1837 "(%"PRIu64") in dev_rx_zcp: after update used idx: " 1838 "vq.last_used_idx: %d, vq->used->idx: %d\n", 1839 dev->device_fh, vq->last_used_idx, vq->used->idx); 1840 1841 /* Kick the guest if necessary. */ 1842 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) 1843 eventfd_write(vq->callfd, (eventfd_t)1); 1844 1845 return count; 1846 } 1847 1848 /* 1849 * This function routes the TX packet to the correct interface. 1850 * This may be a local device or the physical port. 1851 */ 1852 static inline void __attribute__((always_inline)) 1853 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, 1854 uint32_t desc_idx, uint8_t need_copy) 1855 { 1856 struct mbuf_table *tx_q; 1857 struct rte_mbuf **m_table; 1858 void *obj = NULL; 1859 struct rte_mbuf *mbuf; 1860 unsigned len, ret, offset = 0; 1861 struct vpool *vpool; 1862 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; 1863 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q; 1864 1865 /*Add packet to the port tx queue*/ 1866 tx_q = &tx_queue_zcp[vmdq_rx_q]; 1867 len = tx_q->len; 1868 1869 /* Allocate an mbuf and populate the structure. */ 1870 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q]; 1871 rte_ring_sc_dequeue(vpool->ring, &obj); 1872 mbuf = obj; 1873 if (unlikely(mbuf == NULL)) { 1874 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ]; 1875 RTE_LOG(ERR, VHOST_DATA, 1876 "(%"PRIu64") Failed to allocate memory for mbuf.\n", 1877 dev->device_fh); 1878 put_desc_to_used_list_zcp(vq, desc_idx); 1879 return; 1880 } 1881 1882 if (vm2vm_mode == VM2VM_HARDWARE) { 1883 /* Avoid using a vlan tag from any vm for external pkt, such as 1884 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool 1885 * selection, MAC address determines it as an external pkt 1886 * which should go to network, while vlan tag determine it as 1887 * a vm2vm pkt should forward to another vm. Hardware confuse 1888 * such a ambiguous situation, so pkt will lost. 
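 * In short: packets leaving the host keep the default external VLAN
 * tag, and find_local_dest() switches to the destination VM's tag only
 * when the frame is really destined for another local VM.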
1889 */ 1890 vlan_tag = external_pkt_default_vlan_tag; 1891 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) { 1892 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1893 __rte_mbuf_raw_free(mbuf); 1894 return; 1895 } 1896 } 1897 1898 mbuf->nb_segs = m->nb_segs; 1899 mbuf->next = m->next; 1900 mbuf->data_len = m->data_len + offset; 1901 mbuf->pkt_len = mbuf->data_len; 1902 if (unlikely(need_copy)) { 1903 /* Copy the packet contents to the mbuf. */ 1904 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), 1905 rte_pktmbuf_mtod(m, void *), 1906 m->data_len); 1907 } else { 1908 mbuf->data_off = m->data_off; 1909 mbuf->buf_physaddr = m->buf_physaddr; 1910 mbuf->buf_addr = m->buf_addr; 1911 } 1912 mbuf->ol_flags |= PKT_TX_VLAN_PKT; 1913 mbuf->vlan_tci = vlan_tag; 1914 mbuf->l2_len = sizeof(struct ether_hdr); 1915 mbuf->l3_len = sizeof(struct ipv4_hdr); 1916 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; 1917 1918 tx_q->m_table[len] = mbuf; 1919 len++; 1920 1921 LOG_DEBUG(VHOST_DATA, 1922 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", 1923 dev->device_fh, 1924 mbuf->nb_segs, 1925 (mbuf->next == NULL) ? "null" : "non-null"); 1926 1927 if (enable_stats) { 1928 dev_statistics[dev->device_fh].tx_total++; 1929 dev_statistics[dev->device_fh].tx++; 1930 } 1931 1932 if (unlikely(len == MAX_PKT_BURST)) { 1933 m_table = (struct rte_mbuf **)tx_q->m_table; 1934 ret = rte_eth_tx_burst(ports[0], 1935 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len); 1936 1937 /* 1938 * Free any buffers not handled by TX and update 1939 * the port stats. 1940 */ 1941 if (unlikely(ret < len)) { 1942 do { 1943 rte_pktmbuf_free(m_table[ret]); 1944 } while (++ret < len); 1945 } 1946 1947 len = 0; 1948 txmbuf_clean_zcp(dev, vpool); 1949 } 1950 1951 tx_q->len = len; 1952 1953 return; 1954 } 1955 1956 /* 1957 * This function TX all available packets in virtio TX queue for one 1958 * virtio-net device. If it is first packet, it learns MAC address and 1959 * setup VMDQ. 1960 */ 1961 static inline void __attribute__((always_inline)) 1962 virtio_dev_tx_zcp(struct virtio_net *dev) 1963 { 1964 struct rte_mbuf m; 1965 struct vhost_virtqueue *vq; 1966 struct vring_desc *desc; 1967 uint64_t buff_addr = 0, phys_addr; 1968 uint32_t head[MAX_PKT_BURST]; 1969 uint32_t i; 1970 uint16_t free_entries, packet_success = 0; 1971 uint16_t avail_idx; 1972 uint8_t need_copy = 0; 1973 hpa_type addr_type; 1974 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv; 1975 1976 vq = dev->virtqueue[VIRTIO_TXQ]; 1977 avail_idx = *((volatile uint16_t *)&vq->avail->idx); 1978 1979 /* If there are no available buffers then return. */ 1980 if (vq->last_used_idx_res == avail_idx) 1981 return; 1982 1983 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh); 1984 1985 /* Prefetch available ring to retrieve head indexes. */ 1986 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]); 1987 1988 /* Get the number of free entries in the ring */ 1989 free_entries = (avail_idx - vq->last_used_idx_res); 1990 1991 /* Limit to MAX_PKT_BURST. */ 1992 free_entries 1993 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries; 1994 1995 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", 1996 dev->device_fh, free_entries); 1997 1998 /* Retrieve all of the head indexes first to avoid caching issues. */ 1999 for (i = 0; i < free_entries; i++) 2000 head[i] 2001 = vq->avail->ring[(vq->last_used_idx_res + i) 2002 & (vq->size - 1)]; 2003 2004 vq->last_used_idx_res += free_entries; 2005 2006 /* Prefetch descriptor index. 
*/ 2007 rte_prefetch0(&vq->desc[head[packet_success]]); 2008 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); 2009 2010 while (packet_success < free_entries) { 2011 desc = &vq->desc[head[packet_success]]; 2012 2013 /* Discard first buffer as it is the virtio header */ 2014 desc = &vq->desc[desc->next]; 2015 2016 /* Buffer address translation. */ 2017 buff_addr = gpa_to_vva(dev, desc->addr); 2018 /* Need check extra VLAN_HLEN size for inserting VLAN tag */ 2019 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN, 2020 &addr_type); 2021 2022 if (likely(packet_success < (free_entries - 1))) 2023 /* Prefetch descriptor index. */ 2024 rte_prefetch0(&vq->desc[head[packet_success + 1]]); 2025 2026 if (unlikely(addr_type == PHYS_ADDR_INVALID)) { 2027 RTE_LOG(ERR, VHOST_DATA, 2028 "(%"PRIu64") Invalid frame buffer address found" 2029 "when TX packets!\n", 2030 dev->device_fh); 2031 packet_success++; 2032 continue; 2033 } 2034 2035 /* Prefetch buffer address. */ 2036 rte_prefetch0((void *)(uintptr_t)buff_addr); 2037 2038 /* 2039 * Setup dummy mbuf. This is copied to a real mbuf if 2040 * transmitted out the physical port. 2041 */ 2042 m.data_len = desc->len; 2043 m.nb_segs = 1; 2044 m.next = NULL; 2045 m.data_off = 0; 2046 m.buf_addr = (void *)(uintptr_t)buff_addr; 2047 m.buf_physaddr = phys_addr; 2048 2049 /* 2050 * Check if the frame buffer address from guest crosses 2051 * sub-region or not. 2052 */ 2053 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) { 2054 RTE_LOG(ERR, VHOST_DATA, 2055 "(%"PRIu64") Frame buffer address cross " 2056 "sub-regioin found when attaching TX frame " 2057 "buffer address!\n", 2058 dev->device_fh); 2059 need_copy = 1; 2060 } else 2061 need_copy = 0; 2062 2063 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); 2064 2065 /* 2066 * If this is the first received packet we need to learn 2067 * the MAC and setup VMDQ 2068 */ 2069 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) { 2070 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) { 2071 /* 2072 * Discard frame if device is scheduled for 2073 * removal or a duplicate MAC address is found. 2074 */ 2075 packet_success += free_entries; 2076 vq->last_used_idx += packet_success; 2077 break; 2078 } 2079 } 2080 2081 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy); 2082 packet_success++; 2083 } 2084 } 2085 2086 /* 2087 * This function is called by each data core. It handles all RX/TX registered 2088 * with the core. For TX the specific lcore linked list is used. For RX, MAC 2089 * addresses are compared with all devices in the main linked list. 
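 */

/*
 * A condensed sketch of how virtio_dev_tx_zcp() above reserves work from
 * the available ring: read the guest's avail index once, cap the batch
 * size, snapshot the head indexes and then advance the reservation index.
 * The helper name and the caller-supplied head[] array are illustrative.
 */
static inline uint16_t
reserve_avail_heads(struct vhost_virtqueue *vq, uint32_t *head,
	uint16_t max_burst)
{
	uint16_t avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	uint16_t free_entries = avail_idx - vq->last_used_idx_res;
	uint16_t i;

	if (free_entries > max_burst)
		free_entries = max_burst;

	/* Copy the head indexes before the reservation moves on. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx_res + i)
			& (vq->size - 1)];

	vq->last_used_idx_res += free_entries;

	return free_entries;
}

/*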
2090 */ 2091 static int 2092 switch_worker_zcp(__attribute__((unused)) void *arg) 2093 { 2094 struct virtio_net *dev = NULL; 2095 struct vhost_dev *vdev = NULL; 2096 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 2097 struct virtio_net_data_ll *dev_ll; 2098 struct mbuf_table *tx_q; 2099 volatile struct lcore_ll_info *lcore_ll; 2100 const uint64_t drain_tsc 2101 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S 2102 * BURST_TX_DRAIN_US; 2103 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0; 2104 unsigned ret; 2105 const uint16_t lcore_id = rte_lcore_id(); 2106 uint16_t count_in_ring, rx_count = 0; 2107 2108 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); 2109 2110 lcore_ll = lcore_info[lcore_id].lcore_ll; 2111 prev_tsc = 0; 2112 2113 while (1) { 2114 cur_tsc = rte_rdtsc(); 2115 2116 /* TX burst queue drain */ 2117 diff_tsc = cur_tsc - prev_tsc; 2118 if (unlikely(diff_tsc > drain_tsc)) { 2119 /* 2120 * Get mbuf from vpool.pool and detach mbuf and 2121 * put back into vpool.ring. 2122 */ 2123 dev_ll = lcore_ll->ll_root_used; 2124 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2125 /* Get virtio device ID */ 2126 vdev = dev_ll->vdev; 2127 dev = vdev->dev; 2128 2129 if (likely(!vdev->remove)) { 2130 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2131 if (tx_q->len) { 2132 LOG_DEBUG(VHOST_DATA, 2133 "TX queue drained after timeout" 2134 " with burst size %u\n", 2135 tx_q->len); 2136 2137 /* 2138 * Tx any packets in the queue 2139 */ 2140 ret = rte_eth_tx_burst( 2141 ports[0], 2142 (uint16_t)tx_q->txq_id, 2143 (struct rte_mbuf **) 2144 tx_q->m_table, 2145 (uint16_t)tx_q->len); 2146 if (unlikely(ret < tx_q->len)) { 2147 do { 2148 rte_pktmbuf_free( 2149 tx_q->m_table[ret]); 2150 } while (++ret < tx_q->len); 2151 } 2152 tx_q->len = 0; 2153 2154 txmbuf_clean_zcp(dev, 2155 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]); 2156 } 2157 } 2158 dev_ll = dev_ll->next; 2159 } 2160 prev_tsc = cur_tsc; 2161 } 2162 2163 rte_prefetch0(lcore_ll->ll_root_used); 2164 2165 /* 2166 * Inform the configuration core that we have exited the linked 2167 * list and that no devices are in use if requested. 2168 */ 2169 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 2170 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2171 2172 /* Process devices */ 2173 dev_ll = lcore_ll->ll_root_used; 2174 2175 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) { 2176 vdev = dev_ll->vdev; 2177 dev = vdev->dev; 2178 if (unlikely(vdev->remove)) { 2179 dev_ll = dev_ll->next; 2180 unlink_vmdq(vdev); 2181 vdev->ready = DEVICE_SAFE_REMOVE; 2182 continue; 2183 } 2184 2185 if (likely(vdev->ready == DEVICE_RX)) { 2186 uint32_t index = vdev->vmdq_rx_q; 2187 uint16_t i; 2188 count_in_ring 2189 = rte_ring_count(vpool_array[index].ring); 2190 uint16_t free_entries 2191 = (uint16_t)get_available_ring_num_zcp(dev); 2192 2193 /* 2194 * Attach all mbufs in vpool.ring and put back 2195 * into vpool.pool. 
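 * The attach count below is bounded by the free guest RX descriptors,
 * the mbufs currently sitting in the ring and MAX_PKT_BURST, so only
 * descriptors the guest has actually posted get an mbuf attached.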
2196 */ 2197 for (i = 0; 2198 i < RTE_MIN(free_entries, 2199 RTE_MIN(count_in_ring, MAX_PKT_BURST)); 2200 i++) 2201 attach_rxmbuf_zcp(dev); 2202 2203 /* Handle guest RX */ 2204 rx_count = rte_eth_rx_burst(ports[0], 2205 vdev->vmdq_rx_q, pkts_burst, 2206 MAX_PKT_BURST); 2207 2208 if (rx_count) { 2209 ret_count = virtio_dev_rx_zcp(dev, 2210 pkts_burst, rx_count); 2211 if (enable_stats) { 2212 dev_statistics[dev->device_fh].rx_total 2213 += rx_count; 2214 dev_statistics[dev->device_fh].rx 2215 += ret_count; 2216 } 2217 while (likely(rx_count)) { 2218 rx_count--; 2219 pktmbuf_detach_zcp( 2220 pkts_burst[rx_count]); 2221 rte_ring_sp_enqueue( 2222 vpool_array[index].ring, 2223 (void *)pkts_burst[rx_count]); 2224 } 2225 } 2226 } 2227 2228 if (likely(!vdev->remove)) 2229 /* Handle guest TX */ 2230 virtio_dev_tx_zcp(dev); 2231 2232 /* Move to the next device in the list */ 2233 dev_ll = dev_ll->next; 2234 } 2235 } 2236 2237 return 0; 2238 } 2239 2240 2241 /* 2242 * Add an entry to a used linked list. A free entry must first be found 2243 * in the free linked list using get_data_ll_free_entry(); 2244 */ 2245 static void 2246 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2247 struct virtio_net_data_ll *ll_dev) 2248 { 2249 struct virtio_net_data_ll *ll = *ll_root_addr; 2250 2251 /* Set next as NULL and use a compiler barrier to avoid reordering. */ 2252 ll_dev->next = NULL; 2253 rte_compiler_barrier(); 2254 2255 /* If ll == NULL then this is the first device. */ 2256 if (ll) { 2257 /* Increment to the tail of the linked list. */ 2258 while ((ll->next != NULL) ) 2259 ll = ll->next; 2260 2261 ll->next = ll_dev; 2262 } else { 2263 *ll_root_addr = ll_dev; 2264 } 2265 } 2266 2267 /* 2268 * Remove an entry from a used linked list. The entry must then be added to 2269 * the free linked list using put_data_ll_free_entry(). 2270 */ 2271 static void 2272 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, 2273 struct virtio_net_data_ll *ll_dev, 2274 struct virtio_net_data_ll *ll_dev_last) 2275 { 2276 struct virtio_net_data_ll *ll = *ll_root_addr; 2277 2278 if (unlikely((ll == NULL) || (ll_dev == NULL))) 2279 return; 2280 2281 if (ll_dev == ll) 2282 *ll_root_addr = ll_dev->next; 2283 else 2284 if (likely(ll_dev_last != NULL)) 2285 ll_dev_last->next = ll_dev->next; 2286 else 2287 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n"); 2288 } 2289 2290 /* 2291 * Find and return an entry from the free linked list. 2292 */ 2293 static struct virtio_net_data_ll * 2294 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr) 2295 { 2296 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2297 struct virtio_net_data_ll *ll_dev; 2298 2299 if (ll_free == NULL) 2300 return NULL; 2301 2302 ll_dev = ll_free; 2303 *ll_root_addr = ll_free->next; 2304 2305 return ll_dev; 2306 } 2307 2308 /* 2309 * Place an entry back on to the free linked list. 2310 */ 2311 static void 2312 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, 2313 struct virtio_net_data_ll *ll_dev) 2314 { 2315 struct virtio_net_data_ll *ll_free = *ll_root_addr; 2316 2317 if (ll_dev == NULL) 2318 return; 2319 2320 ll_dev->next = ll_free; 2321 *ll_root_addr = ll_dev; 2322 } 2323 2324 /* 2325 * Creates a linked list of a given size. 2326 */ 2327 static struct virtio_net_data_ll * 2328 alloc_data_ll(uint32_t size) 2329 { 2330 struct virtio_net_data_ll *ll_new; 2331 uint32_t i; 2332 2333 /* Malloc and then chain the linked list. 
*/ 2334 ll_new = malloc(size * sizeof(struct virtio_net_data_ll)); 2335 if (ll_new == NULL) { 2336 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n"); 2337 return NULL; 2338 } 2339 2340 for (i = 0; i < size - 1; i++) { 2341 ll_new[i].vdev = NULL; 2342 ll_new[i].next = &ll_new[i+1]; 2343 } 2344 ll_new[i].next = NULL; 2345 2346 return ll_new; 2347 } 2348 2349 /* 2350 * Create the main linked list along with each individual cores linked list. A used and a free list 2351 * are created to manage entries. 2352 */ 2353 static int 2354 init_data_ll (void) 2355 { 2356 int lcore; 2357 2358 RTE_LCORE_FOREACH_SLAVE(lcore) { 2359 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info)); 2360 if (lcore_info[lcore].lcore_ll == NULL) { 2361 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n"); 2362 return -1; 2363 } 2364 2365 lcore_info[lcore].lcore_ll->device_num = 0; 2366 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL; 2367 lcore_info[lcore].lcore_ll->ll_root_used = NULL; 2368 if (num_devices % num_switching_cores) 2369 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1); 2370 else 2371 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores); 2372 } 2373 2374 /* Allocate devices up to a maximum of MAX_DEVICES. */ 2375 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES)); 2376 2377 return 0; 2378 } 2379 2380 /* 2381 * Remove a device from the specific data core linked list and from the main linked list. Synchonization 2382 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 2383 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 2384 */ 2385 static void 2386 destroy_device (volatile struct virtio_net *dev) 2387 { 2388 struct virtio_net_data_ll *ll_lcore_dev_cur; 2389 struct virtio_net_data_ll *ll_main_dev_cur; 2390 struct virtio_net_data_ll *ll_lcore_dev_last = NULL; 2391 struct virtio_net_data_ll *ll_main_dev_last = NULL; 2392 struct vhost_dev *vdev; 2393 int lcore; 2394 2395 dev->flags &= ~VIRTIO_DEV_RUNNING; 2396 2397 vdev = (struct vhost_dev *)dev->priv; 2398 /*set the remove flag. */ 2399 vdev->remove = 1; 2400 while(vdev->ready != DEVICE_SAFE_REMOVE) { 2401 rte_pause(); 2402 } 2403 2404 /* Search for entry to be removed from lcore ll */ 2405 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used; 2406 while (ll_lcore_dev_cur != NULL) { 2407 if (ll_lcore_dev_cur->vdev == vdev) { 2408 break; 2409 } else { 2410 ll_lcore_dev_last = ll_lcore_dev_cur; 2411 ll_lcore_dev_cur = ll_lcore_dev_cur->next; 2412 } 2413 } 2414 2415 if (ll_lcore_dev_cur == NULL) { 2416 RTE_LOG(ERR, VHOST_CONFIG, 2417 "(%"PRIu64") Failed to find the dev to be destroy.\n", 2418 dev->device_fh); 2419 return; 2420 } 2421 2422 /* Search for entry to be removed from main ll */ 2423 ll_main_dev_cur = ll_root_used; 2424 ll_main_dev_last = NULL; 2425 while (ll_main_dev_cur != NULL) { 2426 if (ll_main_dev_cur->vdev == vdev) { 2427 break; 2428 } else { 2429 ll_main_dev_last = ll_main_dev_cur; 2430 ll_main_dev_cur = ll_main_dev_cur->next; 2431 } 2432 } 2433 2434 /* Remove entries from the lcore and main ll. */ 2435 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last); 2436 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last); 2437 2438 /* Set the dev_removal_flag on each lcore. 
*/ 2439 RTE_LCORE_FOREACH_SLAVE(lcore) { 2440 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL; 2441 } 2442 2443 /* 2444 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that 2445 * they can no longer access the device removed from the linked lists and that the devices 2446 * are no longer in use. 2447 */ 2448 RTE_LCORE_FOREACH_SLAVE(lcore) { 2449 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) { 2450 rte_pause(); 2451 } 2452 } 2453 2454 /* Add the entries back to the lcore and main free ll.*/ 2455 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur); 2456 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur); 2457 2458 /* Decrement number of device on the lcore. */ 2459 lcore_info[vdev->coreid].lcore_ll->device_num--; 2460 2461 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh); 2462 2463 if (zero_copy) { 2464 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2465 2466 /* Stop the RX queue. */ 2467 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2468 LOG_DEBUG(VHOST_CONFIG, 2469 "(%"PRIu64") In destroy_device: Failed to stop " 2470 "rx queue:%d\n", 2471 dev->device_fh, 2472 vdev->vmdq_rx_q); 2473 } 2474 2475 LOG_DEBUG(VHOST_CONFIG, 2476 "(%"PRIu64") in destroy_device: Start put mbuf in " 2477 "mempool back to ring for RX queue: %d\n", 2478 dev->device_fh, vdev->vmdq_rx_q); 2479 2480 mbuf_destroy_zcp(vpool); 2481 2482 /* Stop the TX queue. */ 2483 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) { 2484 LOG_DEBUG(VHOST_CONFIG, 2485 "(%"PRIu64") In destroy_device: Failed to " 2486 "stop tx queue:%d\n", 2487 dev->device_fh, vdev->vmdq_rx_q); 2488 } 2489 2490 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES]; 2491 2492 LOG_DEBUG(VHOST_CONFIG, 2493 "(%"PRIu64") destroy_device: Start put mbuf in mempool " 2494 "back to ring for TX queue: %d, dev:(%"PRIu64")\n", 2495 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES), 2496 dev->device_fh); 2497 2498 mbuf_destroy_zcp(vpool); 2499 rte_free(vdev->regions_hpa); 2500 } 2501 rte_free(vdev); 2502 2503 } 2504 2505 /* 2506 * Calculate the region count of physical continous regions for one particular 2507 * region of whose vhost virtual address is continous. The particular region 2508 * start from vva_start, with size of 'size' in argument. 
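 */

/*
 * A minimal sketch of the removal handshake used by destroy_device()
 * above: the configuration core raises REQUEST_DEV_REMOVAL on every
 * worker and spins until each worker, at the top of its loop and thus
 * outside the device list, answers with ACK_DEV_REMOVAL (see
 * switch_worker_zcp()). The helper name is illustrative.
 */
static inline void
wait_for_workers_ack(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag =
			REQUEST_DEV_REMOVAL;
	}

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag !=
				ACK_DEV_REMOVAL)
			rte_pause();
	}
}

/*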
2509 */ 2510 static uint32_t 2511 check_hpa_regions(uint64_t vva_start, uint64_t size) 2512 { 2513 uint32_t i, nregions = 0, page_size = getpagesize(); 2514 uint64_t cur_phys_addr = 0, next_phys_addr = 0; 2515 if (vva_start % page_size) { 2516 LOG_DEBUG(VHOST_CONFIG, 2517 "in check_countinous: vva start(%p) mod page_size(%d) " 2518 "has remainder\n", 2519 (void *)(uintptr_t)vva_start, page_size); 2520 return 0; 2521 } 2522 if (size % page_size) { 2523 LOG_DEBUG(VHOST_CONFIG, 2524 "in check_countinous: " 2525 "size((%"PRIu64")) mod page_size(%d) has remainder\n", 2526 size, page_size); 2527 return 0; 2528 } 2529 for (i = 0; i < size - page_size; i = i + page_size) { 2530 cur_phys_addr 2531 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i)); 2532 next_phys_addr = rte_mem_virt2phy( 2533 (void *)(uintptr_t)(vva_start + i + page_size)); 2534 if ((cur_phys_addr + page_size) != next_phys_addr) { 2535 ++nregions; 2536 LOG_DEBUG(VHOST_CONFIG, 2537 "in check_continuous: hva addr:(%p) is not " 2538 "continuous with hva addr:(%p), diff:%d\n", 2539 (void *)(uintptr_t)(vva_start + (uint64_t)i), 2540 (void *)(uintptr_t)(vva_start + (uint64_t)i 2541 + page_size), page_size); 2542 LOG_DEBUG(VHOST_CONFIG, 2543 "in check_continuous: hpa addr:(%p) is not " 2544 "continuous with hpa addr:(%p), " 2545 "diff:(%"PRIu64")\n", 2546 (void *)(uintptr_t)cur_phys_addr, 2547 (void *)(uintptr_t)next_phys_addr, 2548 (next_phys_addr-cur_phys_addr)); 2549 } 2550 } 2551 return nregions; 2552 } 2553 2554 /* 2555 * Divide each region whose vhost virtual address is continous into a few 2556 * sub-regions, make sure the physical address within each sub-region are 2557 * continous. And fill offset(to GPA) and size etc. information of each 2558 * sub-region into regions_hpa. 2559 */ 2560 static uint32_t 2561 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory) 2562 { 2563 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize(); 2564 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start; 2565 2566 if (mem_region_hpa == NULL) 2567 return 0; 2568 2569 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) { 2570 vva_start = virtio_memory->regions[regionidx].guest_phys_address + 2571 virtio_memory->regions[regionidx].address_offset; 2572 mem_region_hpa[regionidx_hpa].guest_phys_address 2573 = virtio_memory->regions[regionidx].guest_phys_address; 2574 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2575 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) - 2576 mem_region_hpa[regionidx_hpa].guest_phys_address; 2577 LOG_DEBUG(VHOST_CONFIG, 2578 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n", 2579 regionidx_hpa, 2580 (void *)(uintptr_t) 2581 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2582 LOG_DEBUG(VHOST_CONFIG, 2583 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n", 2584 regionidx_hpa, 2585 (void *)(uintptr_t) 2586 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2587 for (i = 0, k = 0; 2588 i < virtio_memory->regions[regionidx].memory_size - 2589 page_size; 2590 i += page_size) { 2591 cur_phys_addr = rte_mem_virt2phy( 2592 (void *)(uintptr_t)(vva_start + i)); 2593 next_phys_addr = rte_mem_virt2phy( 2594 (void *)(uintptr_t)(vva_start + 2595 i + page_size)); 2596 if ((cur_phys_addr + page_size) != next_phys_addr) { 2597 mem_region_hpa[regionidx_hpa].guest_phys_address_end = 2598 mem_region_hpa[regionidx_hpa].guest_phys_address + 2599 k + page_size; 2600 mem_region_hpa[regionidx_hpa].memory_size 2601 = k + 
page_size; 2602 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest " 2603 "phys addr end [%d]:(%p)\n", 2604 regionidx_hpa, 2605 (void *)(uintptr_t) 2606 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2607 LOG_DEBUG(VHOST_CONFIG, 2608 "in fill_hpa_regions: guest phys addr " 2609 "size [%d]:(%p)\n", 2610 regionidx_hpa, 2611 (void *)(uintptr_t) 2612 (mem_region_hpa[regionidx_hpa].memory_size)); 2613 mem_region_hpa[regionidx_hpa + 1].guest_phys_address 2614 = mem_region_hpa[regionidx_hpa].guest_phys_address_end; 2615 ++regionidx_hpa; 2616 mem_region_hpa[regionidx_hpa].host_phys_addr_offset = 2617 next_phys_addr - 2618 mem_region_hpa[regionidx_hpa].guest_phys_address; 2619 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest" 2620 " phys addr start[%d]:(%p)\n", 2621 regionidx_hpa, 2622 (void *)(uintptr_t) 2623 (mem_region_hpa[regionidx_hpa].guest_phys_address)); 2624 LOG_DEBUG(VHOST_CONFIG, 2625 "in fill_hpa_regions: host phys addr " 2626 "start[%d]:(%p)\n", 2627 regionidx_hpa, 2628 (void *)(uintptr_t) 2629 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset)); 2630 k = 0; 2631 } else { 2632 k += page_size; 2633 } 2634 } 2635 mem_region_hpa[regionidx_hpa].guest_phys_address_end 2636 = mem_region_hpa[regionidx_hpa].guest_phys_address 2637 + k + page_size; 2638 mem_region_hpa[regionidx_hpa].memory_size = k + page_size; 2639 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end " 2640 "[%d]:(%p)\n", regionidx_hpa, 2641 (void *)(uintptr_t) 2642 (mem_region_hpa[regionidx_hpa].guest_phys_address_end)); 2643 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size " 2644 "[%d]:(%p)\n", regionidx_hpa, 2645 (void *)(uintptr_t) 2646 (mem_region_hpa[regionidx_hpa].memory_size)); 2647 ++regionidx_hpa; 2648 } 2649 return regionidx_hpa; 2650 } 2651 2652 /* 2653 * A new device is added to a data core. First the device is added to the main linked list 2654 * and the allocated to a specific data core. 
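 */

/*
 * The table built above is what gpa_to_hpa(), used in virtio_dev_tx_zcp()
 * above, consults at run time. A simplified sketch of such a lookup, using
 * only the fields filled in by fill_hpa_memory_regions(), could look like
 * the following (the actual gpa_to_hpa() may differ in detail).
 */
static inline uint64_t
lookup_hpa(struct vhost_dev *vdev, uint64_t guest_pa, uint32_t size,
	hpa_type *addr_type)
{
	uint32_t i;
	struct virtio_memory_regions_hpa *region;

	*addr_type = PHYS_ADDR_INVALID;

	for (i = 0; i < vdev->nregions_hpa; i++) {
		region = &vdev->regions_hpa[i];
		if (guest_pa < region->guest_phys_address ||
		    guest_pa >= region->guest_phys_address_end)
			continue;

		/* Flag buffers spilling past this physically contiguous
		 * sub-region so the caller can fall back to a copy. */
		*addr_type = (guest_pa + size <=
			region->guest_phys_address_end) ?
			PHYS_ADDR_CONTINUOUS : PHYS_ADDR_CROSS_SUBREG;

		return guest_pa + region->host_phys_addr_offset;
	}

	return 0;
}

/*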
2655 */ 2656 static int 2657 new_device (struct virtio_net *dev) 2658 { 2659 struct virtio_net_data_ll *ll_dev; 2660 int lcore, core_add = 0; 2661 uint32_t device_num_min = num_devices; 2662 struct vhost_dev *vdev; 2663 uint32_t regionidx; 2664 2665 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 2666 if (vdev == NULL) { 2667 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n", 2668 dev->device_fh); 2669 return -1; 2670 } 2671 vdev->dev = dev; 2672 dev->priv = vdev; 2673 2674 if (zero_copy) { 2675 vdev->nregions_hpa = dev->mem->nregions; 2676 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { 2677 vdev->nregions_hpa 2678 += check_hpa_regions( 2679 dev->mem->regions[regionidx].guest_phys_address 2680 + dev->mem->regions[regionidx].address_offset, 2681 dev->mem->regions[regionidx].memory_size); 2682 2683 } 2684 2685 vdev->regions_hpa = rte_calloc("vhost hpa region", 2686 vdev->nregions_hpa, 2687 sizeof(struct virtio_memory_regions_hpa), 2688 RTE_CACHE_LINE_SIZE); 2689 if (vdev->regions_hpa == NULL) { 2690 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n"); 2691 rte_free(vdev); 2692 return -1; 2693 } 2694 2695 2696 if (fill_hpa_memory_regions( 2697 vdev->regions_hpa, dev->mem 2698 ) != vdev->nregions_hpa) { 2699 2700 RTE_LOG(ERR, VHOST_CONFIG, 2701 "hpa memory regions number mismatch: " 2702 "[%d]\n", vdev->nregions_hpa); 2703 rte_free(vdev->regions_hpa); 2704 rte_free(vdev); 2705 return -1; 2706 } 2707 } 2708 2709 2710 /* Add device to main ll */ 2711 ll_dev = get_data_ll_free_entry(&ll_root_free); 2712 if (ll_dev == NULL) { 2713 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit " 2714 "of %d devices per core has been reached\n", 2715 dev->device_fh, num_devices); 2716 if (vdev->regions_hpa) 2717 rte_free(vdev->regions_hpa); 2718 rte_free(vdev); 2719 return -1; 2720 } 2721 ll_dev->vdev = vdev; 2722 add_data_ll_entry(&ll_root_used, ll_dev); 2723 vdev->vmdq_rx_q 2724 = dev->device_fh * queues_per_pool + vmdq_queue_base; 2725 2726 if (zero_copy) { 2727 uint32_t index = vdev->vmdq_rx_q; 2728 uint32_t count_in_ring, i; 2729 struct mbuf_table *tx_q; 2730 2731 count_in_ring = rte_ring_count(vpool_array[index].ring); 2732 2733 LOG_DEBUG(VHOST_CONFIG, 2734 "(%"PRIu64") in new_device: mbuf count in mempool " 2735 "before attach is: %d\n", 2736 dev->device_fh, 2737 rte_mempool_count(vpool_array[index].pool)); 2738 LOG_DEBUG(VHOST_CONFIG, 2739 "(%"PRIu64") in new_device: mbuf count in ring " 2740 "before attach is : %d\n", 2741 dev->device_fh, count_in_ring); 2742 2743 /* 2744 * Attach all mbufs in vpool.ring and put back intovpool.pool. 
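 * Pre-attaching here pairs every free guest RX descriptor with an mbuf
 * before the RX queue is started below, so the NIC can DMA into guest
 * buffers from the very first received burst.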
2745 */ 2746 for (i = 0; i < count_in_ring; i++) 2747 attach_rxmbuf_zcp(dev); 2748 2749 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2750 "mempool after attach is: %d\n", 2751 dev->device_fh, 2752 rte_mempool_count(vpool_array[index].pool)); 2753 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in " 2754 "ring after attach is : %d\n", 2755 dev->device_fh, 2756 rte_ring_count(vpool_array[index].ring)); 2757 2758 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q]; 2759 tx_q->txq_id = vdev->vmdq_rx_q; 2760 2761 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2762 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2763 2764 LOG_DEBUG(VHOST_CONFIG, 2765 "(%"PRIu64") In new_device: Failed to start " 2766 "tx queue:%d\n", 2767 dev->device_fh, vdev->vmdq_rx_q); 2768 2769 mbuf_destroy_zcp(vpool); 2770 rte_free(vdev->regions_hpa); 2771 rte_free(vdev); 2772 return -1; 2773 } 2774 2775 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) { 2776 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q]; 2777 2778 LOG_DEBUG(VHOST_CONFIG, 2779 "(%"PRIu64") In new_device: Failed to start " 2780 "rx queue:%d\n", 2781 dev->device_fh, vdev->vmdq_rx_q); 2782 2783 /* Stop the TX queue. */ 2784 if (rte_eth_dev_tx_queue_stop(ports[0], 2785 vdev->vmdq_rx_q) != 0) { 2786 LOG_DEBUG(VHOST_CONFIG, 2787 "(%"PRIu64") In new_device: Failed to " 2788 "stop tx queue:%d\n", 2789 dev->device_fh, vdev->vmdq_rx_q); 2790 } 2791 2792 mbuf_destroy_zcp(vpool); 2793 rte_free(vdev->regions_hpa); 2794 rte_free(vdev); 2795 return -1; 2796 } 2797 2798 } 2799 2800 /*reset ready flag*/ 2801 vdev->ready = DEVICE_MAC_LEARNING; 2802 vdev->remove = 0; 2803 2804 /* Find a suitable lcore to add the device. */ 2805 RTE_LCORE_FOREACH_SLAVE(lcore) { 2806 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) { 2807 device_num_min = lcore_info[lcore].lcore_ll->device_num; 2808 core_add = lcore; 2809 } 2810 } 2811 /* Add device to lcore ll */ 2812 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free); 2813 if (ll_dev == NULL) { 2814 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh); 2815 vdev->ready = DEVICE_SAFE_REMOVE; 2816 destroy_device(dev); 2817 rte_free(vdev->regions_hpa); 2818 rte_free(vdev); 2819 return -1; 2820 } 2821 ll_dev->vdev = vdev; 2822 vdev->coreid = core_add; 2823 2824 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev); 2825 2826 /* Initialize device stats */ 2827 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics)); 2828 2829 /* Disable notifications. */ 2830 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0); 2831 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0); 2832 lcore_info[vdev->coreid].lcore_ll->device_num++; 2833 dev->flags |= VIRTIO_DEV_RUNNING; 2834 2835 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid); 2836 2837 return 0; 2838 } 2839 2840 /* 2841 * These callback allow devices to be added to the data core when configuration 2842 * has been fully complete. 2843 */ 2844 static const struct virtio_net_device_ops virtio_net_device_ops = 2845 { 2846 .new_device = new_device, 2847 .destroy_device = destroy_device, 2848 }; 2849 2850 /* 2851 * This is a thread will wake up after a period to print stats if the user has 2852 * enabled them. 
2853 */ 2854 static void 2855 print_stats(void) 2856 { 2857 struct virtio_net_data_ll *dev_ll; 2858 uint64_t tx_dropped, rx_dropped; 2859 uint64_t tx, tx_total, rx, rx_total; 2860 uint32_t device_fh; 2861 const char clr[] = { 27, '[', '2', 'J', '\0' }; 2862 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 2863 2864 while(1) { 2865 sleep(enable_stats); 2866 2867 /* Clear screen and move to top left */ 2868 printf("%s%s", clr, top_left); 2869 2870 printf("\nDevice statistics ===================================="); 2871 2872 dev_ll = ll_root_used; 2873 while (dev_ll != NULL) { 2874 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh; 2875 tx_total = dev_statistics[device_fh].tx_total; 2876 tx = dev_statistics[device_fh].tx; 2877 tx_dropped = tx_total - tx; 2878 if (zero_copy == 0) { 2879 rx_total = rte_atomic64_read( 2880 &dev_statistics[device_fh].rx_total_atomic); 2881 rx = rte_atomic64_read( 2882 &dev_statistics[device_fh].rx_atomic); 2883 } else { 2884 rx_total = dev_statistics[device_fh].rx_total; 2885 rx = dev_statistics[device_fh].rx; 2886 } 2887 rx_dropped = rx_total - rx; 2888 2889 printf("\nStatistics for device %"PRIu32" ------------------------------" 2890 "\nTX total: %"PRIu64"" 2891 "\nTX dropped: %"PRIu64"" 2892 "\nTX successful: %"PRIu64"" 2893 "\nRX total: %"PRIu64"" 2894 "\nRX dropped: %"PRIu64"" 2895 "\nRX successful: %"PRIu64"", 2896 device_fh, 2897 tx_total, 2898 tx_dropped, 2899 tx, 2900 rx_total, 2901 rx_dropped, 2902 rx); 2903 2904 dev_ll = dev_ll->next; 2905 } 2906 printf("\n======================================================\n"); 2907 } 2908 } 2909 2910 static void 2911 setup_mempool_tbl(int socket, uint32_t index, char *pool_name, 2912 char *ring_name, uint32_t nb_mbuf) 2913 { 2914 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, 2915 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket); 2916 if (vpool_array[index].pool != NULL) { 2917 vpool_array[index].ring 2918 = rte_ring_create(ring_name, 2919 rte_align32pow2(nb_mbuf + 1), 2920 socket, RING_F_SP_ENQ | RING_F_SC_DEQ); 2921 if (likely(vpool_array[index].ring != NULL)) { 2922 LOG_DEBUG(VHOST_CONFIG, 2923 "in setup_mempool_tbl: mbuf count in " 2924 "mempool is: %d\n", 2925 rte_mempool_count(vpool_array[index].pool)); 2926 LOG_DEBUG(VHOST_CONFIG, 2927 "in setup_mempool_tbl: mbuf count in " 2928 "ring is: %d\n", 2929 rte_ring_count(vpool_array[index].ring)); 2930 } else { 2931 rte_exit(EXIT_FAILURE, "ring_create(%s) failed", 2932 ring_name); 2933 } 2934 2935 /* Need consider head room. */ 2936 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP; 2937 } else { 2938 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name); 2939 } 2940 } 2941 2942 /* When we receive a INT signal, unregister vhost driver */ 2943 static void 2944 sigint_handler(__rte_unused int signum) 2945 { 2946 /* Unregister vhost driver. */ 2947 int ret = rte_vhost_driver_unregister((char *)&dev_basename); 2948 if (ret != 0) 2949 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n"); 2950 exit(0); 2951 } 2952 2953 /* 2954 * Main function, does initialisation and calls the per-lcore functions. The CUSE 2955 * device is also registered here to handle the IOCTLs. 
2956 */ 2957 int 2958 main(int argc, char *argv[]) 2959 { 2960 struct rte_mempool *mbuf_pool = NULL; 2961 unsigned lcore_id, core_id = 0; 2962 unsigned nb_ports, valid_num_ports; 2963 int ret; 2964 uint8_t portid; 2965 uint16_t queue_id; 2966 static pthread_t tid; 2967 char thread_name[RTE_MAX_THREAD_NAME_LEN]; 2968 2969 signal(SIGINT, sigint_handler); 2970 2971 /* init EAL */ 2972 ret = rte_eal_init(argc, argv); 2973 if (ret < 0) 2974 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 2975 argc -= ret; 2976 argv += ret; 2977 2978 /* parse app arguments */ 2979 ret = us_vhost_parse_args(argc, argv); 2980 if (ret < 0) 2981 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 2982 2983 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) 2984 if (rte_lcore_is_enabled(lcore_id)) 2985 lcore_ids[core_id ++] = lcore_id; 2986 2987 if (rte_lcore_count() > RTE_MAX_LCORE) 2988 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 2989 2990 /*set the number of swithcing cores available*/ 2991 num_switching_cores = rte_lcore_count()-1; 2992 2993 /* Get the number of physical ports. */ 2994 nb_ports = rte_eth_dev_count(); 2995 if (nb_ports > RTE_MAX_ETHPORTS) 2996 nb_ports = RTE_MAX_ETHPORTS; 2997 2998 /* 2999 * Update the global var NUM_PORTS and global array PORTS 3000 * and get value of var VALID_NUM_PORTS according to system ports number 3001 */ 3002 valid_num_ports = check_ports_num(nb_ports); 3003 3004 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 3005 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 3006 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 3007 return -1; 3008 } 3009 3010 if (zero_copy == 0) { 3011 /* Create the mbuf pool. */ 3012 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", 3013 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 3014 0, MBUF_DATA_SIZE, rte_socket_id()); 3015 if (mbuf_pool == NULL) 3016 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 3017 3018 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++) 3019 vpool_array[queue_id].pool = mbuf_pool; 3020 3021 if (vm2vm_mode == VM2VM_HARDWARE) { 3022 /* Enable VT loop back to let L2 switch to do it. */ 3023 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3024 LOG_DEBUG(VHOST_CONFIG, 3025 "Enable loop back for L2 switch in vmdq.\n"); 3026 } 3027 } else { 3028 uint32_t nb_mbuf; 3029 char pool_name[RTE_MEMPOOL_NAMESIZE]; 3030 char ring_name[RTE_MEMPOOL_NAMESIZE]; 3031 3032 nb_mbuf = num_rx_descriptor 3033 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3034 + num_switching_cores * MAX_PKT_BURST; 3035 3036 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3037 snprintf(pool_name, sizeof(pool_name), 3038 "rxmbuf_pool_%u", queue_id); 3039 snprintf(ring_name, sizeof(ring_name), 3040 "rxmbuf_ring_%u", queue_id); 3041 setup_mempool_tbl(rte_socket_id(), queue_id, 3042 pool_name, ring_name, nb_mbuf); 3043 } 3044 3045 nb_mbuf = num_tx_descriptor 3046 + num_switching_cores * MBUF_CACHE_SIZE_ZCP 3047 + num_switching_cores * MAX_PKT_BURST; 3048 3049 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { 3050 snprintf(pool_name, sizeof(pool_name), 3051 "txmbuf_pool_%u", queue_id); 3052 snprintf(ring_name, sizeof(ring_name), 3053 "txmbuf_ring_%u", queue_id); 3054 setup_mempool_tbl(rte_socket_id(), 3055 (queue_id + MAX_QUEUES), 3056 pool_name, ring_name, nb_mbuf); 3057 } 3058 3059 if (vm2vm_mode == VM2VM_HARDWARE) { 3060 /* Enable VT loop back to let L2 switch to do it. 
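 * With VM2VM_HARDWARE the NIC's embedded L2 switch must return
 * VM-to-VM frames to the destination VM's queue on the same port,
 * which is what the loop back bit enables.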
*/ 3061 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 3062 LOG_DEBUG(VHOST_CONFIG, 3063 "Enable loop back for L2 switch in vmdq.\n"); 3064 } 3065 } 3066 /* Set log level. */ 3067 rte_set_log_level(LOG_LEVEL); 3068 3069 /* initialize all ports */ 3070 for (portid = 0; portid < nb_ports; portid++) { 3071 /* skip ports that are not enabled */ 3072 if ((enabled_port_mask & (1 << portid)) == 0) { 3073 RTE_LOG(INFO, VHOST_PORT, 3074 "Skipping disabled port %d\n", portid); 3075 continue; 3076 } 3077 if (port_init(portid) != 0) 3078 rte_exit(EXIT_FAILURE, 3079 "Cannot initialize network ports\n"); 3080 } 3081 3082 /* Initialise all linked lists. */ 3083 if (init_data_ll() == -1) 3084 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n"); 3085 3086 /* Initialize device stats */ 3087 memset(&dev_statistics, 0, sizeof(dev_statistics)); 3088 3089 /* Enable stats if the user option is set. */ 3090 if (enable_stats) { 3091 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL); 3092 if (ret != 0) 3093 rte_exit(EXIT_FAILURE, 3094 "Cannot create print-stats thread\n"); 3095 3096 /* Set thread_name for aid in debugging. */ 3097 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats"); 3098 ret = rte_thread_setname(tid, thread_name); 3099 if (ret != 0) 3100 RTE_LOG(ERR, VHOST_CONFIG, 3101 "Cannot set print-stats name\n"); 3102 } 3103 3104 /* Launch all data cores. */ 3105 if (zero_copy == 0) { 3106 RTE_LCORE_FOREACH_SLAVE(lcore_id) { 3107 rte_eal_remote_launch(switch_worker, 3108 mbuf_pool, lcore_id); 3109 } 3110 } else { 3111 uint32_t count_in_mempool, index, i; 3112 for (index = 0; index < 2*MAX_QUEUES; index++) { 3113 /* For all RX and TX queues. */ 3114 count_in_mempool 3115 = rte_mempool_count(vpool_array[index].pool); 3116 3117 /* 3118 * Transfer all un-attached mbufs from vpool.pool 3119 * to vpoo.ring. 3120 */ 3121 for (i = 0; i < count_in_mempool; i++) { 3122 struct rte_mbuf *mbuf 3123 = __rte_mbuf_raw_alloc( 3124 vpool_array[index].pool); 3125 rte_ring_sp_enqueue(vpool_array[index].ring, 3126 (void *)mbuf); 3127 } 3128 3129 LOG_DEBUG(VHOST_CONFIG, 3130 "in main: mbuf count in mempool at initial " 3131 "is: %d\n", count_in_mempool); 3132 LOG_DEBUG(VHOST_CONFIG, 3133 "in main: mbuf count in ring at initial is :" 3134 " %d\n", 3135 rte_ring_count(vpool_array[index].ring)); 3136 } 3137 3138 RTE_LCORE_FOREACH_SLAVE(lcore_id) 3139 rte_eal_remote_launch(switch_worker_zcp, NULL, 3140 lcore_id); 3141 } 3142 3143 if (mergeable == 0) 3144 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF); 3145 3146 /* Register vhost(cuse or user) driver to handle vhost messages. */ 3147 ret = rte_vhost_driver_register((char *)&dev_basename); 3148 if (ret != 0) 3149 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n"); 3150 3151 rte_vhost_driver_callback_register(&virtio_net_device_ops); 3152 3153 /* Start CUSE session. */ 3154 rte_vhost_driver_session_start(); 3155 return 0; 3156 3157 } 3158
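
/*
 * A small sketch of the zero-copy bookkeeping convention used throughout
 * this file: the index of the guest descriptor backing an mbuf is stashed
 * in the first word of the mbuf headroom when a buffer is attached or
 * routed, and read back when the used ring is filled in txmbuf_clean_zcp()
 * and virtio_dev_rx_zcp(). The helper names are illustrative only.
 */
static inline void
zcp_stash_desc_idx(struct rte_mbuf *mb, uint32_t desc_idx)
{
	/* The first 4 bytes of headroom remember the owning descriptor. */
	MBUF_HEADROOM_UINT32(mb) = desc_idx;
}

static inline uint32_t
zcp_fetch_desc_idx(struct rte_mbuf *mb)
{
	return MBUF_HEADROOM_UINT32(mb);
}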