/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char *dma_type;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350.
		 * It fixes a bug where IPv4 forwarding in the guest
		 * cannot forward packets from one virtio dev to
		 * another virtio dev.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

static inline int
open_dma(const char *value)
{
	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* parse number string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if Rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if Rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}

enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE            "dma-type"
	OPT_DMA_TYPE_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
};

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMA_TYPE, required_argument,
				NULL, OPT_DMA_TYPE_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMA_TYPE_NUM:
			dma_type = optarg;
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			async_vhost_driver = 1;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to the number of
 * system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}

}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (async_vhost_driver) {
		uint16_t enqueue_fail = 0;

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

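	/*
	 * In async mode, successfully submitted mbufs are freed later, in
	 * complete_async_pkts(), once the DMA copies have completed.
	 */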
	if (!async_vhost_driver)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
						+ vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the length of the vlan
	 * tag, so the packet length needs to be restored by adding it back.
	 */
	*offset = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (async_vhost_driver) {
		uint16_t enqueue_fail = 0;

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count);
		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

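	/*
	 * Route each dequeued packet, either to another local vhost device
	 * (VM2VM) or out via the physical port, through virtio_tx_route().
	 */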
	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (async_vhost_driver) {
		uint16_t n_pkt = 0;
		struct rte_mbuf *m_cpl[vdev->pkts_inflight];

		while (vdev->pkts_inflight) {
			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
						m_cpl, vdev->pkts_inflight);
			free_pkts(m_cpl, n_pkt);
			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
		}

		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
	}

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	if (async_vhost_driver) {
		struct rte_vhost_async_config config = {0};
		struct rte_vhost_async_channel_ops channel_ops;

		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
			channel_ops.transfer_data = ioat_transfer_data_cb;
			channel_ops.check_completed_copies =
				ioat_check_completed_copies_cb;

			config.features = RTE_VHOST_ASYNC_INORDER;

			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				config, &channel_ops);
		}
	}

	return 0;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (async_vhost_driver) {
		if (!enable) {
			uint16_t n_pkt = 0;
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This is a thread that wakes up periodically to print stats if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to make some reservation for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's normally
 *   a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS, and get
	 * the value of var VALID_NUM_PORTS according to the number of
	 * system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (async_vhost_driver)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}