/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * it fixes the bug that IPv4 forwarding in the guest can't
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}

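/*
 * Hedged usage note (derived from the parser below, not from external docs):
 * the "--dmas" argument is expected as a bracketed, comma-separated list of
 * txd<N>@<DMA device name> entries, where N is the vhost device id whose
 * enqueue (Rx) path is bound to that DMA channel. An illustrative example,
 * with platform-dependent device names:
 *
 *     --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1]
 */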
static inline int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t vid;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* process DMA devices within the brackets. */
	addrs++;
	substr = strtok(addrs, ";]");
	if (!substr) {
		ret = -1;
		goto out;
	}

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
	if (args_nr <= 0) {
		ret = -1;
		goto out;
	}

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		uint8_t sub_nr;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
		if (sub_nr != 2) {
			ret = -1;
			goto out;
		}

		start = strstr(ptrs[0], "txd");
		if (start == NULL) {
			ret = -1;
			goto out;
		}

		start += 3;
		vid = strtol(start, &end, 0);
		if (end == start) {
			ret = -1;
			goto out;
		}

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by DMA device */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
		i++;
	}
out:
	free(input);
	return ret;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
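/*
 * Illustrative example of the map built below: with num_devices == 2 the
 * result is nb_queue_pools = 2 and
 *
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x1 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }
 *
 * i.e. vlan_tags[i] steers traffic into VMDQ pool i.
 */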
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
			  sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* reject paths that are too long */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Takes effect only if Rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on Rx. Takes effect only if Rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segment offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}

enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
};

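/*
 * Illustrative invocation only (binary name, EAL options, port mask and
 * socket path are assumptions that depend on the build and the setup),
 * exercising the options parsed below:
 *
 *     ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --mergeable 1 --stats 1 \
 *         --socket-file /tmp/vhost-net0 --client
 */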
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

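/*
 * Flush the per-core TX buffer of a vhost device into its Rx virtqueue.
 * Note on mbuf ownership, as implemented below: on the synchronous paths
 * the mbufs are freed right after the enqueue call, while on the
 * DMA-accelerated path only packets that failed to be submitted are freed
 * here; successfully submitted ones are released later by
 * complete_async_pkts() once their copies complete.
 */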
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t enqueue_fail = 0;
		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet is a local VM,
 * and if so get its vlan tag and offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip will reduce the packet length by the length
	 * of the vlan tag, so we need to restore the packet length
	 * by adding it back.
	 */
	*offset  = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}

static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t enqueue_fail = 0;
		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of them to the guest virtio Rx ring associated
 *      with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of them
 *      to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t n_pkt = 0;
		int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
		struct rte_mbuf *m_cpl[vdev->pkts_inflight];

		while (vdev->pkts_inflight) {
			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
						m_cpl, vdev->pkts_inflight, dma_id, 0);
			free_pkts(m_cpl, n_pkt);
			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
		}

		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		int ret;

		ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (ret == 0)
			dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
		return ret;
	}

	return 0;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (dma_bind[vid].dmas[queue_id].async_enabled) {
		if (!enable) {
			uint16_t n_pkt = 0;
			int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight, dma_id, 0);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This is a thread that wakes up periodically to print stats if the user
 * has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:              %" PRIu64 "\n"
				"TX dropped:            %" PRIu64 "\n"
				"TX successful:         %" PRIu64 "\n"
				"RX total:              %" PRIu64 "\n"
				"RX dropped:            %" PRIu64 "\n"
				"RX successful:         %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we also need to make some reservation for receiving the packets
 *   from the virtio Tx queue. How many is enough depends on the usage.
 *   It's normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
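/*
 * A rough worked example (illustrative, assuming the defaults used in this
 * file: mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE, i.e. 2048-byte data room plus
 * RTE_PKTMBUF_HEADROOM, nr_rx_desc = 1024, and mtu = 1500 when neither
 * mergeable buffers nor TSO are enabled). Mirroring the arithmetic below:
 *
 *     nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
 *                         (mbuf_size - RTE_PKTMBUF_HEADROOM) + nr_rx_desc
 *     nr_mbufs          = (nr_queues * nr_rx_desc +
 *                          nr_mbufs_per_core * nr_switch_core) * nr_port
 *
 * With mergeable buffers or TSO the mtu term grows to 9000 or 64 KB
 * respectively, which is why those modes need far more mbufs.
 */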
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (dma_count)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}