/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * without it, IPv4 forwarding in the guest cannot
		 * forward packets from one virtio device to another.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batch pkts enqueue for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}

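/*
 * Parse the "--dmas" argument. Based on the parsing below, the expected form
 * is a bracketed, comma-separated list of txd<vid>@<DMA device name> entries,
 * for example (illustrative only, device names depend on the system):
 *
 *     --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1]
 *
 * where each device name must be resolvable by rte_dma_get_dev_id_by_name().
 */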
static inline int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t vid;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* process DMA devices within bracket. */
	addrs++;
	substr = strtok(addrs, ";]");
	if (!substr) {
		ret = -1;
		goto out;
	}

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
	if (args_nr <= 0) {
		ret = -1;
		goto out;
	}

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		uint8_t sub_nr;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
		if (sub_nr != 2) {
			ret = -1;
			goto out;
		}

		start = strstr(ptrs[0], "txd");
		if (start == NULL) {
			ret = -1;
			goto out;
		}

		start += 3;
		vid = strtol(start, &end, 0);
		if (end == start) {
			ret = -1;
			goto out;
		}

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by DMA device */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
		i++;
	}
out:
	free(input);
	return ret;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
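/*
 * For example (illustrative): with num_devices == 8, pools 0..7 are mapped
 * one-to-one to VLAN IDs 1000..1007 taken from vlan_tags[] above, and each
 * pool map entry enables only its own pool bit.
 */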
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* check that the socket path fits */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if Rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if Rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segment offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}

enum {
#define OPT_VM2VM "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS "dmas"
	OPT_DMAS_NUM,
};

/*
 * Parse the arguments given in the command line of the application.
 */
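/*
 * Example invocation (illustrative only; EAL options, PCI addresses and DMA
 * device names depend on the target system):
 *
 *     dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *         --client --dmas [txd0@0000:00:04.0]
 */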
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global variable num_ports and the ports[] array according to the
 * number of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it, along
 * with a VLAN tag, with a VMDq pool.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}

}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

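/*
 * Flush the per-lcore TX buffer accumulated for a given vhost device.
 * Note (explanatory, based on the code below): on the async (DMA) path,
 * completed copies are reaped first via complete_async_pkts(), the burst is
 * submitted with rte_vhost_submit_enqueue_burst(), and only the packets that
 * could not be enqueued are freed here; packets handed to the DMA engine are
 * freed later when their completions are polled. On the sync paths, vhost
 * copies the packets during the call, so the whole burst is freed at the end.
 */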
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t enqueue_fail = 0;
		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
		__atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its VLAN tag and length offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip reduces the packet length by the length of the VLAN
	 * tag, so the packet length must be restored by adding it back.
	 */
	*offset = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

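/*
 * Prepare TSO metadata for a large packet dequeued from the guest
 * (explanatory note, based on the code below): the mbuf l2/l3/l4 lengths and
 * offload flags are filled in, the IPv4 header checksum is cleared for NIC
 * recomputation, and the TCP checksum is seeded with the pseudo-header
 * checksum as required by hardware TSO/checksum offload.
 */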
static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, wait and retry when there are not
	 * enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t enqueue_fail = 0;
		int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);

	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		uint16_t n_pkt = 0;
		int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
		struct rte_mbuf *m_cpl[vdev->pkts_inflight];

		while (vdev->pkts_inflight) {
			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
						m_cpl, vdev->pkts_inflight, dma_id, 0);
			free_pkts(m_cpl, n_pkt);
			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
		}

		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
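	/*
	 * Note (explanatory): the data cores busy-poll the virtio rings in
	 * switch_worker(), so guest->host kick notifications are not needed
	 * and are turned off for both queues to avoid the kick overhead.
	 */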
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		int ret;

		ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (ret == 0)
			dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
		return ret;
	}

	return 0;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (dma_bind[vid].dmas[queue_id].async_enabled) {
		if (!enable) {
			uint16_t n_pkt = 0;
			int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight, dma_id, 0);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to make some reservation for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's normally
 *   a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
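/*
 * Worked example (illustrative, assuming MAX_PKT_BURST is 32 and the default
 * 2176-byte mbuf with a 128-byte headroom): with mergeable buffers enabled
 * (mtu = 9000), the per-core reservation below is
 * (9000 + 2176) * 32 / (2176 - 128) ~= 174 mbufs, plus nr_rx_desc (1024);
 * that per-core figure is multiplied by the number of switch cores and added
 * to the nr_queues * nr_rx_desc reservation for the NIC Rx queues.
 */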
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the NIC's L2 switch handles VM2VM traffic. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (dma_count)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}