/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

#define NUM_MBUFS_DEFAULT 0x24000

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4  /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

#define ASYNC_ENQUEUE_VHOST 1
#define ASYNC_DEQUEUE_VHOST 2

/* number of mbufs in all pools - if specified on command-line. */
static int total_num_mbufs = NUM_MBUFS_DEFAULT;

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];

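/*
 * Note: the per-device enqueue/dequeue function pointers above are
 * selected in init_vhost_queue_ops() below: either the builtin net
 * driver, the DMA-accelerated async path, or the plain synchronous
 * vhost API, depending on the command-line options.
 */
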
/* Empty VMDq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * without it, IPv4 forwarding in the guest cannot forward
		 * packets from one virtio dev to another virtio dev.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used to batch packet enqueues for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

static int vid2socketid[RTE_MAX_VHOST_DEVICE];

static inline uint32_t
get_async_flag_by_socketid(int socketid)
{
	return dma_bind[socketid].async_flag;
}

static inline void
init_vid2socketid_array(int vid, int socketid)
{
	vid2socketid[vid] = socketid;
}

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}

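/*
 * Parse the "--dmas" argument, which binds DMA devices to vhost Tx/Rx
 * queues. Based on the parsing below, the expected layout is a
 * bracketed, comma-separated list of "txd<N>@<dma-dev>" and
 * "rxd<N>@<dma-dev>" entries, where <N> is the vhost socket index and
 * <dma-dev> is a name accepted by rte_dma_get_dev_id_by_name(), e.g.
 * (illustrative addresses only):
 *     --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
 */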
static inline int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t socketid, vring_id;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* process DMA devices within bracket. */
	addrs++;
	substr = strtok(addrs, ";]");
	if (!substr) {
		ret = -1;
		goto out;
	}

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
	if (args_nr <= 0) {
		ret = -1;
		goto out;
	}

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		char *txd, *rxd;
		uint8_t sub_nr;
		int async_flag;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
		if (sub_nr != 2) {
			ret = -1;
			goto out;
		}

		txd = strstr(ptrs[0], "txd");
		rxd = strstr(ptrs[0], "rxd");
		if (txd) {
			start = txd;
			vring_id = VIRTIO_RXQ;
			async_flag = ASYNC_ENQUEUE_VHOST;
		} else if (rxd) {
			start = rxd;
			vring_id = VIRTIO_TXQ;
			async_flag = ASYNC_DEQUEUE_VHOST;
		} else {
			ret = -1;
			goto out;
		}

		start += 3;
		socketid = strtol(start, &end, 0);
		if (end == start) {
			ret = -1;
			goto out;
		}

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by DMA device */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
		(dma_info + socketid)->async_flag |= async_flag;
		i++;
	}
out:
	free(input);
	return ret;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

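/*
 * With the loop above, VMDq pool i is matched to VLAN ID vlan_tags[i],
 * so (for example) pool 0 receives frames tagged 1000 and pool 7
 * receives frames tagged 1007, mirroring the vlan_tags[] table.
 */
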
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (mergeable) {
		if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
			vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
		else
			vmdq_conf_default.rxmode.mtu = MAX_MTU;
	}

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}

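/*
 * For illustration only (the real values come from the PMD): if the
 * port reported max_rx_queues = 128, vmdq_queue_num = 64 and
 * max_vmdq_pools = 8, the code above would compute num_pf_queues = 64,
 * queues_per_pool = 8 and num_vmdq_queues = 64, i.e. one VMDq pool of
 * 8 queues per virtio device.
 */
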
/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* parse number string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;

}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --socket-file <path>\n"
	" --nb-devices ND\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if retries on Rx are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if retries on Rx are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --socket-file: The path of the socket file.\n"
	" --tx-csum [0|1] disable/enable TX checksum offload.\n"
	" --tso [0|1] disable/enable TCP segmentation offload.\n"
	" --client register a vhost-user socket as client mode.\n"
	" --dmas register dma channel for specific vhost device.\n"
	" --total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
	prgname);
}

enum {
#define OPT_VM2VM "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS "dmas"
	OPT_DMAS_NUM,
#define OPT_NUM_MBUFS "total-num-mbufs"
	OPT_NUM_MBUFS_NUM,
};

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{OPT_NUM_MBUFS, required_argument,
				NULL, OPT_NUM_MBUFS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_NUM_MBUFS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for total-num-mbufs [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}

			if (total_num_mbufs < ret)
				total_num_mbufs = ret;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count)
		free_pkts(p_cpl, complete_count);

}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}

static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

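/*
 * Note on the buffering scheme used above: each lcore owns one
 * vhost_bufftable per vhost device, indexed as
 * lcore_id * RTE_MAX_VHOST_DEVICE + vid. A buffer is flushed either
 * when it fills up to MAX_PKT_BURST packets or when more than roughly
 * BURST_TX_DRAIN_US microseconds (MBUF_TABLE_DRAIN_TSC) have passed
 * since it was last flushed.
 */
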
/*
 * Check if the packet destination MAC address is for a local device. If so, then put
 * the packet on that device's RX queue. If not, then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip will reduce the packet length by the length of
	 * the VLAN tag, so we need to restore the packet length by
	 * adding it back.
	 */
	*offset = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

uint16_t
async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	uint16_t enqueue_count;
	uint16_t enqueue_fail = 0;
	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_async_pkts(dev);
	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
				pkts, rx_count, dma_id, 0);

	enqueue_fail = rx_count - enqueue_count;
	if (enqueue_fail)
		free_pkts(&pkts[enqueue_count], enqueue_fail);

	return enqueue_count;
}

uint16_t
sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count
	 * packets, to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
					VIRTIO_RXQ, pkts, rx_count);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}

uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			    struct rte_mempool *mbuf_pool,
			    struct rte_mbuf **pkts, uint16_t count)
{
	int nr_inflight;
	uint16_t dequeue_count;
	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;

	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);

	return dequeue_count;
}

uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			   struct rte_mempool *mbuf_pool,
			   struct rte_mbuf **pkts, uint16_t count)
{
	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device or
 *      the physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}

static void
vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
							pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
									queue_id);
	}
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
	}

	rte_free(vdev);
}

static inline int
get_socketid_by_vid(int vid)
{
	int i;
	char ifname[PATH_MAX];
	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));

	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		if (strcmp(file, ifname) == 0)
			return i;
	}

	return -1;
}

static int
init_vhost_queue_ops(int vid)
{
	if (builtin_net_driver) {
		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
	} else {
		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
		else
			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;

		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
		else
			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
	}

	return 0;
}

static inline int
vhost_async_channel_register(int vid)
{
	int rx_ret = 0, tx_ret = 0;

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (rx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
	}

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
		if (tx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
	}

	return rx_ret | tx_ret;
}



/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	int ret;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	int socketid = get_socketid_by_vid(vid);
	if (socketid == -1)
		return -1;

	init_vid2socketid_array(vid, socketid);

	ret = vhost_async_channel_register(vid);

	if (init_vhost_queue_ops(vid) != 0)
		return -1;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return ret;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
		if (!enable)
			vhost_clear_queue_thread_unsafe(vdev, queue_id);
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This is a thread that will wake up after a period to print stats if the user
 * has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback to let the NIC's L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (dma_count && get_async_flag_by_socketid(i) != 0)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}
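
/*
 * Illustrative invocation only (binary name, core list, socket path and
 * DMA addresses are examples, not part of this file):
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --mergeable 1 --vm2vm 1 \
 *       --socket-file /tmp/vhost-net0 --client \
 *       --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
 */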