1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2017 Intel Corporation 3 */ 4 5 #include <ctype.h> 6 #include <arpa/inet.h> 7 #include <getopt.h> 8 #include <linux/if_ether.h> 9 #include <linux/if_vlan.h> 10 #include <linux/virtio_net.h> 11 #include <linux/virtio_ring.h> 12 #include <signal.h> 13 #include <stdint.h> 14 #include <stdlib.h> 15 #include <sys/eventfd.h> 16 #include <sys/param.h> 17 #include <unistd.h> 18 19 #include <rte_cycles.h> 20 #include <rte_ethdev.h> 21 #include <rte_log.h> 22 #include <rte_string_fns.h> 23 #include <rte_malloc.h> 24 #include <rte_net.h> 25 #include <rte_vhost.h> 26 #include <rte_ip.h> 27 #include <rte_tcp.h> 28 #include <rte_pause.h> 29 #include <rte_dmadev.h> 30 #include <rte_vhost_async.h> 31 #include <rte_thread.h> 32 33 #include "main.h" 34 35 #ifndef MAX_QUEUES 36 #define MAX_QUEUES 128 37 #endif 38 39 #define NUM_MBUFS_DEFAULT 0x24000 40 41 /* the maximum number of external ports supported */ 42 #define MAX_SUP_PORTS 1 43 44 #define MBUF_CACHE_SIZE 128 45 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 46 47 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 48 49 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 50 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 51 52 #define JUMBO_FRAME_MAX_SIZE 0x2600 53 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)) 54 55 /* State of virtio device. */ 56 #define DEVICE_MAC_LEARNING 0 57 #define DEVICE_RX 1 58 #define DEVICE_SAFE_REMOVE 2 59 60 /* Configurable number of RX/TX ring descriptors */ 61 #define RX_DESC_DEFAULT 1024 62 #define TX_DESC_DEFAULT 512 63 64 #define INVALID_PORT_ID 0xFF 65 #define INVALID_DMA_ID -1 66 67 #define DMA_RING_SIZE 4096 68 69 #define ASYNC_ENQUEUE_VHOST 1 70 #define ASYNC_DEQUEUE_VHOST 2 71 72 /* number of mbufs in all pools - if specified on command-line. */ 73 static int total_num_mbufs = NUM_MBUFS_DEFAULT; 74 75 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE]; 76 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX]; 77 static int dma_count; 78 79 /* mask of enabled ports */ 80 static uint32_t enabled_port_mask = 0; 81 82 /* Promiscuous mode */ 83 static uint32_t promiscuous; 84 85 /* number of devices/queues to support*/ 86 static uint32_t num_queues = 0; 87 static uint32_t num_devices; 88 89 static struct rte_mempool *mbuf_pool; 90 static int mergeable; 91 92 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 93 typedef enum { 94 VM2VM_DISABLED = 0, 95 VM2VM_SOFTWARE = 1, 96 VM2VM_HARDWARE = 2, 97 VM2VM_LAST 98 } vm2vm_type; 99 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 100 101 /* Enable stats. */ 102 static uint32_t enable_stats = 0; 103 /* Enable retries on RX. */ 104 static uint32_t enable_retry = 1; 105 106 /* Disable TX checksum offload */ 107 static uint32_t enable_tx_csum; 108 109 /* Disable TSO offload */ 110 static uint32_t enable_tso; 111 112 static int client_mode; 113 114 static int builtin_net_driver; 115 116 /* Specify timeout (in useconds) between retries on RX. */ 117 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 118 /* Specify the number of retries on RX. */ 119 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 120 121 /* Socket file paths. Can be set by user */ 122 static char *socket_files; 123 static int nb_sockets; 124 125 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE]; 126 127 /* empty VMDq configuration structure. 
Filled in programmatically */ 128 static struct rte_eth_conf vmdq_conf_default = { 129 .rxmode = { 130 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY, 131 /* 132 * VLAN strip is necessary for 1G NIC such as I350, 133 * this fixes bug of ipv4 forwarding in guest can't 134 * forward packets from one virtio dev to another virtio dev. 135 */ 136 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP, 137 }, 138 139 .txmode = { 140 .mq_mode = RTE_ETH_MQ_TX_NONE, 141 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | 142 RTE_ETH_TX_OFFLOAD_TCP_CKSUM | 143 RTE_ETH_TX_OFFLOAD_VLAN_INSERT | 144 RTE_ETH_TX_OFFLOAD_MULTI_SEGS | 145 RTE_ETH_TX_OFFLOAD_TCP_TSO), 146 }, 147 .rx_adv_conf = { 148 /* 149 * should be overridden separately in code with 150 * appropriate values 151 */ 152 .vmdq_rx_conf = { 153 .nb_queue_pools = RTE_ETH_8_POOLS, 154 .enable_default_pool = 0, 155 .default_pool = 0, 156 .nb_pool_maps = 0, 157 .pool_map = {{0, 0},}, 158 }, 159 }, 160 }; 161 162 163 static unsigned lcore_ids[RTE_MAX_LCORE]; 164 static uint16_t ports[RTE_MAX_ETHPORTS]; 165 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 166 static uint16_t num_pf_queues, num_vmdq_queues; 167 static uint16_t vmdq_pool_base, vmdq_queue_base; 168 static uint16_t queues_per_pool; 169 170 const uint16_t vlan_tags[] = { 171 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 172 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 173 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 174 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 175 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 176 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 177 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 178 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 179 }; 180 181 /* ethernet addresses of ports */ 182 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 183 184 static struct vhost_dev_tailq_list vhost_dev_list = 185 TAILQ_HEAD_INITIALIZER(vhost_dev_list); 186 187 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 188 189 /* Used for queueing bursts of TX packets. */ 190 struct mbuf_table { 191 unsigned len; 192 unsigned txq_id; 193 struct rte_mbuf *m_table[MAX_PKT_BURST]; 194 }; 195 196 struct vhost_bufftable { 197 uint32_t len; 198 uint64_t pre_tsc; 199 struct rte_mbuf *m_table[MAX_PKT_BURST]; 200 }; 201 202 /* TX queue for each data core. */ 203 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 204 205 /* 206 * Vhost TX buffer for each data core. 207 * Every data core maintains a TX buffer for every vhost device, 208 * which is used for batch pkts enqueue for higher performance. 
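 *
 * The table is one flat array indexed by (lcore, vid): the buffer a worker
 * core uses for a given vhost device is looked up exactly as drain_vhost()
 * and virtio_tx_local() do below, e.g. (illustrative only):
 *
 *   struct vhost_bufftable *txq =
 *       vhost_txbuff[rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid];
 *   txq->m_table[txq->len++] = m;      // batch the mbuf
 *   if (txq->len == MAX_PKT_BURST)     // flush a full burst
 *       drain_vhost(vdev);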
209 */ 210 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE]; 211 212 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \ 213 / US_PER_S * BURST_TX_DRAIN_US) 214 215 static int vid2socketid[RTE_MAX_VHOST_DEVICE]; 216 217 static inline uint32_t 218 get_async_flag_by_socketid(int socketid) 219 { 220 return dma_bind[socketid].async_flag; 221 } 222 223 static inline void 224 init_vid2socketid_array(int vid, int socketid) 225 { 226 vid2socketid[vid] = socketid; 227 } 228 229 static inline bool 230 is_dma_configured(int16_t dev_id) 231 { 232 int i; 233 234 for (i = 0; i < dma_count; i++) 235 if (dmas_id[i] == dev_id) 236 return true; 237 return false; 238 } 239 240 static inline int 241 open_dma(const char *value) 242 { 243 struct dma_for_vhost *dma_info = dma_bind; 244 char *input = strndup(value, strlen(value) + 1); 245 char *addrs = input; 246 char *ptrs[2]; 247 char *start, *end, *substr; 248 int64_t socketid, vring_id; 249 250 struct rte_dma_info info; 251 struct rte_dma_conf dev_config = { .nb_vchans = 1 }; 252 struct rte_dma_vchan_conf qconf = { 253 .direction = RTE_DMA_DIR_MEM_TO_MEM, 254 .nb_desc = DMA_RING_SIZE 255 }; 256 257 int dev_id; 258 int ret = 0; 259 uint16_t i = 0; 260 char *dma_arg[RTE_MAX_VHOST_DEVICE]; 261 int args_nr; 262 263 while (isblank(*addrs)) 264 addrs++; 265 if (*addrs == '\0') { 266 ret = -1; 267 goto out; 268 } 269 270 /* process DMA devices within bracket. */ 271 addrs++; 272 substr = strtok(addrs, ";]"); 273 if (!substr) { 274 ret = -1; 275 goto out; 276 } 277 278 args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ','); 279 if (args_nr <= 0) { 280 ret = -1; 281 goto out; 282 } 283 284 while (i < args_nr) { 285 char *arg_temp = dma_arg[i]; 286 char *txd, *rxd; 287 uint8_t sub_nr; 288 int async_flag; 289 290 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@'); 291 if (sub_nr != 2) { 292 ret = -1; 293 goto out; 294 } 295 296 txd = strstr(ptrs[0], "txd"); 297 rxd = strstr(ptrs[0], "rxd"); 298 if (txd) { 299 start = txd; 300 vring_id = VIRTIO_RXQ; 301 async_flag = ASYNC_ENQUEUE_VHOST; 302 } else if (rxd) { 303 start = rxd; 304 vring_id = VIRTIO_TXQ; 305 async_flag = ASYNC_DEQUEUE_VHOST; 306 } else { 307 ret = -1; 308 goto out; 309 } 310 311 start += 3; 312 socketid = strtol(start, &end, 0); 313 if (end == start) { 314 ret = -1; 315 goto out; 316 } 317 318 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]); 319 if (dev_id < 0) { 320 RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]); 321 ret = -1; 322 goto out; 323 } 324 325 /* DMA device is already configured, so skip */ 326 if (is_dma_configured(dev_id)) 327 goto done; 328 329 if (rte_dma_info_get(dev_id, &info) != 0) { 330 RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n"); 331 ret = -1; 332 goto out; 333 } 334 335 if (info.max_vchans < 1) { 336 RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id); 337 ret = -1; 338 goto out; 339 } 340 341 if (rte_dma_configure(dev_id, &dev_config) != 0) { 342 RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id); 343 ret = -1; 344 goto out; 345 } 346 347 /* Check the max desc supported by DMA device */ 348 rte_dma_info_get(dev_id, &info); 349 if (info.nb_vchans != 1) { 350 RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n", 351 dev_id); 352 ret = -1; 353 goto out; 354 } 355 356 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc); 357 358 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) { 359 RTE_LOG(ERR, VHOST_CONFIG, "Fail to set 
up DMA %d.\n", dev_id); 360 ret = -1; 361 goto out; 362 } 363 364 if (rte_dma_start(dev_id) != 0) { 365 RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id); 366 ret = -1; 367 goto out; 368 } 369 370 dmas_id[dma_count++] = dev_id; 371 372 done: 373 (dma_info + socketid)->dmas[vring_id].dev_id = dev_id; 374 (dma_info + socketid)->async_flag |= async_flag; 375 i++; 376 } 377 out: 378 free(input); 379 return ret; 380 } 381 382 /* 383 * Builds up the correct configuration for VMDQ VLAN pool map 384 * according to the pool & queue limits. 385 */ 386 static inline int 387 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 388 { 389 struct rte_eth_vmdq_rx_conf conf; 390 struct rte_eth_vmdq_rx_conf *def_conf = 391 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 392 unsigned i; 393 394 memset(&conf, 0, sizeof(conf)); 395 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 396 conf.nb_pool_maps = num_devices; 397 conf.enable_loop_back = def_conf->enable_loop_back; 398 conf.rx_mode = def_conf->rx_mode; 399 400 for (i = 0; i < conf.nb_pool_maps; i++) { 401 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 402 conf.pool_map[i].pools = (1UL << i); 403 } 404 405 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 406 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, 407 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 408 return 0; 409 } 410 411 /* 412 * Initialises a given port using global settings and with the rx buffers 413 * coming from the mbuf_pool passed as parameter 414 */ 415 static inline int 416 port_init(uint16_t port) 417 { 418 struct rte_eth_dev_info dev_info; 419 struct rte_eth_conf port_conf; 420 struct rte_eth_rxconf *rxconf; 421 struct rte_eth_txconf *txconf; 422 int16_t rx_rings, tx_rings; 423 uint16_t rx_ring_size, tx_ring_size; 424 int retval; 425 uint16_t q; 426 427 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 428 retval = rte_eth_dev_info_get(port, &dev_info); 429 if (retval != 0) { 430 RTE_LOG(ERR, VHOST_PORT, 431 "Error during getting device (port %u) info: %s\n", 432 port, strerror(-retval)); 433 434 return retval; 435 } 436 if (dev_info.max_vmdq_pools == 0) { 437 RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n"); 438 return -1; 439 } 440 441 rxconf = &dev_info.default_rxconf; 442 txconf = &dev_info.default_txconf; 443 rxconf->rx_drop_en = 1; 444 445 /*configure the number of supported virtio devices based on VMDQ limits */ 446 num_devices = dev_info.max_vmdq_pools; 447 448 rx_ring_size = RX_DESC_DEFAULT; 449 tx_ring_size = TX_DESC_DEFAULT; 450 451 tx_rings = (uint16_t)rte_lcore_count(); 452 453 if (mergeable) { 454 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu) 455 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu; 456 else 457 vmdq_conf_default.rxmode.mtu = MAX_MTU; 458 } 459 460 /* Get port configuration. */ 461 retval = get_eth_conf(&port_conf, num_devices); 462 if (retval < 0) 463 return retval; 464 /* NIC queues are divided into pf queues and vmdq queues. 
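 * The VMDq range starts at dev_info.vmdq_queue_base and spans
 * dev_info.vmdq_queue_num queues; the code below treats everything else
 * (max_rx_queues - vmdq_queue_num) as PF queues. As a purely hypothetical
 * example, a NIC reporting max_rx_queues = 192, vmdq_queue_num = 128 and
 * max_vmdq_pools = 64 would give num_pf_queues = 64, queues_per_pool = 2
 * and num_vmdq_queues = num_devices * 2; the real numbers depend on the NIC.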
*/ 465 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 466 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 467 num_vmdq_queues = num_devices * queues_per_pool; 468 num_queues = num_pf_queues + num_vmdq_queues; 469 vmdq_queue_base = dev_info.vmdq_queue_base; 470 vmdq_pool_base = dev_info.vmdq_pool_base; 471 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 472 num_pf_queues, num_devices, queues_per_pool); 473 474 if (!rte_eth_dev_is_valid_port(port)) 475 return -1; 476 477 rx_rings = (uint16_t)dev_info.max_rx_queues; 478 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 479 port_conf.txmode.offloads |= 480 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 481 /* Configure ethernet device. */ 482 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 483 if (retval != 0) { 484 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n", 485 port, strerror(-retval)); 486 return retval; 487 } 488 489 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size, 490 &tx_ring_size); 491 if (retval != 0) { 492 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors " 493 "for port %u: %s.\n", port, strerror(-retval)); 494 return retval; 495 } 496 if (rx_ring_size > RX_DESC_DEFAULT) { 497 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size " 498 "for Rx queues on port %u.\n", port); 499 return -1; 500 } 501 502 /* Setup the queues. */ 503 rxconf->offloads = port_conf.rxmode.offloads; 504 for (q = 0; q < rx_rings; q ++) { 505 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 506 rte_eth_dev_socket_id(port), 507 rxconf, 508 mbuf_pool); 509 if (retval < 0) { 510 RTE_LOG(ERR, VHOST_PORT, 511 "Failed to setup rx queue %u of port %u: %s.\n", 512 q, port, strerror(-retval)); 513 return retval; 514 } 515 } 516 txconf->offloads = port_conf.txmode.offloads; 517 for (q = 0; q < tx_rings; q ++) { 518 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 519 rte_eth_dev_socket_id(port), 520 txconf); 521 if (retval < 0) { 522 RTE_LOG(ERR, VHOST_PORT, 523 "Failed to setup tx queue %u of port %u: %s.\n", 524 q, port, strerror(-retval)); 525 return retval; 526 } 527 } 528 529 /* Start the device. */ 530 retval = rte_eth_dev_start(port); 531 if (retval < 0) { 532 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n", 533 port, strerror(-retval)); 534 return retval; 535 } 536 537 if (promiscuous) { 538 retval = rte_eth_promiscuous_enable(port); 539 if (retval != 0) { 540 RTE_LOG(ERR, VHOST_PORT, 541 "Failed to enable promiscuous mode on port %u: %s\n", 542 port, rte_strerror(-retval)); 543 return retval; 544 } 545 } 546 547 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 548 if (retval < 0) { 549 RTE_LOG(ERR, VHOST_PORT, 550 "Failed to get MAC address on port %u: %s\n", 551 port, rte_strerror(-retval)); 552 return retval; 553 } 554 555 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 556 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 557 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 558 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port])); 559 560 return 0; 561 } 562 563 /* 564 * Set socket file path. 
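 * Each path is stored in its own fixed PATH_MAX-byte slot of one flat,
 * realloc()ed buffer, so the i-th socket path is simply
 * (socket_files + i * PATH_MAX); main() and unregister_drivers() rely on
 * that layout when they walk the list.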
565 */ 566 static int 567 us_vhost_parse_socket_path(const char *q_arg) 568 { 569 char *old; 570 571 /* parse number string */ 572 if (strnlen(q_arg, PATH_MAX) == PATH_MAX) 573 return -1; 574 575 old = socket_files; 576 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1)); 577 if (socket_files == NULL) { 578 free(old); 579 return -1; 580 } 581 582 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX); 583 nb_sockets++; 584 585 return 0; 586 } 587 588 /* 589 * Parse the portmask provided at run time. 590 */ 591 static int 592 parse_portmask(const char *portmask) 593 { 594 char *end = NULL; 595 unsigned long pm; 596 597 errno = 0; 598 599 /* parse hexadecimal string */ 600 pm = strtoul(portmask, &end, 16); 601 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 602 return 0; 603 604 return pm; 605 606 } 607 608 /* 609 * Parse num options at run time. 610 */ 611 static int 612 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 613 { 614 char *end = NULL; 615 unsigned long num; 616 617 errno = 0; 618 619 /* parse unsigned int string */ 620 num = strtoul(q_arg, &end, 10); 621 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 622 return -1; 623 624 if (num > max_valid_value) 625 return -1; 626 627 return num; 628 629 } 630 631 /* 632 * Display usage 633 */ 634 static void 635 us_vhost_usage(const char *prgname) 636 { 637 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 638 " --vm2vm [0|1|2]\n" 639 " --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n" 640 " --socket-file <path>\n" 641 " -p PORTMASK: Set mask for ports to be used by application\n" 642 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 643 " --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n" 644 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 645 " --rx-retry-num [0-N]: the number of retries on rx. 
This makes effect only if retries on rx enabled\n" 646 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 647 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 648 " --socket-file: The path of the socket file.\n" 649 " --tx-csum [0|1]: disable/enable TX checksum offload.\n" 650 " --tso [0|1]: disable/enable TCP segment offload.\n" 651 " --client: register a vhost-user socket as client mode.\n" 652 " --dmas: register dma channel for specific vhost device.\n" 653 " --total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n" 654 " --builtin-net-driver: enable simple vhost-user net driver\n", 655 prgname); 656 } 657 658 enum { 659 #define OPT_VM2VM "vm2vm" 660 OPT_VM2VM_NUM = 256, 661 #define OPT_RX_RETRY "rx-retry" 662 OPT_RX_RETRY_NUM, 663 #define OPT_RX_RETRY_DELAY "rx-retry-delay" 664 OPT_RX_RETRY_DELAY_NUM, 665 #define OPT_RX_RETRY_NUMB "rx-retry-num" 666 OPT_RX_RETRY_NUMB_NUM, 667 #define OPT_MERGEABLE "mergeable" 668 OPT_MERGEABLE_NUM, 669 #define OPT_STATS "stats" 670 OPT_STATS_NUM, 671 #define OPT_SOCKET_FILE "socket-file" 672 OPT_SOCKET_FILE_NUM, 673 #define OPT_TX_CSUM "tx-csum" 674 OPT_TX_CSUM_NUM, 675 #define OPT_TSO "tso" 676 OPT_TSO_NUM, 677 #define OPT_CLIENT "client" 678 OPT_CLIENT_NUM, 679 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver" 680 OPT_BUILTIN_NET_DRIVER_NUM, 681 #define OPT_DMAS "dmas" 682 OPT_DMAS_NUM, 683 #define OPT_NUM_MBUFS "total-num-mbufs" 684 OPT_NUM_MBUFS_NUM, 685 }; 686 687 /* 688 * Parse the arguments given in the command line of the application. 689 */ 690 static int 691 us_vhost_parse_args(int argc, char **argv) 692 { 693 int opt, ret; 694 int option_index; 695 unsigned i; 696 const char *prgname = argv[0]; 697 static struct option long_option[] = { 698 {OPT_VM2VM, required_argument, 699 NULL, OPT_VM2VM_NUM}, 700 {OPT_RX_RETRY, required_argument, 701 NULL, OPT_RX_RETRY_NUM}, 702 {OPT_RX_RETRY_DELAY, required_argument, 703 NULL, OPT_RX_RETRY_DELAY_NUM}, 704 {OPT_RX_RETRY_NUMB, required_argument, 705 NULL, OPT_RX_RETRY_NUMB_NUM}, 706 {OPT_MERGEABLE, required_argument, 707 NULL, OPT_MERGEABLE_NUM}, 708 {OPT_STATS, required_argument, 709 NULL, OPT_STATS_NUM}, 710 {OPT_SOCKET_FILE, required_argument, 711 NULL, OPT_SOCKET_FILE_NUM}, 712 {OPT_TX_CSUM, required_argument, 713 NULL, OPT_TX_CSUM_NUM}, 714 {OPT_TSO, required_argument, 715 NULL, OPT_TSO_NUM}, 716 {OPT_CLIENT, no_argument, 717 NULL, OPT_CLIENT_NUM}, 718 {OPT_BUILTIN_NET_DRIVER, no_argument, 719 NULL, OPT_BUILTIN_NET_DRIVER_NUM}, 720 {OPT_DMAS, required_argument, 721 NULL, OPT_DMAS_NUM}, 722 {OPT_NUM_MBUFS, required_argument, 723 NULL, OPT_NUM_MBUFS_NUM}, 724 {NULL, 0, 0, 0}, 725 }; 726 727 /* Parse command line */ 728 while ((opt = getopt_long(argc, argv, "p:P", 729 long_option, &option_index)) != EOF) { 730 switch (opt) { 731 /* Portmask */ 732 case 'p': 733 enabled_port_mask = parse_portmask(optarg); 734 if (enabled_port_mask == 0) { 735 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 736 us_vhost_usage(prgname); 737 return -1; 738 } 739 break; 740 741 case 'P': 742 promiscuous = 1; 743 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 744 RTE_ETH_VMDQ_ACCEPT_BROADCAST | 745 RTE_ETH_VMDQ_ACCEPT_MULTICAST; 746 break; 747 748 case OPT_VM2VM_NUM: 749 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 750 if (ret == -1) { 751 RTE_LOG(INFO, VHOST_CONFIG, 752 "Invalid argument for " 753 "vm2vm [0|1|2]\n"); 754 us_vhost_usage(prgname); 755 return -1; 756 } 757 vm2vm_mode = (vm2vm_type)ret; 758 break; 759 760 
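/* Enable or disable Rx retries when the guest Rx ring is full (see also rx-retry-delay and rx-retry-num below). */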
case OPT_RX_RETRY_NUM: 761 ret = parse_num_opt(optarg, 1); 762 if (ret == -1) { 763 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 764 us_vhost_usage(prgname); 765 return -1; 766 } 767 enable_retry = ret; 768 break; 769 770 case OPT_TX_CSUM_NUM: 771 ret = parse_num_opt(optarg, 1); 772 if (ret == -1) { 773 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 774 us_vhost_usage(prgname); 775 return -1; 776 } 777 enable_tx_csum = ret; 778 break; 779 780 case OPT_TSO_NUM: 781 ret = parse_num_opt(optarg, 1); 782 if (ret == -1) { 783 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 784 us_vhost_usage(prgname); 785 return -1; 786 } 787 enable_tso = ret; 788 break; 789 790 case OPT_RX_RETRY_DELAY_NUM: 791 ret = parse_num_opt(optarg, INT32_MAX); 792 if (ret == -1) { 793 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 794 us_vhost_usage(prgname); 795 return -1; 796 } 797 burst_rx_delay_time = ret; 798 break; 799 800 case OPT_RX_RETRY_NUMB_NUM: 801 ret = parse_num_opt(optarg, INT32_MAX); 802 if (ret == -1) { 803 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 804 us_vhost_usage(prgname); 805 return -1; 806 } 807 burst_rx_retry_num = ret; 808 break; 809 810 case OPT_MERGEABLE_NUM: 811 ret = parse_num_opt(optarg, 1); 812 if (ret == -1) { 813 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 814 us_vhost_usage(prgname); 815 return -1; 816 } 817 mergeable = !!ret; 818 break; 819 820 case OPT_STATS_NUM: 821 ret = parse_num_opt(optarg, INT32_MAX); 822 if (ret == -1) { 823 RTE_LOG(INFO, VHOST_CONFIG, 824 "Invalid argument for stats [0..N]\n"); 825 us_vhost_usage(prgname); 826 return -1; 827 } 828 enable_stats = ret; 829 break; 830 831 /* Set socket file path. */ 832 case OPT_SOCKET_FILE_NUM: 833 if (us_vhost_parse_socket_path(optarg) == -1) { 834 RTE_LOG(INFO, VHOST_CONFIG, 835 "Invalid argument for socket name (Max %d characters)\n", 836 PATH_MAX); 837 us_vhost_usage(prgname); 838 return -1; 839 } 840 break; 841 842 case OPT_DMAS_NUM: 843 if (open_dma(optarg) == -1) { 844 RTE_LOG(INFO, VHOST_CONFIG, 845 "Wrong DMA args\n"); 846 us_vhost_usage(prgname); 847 return -1; 848 } 849 break; 850 851 case OPT_NUM_MBUFS_NUM: 852 ret = parse_num_opt(optarg, INT32_MAX); 853 if (ret == -1) { 854 RTE_LOG(INFO, VHOST_CONFIG, 855 "Invalid argument for total-num-mbufs [0..N]\n"); 856 us_vhost_usage(prgname); 857 return -1; 858 } 859 860 if (total_num_mbufs < ret) 861 total_num_mbufs = ret; 862 break; 863 864 case OPT_CLIENT_NUM: 865 client_mode = 1; 866 break; 867 868 case OPT_BUILTIN_NET_DRIVER_NUM: 869 builtin_net_driver = 1; 870 break; 871 872 /* Invalid option - print options. 
*/ 873 default: 874 us_vhost_usage(prgname); 875 return -1; 876 } 877 } 878 879 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 880 if (enabled_port_mask & (1 << i)) 881 ports[num_ports++] = i; 882 } 883 884 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 885 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 886 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 887 return -1; 888 } 889 890 return 0; 891 } 892 893 /* 894 * Update the global var NUM_PORTS and array PORTS according to system ports number 895 * and return valid ports number 896 */ 897 static unsigned check_ports_num(unsigned nb_ports) 898 { 899 unsigned valid_num_ports = num_ports; 900 unsigned portid; 901 902 if (num_ports > nb_ports) { 903 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 904 num_ports, nb_ports); 905 num_ports = nb_ports; 906 } 907 908 for (portid = 0; portid < num_ports; portid ++) { 909 if (!rte_eth_dev_is_valid_port(ports[portid])) { 910 RTE_LOG(INFO, VHOST_PORT, 911 "\nSpecified port ID(%u) is not valid\n", 912 ports[portid]); 913 ports[portid] = INVALID_PORT_ID; 914 valid_num_ports--; 915 } 916 } 917 return valid_num_ports; 918 } 919 920 static __rte_always_inline struct vhost_dev * 921 find_vhost_dev(struct rte_ether_addr *mac) 922 { 923 struct vhost_dev *vdev; 924 925 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 926 if (vdev->ready == DEVICE_RX && 927 rte_is_same_ether_addr(mac, &vdev->mac_address)) 928 return vdev; 929 } 930 931 return NULL; 932 } 933 934 /* 935 * This function learns the MAC address of the device and registers this along with a 936 * vlan tag to a VMDQ. 937 */ 938 static int 939 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 940 { 941 struct rte_ether_hdr *pkt_hdr; 942 int i, ret; 943 944 /* Learn MAC address of guest device from packet */ 945 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 946 947 if (find_vhost_dev(&pkt_hdr->src_addr)) { 948 RTE_LOG(ERR, VHOST_DATA, 949 "(%d) device is using a registered MAC!\n", 950 vdev->vid); 951 return -1; 952 } 953 954 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++) 955 vdev->mac_address.addr_bytes[i] = 956 pkt_hdr->src_addr.addr_bytes[i]; 957 958 /* vlan_tag currently uses the device_id. */ 959 vdev->vlan_tag = vlan_tags[vdev->vid]; 960 961 /* Print out VMDQ registration info. */ 962 RTE_LOG(INFO, VHOST_DATA, 963 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n", 964 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address), 965 vdev->vlan_tag); 966 967 /* Register the MAC address. */ 968 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 969 (uint32_t)vdev->vid + vmdq_pool_base); 970 if (ret) 971 RTE_LOG(ERR, VHOST_DATA, 972 "(%d) failed to add device MAC address to VMDQ\n", 973 vdev->vid); 974 975 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1); 976 977 /* Set device as ready for RX. */ 978 vdev->ready = DEVICE_RX; 979 980 return 0; 981 } 982 983 /* 984 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 985 * queue before disabling RX on the device. 
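 * This is done by removing the MAC filter first, so the NIC stops steering
 * new frames into this device's VMDq pool, and then polling the pool's Rx
 * queue with rte_eth_rx_burst() until it comes back empty, freeing whatever
 * was still queued.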
986 */ 987 static inline void 988 unlink_vmdq(struct vhost_dev *vdev) 989 { 990 unsigned i = 0; 991 unsigned rx_count; 992 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 993 994 if (vdev->ready == DEVICE_RX) { 995 /*clear MAC and VLAN settings*/ 996 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 997 for (i = 0; i < 6; i++) 998 vdev->mac_address.addr_bytes[i] = 0; 999 1000 vdev->vlan_tag = 0; 1001 1002 /*Clear out the receive buffers*/ 1003 rx_count = rte_eth_rx_burst(ports[0], 1004 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1005 1006 while (rx_count) { 1007 for (i = 0; i < rx_count; i++) 1008 rte_pktmbuf_free(pkts_burst[i]); 1009 1010 rx_count = rte_eth_rx_burst(ports[0], 1011 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1012 } 1013 1014 vdev->ready = DEVICE_MAC_LEARNING; 1015 } 1016 } 1017 1018 static inline void 1019 free_pkts(struct rte_mbuf **pkts, uint16_t n) 1020 { 1021 while (n--) 1022 rte_pktmbuf_free(pkts[n]); 1023 } 1024 1025 static __rte_always_inline void 1026 complete_async_pkts(struct vhost_dev *vdev) 1027 { 1028 struct rte_mbuf *p_cpl[MAX_PKT_BURST]; 1029 uint16_t complete_count; 1030 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id; 1031 1032 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid, 1033 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0); 1034 if (complete_count) 1035 free_pkts(p_cpl, complete_count); 1036 1037 } 1038 1039 static __rte_always_inline void 1040 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev, 1041 struct rte_mbuf *m) 1042 { 1043 uint16_t ret; 1044 1045 if (builtin_net_driver) { 1046 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1); 1047 } else { 1048 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1); 1049 } 1050 1051 if (enable_stats) { 1052 __atomic_fetch_add(&dst_vdev->stats.rx_total_atomic, 1, 1053 __ATOMIC_SEQ_CST); 1054 __atomic_fetch_add(&dst_vdev->stats.rx_atomic, ret, 1055 __ATOMIC_SEQ_CST); 1056 src_vdev->stats.tx_total++; 1057 src_vdev->stats.tx += ret; 1058 } 1059 } 1060 1061 static __rte_always_inline void 1062 drain_vhost(struct vhost_dev *vdev) 1063 { 1064 uint16_t ret; 1065 uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid; 1066 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len; 1067 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table; 1068 1069 ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit); 1070 1071 if (enable_stats) { 1072 __atomic_fetch_add(&vdev->stats.rx_total_atomic, nr_xmit, 1073 __ATOMIC_SEQ_CST); 1074 __atomic_fetch_add(&vdev->stats.rx_atomic, ret, 1075 __ATOMIC_SEQ_CST); 1076 } 1077 1078 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) { 1079 free_pkts(m, nr_xmit); 1080 } else { 1081 uint16_t enqueue_fail = nr_xmit - ret; 1082 if (enqueue_fail > 0) 1083 free_pkts(&m[ret], enqueue_fail); 1084 } 1085 } 1086 1087 static __rte_always_inline void 1088 drain_vhost_table(void) 1089 { 1090 uint16_t lcore_id = rte_lcore_id(); 1091 struct vhost_bufftable *vhost_txq; 1092 struct vhost_dev *vdev; 1093 uint64_t cur_tsc; 1094 1095 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1096 if (unlikely(vdev->remove == 1)) 1097 continue; 1098 1099 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid]; 1100 1101 cur_tsc = rte_rdtsc(); 1102 if (unlikely(cur_tsc - vhost_txq->pre_tsc 1103 > MBUF_TABLE_DRAIN_TSC)) { 1104 RTE_LOG_DP(DEBUG, VHOST_DATA, 1105 "Vhost TX queue drained after timeout with burst size %u\n", 1106 vhost_txq->len); 1107 
drain_vhost(vdev); 1108 vhost_txq->len = 0; 1109 vhost_txq->pre_tsc = cur_tsc; 1110 } 1111 } 1112 } 1113 1114 /* 1115 * Check if the packet destination MAC address is for a local device. If so then put 1116 * the packet on that devices RX queue. If not then return. 1117 */ 1118 static __rte_always_inline int 1119 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m) 1120 { 1121 struct rte_ether_hdr *pkt_hdr; 1122 struct vhost_dev *dst_vdev; 1123 struct vhost_bufftable *vhost_txq; 1124 uint16_t lcore_id = rte_lcore_id(); 1125 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1126 1127 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr); 1128 if (!dst_vdev) 1129 return -1; 1130 1131 if (vdev->vid == dst_vdev->vid) { 1132 RTE_LOG_DP(DEBUG, VHOST_DATA, 1133 "(%d) TX: src and dst MAC is same. Dropping packet.\n", 1134 vdev->vid); 1135 return 0; 1136 } 1137 1138 RTE_LOG_DP(DEBUG, VHOST_DATA, 1139 "(%d) TX: MAC address is local\n", dst_vdev->vid); 1140 1141 if (unlikely(dst_vdev->remove)) { 1142 RTE_LOG_DP(DEBUG, VHOST_DATA, 1143 "(%d) device is marked for removal\n", dst_vdev->vid); 1144 return 0; 1145 } 1146 1147 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid]; 1148 vhost_txq->m_table[vhost_txq->len++] = m; 1149 1150 if (enable_stats) { 1151 vdev->stats.tx_total++; 1152 vdev->stats.tx++; 1153 } 1154 1155 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) { 1156 drain_vhost(dst_vdev); 1157 vhost_txq->len = 0; 1158 vhost_txq->pre_tsc = rte_rdtsc(); 1159 } 1160 return 0; 1161 } 1162 1163 /* 1164 * Check if the destination MAC of a packet is one local VM, 1165 * and get its vlan tag, and offset if it is. 1166 */ 1167 static __rte_always_inline int 1168 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m, 1169 uint32_t *offset, uint16_t *vlan_tag) 1170 { 1171 struct vhost_dev *dst_vdev; 1172 struct rte_ether_hdr *pkt_hdr = 1173 rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1174 1175 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr); 1176 if (!dst_vdev) 1177 return 0; 1178 1179 if (vdev->vid == dst_vdev->vid) { 1180 RTE_LOG_DP(DEBUG, VHOST_DATA, 1181 "(%d) TX: src and dst MAC is same. Dropping packet.\n", 1182 vdev->vid); 1183 return -1; 1184 } 1185 1186 /* 1187 * HW vlan strip will reduce the packet length 1188 * by minus length of vlan tag, so need restore 1189 * the packet length by plus it. 
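 * Concretely: *offset is set to RTE_VLAN_HLEN (4 bytes) here, and
 * virtio_tx_route() adds it back to the mbuf's data_len/pkt_len when
 * vm2vm hardware mode re-inserts the VLAN tag on transmit.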
1190 */ 1191 *offset = RTE_VLAN_HLEN; 1192 *vlan_tag = vlan_tags[vdev->vid]; 1193 1194 RTE_LOG_DP(DEBUG, VHOST_DATA, 1195 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n", 1196 vdev->vid, dst_vdev->vid, *vlan_tag); 1197 1198 return 0; 1199 } 1200 1201 static void virtio_tx_offload(struct rte_mbuf *m) 1202 { 1203 struct rte_net_hdr_lens hdr_lens; 1204 struct rte_ipv4_hdr *ipv4_hdr; 1205 struct rte_tcp_hdr *tcp_hdr; 1206 uint32_t ptype; 1207 void *l3_hdr; 1208 1209 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); 1210 m->l2_len = hdr_lens.l2_len; 1211 m->l3_len = hdr_lens.l3_len; 1212 m->l4_len = hdr_lens.l4_len; 1213 1214 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len); 1215 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *, 1216 m->l2_len + m->l3_len); 1217 1218 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG; 1219 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) { 1220 m->ol_flags |= RTE_MBUF_F_TX_IPV4; 1221 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; 1222 ipv4_hdr = l3_hdr; 1223 ipv4_hdr->hdr_checksum = 0; 1224 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags); 1225 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */ 1226 m->ol_flags |= RTE_MBUF_F_TX_IPV6; 1227 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags); 1228 } 1229 } 1230 1231 static __rte_always_inline void 1232 do_drain_mbuf_table(struct mbuf_table *tx_q) 1233 { 1234 uint16_t count; 1235 1236 count = rte_eth_tx_burst(ports[0], tx_q->txq_id, 1237 tx_q->m_table, tx_q->len); 1238 if (unlikely(count < tx_q->len)) 1239 free_pkts(&tx_q->m_table[count], tx_q->len - count); 1240 1241 tx_q->len = 0; 1242 } 1243 1244 /* 1245 * This function routes the TX packet to the correct interface. This 1246 * may be a local device or the physical port. 1247 */ 1248 static __rte_always_inline void 1249 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1250 { 1251 struct mbuf_table *tx_q; 1252 unsigned offset = 0; 1253 const uint16_t lcore_id = rte_lcore_id(); 1254 struct rte_ether_hdr *nh; 1255 1256 1257 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1258 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) { 1259 struct vhost_dev *vdev2; 1260 1261 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) { 1262 if (vdev2 != vdev) 1263 sync_virtio_xmit(vdev2, vdev, m); 1264 } 1265 goto queue2nic; 1266 } 1267 1268 /*check if destination is local VM*/ 1269 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) 1270 return; 1271 1272 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1273 if (unlikely(find_local_dest(vdev, m, &offset, 1274 &vlan_tag) != 0)) { 1275 rte_pktmbuf_free(m); 1276 return; 1277 } 1278 } 1279 1280 RTE_LOG_DP(DEBUG, VHOST_DATA, 1281 "(%d) TX: MAC address is external\n", vdev->vid); 1282 1283 queue2nic: 1284 1285 /*Add packet to the port tx queue*/ 1286 tx_q = &lcore_tx_queue[lcore_id]; 1287 1288 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1289 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) { 1290 /* Guest has inserted the vlan tag. */ 1291 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1); 1292 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1293 if ((vm2vm_mode == VM2VM_HARDWARE) && 1294 (vh->vlan_tci != vlan_tag_be)) 1295 vh->vlan_tci = vlan_tag_be; 1296 } else { 1297 m->ol_flags |= RTE_MBUF_F_TX_VLAN; 1298 1299 /* 1300 * Find the right seg to adjust the data len when offset is 1301 * bigger than tail room size. 
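 * The loop below walks the segment chain until it finds a segment with
 * enough tailroom (or reaches the last segment) and grows that segment's
 * data_len; pkt_len is always adjusted on the head mbuf.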
1302 */ 1303 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1304 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1305 m->data_len += offset; 1306 else { 1307 struct rte_mbuf *seg = m; 1308 1309 while ((seg->next != NULL) && 1310 (offset > rte_pktmbuf_tailroom(seg))) 1311 seg = seg->next; 1312 1313 seg->data_len += offset; 1314 } 1315 m->pkt_len += offset; 1316 } 1317 1318 m->vlan_tci = vlan_tag; 1319 } 1320 1321 if (m->ol_flags & RTE_MBUF_F_RX_LRO) 1322 virtio_tx_offload(m); 1323 1324 tx_q->m_table[tx_q->len++] = m; 1325 if (enable_stats) { 1326 vdev->stats.tx_total++; 1327 vdev->stats.tx++; 1328 } 1329 1330 if (unlikely(tx_q->len == MAX_PKT_BURST)) 1331 do_drain_mbuf_table(tx_q); 1332 } 1333 1334 1335 static __rte_always_inline void 1336 drain_mbuf_table(struct mbuf_table *tx_q) 1337 { 1338 static uint64_t prev_tsc; 1339 uint64_t cur_tsc; 1340 1341 if (tx_q->len == 0) 1342 return; 1343 1344 cur_tsc = rte_rdtsc(); 1345 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) { 1346 prev_tsc = cur_tsc; 1347 1348 RTE_LOG_DP(DEBUG, VHOST_DATA, 1349 "TX queue drained after timeout with burst size %u\n", 1350 tx_q->len); 1351 do_drain_mbuf_table(tx_q); 1352 } 1353 } 1354 1355 uint16_t 1356 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1357 struct rte_mbuf **pkts, uint32_t rx_count) 1358 { 1359 uint16_t enqueue_count; 1360 uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id; 1361 1362 complete_async_pkts(dev); 1363 enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id, 1364 pkts, rx_count, dma_id, 0); 1365 1366 return enqueue_count; 1367 } 1368 1369 uint16_t 1370 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1371 struct rte_mbuf **pkts, uint32_t rx_count) 1372 { 1373 return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count); 1374 } 1375 1376 static __rte_always_inline void 1377 drain_eth_rx(struct vhost_dev *vdev) 1378 { 1379 uint16_t rx_count, enqueue_count; 1380 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1381 1382 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q, 1383 pkts, MAX_PKT_BURST); 1384 1385 if (!rx_count) 1386 return; 1387 1388 enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, 1389 VIRTIO_RXQ, pkts, rx_count); 1390 1391 /* Retry if necessary */ 1392 if (enable_retry && unlikely(enqueue_count < rx_count)) { 1393 uint32_t retry = 0; 1394 1395 while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) { 1396 rte_delay_us(burst_rx_delay_time); 1397 enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, 1398 VIRTIO_RXQ, &pkts[enqueue_count], 1399 rx_count - enqueue_count); 1400 } 1401 } 1402 1403 if (enable_stats) { 1404 __atomic_fetch_add(&vdev->stats.rx_total_atomic, rx_count, 1405 __ATOMIC_SEQ_CST); 1406 __atomic_fetch_add(&vdev->stats.rx_atomic, enqueue_count, 1407 __ATOMIC_SEQ_CST); 1408 } 1409 1410 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) { 1411 free_pkts(pkts, rx_count); 1412 } else { 1413 uint16_t enqueue_fail = rx_count - enqueue_count; 1414 if (enqueue_fail > 0) 1415 free_pkts(&pkts[enqueue_count], enqueue_fail); 1416 } 1417 } 1418 1419 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1420 struct rte_mempool *mbuf_pool, 1421 struct rte_mbuf **pkts, uint16_t count) 1422 { 1423 int nr_inflight; 1424 uint16_t dequeue_count; 1425 int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id; 1426 1427 dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id, 1428 mbuf_pool, pkts, count, &nr_inflight, 
dma_id, 0); 1429 1430 return dequeue_count; 1431 } 1432 1433 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1434 struct rte_mempool *mbuf_pool, 1435 struct rte_mbuf **pkts, uint16_t count) 1436 { 1437 return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count); 1438 } 1439 1440 static __rte_always_inline void 1441 drain_virtio_tx(struct vhost_dev *vdev) 1442 { 1443 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1444 uint16_t count; 1445 uint16_t i; 1446 1447 count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev, 1448 VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST); 1449 1450 /* setup VMDq for the first packet */ 1451 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) { 1452 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) 1453 free_pkts(pkts, count); 1454 } 1455 1456 for (i = 0; i < count; ++i) 1457 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]); 1458 } 1459 1460 /* 1461 * Main function of vhost-switch. It basically does: 1462 * 1463 * for each vhost device { 1464 * - drain_eth_rx() 1465 * 1466 * Which drains the host eth Rx queue linked to the vhost device, 1467 * and deliver all of them to guest virito Rx ring associated with 1468 * this vhost device. 1469 * 1470 * - drain_virtio_tx() 1471 * 1472 * Which drains the guest virtio Tx queue and deliver all of them 1473 * to the target, which could be another vhost device, or the 1474 * physical eth dev. The route is done in function "virtio_tx_route". 1475 * } 1476 */ 1477 static int 1478 switch_worker(void *arg __rte_unused) 1479 { 1480 unsigned i; 1481 unsigned lcore_id = rte_lcore_id(); 1482 struct vhost_dev *vdev; 1483 struct mbuf_table *tx_q; 1484 1485 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id); 1486 1487 tx_q = &lcore_tx_queue[lcore_id]; 1488 for (i = 0; i < rte_lcore_count(); i++) { 1489 if (lcore_ids[i] == lcore_id) { 1490 tx_q->txq_id = i; 1491 break; 1492 } 1493 } 1494 1495 while(1) { 1496 drain_mbuf_table(tx_q); 1497 drain_vhost_table(); 1498 /* 1499 * Inform the configuration core that we have exited the 1500 * linked list and that no devices are in use if requested. 
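 * destroy_device() sets dev_removal_flag to REQUEST_DEV_REMOVAL on every
 * worker and spins until it reads ACK_DEV_REMOVAL back; acknowledging here,
 * before starting the next pass over the device list, guarantees the
 * previous pass (and any use of the device being removed) has finished.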
1501 */ 1502 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL) 1503 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL; 1504 1505 /* 1506 * Process vhost devices 1507 */ 1508 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, 1509 lcore_vdev_entry) { 1510 if (unlikely(vdev->remove)) { 1511 unlink_vmdq(vdev); 1512 vdev->ready = DEVICE_SAFE_REMOVE; 1513 continue; 1514 } 1515 1516 if (likely(vdev->ready == DEVICE_RX)) 1517 drain_eth_rx(vdev); 1518 1519 if (likely(!vdev->remove)) 1520 drain_virtio_tx(vdev); 1521 } 1522 } 1523 1524 return 0; 1525 } 1526 1527 static void 1528 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id) 1529 { 1530 uint16_t n_pkt = 0; 1531 int pkts_inflight; 1532 1533 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id; 1534 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id); 1535 1536 struct rte_mbuf *m_cpl[pkts_inflight]; 1537 1538 while (pkts_inflight) { 1539 n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl, 1540 pkts_inflight, dma_id, 0); 1541 free_pkts(m_cpl, n_pkt); 1542 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, 1543 queue_id); 1544 } 1545 } 1546 1547 static void 1548 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id) 1549 { 1550 uint16_t n_pkt = 0; 1551 int pkts_inflight; 1552 1553 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id; 1554 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id); 1555 1556 struct rte_mbuf *m_cpl[pkts_inflight]; 1557 1558 while (pkts_inflight) { 1559 n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl, 1560 pkts_inflight, dma_id, 0); 1561 free_pkts(m_cpl, n_pkt); 1562 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id); 1563 } 1564 } 1565 1566 /* 1567 * Remove a device from the specific data core linked list and from the 1568 * main linked list. Synchronization occurs through the use of the 1569 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 1570 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 1571 */ 1572 static void 1573 destroy_device(int vid) 1574 { 1575 struct vhost_dev *vdev = NULL; 1576 int lcore; 1577 uint16_t i; 1578 1579 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1580 if (vdev->vid == vid) 1581 break; 1582 } 1583 if (!vdev) 1584 return; 1585 /*set the remove flag. */ 1586 vdev->remove = 1; 1587 while(vdev->ready != DEVICE_SAFE_REMOVE) { 1588 rte_pause(); 1589 } 1590 1591 for (i = 0; i < RTE_MAX_LCORE; i++) 1592 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]); 1593 1594 if (builtin_net_driver) 1595 vs_vhost_net_remove(vdev); 1596 1597 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, 1598 lcore_vdev_entry); 1599 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry); 1600 1601 1602 /* Set the dev_removal_flag on each lcore. */ 1603 RTE_LCORE_FOREACH_WORKER(lcore) 1604 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL; 1605 1606 /* 1607 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL 1608 * we can be sure that they can no longer access the device removed 1609 * from the linked lists and that the devices are no longer in use. 
1610 */ 1611 RTE_LCORE_FOREACH_WORKER(lcore) { 1612 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL) 1613 rte_pause(); 1614 } 1615 1616 lcore_info[vdev->coreid].device_num--; 1617 1618 RTE_LOG(INFO, VHOST_DATA, 1619 "(%d) device has been removed from data core\n", 1620 vdev->vid); 1621 1622 if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) { 1623 vhost_clear_queue(vdev, VIRTIO_RXQ); 1624 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ); 1625 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false; 1626 } 1627 1628 if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) { 1629 vhost_clear_queue(vdev, VIRTIO_TXQ); 1630 rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ); 1631 dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false; 1632 } 1633 1634 rte_free(vdev); 1635 } 1636 1637 static inline int 1638 get_socketid_by_vid(int vid) 1639 { 1640 int i; 1641 char ifname[PATH_MAX]; 1642 rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); 1643 1644 for (i = 0; i < nb_sockets; i++) { 1645 char *file = socket_files + i * PATH_MAX; 1646 if (strcmp(file, ifname) == 0) 1647 return i; 1648 } 1649 1650 return -1; 1651 } 1652 1653 static int 1654 init_vhost_queue_ops(int vid) 1655 { 1656 if (builtin_net_driver) { 1657 vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts; 1658 vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts; 1659 } else { 1660 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled) 1661 vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts; 1662 else 1663 vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts; 1664 1665 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled) 1666 vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts; 1667 else 1668 vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts; 1669 } 1670 1671 return 0; 1672 } 1673 1674 static inline int 1675 vhost_async_channel_register(int vid) 1676 { 1677 int rx_ret = 0, tx_ret = 0; 1678 1679 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) { 1680 rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ); 1681 if (rx_ret == 0) 1682 dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true; 1683 } 1684 1685 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) { 1686 tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ); 1687 if (tx_ret == 0) 1688 dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true; 1689 } 1690 1691 return rx_ret | tx_ret; 1692 } 1693 1694 1695 1696 /* 1697 * A new device is added to a data core. First the device is added to the main linked list 1698 * and then allocated to a specific data core. 
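 * The core chosen is the worker currently carrying the fewest devices, and
 * the device is handed the VMDq Rx queue at
 * (vid * queues_per_pool + vmdq_queue_base). Guest notifications are
 * disabled on both rings because the worker cores poll.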
1699 */ 1700 static int 1701 new_device(int vid) 1702 { 1703 int lcore, core_add = 0; 1704 uint16_t i; 1705 uint32_t device_num_min = num_devices; 1706 struct vhost_dev *vdev; 1707 int ret; 1708 1709 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 1710 if (vdev == NULL) { 1711 RTE_LOG(INFO, VHOST_DATA, 1712 "(%d) couldn't allocate memory for vhost dev\n", 1713 vid); 1714 return -1; 1715 } 1716 vdev->vid = vid; 1717 1718 for (i = 0; i < RTE_MAX_LCORE; i++) { 1719 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] 1720 = rte_zmalloc("vhost bufftable", 1721 sizeof(struct vhost_bufftable), 1722 RTE_CACHE_LINE_SIZE); 1723 1724 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) { 1725 RTE_LOG(INFO, VHOST_DATA, 1726 "(%d) couldn't allocate memory for vhost TX\n", vid); 1727 return -1; 1728 } 1729 } 1730 1731 int socketid = get_socketid_by_vid(vid); 1732 if (socketid == -1) 1733 return -1; 1734 1735 init_vid2socketid_array(vid, socketid); 1736 1737 ret = vhost_async_channel_register(vid); 1738 1739 if (init_vhost_queue_ops(vid) != 0) 1740 return -1; 1741 1742 if (builtin_net_driver) 1743 vs_vhost_net_setup(vdev); 1744 1745 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry); 1746 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base; 1747 1748 /*reset ready flag*/ 1749 vdev->ready = DEVICE_MAC_LEARNING; 1750 vdev->remove = 0; 1751 1752 /* Find a suitable lcore to add the device. */ 1753 RTE_LCORE_FOREACH_WORKER(lcore) { 1754 if (lcore_info[lcore].device_num < device_num_min) { 1755 device_num_min = lcore_info[lcore].device_num; 1756 core_add = lcore; 1757 } 1758 } 1759 vdev->coreid = core_add; 1760 1761 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, 1762 lcore_vdev_entry); 1763 lcore_info[vdev->coreid].device_num++; 1764 1765 /* Disable notifications. */ 1766 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); 1767 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); 1768 1769 RTE_LOG(INFO, VHOST_DATA, 1770 "(%d) device has been added to data core %d\n", 1771 vid, vdev->coreid); 1772 1773 return ret; 1774 } 1775 1776 static int 1777 vring_state_changed(int vid, uint16_t queue_id, int enable) 1778 { 1779 struct vhost_dev *vdev = NULL; 1780 1781 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1782 if (vdev->vid == vid) 1783 break; 1784 } 1785 if (!vdev) 1786 return -1; 1787 1788 if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) { 1789 if (!enable) 1790 vhost_clear_queue_thread_unsafe(vdev, queue_id); 1791 } 1792 1793 return 0; 1794 } 1795 1796 /* 1797 * These callback allow devices to be added to the data core when configuration 1798 * has been fully complete. 1799 */ 1800 static const struct rte_vhost_device_ops virtio_net_device_ops = 1801 { 1802 .new_device = new_device, 1803 .destroy_device = destroy_device, 1804 .vring_state_changed = vring_state_changed, 1805 }; 1806 1807 /* 1808 * This is a thread will wake up after a period to print stats if the user has 1809 * enabled them. 
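 * The period is the --stats argument, in seconds: the loop below simply
 * sleep()s for that long between refreshes. main() launches this function
 * as a control thread named "dpdk-vhost-stat".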
1810 */ 1811 static uint32_t 1812 print_stats(__rte_unused void *arg) 1813 { 1814 struct vhost_dev *vdev; 1815 uint64_t tx_dropped, rx_dropped; 1816 uint64_t tx, tx_total, rx, rx_total; 1817 const char clr[] = { 27, '[', '2', 'J', '\0' }; 1818 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 1819 1820 while(1) { 1821 sleep(enable_stats); 1822 1823 /* Clear screen and move to top left */ 1824 printf("%s%s\n", clr, top_left); 1825 printf("Device statistics =================================\n"); 1826 1827 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1828 tx_total = vdev->stats.tx_total; 1829 tx = vdev->stats.tx; 1830 tx_dropped = tx_total - tx; 1831 1832 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic, 1833 __ATOMIC_SEQ_CST); 1834 rx = __atomic_load_n(&vdev->stats.rx_atomic, 1835 __ATOMIC_SEQ_CST); 1836 rx_dropped = rx_total - rx; 1837 1838 printf("Statistics for device %d\n" 1839 "-----------------------\n" 1840 "TX total: %" PRIu64 "\n" 1841 "TX dropped: %" PRIu64 "\n" 1842 "TX successful: %" PRIu64 "\n" 1843 "RX total: %" PRIu64 "\n" 1844 "RX dropped: %" PRIu64 "\n" 1845 "RX successful: %" PRIu64 "\n", 1846 vdev->vid, 1847 tx_total, tx_dropped, tx, 1848 rx_total, rx_dropped, rx); 1849 } 1850 1851 printf("===================================================\n"); 1852 1853 fflush(stdout); 1854 } 1855 1856 return 0; 1857 } 1858 1859 static void 1860 unregister_drivers(int socket_num) 1861 { 1862 int i, ret; 1863 1864 for (i = 0; i < socket_num; i++) { 1865 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX); 1866 if (ret != 0) 1867 RTE_LOG(ERR, VHOST_CONFIG, 1868 "Fail to unregister vhost driver for %s.\n", 1869 socket_files + i * PATH_MAX); 1870 } 1871 } 1872 1873 /* When we receive a INT signal, unregister vhost driver */ 1874 static void 1875 sigint_handler(__rte_unused int signum) 1876 { 1877 /* Unregister vhost driver. */ 1878 unregister_drivers(nb_sockets); 1879 1880 exit(0); 1881 } 1882 1883 static void 1884 reset_dma(void) 1885 { 1886 int i; 1887 1888 for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) { 1889 int j; 1890 1891 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) { 1892 dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID; 1893 dma_bind[i].dmas[j].async_enabled = false; 1894 } 1895 } 1896 1897 for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) 1898 dmas_id[i] = INVALID_DMA_ID; 1899 } 1900 1901 /* 1902 * Main function, does initialisation and calls the per-lcore functions. 1903 */ 1904 int 1905 main(int argc, char *argv[]) 1906 { 1907 unsigned lcore_id, core_id = 0; 1908 unsigned nb_ports, valid_num_ports; 1909 int ret, i; 1910 uint16_t portid; 1911 rte_thread_t tid; 1912 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS; 1913 1914 signal(SIGINT, sigint_handler); 1915 1916 /* init EAL */ 1917 ret = rte_eal_init(argc, argv); 1918 if (ret < 0) 1919 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 1920 argc -= ret; 1921 argv += ret; 1922 1923 /* initialize dma structures */ 1924 reset_dma(); 1925 1926 /* parse app arguments */ 1927 ret = us_vhost_parse_args(argc, argv); 1928 if (ret < 0) 1929 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 1930 1931 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 1932 TAILQ_INIT(&lcore_info[lcore_id].vdev_list); 1933 1934 if (rte_lcore_is_enabled(lcore_id)) 1935 lcore_ids[core_id++] = lcore_id; 1936 } 1937 1938 if (rte_lcore_count() > RTE_MAX_LCORE) 1939 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 1940 1941 /* Get the number of physical ports. 
*/ 1942 nb_ports = rte_eth_dev_count_avail(); 1943 1944 /* 1945 * Update the global var NUM_PORTS and global array PORTS 1946 * and get value of var VALID_NUM_PORTS according to system ports number 1947 */ 1948 valid_num_ports = check_ports_num(nb_ports); 1949 1950 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 1951 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 1952 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 1953 return -1; 1954 } 1955 1956 /* 1957 * FIXME: here we are trying to allocate mbufs big enough for 1958 * @MAX_QUEUES, but the truth is we're never going to use that 1959 * many queues here. We probably should only do allocation for 1960 * those queues we are going to use. 1961 */ 1962 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs, 1963 MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE, 1964 rte_socket_id()); 1965 if (mbuf_pool == NULL) 1966 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 1967 1968 if (vm2vm_mode == VM2VM_HARDWARE) { 1969 /* Enable VT loop back to let L2 switch to do it. */ 1970 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 1971 RTE_LOG(DEBUG, VHOST_CONFIG, 1972 "Enable loop back for L2 switch in vmdq.\n"); 1973 } 1974 1975 /* initialize all ports */ 1976 RTE_ETH_FOREACH_DEV(portid) { 1977 /* skip ports that are not enabled */ 1978 if ((enabled_port_mask & (1 << portid)) == 0) { 1979 RTE_LOG(INFO, VHOST_PORT, 1980 "Skipping disabled port %d\n", portid); 1981 continue; 1982 } 1983 if (port_init(portid) != 0) 1984 rte_exit(EXIT_FAILURE, 1985 "Cannot initialize network ports\n"); 1986 } 1987 1988 /* Enable stats if the user option is set. */ 1989 if (enable_stats) { 1990 ret = rte_thread_create_control(&tid, "dpdk-vhost-stat", 1991 print_stats, NULL); 1992 if (ret < 0) 1993 rte_exit(EXIT_FAILURE, 1994 "Cannot create dpdk-vhost-stat thread\n"); 1995 } 1996 1997 /* Launch all data cores. */ 1998 RTE_LCORE_FOREACH_WORKER(lcore_id) 1999 rte_eal_remote_launch(switch_worker, NULL, lcore_id); 2000 2001 if (client_mode) 2002 flags |= RTE_VHOST_USER_CLIENT; 2003 2004 for (i = 0; i < dma_count; i++) { 2005 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) { 2006 RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n"); 2007 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n"); 2008 } 2009 } 2010 2011 /* Register vhost user driver to handle vhost messages. 
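 * For every --socket-file path this registers the vhost-user socket with the
 * flags accumulated above (client mode, async copy), trims the advertised
 * virtio features according to --mergeable/--tx-csum/--tso, hooks up the
 * new_device/destroy_device callbacks and finally starts the driver.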
*/ 2012 for (i = 0; i < nb_sockets; i++) { 2013 char *file = socket_files + i * PATH_MAX; 2014 2015 if (dma_count && get_async_flag_by_socketid(i) != 0) 2016 flags = flags | RTE_VHOST_USER_ASYNC_COPY; 2017 2018 ret = rte_vhost_driver_register(file, flags); 2019 if (ret != 0) { 2020 unregister_drivers(i); 2021 rte_exit(EXIT_FAILURE, 2022 "vhost driver register failure.\n"); 2023 } 2024 2025 if (builtin_net_driver) 2026 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES); 2027 2028 if (mergeable == 0) { 2029 rte_vhost_driver_disable_features(file, 2030 1ULL << VIRTIO_NET_F_MRG_RXBUF); 2031 } 2032 2033 if (enable_tx_csum == 0) { 2034 rte_vhost_driver_disable_features(file, 2035 1ULL << VIRTIO_NET_F_CSUM); 2036 } 2037 2038 if (enable_tso == 0) { 2039 rte_vhost_driver_disable_features(file, 2040 1ULL << VIRTIO_NET_F_HOST_TSO4); 2041 rte_vhost_driver_disable_features(file, 2042 1ULL << VIRTIO_NET_F_HOST_TSO6); 2043 rte_vhost_driver_disable_features(file, 2044 1ULL << VIRTIO_NET_F_GUEST_TSO4); 2045 rte_vhost_driver_disable_features(file, 2046 1ULL << VIRTIO_NET_F_GUEST_TSO6); 2047 } 2048 2049 if (promiscuous) { 2050 rte_vhost_driver_enable_features(file, 2051 1ULL << VIRTIO_NET_F_CTRL_RX); 2052 } 2053 2054 ret = rte_vhost_driver_callback_register(file, 2055 &virtio_net_device_ops); 2056 if (ret != 0) { 2057 rte_exit(EXIT_FAILURE, 2058 "failed to register vhost driver callbacks.\n"); 2059 } 2060 2061 if (rte_vhost_driver_start(file) < 0) { 2062 rte_exit(EXIT_FAILURE, 2063 "failed to start vhost driver.\n"); 2064 } 2065 } 2066 2067 RTE_LCORE_FOREACH_WORKER(lcore_id) 2068 rte_eal_wait_lcore(lcore_id); 2069 2070 for (i = 0; i < dma_count; i++) { 2071 if (rte_vhost_async_dma_unconfigure(dmas_id[i], 0) < 0) { 2072 RTE_LOG(ERR, VHOST_PORT, 2073 "Failed to unconfigure DMA %d in vhost.\n", dmas_id[i]); 2074 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n"); 2075 } 2076 } 2077 2078 /* clean up the EAL */ 2079 rte_eal_cleanup(); 2080 2081 return 0; 2082 } 2083
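/*
 * Illustrative invocation only -- the core list, port mask, socket path and
 * DMA device address below are placeholders; see the vhost sample
 * application guide in the DPDK documentation for the options that match a
 * given setup:
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/sock0 --client \
 *       --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
 *
 * txd0/rxd0 bind the enqueue and dequeue paths of socket 0 to the named DMA
 * device, which enables the async data path for that vhost port.
 */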