1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2017 Intel Corporation 3 */ 4 5 #include <ctype.h> 6 #include <arpa/inet.h> 7 #include <getopt.h> 8 #include <linux/if_ether.h> 9 #include <linux/if_vlan.h> 10 #include <linux/virtio_net.h> 11 #include <linux/virtio_ring.h> 12 #include <signal.h> 13 #include <stdint.h> 14 #include <stdlib.h> 15 #include <sys/eventfd.h> 16 #include <sys/param.h> 17 #include <unistd.h> 18 19 #include <rte_cycles.h> 20 #include <rte_ethdev.h> 21 #include <rte_log.h> 22 #include <rte_string_fns.h> 23 #include <rte_malloc.h> 24 #include <rte_net.h> 25 #include <rte_vhost.h> 26 #include <rte_ip.h> 27 #include <rte_tcp.h> 28 #include <rte_pause.h> 29 #include <rte_dmadev.h> 30 #include <rte_vhost_async.h> 31 32 #include "main.h" 33 34 #ifndef MAX_QUEUES 35 #define MAX_QUEUES 128 36 #endif 37 38 #define NUM_MBUFS_DEFAULT 0x24000 39 40 /* the maximum number of external ports supported */ 41 #define MAX_SUP_PORTS 1 42 43 #define MBUF_CACHE_SIZE 128 44 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE 45 46 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 47 48 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ 49 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ 50 51 #define JUMBO_FRAME_MAX_SIZE 0x2600 52 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)) 53 54 /* State of virtio device. */ 55 #define DEVICE_MAC_LEARNING 0 56 #define DEVICE_RX 1 57 #define DEVICE_SAFE_REMOVE 2 58 59 /* Configurable number of RX/TX ring descriptors */ 60 #define RX_DESC_DEFAULT 1024 61 #define TX_DESC_DEFAULT 512 62 63 #define INVALID_PORT_ID 0xFF 64 #define INVALID_DMA_ID -1 65 66 #define DMA_RING_SIZE 4096 67 68 #define ASYNC_ENQUEUE_VHOST 1 69 #define ASYNC_DEQUEUE_VHOST 2 70 71 /* number of mbufs in all pools - if specified on command-line. */ 72 static int total_num_mbufs = NUM_MBUFS_DEFAULT; 73 74 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE]; 75 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX]; 76 static int dma_count; 77 78 /* mask of enabled ports */ 79 static uint32_t enabled_port_mask = 0; 80 81 /* Promiscuous mode */ 82 static uint32_t promiscuous; 83 84 /* number of devices/queues to support*/ 85 static uint32_t num_queues = 0; 86 static uint32_t num_devices; 87 88 static struct rte_mempool *mbuf_pool; 89 static int mergeable; 90 91 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */ 92 typedef enum { 93 VM2VM_DISABLED = 0, 94 VM2VM_SOFTWARE = 1, 95 VM2VM_HARDWARE = 2, 96 VM2VM_LAST 97 } vm2vm_type; 98 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE; 99 100 /* Enable stats. */ 101 static uint32_t enable_stats = 0; 102 /* Enable retries on RX. */ 103 static uint32_t enable_retry = 1; 104 105 /* Disable TX checksum offload */ 106 static uint32_t enable_tx_csum; 107 108 /* Disable TSO offload */ 109 static uint32_t enable_tso; 110 111 static int client_mode; 112 113 static int builtin_net_driver; 114 115 /* Specify timeout (in useconds) between retries on RX. */ 116 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; 117 /* Specify the number of retries on RX. */ 118 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; 119 120 /* Socket file paths. Can be set by user */ 121 static char *socket_files; 122 static int nb_sockets; 123 124 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE]; 125 126 /* empty VMDq configuration structure. 
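 * get_eth_conf() copies this template at run time and rewrites the VMDq
 * pool map so that each pool is matched to one entry of vlan_tags[].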
Filled in programmatically */ 127 static struct rte_eth_conf vmdq_conf_default = { 128 .rxmode = { 129 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY, 130 /* 131 * VLAN strip is necessary for 1G NIC such as I350, 132 * this fixes bug of ipv4 forwarding in guest can't 133 * forward packets from one virtio dev to another virtio dev. 134 */ 135 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP, 136 }, 137 138 .txmode = { 139 .mq_mode = RTE_ETH_MQ_TX_NONE, 140 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | 141 RTE_ETH_TX_OFFLOAD_TCP_CKSUM | 142 RTE_ETH_TX_OFFLOAD_VLAN_INSERT | 143 RTE_ETH_TX_OFFLOAD_MULTI_SEGS | 144 RTE_ETH_TX_OFFLOAD_TCP_TSO), 145 }, 146 .rx_adv_conf = { 147 /* 148 * should be overridden separately in code with 149 * appropriate values 150 */ 151 .vmdq_rx_conf = { 152 .nb_queue_pools = RTE_ETH_8_POOLS, 153 .enable_default_pool = 0, 154 .default_pool = 0, 155 .nb_pool_maps = 0, 156 .pool_map = {{0, 0},}, 157 }, 158 }, 159 }; 160 161 162 static unsigned lcore_ids[RTE_MAX_LCORE]; 163 static uint16_t ports[RTE_MAX_ETHPORTS]; 164 static unsigned num_ports = 0; /**< The number of ports specified in command line */ 165 static uint16_t num_pf_queues, num_vmdq_queues; 166 static uint16_t vmdq_pool_base, vmdq_queue_base; 167 static uint16_t queues_per_pool; 168 169 const uint16_t vlan_tags[] = { 170 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 171 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 172 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 173 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 174 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 175 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 176 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 177 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 178 }; 179 180 /* ethernet addresses of ports */ 181 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS]; 182 183 static struct vhost_dev_tailq_list vhost_dev_list = 184 TAILQ_HEAD_INITIALIZER(vhost_dev_list); 185 186 static struct lcore_info lcore_info[RTE_MAX_LCORE]; 187 188 /* Used for queueing bursts of TX packets. */ 189 struct mbuf_table { 190 unsigned len; 191 unsigned txq_id; 192 struct rte_mbuf *m_table[MAX_PKT_BURST]; 193 }; 194 195 struct vhost_bufftable { 196 uint32_t len; 197 uint64_t pre_tsc; 198 struct rte_mbuf *m_table[MAX_PKT_BURST]; 199 }; 200 201 /* TX queue for each data core. */ 202 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE]; 203 204 /* 205 * Vhost TX buffer for each data core. 206 * Every data core maintains a TX buffer for every vhost device, 207 * which is used for batch pkts enqueue for higher performance. 
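 * The table is indexed as vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vid]
 * and is flushed either when it holds MAX_PKT_BURST packets or when
 * drain_vhost_table() notices that MBUF_TABLE_DRAIN_TSC has elapsed.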
208 */ 209 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE]; 210 211 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \ 212 / US_PER_S * BURST_TX_DRAIN_US) 213 214 static int vid2socketid[RTE_MAX_VHOST_DEVICE]; 215 216 static inline uint32_t 217 get_async_flag_by_socketid(int socketid) 218 { 219 return dma_bind[socketid].async_flag; 220 } 221 222 static inline void 223 init_vid2socketid_array(int vid, int socketid) 224 { 225 vid2socketid[vid] = socketid; 226 } 227 228 static inline bool 229 is_dma_configured(int16_t dev_id) 230 { 231 int i; 232 233 for (i = 0; i < dma_count; i++) 234 if (dmas_id[i] == dev_id) 235 return true; 236 return false; 237 } 238 239 static inline int 240 open_dma(const char *value) 241 { 242 struct dma_for_vhost *dma_info = dma_bind; 243 char *input = strndup(value, strlen(value) + 1); 244 char *addrs = input; 245 char *ptrs[2]; 246 char *start, *end, *substr; 247 int64_t socketid, vring_id; 248 249 struct rte_dma_info info; 250 struct rte_dma_conf dev_config = { .nb_vchans = 1 }; 251 struct rte_dma_vchan_conf qconf = { 252 .direction = RTE_DMA_DIR_MEM_TO_MEM, 253 .nb_desc = DMA_RING_SIZE 254 }; 255 256 int dev_id; 257 int ret = 0; 258 uint16_t i = 0; 259 char *dma_arg[RTE_MAX_VHOST_DEVICE]; 260 int args_nr; 261 262 while (isblank(*addrs)) 263 addrs++; 264 if (*addrs == '\0') { 265 ret = -1; 266 goto out; 267 } 268 269 /* process DMA devices within bracket. */ 270 addrs++; 271 substr = strtok(addrs, ";]"); 272 if (!substr) { 273 ret = -1; 274 goto out; 275 } 276 277 args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ','); 278 if (args_nr <= 0) { 279 ret = -1; 280 goto out; 281 } 282 283 while (i < args_nr) { 284 char *arg_temp = dma_arg[i]; 285 char *txd, *rxd; 286 uint8_t sub_nr; 287 int async_flag; 288 289 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@'); 290 if (sub_nr != 2) { 291 ret = -1; 292 goto out; 293 } 294 295 txd = strstr(ptrs[0], "txd"); 296 rxd = strstr(ptrs[0], "rxd"); 297 if (txd) { 298 start = txd; 299 vring_id = VIRTIO_RXQ; 300 async_flag = ASYNC_ENQUEUE_VHOST; 301 } else if (rxd) { 302 start = rxd; 303 vring_id = VIRTIO_TXQ; 304 async_flag = ASYNC_DEQUEUE_VHOST; 305 } else { 306 ret = -1; 307 goto out; 308 } 309 310 start += 3; 311 socketid = strtol(start, &end, 0); 312 if (end == start) { 313 ret = -1; 314 goto out; 315 } 316 317 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]); 318 if (dev_id < 0) { 319 RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]); 320 ret = -1; 321 goto out; 322 } 323 324 /* DMA device is already configured, so skip */ 325 if (is_dma_configured(dev_id)) 326 goto done; 327 328 if (rte_dma_info_get(dev_id, &info) != 0) { 329 RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n"); 330 ret = -1; 331 goto out; 332 } 333 334 if (info.max_vchans < 1) { 335 RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id); 336 ret = -1; 337 goto out; 338 } 339 340 if (rte_dma_configure(dev_id, &dev_config) != 0) { 341 RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id); 342 ret = -1; 343 goto out; 344 } 345 346 /* Check the max desc supported by DMA device */ 347 rte_dma_info_get(dev_id, &info); 348 if (info.nb_vchans != 1) { 349 RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n", 350 dev_id); 351 ret = -1; 352 goto out; 353 } 354 355 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc); 356 357 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) { 358 RTE_LOG(ERR, VHOST_CONFIG, "Fail to set 
up DMA %d.\n", dev_id); 359 ret = -1; 360 goto out; 361 } 362 363 if (rte_dma_start(dev_id) != 0) { 364 RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id); 365 ret = -1; 366 goto out; 367 } 368 369 dmas_id[dma_count++] = dev_id; 370 371 done: 372 (dma_info + socketid)->dmas[vring_id].dev_id = dev_id; 373 (dma_info + socketid)->async_flag |= async_flag; 374 i++; 375 } 376 out: 377 free(input); 378 return ret; 379 } 380 381 /* 382 * Builds up the correct configuration for VMDQ VLAN pool map 383 * according to the pool & queue limits. 384 */ 385 static inline int 386 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices) 387 { 388 struct rte_eth_vmdq_rx_conf conf; 389 struct rte_eth_vmdq_rx_conf *def_conf = 390 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf; 391 unsigned i; 392 393 memset(&conf, 0, sizeof(conf)); 394 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices; 395 conf.nb_pool_maps = num_devices; 396 conf.enable_loop_back = def_conf->enable_loop_back; 397 conf.rx_mode = def_conf->rx_mode; 398 399 for (i = 0; i < conf.nb_pool_maps; i++) { 400 conf.pool_map[i].vlan_id = vlan_tags[ i ]; 401 conf.pool_map[i].pools = (1UL << i); 402 } 403 404 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); 405 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, 406 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); 407 return 0; 408 } 409 410 /* 411 * Initialises a given port using global settings and with the rx buffers 412 * coming from the mbuf_pool passed as parameter 413 */ 414 static inline int 415 port_init(uint16_t port) 416 { 417 struct rte_eth_dev_info dev_info; 418 struct rte_eth_conf port_conf; 419 struct rte_eth_rxconf *rxconf; 420 struct rte_eth_txconf *txconf; 421 int16_t rx_rings, tx_rings; 422 uint16_t rx_ring_size, tx_ring_size; 423 int retval; 424 uint16_t q; 425 426 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */ 427 retval = rte_eth_dev_info_get(port, &dev_info); 428 if (retval != 0) { 429 RTE_LOG(ERR, VHOST_PORT, 430 "Error during getting device (port %u) info: %s\n", 431 port, strerror(-retval)); 432 433 return retval; 434 } 435 if (dev_info.max_vmdq_pools == 0) { 436 RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n"); 437 return -1; 438 } 439 440 rxconf = &dev_info.default_rxconf; 441 txconf = &dev_info.default_txconf; 442 rxconf->rx_drop_en = 1; 443 444 /*configure the number of supported virtio devices based on VMDQ limits */ 445 num_devices = dev_info.max_vmdq_pools; 446 447 rx_ring_size = RX_DESC_DEFAULT; 448 tx_ring_size = TX_DESC_DEFAULT; 449 450 tx_rings = (uint16_t)rte_lcore_count(); 451 452 if (mergeable) { 453 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu) 454 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu; 455 else 456 vmdq_conf_default.rxmode.mtu = MAX_MTU; 457 } 458 459 /* Get port configuration. */ 460 retval = get_eth_conf(&port_conf, num_devices); 461 if (retval < 0) 462 return retval; 463 /* NIC queues are divided into pf queues and vmdq queues. 
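	 * The split below is derived from dev_info. As an illustrative example
	 * only (numbers are not taken from this file): a NIC reporting 128 Rx
	 * queues, of which 64 are VMDq queues spread over 32 pools, yields
	 * num_pf_queues = 64 and queues_per_pool = 2, and each vhost device
	 * then owns one pool.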
*/ 464 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num; 465 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools; 466 num_vmdq_queues = num_devices * queues_per_pool; 467 num_queues = num_pf_queues + num_vmdq_queues; 468 vmdq_queue_base = dev_info.vmdq_queue_base; 469 vmdq_pool_base = dev_info.vmdq_pool_base; 470 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n", 471 num_pf_queues, num_devices, queues_per_pool); 472 473 if (!rte_eth_dev_is_valid_port(port)) 474 return -1; 475 476 rx_rings = (uint16_t)dev_info.max_rx_queues; 477 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 478 port_conf.txmode.offloads |= 479 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 480 /* Configure ethernet device. */ 481 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); 482 if (retval != 0) { 483 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n", 484 port, strerror(-retval)); 485 return retval; 486 } 487 488 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size, 489 &tx_ring_size); 490 if (retval != 0) { 491 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors " 492 "for port %u: %s.\n", port, strerror(-retval)); 493 return retval; 494 } 495 if (rx_ring_size > RX_DESC_DEFAULT) { 496 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size " 497 "for Rx queues on port %u.\n", port); 498 return -1; 499 } 500 501 /* Setup the queues. */ 502 rxconf->offloads = port_conf.rxmode.offloads; 503 for (q = 0; q < rx_rings; q ++) { 504 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size, 505 rte_eth_dev_socket_id(port), 506 rxconf, 507 mbuf_pool); 508 if (retval < 0) { 509 RTE_LOG(ERR, VHOST_PORT, 510 "Failed to setup rx queue %u of port %u: %s.\n", 511 q, port, strerror(-retval)); 512 return retval; 513 } 514 } 515 txconf->offloads = port_conf.txmode.offloads; 516 for (q = 0; q < tx_rings; q ++) { 517 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size, 518 rte_eth_dev_socket_id(port), 519 txconf); 520 if (retval < 0) { 521 RTE_LOG(ERR, VHOST_PORT, 522 "Failed to setup tx queue %u of port %u: %s.\n", 523 q, port, strerror(-retval)); 524 return retval; 525 } 526 } 527 528 /* Start the device. */ 529 retval = rte_eth_dev_start(port); 530 if (retval < 0) { 531 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n", 532 port, strerror(-retval)); 533 return retval; 534 } 535 536 if (promiscuous) { 537 retval = rte_eth_promiscuous_enable(port); 538 if (retval != 0) { 539 RTE_LOG(ERR, VHOST_PORT, 540 "Failed to enable promiscuous mode on port %u: %s\n", 541 port, rte_strerror(-retval)); 542 return retval; 543 } 544 } 545 546 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]); 547 if (retval < 0) { 548 RTE_LOG(ERR, VHOST_PORT, 549 "Failed to get MAC address on port %u: %s\n", 550 port, rte_strerror(-retval)); 551 return retval; 552 } 553 554 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices); 555 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8 556 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n", 557 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port])); 558 559 return 0; 560 } 561 562 /* 563 * Set socket file path. 
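 * Each --socket-file argument is appended to a flat array of PATH_MAX-byte
 * slots that is grown with realloc(); nb_sockets counts the registered paths
 * and doubles as the vhost "socket id" referenced by the --dmas option.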
564 */ 565 static int 566 us_vhost_parse_socket_path(const char *q_arg) 567 { 568 char *old; 569 570 /* parse number string */ 571 if (strnlen(q_arg, PATH_MAX) == PATH_MAX) 572 return -1; 573 574 old = socket_files; 575 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1)); 576 if (socket_files == NULL) { 577 free(old); 578 return -1; 579 } 580 581 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX); 582 nb_sockets++; 583 584 return 0; 585 } 586 587 /* 588 * Parse the portmask provided at run time. 589 */ 590 static int 591 parse_portmask(const char *portmask) 592 { 593 char *end = NULL; 594 unsigned long pm; 595 596 errno = 0; 597 598 /* parse hexadecimal string */ 599 pm = strtoul(portmask, &end, 16); 600 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 601 return 0; 602 603 return pm; 604 605 } 606 607 /* 608 * Parse num options at run time. 609 */ 610 static int 611 parse_num_opt(const char *q_arg, uint32_t max_valid_value) 612 { 613 char *end = NULL; 614 unsigned long num; 615 616 errno = 0; 617 618 /* parse unsigned int string */ 619 num = strtoul(q_arg, &end, 10); 620 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0)) 621 return -1; 622 623 if (num > max_valid_value) 624 return -1; 625 626 return num; 627 628 } 629 630 /* 631 * Display usage 632 */ 633 static void 634 us_vhost_usage(const char *prgname) 635 { 636 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n" 637 " --vm2vm [0|1|2]\n" 638 " --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n" 639 " --socket-file <path>\n" 640 " -p PORTMASK: Set mask for ports to be used by application\n" 641 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n" 642 " --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n" 643 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n" 644 " --rx-retry-num [0-N]: the number of retries on rx. 
This makes effect only if retries on rx enabled\n" 645 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n" 646 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n" 647 " --socket-file: The path of the socket file.\n" 648 " --tx-csum [0|1]: disable/enable TX checksum offload.\n" 649 " --tso [0|1]: disable/enable TCP segment offload.\n" 650 " --client: register a vhost-user socket as client mode.\n" 651 " --dmas: register dma channel for specific vhost device.\n" 652 " --total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n" 653 " --builtin-net-driver: enable simple vhost-user net driver\n", 654 prgname); 655 } 656 657 enum { 658 #define OPT_VM2VM "vm2vm" 659 OPT_VM2VM_NUM = 256, 660 #define OPT_RX_RETRY "rx-retry" 661 OPT_RX_RETRY_NUM, 662 #define OPT_RX_RETRY_DELAY "rx-retry-delay" 663 OPT_RX_RETRY_DELAY_NUM, 664 #define OPT_RX_RETRY_NUMB "rx-retry-num" 665 OPT_RX_RETRY_NUMB_NUM, 666 #define OPT_MERGEABLE "mergeable" 667 OPT_MERGEABLE_NUM, 668 #define OPT_STATS "stats" 669 OPT_STATS_NUM, 670 #define OPT_SOCKET_FILE "socket-file" 671 OPT_SOCKET_FILE_NUM, 672 #define OPT_TX_CSUM "tx-csum" 673 OPT_TX_CSUM_NUM, 674 #define OPT_TSO "tso" 675 OPT_TSO_NUM, 676 #define OPT_CLIENT "client" 677 OPT_CLIENT_NUM, 678 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver" 679 OPT_BUILTIN_NET_DRIVER_NUM, 680 #define OPT_DMAS "dmas" 681 OPT_DMAS_NUM, 682 #define OPT_NUM_MBUFS "total-num-mbufs" 683 OPT_NUM_MBUFS_NUM, 684 }; 685 686 /* 687 * Parse the arguments given in the command line of the application. 688 */ 689 static int 690 us_vhost_parse_args(int argc, char **argv) 691 { 692 int opt, ret; 693 int option_index; 694 unsigned i; 695 const char *prgname = argv[0]; 696 static struct option long_option[] = { 697 {OPT_VM2VM, required_argument, 698 NULL, OPT_VM2VM_NUM}, 699 {OPT_RX_RETRY, required_argument, 700 NULL, OPT_RX_RETRY_NUM}, 701 {OPT_RX_RETRY_DELAY, required_argument, 702 NULL, OPT_RX_RETRY_DELAY_NUM}, 703 {OPT_RX_RETRY_NUMB, required_argument, 704 NULL, OPT_RX_RETRY_NUMB_NUM}, 705 {OPT_MERGEABLE, required_argument, 706 NULL, OPT_MERGEABLE_NUM}, 707 {OPT_STATS, required_argument, 708 NULL, OPT_STATS_NUM}, 709 {OPT_SOCKET_FILE, required_argument, 710 NULL, OPT_SOCKET_FILE_NUM}, 711 {OPT_TX_CSUM, required_argument, 712 NULL, OPT_TX_CSUM_NUM}, 713 {OPT_TSO, required_argument, 714 NULL, OPT_TSO_NUM}, 715 {OPT_CLIENT, no_argument, 716 NULL, OPT_CLIENT_NUM}, 717 {OPT_BUILTIN_NET_DRIVER, no_argument, 718 NULL, OPT_BUILTIN_NET_DRIVER_NUM}, 719 {OPT_DMAS, required_argument, 720 NULL, OPT_DMAS_NUM}, 721 {OPT_NUM_MBUFS, required_argument, 722 NULL, OPT_NUM_MBUFS_NUM}, 723 {NULL, 0, 0, 0}, 724 }; 725 726 /* Parse command line */ 727 while ((opt = getopt_long(argc, argv, "p:P", 728 long_option, &option_index)) != EOF) { 729 switch (opt) { 730 /* Portmask */ 731 case 'p': 732 enabled_port_mask = parse_portmask(optarg); 733 if (enabled_port_mask == 0) { 734 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n"); 735 us_vhost_usage(prgname); 736 return -1; 737 } 738 break; 739 740 case 'P': 741 promiscuous = 1; 742 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode = 743 RTE_ETH_VMDQ_ACCEPT_BROADCAST | 744 RTE_ETH_VMDQ_ACCEPT_MULTICAST; 745 break; 746 747 case OPT_VM2VM_NUM: 748 ret = parse_num_opt(optarg, (VM2VM_LAST - 1)); 749 if (ret == -1) { 750 RTE_LOG(INFO, VHOST_CONFIG, 751 "Invalid argument for " 752 "vm2vm [0|1|2]\n"); 753 us_vhost_usage(prgname); 754 return -1; 755 } 756 vm2vm_mode = (vm2vm_type)ret; 757 break; 758 759 
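		/*
		 * Illustrative invocation only (core list, port mask and DMA
		 * device names below are assumptions, not taken from this
		 * file):
		 *
		 *   dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --client \
		 *       --socket-file /tmp/sock0 \
		 *       --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
		 *
		 * txdN/rxdN bind an enqueue/dequeue DMA channel to the vhost
		 * device created from the N-th --socket-file argument.
		 */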
case OPT_RX_RETRY_NUM: 760 ret = parse_num_opt(optarg, 1); 761 if (ret == -1) { 762 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n"); 763 us_vhost_usage(prgname); 764 return -1; 765 } 766 enable_retry = ret; 767 break; 768 769 case OPT_TX_CSUM_NUM: 770 ret = parse_num_opt(optarg, 1); 771 if (ret == -1) { 772 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n"); 773 us_vhost_usage(prgname); 774 return -1; 775 } 776 enable_tx_csum = ret; 777 break; 778 779 case OPT_TSO_NUM: 780 ret = parse_num_opt(optarg, 1); 781 if (ret == -1) { 782 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n"); 783 us_vhost_usage(prgname); 784 return -1; 785 } 786 enable_tso = ret; 787 break; 788 789 case OPT_RX_RETRY_DELAY_NUM: 790 ret = parse_num_opt(optarg, INT32_MAX); 791 if (ret == -1) { 792 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n"); 793 us_vhost_usage(prgname); 794 return -1; 795 } 796 burst_rx_delay_time = ret; 797 break; 798 799 case OPT_RX_RETRY_NUMB_NUM: 800 ret = parse_num_opt(optarg, INT32_MAX); 801 if (ret == -1) { 802 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n"); 803 us_vhost_usage(prgname); 804 return -1; 805 } 806 burst_rx_retry_num = ret; 807 break; 808 809 case OPT_MERGEABLE_NUM: 810 ret = parse_num_opt(optarg, 1); 811 if (ret == -1) { 812 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n"); 813 us_vhost_usage(prgname); 814 return -1; 815 } 816 mergeable = !!ret; 817 break; 818 819 case OPT_STATS_NUM: 820 ret = parse_num_opt(optarg, INT32_MAX); 821 if (ret == -1) { 822 RTE_LOG(INFO, VHOST_CONFIG, 823 "Invalid argument for stats [0..N]\n"); 824 us_vhost_usage(prgname); 825 return -1; 826 } 827 enable_stats = ret; 828 break; 829 830 /* Set socket file path. */ 831 case OPT_SOCKET_FILE_NUM: 832 if (us_vhost_parse_socket_path(optarg) == -1) { 833 RTE_LOG(INFO, VHOST_CONFIG, 834 "Invalid argument for socket name (Max %d characters)\n", 835 PATH_MAX); 836 us_vhost_usage(prgname); 837 return -1; 838 } 839 break; 840 841 case OPT_DMAS_NUM: 842 if (open_dma(optarg) == -1) { 843 RTE_LOG(INFO, VHOST_CONFIG, 844 "Wrong DMA args\n"); 845 us_vhost_usage(prgname); 846 return -1; 847 } 848 break; 849 850 case OPT_NUM_MBUFS_NUM: 851 ret = parse_num_opt(optarg, INT32_MAX); 852 if (ret == -1) { 853 RTE_LOG(INFO, VHOST_CONFIG, 854 "Invalid argument for total-num-mbufs [0..N]\n"); 855 us_vhost_usage(prgname); 856 return -1; 857 } 858 859 if (total_num_mbufs < ret) 860 total_num_mbufs = ret; 861 break; 862 863 case OPT_CLIENT_NUM: 864 client_mode = 1; 865 break; 866 867 case OPT_BUILTIN_NET_DRIVER_NUM: 868 builtin_net_driver = 1; 869 break; 870 871 /* Invalid option - print options. 
*/ 872 default: 873 us_vhost_usage(prgname); 874 return -1; 875 } 876 } 877 878 for (i = 0; i < RTE_MAX_ETHPORTS; i++) { 879 if (enabled_port_mask & (1 << i)) 880 ports[num_ports++] = i; 881 } 882 883 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) { 884 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 885 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 886 return -1; 887 } 888 889 return 0; 890 } 891 892 /* 893 * Update the global var NUM_PORTS and array PORTS according to system ports number 894 * and return valid ports number 895 */ 896 static unsigned check_ports_num(unsigned nb_ports) 897 { 898 unsigned valid_num_ports = num_ports; 899 unsigned portid; 900 901 if (num_ports > nb_ports) { 902 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n", 903 num_ports, nb_ports); 904 num_ports = nb_ports; 905 } 906 907 for (portid = 0; portid < num_ports; portid ++) { 908 if (!rte_eth_dev_is_valid_port(ports[portid])) { 909 RTE_LOG(INFO, VHOST_PORT, 910 "\nSpecified port ID(%u) is not valid\n", 911 ports[portid]); 912 ports[portid] = INVALID_PORT_ID; 913 valid_num_ports--; 914 } 915 } 916 return valid_num_ports; 917 } 918 919 static __rte_always_inline struct vhost_dev * 920 find_vhost_dev(struct rte_ether_addr *mac) 921 { 922 struct vhost_dev *vdev; 923 924 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 925 if (vdev->ready == DEVICE_RX && 926 rte_is_same_ether_addr(mac, &vdev->mac_address)) 927 return vdev; 928 } 929 930 return NULL; 931 } 932 933 /* 934 * This function learns the MAC address of the device and registers this along with a 935 * vlan tag to a VMDQ. 936 */ 937 static int 938 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m) 939 { 940 struct rte_ether_hdr *pkt_hdr; 941 int i, ret; 942 943 /* Learn MAC address of guest device from packet */ 944 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 945 946 if (find_vhost_dev(&pkt_hdr->src_addr)) { 947 RTE_LOG(ERR, VHOST_DATA, 948 "(%d) device is using a registered MAC!\n", 949 vdev->vid); 950 return -1; 951 } 952 953 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++) 954 vdev->mac_address.addr_bytes[i] = 955 pkt_hdr->src_addr.addr_bytes[i]; 956 957 /* vlan_tag currently uses the device_id. */ 958 vdev->vlan_tag = vlan_tags[vdev->vid]; 959 960 /* Print out VMDQ registration info. */ 961 RTE_LOG(INFO, VHOST_DATA, 962 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n", 963 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address), 964 vdev->vlan_tag); 965 966 /* Register the MAC address. */ 967 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, 968 (uint32_t)vdev->vid + vmdq_pool_base); 969 if (ret) 970 RTE_LOG(ERR, VHOST_DATA, 971 "(%d) failed to add device MAC address to VMDQ\n", 972 vdev->vid); 973 974 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1); 975 976 /* Set device as ready for RX. */ 977 vdev->ready = DEVICE_RX; 978 979 return 0; 980 } 981 982 /* 983 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX 984 * queue before disabling RX on the device. 
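 * The VMDq Rx queue is drained with repeated rte_eth_rx_burst() calls and the
 * received mbufs are freed, after which the device drops back to the
 * DEVICE_MAC_LEARNING state so it can be re-registered later.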
985 */ 986 static inline void 987 unlink_vmdq(struct vhost_dev *vdev) 988 { 989 unsigned i = 0; 990 unsigned rx_count; 991 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 992 993 if (vdev->ready == DEVICE_RX) { 994 /*clear MAC and VLAN settings*/ 995 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address); 996 for (i = 0; i < 6; i++) 997 vdev->mac_address.addr_bytes[i] = 0; 998 999 vdev->vlan_tag = 0; 1000 1001 /*Clear out the receive buffers*/ 1002 rx_count = rte_eth_rx_burst(ports[0], 1003 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1004 1005 while (rx_count) { 1006 for (i = 0; i < rx_count; i++) 1007 rte_pktmbuf_free(pkts_burst[i]); 1008 1009 rx_count = rte_eth_rx_burst(ports[0], 1010 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); 1011 } 1012 1013 vdev->ready = DEVICE_MAC_LEARNING; 1014 } 1015 } 1016 1017 static inline void 1018 free_pkts(struct rte_mbuf **pkts, uint16_t n) 1019 { 1020 while (n--) 1021 rte_pktmbuf_free(pkts[n]); 1022 } 1023 1024 static __rte_always_inline void 1025 complete_async_pkts(struct vhost_dev *vdev) 1026 { 1027 struct rte_mbuf *p_cpl[MAX_PKT_BURST]; 1028 uint16_t complete_count; 1029 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id; 1030 1031 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid, 1032 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0); 1033 if (complete_count) 1034 free_pkts(p_cpl, complete_count); 1035 1036 } 1037 1038 static __rte_always_inline void 1039 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev, 1040 struct rte_mbuf *m) 1041 { 1042 uint16_t ret; 1043 1044 if (builtin_net_driver) { 1045 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1); 1046 } else { 1047 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1); 1048 } 1049 1050 if (enable_stats) { 1051 __atomic_fetch_add(&dst_vdev->stats.rx_total_atomic, 1, 1052 __ATOMIC_SEQ_CST); 1053 __atomic_fetch_add(&dst_vdev->stats.rx_atomic, ret, 1054 __ATOMIC_SEQ_CST); 1055 src_vdev->stats.tx_total++; 1056 src_vdev->stats.tx += ret; 1057 } 1058 } 1059 1060 static __rte_always_inline void 1061 drain_vhost(struct vhost_dev *vdev) 1062 { 1063 uint16_t ret; 1064 uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid; 1065 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len; 1066 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table; 1067 1068 ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit); 1069 1070 if (enable_stats) { 1071 __atomic_fetch_add(&vdev->stats.rx_total_atomic, nr_xmit, 1072 __ATOMIC_SEQ_CST); 1073 __atomic_fetch_add(&vdev->stats.rx_atomic, ret, 1074 __ATOMIC_SEQ_CST); 1075 } 1076 1077 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) { 1078 free_pkts(m, nr_xmit); 1079 } else { 1080 uint16_t enqueue_fail = nr_xmit - ret; 1081 if (enqueue_fail > 0) 1082 free_pkts(&m[ret], enqueue_fail); 1083 } 1084 } 1085 1086 static __rte_always_inline void 1087 drain_vhost_table(void) 1088 { 1089 uint16_t lcore_id = rte_lcore_id(); 1090 struct vhost_bufftable *vhost_txq; 1091 struct vhost_dev *vdev; 1092 uint64_t cur_tsc; 1093 1094 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1095 if (unlikely(vdev->remove == 1)) 1096 continue; 1097 1098 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid]; 1099 1100 cur_tsc = rte_rdtsc(); 1101 if (unlikely(cur_tsc - vhost_txq->pre_tsc 1102 > MBUF_TABLE_DRAIN_TSC)) { 1103 RTE_LOG_DP(DEBUG, VHOST_DATA, 1104 "Vhost TX queue drained after timeout with burst size %u\n", 1105 vhost_txq->len); 1106 
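			/* Timeout expired: flush whatever this device has buffered. */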
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so,
 * put the packet on that device's RX queue. If not, return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM and, if so,
 * return its VLAN tag and the length offset to restore.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip reduces the packet length by the size of the VLAN
	 * tag, so the length has to be restored by adding it back.
1189 */ 1190 *offset = RTE_VLAN_HLEN; 1191 *vlan_tag = vlan_tags[vdev->vid]; 1192 1193 RTE_LOG_DP(DEBUG, VHOST_DATA, 1194 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n", 1195 vdev->vid, dst_vdev->vid, *vlan_tag); 1196 1197 return 0; 1198 } 1199 1200 static void virtio_tx_offload(struct rte_mbuf *m) 1201 { 1202 struct rte_net_hdr_lens hdr_lens; 1203 struct rte_ipv4_hdr *ipv4_hdr; 1204 struct rte_tcp_hdr *tcp_hdr; 1205 uint32_t ptype; 1206 void *l3_hdr; 1207 1208 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); 1209 m->l2_len = hdr_lens.l2_len; 1210 m->l3_len = hdr_lens.l3_len; 1211 m->l4_len = hdr_lens.l4_len; 1212 1213 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len); 1214 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *, 1215 m->l2_len + m->l3_len); 1216 1217 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG; 1218 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) { 1219 m->ol_flags |= RTE_MBUF_F_TX_IPV4; 1220 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; 1221 ipv4_hdr = l3_hdr; 1222 ipv4_hdr->hdr_checksum = 0; 1223 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags); 1224 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */ 1225 m->ol_flags |= RTE_MBUF_F_TX_IPV6; 1226 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags); 1227 } 1228 } 1229 1230 static __rte_always_inline void 1231 do_drain_mbuf_table(struct mbuf_table *tx_q) 1232 { 1233 uint16_t count; 1234 1235 count = rte_eth_tx_burst(ports[0], tx_q->txq_id, 1236 tx_q->m_table, tx_q->len); 1237 if (unlikely(count < tx_q->len)) 1238 free_pkts(&tx_q->m_table[count], tx_q->len - count); 1239 1240 tx_q->len = 0; 1241 } 1242 1243 /* 1244 * This function routes the TX packet to the correct interface. This 1245 * may be a local device or the physical port. 1246 */ 1247 static __rte_always_inline void 1248 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag) 1249 { 1250 struct mbuf_table *tx_q; 1251 unsigned offset = 0; 1252 const uint16_t lcore_id = rte_lcore_id(); 1253 struct rte_ether_hdr *nh; 1254 1255 1256 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1257 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) { 1258 struct vhost_dev *vdev2; 1259 1260 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) { 1261 if (vdev2 != vdev) 1262 sync_virtio_xmit(vdev2, vdev, m); 1263 } 1264 goto queue2nic; 1265 } 1266 1267 /*check if destination is local VM*/ 1268 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) 1269 return; 1270 1271 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1272 if (unlikely(find_local_dest(vdev, m, &offset, 1273 &vlan_tag) != 0)) { 1274 rte_pktmbuf_free(m); 1275 return; 1276 } 1277 } 1278 1279 RTE_LOG_DP(DEBUG, VHOST_DATA, 1280 "(%d) TX: MAC address is external\n", vdev->vid); 1281 1282 queue2nic: 1283 1284 /*Add packet to the port tx queue*/ 1285 tx_q = &lcore_tx_queue[lcore_id]; 1286 1287 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 1288 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) { 1289 /* Guest has inserted the vlan tag. */ 1290 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1); 1291 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag); 1292 if ((vm2vm_mode == VM2VM_HARDWARE) && 1293 (vh->vlan_tci != vlan_tag_be)) 1294 vh->vlan_tci = vlan_tag_be; 1295 } else { 1296 m->ol_flags |= RTE_MBUF_F_TX_VLAN; 1297 1298 /* 1299 * Find the right seg to adjust the data len when offset is 1300 * bigger than tail room size. 
1301 */ 1302 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) { 1303 if (likely(offset <= rte_pktmbuf_tailroom(m))) 1304 m->data_len += offset; 1305 else { 1306 struct rte_mbuf *seg = m; 1307 1308 while ((seg->next != NULL) && 1309 (offset > rte_pktmbuf_tailroom(seg))) 1310 seg = seg->next; 1311 1312 seg->data_len += offset; 1313 } 1314 m->pkt_len += offset; 1315 } 1316 1317 m->vlan_tci = vlan_tag; 1318 } 1319 1320 if (m->ol_flags & RTE_MBUF_F_RX_LRO) 1321 virtio_tx_offload(m); 1322 1323 tx_q->m_table[tx_q->len++] = m; 1324 if (enable_stats) { 1325 vdev->stats.tx_total++; 1326 vdev->stats.tx++; 1327 } 1328 1329 if (unlikely(tx_q->len == MAX_PKT_BURST)) 1330 do_drain_mbuf_table(tx_q); 1331 } 1332 1333 1334 static __rte_always_inline void 1335 drain_mbuf_table(struct mbuf_table *tx_q) 1336 { 1337 static uint64_t prev_tsc; 1338 uint64_t cur_tsc; 1339 1340 if (tx_q->len == 0) 1341 return; 1342 1343 cur_tsc = rte_rdtsc(); 1344 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) { 1345 prev_tsc = cur_tsc; 1346 1347 RTE_LOG_DP(DEBUG, VHOST_DATA, 1348 "TX queue drained after timeout with burst size %u\n", 1349 tx_q->len); 1350 do_drain_mbuf_table(tx_q); 1351 } 1352 } 1353 1354 uint16_t 1355 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1356 struct rte_mbuf **pkts, uint32_t rx_count) 1357 { 1358 uint16_t enqueue_count; 1359 uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id; 1360 1361 complete_async_pkts(dev); 1362 enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id, 1363 pkts, rx_count, dma_id, 0); 1364 1365 return enqueue_count; 1366 } 1367 1368 uint16_t 1369 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1370 struct rte_mbuf **pkts, uint32_t rx_count) 1371 { 1372 return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count); 1373 } 1374 1375 static __rte_always_inline void 1376 drain_eth_rx(struct vhost_dev *vdev) 1377 { 1378 uint16_t rx_count, enqueue_count; 1379 struct rte_mbuf *pkts[MAX_PKT_BURST]; 1380 1381 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q, 1382 pkts, MAX_PKT_BURST); 1383 1384 if (!rx_count) 1385 return; 1386 1387 enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, 1388 VIRTIO_RXQ, pkts, rx_count); 1389 1390 /* Retry if necessary */ 1391 if (enable_retry && unlikely(enqueue_count < rx_count)) { 1392 uint32_t retry = 0; 1393 1394 while (enqueue_count < rx_count && retry++ < burst_rx_retry_num) { 1395 rte_delay_us(burst_rx_delay_time); 1396 enqueue_count += vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, 1397 VIRTIO_RXQ, &pkts[enqueue_count], 1398 rx_count - enqueue_count); 1399 } 1400 } 1401 1402 if (enable_stats) { 1403 __atomic_fetch_add(&vdev->stats.rx_total_atomic, rx_count, 1404 __ATOMIC_SEQ_CST); 1405 __atomic_fetch_add(&vdev->stats.rx_atomic, enqueue_count, 1406 __ATOMIC_SEQ_CST); 1407 } 1408 1409 if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled) { 1410 free_pkts(pkts, rx_count); 1411 } else { 1412 uint16_t enqueue_fail = rx_count - enqueue_count; 1413 if (enqueue_fail > 0) 1414 free_pkts(&pkts[enqueue_count], enqueue_fail); 1415 } 1416 } 1417 1418 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, 1419 struct rte_mempool *mbuf_pool, 1420 struct rte_mbuf **pkts, uint16_t count) 1421 { 1422 int nr_inflight; 1423 uint16_t dequeue_count; 1424 int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id; 1425 1426 dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id, 1427 mbuf_pool, pkts, count, &nr_inflight, 
			dma_id, 0);

	return dequeue_count;
}

uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			   struct rte_mempool *mbuf_pool,
			   struct rte_mbuf **pkts, uint16_t count)
{
	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *   - drain_eth_rx()
 *
 *     Which drains the host eth Rx queue linked to the vhost device
 *     and delivers the packets to the guest virtio Rx ring associated
 *     with this vhost device.
 *
 *   - drain_virtio_tx()
 *
 *     Which drains the guest virtio Tx queue and delivers the packets
 *     to their target, which could be another vhost device or the
 *     physical eth dev. The routing is done in virtio_tx_route().
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use, if requested.
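		 * destroy_device() sets REQUEST_DEV_REMOVAL on every worker and
		 * spins until the worker acknowledges below, which guarantees
		 * the worker no longer holds a reference to the device being
		 * removed.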
1500 */ 1501 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL) 1502 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL; 1503 1504 /* 1505 * Process vhost devices 1506 */ 1507 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, 1508 lcore_vdev_entry) { 1509 if (unlikely(vdev->remove)) { 1510 unlink_vmdq(vdev); 1511 vdev->ready = DEVICE_SAFE_REMOVE; 1512 continue; 1513 } 1514 1515 if (likely(vdev->ready == DEVICE_RX)) 1516 drain_eth_rx(vdev); 1517 1518 if (likely(!vdev->remove)) 1519 drain_virtio_tx(vdev); 1520 } 1521 } 1522 1523 return 0; 1524 } 1525 1526 static void 1527 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id) 1528 { 1529 uint16_t n_pkt = 0; 1530 int pkts_inflight; 1531 1532 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id; 1533 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id); 1534 1535 struct rte_mbuf *m_cpl[pkts_inflight]; 1536 1537 while (pkts_inflight) { 1538 n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl, 1539 pkts_inflight, dma_id, 0); 1540 free_pkts(m_cpl, n_pkt); 1541 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, 1542 queue_id); 1543 } 1544 } 1545 1546 static void 1547 vhost_clear_queue(struct vhost_dev *vdev, uint16_t queue_id) 1548 { 1549 uint16_t n_pkt = 0; 1550 int pkts_inflight; 1551 1552 int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id; 1553 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id); 1554 1555 struct rte_mbuf *m_cpl[pkts_inflight]; 1556 1557 while (pkts_inflight) { 1558 n_pkt = rte_vhost_clear_queue(vdev->vid, queue_id, m_cpl, 1559 pkts_inflight, dma_id, 0); 1560 free_pkts(m_cpl, n_pkt); 1561 pkts_inflight = rte_vhost_async_get_inflight(vdev->vid, queue_id); 1562 } 1563 } 1564 1565 /* 1566 * Remove a device from the specific data core linked list and from the 1567 * main linked list. Synchronization occurs through the use of the 1568 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering 1569 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop. 1570 */ 1571 static void 1572 destroy_device(int vid) 1573 { 1574 struct vhost_dev *vdev = NULL; 1575 int lcore; 1576 uint16_t i; 1577 1578 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1579 if (vdev->vid == vid) 1580 break; 1581 } 1582 if (!vdev) 1583 return; 1584 /*set the remove flag. */ 1585 vdev->remove = 1; 1586 while(vdev->ready != DEVICE_SAFE_REMOVE) { 1587 rte_pause(); 1588 } 1589 1590 for (i = 0; i < RTE_MAX_LCORE; i++) 1591 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]); 1592 1593 if (builtin_net_driver) 1594 vs_vhost_net_remove(vdev); 1595 1596 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, 1597 lcore_vdev_entry); 1598 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry); 1599 1600 1601 /* Set the dev_removal_flag on each lcore. */ 1602 RTE_LCORE_FOREACH_WORKER(lcore) 1603 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL; 1604 1605 /* 1606 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL 1607 * we can be sure that they can no longer access the device removed 1608 * from the linked lists and that the devices are no longer in use. 
1609 */ 1610 RTE_LCORE_FOREACH_WORKER(lcore) { 1611 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL) 1612 rte_pause(); 1613 } 1614 1615 lcore_info[vdev->coreid].device_num--; 1616 1617 RTE_LOG(INFO, VHOST_DATA, 1618 "(%d) device has been removed from data core\n", 1619 vdev->vid); 1620 1621 if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) { 1622 vhost_clear_queue(vdev, VIRTIO_RXQ); 1623 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ); 1624 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false; 1625 } 1626 1627 if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) { 1628 vhost_clear_queue(vdev, VIRTIO_TXQ); 1629 rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ); 1630 dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false; 1631 } 1632 1633 rte_free(vdev); 1634 } 1635 1636 static inline int 1637 get_socketid_by_vid(int vid) 1638 { 1639 int i; 1640 char ifname[PATH_MAX]; 1641 rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); 1642 1643 for (i = 0; i < nb_sockets; i++) { 1644 char *file = socket_files + i * PATH_MAX; 1645 if (strcmp(file, ifname) == 0) 1646 return i; 1647 } 1648 1649 return -1; 1650 } 1651 1652 static int 1653 init_vhost_queue_ops(int vid) 1654 { 1655 if (builtin_net_driver) { 1656 vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts; 1657 vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts; 1658 } else { 1659 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled) 1660 vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts; 1661 else 1662 vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts; 1663 1664 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled) 1665 vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts; 1666 else 1667 vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts; 1668 } 1669 1670 return 0; 1671 } 1672 1673 static inline int 1674 vhost_async_channel_register(int vid) 1675 { 1676 int rx_ret = 0, tx_ret = 0; 1677 1678 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) { 1679 rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ); 1680 if (rx_ret == 0) 1681 dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true; 1682 } 1683 1684 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) { 1685 tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ); 1686 if (tx_ret == 0) 1687 dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true; 1688 } 1689 1690 return rx_ret | tx_ret; 1691 } 1692 1693 1694 1695 /* 1696 * A new device is added to a data core. First the device is added to the main linked list 1697 * and then allocated to a specific data core. 
1698 */ 1699 static int 1700 new_device(int vid) 1701 { 1702 int lcore, core_add = 0; 1703 uint16_t i; 1704 uint32_t device_num_min = num_devices; 1705 struct vhost_dev *vdev; 1706 int ret; 1707 1708 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE); 1709 if (vdev == NULL) { 1710 RTE_LOG(INFO, VHOST_DATA, 1711 "(%d) couldn't allocate memory for vhost dev\n", 1712 vid); 1713 return -1; 1714 } 1715 vdev->vid = vid; 1716 1717 for (i = 0; i < RTE_MAX_LCORE; i++) { 1718 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] 1719 = rte_zmalloc("vhost bufftable", 1720 sizeof(struct vhost_bufftable), 1721 RTE_CACHE_LINE_SIZE); 1722 1723 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) { 1724 RTE_LOG(INFO, VHOST_DATA, 1725 "(%d) couldn't allocate memory for vhost TX\n", vid); 1726 return -1; 1727 } 1728 } 1729 1730 int socketid = get_socketid_by_vid(vid); 1731 if (socketid == -1) 1732 return -1; 1733 1734 init_vid2socketid_array(vid, socketid); 1735 1736 ret = vhost_async_channel_register(vid); 1737 1738 if (init_vhost_queue_ops(vid) != 0) 1739 return -1; 1740 1741 if (builtin_net_driver) 1742 vs_vhost_net_setup(vdev); 1743 1744 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry); 1745 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base; 1746 1747 /*reset ready flag*/ 1748 vdev->ready = DEVICE_MAC_LEARNING; 1749 vdev->remove = 0; 1750 1751 /* Find a suitable lcore to add the device. */ 1752 RTE_LCORE_FOREACH_WORKER(lcore) { 1753 if (lcore_info[lcore].device_num < device_num_min) { 1754 device_num_min = lcore_info[lcore].device_num; 1755 core_add = lcore; 1756 } 1757 } 1758 vdev->coreid = core_add; 1759 1760 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, 1761 lcore_vdev_entry); 1762 lcore_info[vdev->coreid].device_num++; 1763 1764 /* Disable notifications. */ 1765 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); 1766 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); 1767 1768 RTE_LOG(INFO, VHOST_DATA, 1769 "(%d) device has been added to data core %d\n", 1770 vid, vdev->coreid); 1771 1772 return ret; 1773 } 1774 1775 static int 1776 vring_state_changed(int vid, uint16_t queue_id, int enable) 1777 { 1778 struct vhost_dev *vdev = NULL; 1779 1780 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1781 if (vdev->vid == vid) 1782 break; 1783 } 1784 if (!vdev) 1785 return -1; 1786 1787 if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) { 1788 if (!enable) 1789 vhost_clear_queue_thread_unsafe(vdev, queue_id); 1790 } 1791 1792 return 0; 1793 } 1794 1795 /* 1796 * These callback allow devices to be added to the data core when configuration 1797 * has been fully complete. 1798 */ 1799 static const struct rte_vhost_device_ops virtio_net_device_ops = 1800 { 1801 .new_device = new_device, 1802 .destroy_device = destroy_device, 1803 .vring_state_changed = vring_state_changed, 1804 }; 1805 1806 /* 1807 * This is a thread will wake up after a period to print stats if the user has 1808 * enabled them. 
1809 */ 1810 static void * 1811 print_stats(__rte_unused void *arg) 1812 { 1813 struct vhost_dev *vdev; 1814 uint64_t tx_dropped, rx_dropped; 1815 uint64_t tx, tx_total, rx, rx_total; 1816 const char clr[] = { 27, '[', '2', 'J', '\0' }; 1817 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' }; 1818 1819 while(1) { 1820 sleep(enable_stats); 1821 1822 /* Clear screen and move to top left */ 1823 printf("%s%s\n", clr, top_left); 1824 printf("Device statistics =================================\n"); 1825 1826 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) { 1827 tx_total = vdev->stats.tx_total; 1828 tx = vdev->stats.tx; 1829 tx_dropped = tx_total - tx; 1830 1831 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic, 1832 __ATOMIC_SEQ_CST); 1833 rx = __atomic_load_n(&vdev->stats.rx_atomic, 1834 __ATOMIC_SEQ_CST); 1835 rx_dropped = rx_total - rx; 1836 1837 printf("Statistics for device %d\n" 1838 "-----------------------\n" 1839 "TX total: %" PRIu64 "\n" 1840 "TX dropped: %" PRIu64 "\n" 1841 "TX successful: %" PRIu64 "\n" 1842 "RX total: %" PRIu64 "\n" 1843 "RX dropped: %" PRIu64 "\n" 1844 "RX successful: %" PRIu64 "\n", 1845 vdev->vid, 1846 tx_total, tx_dropped, tx, 1847 rx_total, rx_dropped, rx); 1848 } 1849 1850 printf("===================================================\n"); 1851 1852 fflush(stdout); 1853 } 1854 1855 return NULL; 1856 } 1857 1858 static void 1859 unregister_drivers(int socket_num) 1860 { 1861 int i, ret; 1862 1863 for (i = 0; i < socket_num; i++) { 1864 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX); 1865 if (ret != 0) 1866 RTE_LOG(ERR, VHOST_CONFIG, 1867 "Fail to unregister vhost driver for %s.\n", 1868 socket_files + i * PATH_MAX); 1869 } 1870 } 1871 1872 /* When we receive a INT signal, unregister vhost driver */ 1873 static void 1874 sigint_handler(__rte_unused int signum) 1875 { 1876 /* Unregister vhost driver. */ 1877 unregister_drivers(nb_sockets); 1878 1879 exit(0); 1880 } 1881 1882 static void 1883 reset_dma(void) 1884 { 1885 int i; 1886 1887 for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) { 1888 int j; 1889 1890 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) { 1891 dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID; 1892 dma_bind[i].dmas[j].async_enabled = false; 1893 } 1894 } 1895 1896 for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) 1897 dmas_id[i] = INVALID_DMA_ID; 1898 } 1899 1900 /* 1901 * Main function, does initialisation and calls the per-lcore functions. 1902 */ 1903 int 1904 main(int argc, char *argv[]) 1905 { 1906 unsigned lcore_id, core_id = 0; 1907 unsigned nb_ports, valid_num_ports; 1908 int ret, i; 1909 uint16_t portid; 1910 static pthread_t tid; 1911 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS; 1912 1913 signal(SIGINT, sigint_handler); 1914 1915 /* init EAL */ 1916 ret = rte_eal_init(argc, argv); 1917 if (ret < 0) 1918 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 1919 argc -= ret; 1920 argv += ret; 1921 1922 /* initialize dma structures */ 1923 reset_dma(); 1924 1925 /* parse app arguments */ 1926 ret = us_vhost_parse_args(argc, argv); 1927 if (ret < 0) 1928 rte_exit(EXIT_FAILURE, "Invalid argument\n"); 1929 1930 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 1931 TAILQ_INIT(&lcore_info[lcore_id].vdev_list); 1932 1933 if (rte_lcore_is_enabled(lcore_id)) 1934 lcore_ids[core_id++] = lcore_id; 1935 } 1936 1937 if (rte_lcore_count() > RTE_MAX_LCORE) 1938 rte_exit(EXIT_FAILURE,"Not enough cores\n"); 1939 1940 /* Get the number of physical ports. 
*/ 1941 nb_ports = rte_eth_dev_count_avail(); 1942 1943 /* 1944 * Update the global var NUM_PORTS and global array PORTS 1945 * and get value of var VALID_NUM_PORTS according to system ports number 1946 */ 1947 valid_num_ports = check_ports_num(nb_ports); 1948 1949 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) { 1950 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u," 1951 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS); 1952 return -1; 1953 } 1954 1955 /* 1956 * FIXME: here we are trying to allocate mbufs big enough for 1957 * @MAX_QUEUES, but the truth is we're never going to use that 1958 * many queues here. We probably should only do allocation for 1959 * those queues we are going to use. 1960 */ 1961 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs, 1962 MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE, 1963 rte_socket_id()); 1964 if (mbuf_pool == NULL) 1965 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); 1966 1967 if (vm2vm_mode == VM2VM_HARDWARE) { 1968 /* Enable VT loop back to let L2 switch to do it. */ 1969 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1; 1970 RTE_LOG(DEBUG, VHOST_CONFIG, 1971 "Enable loop back for L2 switch in vmdq.\n"); 1972 } 1973 1974 /* initialize all ports */ 1975 RTE_ETH_FOREACH_DEV(portid) { 1976 /* skip ports that are not enabled */ 1977 if ((enabled_port_mask & (1 << portid)) == 0) { 1978 RTE_LOG(INFO, VHOST_PORT, 1979 "Skipping disabled port %d\n", portid); 1980 continue; 1981 } 1982 if (port_init(portid) != 0) 1983 rte_exit(EXIT_FAILURE, 1984 "Cannot initialize network ports\n"); 1985 } 1986 1987 /* Enable stats if the user option is set. */ 1988 if (enable_stats) { 1989 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL, 1990 print_stats, NULL); 1991 if (ret < 0) 1992 rte_exit(EXIT_FAILURE, 1993 "Cannot create print-stats thread\n"); 1994 } 1995 1996 /* Launch all data cores. */ 1997 RTE_LCORE_FOREACH_WORKER(lcore_id) 1998 rte_eal_remote_launch(switch_worker, NULL, lcore_id); 1999 2000 if (client_mode) 2001 flags |= RTE_VHOST_USER_CLIENT; 2002 2003 for (i = 0; i < dma_count; i++) { 2004 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) { 2005 RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n"); 2006 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n"); 2007 } 2008 } 2009 2010 /* Register vhost user driver to handle vhost messages. 
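	 * One vhost-user socket is registered per --socket-file path. The flags
	 * add client mode and, when a DMA channel was bound to the socket,
	 * async copy; feature bits are then trimmed according to --mergeable,
	 * --tx-csum and --tso before the callbacks are installed and the
	 * driver is started.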
*/ 2011 for (i = 0; i < nb_sockets; i++) { 2012 char *file = socket_files + i * PATH_MAX; 2013 2014 if (dma_count && get_async_flag_by_socketid(i) != 0) 2015 flags = flags | RTE_VHOST_USER_ASYNC_COPY; 2016 2017 ret = rte_vhost_driver_register(file, flags); 2018 if (ret != 0) { 2019 unregister_drivers(i); 2020 rte_exit(EXIT_FAILURE, 2021 "vhost driver register failure.\n"); 2022 } 2023 2024 if (builtin_net_driver) 2025 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES); 2026 2027 if (mergeable == 0) { 2028 rte_vhost_driver_disable_features(file, 2029 1ULL << VIRTIO_NET_F_MRG_RXBUF); 2030 } 2031 2032 if (enable_tx_csum == 0) { 2033 rte_vhost_driver_disable_features(file, 2034 1ULL << VIRTIO_NET_F_CSUM); 2035 } 2036 2037 if (enable_tso == 0) { 2038 rte_vhost_driver_disable_features(file, 2039 1ULL << VIRTIO_NET_F_HOST_TSO4); 2040 rte_vhost_driver_disable_features(file, 2041 1ULL << VIRTIO_NET_F_HOST_TSO6); 2042 rte_vhost_driver_disable_features(file, 2043 1ULL << VIRTIO_NET_F_GUEST_TSO4); 2044 rte_vhost_driver_disable_features(file, 2045 1ULL << VIRTIO_NET_F_GUEST_TSO6); 2046 } 2047 2048 if (promiscuous) { 2049 rte_vhost_driver_enable_features(file, 2050 1ULL << VIRTIO_NET_F_CTRL_RX); 2051 } 2052 2053 ret = rte_vhost_driver_callback_register(file, 2054 &virtio_net_device_ops); 2055 if (ret != 0) { 2056 rte_exit(EXIT_FAILURE, 2057 "failed to register vhost driver callbacks.\n"); 2058 } 2059 2060 if (rte_vhost_driver_start(file) < 0) { 2061 rte_exit(EXIT_FAILURE, 2062 "failed to start vhost driver.\n"); 2063 } 2064 } 2065 2066 RTE_LCORE_FOREACH_WORKER(lcore_id) 2067 rte_eal_wait_lcore(lcore_id); 2068 2069 for (i = 0; i < dma_count; i++) { 2070 if (rte_vhost_async_dma_unconfigure(dmas_id[i], 0) < 0) { 2071 RTE_LOG(ERR, VHOST_PORT, 2072 "Failed to unconfigure DMA %d in vhost.\n", dmas_id[i]); 2073 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n"); 2074 } 2075 } 2076 2077 /* clean up the EAL */ 2078 rte_eal_cleanup(); 2079 2080 return 0; 2081 } 2082