1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) 2016 IGEL Co., Ltd. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of IGEL Co.,Ltd. nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 #include <unistd.h> 34 #include <pthread.h> 35 #include <stdbool.h> 36 #ifdef RTE_LIBRTE_VHOST_NUMA 37 #include <numaif.h> 38 #endif 39 40 #include <rte_mbuf.h> 41 #include <rte_ethdev.h> 42 #include <rte_malloc.h> 43 #include <rte_memcpy.h> 44 #include <rte_vdev.h> 45 #include <rte_kvargs.h> 46 #include <rte_virtio_net.h> 47 #include <rte_spinlock.h> 48 49 #include "rte_eth_vhost.h" 50 51 #define ETH_VHOST_IFACE_ARG "iface" 52 #define ETH_VHOST_QUEUES_ARG "queues" 53 #define ETH_VHOST_CLIENT_ARG "client" 54 #define ETH_VHOST_DEQUEUE_ZERO_COPY "dequeue-zero-copy" 55 56 static const char *drivername = "VHOST PMD"; 57 58 static const char *valid_arguments[] = { 59 ETH_VHOST_IFACE_ARG, 60 ETH_VHOST_QUEUES_ARG, 61 ETH_VHOST_CLIENT_ARG, 62 ETH_VHOST_DEQUEUE_ZERO_COPY, 63 NULL 64 }; 65 66 static struct ether_addr base_eth_addr = { 67 .addr_bytes = { 68 0x56 /* V */, 69 0x48 /* H */, 70 0x4F /* O */, 71 0x53 /* S */, 72 0x54 /* T */, 73 0x00 74 } 75 }; 76 77 enum vhost_xstats_pkts { 78 VHOST_UNDERSIZE_PKT = 0, 79 VHOST_64_PKT, 80 VHOST_65_TO_127_PKT, 81 VHOST_128_TO_255_PKT, 82 VHOST_256_TO_511_PKT, 83 VHOST_512_TO_1023_PKT, 84 VHOST_1024_TO_1522_PKT, 85 VHOST_1523_TO_MAX_PKT, 86 VHOST_BROADCAST_PKT, 87 VHOST_MULTICAST_PKT, 88 VHOST_UNICAST_PKT, 89 VHOST_ERRORS_PKT, 90 VHOST_ERRORS_FRAGMENTED, 91 VHOST_ERRORS_JABBER, 92 VHOST_UNKNOWN_PROTOCOL, 93 VHOST_XSTATS_MAX, 94 }; 95 96 struct vhost_stats { 97 uint64_t pkts; 98 uint64_t bytes; 99 uint64_t missed_pkts; 100 uint64_t xstats[VHOST_XSTATS_MAX]; 101 }; 102 103 struct vhost_queue { 104 int vid; 105 rte_atomic32_t allow_queuing; 106 rte_atomic32_t while_queuing; 107 struct pmd_internal *internal; 108 struct rte_mempool *mb_pool; 109 uint8_t port; 110 uint16_t virtqueue_id; 111 struct vhost_stats stats; 112 }; 113 114 struct pmd_internal { 115 char *dev_name; 116 char *iface_name; 117 uint16_t max_queues; 118 uint64_t flags; 119 120 volatile uint16_t once; 121 }; 122 123 struct internal_list { 124 TAILQ_ENTRY(internal_list) next; 125 struct rte_eth_dev *eth_dev; 126 }; 127 128 TAILQ_HEAD(internal_list_head, internal_list); 129 static struct internal_list_head internal_list = 130 TAILQ_HEAD_INITIALIZER(internal_list); 131 132 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER; 133 134 static rte_atomic16_t nb_started_ports; 135 static pthread_t session_th; 136 137 static struct rte_eth_link pmd_link = { 138 .link_speed = 10000, 139 .link_duplex = ETH_LINK_FULL_DUPLEX, 140 .link_status = ETH_LINK_DOWN 141 }; 142 143 struct rte_vhost_vring_state { 144 rte_spinlock_t lock; 145 146 bool cur[RTE_MAX_QUEUES_PER_PORT * 2]; 147 bool seen[RTE_MAX_QUEUES_PER_PORT * 2]; 148 unsigned int index; 149 unsigned int max_vring; 150 }; 151 152 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS]; 153 154 #define VHOST_XSTATS_NAME_SIZE 64 155 156 struct vhost_xstats_name_off { 157 char name[VHOST_XSTATS_NAME_SIZE]; 158 uint64_t offset; 159 }; 160 161 /* [rx]_is prepended to the name string here */ 162 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = { 163 {"good_packets", 164 offsetof(struct vhost_queue, stats.pkts)}, 165 {"total_bytes", 166 offsetof(struct vhost_queue, stats.bytes)}, 167 {"missed_pkts", 168 offsetof(struct vhost_queue, stats.missed_pkts)}, 169 {"broadcast_packets", 170 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])}, 171 {"multicast_packets", 172 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])}, 173 {"unicast_packets", 174 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])}, 175 {"undersize_packets", 176 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])}, 177 {"size_64_packets", 178 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])}, 179 {"size_65_to_127_packets", 180 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])}, 181 {"size_128_to_255_packets", 182 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])}, 183 {"size_256_to_511_packets", 184 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])}, 185 {"size_512_to_1023_packets", 186 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])}, 187 {"size_1024_to_1522_packets", 188 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])}, 189 {"size_1523_to_max_packets", 190 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])}, 191 {"errors_with_bad_CRC", 192 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])}, 193 {"fragmented_errors", 194 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])}, 195 {"jabber_errors", 196 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])}, 197 {"unknown_protos_packets", 198 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])}, 199 }; 200 201 /* [tx]_ is prepended to the name string here */ 202 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = { 203 {"good_packets", 204 offsetof(struct vhost_queue, stats.pkts)}, 205 {"total_bytes", 206 offsetof(struct vhost_queue, stats.bytes)}, 207 {"missed_pkts", 208 offsetof(struct vhost_queue, stats.missed_pkts)}, 209 {"broadcast_packets", 210 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])}, 211 {"multicast_packets", 212 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])}, 213 {"unicast_packets", 214 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])}, 215 {"undersize_packets", 216 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])}, 217 {"size_64_packets", 218 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])}, 219 {"size_65_to_127_packets", 220 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])}, 221 {"size_128_to_255_packets", 222 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])}, 223 {"size_256_to_511_packets", 224 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])}, 225 {"size_512_to_1023_packets", 226 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])}, 227 {"size_1024_to_1522_packets", 228 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])}, 229 {"size_1523_to_max_packets", 230 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])}, 231 {"errors_with_bad_CRC", 232 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])}, 233 }; 234 235 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \ 236 sizeof(vhost_rxport_stat_strings[0])) 237 238 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \ 239 sizeof(vhost_txport_stat_strings[0])) 240 241 static void 242 vhost_dev_xstats_reset(struct rte_eth_dev *dev) 243 { 244 struct vhost_queue *vq = NULL; 245 unsigned int i = 0; 246 247 for (i = 0; i < dev->data->nb_rx_queues; i++) { 248 vq = dev->data->rx_queues[i]; 249 if (!vq) 250 continue; 251 memset(&vq->stats, 0, sizeof(vq->stats)); 252 } 253 for (i = 0; i < dev->data->nb_tx_queues; i++) { 254 vq = dev->data->tx_queues[i]; 255 if (!vq) 256 continue; 257 memset(&vq->stats, 0, sizeof(vq->stats)); 258 } 259 } 260 261 static int 262 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused, 263 struct rte_eth_xstat_name *xstats_names, 264 unsigned int limit __rte_unused) 265 { 266 unsigned int t = 0; 267 int count = 0; 268 int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT; 269 270 if (!xstats_names) 271 return nstats; 272 for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) { 273 snprintf(xstats_names[count].name, 274 sizeof(xstats_names[count].name), 275 "rx_%s", vhost_rxport_stat_strings[t].name); 276 count++; 277 } 278 for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) { 279 snprintf(xstats_names[count].name, 280 sizeof(xstats_names[count].name), 281 "tx_%s", vhost_txport_stat_strings[t].name); 282 count++; 283 } 284 return count; 285 } 286 287 static int 288 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats, 289 unsigned int n) 290 { 291 unsigned int i; 292 unsigned int t; 293 unsigned int count = 0; 294 struct vhost_queue *vq = NULL; 295 unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT; 296 297 if (n < nxstats) 298 return nxstats; 299 300 for (i = 0; i < dev->data->nb_rx_queues; i++) { 301 vq = dev->data->rx_queues[i]; 302 if (!vq) 303 continue; 304 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts 305 - (vq->stats.xstats[VHOST_BROADCAST_PKT] 306 + vq->stats.xstats[VHOST_MULTICAST_PKT]); 307 } 308 for (i = 0; i < dev->data->nb_tx_queues; i++) { 309 vq = dev->data->tx_queues[i]; 310 if (!vq) 311 continue; 312 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts 313 + vq->stats.missed_pkts 314 - (vq->stats.xstats[VHOST_BROADCAST_PKT] 315 + vq->stats.xstats[VHOST_MULTICAST_PKT]); 316 } 317 for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) { 318 xstats[count].value = 0; 319 for (i = 0; i < dev->data->nb_rx_queues; i++) { 320 vq = dev->data->rx_queues[i]; 321 if (!vq) 322 continue; 323 xstats[count].value += 324 *(uint64_t *)(((char *)vq) 325 + vhost_rxport_stat_strings[t].offset); 326 } 327 count++; 328 } 329 for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) { 330 xstats[count].value = 0; 331 for (i = 0; i < dev->data->nb_tx_queues; i++) { 332 vq = dev->data->tx_queues[i]; 333 if (!vq) 334 continue; 335 xstats[count].value += 336 *(uint64_t *)(((char *)vq) 337 + vhost_txport_stat_strings[t].offset); 338 } 339 count++; 340 } 341 return count; 342 } 343 344 static inline void 345 vhost_count_multicast_broadcast(struct vhost_queue *vq, 346 struct rte_mbuf *mbuf) 347 { 348 struct ether_addr *ea = NULL; 349 struct vhost_stats *pstats = &vq->stats; 350 351 ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *); 352 if (is_multicast_ether_addr(ea)) { 353 if (is_broadcast_ether_addr(ea)) 354 pstats->xstats[VHOST_BROADCAST_PKT]++; 355 else 356 pstats->xstats[VHOST_MULTICAST_PKT]++; 357 } 358 } 359 360 static void 361 vhost_update_packet_xstats(struct vhost_queue *vq, 362 struct rte_mbuf **bufs, 363 uint16_t count) 364 { 365 uint32_t pkt_len = 0; 366 uint64_t i = 0; 367 uint64_t index; 368 struct vhost_stats *pstats = &vq->stats; 369 370 for (i = 0; i < count ; i++) { 371 pkt_len = bufs[i]->pkt_len; 372 if (pkt_len == 64) { 373 pstats->xstats[VHOST_64_PKT]++; 374 } else if (pkt_len > 64 && pkt_len < 1024) { 375 index = (sizeof(pkt_len) * 8) 376 - __builtin_clz(pkt_len) - 5; 377 pstats->xstats[index]++; 378 } else { 379 if (pkt_len < 64) 380 pstats->xstats[VHOST_UNDERSIZE_PKT]++; 381 else if (pkt_len <= 1522) 382 pstats->xstats[VHOST_1024_TO_1522_PKT]++; 383 else if (pkt_len > 1522) 384 pstats->xstats[VHOST_1523_TO_MAX_PKT]++; 385 } 386 vhost_count_multicast_broadcast(vq, bufs[i]); 387 } 388 } 389 390 static uint16_t 391 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs) 392 { 393 struct vhost_queue *r = q; 394 uint16_t i, nb_rx = 0; 395 396 if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) 397 return 0; 398 399 rte_atomic32_set(&r->while_queuing, 1); 400 401 if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) 402 goto out; 403 404 /* Dequeue packets from guest TX queue */ 405 nb_rx = rte_vhost_dequeue_burst(r->vid, 406 r->virtqueue_id, r->mb_pool, bufs, nb_bufs); 407 408 r->stats.pkts += nb_rx; 409 410 for (i = 0; likely(i < nb_rx); i++) { 411 bufs[i]->port = r->port; 412 r->stats.bytes += bufs[i]->pkt_len; 413 } 414 415 vhost_update_packet_xstats(r, bufs, nb_rx); 416 417 out: 418 rte_atomic32_set(&r->while_queuing, 0); 419 420 return nb_rx; 421 } 422 423 static uint16_t 424 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs) 425 { 426 struct vhost_queue *r = q; 427 uint16_t i, nb_tx = 0; 428 429 if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) 430 return 0; 431 432 rte_atomic32_set(&r->while_queuing, 1); 433 434 if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) 435 goto out; 436 437 /* Enqueue packets to guest RX queue */ 438 nb_tx = rte_vhost_enqueue_burst(r->vid, 439 r->virtqueue_id, bufs, nb_bufs); 440 441 r->stats.pkts += nb_tx; 442 r->stats.missed_pkts += nb_bufs - nb_tx; 443 444 for (i = 0; likely(i < nb_tx); i++) 445 r->stats.bytes += bufs[i]->pkt_len; 446 447 vhost_update_packet_xstats(r, bufs, nb_tx); 448 449 /* According to RFC2863 page42 section ifHCOutMulticastPkts and 450 * ifHCOutBroadcastPkts, the counters "multicast" and "broadcast" 451 * are increased when packets are not transmitted successfully. 452 */ 453 for (i = nb_tx; i < nb_bufs; i++) 454 vhost_count_multicast_broadcast(r, bufs[i]); 455 456 for (i = 0; likely(i < nb_tx); i++) 457 rte_pktmbuf_free(bufs[i]); 458 out: 459 rte_atomic32_set(&r->while_queuing, 0); 460 461 return nb_tx; 462 } 463 464 static int 465 eth_dev_configure(struct rte_eth_dev *dev __rte_unused) 466 { 467 return 0; 468 } 469 470 static inline struct internal_list * 471 find_internal_resource(char *ifname) 472 { 473 int found = 0; 474 struct internal_list *list; 475 struct pmd_internal *internal; 476 477 if (ifname == NULL) 478 return NULL; 479 480 pthread_mutex_lock(&internal_list_lock); 481 482 TAILQ_FOREACH(list, &internal_list, next) { 483 internal = list->eth_dev->data->dev_private; 484 if (!strcmp(internal->iface_name, ifname)) { 485 found = 1; 486 break; 487 } 488 } 489 490 pthread_mutex_unlock(&internal_list_lock); 491 492 if (!found) 493 return NULL; 494 495 return list; 496 } 497 498 static int 499 new_device(int vid) 500 { 501 struct rte_eth_dev *eth_dev; 502 struct internal_list *list; 503 struct pmd_internal *internal; 504 struct vhost_queue *vq; 505 unsigned i; 506 char ifname[PATH_MAX]; 507 #ifdef RTE_LIBRTE_VHOST_NUMA 508 int newnode; 509 #endif 510 511 rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); 512 list = find_internal_resource(ifname); 513 if (list == NULL) { 514 RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname); 515 return -1; 516 } 517 518 eth_dev = list->eth_dev; 519 internal = eth_dev->data->dev_private; 520 521 #ifdef RTE_LIBRTE_VHOST_NUMA 522 newnode = rte_vhost_get_numa_node(vid); 523 if (newnode >= 0) 524 eth_dev->data->numa_node = newnode; 525 #endif 526 527 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) { 528 vq = eth_dev->data->rx_queues[i]; 529 if (vq == NULL) 530 continue; 531 vq->vid = vid; 532 vq->internal = internal; 533 vq->port = eth_dev->data->port_id; 534 } 535 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) { 536 vq = eth_dev->data->tx_queues[i]; 537 if (vq == NULL) 538 continue; 539 vq->vid = vid; 540 vq->internal = internal; 541 vq->port = eth_dev->data->port_id; 542 } 543 544 for (i = 0; i < rte_vhost_get_queue_num(vid) * VIRTIO_QNUM; i++) 545 rte_vhost_enable_guest_notification(vid, i, 0); 546 547 eth_dev->data->dev_link.link_status = ETH_LINK_UP; 548 549 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) { 550 vq = eth_dev->data->rx_queues[i]; 551 if (vq == NULL) 552 continue; 553 rte_atomic32_set(&vq->allow_queuing, 1); 554 } 555 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) { 556 vq = eth_dev->data->tx_queues[i]; 557 if (vq == NULL) 558 continue; 559 rte_atomic32_set(&vq->allow_queuing, 1); 560 } 561 562 RTE_LOG(INFO, PMD, "New connection established\n"); 563 564 _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL); 565 566 return 0; 567 } 568 569 static void 570 destroy_device(int vid) 571 { 572 struct rte_eth_dev *eth_dev; 573 struct vhost_queue *vq; 574 struct internal_list *list; 575 char ifname[PATH_MAX]; 576 unsigned i; 577 struct rte_vhost_vring_state *state; 578 579 rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); 580 list = find_internal_resource(ifname); 581 if (list == NULL) { 582 RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname); 583 return; 584 } 585 eth_dev = list->eth_dev; 586 587 /* Wait until rx/tx_pkt_burst stops accessing vhost device */ 588 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) { 589 vq = eth_dev->data->rx_queues[i]; 590 if (vq == NULL) 591 continue; 592 rte_atomic32_set(&vq->allow_queuing, 0); 593 while (rte_atomic32_read(&vq->while_queuing)) 594 rte_pause(); 595 } 596 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) { 597 vq = eth_dev->data->tx_queues[i]; 598 if (vq == NULL) 599 continue; 600 rte_atomic32_set(&vq->allow_queuing, 0); 601 while (rte_atomic32_read(&vq->while_queuing)) 602 rte_pause(); 603 } 604 605 eth_dev->data->dev_link.link_status = ETH_LINK_DOWN; 606 607 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) { 608 vq = eth_dev->data->rx_queues[i]; 609 if (vq == NULL) 610 continue; 611 vq->vid = -1; 612 } 613 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) { 614 vq = eth_dev->data->tx_queues[i]; 615 if (vq == NULL) 616 continue; 617 vq->vid = -1; 618 } 619 620 state = vring_states[eth_dev->data->port_id]; 621 rte_spinlock_lock(&state->lock); 622 for (i = 0; i <= state->max_vring; i++) { 623 state->cur[i] = false; 624 state->seen[i] = false; 625 } 626 state->max_vring = 0; 627 rte_spinlock_unlock(&state->lock); 628 629 RTE_LOG(INFO, PMD, "Connection closed\n"); 630 631 _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL); 632 } 633 634 static int 635 vring_state_changed(int vid, uint16_t vring, int enable) 636 { 637 struct rte_vhost_vring_state *state; 638 struct rte_eth_dev *eth_dev; 639 struct internal_list *list; 640 char ifname[PATH_MAX]; 641 642 rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); 643 list = find_internal_resource(ifname); 644 if (list == NULL) { 645 RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname); 646 return -1; 647 } 648 649 eth_dev = list->eth_dev; 650 /* won't be NULL */ 651 state = vring_states[eth_dev->data->port_id]; 652 rte_spinlock_lock(&state->lock); 653 state->cur[vring] = enable; 654 state->max_vring = RTE_MAX(vring, state->max_vring); 655 rte_spinlock_unlock(&state->lock); 656 657 RTE_LOG(INFO, PMD, "vring%u is %s\n", 658 vring, enable ? "enabled" : "disabled"); 659 660 _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL); 661 662 return 0; 663 } 664 665 int 666 rte_eth_vhost_get_queue_event(uint8_t port_id, 667 struct rte_eth_vhost_queue_event *event) 668 { 669 struct rte_vhost_vring_state *state; 670 unsigned int i; 671 int idx; 672 673 if (port_id >= RTE_MAX_ETHPORTS) { 674 RTE_LOG(ERR, PMD, "Invalid port id\n"); 675 return -1; 676 } 677 678 state = vring_states[port_id]; 679 if (!state) { 680 RTE_LOG(ERR, PMD, "Unused port\n"); 681 return -1; 682 } 683 684 rte_spinlock_lock(&state->lock); 685 for (i = 0; i <= state->max_vring; i++) { 686 idx = state->index++ % (state->max_vring + 1); 687 688 if (state->cur[idx] != state->seen[idx]) { 689 state->seen[idx] = state->cur[idx]; 690 event->queue_id = idx / 2; 691 event->rx = idx & 1; 692 event->enable = state->cur[idx]; 693 rte_spinlock_unlock(&state->lock); 694 return 0; 695 } 696 } 697 rte_spinlock_unlock(&state->lock); 698 699 return -1; 700 } 701 702 int 703 rte_eth_vhost_get_vid_from_port_id(uint8_t port_id) 704 { 705 struct internal_list *list; 706 struct rte_eth_dev *eth_dev; 707 struct vhost_queue *vq; 708 int vid = -1; 709 710 if (!rte_eth_dev_is_valid_port(port_id)) 711 return -1; 712 713 pthread_mutex_lock(&internal_list_lock); 714 715 TAILQ_FOREACH(list, &internal_list, next) { 716 eth_dev = list->eth_dev; 717 if (eth_dev->data->port_id == port_id) { 718 vq = eth_dev->data->rx_queues[0]; 719 if (vq) { 720 vid = vq->vid; 721 } 722 break; 723 } 724 } 725 726 pthread_mutex_unlock(&internal_list_lock); 727 728 return vid; 729 } 730 731 static void * 732 vhost_driver_session(void *param __rte_unused) 733 { 734 static struct virtio_net_device_ops vhost_ops; 735 736 /* set vhost arguments */ 737 vhost_ops.new_device = new_device; 738 vhost_ops.destroy_device = destroy_device; 739 vhost_ops.vring_state_changed = vring_state_changed; 740 if (rte_vhost_driver_callback_register(&vhost_ops) < 0) 741 RTE_LOG(ERR, PMD, "Can't register callbacks\n"); 742 743 /* start event handling */ 744 rte_vhost_driver_session_start(); 745 746 return NULL; 747 } 748 749 static int 750 vhost_driver_session_start(void) 751 { 752 int ret; 753 754 ret = pthread_create(&session_th, 755 NULL, vhost_driver_session, NULL); 756 if (ret) 757 RTE_LOG(ERR, PMD, "Can't create a thread\n"); 758 759 return ret; 760 } 761 762 static void 763 vhost_driver_session_stop(void) 764 { 765 int ret; 766 767 ret = pthread_cancel(session_th); 768 if (ret) 769 RTE_LOG(ERR, PMD, "Can't cancel the thread\n"); 770 771 ret = pthread_join(session_th, NULL); 772 if (ret) 773 RTE_LOG(ERR, PMD, "Can't join the thread\n"); 774 } 775 776 static int 777 eth_dev_start(struct rte_eth_dev *dev) 778 { 779 struct pmd_internal *internal = dev->data->dev_private; 780 int ret = 0; 781 782 if (rte_atomic16_cmpset(&internal->once, 0, 1)) { 783 ret = rte_vhost_driver_register(internal->iface_name, 784 internal->flags); 785 if (ret) 786 return ret; 787 } 788 789 /* We need only one message handling thread */ 790 if (rte_atomic16_add_return(&nb_started_ports, 1) == 1) 791 ret = vhost_driver_session_start(); 792 793 return ret; 794 } 795 796 static void 797 eth_dev_stop(struct rte_eth_dev *dev) 798 { 799 struct pmd_internal *internal = dev->data->dev_private; 800 801 if (rte_atomic16_cmpset(&internal->once, 1, 0)) 802 rte_vhost_driver_unregister(internal->iface_name); 803 804 if (rte_atomic16_sub_return(&nb_started_ports, 1) == 0) 805 vhost_driver_session_stop(); 806 } 807 808 static int 809 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id, 810 uint16_t nb_rx_desc __rte_unused, 811 unsigned int socket_id, 812 const struct rte_eth_rxconf *rx_conf __rte_unused, 813 struct rte_mempool *mb_pool) 814 { 815 struct vhost_queue *vq; 816 817 vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue), 818 RTE_CACHE_LINE_SIZE, socket_id); 819 if (vq == NULL) { 820 RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n"); 821 return -ENOMEM; 822 } 823 824 vq->mb_pool = mb_pool; 825 vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ; 826 dev->data->rx_queues[rx_queue_id] = vq; 827 828 return 0; 829 } 830 831 static int 832 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id, 833 uint16_t nb_tx_desc __rte_unused, 834 unsigned int socket_id, 835 const struct rte_eth_txconf *tx_conf __rte_unused) 836 { 837 struct vhost_queue *vq; 838 839 vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue), 840 RTE_CACHE_LINE_SIZE, socket_id); 841 if (vq == NULL) { 842 RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n"); 843 return -ENOMEM; 844 } 845 846 vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ; 847 dev->data->tx_queues[tx_queue_id] = vq; 848 849 return 0; 850 } 851 852 static void 853 eth_dev_info(struct rte_eth_dev *dev, 854 struct rte_eth_dev_info *dev_info) 855 { 856 struct pmd_internal *internal; 857 858 internal = dev->data->dev_private; 859 if (internal == NULL) { 860 RTE_LOG(ERR, PMD, "Invalid device specified\n"); 861 return; 862 } 863 864 dev_info->driver_name = drivername; 865 dev_info->max_mac_addrs = 1; 866 dev_info->max_rx_pktlen = (uint32_t)-1; 867 dev_info->max_rx_queues = internal->max_queues; 868 dev_info->max_tx_queues = internal->max_queues; 869 dev_info->min_rx_bufsize = 0; 870 } 871 872 static void 873 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) 874 { 875 unsigned i; 876 unsigned long rx_total = 0, tx_total = 0, tx_missed_total = 0; 877 unsigned long rx_total_bytes = 0, tx_total_bytes = 0; 878 struct vhost_queue *vq; 879 880 for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS && 881 i < dev->data->nb_rx_queues; i++) { 882 if (dev->data->rx_queues[i] == NULL) 883 continue; 884 vq = dev->data->rx_queues[i]; 885 stats->q_ipackets[i] = vq->stats.pkts; 886 rx_total += stats->q_ipackets[i]; 887 888 stats->q_ibytes[i] = vq->stats.bytes; 889 rx_total_bytes += stats->q_ibytes[i]; 890 } 891 892 for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS && 893 i < dev->data->nb_tx_queues; i++) { 894 if (dev->data->tx_queues[i] == NULL) 895 continue; 896 vq = dev->data->tx_queues[i]; 897 stats->q_opackets[i] = vq->stats.pkts; 898 tx_missed_total += vq->stats.missed_pkts; 899 tx_total += stats->q_opackets[i]; 900 901 stats->q_obytes[i] = vq->stats.bytes; 902 tx_total_bytes += stats->q_obytes[i]; 903 } 904 905 stats->ipackets = rx_total; 906 stats->opackets = tx_total; 907 stats->oerrors = tx_missed_total; 908 stats->ibytes = rx_total_bytes; 909 stats->obytes = tx_total_bytes; 910 } 911 912 static void 913 eth_stats_reset(struct rte_eth_dev *dev) 914 { 915 struct vhost_queue *vq; 916 unsigned i; 917 918 for (i = 0; i < dev->data->nb_rx_queues; i++) { 919 if (dev->data->rx_queues[i] == NULL) 920 continue; 921 vq = dev->data->rx_queues[i]; 922 vq->stats.pkts = 0; 923 vq->stats.bytes = 0; 924 } 925 for (i = 0; i < dev->data->nb_tx_queues; i++) { 926 if (dev->data->tx_queues[i] == NULL) 927 continue; 928 vq = dev->data->tx_queues[i]; 929 vq->stats.pkts = 0; 930 vq->stats.bytes = 0; 931 vq->stats.missed_pkts = 0; 932 } 933 } 934 935 static void 936 eth_queue_release(void *q) 937 { 938 rte_free(q); 939 } 940 941 static int 942 eth_link_update(struct rte_eth_dev *dev __rte_unused, 943 int wait_to_complete __rte_unused) 944 { 945 return 0; 946 } 947 948 /** 949 * Disable features in feature_mask. Returns 0 on success. 950 */ 951 int 952 rte_eth_vhost_feature_disable(uint64_t feature_mask) 953 { 954 return rte_vhost_feature_disable(feature_mask); 955 } 956 957 /** 958 * Enable features in feature_mask. Returns 0 on success. 959 */ 960 int 961 rte_eth_vhost_feature_enable(uint64_t feature_mask) 962 { 963 return rte_vhost_feature_enable(feature_mask); 964 } 965 966 /* Returns currently supported vhost features */ 967 uint64_t 968 rte_eth_vhost_feature_get(void) 969 { 970 return rte_vhost_feature_get(); 971 } 972 973 static const struct eth_dev_ops ops = { 974 .dev_start = eth_dev_start, 975 .dev_stop = eth_dev_stop, 976 .dev_configure = eth_dev_configure, 977 .dev_infos_get = eth_dev_info, 978 .rx_queue_setup = eth_rx_queue_setup, 979 .tx_queue_setup = eth_tx_queue_setup, 980 .rx_queue_release = eth_queue_release, 981 .tx_queue_release = eth_queue_release, 982 .link_update = eth_link_update, 983 .stats_get = eth_stats_get, 984 .stats_reset = eth_stats_reset, 985 .xstats_reset = vhost_dev_xstats_reset, 986 .xstats_get = vhost_dev_xstats_get, 987 .xstats_get_names = vhost_dev_xstats_get_names, 988 }; 989 990 static int 991 eth_dev_vhost_create(const char *name, char *iface_name, int16_t queues, 992 const unsigned numa_node, uint64_t flags) 993 { 994 struct rte_eth_dev_data *data = NULL; 995 struct pmd_internal *internal = NULL; 996 struct rte_eth_dev *eth_dev = NULL; 997 struct ether_addr *eth_addr = NULL; 998 struct rte_vhost_vring_state *vring_state = NULL; 999 struct internal_list *list = NULL; 1000 1001 RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n", 1002 numa_node); 1003 1004 /* now do all data allocation - for eth_dev structure, dummy pci driver 1005 * and internal (private) data 1006 */ 1007 data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node); 1008 if (data == NULL) 1009 goto error; 1010 1011 internal = rte_zmalloc_socket(name, sizeof(*internal), 0, numa_node); 1012 if (internal == NULL) 1013 goto error; 1014 1015 list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node); 1016 if (list == NULL) 1017 goto error; 1018 1019 /* reserve an ethdev entry */ 1020 eth_dev = rte_eth_dev_allocate(name); 1021 if (eth_dev == NULL) 1022 goto error; 1023 1024 eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node); 1025 if (eth_addr == NULL) 1026 goto error; 1027 *eth_addr = base_eth_addr; 1028 eth_addr->addr_bytes[5] = eth_dev->data->port_id; 1029 1030 vring_state = rte_zmalloc_socket(name, 1031 sizeof(*vring_state), 0, numa_node); 1032 if (vring_state == NULL) 1033 goto error; 1034 1035 /* now put it all together 1036 * - store queue data in internal, 1037 * - store numa_node info in ethdev data 1038 * - point eth_dev_data to internals 1039 * - and point eth_dev structure to new eth_dev_data structure 1040 */ 1041 internal->dev_name = strdup(name); 1042 if (internal->dev_name == NULL) 1043 goto error; 1044 internal->iface_name = strdup(iface_name); 1045 if (internal->iface_name == NULL) 1046 goto error; 1047 internal->flags = flags; 1048 1049 list->eth_dev = eth_dev; 1050 pthread_mutex_lock(&internal_list_lock); 1051 TAILQ_INSERT_TAIL(&internal_list, list, next); 1052 pthread_mutex_unlock(&internal_list_lock); 1053 1054 rte_spinlock_init(&vring_state->lock); 1055 vring_states[eth_dev->data->port_id] = vring_state; 1056 1057 data->dev_private = internal; 1058 data->port_id = eth_dev->data->port_id; 1059 memmove(data->name, eth_dev->data->name, sizeof(data->name)); 1060 data->nb_rx_queues = queues; 1061 data->nb_tx_queues = queues; 1062 internal->max_queues = queues; 1063 data->dev_link = pmd_link; 1064 data->mac_addrs = eth_addr; 1065 1066 /* We'll replace the 'data' originally allocated by eth_dev. So the 1067 * vhost PMD resources won't be shared between multi processes. 1068 */ 1069 eth_dev->data = data; 1070 eth_dev->dev_ops = &ops; 1071 eth_dev->driver = NULL; 1072 data->dev_flags = 1073 RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC; 1074 data->kdrv = RTE_KDRV_NONE; 1075 data->drv_name = internal->dev_name; 1076 data->numa_node = numa_node; 1077 1078 /* finally assign rx and tx ops */ 1079 eth_dev->rx_pkt_burst = eth_vhost_rx; 1080 eth_dev->tx_pkt_burst = eth_vhost_tx; 1081 1082 return data->port_id; 1083 1084 error: 1085 if (internal) 1086 free(internal->dev_name); 1087 rte_free(vring_state); 1088 rte_free(eth_addr); 1089 if (eth_dev) 1090 rte_eth_dev_release_port(eth_dev); 1091 rte_free(internal); 1092 rte_free(list); 1093 rte_free(data); 1094 1095 return -1; 1096 } 1097 1098 static inline int 1099 open_iface(const char *key __rte_unused, const char *value, void *extra_args) 1100 { 1101 const char **iface_name = extra_args; 1102 1103 if (value == NULL) 1104 return -1; 1105 1106 *iface_name = value; 1107 1108 return 0; 1109 } 1110 1111 static inline int 1112 open_int(const char *key __rte_unused, const char *value, void *extra_args) 1113 { 1114 uint16_t *n = extra_args; 1115 1116 if (value == NULL || extra_args == NULL) 1117 return -EINVAL; 1118 1119 *n = (uint16_t)strtoul(value, NULL, 0); 1120 if (*n == USHRT_MAX && errno == ERANGE) 1121 return -1; 1122 1123 return 0; 1124 } 1125 1126 static int 1127 rte_pmd_vhost_probe(const char *name, const char *params) 1128 { 1129 struct rte_kvargs *kvlist = NULL; 1130 int ret = 0; 1131 char *iface_name; 1132 uint16_t queues; 1133 uint64_t flags = 0; 1134 int client_mode = 0; 1135 int dequeue_zero_copy = 0; 1136 1137 RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name); 1138 1139 kvlist = rte_kvargs_parse(params, valid_arguments); 1140 if (kvlist == NULL) 1141 return -1; 1142 1143 if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) { 1144 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG, 1145 &open_iface, &iface_name); 1146 if (ret < 0) 1147 goto out_free; 1148 } else { 1149 ret = -1; 1150 goto out_free; 1151 } 1152 1153 if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) { 1154 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG, 1155 &open_int, &queues); 1156 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT) 1157 goto out_free; 1158 1159 } else 1160 queues = 1; 1161 1162 if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) { 1163 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG, 1164 &open_int, &client_mode); 1165 if (ret < 0) 1166 goto out_free; 1167 1168 if (client_mode) 1169 flags |= RTE_VHOST_USER_CLIENT; 1170 } 1171 1172 if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) { 1173 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY, 1174 &open_int, &dequeue_zero_copy); 1175 if (ret < 0) 1176 goto out_free; 1177 1178 if (dequeue_zero_copy) 1179 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY; 1180 } 1181 1182 eth_dev_vhost_create(name, iface_name, queues, rte_socket_id(), flags); 1183 1184 out_free: 1185 rte_kvargs_free(kvlist); 1186 return ret; 1187 } 1188 1189 static int 1190 rte_pmd_vhost_remove(const char *name) 1191 { 1192 struct rte_eth_dev *eth_dev = NULL; 1193 struct pmd_internal *internal; 1194 struct internal_list *list; 1195 unsigned int i; 1196 1197 RTE_LOG(INFO, PMD, "Un-Initializing pmd_vhost for %s\n", name); 1198 1199 /* find an ethdev entry */ 1200 eth_dev = rte_eth_dev_allocated(name); 1201 if (eth_dev == NULL) 1202 return -ENODEV; 1203 1204 internal = eth_dev->data->dev_private; 1205 if (internal == NULL) 1206 return -ENODEV; 1207 1208 list = find_internal_resource(internal->iface_name); 1209 if (list == NULL) 1210 return -ENODEV; 1211 1212 pthread_mutex_lock(&internal_list_lock); 1213 TAILQ_REMOVE(&internal_list, list, next); 1214 pthread_mutex_unlock(&internal_list_lock); 1215 rte_free(list); 1216 1217 eth_dev_stop(eth_dev); 1218 1219 rte_free(vring_states[eth_dev->data->port_id]); 1220 vring_states[eth_dev->data->port_id] = NULL; 1221 1222 free(internal->dev_name); 1223 free(internal->iface_name); 1224 1225 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) 1226 rte_free(eth_dev->data->rx_queues[i]); 1227 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) 1228 rte_free(eth_dev->data->tx_queues[i]); 1229 1230 rte_free(eth_dev->data->mac_addrs); 1231 rte_free(eth_dev->data); 1232 rte_free(internal); 1233 1234 rte_eth_dev_release_port(eth_dev); 1235 1236 return 0; 1237 } 1238 1239 static struct rte_vdev_driver pmd_vhost_drv = { 1240 .probe = rte_pmd_vhost_probe, 1241 .remove = rte_pmd_vhost_remove, 1242 }; 1243 1244 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv); 1245 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost); 1246 RTE_PMD_REGISTER_PARAM_STRING(net_vhost, 1247 "iface=<ifc> " 1248 "queues=<int>"); 1249