/*-
 *   BSD LICENSE
 *
 *   Copyright(c) Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/env.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/memory.h"
#include "spdk/barrier.h"
#include "spdk/vhost.h"
#include "vhost_internal.h"

static struct spdk_cpuset g_vhost_core_mask;

/* Path to folder where character device will be created. Can be set by user. */
static char dev_dirname[PATH_MAX] = "";

/* Thread performing all vhost management operations */
static struct spdk_thread *g_vhost_init_thread;

static spdk_vhost_fini_cb g_fini_cpl_cb;

/**
 * DPDK calls our callbacks synchronously but the work those callbacks
 * perform needs to be async. Luckily, all DPDK callbacks are called on
 * a DPDK-internal pthread, so we'll just wait on a semaphore in there.
 */
static sem_t g_dpdk_sem;

/** Return code for the current DPDK callback */
static int g_dpdk_response;

struct vhost_session_fn_ctx {
	/** Device pointer obtained before enqueuing the event */
	struct spdk_vhost_dev *vdev;

	/** ID of the session to send event to. */
	uint32_t vsession_id;

	/** User provided function to be executed on session's thread. */
	spdk_vhost_session_fn cb_fn;

	/**
	 * User provided function to be called on the init thread
	 * after iterating through all sessions.
	 */
	spdk_vhost_dev_fn cpl_fn;

	/** Custom user context */
	void *user_ctx;
};

static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER(
			g_vhost_devices);
static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;

void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
{
	void *vva;
	uint64_t newlen;

	newlen = len;
	vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen);
	if (newlen != len) {
		return NULL;
	}

	return vva;
}

static void
vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
		   uint16_t req_id)
{
	struct vring_desc *desc, *desc_table;
	uint32_t desc_table_size;
	int rc;

	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
		return;
	}

	rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("Can't log used ring descriptors!\n");
		return;
	}

	do {
		if (vhost_vring_desc_is_wr(desc)) {
			/* To be honest, only pages really touched should be logged, but
			 * doing so would require tracking those changes in each backend.
			 * Also, the backend will most likely touch all/most of those pages,
			 * so let's assume we touched all pages passed to us as writeable buffers. */
			rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
		}
		vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
	} while (desc);
}

static void
vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
			  struct spdk_vhost_virtqueue *virtqueue,
			  uint16_t idx)
{
	uint64_t offset, len;

	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
		return;
	}

	if (spdk_unlikely(virtqueue->packed.packed_ring)) {
		offset = idx * sizeof(struct vring_packed_desc);
		len = sizeof(struct vring_packed_desc);
	} else {
		offset = offsetof(struct vring_used, ring[idx]);
		len = sizeof(virtqueue->vring.used->ring[idx]);
	}

	rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len);
}

static void
vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
			 struct spdk_vhost_virtqueue *virtqueue)
{
	uint64_t offset, len;
	uint16_t vq_idx;

	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
		return;
	}

	offset = offsetof(struct vring_used, idx);
	len = sizeof(virtqueue->vring.used->idx);
	vq_idx = virtqueue - vsession->virtqueue;

	rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
}

/*
 * Get available requests from avail ring.
 */
uint16_t
vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
			uint16_t reqs_len)
{
	struct rte_vhost_vring *vring = &virtqueue->vring;
	struct vring_avail *avail = vring->avail;
	uint16_t size_mask = vring->size - 1;
	uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx;
	uint16_t count, i;

	spdk_smp_rmb();

	count = avail_idx - last_idx;
	if (spdk_likely(count == 0)) {
		return 0;
	}

	if (spdk_unlikely(count > vring->size)) {
		/* TODO: the queue is unrecoverably broken and should be marked so.
		 * For now we will fail silently and report there are no new avail entries.
		 */
		return 0;
	}

	count = spdk_min(count, reqs_len);
	if (virtqueue->vsession && virtqueue->vsession->interrupt_mode) {
		/* In interrupt mode the kickfd must be read to acknowledge the
		 * notification. If more requests are available than we can take
		 * right now, write the remainder back to the kickfd so we get
		 * notified again and can process them later.
		 */
		int rc;
		uint64_t num_events;

		rc = read(vring->kickfd, &num_events, sizeof(num_events));
		if (rc < 0) {
			SPDK_ERRLOG("failed to acknowledge kickfd: %s.\n", spdk_strerror(errno));
			return -errno;
		}

		if ((uint16_t)(avail_idx - last_idx) != num_events) {
			SPDK_DEBUGLOG(vhost_ring,
				      "virtqueue gets %d reqs, but kickfd shows %lu reqs\n",
				      avail_idx - last_idx, num_events);
		}

		if (num_events > count) {
			SPDK_DEBUGLOG(vhost_ring,
				      "virtqueue kickfd shows %lu reqs, take %d, send notice for other reqs\n",
				      num_events, reqs_len);
			num_events -= count;
			rc = write(vring->kickfd, &num_events, sizeof(num_events));
			if (rc < 0) {
				SPDK_ERRLOG("failed to kick vring: %s.\n", spdk_strerror(errno));
				return -errno;
			}
		}
	}

	virtqueue->last_avail_idx += count;
	for (i = 0; i < count; i++) {
		reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
	}

	SPDK_DEBUGLOG(vhost_ring,
		      "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
		      last_idx, avail_idx, count);

	return count;
}

static bool
vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
}

static bool
vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc)
{
	return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0;
}

int
vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
		  uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
		  uint32_t *desc_table_size)
{
	if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
		return -1;
	}

	*desc = &virtqueue->vring.desc[req_idx];

	if (vhost_vring_desc_is_indirect(*desc)) {
		*desc_table_size = (*desc)->len / sizeof(**desc);
		*desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
					       sizeof(**desc) * *desc_table_size);
		*desc = *desc_table;
		if (*desc == NULL) {
			return -1;
		}

		return 0;
	}

	*desc_table = virtqueue->vring.desc;
	*desc_table_size = virtqueue->vring.size;

	return 0;
}
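
/*
 * Illustrative sketch (not part of the library): a backend would typically
 * combine vhost_vq_get_desc(), vhost_vring_desc_to_iov() and
 * vhost_vring_desc_get_next() roughly as below to translate one split-ring
 * request into an iovec. Error handling and the surrounding task structures
 * are omitted and the local names are hypothetical.
 *
 *	struct vring_desc *desc, *desc_table;
 *	uint32_t desc_table_size;
 *	uint16_t iovcnt = 0;
 *	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
 *
 *	if (vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size) == 0) {
 *		while (desc != NULL &&
 *		       vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc) == 0) {
 *			vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
 *		}
 *	}
 */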
int
vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
			 struct spdk_vhost_virtqueue *virtqueue,
			 uint16_t req_idx, struct vring_packed_desc **desc,
			 struct vring_packed_desc **desc_table, uint32_t *desc_table_size)
{
	*desc = &virtqueue->vring.desc_packed[req_idx];

	/* In the packed ring, when the descriptor is non-indirect we get the next
	 * descriptor by checking (desc->flags & VRING_DESC_F_NEXT) != 0. When the
	 * descriptor is indirect we get the next one by idx and desc_table_size.
	 * This differs from the split ring.
	 */
	if (vhost_vring_packed_desc_is_indirect(*desc)) {
		*desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc);
		*desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
					       (*desc)->len);
		*desc = *desc_table;
		if (spdk_unlikely(*desc == NULL)) {
			return -1;
		}
	} else {
		*desc_table = NULL;
		*desc_table_size = 0;
	}

	return 0;
}

int
vhost_vq_used_signal(struct spdk_vhost_session *vsession,
		     struct spdk_vhost_virtqueue *virtqueue)
{
	if (virtqueue->used_req_cnt == 0) {
		return 0;
	}

	virtqueue->req_cnt += virtqueue->used_req_cnt;
	virtqueue->used_req_cnt = 0;

	SPDK_DEBUGLOG(vhost_ring,
		      "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx);

	if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) {
		/* interrupt signalled */
		return 1;
	} else {
		/* interrupt not signalled */
		return 0;
	}
}

static void
session_vq_io_stats_update(struct spdk_vhost_session *vsession,
			   struct spdk_vhost_virtqueue *virtqueue, uint64_t now)
{
	uint32_t irq_delay_base = vsession->coalescing_delay_time_base;
	uint32_t io_threshold = vsession->coalescing_io_rate_threshold;
	int32_t irq_delay;
	uint32_t req_cnt;

	req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
	if (req_cnt <= io_threshold) {
		return;
	}

	irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
	virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);

	virtqueue->req_cnt = 0;
	virtqueue->next_event_time = now;
}

static void
check_session_vq_io_stats(struct spdk_vhost_session *vsession,
			  struct spdk_vhost_virtqueue *virtqueue, uint64_t now)
{
	if (now < vsession->next_stats_check_time) {
		return;
	}

	vsession->next_stats_check_time = now + vsession->stats_check_interval;
	session_vq_io_stats_update(vsession, virtqueue, now);
}

static inline bool
vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq)
{
	if (spdk_unlikely(vq->packed.packed_ring)) {
		if (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) {
			return true;
		}
	} else {
		if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
			return true;
		}
	}

	return false;
}

void
vhost_session_vq_used_signal(struct spdk_vhost_virtqueue *virtqueue)
{
	struct spdk_vhost_session *vsession = virtqueue->vsession;
	uint64_t now;

	if (vsession->coalescing_delay_time_base == 0) {
		if (virtqueue->vring.desc == NULL) {
			return;
		}

		if (vhost_vq_event_is_suppressed(virtqueue)) {
			return;
		}

		vhost_vq_used_signal(vsession, virtqueue);
	} else {
		now = spdk_get_ticks();
		check_session_vq_io_stats(vsession, virtqueue, now);

		/* No need for event right now */
		if (now < virtqueue->next_event_time) {
			return;
		}

		if (vhost_vq_event_is_suppressed(virtqueue)) {
			return;
		}

		if (!vhost_vq_used_signal(vsession, virtqueue)) {
			return;
		}

		/* Syscall is quite long so update time */
		now = spdk_get_ticks();
		virtqueue->next_event_time = now + virtqueue->irq_delay_time;
	}
}

void
vhost_session_used_signal(struct spdk_vhost_session *vsession)
{
	struct spdk_vhost_virtqueue *virtqueue;
	uint16_t q_idx;

	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
		virtqueue = &vsession->virtqueue[q_idx];
		vhost_session_vq_used_signal(virtqueue);
	}
}

static int
vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
			     struct spdk_vhost_session *vsession, void *ctx)
{
	vsession->coalescing_delay_time_base =
		vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
	vsession->coalescing_io_rate_threshold =
		vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
	return 0;
}

static int
vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
			 uint32_t iops_threshold)
{
	uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
	uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;

	if (delay_time_base >= UINT32_MAX) {
		SPDK_ERRLOG("Delay time of %"PRIu32" is too big\n", delay_base_us);
		return -EINVAL;
	} else if (io_rate == 0) {
		SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate,
			    1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS);
		return -EINVAL;
	}

	vdev->coalescing_delay_us = delay_base_us;
	vdev->coalescing_iops_threshold = iops_threshold;
	return 0;
}

int
spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
			  uint32_t iops_threshold)
{
	int rc;

	rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold);
	if (rc != 0) {
		return rc;
	}

	vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL);
	return 0;
}

void
spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
			  uint32_t *iops_threshold)
{
	if (delay_base_us) {
		*delay_base_us = vdev->coalescing_delay_us;
	}

	if (iops_threshold) {
		*iops_threshold = vdev->coalescing_iops_threshold;
	}
}
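
/*
 * Worked example (illustrative only, values assumed): with delay_base_us = 200,
 * iops_threshold = 60000, an assumed tick rate of 2 GHz and assuming
 * SPDK_VHOST_STATS_CHECK_INTERVAL_MS is 10, the per-session values computed in
 * vhost_session_set_coalescing() become:
 *
 *	coalescing_delay_time_base   = 200 * 2000000000 / 1000000 = 400000 ticks
 *	coalescing_io_rate_threshold = 60000 * 10 / 1000          = 600 reqs per interval
 *
 * session_vq_io_stats_update() then scales the IRQ delay linearly with how far
 * the observed request count exceeds that threshold.
 */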
/*
 * Enqueue id and len to used ring.
 */
void
vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
			   struct spdk_vhost_virtqueue *virtqueue,
			   uint16_t id, uint32_t len)
{
	struct rte_vhost_vring *vring = &virtqueue->vring;
	struct vring_used *used = vring->used;
	uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);
	uint16_t vq_idx = virtqueue->vring_idx;

	SPDK_DEBUGLOG(vhost_ring,
		      "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);

	vhost_log_req_desc(vsession, virtqueue, id);

	virtqueue->last_used_idx++;
	used->ring[last_idx].id = id;
	used->ring[last_idx].len = len;

	/* Ensure the used ring is updated before we log it or increment used->idx. */
	spdk_smp_wmb();

	rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id);

	vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
	* (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
	vhost_log_used_vring_idx(vsession, virtqueue);

	rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id);

	virtqueue->used_req_cnt++;

	if (vsession->interrupt_mode) {
		if (virtqueue->vring.desc == NULL || vhost_vq_event_is_suppressed(virtqueue)) {
			return;
		}

		vhost_vq_used_signal(vsession, virtqueue);
	}
}

void
vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
			     struct spdk_vhost_virtqueue *virtqueue,
			     uint16_t num_descs, uint16_t buffer_id,
			     uint32_t length)
{
	struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx];
	bool used, avail;

	SPDK_DEBUGLOG(vhost_ring,
		      "Queue %td - RING: buffer_id=%"PRIu16"\n",
		      virtqueue - vsession->virtqueue, buffer_id);

	/* When a descriptor has already been used, its avail flag and used flag
	 * are equal, and the used flag matches the used wrap counter.
	 */
	used = !!(desc->flags & VRING_DESC_F_USED);
	avail = !!(desc->flags & VRING_DESC_F_AVAIL);
	if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) {
		SPDK_ERRLOG("descriptor has been used before\n");
		return;
	}

	/* In a used descriptor, addr is unused and len specifies the buffer length
	 * that has been written to by the device.
	 */
	desc->addr = 0;
	desc->len = length;

	/* This bit specifies whether any data has been written by the device */
	if (length != 0) {
		desc->flags |= VRING_DESC_F_WRITE;
	}

	/* Buffer ID is included in the last descriptor in the list.
	 * The driver needs to keep track of the size of the list corresponding
	 * to each buffer ID.
	 */
	desc->id = buffer_id;

	/* A device MUST NOT make the descriptor used before buffer_id is
	 * written to the descriptor.
	 */
	spdk_smp_wmb();
	/* To mark a desc as used, the device sets the F_USED bit in flags to match
	 * the internal Device ring wrap counter. It also sets the F_AVAIL bit to
	 * match the same value.
	 */
	if (virtqueue->packed.used_phase) {
		desc->flags |= VRING_DESC_F_AVAIL_USED;
	} else {
		desc->flags &= ~VRING_DESC_F_AVAIL_USED;
	}

	vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx);
	virtqueue->last_used_idx += num_descs;
	if (virtqueue->last_used_idx >= virtqueue->vring.size) {
		virtqueue->last_used_idx -= virtqueue->vring.size;
		virtqueue->packed.used_phase = !virtqueue->packed.used_phase;
	}

	virtqueue->used_req_cnt++;
}
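
/*
 * Worked example (illustrative only): suppose used_phase == 1 when the function
 * above completes a request. The final flag write sets both VRING_DESC_F_AVAIL
 * and VRING_DESC_F_USED, so a driver whose used wrap counter is also 1 sees the
 * descriptor as used. After last_used_idx wraps past vring.size, used_phase
 * toggles to 0 and completed descriptors are marked by clearing both flags
 * instead.
 */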
bool
vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue)
{
	uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags;

	/* To mark a desc as available, the driver sets the F_AVAIL bit in flags
	 * to match the internal avail wrap counter. It also sets the F_USED bit to
	 * match the inverse value, but that is not mandatory.
	 */
	return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase);
}

bool
vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc)
{
	return (cur_desc->flags & VRING_DESC_F_WRITE) != 0;
}

int
vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
				 struct spdk_vhost_virtqueue *vq,
				 struct vring_packed_desc *desc_table,
				 uint32_t desc_table_size)
{
	if (desc_table != NULL) {
		/* A non-NULL desc_table means the chain is indirect, so the next
		 * desc is found via req_idx and desc_table_size. *desc is set to
		 * NULL when we reach the last desc of this request.
		 */
		(*req_idx)++;
		if (*req_idx < desc_table_size) {
			*desc = &desc_table[*req_idx];
		} else {
			*desc = NULL;
		}
	} else {
		/* A NULL desc_table means the chain is non-indirect, so the next
		 * desc is found via req_idx and the F_NEXT flag. *desc is set to
		 * NULL when we reach the last desc of this request. When a new
		 * desc is returned, req_idx is updated as well.
		 */
		if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) {
			*desc = NULL;
			return 0;
		}

		*req_idx = (*req_idx + 1) % vq->vring.size;
		*desc = &vq->vring.desc_packed[*req_idx];
	}

	return 0;
}

static int
vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
				uint16_t *iov_index, uintptr_t payload, uint64_t remaining)
{
	uintptr_t vva;
	uint64_t len;

	do {
		if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
			SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
			return -1;
		}
		len = remaining;
		vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len);
		if (vva == 0 || len == 0) {
			SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
			return -1;
		}
		iov[*iov_index].iov_base = (void *)vva;
		iov[*iov_index].iov_len = len;
		remaining -= len;
		payload += len;
		(*iov_index)++;
	} while (remaining);

	return 0;
}

int
vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
			       uint16_t *iov_index, const struct vring_packed_desc *desc)
{
	return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
					       desc->addr, desc->len);
}
/* 1. Traverse the desc chain to get the buffer_id and return it as task_idx.
 * 2. Update vq->last_avail_idx to point to the next available desc chain.
 * 3. Toggle the avail_wrap_counter if last_avail_idx wraps around.
 */
uint16_t
vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
				      uint16_t *num_descs)
{
	struct vring_packed_desc *desc;
	uint16_t desc_head = req_idx;

	*num_descs = 1;

	desc = &vq->vring.desc_packed[req_idx];
	if (!vhost_vring_packed_desc_is_indirect(desc)) {
		while ((desc->flags & VRING_DESC_F_NEXT) != 0) {
			req_idx = (req_idx + 1) % vq->vring.size;
			desc = &vq->vring.desc_packed[req_idx];
			(*num_descs)++;
		}
	}

	/* Queue Size doesn't have to be a power of 2.
	 * The device maintains last_avail_idx, so we must make sure
	 * the value stays valid (0 ~ vring.size - 1).
	 */
	vq->last_avail_idx = (req_idx + 1) % vq->vring.size;
	if (vq->last_avail_idx < desc_head) {
		vq->packed.avail_phase = !vq->packed.avail_phase;
	}

	return desc->id;
}

int
vhost_vring_desc_get_next(struct vring_desc **desc,
			  struct vring_desc *desc_table, uint32_t desc_table_size)
{
	struct vring_desc *old_desc = *desc;
	uint16_t next_idx;

	if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
		*desc = NULL;
		return 0;
	}

	next_idx = old_desc->next;
	if (spdk_unlikely(next_idx >= desc_table_size)) {
		*desc = NULL;
		return -1;
	}

	*desc = &desc_table[next_idx];
	return 0;
}

int
vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
			uint16_t *iov_index, const struct vring_desc *desc)
{
	return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
					       desc->addr, desc->len);
}

static struct spdk_vhost_session *
vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
{
	struct spdk_vhost_session *vsession;

	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
		if (vsession->id == id) {
			return vsession;
		}
	}

	return NULL;
}

struct spdk_vhost_session *
vhost_session_find_by_vid(int vid)
{
	struct spdk_vhost_dev *vdev;
	struct spdk_vhost_session *vsession;

	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
		TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
			if (vsession->vid == vid) {
				return vsession;
			}
		}
	}

	return NULL;
}

struct spdk_vhost_dev *
spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
{
	if (vdev == NULL) {
		return TAILQ_FIRST(&g_vhost_devices);
	}

	return TAILQ_NEXT(vdev, tailq);
}

struct spdk_vhost_dev *
spdk_vhost_dev_find(const char *ctrlr_name)
{
	struct spdk_vhost_dev *vdev;
	size_t dev_dirname_len = strlen(dev_dirname);

	if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
		ctrlr_name += dev_dirname_len;
	}

	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
		if (strcmp(vdev->name, ctrlr_name) == 0) {
			return vdev;
		}
	}

	return NULL;
}

static int
vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
{
	int rc;

	if (cpumask == NULL) {
		return -1;
	}

	if (mask == NULL) {
		spdk_cpuset_copy(cpumask, &g_vhost_core_mask);
		return 0;
	}

	rc = spdk_cpuset_parse(cpumask, mask);
	if (rc < 0) {
		SPDK_ERRLOG("invalid cpumask %s\n", mask);
		return -1;
	}

	spdk_cpuset_and(cpumask, &g_vhost_core_mask);

	if (spdk_cpuset_count(cpumask) == 0) {
		SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n",
			    spdk_cpuset_fmt(&g_vhost_core_mask));
		return -1;
	}
	return 0;
}

static void
vhost_setup_core_mask(void *ctx)
{
	struct spdk_thread *thread = spdk_get_thread();
	spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread));
}

static void
vhost_setup_core_mask_done(void *ctx)
{
	spdk_vhost_init_cb init_cb = ctx;

	if (spdk_cpuset_count(&g_vhost_core_mask) == 0) {
		init_cb(-ECHILD);
		return;
	}

	init_cb(0);
}

static void
vhost_dev_thread_exit(void *arg1)
{
	spdk_thread_exit(spdk_get_thread());
}

int
vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
		   const struct spdk_vhost_dev_backend *backend)
{
	char path[PATH_MAX];
	struct spdk_cpuset cpumask = {};
	int rc;

	assert(vdev);
	if (name == NULL) {
		SPDK_ERRLOG("Can't register controller with no name\n");
		return -EINVAL;
	}

	if (vhost_parse_core_mask(mask_str, &cpumask) != 0) {
		SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n",
			    mask_str, spdk_cpuset_fmt(&g_vhost_core_mask));
		return -EINVAL;
	}

	if (spdk_vhost_dev_find(name)) {
		SPDK_ERRLOG("vhost controller %s already exists.\n", name);
		return -EEXIST;
	}

	if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
		SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n",
			    name, dev_dirname, name);
		return -EINVAL;
	}

	vdev->name = strdup(name);
	vdev->path = strdup(path);
	if (vdev->name == NULL || vdev->path == NULL) {
		rc = -EIO;
		goto out;
	}

	vdev->thread = spdk_thread_create(vdev->name, &cpumask);
	if (vdev->thread == NULL) {
		SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name);
		rc = -EIO;
		goto out;
	}

	vdev->registered = true;
	vdev->backend = backend;
	TAILQ_INIT(&vdev->vsessions);

	vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
				 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);

	if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features,
				       vdev->protocol_features)) {
		spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
		rc = -EIO;
		goto out;
	}

	TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq);

	SPDK_INFOLOG(vhost, "Controller %s: new controller added\n", vdev->name);
	return 0;

out:
	free(vdev->name);
	free(vdev->path);
	return rc;
}

int
vhost_dev_unregister(struct spdk_vhost_dev *vdev)
{
	if (!TAILQ_EMPTY(&vdev->vsessions)) {
		SPDK_ERRLOG("Controller %s still has valid connections.\n", vdev->name);
		return -EBUSY;
	}

	if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) {
		SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
			    "Check if domain socket %s still exists\n",
			    vdev->name, vdev->path);
		return -EIO;
	}

	SPDK_INFOLOG(vhost, "Controller %s: removed\n", vdev->name);

	spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);

	free(vdev->name);
	free(vdev->path);
	TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
	return 0;
}

const char *
spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
{
	assert(vdev != NULL);
	return vdev->name;
}

const struct spdk_cpuset *
spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
{
	assert(vdev != NULL);
	return spdk_thread_get_cpumask(vdev->thread);
}
static void
wait_for_semaphore(int timeout_sec, const char *errmsg)
{
	struct timespec timeout;
	int rc;

	clock_gettime(CLOCK_REALTIME, &timeout);
	timeout.tv_sec += timeout_sec;
	rc = sem_timedwait(&g_dpdk_sem, &timeout);
	if (rc != 0) {
		SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
		sem_wait(&g_dpdk_sem);
	}
}

static void
vhost_session_cb_done(int rc)
{
	g_dpdk_response = rc;
	sem_post(&g_dpdk_sem);
}

void
vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
{
	if (response == 0) {
		vsession->started = true;

		assert(vsession->vdev->active_session_num < UINT32_MAX);
		vsession->vdev->active_session_num++;
	}

	vhost_session_cb_done(response);
}

void
vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
{
	if (response == 0) {
		vsession->started = false;

		assert(vsession->vdev->active_session_num > 0);
		vsession->vdev->active_session_num--;
	}

	vhost_session_cb_done(response);
}

static void
vhost_event_cb(void *arg1)
{
	struct vhost_session_fn_ctx *ctx = arg1;
	struct spdk_vhost_session *vsession;

	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
		spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1);
		return;
	}

	vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
	ctx->cb_fn(ctx->vdev, vsession, NULL);
	pthread_mutex_unlock(&g_vhost_mutex);
}

int
vhost_session_send_event(struct spdk_vhost_session *vsession,
			 spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
			 const char *errmsg)
{
	struct vhost_session_fn_ctx ev_ctx = {0};
	struct spdk_vhost_dev *vdev = vsession->vdev;

	ev_ctx.vdev = vdev;
	ev_ctx.vsession_id = vsession->id;
	ev_ctx.cb_fn = cb_fn;

	spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx);

	pthread_mutex_unlock(&g_vhost_mutex);
	wait_for_semaphore(timeout_sec, errmsg);
	pthread_mutex_lock(&g_vhost_mutex);

	return g_dpdk_response;
}

static void
foreach_session_finish_cb(void *arg1)
{
	struct vhost_session_fn_ctx *ev_ctx = arg1;
	struct spdk_vhost_dev *vdev = ev_ctx->vdev;

	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
		spdk_thread_send_msg(spdk_get_thread(),
				     foreach_session_finish_cb, arg1);
		return;
	}

	assert(vdev->pending_async_op_num > 0);
	vdev->pending_async_op_num--;
	if (ev_ctx->cpl_fn != NULL) {
		ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx);
	}

	pthread_mutex_unlock(&g_vhost_mutex);
	free(ev_ctx);
}

static void
foreach_session(void *arg1)
{
	struct vhost_session_fn_ctx *ev_ctx = arg1;
	struct spdk_vhost_session *vsession;
	struct spdk_vhost_dev *vdev = ev_ctx->vdev;
	int rc;

	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
		spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1);
		return;
	}

	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
		if (vsession->initialized) {
			rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx);
			if (rc < 0) {
				goto out;
			}
		}
	}

out:
	pthread_mutex_unlock(&g_vhost_mutex);

	spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1);
}
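
/*
 * Illustrative sketch (not part of the library): a backend typically drives the
 * helper below with a per-session callback plus an optional completion
 * callback, e.g. to push updated settings to every active session. The
 * callback names here are hypothetical.
 *
 *	static int
 *	example_update_session(struct spdk_vhost_dev *vdev,
 *			       struct spdk_vhost_session *vsession, void *ctx)
 *	{
 *		... runs on vdev->thread for each initialized session ...
 *		return 0;
 *	}
 *
 *	static void
 *	example_update_done(struct spdk_vhost_dev *vdev, void *ctx)
 *	{
 *		... runs on the init thread once all sessions were visited ...
 *	}
 *
 *	vhost_dev_foreach_session(vdev, example_update_session,
 *				  example_update_done, NULL);
 */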
void
vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
			  spdk_vhost_session_fn fn,
			  spdk_vhost_dev_fn cpl_fn,
			  void *arg)
{
	struct vhost_session_fn_ctx *ev_ctx;

	ev_ctx = calloc(1, sizeof(*ev_ctx));
	if (ev_ctx == NULL) {
		SPDK_ERRLOG("Failed to alloc vhost event.\n");
		assert(false);
		return;
	}

	ev_ctx->vdev = vdev;
	ev_ctx->cb_fn = fn;
	ev_ctx->cpl_fn = cpl_fn;
	ev_ctx->user_ctx = arg;

	assert(vdev->pending_async_op_num < UINT32_MAX);
	vdev->pending_async_op_num++;

	spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx);
}

static int
_stop_session(struct spdk_vhost_session *vsession)
{
	struct spdk_vhost_dev *vdev = vsession->vdev;
	struct spdk_vhost_virtqueue *q;
	int rc;
	uint16_t i;

	rc = vdev->backend->stop_session(vsession);
	if (rc != 0) {
		SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
		pthread_mutex_unlock(&g_vhost_mutex);
		return rc;
	}

	for (i = 0; i < vsession->max_queues; i++) {
		q = &vsession->virtqueue[i];

		/* vring.desc and vring.desc_packed are in a union struct
		 * so q->vring.desc can replace q->vring.desc_packed.
		 */
		if (q->vring.desc == NULL) {
			continue;
		}

		/* Packed virtqueues support up to 2^15 entries each,
		 * so the top bit can be used as the wrap counter.
		 */
		if (q->packed.packed_ring) {
			q->last_avail_idx = q->last_avail_idx |
					    ((uint16_t)q->packed.avail_phase << 15);
			q->last_used_idx = q->last_used_idx |
					   ((uint16_t)q->packed.used_phase << 15);
		}

		rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
	}

	vhost_session_mem_unregister(vsession->mem);
	free(vsession->mem);

	return 0;
}

int
vhost_stop_device_cb(int vid)
{
	struct spdk_vhost_session *vsession;
	int rc;

	pthread_mutex_lock(&g_vhost_mutex);
	vsession = vhost_session_find_by_vid(vid);
	if (vsession == NULL) {
		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
		pthread_mutex_unlock(&g_vhost_mutex);
		return -EINVAL;
	}

	if (!vsession->started) {
		/* already stopped, nothing to do */
		pthread_mutex_unlock(&g_vhost_mutex);
		return -EALREADY;
	}

	rc = _stop_session(vsession);
	pthread_mutex_unlock(&g_vhost_mutex);

	return rc;
}
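
/*
 * Worked example (illustrative only): when a packed queue is stopped in
 * _stop_session() above with last_avail_idx = 5 and avail_phase = 1, the value
 * saved via rte_vhost_set_vring_base() is 0x8005. On restart,
 * vhost_start_device_cb() below reverses the encoding:
 * phase = value >> 15 and index = value & 0x7FFF.
 */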
int
vhost_start_device_cb(int vid)
{
	struct spdk_vhost_dev *vdev;
	struct spdk_vhost_session *vsession;
	int rc = -1;
	uint16_t i;
	bool packed_ring;

	pthread_mutex_lock(&g_vhost_mutex);

	vsession = vhost_session_find_by_vid(vid);
	if (vsession == NULL) {
		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
		goto out;
	}

	if (spdk_interrupt_mode_is_enabled()) {
		vsession->interrupt_mode = true;
	}

	vdev = vsession->vdev;
	if (vsession->started) {
		/* already started, nothing to do */
		rc = 0;
		goto out;
	}

	if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
		SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
		goto out;
	}

	packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0);

	vsession->max_queues = 0;
	memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
	for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];

		q->vsession = vsession;
		q->vring_idx = -1;
		if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) {
			continue;
		}
		q->vring_idx = i;
		rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight);

		/* vring.desc and vring.desc_packed are in a union struct
		 * so q->vring.desc can replace q->vring.desc_packed.
		 */
		if (q->vring.desc == NULL || q->vring.size == 0) {
			continue;
		}

		if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) {
			q->vring.desc = NULL;
			continue;
		}

		if (packed_ring) {
			/* Packed virtqueues support up to 2^15 entries each,
			 * so the top bit can be used as the wrap counter.
			 */
			q->packed.avail_phase = q->last_avail_idx >> 15;
			q->last_avail_idx = q->last_avail_idx & 0x7FFF;
			q->packed.used_phase = q->last_used_idx >> 15;
			q->last_used_idx = q->last_used_idx & 0x7FFF;

			if (!vsession->interrupt_mode) {
				/* Disable I/O submission notifications, we'll be polling. */
				q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
			}
		} else {
			if (!vsession->interrupt_mode) {
				/* Disable I/O submission notifications, we'll be polling. */
				q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
			}
		}

		q->packed.packed_ring = packed_ring;
		vsession->max_queues = i + 1;
	}

	if (vhost_get_mem_table(vid, &vsession->mem) != 0) {
		SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
		goto out;
	}

	/*
	 * Not entirely clear why, but this looks like some kind of QEMU bug: guest I/O
	 * might be frozen after live migration unless all queues are kicked here. It
	 * appears the previous vhost instance failed to effectively deliver all
	 * interrupts before the GET_VRING_BASE message. This shouldn't harm the guest,
	 * since spurious interrupts should be ignored by the guest virtio driver.
	 *
	 * Tested on QEMU 2.10.91 and 2.11.50.
	 */
	for (i = 0; i < vsession->max_queues; i++) {
		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];

		/* vring.desc and vring.desc_packed are in a union struct
		 * so q->vring.desc can replace q->vring.desc_packed.
		 */
		if (q->vring.desc != NULL && q->vring.size > 0) {
			rte_vhost_vring_call(vsession->vid, q->vring_idx);
		}
	}

	vhost_session_set_coalescing(vdev, vsession, NULL);
	vhost_session_mem_register(vsession->mem);
	vsession->initialized = true;
	rc = vdev->backend->start_session(vsession);
	if (rc != 0) {
		vhost_session_mem_unregister(vsession->mem);
		free(vsession->mem);
		goto out;
	}

out:
	pthread_mutex_unlock(&g_vhost_mutex);
	return rc;
}

int
spdk_vhost_set_socket_path(const char *basename)
{
	int ret;

	if (basename && strlen(basename) > 0) {
		ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
		if (ret <= 0) {
			return -EINVAL;
		}
		if ((size_t)ret >= sizeof(dev_dirname) - 2) {
			SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
			return -EINVAL;
		}

		if (dev_dirname[ret - 1] != '/') {
			dev_dirname[ret] = '/';
			dev_dirname[ret + 1] = '\0';
		}
	}

	return 0;
}

void
vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
	assert(vdev->backend->dump_info_json != NULL);
	vdev->backend->dump_info_json(vdev, w);
}

int
spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
{
	if (vdev->pending_async_op_num) {
		return -EBUSY;
	}

	return vdev->backend->remove_device(vdev);
}

int
vhost_new_connection_cb(int vid, const char *ifname)
{
	struct spdk_vhost_dev *vdev;
	struct spdk_vhost_session *vsession;

	pthread_mutex_lock(&g_vhost_mutex);

	vdev = spdk_vhost_dev_find(ifname);
	if (vdev == NULL) {
		SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
		pthread_mutex_unlock(&g_vhost_mutex);
		return -1;
	}

	/* We expect sessions inside vdev->vsessions to be sorted in ascending
	 * order with regard to vsession->id. For now we always set
	 * id = vsessions_num++ and append each session to the very end of the
	 * vsessions list. This is required for spdk_vhost_dev_foreach_session()
	 * to work.
	 */
	if (vdev->vsessions_num == UINT_MAX) {
		assert(false);
		return -EINVAL;
	}

	if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) +
			   vdev->backend->session_ctx_size)) {
		SPDK_ERRLOG("vsession alloc failed\n");
		pthread_mutex_unlock(&g_vhost_mutex);
		return -1;
	}
	memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size);

	vsession->vdev = vdev;
	vsession->vid = vid;
	vsession->id = vdev->vsessions_num++;
	vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid);
	if (vsession->name == NULL) {
		SPDK_ERRLOG("vsession alloc failed\n");
		pthread_mutex_unlock(&g_vhost_mutex);
		free(vsession);
		return -1;
	}
	vsession->started = false;
	vsession->initialized = false;
	vsession->next_stats_check_time = 0;
	vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS *
					 spdk_get_ticks_hz() / 1000UL;
	TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);

	vhost_session_install_rte_compat_hooks(vsession);
	pthread_mutex_unlock(&g_vhost_mutex);
	return 0;
}

int
vhost_destroy_connection_cb(int vid)
{
	struct spdk_vhost_session *vsession;
	int rc = 0;

	pthread_mutex_lock(&g_vhost_mutex);
	vsession = vhost_session_find_by_vid(vid);
	if (vsession == NULL) {
		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
		pthread_mutex_unlock(&g_vhost_mutex);
		return -EINVAL;
	}

	if (vsession->started) {
		rc = _stop_session(vsession);
	}

	TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
	free(vsession->name);
	free(vsession);
	pthread_mutex_unlock(&g_vhost_mutex);

	return rc;
}

void
spdk_vhost_lock(void)
{
	pthread_mutex_lock(&g_vhost_mutex);
}

int
spdk_vhost_trylock(void)
{
	return -pthread_mutex_trylock(&g_vhost_mutex);
}

void
spdk_vhost_unlock(void)
{
	pthread_mutex_unlock(&g_vhost_mutex);
}

void
spdk_vhost_init(spdk_vhost_init_cb init_cb)
{
	size_t len;
	int ret;

	g_vhost_init_thread = spdk_get_thread();
	assert(g_vhost_init_thread != NULL);

	if (dev_dirname[0] == '\0') {
		if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
			SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
			ret = -1;
			goto out;
		}

		len = strlen(dev_dirname);
		if (dev_dirname[len - 1] != '/') {
			dev_dirname[len] = '/';
			dev_dirname[len + 1] = '\0';
		}
	}

	ret = sem_init(&g_dpdk_sem, 0, 0);
	if (ret != 0) {
		SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n");
		ret = -1;
		goto out;
	}

	spdk_cpuset_zero(&g_vhost_core_mask);

	/* iterate threads instead of using SPDK_ENV_FOREACH_CORE to ensure that
	 * threads are really created.
	 */
	spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done);
	return;

out:
	init_cb(ret);
}

static void
vhost_fini(void *arg1)
{
	struct spdk_vhost_dev *vdev, *tmp;

	spdk_vhost_lock();
	vdev = spdk_vhost_dev_next(NULL);
	while (vdev != NULL) {
		tmp = spdk_vhost_dev_next(vdev);
		spdk_vhost_dev_remove(vdev);
		/* don't care if it fails, there's nothing we can do for now */
		vdev = tmp;
	}
	spdk_vhost_unlock();

	spdk_cpuset_zero(&g_vhost_core_mask);

	/* All devices are removed now. */
	sem_destroy(&g_dpdk_sem);

	g_fini_cpl_cb();
}

static void *
session_shutdown(void *arg)
{
	struct spdk_vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
		vhost_driver_unregister(vdev->path);
		vdev->registered = false;
	}

	SPDK_INFOLOG(vhost, "Exiting\n");
	spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL);
	return NULL;
}

void
spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
{
	pthread_t tid;
	int rc;

	assert(spdk_get_thread() == g_vhost_init_thread);
	g_fini_cpl_cb = fini_cb;

	/* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
	 * ops for stopping a device or removing a connection, we need to call it from
	 * a separate thread to avoid deadlock.
	 */
	rc = pthread_create(&tid, NULL, &session_shutdown, NULL);
	if (rc != 0) {
		/* pthread_create() returns a positive errno value on failure. */
		SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
		abort();
	}
	pthread_detach(tid);
}

void
spdk_vhost_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_vhost_dev *vdev;
	uint32_t delay_base_us;
	uint32_t iops_threshold;

	spdk_json_write_array_begin(w);

	spdk_vhost_lock();
	vdev = spdk_vhost_dev_next(NULL);
	while (vdev != NULL) {
		vdev->backend->write_config_json(vdev, w);

		spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
		if (delay_base_us) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing");

			spdk_json_write_named_object_begin(w, "params");
			spdk_json_write_named_string(w, "ctrlr", vdev->name);
			spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
			spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
			spdk_json_write_object_end(w);

			spdk_json_write_object_end(w);
		}
		vdev = spdk_vhost_dev_next(vdev);
	}
	spdk_vhost_unlock();

	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(vhost)
SPDK_LOG_REGISTER_COMPONENT(vhost_ring)