/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/conf.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"

#include "spdk/bdev_module.h"
#include "spdk_internal/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8
#define ZERO_BUFFER_SIZE			0x100000

#define OWNER_BDEV		0x2

#define OBJECT_BDEV_IO		0x2

#define TRACE_GROUP_BDEV	0x3
#define TRACE_BDEV_IO_START	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0)
#define TRACE_BDEV_IO_DONE	SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1)

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

static const char *qos_conf_type[] = {"Limit_IOPS",
				      "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS"
				     };
static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };
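
/*
 * Illustrative example (not part of the upstream code): the qos_rpc_type[]
 * strings above are the parameter keys used for the bdev_set_qos_limit RPC,
 * so a "params" object built from them might look like the following. The
 * bdev name and the numeric values here are purely hypothetical:
 *
 *	{ "name": "Malloc0", "rw_ios_per_sec": 20000, "r_mbytes_per_sec": 100 }
 */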

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};
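
/*
 * Worked example (illustrative only): with SPDK_BDEV_QOS_TIMESLICE_IN_USEC set
 * to 1000 above, there are 1000000 / 1000 = 1000 timeslices per second, so a
 * limit of 10000 rw_ios_per_sec corresponds to roughly 10000 / 1000 = 10 I/O
 * allowed per 1 ms timeslice, which is the order of magnitude max_per_timeslice
 * would take for that limit.
 */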

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of spdk_bdev_io directly associated with a call to the public bdev API.
	 * It does not include any spdk_bdev_io that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	uint32_t flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		bool open_with_ext;
		union {
			spdk_bdev_remove_cb_t remove_fn;
			spdk_bdev_event_cb_t event_fn;
		};
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	pthread_mutex_t			mutex;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);

static int
bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
			  uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg);
static int
bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt, void *md_buf,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg);

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg);

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
{
	*opts = g_bdev_opts;
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	g_bdev_opts = *opts;
	return 0;
}
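
/*
 * Usage sketch (illustrative, not part of the upstream code): an application
 * would typically adjust the global options before subsystem initialization,
 * e.g.:
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts);
 *	opts.bdev_io_pool_size = 128 * 1024;	// hypothetical value
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// rejected: pool size smaller than cache size times (thread count + 1)
 *	}
 */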

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_module == NULL) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_alias *tmp;
	struct spdk_bdev *bdev = spdk_bdev_first();

	while (bdev != NULL) {
		if (strcmp(bdev_name, bdev->name) == 0) {
			return bdev;
		}

		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
			if (strcmp(bdev_name, tmp->alias) == 0) {
				return bdev;
			}
		}

		bdev = spdk_bdev_next(bdev);
	}

	return NULL;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(buf, iovs[i].iov_base, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
{
	int i;
	size_t len;

	for (i = 0; i < iovcnt; i++) {
		len = spdk_min(iovs[i].iov_len, buf_len);
		memcpy(iovs[i].iov_base, buf, len);
		buf += len;
		buf_len -= len;
	}
}

static void
_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
	}
}

static void
_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		memcpy(md_buf, bdev_io->internal.orig_md_buf, len);
	}
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.buf = buf;
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t md_len, alignment;
	void *aligned_buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, buf, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	if (spdk_bdev_is_md_separate(bdev)) {
		aligned_buf = (char *)aligned_buf + len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len);
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len);
		}
	}
	bdev_io_get_buf_complete(bdev_io, buf, true);
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	struct spdk_bdev_io *tmp;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *ch;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);
	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &ch->need_buf_large;
	}

	if (STAILQ_EMPTY(stailq)) {
		spdk_mempool_put(pool, buf);
	} else {
		tmp = STAILQ_FIRST(stailq);
		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
		_bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
	}
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
{
	if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
		assert(bdev_io->internal.orig_md_buf == NULL);
		return;
	}

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
				  bdev_io->internal.orig_iovcnt,
				  bdev_io->internal.bounce_iov.iov_base,
				  bdev_io->internal.bounce_iov.iov_len);
	}
	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf,
			       bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev));
		}

		bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf;
		bdev_io->internal.orig_md_buf = NULL;
	}

	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_mempool *pool;
	bdev_io_stailq_t *stailq;
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t alignment, md_len;
	void *buf;

	alignment = spdk_bdev_get_buf_align(bdev);
	md_len = spdk_bdev_is_md_separate(bdev) ?
		 bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
			    len + alignment);
		bdev_io_get_buf_complete(bdev_io, NULL, false);
		return;
	}

	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	bdev_io->internal.buf_len = len;

	if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
	    SPDK_BDEV_POOL_ALIGNMENT) {
		pool = g_bdev_mgr.buf_small_pool;
		stailq = &mgmt_ch->need_buf_small;
	} else {
		pool = g_bdev_mgr.buf_large_pool;
		stailq = &mgmt_ch->need_buf_large;
	}

	buf = spdk_mempool_get(pool);
	if (!buf) {
		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
	} else {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}
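
/*
 * Usage sketch (illustrative): a bdev module that needs a data buffer before it
 * can service a READ typically calls spdk_bdev_io_get_buf() from its
 * submit_request path and continues in the callback. The function names below
 * are hypothetical:
 *
 *	static void
 *	example_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *			   bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at an aligned buffer of the
 *		// requested length; issue the backend read here.
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, example_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */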

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

void
spdk_bdev_config_text(FILE *fp)
{
	struct spdk_bdev_module *bdev_module;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_text) {
			bdev_module->config_text(fp);
		}
	}
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	pthread_mutex_lock(&g_bdev_mgr.mutex);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	pthread_mutex_unlock(&g_bdev_mgr.mutex);

	spdk_json_write_array_end(w);
}
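
/*
 * Illustrative output (shape and values assumed, not taken from a real run):
 * the array written above starts with a bdev_set_options entry reflecting
 * g_bdev_opts, roughly of the form
 *
 *	[
 *	  { "method": "bdev_set_options",
 *	    "params": { "bdev_io_pool_size": 65535, "bdev_io_cache_size": 256 } },
 *	  ... per-module entries, per-bdev entries, and bdev_set_qos_limit entries ...
 *	]
 */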

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;

	STAILQ_INIT(&ch->need_buf_small);
	STAILQ_INIT(&ch->need_buf_large);

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static void
bdev_module_action_complete(void)
{
	struct spdk_bdev_module *m;

	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * if the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return;
		}
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	module->internal.action_in_progress--;
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			module->internal.action_in_progress = 1;
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing
			 * modules_init. Send a message to defer application shutdown until
			 * resources are cleaned up.
			 */
			module->internal.action_in_progress = 1;
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}
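
/*
 * Sketch of the module side of this contract (illustrative; all names below are
 * hypothetical). A bdev module registers itself with SPDK_BDEV_MODULE_REGISTER
 * and, if it sets async_init, must eventually call spdk_bdev_module_init_done():
 *
 *	static int
 *	example_bdev_initialize(void)
 *	{
 *		// kick off asynchronous probing here; completion path calls
 *		// spdk_bdev_module_init_done(&example_if)
 *		return 0;
 *	}
 *
 *	static struct spdk_bdev_module example_if = {
 *		.name = "example",
 *		.module_init = example_bdev_initialize,
 *		.async_init = true,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(example, &example_if)
 */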

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	struct spdk_conf_section *sp;
	struct spdk_bdev_opts bdev_opts;
	int32_t bdev_io_pool_size, bdev_io_cache_size;
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	sp = spdk_conf_find_section(NULL, "Bdev");
	if (sp != NULL) {
		spdk_bdev_get_opts(&bdev_opts);

		bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
		if (bdev_io_pool_size >= 0) {
			bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
		}

		bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
		if (bdev_io_cache_size >= 0) {
			bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
		}

		if (spdk_bdev_set_opts(&bdev_opts)) {
			bdev_init_complete(-1);
			return;
		}

		assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
	}

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	spdk_notify_type_register("bdev_register");
	spdk_notify_type_register("bdev_unregister");

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_thread_get_count() to determine how many local caches we need
	 * to account for.
	 */
	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());

	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
				    BUF_SMALL_POOL_SIZE,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_small_pool) {
		SPDK_ERRLOG("create rbuf small pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count());
	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());

	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
				    BUF_LARGE_POOL_SIZE,
				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
				    SPDK_BDEV_POOL_ALIGNMENT,
				    cache_size,
				    SPDK_ENV_SOCKET_ID_ANY);
	if (!g_bdev_mgr.buf_large_pool) {
		SPDK_ERRLOG("create rbuf large pool failed\n");
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
				bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		return;
	}

	bdev_module_action_complete();
}

static void
bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (g_bdev_mgr.bdev_io_pool) {
		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
				    g_bdev_opts.bdev_io_pool_size);
		}

		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	}

	if (g_bdev_mgr.buf_small_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
			SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
				    BUF_SMALL_POOL_SIZE);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_small_pool);
	}

	if (g_bdev_mgr.buf_large_pool) {
		if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
			SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
				    BUF_LARGE_POOL_SIZE);
			assert(false);
		}

		spdk_mempool_free(g_bdev_mgr.buf_large_pool);
	}

	spdk_free(g_bdev_mgr.zero_buffer);

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
	pthread_mutex_destroy(&g_bdev_mgr.mutex);
}

static void
bdev_module_finish_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* FIXME: Handling initialization failures is broken now,
	 * so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call bdev_mgr_unregister_cb().
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_finish_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_finish_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_finish_iter, NULL);
	} else {
		bdev_module_finish_iter(NULL);
	}
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_finish_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		if (bdev->internal.claim_module != NULL) {
			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n",
				      bdev->name, bdev->internal.claim_module->name);
			continue;
		}

		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim underlying bdev properly, we may face the
	 * case of bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph which is
	 * clearly impossible). Warn and unregister last bdev on the list then.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

void
spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_module *m;

	assert(cb_fn != NULL);

	g_fini_thread = spdk_get_thread();

	g_fini_cb_fn = cb_fn;
	g_fini_cb_arg = cb_arg;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->fini_start) {
			m->fini_start();
		}
	}

	bdev_finish_unregister_bdevs_iter(NULL, 0);
}

struct spdk_bdev_io *
bdev_channel_get_io(struct spdk_bdev_channel *channel)
{
	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
	struct spdk_bdev_io *bdev_io;

	if (ch->per_thread_cache_count > 0) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
		/*
		 * Don't try to look for bdev_ios in the global pool if there are
		 * waiters on bdev_ios - we don't want this caller to jump the line.
		 */
		bdev_io = NULL;
	} else {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
	}

	return bdev_io;
}

void
spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_mgmt_channel *ch;

	assert(bdev_io != NULL);
	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;

	if (bdev_io->internal.buf != NULL) {
		bdev_io_put_buf(bdev_io);
	}

	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
			struct spdk_bdev_io_wait_entry *entry;

			entry = TAILQ_FIRST(&ch->io_wait_queue);
			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
			entry->cb_fn(entry->cb_arg);
		}
	} else {
		/* We should never have a full cache with entries on the io wait queue. */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}
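
/*
 * Usage sketch (illustrative): when a submission such as spdk_bdev_read() fails
 * with -ENOMEM because no spdk_bdev_io could be allocated, callers typically
 * park a wait entry and retry once an I/O is returned to this cache or pool.
 * The context structure and function names below are hypothetical:
 *
 *	struct example_ctx {
 *		struct spdk_bdev_io_wait_entry bdev_io_wait;
 *		// ... caller state needed to re-issue the request ...
 *	};
 *
 *	static void
 *	example_retry(void *arg)
 *	{
 *		struct example_ctx *ctx = arg;
 *
 *		// re-issue the original spdk_bdev_* call here using ctx
 *	}
 *
 *	// on -ENOMEM:
 *	ctx->bdev_io_wait.bdev = bdev;
 *	ctx->bdev_io_wait.cb_fn = example_retry;
 *	ctx->bdev_io_wait.cb_arg = ctx;
 *	spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->bdev_io_wait);
 */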

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) set for read operation */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Populate to read from disk */
		if (bdev_io->u.bdev.zcopy.populate) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (bdev_io->u.bdev.zcopy.start) {
			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
		} else {
			return 0;
		}
	default:
		return 0;
	}
}

static bool
bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return false;
	}

	return bdev_qos_rw_queue_io(limit, io);
}

static bool
bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == true) {
		return false;
	}

	return bdev_qos_rw_queue_io(limit, io);
}

static void
bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice--;
}

static void
bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
}

static void
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return;
	}

	return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == true) {
		return;
	}

	return bdev_qos_rw_bps_update_quota(limit, io);
}

static void
bdev_qos_set_ops(struct spdk_bdev_qos *qos)
{
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			qos->rate_limits[i].queue_io = NULL;
			qos->rate_limits[i].update_quota = NULL;
			continue;
		}

		switch (i) {
		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
			break;
		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
			break;
		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
			break;
		default:
			break;
		}
	}
}

static inline void
bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_io_channel *ch = bdev_ch->channel;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.in_submit_request = true;
		bdev->fn_table->submit_request(ch, bdev_io);
		bdev_io->internal.in_submit_request = false;
	} else {
		TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
	}
}

static int
bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
{
	struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL;
	int i, submitted_ios = 0;

	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
		if (bdev_qos_io_to_limit(bdev_io) == true) {
			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
				if (!qos->rate_limits[i].queue_io) {
					continue;
				}

				if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
								 bdev_io) == true) {
					return submitted_ios;
				}
			}
			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
				if (!qos->rate_limits[i].update_quota) {
					continue;
				}

				qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
			}
		}

		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
		bdev_io_do_submit(ch, bdev_io);
		submitted_ios++;
	}

	return submitted_ios;
}

static void
bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
{
	int rc;

	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
				     &bdev_io->internal.waitq_entry);
	if (rc != 0) {
		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static bool
bdev_io_type_can_split(uint8_t type)
{
	assert(type != SPDK_BDEV_IO_TYPE_INVALID);
	assert(type < SPDK_BDEV_NUM_IO_TYPES);

	/* Only split READ and WRITE I/O. Theoretically other types of I/O like
	 * UNMAP could be split, but these types of I/O are typically much larger
	 * in size (sometimes the size of the entire block device), and the bdev
	 * module can more efficiently split these types of I/O. Plus those types
	 * of I/O do not have a payload, which makes the splitting process simpler.
	 */
	if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_io_should_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t start_stripe, end_stripe;
	uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;

	if (io_boundary == 0) {
		return false;
	}

	if (!bdev_io_type_can_split(bdev_io->type)) {
		return false;
	}

	start_stripe = bdev_io->u.bdev.offset_blocks;
	end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
	if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
		start_stripe >>= spdk_u32log2(io_boundary);
		end_stripe >>= spdk_u32log2(io_boundary);
	} else {
		start_stripe /= io_boundary;
		end_stripe /= io_boundary;
	}
	return (start_stripe != end_stripe);
}

static uint32_t
_to_next_boundary(uint64_t offset, uint32_t boundary)
{
	return (boundary - (offset % boundary));
}
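
/*
 * Worked example (illustrative): with optimal_io_boundary = 128 blocks, an I/O
 * at offset_blocks = 100 with num_blocks = 64 spans stripes 0 and 1, so
 * bdev_io_should_split() returns true. The first child I/O then covers
 * _to_next_boundary(100, 128) = 28 blocks (100..127) and the second child
 * covers the remaining 36 blocks (128..163).
 */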

static void
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);

static void
_bdev_io_split(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	uint64_t current_offset, remaining;
	uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
	struct iovec *parent_iov, *iov;
	uint64_t parent_iov_offset, iov_len;
	uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt;
	void *md_buf = NULL;
	int rc;

	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
	blocklen = bdev_io->bdev->blocklen;
	parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen;
	parent_iovcnt = bdev_io->u.bdev.iovcnt;

	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
		if (parent_iov_offset < parent_iov->iov_len) {
			break;
		}
		parent_iov_offset -= parent_iov->iov_len;
	}

	child_iovcnt = 0;
	while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
		to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary);
		to_next_boundary = spdk_min(remaining, to_next_boundary);
		to_next_boundary_bytes = to_next_boundary * blocklen;
		iov = &bdev_io->child_iov[child_iovcnt];
		iovcnt = 0;

		if (bdev_io->u.bdev.md_buf) {
			assert((parent_iov_offset % blocklen) > 0);
			md_buf = (char *)bdev_io->u.bdev.md_buf + (parent_iov_offset / blocklen) *
				 spdk_bdev_get_md_size(bdev_io->bdev);
		}

		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
		       child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
			iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset);
			to_next_boundary_bytes -= iov_len;

			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;

			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
				parent_iov_offset += iov_len;
			} else {
				parent_iovpos++;
				parent_iov_offset = 0;
			}
			child_iovcnt++;
			iovcnt++;
		}

		if (to_next_boundary_bytes > 0) {
			/* We had to stop this child I/O early because we ran out of
			 * child_iov space. Ensure the iovs are aligned with the block
			 * size and then adjust to_next_boundary before starting the
			 * child I/O.
			 */
			assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt so the loop will naturally end */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							return;
						}
					}
					to_last_block_bytes -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		bdev_io->u.bdev.split_outstanding++;

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
						       spdk_io_channel_from_ctx(bdev_io->internal.ch),
						       iov, iovcnt, md_buf, current_offset,
						       to_next_boundary,
						       bdev_io_split_done, bdev_io);
		} else {
			rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
							spdk_io_channel_from_ctx(bdev_io->internal.ch),
							iov, iovcnt, md_buf, current_offset,
							to_next_boundary,
							bdev_io_split_done, bdev_io);
		}

		if (rc == 0) {
			current_offset += to_next_boundary;
			remaining -= to_next_boundary;
			bdev_io->u.bdev.split_current_offset_blocks = current_offset;
			bdev_io->u.bdev.split_remaining_num_blocks = remaining;
		} else {
			bdev_io->u.bdev.split_outstanding--;
			if (rc == -ENOMEM) {
				if (bdev_io->u.bdev.split_outstanding == 0) {
					/* No I/O is outstanding. Hence we should wait here. */
					bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split);
				}
			} else {
				bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
				if (bdev_io->u.bdev.split_outstanding == 0) {
					spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0,
							      (uintptr_t)bdev_io, 0);
					TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
					bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
				}
			}

			return;
		}
	}
}
*/ 1877 bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split); 1878 } 1879 } else { 1880 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1881 if (bdev_io->u.bdev.split_outstanding == 0) { 1882 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 1883 (uintptr_t)bdev_io, 0); 1884 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 1885 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 1886 } 1887 } 1888 1889 return; 1890 } 1891 } 1892 } 1893 1894 static void 1895 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 1896 { 1897 struct spdk_bdev_io *parent_io = cb_arg; 1898 1899 spdk_bdev_free_io(bdev_io); 1900 1901 if (!success) { 1902 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1903 } 1904 parent_io->u.bdev.split_outstanding--; 1905 if (parent_io->u.bdev.split_outstanding != 0) { 1906 return; 1907 } 1908 1909 /* 1910 * Parent I/O finishes when all blocks are consumed. 1911 */ 1912 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 1913 assert(parent_io->internal.cb != bdev_io_split_done); 1914 spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, 1915 (uintptr_t)parent_io, 0); 1916 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 1917 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 1918 parent_io->internal.caller_ctx); 1919 return; 1920 } 1921 1922 /* 1923 * Continue with the splitting process. This function will complete the parent I/O if the 1924 * splitting is done. 1925 */ 1926 _bdev_io_split(parent_io); 1927 } 1928 1929 static void 1930 bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 1931 1932 static void 1933 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 1934 { 1935 assert(bdev_io_type_can_split(bdev_io->type)); 1936 1937 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 1938 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 1939 bdev_io->u.bdev.split_outstanding = 0; 1940 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 1941 1942 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 1943 _bdev_io_split(bdev_io); 1944 } else { 1945 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1946 spdk_bdev_io_get_buf(bdev_io, bdev_io_split_get_buf_cb, 1947 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 1948 } 1949 } 1950 1951 static void 1952 bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 1953 { 1954 if (!success) { 1955 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1956 return; 1957 } 1958 1959 bdev_io_split(ch, bdev_io); 1960 } 1961 1962 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 1963 * be inlined, at least on some compilers. 
1964 */ 1965 static inline void 1966 _bdev_io_submit(void *ctx) 1967 { 1968 struct spdk_bdev_io *bdev_io = ctx; 1969 struct spdk_bdev *bdev = bdev_io->bdev; 1970 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1971 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1972 uint64_t tsc; 1973 1974 tsc = spdk_get_ticks(); 1975 bdev_io->internal.submit_tsc = tsc; 1976 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); 1977 1978 if (spdk_likely(bdev_ch->flags == 0)) { 1979 bdev_io_do_submit(bdev_ch, bdev_io); 1980 return; 1981 } 1982 1983 bdev_ch->io_outstanding++; 1984 shared_resource->io_outstanding++; 1985 bdev_io->internal.in_submit_request = true; 1986 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 1987 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1988 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 1989 bdev_ch->io_outstanding--; 1990 shared_resource->io_outstanding--; 1991 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 1992 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 1993 } else { 1994 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 1995 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 1996 } 1997 bdev_io->internal.in_submit_request = false; 1998 } 1999 2000 bool 2001 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2002 2003 bool 2004 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2005 { 2006 if (range1->length == 0 || range2->length == 0) { 2007 return false; 2008 } 2009 2010 if (range1->offset + range1->length <= range2->offset) { 2011 return false; 2012 } 2013 2014 if (range2->offset + range2->length <= range1->offset) { 2015 return false; 2016 } 2017 2018 return true; 2019 } 2020 2021 static bool 2022 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2023 { 2024 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2025 struct lba_range r; 2026 2027 switch (bdev_io->type) { 2028 case SPDK_BDEV_IO_TYPE_NVME_IO: 2029 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2030 /* Don't try to decode the NVMe command - just assume worst-case and that 2031 * it overlaps a locked range. 2032 */ 2033 return true; 2034 case SPDK_BDEV_IO_TYPE_WRITE: 2035 case SPDK_BDEV_IO_TYPE_UNMAP: 2036 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2037 case SPDK_BDEV_IO_TYPE_ZCOPY: 2038 r.offset = bdev_io->u.bdev.offset_blocks; 2039 r.length = bdev_io->u.bdev.num_blocks; 2040 if (!bdev_lba_range_overlapped(range, &r)) { 2041 /* This I/O doesn't overlap the specified LBA range. */ 2042 return false; 2043 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2044 /* This I/O overlaps, but the I/O is on the same channel that locked this 2045 * range, and the caller_ctx is the same as the locked_ctx. This means 2046 * that this I/O is associated with the lock, and is allowed to execute. 
2047 */ 2048 return false; 2049 } else { 2050 return true; 2051 } 2052 default: 2053 return false; 2054 } 2055 } 2056 2057 void 2058 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2059 { 2060 struct spdk_bdev *bdev = bdev_io->bdev; 2061 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2062 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2063 2064 assert(thread != NULL); 2065 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2066 2067 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2068 struct lba_range *range; 2069 2070 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2071 if (bdev_io_range_is_locked(bdev_io, range)) { 2072 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2073 return; 2074 } 2075 } 2076 } 2077 2078 /* Add the bdev_io to io_submitted only if it is the original 2079 * submission from the bdev user. When a bdev_io is split, 2080 * it comes back through this code path, so we need to make sure 2081 * we don't try to add it a second time. 2082 */ 2083 if (bdev_io->internal.cb != bdev_io_split_done) { 2084 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2085 } 2086 2087 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bdev_io)) { 2088 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2089 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2090 (uintptr_t)bdev_io, bdev_io->type); 2091 bdev_io_split(NULL, bdev_io); 2092 return; 2093 } 2094 2095 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2096 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2097 _bdev_io_submit(bdev_io); 2098 } else { 2099 bdev_io->internal.io_submit_ch = ch; 2100 bdev_io->internal.ch = bdev->internal.qos->ch; 2101 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2102 } 2103 } else { 2104 _bdev_io_submit(bdev_io); 2105 } 2106 } 2107 2108 static void 2109 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2110 { 2111 struct spdk_bdev *bdev = bdev_io->bdev; 2112 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2113 struct spdk_io_channel *ch = bdev_ch->channel; 2114 2115 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2116 2117 bdev_io->internal.in_submit_request = true; 2118 bdev->fn_table->submit_request(ch, bdev_io); 2119 bdev_io->internal.in_submit_request = false; 2120 } 2121 2122 void 2123 bdev_io_init(struct spdk_bdev_io *bdev_io, 2124 struct spdk_bdev *bdev, void *cb_arg, 2125 spdk_bdev_io_completion_cb cb) 2126 { 2127 bdev_io->bdev = bdev; 2128 bdev_io->internal.caller_ctx = cb_arg; 2129 bdev_io->internal.cb = cb; 2130 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2131 bdev_io->internal.in_submit_request = false; 2132 bdev_io->internal.buf = NULL; 2133 bdev_io->internal.io_submit_ch = NULL; 2134 bdev_io->internal.orig_iovs = NULL; 2135 bdev_io->internal.orig_iovcnt = 0; 2136 bdev_io->internal.orig_md_buf = NULL; 2137 bdev_io->internal.error.nvme.cdw0 = 0; 2138 bdev_io->num_retries = 0; 2139 bdev_io->internal.get_buf_cb = NULL; 2140 bdev_io->internal.get_aux_buf_cb = NULL; 2141 } 2142 2143 static bool 2144 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2145 { 2146 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2147 } 2148 2149 bool 2150 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2151 { 2152 bool supported; 2153 2154 supported = bdev_io_type_supported(bdev, io_type); 2155 2156 if (!supported) { 2157 switch (io_type) { 2158 case 
SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2159 /* The bdev layer will emulate write zeroes as long as write is supported. */ 2160 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2161 break; 2162 case SPDK_BDEV_IO_TYPE_ZCOPY: 2163 /* Zero copy can be emulated with regular read and write */ 2164 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ) && 2165 bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2166 break; 2167 default: 2168 break; 2169 } 2170 } 2171 2172 return supported; 2173 } 2174 2175 int 2176 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2177 { 2178 if (bdev->fn_table->dump_info_json) { 2179 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 2180 } 2181 2182 return 0; 2183 } 2184 2185 static void 2186 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 2187 { 2188 uint32_t max_per_timeslice = 0; 2189 int i; 2190 2191 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2192 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2193 qos->rate_limits[i].max_per_timeslice = 0; 2194 continue; 2195 } 2196 2197 max_per_timeslice = qos->rate_limits[i].limit * 2198 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 2199 2200 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 2201 qos->rate_limits[i].min_per_timeslice); 2202 2203 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 2204 } 2205 2206 bdev_qos_set_ops(qos); 2207 } 2208 2209 static int 2210 bdev_channel_poll_qos(void *arg) 2211 { 2212 struct spdk_bdev_qos *qos = arg; 2213 uint64_t now = spdk_get_ticks(); 2214 int i; 2215 2216 if (now < (qos->last_timeslice + qos->timeslice_size)) { 2217 /* We received our callback earlier than expected - return 2218 * immediately and wait to do accounting until at least one 2219 * timeslice has actually expired. This should never happen 2220 * with a well-behaved timer implementation. 2221 */ 2222 return 0; 2223 } 2224 2225 /* Reset for next round of rate limiting */ 2226 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2227 /* We may have allowed the IOs or bytes to slightly overrun in the last 2228 * timeslice. remaining_this_timeslice is signed, so if it's negative 2229 * here, we'll account for the overrun so that the next timeslice will 2230 * be appropriately reduced. 
2231 */ 2232 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 2233 qos->rate_limits[i].remaining_this_timeslice = 0; 2234 } 2235 } 2236 2237 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 2238 qos->last_timeslice += qos->timeslice_size; 2239 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2240 qos->rate_limits[i].remaining_this_timeslice += 2241 qos->rate_limits[i].max_per_timeslice; 2242 } 2243 } 2244 2245 return bdev_qos_io_submit(qos->ch, qos); 2246 } 2247 2248 static void 2249 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 2250 { 2251 struct spdk_bdev_shared_resource *shared_resource; 2252 struct lba_range *range; 2253 2254 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 2255 range = TAILQ_FIRST(&ch->locked_ranges); 2256 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 2257 free(range); 2258 } 2259 2260 spdk_put_io_channel(ch->channel); 2261 2262 shared_resource = ch->shared_resource; 2263 2264 assert(TAILQ_EMPTY(&ch->io_locked)); 2265 assert(TAILQ_EMPTY(&ch->io_submitted)); 2266 assert(ch->io_outstanding == 0); 2267 assert(shared_resource->ref > 0); 2268 shared_resource->ref--; 2269 if (shared_resource->ref == 0) { 2270 assert(shared_resource->io_outstanding == 0); 2271 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 2272 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 2273 free(shared_resource); 2274 } 2275 } 2276 2277 /* Caller must hold bdev->internal.mutex. */ 2278 static void 2279 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 2280 { 2281 struct spdk_bdev_qos *qos = bdev->internal.qos; 2282 int i; 2283 2284 /* Rate limiting on this bdev enabled */ 2285 if (qos) { 2286 if (qos->ch == NULL) { 2287 struct spdk_io_channel *io_ch; 2288 2289 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 2290 bdev->name, spdk_get_thread()); 2291 2292 /* No qos channel has been selected, so set one up */ 2293 2294 /* Take another reference to ch */ 2295 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 2296 assert(io_ch != NULL); 2297 qos->ch = ch; 2298 2299 qos->thread = spdk_io_channel_get_thread(io_ch); 2300 2301 TAILQ_INIT(&qos->queued); 2302 2303 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2304 if (bdev_qos_is_iops_rate_limit(i) == true) { 2305 qos->rate_limits[i].min_per_timeslice = 2306 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 2307 } else { 2308 qos->rate_limits[i].min_per_timeslice = 2309 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 2310 } 2311 2312 if (qos->rate_limits[i].limit == 0) { 2313 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 2314 } 2315 } 2316 bdev_qos_update_max_quota_per_timeslice(qos); 2317 qos->timeslice_size = 2318 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 2319 qos->last_timeslice = spdk_get_ticks(); 2320 qos->poller = spdk_poller_register(bdev_channel_poll_qos, 2321 qos, 2322 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 2323 } 2324 2325 ch->flags |= BDEV_CH_QOS_ENABLED; 2326 } 2327 } 2328 2329 struct poll_timeout_ctx { 2330 struct spdk_bdev_desc *desc; 2331 uint64_t timeout_in_sec; 2332 spdk_bdev_io_timeout_cb cb_fn; 2333 void *cb_arg; 2334 }; 2335 2336 static void 2337 bdev_desc_free(struct spdk_bdev_desc *desc) 2338 { 2339 pthread_mutex_destroy(&desc->mutex); 2340 free(desc->media_events_buffer); 2341 free(desc); 2342 } 2343 2344 static void 2345 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 2346 { 2347 struct poll_timeout_ctx *ctx = 
spdk_io_channel_iter_get_ctx(i); 2348 struct spdk_bdev_desc *desc = ctx->desc; 2349 2350 free(ctx); 2351 2352 pthread_mutex_lock(&desc->mutex); 2353 desc->refs--; 2354 if (desc->closed == true && desc->refs == 0) { 2355 pthread_mutex_unlock(&desc->mutex); 2356 bdev_desc_free(desc); 2357 return; 2358 } 2359 pthread_mutex_unlock(&desc->mutex); 2360 } 2361 2362 static void 2363 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 2364 { 2365 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 2366 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 2367 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 2368 struct spdk_bdev_desc *desc = ctx->desc; 2369 struct spdk_bdev_io *bdev_io; 2370 uint64_t now; 2371 2372 pthread_mutex_lock(&desc->mutex); 2373 if (desc->closed == true) { 2374 pthread_mutex_unlock(&desc->mutex); 2375 spdk_for_each_channel_continue(i, -1); 2376 return; 2377 } 2378 pthread_mutex_unlock(&desc->mutex); 2379 2380 now = spdk_get_ticks(); 2381 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 2382 /* I/O are added to this TAILQ as they are submitted. 2383 * So once we find an I/O that has not timed out, we can immediately exit the loop. */ 2384 if (now < (bdev_io->internal.submit_tsc + 2385 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 2386 goto end; 2387 } 2388 2389 if (bdev_io->internal.desc == desc) { 2390 ctx->cb_fn(ctx->cb_arg, bdev_io); 2391 } 2392 } 2393 2394 end: 2395 spdk_for_each_channel_continue(i, 0); 2396 } 2397 2398 static int 2399 bdev_poll_timeout_io(void *arg) 2400 { 2401 struct spdk_bdev_desc *desc = arg; 2402 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 2403 struct poll_timeout_ctx *ctx; 2404 2405 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 2406 if (!ctx) { 2407 SPDK_ERRLOG("failed to allocate memory\n"); 2408 return 1; 2409 } 2410 ctx->desc = desc; 2411 ctx->cb_arg = desc->cb_arg; 2412 ctx->cb_fn = desc->cb_fn; 2413 ctx->timeout_in_sec = desc->timeout_in_sec; 2414 2415 /* Take a ref on the descriptor in case it gets closed while we are checking 2416 * all of the channels. 
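* The reference is dropped in bdev_channel_poll_timeout_io_done(); if the descriptor was closed while the iteration was in flight, that completion callback also frees it.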
2417 */ 2418 pthread_mutex_lock(&desc->mutex); 2419 desc->refs++; 2420 pthread_mutex_unlock(&desc->mutex); 2421 2422 spdk_for_each_channel(__bdev_to_io_dev(bdev), 2423 bdev_channel_poll_timeout_io, 2424 ctx, 2425 bdev_channel_poll_timeout_io_done); 2426 2427 return 1; 2428 } 2429 2430 int 2431 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 2432 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 2433 { 2434 assert(desc->thread == spdk_get_thread()); 2435 2436 spdk_poller_unregister(&desc->io_timeout_poller); 2437 2438 if (timeout_in_sec) { 2439 assert(cb_fn != NULL); 2440 desc->io_timeout_poller = spdk_poller_register(bdev_poll_timeout_io, 2441 desc, 2442 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 2443 1000); 2444 if (desc->io_timeout_poller == NULL) { 2445 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 2446 return -1; 2447 } 2448 } 2449 2450 desc->cb_fn = cb_fn; 2451 desc->cb_arg = cb_arg; 2452 desc->timeout_in_sec = timeout_in_sec; 2453 2454 return 0; 2455 } 2456 2457 static int 2458 bdev_channel_create(void *io_device, void *ctx_buf) 2459 { 2460 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 2461 struct spdk_bdev_channel *ch = ctx_buf; 2462 struct spdk_io_channel *mgmt_io_ch; 2463 struct spdk_bdev_mgmt_channel *mgmt_ch; 2464 struct spdk_bdev_shared_resource *shared_resource; 2465 struct lba_range *range; 2466 2467 ch->bdev = bdev; 2468 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 2469 if (!ch->channel) { 2470 return -1; 2471 } 2472 2473 assert(ch->histogram == NULL); 2474 if (bdev->internal.histogram_enabled) { 2475 ch->histogram = spdk_histogram_data_alloc(); 2476 if (ch->histogram == NULL) { 2477 SPDK_ERRLOG("Could not allocate histogram\n"); 2478 } 2479 } 2480 2481 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 2482 if (!mgmt_io_ch) { 2483 spdk_put_io_channel(ch->channel); 2484 return -1; 2485 } 2486 2487 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 2488 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 2489 if (shared_resource->shared_ch == ch->channel) { 2490 spdk_put_io_channel(mgmt_io_ch); 2491 shared_resource->ref++; 2492 break; 2493 } 2494 } 2495 2496 if (shared_resource == NULL) { 2497 shared_resource = calloc(1, sizeof(*shared_resource)); 2498 if (shared_resource == NULL) { 2499 spdk_put_io_channel(ch->channel); 2500 spdk_put_io_channel(mgmt_io_ch); 2501 return -1; 2502 } 2503 2504 shared_resource->mgmt_ch = mgmt_ch; 2505 shared_resource->io_outstanding = 0; 2506 TAILQ_INIT(&shared_resource->nomem_io); 2507 shared_resource->nomem_threshold = 0; 2508 shared_resource->shared_ch = ch->channel; 2509 shared_resource->ref = 1; 2510 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 2511 } 2512 2513 memset(&ch->stat, 0, sizeof(ch->stat)); 2514 ch->stat.ticks_rate = spdk_get_ticks_hz(); 2515 ch->io_outstanding = 0; 2516 TAILQ_INIT(&ch->queued_resets); 2517 TAILQ_INIT(&ch->locked_ranges); 2518 ch->flags = 0; 2519 ch->shared_resource = shared_resource; 2520 2521 TAILQ_INIT(&ch->io_submitted); 2522 TAILQ_INIT(&ch->io_locked); 2523 2524 #ifdef SPDK_CONFIG_VTUNE 2525 { 2526 char *name; 2527 __itt_init_ittlib(NULL, 0); 2528 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 2529 if (!name) { 2530 bdev_channel_destroy_resource(ch); 2531 return -1; 2532 } 2533 ch->handle = __itt_string_handle_create(name); 2534 free(name); 2535 ch->start_tsc = spdk_get_ticks(); 2536 ch->interval_tsc = spdk_get_ticks_hz() / 100; 2537 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 
2538 } 2539 #endif 2540 2541 pthread_mutex_lock(&bdev->internal.mutex); 2542 bdev_enable_qos(bdev, ch); 2543 2544 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 2545 struct lba_range *new_range; 2546 2547 new_range = calloc(1, sizeof(*new_range)); 2548 if (new_range == NULL) { 2549 pthread_mutex_unlock(&bdev->internal.mutex); 2550 bdev_channel_destroy_resource(ch); 2551 return -1; 2552 } 2553 new_range->length = range->length; 2554 new_range->offset = range->offset; 2555 new_range->locked_ctx = range->locked_ctx; 2556 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 2557 } 2558 2559 pthread_mutex_unlock(&bdev->internal.mutex); 2560 2561 return 0; 2562 } 2563 2564 /* 2565 * Abort I/O that are waiting on a data buffer. These types of I/O are 2566 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 2567 */ 2568 static void 2569 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 2570 { 2571 bdev_io_stailq_t tmp; 2572 struct spdk_bdev_io *bdev_io; 2573 2574 STAILQ_INIT(&tmp); 2575 2576 while (!STAILQ_EMPTY(queue)) { 2577 bdev_io = STAILQ_FIRST(queue); 2578 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 2579 if (bdev_io->internal.ch == ch) { 2580 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2581 } else { 2582 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 2583 } 2584 } 2585 2586 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 2587 } 2588 2589 /* 2590 * Abort I/O that are queued waiting for submission. These types of I/O are 2591 * linked using the spdk_bdev_io link TAILQ_ENTRY. 2592 */ 2593 static void 2594 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 2595 { 2596 struct spdk_bdev_io *bdev_io, *tmp; 2597 2598 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 2599 if (bdev_io->internal.ch == ch) { 2600 TAILQ_REMOVE(queue, bdev_io, internal.link); 2601 /* 2602 * spdk_bdev_io_complete() assumes that the completed I/O had 2603 * been submitted to the bdev module. Since in this case it 2604 * hadn't, bump io_outstanding to account for the decrement 2605 * that spdk_bdev_io_complete() will do. 2606 */ 2607 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 2608 ch->io_outstanding++; 2609 ch->shared_resource->io_outstanding++; 2610 } 2611 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2612 } 2613 } 2614 } 2615 2616 static void 2617 bdev_qos_channel_destroy(void *cb_arg) 2618 { 2619 struct spdk_bdev_qos *qos = cb_arg; 2620 2621 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 2622 spdk_poller_unregister(&qos->poller); 2623 2624 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); 2625 2626 free(qos); 2627 } 2628 2629 static int 2630 bdev_qos_destroy(struct spdk_bdev *bdev) 2631 { 2632 int i; 2633 2634 /* 2635 * Cleanly shutting down the QoS poller is tricky, because 2636 * during the asynchronous operation the user could open 2637 * a new descriptor and create a new channel, spawning 2638 * a new QoS poller. 2639 * 2640 * The strategy is to create a new QoS structure here and swap it 2641 * in. The shutdown path then continues to refer to the old one 2642 * until it completes and then releases it. 
2643 */ 2644 struct spdk_bdev_qos *new_qos, *old_qos; 2645 2646 old_qos = bdev->internal.qos; 2647 2648 new_qos = calloc(1, sizeof(*new_qos)); 2649 if (!new_qos) { 2650 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 2651 return -ENOMEM; 2652 } 2653 2654 /* Copy the old QoS data into the newly allocated structure */ 2655 memcpy(new_qos, old_qos, sizeof(*new_qos)); 2656 2657 /* Zero out the key parts of the QoS structure */ 2658 new_qos->ch = NULL; 2659 new_qos->thread = NULL; 2660 new_qos->poller = NULL; 2661 TAILQ_INIT(&new_qos->queued); 2662 /* 2663 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 2664 * It will be used later for the new QoS structure. 2665 */ 2666 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2667 new_qos->rate_limits[i].remaining_this_timeslice = 0; 2668 new_qos->rate_limits[i].min_per_timeslice = 0; 2669 new_qos->rate_limits[i].max_per_timeslice = 0; 2670 } 2671 2672 bdev->internal.qos = new_qos; 2673 2674 if (old_qos->thread == NULL) { 2675 free(old_qos); 2676 } else { 2677 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 2678 } 2679 2680 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 2681 * been destroyed yet. The destruction path will end up waiting for the final 2682 * channel to be put before it releases resources. */ 2683 2684 return 0; 2685 } 2686 2687 static void 2688 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 2689 { 2690 total->bytes_read += add->bytes_read; 2691 total->num_read_ops += add->num_read_ops; 2692 total->bytes_written += add->bytes_written; 2693 total->num_write_ops += add->num_write_ops; 2694 total->bytes_unmapped += add->bytes_unmapped; 2695 total->num_unmap_ops += add->num_unmap_ops; 2696 total->read_latency_ticks += add->read_latency_ticks; 2697 total->write_latency_ticks += add->write_latency_ticks; 2698 total->unmap_latency_ticks += add->unmap_latency_ticks; 2699 } 2700 2701 static void 2702 bdev_channel_destroy(void *io_device, void *ctx_buf) 2703 { 2704 struct spdk_bdev_channel *ch = ctx_buf; 2705 struct spdk_bdev_mgmt_channel *mgmt_ch; 2706 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 2707 2708 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 2709 spdk_get_thread()); 2710 2711 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. 
*/ 2712 pthread_mutex_lock(&ch->bdev->internal.mutex); 2713 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 2714 pthread_mutex_unlock(&ch->bdev->internal.mutex); 2715 2716 mgmt_ch = shared_resource->mgmt_ch; 2717 2718 bdev_abort_queued_io(&ch->queued_resets, ch); 2719 bdev_abort_queued_io(&shared_resource->nomem_io, ch); 2720 bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch); 2721 bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch); 2722 2723 if (ch->histogram) { 2724 spdk_histogram_data_free(ch->histogram); 2725 } 2726 2727 bdev_channel_destroy_resource(ch); 2728 } 2729 2730 int 2731 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 2732 { 2733 struct spdk_bdev_alias *tmp; 2734 2735 if (alias == NULL) { 2736 SPDK_ERRLOG("Empty alias passed\n"); 2737 return -EINVAL; 2738 } 2739 2740 if (spdk_bdev_get_by_name(alias)) { 2741 SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); 2742 return -EEXIST; 2743 } 2744 2745 tmp = calloc(1, sizeof(*tmp)); 2746 if (tmp == NULL) { 2747 SPDK_ERRLOG("Unable to allocate alias\n"); 2748 return -ENOMEM; 2749 } 2750 2751 tmp->alias = strdup(alias); 2752 if (tmp->alias == NULL) { 2753 free(tmp); 2754 SPDK_ERRLOG("Unable to allocate alias\n"); 2755 return -ENOMEM; 2756 } 2757 2758 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 2759 2760 return 0; 2761 } 2762 2763 int 2764 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 2765 { 2766 struct spdk_bdev_alias *tmp; 2767 2768 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 2769 if (strcmp(alias, tmp->alias) == 0) { 2770 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 2771 free(tmp->alias); 2772 free(tmp); 2773 return 0; 2774 } 2775 } 2776 2777 SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias); 2778 2779 return -ENOENT; 2780 } 2781 2782 void 2783 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 2784 { 2785 struct spdk_bdev_alias *p, *tmp; 2786 2787 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 2788 TAILQ_REMOVE(&bdev->aliases, p, tailq); 2789 free(p->alias); 2790 free(p); 2791 } 2792 } 2793 2794 struct spdk_io_channel * 2795 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 2796 { 2797 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 2798 } 2799 2800 const char * 2801 spdk_bdev_get_name(const struct spdk_bdev *bdev) 2802 { 2803 return bdev->name; 2804 } 2805 2806 const char * 2807 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 2808 { 2809 return bdev->product_name; 2810 } 2811 2812 const struct spdk_bdev_aliases_list * 2813 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 2814 { 2815 return &bdev->aliases; 2816 } 2817 2818 uint32_t 2819 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 2820 { 2821 return bdev->blocklen; 2822 } 2823 2824 uint32_t 2825 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 2826 { 2827 return bdev->write_unit_size; 2828 } 2829 2830 uint64_t 2831 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 2832 { 2833 return bdev->blockcnt; 2834 } 2835 2836 const char * 2837 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 2838 { 2839 return qos_rpc_type[type]; 2840 } 2841 2842 void 2843 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 2844 { 2845 int i; 2846 2847 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2848 2849 pthread_mutex_lock(&bdev->internal.mutex); 2850 if (bdev->internal.qos) { 2851 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2852 if (bdev->internal.qos->rate_limits[i].limit != 2853
SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2854 limits[i] = bdev->internal.qos->rate_limits[i].limit; 2855 if (bdev_qos_is_iops_rate_limit(i) == false) { 2856 /* Change from Byte to Megabyte which is user visible. */ 2857 limits[i] = limits[i] / 1024 / 1024; 2858 } 2859 } 2860 } 2861 } 2862 pthread_mutex_unlock(&bdev->internal.mutex); 2863 } 2864 2865 size_t 2866 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 2867 { 2868 return 1 << bdev->required_alignment; 2869 } 2870 2871 uint32_t 2872 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 2873 { 2874 return bdev->optimal_io_boundary; 2875 } 2876 2877 bool 2878 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 2879 { 2880 return bdev->write_cache; 2881 } 2882 2883 const struct spdk_uuid * 2884 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 2885 { 2886 return &bdev->uuid; 2887 } 2888 2889 uint16_t 2890 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 2891 { 2892 return bdev->acwu; 2893 } 2894 2895 uint32_t 2896 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 2897 { 2898 return bdev->md_len; 2899 } 2900 2901 bool 2902 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 2903 { 2904 return (bdev->md_len != 0) && bdev->md_interleave; 2905 } 2906 2907 bool 2908 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 2909 { 2910 return (bdev->md_len != 0) && !bdev->md_interleave; 2911 } 2912 2913 bool 2914 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 2915 { 2916 return bdev->zoned; 2917 } 2918 2919 uint32_t 2920 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 2921 { 2922 if (spdk_bdev_is_md_interleaved(bdev)) { 2923 return bdev->blocklen - bdev->md_len; 2924 } else { 2925 return bdev->blocklen; 2926 } 2927 } 2928 2929 static uint32_t 2930 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 2931 { 2932 if (!spdk_bdev_is_md_interleaved(bdev)) { 2933 return bdev->blocklen + bdev->md_len; 2934 } else { 2935 return bdev->blocklen; 2936 } 2937 } 2938 2939 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 2940 { 2941 if (bdev->md_len != 0) { 2942 return bdev->dif_type; 2943 } else { 2944 return SPDK_DIF_DISABLE; 2945 } 2946 } 2947 2948 bool 2949 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 2950 { 2951 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 2952 return bdev->dif_is_head_of_md; 2953 } else { 2954 return false; 2955 } 2956 } 2957 2958 bool 2959 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 2960 enum spdk_dif_check_type check_type) 2961 { 2962 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 2963 return false; 2964 } 2965 2966 switch (check_type) { 2967 case SPDK_DIF_CHECK_TYPE_REFTAG: 2968 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 2969 case SPDK_DIF_CHECK_TYPE_APPTAG: 2970 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 2971 case SPDK_DIF_CHECK_TYPE_GUARD: 2972 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 2973 default: 2974 return false; 2975 } 2976 } 2977 2978 uint64_t 2979 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 2980 { 2981 return bdev->internal.measured_queue_depth; 2982 } 2983 2984 uint64_t 2985 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 2986 { 2987 return bdev->internal.period; 2988 } 2989 2990 uint64_t 2991 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 2992 { 2993 return bdev->internal.weighted_io_time; 2994 } 2995 2996 uint64_t 2997 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 2998 { 2999 return 
bdev->internal.io_time; 3000 } 3001 3002 static void 3003 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3004 { 3005 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3006 3007 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3008 3009 if (bdev->internal.measured_queue_depth) { 3010 bdev->internal.io_time += bdev->internal.period; 3011 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3012 } 3013 } 3014 3015 static void 3016 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3017 { 3018 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3019 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3020 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3021 3022 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3023 spdk_for_each_channel_continue(i, 0); 3024 } 3025 3026 static int 3027 bdev_calculate_measured_queue_depth(void *ctx) 3028 { 3029 struct spdk_bdev *bdev = ctx; 3030 bdev->internal.temporary_queue_depth = 0; 3031 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3032 _calculate_measured_qd_cpl); 3033 return 0; 3034 } 3035 3036 void 3037 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3038 { 3039 bdev->internal.period = period; 3040 3041 if (bdev->internal.qd_poller != NULL) { 3042 spdk_poller_unregister(&bdev->internal.qd_poller); 3043 bdev->internal.measured_queue_depth = UINT64_MAX; 3044 } 3045 3046 if (period != 0) { 3047 bdev->internal.qd_poller = spdk_poller_register(bdev_calculate_measured_queue_depth, bdev, 3048 period); 3049 } 3050 } 3051 3052 static void 3053 _resize_notify(void *arg) 3054 { 3055 struct spdk_bdev_desc *desc = arg; 3056 3057 pthread_mutex_lock(&desc->mutex); 3058 desc->refs--; 3059 if (!desc->closed) { 3060 pthread_mutex_unlock(&desc->mutex); 3061 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 3062 desc->bdev, 3063 desc->callback.ctx); 3064 return; 3065 } else if (0 == desc->refs) { 3066 /* This descriptor was closed after this resize_notify message was sent. 3067 * spdk_bdev_close() could not free the descriptor since this message was 3068 * in flight, so we free it now using bdev_desc_free(). 3069 */ 3070 pthread_mutex_unlock(&desc->mutex); 3071 bdev_desc_free(desc); 3072 return; 3073 } 3074 pthread_mutex_unlock(&desc->mutex); 3075 } 3076 3077 int 3078 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 3079 { 3080 struct spdk_bdev_desc *desc; 3081 int ret; 3082 3083 pthread_mutex_lock(&bdev->internal.mutex); 3084 3085 /* bdev has open descriptors */ 3086 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 3087 bdev->blockcnt > size) { 3088 ret = -EBUSY; 3089 } else { 3090 bdev->blockcnt = size; 3091 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 3092 pthread_mutex_lock(&desc->mutex); 3093 if (desc->callback.open_with_ext && !desc->closed) { 3094 desc->refs++; 3095 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 3096 } 3097 pthread_mutex_unlock(&desc->mutex); 3098 } 3099 ret = 0; 3100 } 3101 3102 pthread_mutex_unlock(&bdev->internal.mutex); 3103 3104 return ret; 3105 } 3106 3107 /* 3108 * Convert I/O offset and length from bytes to blocks. 3109 * 3110 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 
3111 */ 3112 static uint64_t 3113 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 3114 uint64_t num_bytes, uint64_t *num_blocks) 3115 { 3116 uint32_t block_size = bdev->blocklen; 3117 uint8_t shift_cnt; 3118 3119 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3120 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 3121 shift_cnt = spdk_u32log2(block_size); 3122 *offset_blocks = offset_bytes >> shift_cnt; 3123 *num_blocks = num_bytes >> shift_cnt; 3124 return (offset_bytes - (*offset_blocks << shift_cnt)) | 3125 (num_bytes - (*num_blocks << shift_cnt)); 3126 } else { 3127 *offset_blocks = offset_bytes / block_size; 3128 *num_blocks = num_bytes / block_size; 3129 return (offset_bytes % block_size) | (num_bytes % block_size); 3130 } 3131 } 3132 3133 static bool 3134 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 3135 { 3136 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 3137 * has been an overflow and hence the offset has been wrapped around */ 3138 if (offset_blocks + num_blocks < offset_blocks) { 3139 return false; 3140 } 3141 3142 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 3143 if (offset_blocks + num_blocks > bdev->blockcnt) { 3144 return false; 3145 } 3146 3147 return true; 3148 } 3149 3150 static bool 3151 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 3152 { 3153 return _is_buf_allocated(iovs) == (md_buf != NULL); 3154 } 3155 3156 static int 3157 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 3158 void *md_buf, int64_t offset_blocks, uint64_t num_blocks, 3159 spdk_bdev_io_completion_cb cb, void *cb_arg) 3160 { 3161 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3162 struct spdk_bdev_io *bdev_io; 3163 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3164 3165 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3166 return -EINVAL; 3167 } 3168 3169 bdev_io = bdev_channel_get_io(channel); 3170 if (!bdev_io) { 3171 return -ENOMEM; 3172 } 3173 3174 bdev_io->internal.ch = channel; 3175 bdev_io->internal.desc = desc; 3176 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3177 bdev_io->u.bdev.iovs = &bdev_io->iov; 3178 bdev_io->u.bdev.iovs[0].iov_base = buf; 3179 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3180 bdev_io->u.bdev.iovcnt = 1; 3181 bdev_io->u.bdev.md_buf = md_buf; 3182 bdev_io->u.bdev.num_blocks = num_blocks; 3183 bdev_io->u.bdev.offset_blocks = offset_blocks; 3184 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3185 3186 bdev_io_submit(bdev_io); 3187 return 0; 3188 } 3189 3190 int 3191 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3192 void *buf, uint64_t offset, uint64_t nbytes, 3193 spdk_bdev_io_completion_cb cb, void *cb_arg) 3194 { 3195 uint64_t offset_blocks, num_blocks; 3196 3197 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3198 nbytes, &num_blocks) != 0) { 3199 return -EINVAL; 3200 } 3201 3202 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3203 } 3204 3205 int 3206 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3207 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3208 spdk_bdev_io_completion_cb cb, void *cb_arg) 3209 { 3210 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 3211 } 3212 
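/*
 * Illustrative sketch only (not part of the bdev layer): one way a caller might
 * drive spdk_bdev_read_blocks() and retry on -ENOMEM with
 * spdk_bdev_queue_io_wait(). The example_* names and the fixed offset/length
 * are hypothetical placeholders; setup of the descriptor, channel and buffer is
 * omitted.
 */
#if 0
struct example_read_ctx {
	struct spdk_bdev_desc		*desc;
	struct spdk_io_channel		*ch;
	void				*buf;	/* at least 8 blocks, suitably aligned */
	struct spdk_bdev_io_wait_entry	wait_entry;
};

static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* The completion callback owns the bdev_io and must always free it. */
	spdk_bdev_free_io(bdev_io);
}

static void
example_submit_read(void *arg)
{
	struct example_read_ctx *ctx = arg;
	int rc;

	/* Read 8 blocks starting at block 0. */
	rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, 0, 8,
				   example_read_done, ctx);
	if (rc == -ENOMEM) {
		/* No spdk_bdev_io was available; ask to be called back when one is freed. */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_submit_read;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
	} else if (rc != 0) {
		SPDK_ERRLOG("read submission failed: %d\n", rc);
	}
}
#endif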
3213 int 3214 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3215 void *buf, void *md_buf, int64_t offset_blocks, uint64_t num_blocks, 3216 spdk_bdev_io_completion_cb cb, void *cb_arg) 3217 { 3218 struct iovec iov = { 3219 .iov_base = buf, 3220 }; 3221 3222 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3223 return -EINVAL; 3224 } 3225 3226 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3227 return -EINVAL; 3228 } 3229 3230 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3231 cb, cb_arg); 3232 } 3233 3234 int 3235 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3236 struct iovec *iov, int iovcnt, 3237 uint64_t offset, uint64_t nbytes, 3238 spdk_bdev_io_completion_cb cb, void *cb_arg) 3239 { 3240 uint64_t offset_blocks, num_blocks; 3241 3242 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3243 nbytes, &num_blocks) != 0) { 3244 return -EINVAL; 3245 } 3246 3247 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3248 } 3249 3250 static int 3251 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3252 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 3253 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg) 3254 { 3255 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3256 struct spdk_bdev_io *bdev_io; 3257 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3258 3259 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3260 return -EINVAL; 3261 } 3262 3263 bdev_io = bdev_channel_get_io(channel); 3264 if (!bdev_io) { 3265 return -ENOMEM; 3266 } 3267 3268 bdev_io->internal.ch = channel; 3269 bdev_io->internal.desc = desc; 3270 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3271 bdev_io->u.bdev.iovs = iov; 3272 bdev_io->u.bdev.iovcnt = iovcnt; 3273 bdev_io->u.bdev.md_buf = md_buf; 3274 bdev_io->u.bdev.num_blocks = num_blocks; 3275 bdev_io->u.bdev.offset_blocks = offset_blocks; 3276 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3277 3278 bdev_io_submit(bdev_io); 3279 return 0; 3280 } 3281 3282 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3283 struct iovec *iov, int iovcnt, 3284 uint64_t offset_blocks, uint64_t num_blocks, 3285 spdk_bdev_io_completion_cb cb, void *cb_arg) 3286 { 3287 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3288 num_blocks, cb, cb_arg); 3289 } 3290 3291 int 3292 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3293 struct iovec *iov, int iovcnt, void *md_buf, 3294 uint64_t offset_blocks, uint64_t num_blocks, 3295 spdk_bdev_io_completion_cb cb, void *cb_arg) 3296 { 3297 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3298 return -EINVAL; 3299 } 3300 3301 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3302 return -EINVAL; 3303 } 3304 3305 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3306 num_blocks, cb, cb_arg); 3307 } 3308 3309 static int 3310 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3311 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3312 spdk_bdev_io_completion_cb cb, void *cb_arg) 3313 { 3314 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3315 struct spdk_bdev_io *bdev_io; 3316 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3317 3318 if (!desc->write) { 3319 return 
-EBADF; 3320 } 3321 3322 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3323 return -EINVAL; 3324 } 3325 3326 bdev_io = bdev_channel_get_io(channel); 3327 if (!bdev_io) { 3328 return -ENOMEM; 3329 } 3330 3331 bdev_io->internal.ch = channel; 3332 bdev_io->internal.desc = desc; 3333 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3334 bdev_io->u.bdev.iovs = &bdev_io->iov; 3335 bdev_io->u.bdev.iovs[0].iov_base = buf; 3336 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3337 bdev_io->u.bdev.iovcnt = 1; 3338 bdev_io->u.bdev.md_buf = md_buf; 3339 bdev_io->u.bdev.num_blocks = num_blocks; 3340 bdev_io->u.bdev.offset_blocks = offset_blocks; 3341 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3342 3343 bdev_io_submit(bdev_io); 3344 return 0; 3345 } 3346 3347 int 3348 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3349 void *buf, uint64_t offset, uint64_t nbytes, 3350 spdk_bdev_io_completion_cb cb, void *cb_arg) 3351 { 3352 uint64_t offset_blocks, num_blocks; 3353 3354 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3355 nbytes, &num_blocks) != 0) { 3356 return -EINVAL; 3357 } 3358 3359 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 3360 } 3361 3362 int 3363 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3364 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3365 spdk_bdev_io_completion_cb cb, void *cb_arg) 3366 { 3367 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 3368 cb, cb_arg); 3369 } 3370 3371 int 3372 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3373 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3374 spdk_bdev_io_completion_cb cb, void *cb_arg) 3375 { 3376 struct iovec iov = { 3377 .iov_base = buf, 3378 }; 3379 3380 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3381 return -EINVAL; 3382 } 3383 3384 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3385 return -EINVAL; 3386 } 3387 3388 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3389 cb, cb_arg); 3390 } 3391 3392 static int 3393 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3394 struct iovec *iov, int iovcnt, void *md_buf, 3395 uint64_t offset_blocks, uint64_t num_blocks, 3396 spdk_bdev_io_completion_cb cb, void *cb_arg) 3397 { 3398 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3399 struct spdk_bdev_io *bdev_io; 3400 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3401 3402 if (!desc->write) { 3403 return -EBADF; 3404 } 3405 3406 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3407 return -EINVAL; 3408 } 3409 3410 bdev_io = bdev_channel_get_io(channel); 3411 if (!bdev_io) { 3412 return -ENOMEM; 3413 } 3414 3415 bdev_io->internal.ch = channel; 3416 bdev_io->internal.desc = desc; 3417 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3418 bdev_io->u.bdev.iovs = iov; 3419 bdev_io->u.bdev.iovcnt = iovcnt; 3420 bdev_io->u.bdev.md_buf = md_buf; 3421 bdev_io->u.bdev.num_blocks = num_blocks; 3422 bdev_io->u.bdev.offset_blocks = offset_blocks; 3423 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3424 3425 bdev_io_submit(bdev_io); 3426 return 0; 3427 } 3428 3429 int 3430 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3431 struct iovec *iov, int iovcnt, 3432 uint64_t offset, uint64_t len, 3433 spdk_bdev_io_completion_cb cb, void *cb_arg) 3434 { 3435 uint64_t 
offset_blocks, num_blocks; 3436 3437 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3438 len, &num_blocks) != 0) { 3439 return -EINVAL; 3440 } 3441 3442 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 3443 } 3444 3445 int 3446 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3447 struct iovec *iov, int iovcnt, 3448 uint64_t offset_blocks, uint64_t num_blocks, 3449 spdk_bdev_io_completion_cb cb, void *cb_arg) 3450 { 3451 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3452 num_blocks, cb, cb_arg); 3453 } 3454 3455 int 3456 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3457 struct iovec *iov, int iovcnt, void *md_buf, 3458 uint64_t offset_blocks, uint64_t num_blocks, 3459 spdk_bdev_io_completion_cb cb, void *cb_arg) 3460 { 3461 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3462 return -EINVAL; 3463 } 3464 3465 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3466 return -EINVAL; 3467 } 3468 3469 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3470 num_blocks, cb, cb_arg); 3471 } 3472 3473 static void 3474 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3475 { 3476 struct spdk_bdev_io *parent_io = cb_arg; 3477 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 3478 int i, rc = 0; 3479 3480 if (!success) { 3481 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3482 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3483 spdk_bdev_free_io(bdev_io); 3484 return; 3485 } 3486 3487 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 3488 rc = memcmp(read_buf, 3489 parent_io->u.bdev.iovs[i].iov_base, 3490 parent_io->u.bdev.iovs[i].iov_len); 3491 if (rc) { 3492 break; 3493 } 3494 read_buf += parent_io->u.bdev.iovs[i].iov_len; 3495 } 3496 3497 spdk_bdev_free_io(bdev_io); 3498 3499 if (rc == 0) { 3500 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3501 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 3502 } else { 3503 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 3504 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 3505 } 3506 } 3507 3508 static void 3509 bdev_compare_do_read(void *_bdev_io) 3510 { 3511 struct spdk_bdev_io *bdev_io = _bdev_io; 3512 int rc; 3513 3514 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 3515 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 3516 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3517 bdev_compare_do_read_done, bdev_io); 3518 3519 if (rc == -ENOMEM) { 3520 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 3521 } else if (rc != 0) { 3522 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3523 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3524 } 3525 } 3526 3527 static int 3528 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3529 struct iovec *iov, int iovcnt, void *md_buf, 3530 uint64_t offset_blocks, uint64_t num_blocks, 3531 spdk_bdev_io_completion_cb cb, void *cb_arg) 3532 { 3533 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3534 struct spdk_bdev_io *bdev_io; 3535 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3536 3537 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3538 return -EINVAL; 3539 } 3540 3541 bdev_io = bdev_channel_get_io(channel); 3542 if (!bdev_io) { 
3543 return -ENOMEM; 3544 } 3545 3546 bdev_io->internal.ch = channel; 3547 bdev_io->internal.desc = desc; 3548 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 3549 bdev_io->u.bdev.iovs = iov; 3550 bdev_io->u.bdev.iovcnt = iovcnt; 3551 bdev_io->u.bdev.md_buf = md_buf; 3552 bdev_io->u.bdev.num_blocks = num_blocks; 3553 bdev_io->u.bdev.offset_blocks = offset_blocks; 3554 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3555 3556 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 3557 bdev_io_submit(bdev_io); 3558 return 0; 3559 } 3560 3561 bdev_compare_do_read(bdev_io); 3562 3563 return 0; 3564 } 3565 3566 int 3567 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3568 struct iovec *iov, int iovcnt, 3569 uint64_t offset_blocks, uint64_t num_blocks, 3570 spdk_bdev_io_completion_cb cb, void *cb_arg) 3571 { 3572 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 3573 num_blocks, cb, cb_arg); 3574 } 3575 3576 int 3577 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3578 struct iovec *iov, int iovcnt, void *md_buf, 3579 uint64_t offset_blocks, uint64_t num_blocks, 3580 spdk_bdev_io_completion_cb cb, void *cb_arg) 3581 { 3582 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3583 return -EINVAL; 3584 } 3585 3586 if (!_bdev_io_check_md_buf(iov, md_buf)) { 3587 return -EINVAL; 3588 } 3589 3590 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 3591 num_blocks, cb, cb_arg); 3592 } 3593 3594 static int 3595 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3596 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3597 spdk_bdev_io_completion_cb cb, void *cb_arg) 3598 { 3599 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3600 struct spdk_bdev_io *bdev_io; 3601 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3602 3603 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3604 return -EINVAL; 3605 } 3606 3607 bdev_io = bdev_channel_get_io(channel); 3608 if (!bdev_io) { 3609 return -ENOMEM; 3610 } 3611 3612 bdev_io->internal.ch = channel; 3613 bdev_io->internal.desc = desc; 3614 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 3615 bdev_io->u.bdev.iovs = &bdev_io->iov; 3616 bdev_io->u.bdev.iovs[0].iov_base = buf; 3617 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 3618 bdev_io->u.bdev.iovcnt = 1; 3619 bdev_io->u.bdev.md_buf = md_buf; 3620 bdev_io->u.bdev.num_blocks = num_blocks; 3621 bdev_io->u.bdev.offset_blocks = offset_blocks; 3622 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3623 3624 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 3625 bdev_io_submit(bdev_io); 3626 return 0; 3627 } 3628 3629 bdev_compare_do_read(bdev_io); 3630 3631 return 0; 3632 } 3633 3634 int 3635 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3636 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 3637 spdk_bdev_io_completion_cb cb, void *cb_arg) 3638 { 3639 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 3640 cb, cb_arg); 3641 } 3642 3643 int 3644 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3645 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 3646 spdk_bdev_io_completion_cb cb, void *cb_arg) 3647 { 3648 struct iovec iov = { 3649 .iov_base = buf, 3650 }; 3651 3652 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 3653 return 
-EINVAL; 3654 } 3655 3656 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 3657 return -EINVAL; 3658 } 3659 3660 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 3661 cb, cb_arg); 3662 } 3663 3664 static void 3665 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 3666 { 3667 struct spdk_bdev_io *bdev_io = ctx; 3668 3669 if (unlock_status) { 3670 SPDK_ERRLOG("LBA range unlock failed\n"); 3671 } 3672 3673 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 3674 false, bdev_io->internal.caller_ctx); 3675 } 3676 3677 static void 3678 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 3679 { 3680 bdev_io->internal.status = status; 3681 3682 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 3683 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3684 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 3685 } 3686 3687 static void 3688 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3689 { 3690 struct spdk_bdev_io *parent_io = cb_arg; 3691 3692 if (!success) { 3693 SPDK_ERRLOG("Compare and write operation failed\n"); 3694 } 3695 3696 spdk_bdev_free_io(bdev_io); 3697 3698 bdev_comparev_and_writev_blocks_unlock(parent_io, 3699 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 3700 } 3701 3702 static void 3703 bdev_compare_and_write_do_write(void *_bdev_io) 3704 { 3705 struct spdk_bdev_io *bdev_io = _bdev_io; 3706 int rc; 3707 3708 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 3709 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3710 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 3711 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3712 bdev_compare_and_write_do_write_done, bdev_io); 3713 3714 3715 if (rc == -ENOMEM) { 3716 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 3717 } else if (rc != 0) { 3718 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3719 } 3720 } 3721 3722 static void 3723 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3724 { 3725 struct spdk_bdev_io *parent_io = cb_arg; 3726 3727 spdk_bdev_free_io(bdev_io); 3728 3729 if (!success) { 3730 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 3731 return; 3732 } 3733 3734 bdev_compare_and_write_do_write(parent_io); 3735 } 3736 3737 static void 3738 bdev_compare_and_write_do_compare(void *_bdev_io) 3739 { 3740 struct spdk_bdev_io *bdev_io = _bdev_io; 3741 int rc; 3742 3743 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 3744 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 3745 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3746 bdev_compare_and_write_do_compare_done, bdev_io); 3747 3748 if (rc == -ENOMEM) { 3749 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 3750 } else if (rc != 0) { 3751 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 3752 } 3753 } 3754 3755 static void 3756 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 3757 { 3758 struct spdk_bdev_io *bdev_io = ctx; 3759 3760 if (status) { 3761 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 3762 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3763 return; 3764 } 3765 bdev_compare_and_write_do_compare(bdev_io);
3766 } 3767 3768 int 3769 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3770 struct iovec *compare_iov, int compare_iovcnt, 3771 struct iovec *write_iov, int write_iovcnt, 3772 uint64_t offset_blocks, uint64_t num_blocks, 3773 spdk_bdev_io_completion_cb cb, void *cb_arg) 3774 { 3775 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3776 struct spdk_bdev_io *bdev_io; 3777 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3778 3779 if (!desc->write) { 3780 return -EBADF; 3781 } 3782 3783 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3784 return -EINVAL; 3785 } 3786 3787 if (num_blocks > bdev->acwu) { 3788 return -EINVAL; 3789 } 3790 3791 bdev_io = bdev_channel_get_io(channel); 3792 if (!bdev_io) { 3793 return -ENOMEM; 3794 } 3795 3796 bdev_io->internal.ch = channel; 3797 bdev_io->internal.desc = desc; 3798 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 3799 bdev_io->u.bdev.iovs = compare_iov; 3800 bdev_io->u.bdev.iovcnt = compare_iovcnt; 3801 bdev_io->u.bdev.fused_iovs = write_iov; 3802 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 3803 bdev_io->u.bdev.md_buf = NULL; 3804 bdev_io->u.bdev.num_blocks = num_blocks; 3805 bdev_io->u.bdev.offset_blocks = offset_blocks; 3806 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3807 3808 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 3809 bdev_io_submit(bdev_io); 3810 return 0; 3811 } 3812 3813 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 3814 bdev_comparev_and_writev_blocks_locked, bdev_io); 3815 } 3816 3817 static void 3818 bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3819 { 3820 if (!success) { 3821 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ 3822 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 3823 bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); 3824 return; 3825 } 3826 3827 if (bdev_io->u.bdev.zcopy.populate) { 3828 /* Read the real data into the buffer */ 3829 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 3830 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3831 bdev_io_submit(bdev_io); 3832 return; 3833 } 3834 3835 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. 
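 * Because it never went through bdev_io_submit(), io_outstanding was never
 * incremented for it, and spdk_bdev_io_complete() would decrement (and assert on)
 * counters that were never bumped. Invoking the caller's callback directly is the
 * correct way to finish an I/O that only the bdev layer ever saw.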
*/ 3836 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3837 bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); 3838 } 3839 3840 int 3841 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3842 uint64_t offset_blocks, uint64_t num_blocks, 3843 bool populate, 3844 spdk_bdev_io_completion_cb cb, void *cb_arg) 3845 { 3846 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3847 struct spdk_bdev_io *bdev_io; 3848 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3849 3850 if (!desc->write) { 3851 return -EBADF; 3852 } 3853 3854 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3855 return -EINVAL; 3856 } 3857 3858 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 3859 return -ENOTSUP; 3860 } 3861 3862 bdev_io = bdev_channel_get_io(channel); 3863 if (!bdev_io) { 3864 return -ENOMEM; 3865 } 3866 3867 bdev_io->internal.ch = channel; 3868 bdev_io->internal.desc = desc; 3869 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 3870 bdev_io->u.bdev.num_blocks = num_blocks; 3871 bdev_io->u.bdev.offset_blocks = offset_blocks; 3872 bdev_io->u.bdev.iovs = NULL; 3873 bdev_io->u.bdev.iovcnt = 0; 3874 bdev_io->u.bdev.md_buf = NULL; 3875 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 3876 bdev_io->u.bdev.zcopy.commit = 0; 3877 bdev_io->u.bdev.zcopy.start = 1; 3878 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3879 3880 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 3881 bdev_io_submit(bdev_io); 3882 } else { 3883 /* Emulate zcopy by allocating a buffer */ 3884 spdk_bdev_io_get_buf(bdev_io, bdev_zcopy_get_buf, 3885 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3886 } 3887 3888 return 0; 3889 } 3890 3891 int 3892 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 3893 spdk_bdev_io_completion_cb cb, void *cb_arg) 3894 { 3895 struct spdk_bdev *bdev = bdev_io->bdev; 3896 3897 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 3898 /* This can happen if the zcopy was emulated in start */ 3899 if (bdev_io->u.bdev.zcopy.start != 1) { 3900 return -EINVAL; 3901 } 3902 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 3903 } 3904 3905 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 3906 return -EINVAL; 3907 } 3908 3909 bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; 3910 bdev_io->u.bdev.zcopy.start = 0; 3911 bdev_io->internal.caller_ctx = cb_arg; 3912 bdev_io->internal.cb = cb; 3913 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3914 3915 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 3916 bdev_io_submit(bdev_io); 3917 return 0; 3918 } 3919 3920 if (!bdev_io->u.bdev.zcopy.commit) { 3921 /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. 
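 * The emulated start phase only handed out a bdev-layer buffer, and with commit
 * not requested there is no data to write back, so the end call can be finished
 * right here by invoking the caller's callback. The same accounting argument as
 * in bdev_zcopy_get_buf() applies: this I/O never reached the module.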
*/ 3922 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3923 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 3924 return 0; 3925 } 3926 3927 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 3928 bdev_io_submit(bdev_io); 3929 3930 return 0; 3931 } 3932 3933 int 3934 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3935 uint64_t offset, uint64_t len, 3936 spdk_bdev_io_completion_cb cb, void *cb_arg) 3937 { 3938 uint64_t offset_blocks, num_blocks; 3939 3940 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 3941 len, &num_blocks) != 0) { 3942 return -EINVAL; 3943 } 3944 3945 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 3946 } 3947 3948 int 3949 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3950 uint64_t offset_blocks, uint64_t num_blocks, 3951 spdk_bdev_io_completion_cb cb, void *cb_arg) 3952 { 3953 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3954 struct spdk_bdev_io *bdev_io; 3955 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 3956 3957 if (!desc->write) { 3958 return -EBADF; 3959 } 3960 3961 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 3962 return -EINVAL; 3963 } 3964 3965 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 3966 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 3967 return -ENOTSUP; 3968 } 3969 3970 bdev_io = bdev_channel_get_io(channel); 3971 3972 if (!bdev_io) { 3973 return -ENOMEM; 3974 } 3975 3976 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 3977 bdev_io->internal.ch = channel; 3978 bdev_io->internal.desc = desc; 3979 bdev_io->u.bdev.offset_blocks = offset_blocks; 3980 bdev_io->u.bdev.num_blocks = num_blocks; 3981 bdev_io_init(bdev_io, bdev, cb_arg, cb); 3982 3983 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 3984 bdev_io_submit(bdev_io); 3985 return 0; 3986 } 3987 3988 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 3989 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 3990 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 3991 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 3992 bdev_write_zero_buffer_next(bdev_io); 3993 3994 return 0; 3995 } 3996 3997 int 3998 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 3999 uint64_t offset, uint64_t nbytes, 4000 spdk_bdev_io_completion_cb cb, void *cb_arg) 4001 { 4002 uint64_t offset_blocks, num_blocks; 4003 4004 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4005 nbytes, &num_blocks) != 0) { 4006 return -EINVAL; 4007 } 4008 4009 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4010 } 4011 4012 int 4013 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4014 uint64_t offset_blocks, uint64_t num_blocks, 4015 spdk_bdev_io_completion_cb cb, void *cb_arg) 4016 { 4017 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4018 struct spdk_bdev_io *bdev_io; 4019 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4020 4021 if (!desc->write) { 4022 return -EBADF; 4023 } 4024 4025 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4026 return -EINVAL; 4027 } 4028 4029 if (num_blocks == 0) { 4030 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 4031 return -EINVAL; 4032 } 4033 4034 bdev_io = bdev_channel_get_io(channel); 4035 if (!bdev_io) { 4036 return -ENOMEM; 4037 } 4038 4039 bdev_io->internal.ch = 
channel; 4040 bdev_io->internal.desc = desc; 4041 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 4042 4043 bdev_io->u.bdev.iovs = &bdev_io->iov; 4044 bdev_io->u.bdev.iovs[0].iov_base = NULL; 4045 bdev_io->u.bdev.iovs[0].iov_len = 0; 4046 bdev_io->u.bdev.iovcnt = 1; 4047 4048 bdev_io->u.bdev.offset_blocks = offset_blocks; 4049 bdev_io->u.bdev.num_blocks = num_blocks; 4050 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4051 4052 bdev_io_submit(bdev_io); 4053 return 0; 4054 } 4055 4056 int 4057 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4058 uint64_t offset, uint64_t length, 4059 spdk_bdev_io_completion_cb cb, void *cb_arg) 4060 { 4061 uint64_t offset_blocks, num_blocks; 4062 4063 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4064 length, &num_blocks) != 0) { 4065 return -EINVAL; 4066 } 4067 4068 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4069 } 4070 4071 int 4072 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4073 uint64_t offset_blocks, uint64_t num_blocks, 4074 spdk_bdev_io_completion_cb cb, void *cb_arg) 4075 { 4076 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4077 struct spdk_bdev_io *bdev_io; 4078 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4079 4080 if (!desc->write) { 4081 return -EBADF; 4082 } 4083 4084 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4085 return -EINVAL; 4086 } 4087 4088 bdev_io = bdev_channel_get_io(channel); 4089 if (!bdev_io) { 4090 return -ENOMEM; 4091 } 4092 4093 bdev_io->internal.ch = channel; 4094 bdev_io->internal.desc = desc; 4095 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 4096 bdev_io->u.bdev.iovs = NULL; 4097 bdev_io->u.bdev.iovcnt = 0; 4098 bdev_io->u.bdev.offset_blocks = offset_blocks; 4099 bdev_io->u.bdev.num_blocks = num_blocks; 4100 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4101 4102 bdev_io_submit(bdev_io); 4103 return 0; 4104 } 4105 4106 static void 4107 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 4108 { 4109 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 4110 struct spdk_bdev_io *bdev_io; 4111 4112 bdev_io = TAILQ_FIRST(&ch->queued_resets); 4113 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 4114 bdev_io_submit_reset(bdev_io); 4115 } 4116 4117 static void 4118 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 4119 { 4120 struct spdk_io_channel *ch; 4121 struct spdk_bdev_channel *channel; 4122 struct spdk_bdev_mgmt_channel *mgmt_channel; 4123 struct spdk_bdev_shared_resource *shared_resource; 4124 bdev_io_tailq_t tmp_queued; 4125 4126 TAILQ_INIT(&tmp_queued); 4127 4128 ch = spdk_io_channel_iter_get_channel(i); 4129 channel = spdk_io_channel_get_ctx(ch); 4130 shared_resource = channel->shared_resource; 4131 mgmt_channel = shared_resource->mgmt_ch; 4132 4133 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 4134 4135 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 4136 /* The QoS object is always valid and readable while 4137 * the channel flag is set, so the lock here should not 4138 * be necessary. We're not in the fast path though, so 4139 * just take it anyway. 
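 * If this channel happens to be the one the QoS object funnels through, the
 * TAILQ_SWAP below pulls the entire queued list out from under the QoS poller so
 * that the entries belonging to this channel can be aborted further down together
 * with nomem_io and the buf-wait queues.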
*/ 4140 pthread_mutex_lock(&channel->bdev->internal.mutex); 4141 if (channel->bdev->internal.qos->ch == channel) { 4142 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 4143 } 4144 pthread_mutex_unlock(&channel->bdev->internal.mutex); 4145 } 4146 4147 bdev_abort_queued_io(&shared_resource->nomem_io, channel); 4148 bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); 4149 bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); 4150 bdev_abort_queued_io(&tmp_queued, channel); 4151 4152 spdk_for_each_channel_continue(i, 0); 4153 } 4154 4155 static void 4156 bdev_start_reset(void *ctx) 4157 { 4158 struct spdk_bdev_channel *ch = ctx; 4159 4160 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 4161 ch, bdev_reset_dev); 4162 } 4163 4164 static void 4165 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 4166 { 4167 struct spdk_bdev *bdev = ch->bdev; 4168 4169 assert(!TAILQ_EMPTY(&ch->queued_resets)); 4170 4171 pthread_mutex_lock(&bdev->internal.mutex); 4172 if (bdev->internal.reset_in_progress == NULL) { 4173 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 4174 /* 4175 * Take a channel reference for the target bdev for the life of this 4176 * reset. This guards against the channel getting destroyed while 4177 * spdk_for_each_channel() calls related to this reset IO are in 4178 * progress. We will release the reference when this reset is 4179 * completed. 4180 */ 4181 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4182 bdev_start_reset(ch); 4183 } 4184 pthread_mutex_unlock(&bdev->internal.mutex); 4185 } 4186 4187 int 4188 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4189 spdk_bdev_io_completion_cb cb, void *cb_arg) 4190 { 4191 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4192 struct spdk_bdev_io *bdev_io; 4193 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4194 4195 bdev_io = bdev_channel_get_io(channel); 4196 if (!bdev_io) { 4197 return -ENOMEM; 4198 } 4199 4200 bdev_io->internal.ch = channel; 4201 bdev_io->internal.desc = desc; 4202 bdev_io->internal.submit_tsc = spdk_get_ticks(); 4203 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 4204 bdev_io->u.reset.ch_ref = NULL; 4205 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4206 4207 pthread_mutex_lock(&bdev->internal.mutex); 4208 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 4209 pthread_mutex_unlock(&bdev->internal.mutex); 4210 4211 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 4212 internal.ch_link); 4213 4214 bdev_channel_start_reset(channel); 4215 4216 return 0; 4217 } 4218 4219 void 4220 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4221 struct spdk_bdev_io_stat *stat) 4222 { 4223 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4224 4225 *stat = channel->stat; 4226 } 4227 4228 static void 4229 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 4230 { 4231 void *io_device = spdk_io_channel_iter_get_io_device(i); 4232 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4233 4234 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 4235 bdev_iostat_ctx->cb_arg, 0); 4236 free(bdev_iostat_ctx); 4237 } 4238 4239 static void 4240 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 4241 { 4242 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 4243 struct spdk_io_channel 
*ch = spdk_io_channel_iter_get_channel(i); 4244 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4245 4246 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 4247 spdk_for_each_channel_continue(i, 0); 4248 } 4249 4250 void 4251 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 4252 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 4253 { 4254 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 4255 4256 assert(bdev != NULL); 4257 assert(stat != NULL); 4258 assert(cb != NULL); 4259 4260 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 4261 if (bdev_iostat_ctx == NULL) { 4262 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 4263 cb(bdev, stat, cb_arg, -ENOMEM); 4264 return; 4265 } 4266 4267 bdev_iostat_ctx->stat = stat; 4268 bdev_iostat_ctx->cb = cb; 4269 bdev_iostat_ctx->cb_arg = cb_arg; 4270 4271 /* Start with the statistics from previously deleted channels. */ 4272 pthread_mutex_lock(&bdev->internal.mutex); 4273 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 4274 pthread_mutex_unlock(&bdev->internal.mutex); 4275 4276 /* Then iterate and add the statistics from each existing channel. */ 4277 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4278 bdev_get_each_channel_stat, 4279 bdev_iostat_ctx, 4280 bdev_get_device_stat_done); 4281 } 4282 4283 int 4284 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4285 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4286 spdk_bdev_io_completion_cb cb, void *cb_arg) 4287 { 4288 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4289 struct spdk_bdev_io *bdev_io; 4290 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4291 4292 if (!desc->write) { 4293 return -EBADF; 4294 } 4295 4296 bdev_io = bdev_channel_get_io(channel); 4297 if (!bdev_io) { 4298 return -ENOMEM; 4299 } 4300 4301 bdev_io->internal.ch = channel; 4302 bdev_io->internal.desc = desc; 4303 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 4304 bdev_io->u.nvme_passthru.cmd = *cmd; 4305 bdev_io->u.nvme_passthru.buf = buf; 4306 bdev_io->u.nvme_passthru.nbytes = nbytes; 4307 bdev_io->u.nvme_passthru.md_buf = NULL; 4308 bdev_io->u.nvme_passthru.md_len = 0; 4309 4310 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4311 4312 bdev_io_submit(bdev_io); 4313 return 0; 4314 } 4315 4316 int 4317 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4318 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 4319 spdk_bdev_io_completion_cb cb, void *cb_arg) 4320 { 4321 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4322 struct spdk_bdev_io *bdev_io; 4323 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4324 4325 if (!desc->write) { 4326 /* 4327 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4328 * to easily determine if the command is a read or write, but for now just 4329 * do not allow io_passthru with a read-only descriptor. 
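 * In practice this means every passthru submission requires a descriptor that was
 * opened with write set to true, even for commands that only read.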
4330 */ 4331 return -EBADF; 4332 } 4333 4334 bdev_io = bdev_channel_get_io(channel); 4335 if (!bdev_io) { 4336 return -ENOMEM; 4337 } 4338 4339 bdev_io->internal.ch = channel; 4340 bdev_io->internal.desc = desc; 4341 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 4342 bdev_io->u.nvme_passthru.cmd = *cmd; 4343 bdev_io->u.nvme_passthru.buf = buf; 4344 bdev_io->u.nvme_passthru.nbytes = nbytes; 4345 bdev_io->u.nvme_passthru.md_buf = NULL; 4346 bdev_io->u.nvme_passthru.md_len = 0; 4347 4348 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4349 4350 bdev_io_submit(bdev_io); 4351 return 0; 4352 } 4353 4354 int 4355 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4356 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 4357 spdk_bdev_io_completion_cb cb, void *cb_arg) 4358 { 4359 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4360 struct spdk_bdev_io *bdev_io; 4361 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4362 4363 if (!desc->write) { 4364 /* 4365 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 4366 * to easily determine if the command is a read or write, but for now just 4367 * do not allow io_passthru with a read-only descriptor. 4368 */ 4369 return -EBADF; 4370 } 4371 4372 bdev_io = bdev_channel_get_io(channel); 4373 if (!bdev_io) { 4374 return -ENOMEM; 4375 } 4376 4377 bdev_io->internal.ch = channel; 4378 bdev_io->internal.desc = desc; 4379 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 4380 bdev_io->u.nvme_passthru.cmd = *cmd; 4381 bdev_io->u.nvme_passthru.buf = buf; 4382 bdev_io->u.nvme_passthru.nbytes = nbytes; 4383 bdev_io->u.nvme_passthru.md_buf = md_buf; 4384 bdev_io->u.nvme_passthru.md_len = md_len; 4385 4386 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4387 4388 bdev_io_submit(bdev_io); 4389 return 0; 4390 } 4391 4392 int 4393 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 4394 struct spdk_bdev_io_wait_entry *entry) 4395 { 4396 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4397 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 4398 4399 if (bdev != entry->bdev) { 4400 SPDK_ERRLOG("bdevs do not match\n"); 4401 return -EINVAL; 4402 } 4403 4404 if (mgmt_ch->per_thread_cache_count > 0) { 4405 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 4406 return -EINVAL; 4407 } 4408 4409 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 4410 return 0; 4411 } 4412 4413 static void 4414 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 4415 { 4416 struct spdk_bdev *bdev = bdev_ch->bdev; 4417 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 4418 struct spdk_bdev_io *bdev_io; 4419 4420 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 4421 /* 4422 * Allow some more I/O to complete before retrying the nomem_io queue. 4423 * Some drivers (such as nvme) cannot immediately take a new I/O in 4424 * the context of a completion, because the resources for the I/O are 4425 * not released until control returns to the bdev poller. Also, we 4426 * may require several small I/O to complete before a larger I/O 4427 * (that requires splitting) can be submitted. 
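 * This early return is gated by nomem_threshold, which spdk_bdev_io_complete()
 * recomputes every time an I/O is parked on nomem_io.
 *
 * For completeness, the caller-facing counterpart of this machinery: a module or
 * application that gets -ENOMEM back from a submission API is expected to park
 * itself with spdk_bdev_queue_io_wait() and resubmit from the callback. A minimal
 * sketch, where resubmit_io(), io_ch and the ctx holding the wait entry are
 * hypothetical:
 *
 *     ctx->bdev_io_wait.bdev = bdev;
 *     ctx->bdev_io_wait.cb_fn = resubmit_io;
 *     ctx->bdev_io_wait.cb_arg = ctx;
 *     spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->bdev_io_wait);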
4428 */ 4429 return; 4430 } 4431 4432 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 4433 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 4434 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 4435 bdev_io->internal.ch->io_outstanding++; 4436 shared_resource->io_outstanding++; 4437 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4438 bdev_io->internal.error.nvme.cdw0 = 0; 4439 bdev_io->num_retries++; 4440 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 4441 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 4442 break; 4443 } 4444 } 4445 } 4446 4447 static inline void 4448 bdev_io_complete(void *ctx) 4449 { 4450 struct spdk_bdev_io *bdev_io = ctx; 4451 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 4452 uint64_t tsc, tsc_diff; 4453 4454 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 4455 /* 4456 * Send the completion to the thread that originally submitted the I/O, 4457 * which may not be the current thread in the case of QoS. 4458 */ 4459 if (bdev_io->internal.io_submit_ch) { 4460 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 4461 bdev_io->internal.io_submit_ch = NULL; 4462 } 4463 4464 /* 4465 * Defer completion to avoid potential infinite recursion if the 4466 * user's completion callback issues a new I/O. 4467 */ 4468 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 4469 bdev_io_complete, bdev_io); 4470 return; 4471 } 4472 4473 tsc = spdk_get_ticks(); 4474 tsc_diff = tsc - bdev_io->internal.submit_tsc; 4475 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); 4476 /* When a bdev_io is split, the children bdev_io are not added 4477 * to the io_submitted list. So don't try to remove them in that 4478 * case. 
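 * Only the parent I/O was inserted into io_submitted at submission time (that
 * list is what, among other things, the per-descriptor I/O timeout poller walks),
 * so the parent is the only entry that needs to be removed here.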
4479 */ 4480 if (bdev_io->internal.cb != bdev_io_split_done) { 4481 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 4482 } 4483 4484 if (bdev_io->internal.ch->histogram) { 4485 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 4486 } 4487 4488 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 4489 switch (bdev_io->type) { 4490 case SPDK_BDEV_IO_TYPE_READ: 4491 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4492 bdev_io->internal.ch->stat.num_read_ops++; 4493 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 4494 break; 4495 case SPDK_BDEV_IO_TYPE_WRITE: 4496 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4497 bdev_io->internal.ch->stat.num_write_ops++; 4498 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 4499 break; 4500 case SPDK_BDEV_IO_TYPE_UNMAP: 4501 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4502 bdev_io->internal.ch->stat.num_unmap_ops++; 4503 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 4504 break; 4505 case SPDK_BDEV_IO_TYPE_ZCOPY: 4506 /* Track the data in the start phase only */ 4507 if (bdev_io->u.bdev.zcopy.start) { 4508 if (bdev_io->u.bdev.zcopy.populate) { 4509 bdev_io->internal.ch->stat.bytes_read += 4510 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4511 bdev_io->internal.ch->stat.num_read_ops++; 4512 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 4513 } else { 4514 bdev_io->internal.ch->stat.bytes_written += 4515 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 4516 bdev_io->internal.ch->stat.num_write_ops++; 4517 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 4518 } 4519 } 4520 break; 4521 default: 4522 break; 4523 } 4524 } 4525 4526 #ifdef SPDK_CONFIG_VTUNE 4527 uint64_t now_tsc = spdk_get_ticks(); 4528 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 4529 uint64_t data[5]; 4530 4531 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 4532 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 4533 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 4534 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 4535 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
4536 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 4537 4538 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 4539 __itt_metadata_u64, 5, data); 4540 4541 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 4542 bdev_io->internal.ch->start_tsc = now_tsc; 4543 } 4544 #endif 4545 4546 assert(bdev_io->internal.cb != NULL); 4547 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 4548 4549 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 4550 bdev_io->internal.caller_ctx); 4551 } 4552 4553 static void 4554 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 4555 { 4556 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 4557 4558 if (bdev_io->u.reset.ch_ref != NULL) { 4559 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 4560 bdev_io->u.reset.ch_ref = NULL; 4561 } 4562 4563 bdev_io_complete(bdev_io); 4564 } 4565 4566 static void 4567 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 4568 { 4569 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 4570 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4571 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 4572 struct spdk_bdev_io *queued_reset; 4573 4574 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 4575 while (!TAILQ_EMPTY(&ch->queued_resets)) { 4576 queued_reset = TAILQ_FIRST(&ch->queued_resets); 4577 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 4578 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 4579 } 4580 4581 spdk_for_each_channel_continue(i, 0); 4582 } 4583 4584 void 4585 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 4586 { 4587 struct spdk_bdev *bdev = bdev_io->bdev; 4588 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 4589 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 4590 4591 bdev_io->internal.status = status; 4592 4593 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 4594 bool unlock_channels = false; 4595 4596 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 4597 SPDK_ERRLOG("NOMEM returned for reset\n"); 4598 } 4599 pthread_mutex_lock(&bdev->internal.mutex); 4600 if (bdev_io == bdev->internal.reset_in_progress) { 4601 bdev->internal.reset_in_progress = NULL; 4602 unlock_channels = true; 4603 } 4604 pthread_mutex_unlock(&bdev->internal.mutex); 4605 4606 if (unlock_channels) { 4607 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 4608 bdev_io, bdev_reset_complete); 4609 return; 4610 } 4611 } else { 4612 _bdev_io_unset_bounce_buf(bdev_io); 4613 4614 assert(bdev_ch->io_outstanding > 0); 4615 assert(shared_resource->io_outstanding > 0); 4616 bdev_ch->io_outstanding--; 4617 shared_resource->io_outstanding--; 4618 4619 if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { 4620 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 4621 /* 4622 * Wait for some of the outstanding I/O to complete before we 4623 * retry any of the nomem_io. Normally we will wait for 4624 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 4625 * depth channels we will instead wait for half to complete. 
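 * As a worked example with NOMEM_THRESHOLD_COUNT == 8: if 100 I/O are still
 * outstanding when this one is queued, the threshold becomes
 * spdk_max(100 / 2, 100 - 8) = 92, so bdev_ch_retry_io() starts draining
 * nomem_io once outstanding I/O drops to 92, i.e. after 8 more completions.
 * With only 10 outstanding, the threshold is spdk_max(5, 2) = 5, i.e. wait for
 * half of them to complete.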
4626 */ 4627 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 4628 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 4629 return; 4630 } 4631 4632 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 4633 bdev_ch_retry_io(bdev_ch); 4634 } 4635 } 4636 4637 bdev_io_complete(bdev_io); 4638 } 4639 4640 void 4641 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 4642 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 4643 { 4644 if (sc == SPDK_SCSI_STATUS_GOOD) { 4645 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4646 } else { 4647 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 4648 bdev_io->internal.error.scsi.sc = sc; 4649 bdev_io->internal.error.scsi.sk = sk; 4650 bdev_io->internal.error.scsi.asc = asc; 4651 bdev_io->internal.error.scsi.ascq = ascq; 4652 } 4653 4654 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 4655 } 4656 4657 void 4658 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 4659 int *sc, int *sk, int *asc, int *ascq) 4660 { 4661 assert(sc != NULL); 4662 assert(sk != NULL); 4663 assert(asc != NULL); 4664 assert(ascq != NULL); 4665 4666 switch (bdev_io->internal.status) { 4667 case SPDK_BDEV_IO_STATUS_SUCCESS: 4668 *sc = SPDK_SCSI_STATUS_GOOD; 4669 *sk = SPDK_SCSI_SENSE_NO_SENSE; 4670 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 4671 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 4672 break; 4673 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 4674 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 4675 break; 4676 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 4677 *sc = bdev_io->internal.error.scsi.sc; 4678 *sk = bdev_io->internal.error.scsi.sk; 4679 *asc = bdev_io->internal.error.scsi.asc; 4680 *ascq = bdev_io->internal.error.scsi.ascq; 4681 break; 4682 default: 4683 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 4684 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 4685 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 4686 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 4687 break; 4688 } 4689 } 4690 4691 void 4692 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 4693 { 4694 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 4695 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4696 } else { 4697 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 4698 } 4699 4700 bdev_io->internal.error.nvme.cdw0 = cdw0; 4701 bdev_io->internal.error.nvme.sct = sct; 4702 bdev_io->internal.error.nvme.sc = sc; 4703 4704 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 4705 } 4706 4707 void 4708 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 4709 { 4710 assert(sct != NULL); 4711 assert(sc != NULL); 4712 assert(cdw0 != NULL); 4713 4714 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 4715 *sct = bdev_io->internal.error.nvme.sct; 4716 *sc = bdev_io->internal.error.nvme.sc; 4717 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 4718 *sct = SPDK_NVME_SCT_GENERIC; 4719 *sc = SPDK_NVME_SC_SUCCESS; 4720 } else { 4721 *sct = SPDK_NVME_SCT_GENERIC; 4722 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 4723 } 4724 4725 *cdw0 = bdev_io->internal.error.nvme.cdw0; 4726 } 4727 4728 void 4729 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 4730 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 4731 { 4732 assert(first_sct != NULL); 4733 assert(first_sc != NULL); 4734 assert(second_sct != 
NULL); 4735 assert(second_sc != NULL); 4736 assert(cdw0 != NULL); 4737 4738 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 4739 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 4740 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 4741 *first_sct = bdev_io->internal.error.nvme.sct; 4742 *first_sc = bdev_io->internal.error.nvme.sc; 4743 *second_sct = SPDK_NVME_SCT_GENERIC; 4744 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 4745 } else { 4746 *first_sct = SPDK_NVME_SCT_GENERIC; 4747 *first_sc = SPDK_NVME_SC_SUCCESS; 4748 *second_sct = bdev_io->internal.error.nvme.sct; 4749 *second_sc = bdev_io->internal.error.nvme.sc; 4750 } 4751 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 4752 *first_sct = SPDK_NVME_SCT_GENERIC; 4753 *first_sc = SPDK_NVME_SC_SUCCESS; 4754 *second_sct = SPDK_NVME_SCT_GENERIC; 4755 *second_sc = SPDK_NVME_SC_SUCCESS; 4756 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 4757 *first_sct = SPDK_NVME_SCT_GENERIC; 4758 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 4759 *second_sct = SPDK_NVME_SCT_GENERIC; 4760 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 4761 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 4762 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 4763 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 4764 *second_sct = SPDK_NVME_SCT_GENERIC; 4765 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 4766 } else { 4767 *first_sct = SPDK_NVME_SCT_GENERIC; 4768 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 4769 *second_sct = SPDK_NVME_SCT_GENERIC; 4770 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 4771 } 4772 4773 *cdw0 = bdev_io->internal.error.nvme.cdw0; 4774 } 4775 4776 struct spdk_thread * 4777 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 4778 { 4779 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 4780 } 4781 4782 struct spdk_io_channel * 4783 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 4784 { 4785 return bdev_io->internal.ch->channel; 4786 } 4787 4788 static void 4789 bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) 4790 { 4791 uint64_t min_qos_set; 4792 int i; 4793 4794 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4795 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4796 break; 4797 } 4798 } 4799 4800 if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 4801 SPDK_ERRLOG("Invalid rate limits set.\n"); 4802 return; 4803 } 4804 4805 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4806 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4807 continue; 4808 } 4809 4810 if (bdev_qos_is_iops_rate_limit(i) == true) { 4811 min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 4812 } else { 4813 min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 4814 } 4815 4816 if (limits[i] == 0 || limits[i] % min_qos_set) { 4817 SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", 4818 limits[i], bdev->name, min_qos_set); 4819 SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); 4820 return; 4821 } 4822 } 4823 4824 if (!bdev->internal.qos) { 4825 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 4826 if (!bdev->internal.qos) { 4827 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 4828 return; 4829 } 4830 } 4831 4832 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4833 bdev->internal.qos->rate_limits[i].limit = limits[i]; 4834 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", 4835 bdev->name, i, limits[i]); 
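/* At this point limits[i] is already normalized: IOPS limits are stored in I/Os
 * per second and bandwidth limits in bytes per second. bdev_qos_config() below
 * feeds this function from the legacy [QoS] config section, where each line
 * names a type, a bdev and a value, roughly:
 *
 *     Limit_IOPS      Malloc0 10000
 *     Limit_Write_BPS Malloc0 100
 *
 * (Malloc0 and the numbers are only illustrative; BPS values are given in MB/s
 * and converted before reaching here.)
 */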
4836 } 4837 4838 return; 4839 } 4840 4841 static void 4842 bdev_qos_config(struct spdk_bdev *bdev) 4843 { 4844 struct spdk_conf_section *sp = NULL; 4845 const char *val = NULL; 4846 int i = 0, j = 0; 4847 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; 4848 bool config_qos = false; 4849 4850 sp = spdk_conf_find_section(NULL, "QoS"); 4851 if (!sp) { 4852 return; 4853 } 4854 4855 while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { 4856 limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4857 4858 i = 0; 4859 while (true) { 4860 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); 4861 if (!val) { 4862 break; 4863 } 4864 4865 if (strcmp(bdev->name, val) != 0) { 4866 i++; 4867 continue; 4868 } 4869 4870 val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); 4871 if (val) { 4872 if (bdev_qos_is_iops_rate_limit(j) == true) { 4873 limits[j] = strtoull(val, NULL, 10); 4874 } else { 4875 limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; 4876 } 4877 config_qos = true; 4878 } 4879 4880 break; 4881 } 4882 4883 j++; 4884 } 4885 4886 if (config_qos == true) { 4887 bdev_qos_config_limit(bdev, limits); 4888 } 4889 4890 return; 4891 } 4892 4893 static int 4894 bdev_init(struct spdk_bdev *bdev) 4895 { 4896 char *bdev_name; 4897 4898 assert(bdev->module != NULL); 4899 4900 if (!bdev->name) { 4901 SPDK_ERRLOG("Bdev name is NULL\n"); 4902 return -EINVAL; 4903 } 4904 4905 if (!strlen(bdev->name)) { 4906 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 4907 return -EINVAL; 4908 } 4909 4910 if (spdk_bdev_get_by_name(bdev->name)) { 4911 SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); 4912 return -EEXIST; 4913 } 4914 4915 /* Users often register their own I/O devices using the bdev name. In 4916 * order to avoid conflicts, prepend bdev_. */ 4917 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 4918 if (!bdev_name) { 4919 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 4920 return -ENOMEM; 4921 } 4922 4923 bdev->internal.status = SPDK_BDEV_STATUS_READY; 4924 bdev->internal.measured_queue_depth = UINT64_MAX; 4925 bdev->internal.claim_module = NULL; 4926 bdev->internal.qd_poller = NULL; 4927 bdev->internal.qos = NULL; 4928 4929 /* If the user didn't specify a uuid, generate one. */ 4930 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 4931 spdk_uuid_generate(&bdev->uuid); 4932 } 4933 4934 if (spdk_bdev_get_buf_align(bdev) > 1) { 4935 if (bdev->split_on_optimal_io_boundary) { 4936 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 4937 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 4938 } else { 4939 bdev->split_on_optimal_io_boundary = true; 4940 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 4941 } 4942 } 4943 4944 /* If the user didn't specify a write unit size, set it to one. 
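 * A write unit of one block places no additional constraint on callers; modules
 * that do require a larger granularity are expected to set write_unit_size before
 * registering, and users can query it via spdk_bdev_get_write_unit_size().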
*/ 4945 if (bdev->write_unit_size == 0) { 4946 bdev->write_unit_size = 1; 4947 } 4948 4949 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 4950 if (bdev->acwu == 0) { 4951 bdev->acwu = 1; 4952 } 4953 4954 TAILQ_INIT(&bdev->internal.open_descs); 4955 TAILQ_INIT(&bdev->internal.locked_ranges); 4956 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 4957 4958 TAILQ_INIT(&bdev->aliases); 4959 4960 bdev->internal.reset_in_progress = NULL; 4961 4962 bdev_qos_config(bdev); 4963 4964 spdk_io_device_register(__bdev_to_io_dev(bdev), 4965 bdev_channel_create, bdev_channel_destroy, 4966 sizeof(struct spdk_bdev_channel), 4967 bdev_name); 4968 4969 free(bdev_name); 4970 4971 pthread_mutex_init(&bdev->internal.mutex, NULL); 4972 return 0; 4973 } 4974 4975 static void 4976 bdev_destroy_cb(void *io_device) 4977 { 4978 int rc; 4979 struct spdk_bdev *bdev; 4980 spdk_bdev_unregister_cb cb_fn; 4981 void *cb_arg; 4982 4983 bdev = __bdev_from_io_dev(io_device); 4984 cb_fn = bdev->internal.unregister_cb; 4985 cb_arg = bdev->internal.unregister_ctx; 4986 4987 rc = bdev->fn_table->destruct(bdev->ctxt); 4988 if (rc < 0) { 4989 SPDK_ERRLOG("destruct failed\n"); 4990 } 4991 if (rc <= 0 && cb_fn != NULL) { 4992 cb_fn(cb_arg, rc); 4993 } 4994 } 4995 4996 4997 static void 4998 bdev_fini(struct spdk_bdev *bdev) 4999 { 5000 pthread_mutex_destroy(&bdev->internal.mutex); 5001 5002 free(bdev->internal.qos); 5003 5004 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5005 } 5006 5007 static void 5008 bdev_start(struct spdk_bdev *bdev) 5009 { 5010 struct spdk_bdev_module *module; 5011 uint32_t action; 5012 5013 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); 5014 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 5015 5016 /* Examine configuration before initializing I/O */ 5017 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 5018 if (module->examine_config) { 5019 action = module->internal.action_in_progress; 5020 module->internal.action_in_progress++; 5021 module->examine_config(bdev); 5022 if (action != module->internal.action_in_progress) { 5023 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 5024 module->name); 5025 } 5026 } 5027 } 5028 5029 if (bdev->internal.claim_module) { 5030 if (bdev->internal.claim_module->examine_disk) { 5031 bdev->internal.claim_module->internal.action_in_progress++; 5032 bdev->internal.claim_module->examine_disk(bdev); 5033 } 5034 return; 5035 } 5036 5037 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 5038 if (module->examine_disk) { 5039 module->internal.action_in_progress++; 5040 module->examine_disk(bdev); 5041 } 5042 } 5043 } 5044 5045 int 5046 spdk_bdev_register(struct spdk_bdev *bdev) 5047 { 5048 int rc = bdev_init(bdev); 5049 5050 if (rc == 0) { 5051 bdev_start(bdev); 5052 } 5053 5054 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 5055 return rc; 5056 } 5057 5058 int 5059 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) 5060 { 5061 SPDK_ERRLOG("This function is deprecated. 
Use spdk_bdev_register() instead.\n"); 5062 return spdk_bdev_register(vbdev); 5063 } 5064 5065 void 5066 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 5067 { 5068 if (bdev->internal.unregister_cb != NULL) { 5069 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 5070 } 5071 } 5072 5073 static void 5074 _remove_notify(void *arg) 5075 { 5076 struct spdk_bdev_desc *desc = arg; 5077 5078 pthread_mutex_lock(&desc->mutex); 5079 desc->refs--; 5080 5081 if (!desc->closed) { 5082 pthread_mutex_unlock(&desc->mutex); 5083 if (desc->callback.open_with_ext) { 5084 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 5085 } else { 5086 desc->callback.remove_fn(desc->callback.ctx); 5087 } 5088 return; 5089 } else if (0 == desc->refs) { 5090 /* This descriptor was closed after this remove_notify message was sent. 5091 * spdk_bdev_close() could not free the descriptor since this message was 5092 * in flight, so we free it now using bdev_desc_free(). 5093 */ 5094 pthread_mutex_unlock(&desc->mutex); 5095 bdev_desc_free(desc); 5096 return; 5097 } 5098 pthread_mutex_unlock(&desc->mutex); 5099 } 5100 5101 /* Must be called while holding bdev->internal.mutex. 5102 * returns: 0 - bdev removed and ready to be destructed. 5103 * -EBUSY - bdev can't be destructed yet. */ 5104 static int 5105 bdev_unregister_unsafe(struct spdk_bdev *bdev) 5106 { 5107 struct spdk_bdev_desc *desc, *tmp; 5108 int rc = 0; 5109 5110 /* Notify each descriptor about hotremoval */ 5111 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 5112 rc = -EBUSY; 5113 pthread_mutex_lock(&desc->mutex); 5114 /* 5115 * Defer invocation of the event_cb to a separate message that will 5116 * run later on its thread. This ensures this context unwinds and 5117 * we don't recursively unregister this bdev again if the event_cb 5118 * immediately closes its descriptor. 5119 */ 5120 desc->refs++; 5121 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 5122 pthread_mutex_unlock(&desc->mutex); 5123 } 5124 5125 /* If there are no descriptors, proceed removing the bdev */ 5126 if (rc == 0) { 5127 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 5128 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list done\n", bdev->name); 5129 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 5130 } 5131 5132 return rc; 5133 } 5134 5135 void 5136 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 5137 { 5138 struct spdk_thread *thread; 5139 int rc; 5140 5141 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); 5142 5143 thread = spdk_get_thread(); 5144 if (!thread) { 5145 /* The user called this from a non-SPDK thread. */ 5146 if (cb_fn != NULL) { 5147 cb_fn(cb_arg, -ENOTSUP); 5148 } 5149 return; 5150 } 5151 5152 pthread_mutex_lock(&g_bdev_mgr.mutex); 5153 pthread_mutex_lock(&bdev->internal.mutex); 5154 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5155 pthread_mutex_unlock(&bdev->internal.mutex); 5156 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5157 if (cb_fn) { 5158 cb_fn(cb_arg, -EBUSY); 5159 } 5160 return; 5161 } 5162 5163 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 5164 bdev->internal.unregister_cb = cb_fn; 5165 bdev->internal.unregister_ctx = cb_arg; 5166 5167 /* Call under lock. 
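 * Both g_bdev_mgr.mutex and bdev->internal.mutex are held here. If descriptors
 * are still open, bdev_unregister_unsafe() returns -EBUSY and the unregister is
 * finished later, from spdk_bdev_close(), once the last descriptor goes away.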
*/ 5168 rc = bdev_unregister_unsafe(bdev); 5169 pthread_mutex_unlock(&bdev->internal.mutex); 5170 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5171 5172 if (rc == 0) { 5173 bdev_fini(bdev); 5174 } 5175 } 5176 5177 static void 5178 bdev_dummy_event_cb(void *remove_ctx) 5179 { 5180 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev remove event received with no remove callback specified"); 5181 } 5182 5183 static int 5184 bdev_start_qos(struct spdk_bdev *bdev) 5185 { 5186 struct set_qos_limit_ctx *ctx; 5187 5188 /* Enable QoS */ 5189 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 5190 ctx = calloc(1, sizeof(*ctx)); 5191 if (ctx == NULL) { 5192 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 5193 return -ENOMEM; 5194 } 5195 ctx->bdev = bdev; 5196 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5197 bdev_enable_qos_msg, ctx, 5198 bdev_enable_qos_done); 5199 } 5200 5201 return 0; 5202 } 5203 5204 static int 5205 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 5206 { 5207 struct spdk_thread *thread; 5208 int rc = 0; 5209 5210 thread = spdk_get_thread(); 5211 if (!thread) { 5212 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 5213 return -ENOTSUP; 5214 } 5215 5216 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5217 spdk_get_thread()); 5218 5219 desc->bdev = bdev; 5220 desc->thread = thread; 5221 desc->write = write; 5222 5223 pthread_mutex_lock(&bdev->internal.mutex); 5224 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 5225 pthread_mutex_unlock(&bdev->internal.mutex); 5226 return -ENODEV; 5227 } 5228 5229 if (write && bdev->internal.claim_module) { 5230 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 5231 bdev->name, bdev->internal.claim_module->name); 5232 pthread_mutex_unlock(&bdev->internal.mutex); 5233 return -EPERM; 5234 } 5235 5236 rc = bdev_start_qos(bdev); 5237 if (rc != 0) { 5238 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 5239 pthread_mutex_unlock(&bdev->internal.mutex); 5240 return rc; 5241 } 5242 5243 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 5244 5245 pthread_mutex_unlock(&bdev->internal.mutex); 5246 5247 return 0; 5248 } 5249 5250 int 5251 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, 5252 void *remove_ctx, struct spdk_bdev_desc **_desc) 5253 { 5254 struct spdk_bdev_desc *desc; 5255 int rc; 5256 5257 desc = calloc(1, sizeof(*desc)); 5258 if (desc == NULL) { 5259 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 5260 return -ENOMEM; 5261 } 5262 5263 if (remove_cb == NULL) { 5264 remove_cb = bdev_dummy_event_cb; 5265 } 5266 5267 TAILQ_INIT(&desc->pending_media_events); 5268 TAILQ_INIT(&desc->free_media_events); 5269 5270 desc->callback.open_with_ext = false; 5271 desc->callback.remove_fn = remove_cb; 5272 desc->callback.ctx = remove_ctx; 5273 pthread_mutex_init(&desc->mutex, NULL); 5274 5275 pthread_mutex_lock(&g_bdev_mgr.mutex); 5276 5277 rc = bdev_open(bdev, write, desc); 5278 if (rc != 0) { 5279 bdev_desc_free(desc); 5280 desc = NULL; 5281 } 5282 5283 *_desc = desc; 5284 5285 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5286 5287 return rc; 5288 } 5289 5290 int 5291 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 5292 void *event_ctx, struct spdk_bdev_desc **_desc) 5293 { 5294 struct spdk_bdev_desc *desc; 5295 struct spdk_bdev *bdev; 5296 unsigned int event_id; 5297 int rc; 5298 5299 if (event_cb == NULL) { 5300 SPDK_ERRLOG("Missing event 
callback function\n"); 5301 return -EINVAL; 5302 } 5303 5304 pthread_mutex_lock(&g_bdev_mgr.mutex); 5305 5306 bdev = spdk_bdev_get_by_name(bdev_name); 5307 5308 if (bdev == NULL) { 5309 SPDK_ERRLOG("Failed to find bdev with name: %s\n", bdev_name); 5310 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5311 return -EINVAL; 5312 } 5313 5314 desc = calloc(1, sizeof(*desc)); 5315 if (desc == NULL) { 5316 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 5317 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5318 return -ENOMEM; 5319 } 5320 5321 TAILQ_INIT(&desc->pending_media_events); 5322 TAILQ_INIT(&desc->free_media_events); 5323 5324 desc->callback.open_with_ext = true; 5325 desc->callback.event_fn = event_cb; 5326 desc->callback.ctx = event_ctx; 5327 pthread_mutex_init(&desc->mutex, NULL); 5328 5329 if (bdev->media_events) { 5330 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 5331 sizeof(*desc->media_events_buffer)); 5332 if (desc->media_events_buffer == NULL) { 5333 SPDK_ERRLOG("Failed to initialize media event pool\n"); 5334 bdev_desc_free(desc); 5335 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5336 return -ENOMEM; 5337 } 5338 5339 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 5340 TAILQ_INSERT_TAIL(&desc->free_media_events, 5341 &desc->media_events_buffer[event_id], tailq); 5342 } 5343 } 5344 5345 rc = bdev_open(bdev, write, desc); 5346 if (rc != 0) { 5347 bdev_desc_free(desc); 5348 desc = NULL; 5349 } 5350 5351 *_desc = desc; 5352 5353 pthread_mutex_unlock(&g_bdev_mgr.mutex); 5354 5355 return rc; 5356 } 5357 5358 void 5359 spdk_bdev_close(struct spdk_bdev_desc *desc) 5360 { 5361 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5362 int rc; 5363 5364 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 5365 spdk_get_thread()); 5366 5367 assert(desc->thread == spdk_get_thread()); 5368 5369 spdk_poller_unregister(&desc->io_timeout_poller); 5370 5371 pthread_mutex_lock(&bdev->internal.mutex); 5372 pthread_mutex_lock(&desc->mutex); 5373 5374 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 5375 5376 desc->closed = true; 5377 5378 if (0 == desc->refs) { 5379 pthread_mutex_unlock(&desc->mutex); 5380 bdev_desc_free(desc); 5381 } else { 5382 pthread_mutex_unlock(&desc->mutex); 5383 } 5384 5385 /* If no more descriptors, kill QoS channel */ 5386 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 5387 SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 5388 bdev->name, spdk_get_thread()); 5389 5390 if (bdev_qos_destroy(bdev)) { 5391 /* There isn't anything we can do to recover here. Just let the 5392 * old QoS poller keep running. The QoS handling won't change 5393 * cores when the user allocates a new channel, but it won't break. */ 5394 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 5395 } 5396 } 5397 5398 spdk_bdev_set_qd_sampling_period(bdev, 0); 5399 5400 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 5401 rc = bdev_unregister_unsafe(bdev); 5402 pthread_mutex_unlock(&bdev->internal.mutex); 5403 5404 if (rc == 0) { 5405 bdev_fini(bdev); 5406 } 5407 } else { 5408 pthread_mutex_unlock(&bdev->internal.mutex); 5409 } 5410 } 5411 5412 int 5413 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 5414 struct spdk_bdev_module *module) 5415 { 5416 if (bdev->internal.claim_module != NULL) { 5417 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 5418 bdev->internal.claim_module->name); 5419 return -EPERM; 5420 } 5421 5422 if (desc && !desc->write) { 5423 desc->write = true; 5424 } 5425 5426 bdev->internal.claim_module = module; 5427 return 0; 5428 } 5429 5430 void 5431 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 5432 { 5433 assert(bdev->internal.claim_module != NULL); 5434 bdev->internal.claim_module = NULL; 5435 } 5436 5437 struct spdk_bdev * 5438 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 5439 { 5440 assert(desc != NULL); 5441 return desc->bdev; 5442 } 5443 5444 void 5445 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 5446 { 5447 struct iovec *iovs; 5448 int iovcnt; 5449 5450 if (bdev_io == NULL) { 5451 return; 5452 } 5453 5454 switch (bdev_io->type) { 5455 case SPDK_BDEV_IO_TYPE_READ: 5456 case SPDK_BDEV_IO_TYPE_WRITE: 5457 case SPDK_BDEV_IO_TYPE_ZCOPY: 5458 iovs = bdev_io->u.bdev.iovs; 5459 iovcnt = bdev_io->u.bdev.iovcnt; 5460 break; 5461 default: 5462 iovs = NULL; 5463 iovcnt = 0; 5464 break; 5465 } 5466 5467 if (iovp) { 5468 *iovp = iovs; 5469 } 5470 if (iovcntp) { 5471 *iovcntp = iovcnt; 5472 } 5473 } 5474 5475 void * 5476 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 5477 { 5478 if (bdev_io == NULL) { 5479 return NULL; 5480 } 5481 5482 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 5483 return NULL; 5484 } 5485 5486 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 5487 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 5488 return bdev_io->u.bdev.md_buf; 5489 } 5490 5491 return NULL; 5492 } 5493 5494 void 5495 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 5496 { 5497 5498 if (spdk_bdev_module_list_find(bdev_module->name)) { 5499 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 5500 assert(false); 5501 } 5502 5503 /* 5504 * Modules with examine callbacks must be initialized first, so they are 5505 * ready to handle examine callbacks from later modules that will 5506 * register physical bdevs. 
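 * In practice this puts virtual bdev modules, which typically provide examine
 * callbacks, at the head of the list, ahead of modules that only produce bdevs;
 * the list is walked front to back both at module init time and when bdev_start()
 * runs the examine callbacks.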
5507 */ 5508 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 5509 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 5510 } else { 5511 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 5512 } 5513 } 5514 5515 struct spdk_bdev_module * 5516 spdk_bdev_module_list_find(const char *name) 5517 { 5518 struct spdk_bdev_module *bdev_module; 5519 5520 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 5521 if (strcmp(name, bdev_module->name) == 0) { 5522 break; 5523 } 5524 } 5525 5526 return bdev_module; 5527 } 5528 5529 static void 5530 bdev_write_zero_buffer_next(void *_bdev_io) 5531 { 5532 struct spdk_bdev_io *bdev_io = _bdev_io; 5533 uint64_t num_bytes, num_blocks; 5534 void *md_buf = NULL; 5535 int rc; 5536 5537 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 5538 bdev_io->u.bdev.split_remaining_num_blocks, 5539 ZERO_BUFFER_SIZE); 5540 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 5541 5542 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 5543 md_buf = (char *)g_bdev_mgr.zero_buffer + 5544 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 5545 } 5546 5547 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 5548 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5549 g_bdev_mgr.zero_buffer, md_buf, 5550 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 5551 bdev_write_zero_buffer_done, bdev_io); 5552 if (rc == 0) { 5553 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 5554 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 5555 } else if (rc == -ENOMEM) { 5556 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 5557 } else { 5558 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5559 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5560 } 5561 } 5562 5563 static void 5564 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5565 { 5566 struct spdk_bdev_io *parent_io = cb_arg; 5567 5568 spdk_bdev_free_io(bdev_io); 5569 5570 if (!success) { 5571 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5572 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5573 return; 5574 } 5575 5576 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 5577 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5578 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5579 return; 5580 } 5581 5582 bdev_write_zero_buffer_next(parent_io); 5583 } 5584 5585 static void 5586 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 5587 { 5588 pthread_mutex_lock(&ctx->bdev->internal.mutex); 5589 ctx->bdev->internal.qos_mod_in_progress = false; 5590 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 5591 5592 if (ctx->cb_fn) { 5593 ctx->cb_fn(ctx->cb_arg, status); 5594 } 5595 free(ctx); 5596 } 5597 5598 static void 5599 bdev_disable_qos_done(void *cb_arg) 5600 { 5601 struct set_qos_limit_ctx *ctx = cb_arg; 5602 struct spdk_bdev *bdev = ctx->bdev; 5603 struct spdk_bdev_io *bdev_io; 5604 struct spdk_bdev_qos *qos; 5605 5606 pthread_mutex_lock(&bdev->internal.mutex); 5607 qos = bdev->internal.qos; 5608 bdev->internal.qos = NULL; 5609 pthread_mutex_unlock(&bdev->internal.mutex); 5610 5611 while (!TAILQ_EMPTY(&qos->queued)) { 5612 /* Send queued I/O back to their original thread for resubmission. 
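 * By the time this runs, bdev_disable_qos_msg() has already cleared
 * BDEV_CH_QOS_ENABLED on every channel, so the resubmitted I/O will go straight
 * to the module instead of being routed back through QoS.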
static void
bdev_disable_qos_done(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_qos *qos;

	pthread_mutex_lock(&bdev->internal.mutex);
	qos = bdev->internal.qos;
	bdev->internal.qos = NULL;
	pthread_mutex_unlock(&bdev->internal.mutex);

	while (!TAILQ_EMPTY(&qos->queued)) {
		/* Send queued I/O back to their original thread for resubmission. */
		bdev_io = TAILQ_FIRST(&qos->queued);
		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);

		if (bdev_io->internal.io_submit_ch) {
			/*
			 * Channel was changed when sending it to the QoS thread - change it back
			 * before sending it back to the original thread.
			 */
			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
			bdev_io->internal.io_submit_ch = NULL;
		}

		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
				     _bdev_io_submit, bdev_io);
	}

	if (qos->thread != NULL) {
		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
		spdk_poller_unregister(&qos->poller);
	}

	free(qos);

	bdev_set_qos_limit_done(ctx, 0);
}

static void
bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_thread *thread;

	pthread_mutex_lock(&bdev->internal.mutex);
	thread = bdev->internal.qos->thread;
	pthread_mutex_unlock(&bdev->internal.mutex);

	if (thread != NULL) {
		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
	} else {
		bdev_disable_qos_done(ctx);
	}
}

static void
bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_update_qos_rate_limit_msg(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;

	pthread_mutex_lock(&bdev->internal.mutex);
	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
	pthread_mutex_unlock(&bdev->internal.mutex);

	bdev_set_qos_limit_done(ctx, 0);
}

static void
bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);
	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);

	pthread_mutex_lock(&bdev->internal.mutex);
	bdev_enable_qos(bdev, bdev_ch);
	pthread_mutex_unlock(&bdev->internal.mutex);
	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
{
	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	bdev_set_qos_limit_done(ctx, status);
}

static void
bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
{
	int i;

	assert(bdev->internal.qos != NULL);

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			bdev->internal.qos->rate_limits[i].limit = limits[i];

			if (limits[i] == 0) {
				bdev->internal.qos->rate_limits[i].limit =
					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
			}
		}
	}
}
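/*
 * Illustration only: a typical caller (e.g. an RPC handler) builds the limits
 * array indexed by enum spdk_bdev_qos_rate_limit_type, using UINT64_MAX
 * (SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) for limits it does not want to touch,
 * 0 to disable a limit, and megabytes per second for the byte-based limits
 * (they are converted to bytes below).  my_qos_done_cb is a hypothetical
 * caller-provided completion callback.
 *
 *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *	int i;
 *
 *	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *		limits[i] = UINT64_MAX;
 *	}
 *	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;	// 10k IO/s
 *	limits[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT] = 100;		// 100 MB/s writes
 *	spdk_bdev_set_qos_rate_limits(bdev, limits, my_qos_done_cb, NULL);
 */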
void
spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
{
	struct set_qos_limit_ctx *ctx;
	uint32_t limit_set_complement;
	uint64_t min_limit_per_sec;
	int i;
	bool disable_rate_limit = true;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
			continue;
		}

		if (limits[i] > 0) {
			disable_rate_limit = false;
		}

		if (bdev_qos_is_iops_rate_limit(i) == true) {
			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
		} else {
			/* Change from megabyte to byte rate limit */
			limits[i] = limits[i] * 1024 * 1024;
			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
		}

		limit_set_complement = limits[i] % min_limit_per_sec;
		if (limit_set_complement) {
			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
				    limits[i], min_limit_per_sec);
			limits[i] += min_limit_per_sec - limit_set_complement;
			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
		}
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->bdev = bdev;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.qos_mod_in_progress) {
		pthread_mutex_unlock(&bdev->internal.mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}
	bdev->internal.qos_mod_in_progress = true;

	if (disable_rate_limit == true && bdev->internal.qos) {
		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
			     bdev->internal.qos->rate_limits[i].limit !=
			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
				disable_rate_limit = false;
				break;
			}
		}
	}

	if (disable_rate_limit == false) {
		if (bdev->internal.qos == NULL) {
			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
			if (!bdev->internal.qos) {
				pthread_mutex_unlock(&bdev->internal.mutex);
				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
				bdev_set_qos_limit_done(ctx, -ENOMEM);
				return;
			}
		}

		if (bdev->internal.qos->thread == NULL) {
			/* Enabling */
			bdev_set_qos_rate_limits(bdev, limits);

			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      bdev_enable_qos_msg, ctx,
					      bdev_enable_qos_done);
		} else {
			/* Updating */
			bdev_set_qos_rate_limits(bdev, limits);

			spdk_thread_send_msg(bdev->internal.qos->thread,
					     bdev_update_qos_rate_limit_msg, ctx);
		}
	} else {
		if (bdev->internal.qos != NULL) {
			bdev_set_qos_rate_limits(bdev, limits);

			/* Disabling */
			spdk_for_each_channel(__bdev_to_io_dev(bdev),
					      bdev_disable_qos_msg, ctx,
					      bdev_disable_qos_msg_done);
		} else {
			pthread_mutex_unlock(&bdev->internal.mutex);
			bdev_set_qos_limit_done(ctx, 0);
			return;
		}
	}

	pthread_mutex_unlock(&bdev->internal.mutex);
}
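/*
 * Per-channel latency histograms (descriptive note).  Enabling iterates every
 * channel and allocates a struct spdk_histogram_data for it
 * (bdev_histogram_enable_channel); disabling frees the per-channel data.
 * bdev->internal.histogram_in_progress serializes concurrent enable/disable
 * requests, and a failed per-channel allocation rolls the operation back by
 * reusing the disable path.
 */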
struct spdk_bdev_histogram_ctx {
	spdk_bdev_histogram_status_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	int status;
};

static void
bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	pthread_mutex_lock(&ctx->bdev->internal.mutex);
	ctx->bdev->internal.histogram_in_progress = false;
	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
	ctx->cb_fn(ctx->cb_arg, ctx->status);
	free(ctx);
}

static void
bdev_histogram_disable_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);

	if (ch->histogram != NULL) {
		spdk_histogram_data_free(ch->histogram);
		ch->histogram = NULL;
	}
	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	if (status != 0) {
		ctx->status = status;
		ctx->bdev->internal.histogram_enabled = false;
		spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx,
				      bdev_histogram_disable_channel_cb);
	} else {
		pthread_mutex_lock(&ctx->bdev->internal.mutex);
		ctx->bdev->internal.histogram_in_progress = false;
		pthread_mutex_unlock(&ctx->bdev->internal.mutex);
		ctx->cb_fn(ctx->cb_arg, ctx->status);
		free(ctx);
	}
}

static void
bdev_histogram_enable_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	int status = 0;

	if (ch->histogram == NULL) {
		ch->histogram = spdk_histogram_data_alloc();
		if (ch->histogram == NULL) {
			status = -ENOMEM;
		}
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
			   void *cb_arg, bool enable)
{
	struct spdk_bdev_histogram_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bdev = bdev;
	ctx->status = 0;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev->internal.histogram_in_progress) {
		pthread_mutex_unlock(&bdev->internal.mutex);
		free(ctx);
		cb_fn(cb_arg, -EAGAIN);
		return;
	}

	bdev->internal.histogram_in_progress = true;
	pthread_mutex_unlock(&bdev->internal.mutex);

	bdev->internal.histogram_enabled = enable;

	if (enable) {
		/* Allocate histogram for each channel */
		spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx,
				      bdev_histogram_enable_channel_cb);
	} else {
		spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx,
				      bdev_histogram_disable_channel_cb);
	}
}
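/*
 * Illustration only: enabling collection and later fetching the merged data.
 * hist_status_cb and hist_data_cb are hypothetical caller-provided callbacks;
 * the caller owns the spdk_histogram_data passed to spdk_bdev_histogram_get()
 * and receives it back, with every channel's data merged in, via hist_data_cb.
 *
 *	spdk_bdev_histogram_enable(bdev, hist_status_cb, NULL, true);
 *	...
 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
 *	spdk_bdev_histogram_get(bdev, h, hist_data_cb, NULL);
 */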
struct spdk_bdev_histogram_data_ctx {
	spdk_bdev_histogram_data_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	/** merged histogram data from all channels */
	struct spdk_histogram_data *histogram;
};

static void
bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
	free(ctx);
}

static void
bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	int status = 0;

	if (ch->histogram == NULL) {
		status = -EFAULT;
	} else {
		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
			spdk_bdev_histogram_data_cb cb_fn,
			void *cb_arg)
{
	struct spdk_bdev_histogram_data_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM, NULL);
		return;
	}

	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	ctx->histogram = histogram;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx,
			      bdev_histogram_get_channel_cb);
}

size_t
spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
			   size_t max_events)
{
	struct media_event_entry *entry;
	size_t num_events = 0;

	for (; num_events < max_events; ++num_events) {
		entry = TAILQ_FIRST(&desc->pending_media_events);
		if (entry == NULL) {
			break;
		}

		events[num_events] = entry->event;
		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
	}

	return num_events;
}

int
spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
			    size_t num_events)
{
	struct spdk_bdev_desc *desc;
	struct media_event_entry *entry;
	size_t event_id;
	int rc = 0;

	assert(bdev->media_events);

	pthread_mutex_lock(&bdev->internal.mutex);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (desc->write) {
			break;
		}
	}

	if (desc == NULL || desc->media_events_buffer == NULL) {
		rc = -ENODEV;
		goto out;
	}

	for (event_id = 0; event_id < num_events; ++event_id) {
		entry = TAILQ_FIRST(&desc->free_media_events);
		if (entry == NULL) {
			break;
		}

		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
		entry->event = events[event_id];
	}

	rc = event_id;
out:
	pthread_mutex_unlock(&bdev->internal.mutex);
	return rc;
}

void
spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
{
	struct spdk_bdev_desc *desc;

	pthread_mutex_lock(&bdev->internal.mutex);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
			desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev,
						desc->callback.ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
}
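/*
 * Illustration only: a bdev module that generates media management events
 * queues them with spdk_bdev_push_media_events() and then calls
 * spdk_bdev_notify_media_management().  An application that opened the bdev
 * sees SPDK_BDEV_EVENT_MEDIA_MANAGEMENT in its event callback and drains the
 * queue; handle_media_event() below is a hypothetical application helper.
 *
 *	case SPDK_BDEV_EVENT_MEDIA_MANAGEMENT: {
 *		struct spdk_bdev_media_event events[16];
 *		size_t i, n;
 *
 *		n = spdk_bdev_get_media_events(desc, events, SPDK_COUNTOF(events));
 *		for (i = 0; i < n; i++) {
 *			handle_media_event(&events[i]);
 *		}
 *		break;
 *	}
 */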
struct locked_lba_range_ctx {
	struct lba_range range;
	struct spdk_bdev *bdev;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i);

static void
bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = ctx->bdev;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller.  We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_for_each_channel(__bdev_to_io_dev(bdev),
				      bdev_unlock_lba_range_get_channel, ctx,
				      bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding!  Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	ctx->owner_range->owner_ch = ctx->range.owner_ch;
	ctx->cb_fn(ctx->cb_arg, status);

	/* Don't free the ctx here.  Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_io_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new I/O can be submitted to this
	 * range.  But we need to wait until all outstanding I/O overlapping this range
	 * have completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = spdk_poller_register(bdev_lock_lba_range_check_io, i, 100);
			return 1;
		}
	}

	spdk_for_each_channel_continue(i, 0);
	return 1;
}
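/*
 * Descriptive note: each channel gets its own copy of the range inserted into
 * ch->locked_ranges, so the submission path can check for conflicts without
 * any cross-thread coordination.  After inserting the copy,
 * bdev_lock_lba_range_check_io() polls (re-arming itself every 100 usec)
 * until no I/O already submitted on that channel still overlaps the range,
 * and only then lets the for_each_channel iteration move on.
 */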
static void
bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again.  This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock.  Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == ctx->range.owner_ch->channel->thread);

	/* We will add a copy of this range to each channel now. */
	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx,
			      bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
	return 0;
}

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
}
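/*
 * Descriptive note on the unlock completion below: once every channel has
 * dropped its copy of the range and resubmitted the I/O it was holding back,
 * any pending lock request that overlapped the just-unlocked range (and no
 * longer overlaps anything in locked_ranges) is promoted to the locked list,
 * and a message is sent to its owner channel's thread to restart the lock
 * flow via bdev_lock_lba_range_ctx().
 */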
static void
bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct locked_lba_range_ctx *pending_ctx;
	struct spdk_bdev_channel *ch = ctx->range.owner_ch;
	struct spdk_bdev *bdev = ch->bdev;
	struct lba_range *range, *tmp;

	pthread_mutex_lock(&bdev->internal.mutex);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked.  If there are, check that they don't overlap with
	 * any other locked range before calling bdev_lock_lba_range_ctx(), which will
	 * start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(pending_ctx->range.owner_ch->channel->thread,
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}
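/*
 * Illustration only: how the internal lock/unlock pair is intended to be used
 * from a channel's thread.  cb_arg doubles as the lock identity (locked_ctx),
 * so the unlock must pass the same desc, channel, offset, length and cb_arg
 * as the corresponding lock.  range_locked_cb and range_unlocked_cb are
 * hypothetical completion callbacks.
 *
 *	rc = bdev_lock_lba_range(desc, io_ch, offset, num_blocks,
 *				 range_locked_cb, cb_ctx);
 *	... perform the I/O that needed exclusive access ...
 *	rc = bdev_unlock_lba_range(desc, io_ch, offset, num_blocks,
 *				   range_unlocked_cb, cb_ctx);
 */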
static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range.  To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description("BDEV_IO_START", TRACE_BDEV_IO_START, OWNER_BDEV,
					OBJECT_BDEV_IO, 1, 0, "type: ");
	spdk_trace_register_description("BDEV_IO_DONE", TRACE_BDEV_IO_DONE, OWNER_BDEV,
					OBJECT_BDEV_IO, 0, 0, "");
}
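/*
 * Descriptive note: the BDEV_IO_START/BDEV_IO_DONE tracepoints registered
 * above are recorded in the I/O submission and completion paths earlier in
 * this file.  When the application is run with tracing enabled, the resulting
 * trace file can be decoded offline with SPDK's trace tooling (e.g. the
 * spdk_trace app) to follow individual bdev I/O.
 */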